@researai/deepscientist 1.5.16 → 1.6.0
This diff shows the differences between the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- package/AGENTS.md +309 -130
- package/AISB/catalog/aisb.b1.agentic_coding.yaml +244 -0
- package/AISB/catalog/aisb.b10.climate_earth.yaml +235 -0
- package/AISB/catalog/aisb.b11.model_efficiency.yaml +231 -0
- package/AISB/catalog/aisb.b12.embodied_ai.yaml +238 -0
- package/AISB/catalog/aisb.b2.agent_systems.yaml +229 -0
- package/AISB/catalog/aisb.b3.self_evolving_rl.yaml +237 -0
- package/AISB/catalog/aisb.b4.lm_reasoning.yaml +240 -0
- package/AISB/catalog/aisb.b5.math_proof.yaml +235 -0
- package/AISB/catalog/aisb.b6.research_process.yaml +243 -0
- package/AISB/catalog/aisb.b7.multimodal_fusion.yaml +232 -0
- package/AISB/catalog/aisb.b8.lifesci_drug.yaml +275 -0
- package/AISB/catalog/aisb.b9.material_science.yaml +237 -0
- package/AISB/catalog/aisb.t3.001_savvy.yaml +159 -0
- package/AISB/catalog/aisb.t3.001_savvy.zh.yaml +121 -0
- package/AISB/catalog/aisb.t3.002_pinet.yaml +189 -0
- package/AISB/catalog/aisb.t3.002_pinet.zh.yaml +130 -0
- package/AISB/catalog/aisb.t3.004_decentralattn.yaml +184 -0
- package/AISB/catalog/aisb.t3.004_decentralattn.zh.yaml +153 -0
- package/AISB/catalog/aisb.t3.005_tsae.yaml +193 -0
- package/AISB/catalog/aisb.t3.005_tsae.zh.yaml +139 -0
- package/AISB/catalog/aisb.t3.006_physense.yaml +194 -0
- package/AISB/catalog/aisb.t3.006_physense.zh.yaml +118 -0
- package/AISB/catalog/aisb.t3.007_reasoningiqa.yaml +169 -0
- package/AISB/catalog/aisb.t3.007_reasoningiqa.zh.yaml +133 -0
- package/AISB/catalog/aisb.t3.008_meanflows.yaml +188 -0
- package/AISB/catalog/aisb.t3.008_meanflows.zh.yaml +140 -0
- package/AISB/catalog/aisb.t3.009_scoremissing.yaml +179 -0
- package/AISB/catalog/aisb.t3.009_scoremissing.zh.yaml +119 -0
- package/AISB/catalog/aisb.t3.010_suitabilityfilter.yaml +221 -0
- package/AISB/catalog/aisb.t3.010_suitabilityfilter.zh.yaml +141 -0
- package/AISB/catalog/aisb.t3.011_osd.yaml +206 -0
- package/AISB/catalog/aisb.t3.011_osd.zh.yaml +163 -0
- package/AISB/catalog/aisb.t3.012_efficientqat.yaml +206 -0
- package/AISB/catalog/aisb.t3.012_efficientqat.zh.yaml +159 -0
- package/AISB/catalog/aisb.t3.013_appl.yaml +152 -0
- package/AISB/catalog/aisb.t3.013_appl.zh.yaml +126 -0
- package/AISB/catalog/aisb.t3.014_piguard.yaml +207 -0
- package/AISB/catalog/aisb.t3.014_piguard.zh.yaml +164 -0
- package/AISB/catalog/aisb.t3.015_frspec.yaml +209 -0
- package/AISB/catalog/aisb.t3.015_frspec.zh.yaml +163 -0
- package/AISB/catalog/aisb.t3.016_mathfusion.yaml +166 -0
- package/AISB/catalog/aisb.t3.016_mathfusion.zh.yaml +145 -0
- package/AISB/catalog/aisb.t3.017_multimodalglp.yaml +171 -0
- package/AISB/catalog/aisb.t3.017_multimodalglp.zh.yaml +122 -0
- package/AISB/catalog/aisb.t3.018_cotsynth.yaml +206 -0
- package/AISB/catalog/aisb.t3.018_cotsynth.zh.yaml +162 -0
- package/AISB/catalog/aisb.t3.019_dyscaleut.yaml +211 -0
- package/AISB/catalog/aisb.t3.019_dyscaleut.zh.yaml +148 -0
- package/AISB/catalog/aisb.t3.020_aristotle.yaml +173 -0
- package/AISB/catalog/aisb.t3.020_aristotle.zh.yaml +119 -0
- package/AISB/catalog/aisb.t3.021_tokenrecycling.yaml +160 -0
- package/AISB/catalog/aisb.t3.021_tokenrecycling.zh.yaml +129 -0
- package/AISB/catalog/aisb.t3.022_chainofreasoning.yaml +204 -0
- package/AISB/catalog/aisb.t3.022_chainofreasoning.zh.yaml +161 -0
- package/AISB/catalog/aisb.t3.023_guidedembed.yaml +211 -0
- package/AISB/catalog/aisb.t3.023_guidedembed.zh.yaml +189 -0
- package/AISB/catalog/aisb.t3.024_outputcentric.yaml +148 -0
- package/AISB/catalog/aisb.t3.024_outputcentric.zh.yaml +131 -0
- package/AISB/catalog/aisb.t3.025_deeper.yaml +143 -0
- package/AISB/catalog/aisb.t3.025_deeper.zh.yaml +116 -0
- package/AISB/catalog/aisb.t3.026_gartkg.yaml +195 -0
- package/AISB/catalog/aisb.t3.026_gartkg.zh.yaml +127 -0
- package/AISB/catalog/aisb.t3.027_citeeval.yaml +182 -0
- package/AISB/catalog/aisb.t3.027_citeeval.zh.yaml +135 -0
- package/AISB/catalog/aisb.t3.028_sbam.yaml +206 -0
- package/AISB/catalog/aisb.t3.028_sbam.zh.yaml +166 -0
- package/AISB/catalog/aisb.t3.029_cdqgeoembed.yaml +224 -0
- package/AISB/catalog/aisb.t3.029_cdqgeoembed.zh.yaml +142 -0
- package/AISB/catalog/aisb.t3.030_processrm.yaml +211 -0
- package/AISB/catalog/aisb.t3.030_processrm.zh.yaml +166 -0
- package/AISB/catalog/aisb.t3.031_circuitstability.yaml +172 -0
- package/AISB/catalog/aisb.t3.031_circuitstability.zh.yaml +134 -0
- package/AISB/catalog/aisb.t3.032_ptsolver.yaml +169 -0
- package/AISB/catalog/aisb.t3.032_ptsolver.zh.yaml +135 -0
- package/AISB/catalog/aisb.t3.033_gcse.yaml +144 -0
- package/AISB/catalog/aisb.t3.033_gcse.zh.yaml +126 -0
- package/AISB/catalog/aisb.t3.034_ensemblewm.yaml +183 -0
- package/AISB/catalog/aisb.t3.034_ensemblewm.zh.yaml +146 -0
- package/AISB/catalog/aisb.t3.035_moralvalueswa.yaml +207 -0
- package/AISB/catalog/aisb.t3.035_moralvalueswa.zh.yaml +165 -0
- package/AISB/catalog/aisb.t3.036_weakstrongpref.yaml +210 -0
- package/AISB/catalog/aisb.t3.036_weakstrongpref.zh.yaml +194 -0
- package/AISB/catalog/aisb.t3.037_dementiamask.yaml +172 -0
- package/AISB/catalog/aisb.t3.037_dementiamask.zh.yaml +132 -0
- package/AISB/catalog/aisb.t3.038_tinysam.yaml +284 -0
- package/AISB/catalog/aisb.t3.038_tinysam.zh.yaml +240 -0
- package/AISB/catalog/aisb.t3.039_calf.yaml +224 -0
- package/AISB/catalog/aisb.t3.039_calf.zh.yaml +194 -0
- package/AISB/catalog/aisb.t3.040_graniteguardian.yaml +199 -0
- package/AISB/catalog/aisb.t3.040_graniteguardian.zh.yaml +174 -0
- package/AISB/catalog/aisb.t3.041_amdm.yaml +149 -0
- package/AISB/catalog/aisb.t3.041_amdm.zh.yaml +137 -0
- package/AISB/catalog/aisb.t3.042_xpatch.yaml +216 -0
- package/AISB/catalog/aisb.t3.042_xpatch.zh.yaml +182 -0
- package/AISB/catalog/aisb.t3.043_vhm.yaml +268 -0
- package/AISB/catalog/aisb.t3.043_vhm.zh.yaml +193 -0
- package/AISB/catalog/aisb.t3.044_rgvi.yaml +224 -0
- package/AISB/catalog/aisb.t3.044_rgvi.zh.yaml +176 -0
- package/AISB/catalog/aisb.t3.045_pslstm.yaml +203 -0
- package/AISB/catalog/aisb.t3.045_pslstm.zh.yaml +179 -0
- package/AISB/catalog/aisb.t3.046_nonstatts.yaml +208 -0
- package/AISB/catalog/aisb.t3.046_nonstatts.zh.yaml +194 -0
- package/AISB/catalog/aisb.t3.047_timepfn.yaml +156 -0
- package/AISB/catalog/aisb.t3.047_timepfn.zh.yaml +124 -0
- package/AISB/catalog/aisb.t3.048_proxyspex.yaml +148 -0
- package/AISB/catalog/aisb.t3.048_proxyspex.zh.yaml +125 -0
- package/AISB/catalog/aisb.t3.049_hogwildinference.yaml +183 -0
- package/AISB/catalog/aisb.t3.049_hogwildinference.zh.yaml +138 -0
- package/AISB/catalog/aisb.t3.050_causalpfn.yaml +214 -0
- package/AISB/catalog/aisb.t3.050_causalpfn.zh.yaml +190 -0
- package/AISB/catalog/aisb.t3.051_flashtp.yaml +169 -0
- package/AISB/catalog/aisb.t3.051_flashtp.zh.yaml +124 -0
- package/AISB/catalog/aisb.t3.052_nsdiff.yaml +155 -0
- package/AISB/catalog/aisb.t3.052_nsdiff.zh.yaml +138 -0
- package/AISB/catalog/aisb.t3.053_k2vae.yaml +158 -0
- package/AISB/catalog/aisb.t3.053_k2vae.zh.yaml +132 -0
- package/AISB/catalog/aisb.t3.054_timebase.yaml +178 -0
- package/AISB/catalog/aisb.t3.054_timebase.zh.yaml +158 -0
- package/AISB/catalog/aisb.t3.055_csbrain.yaml +238 -0
- package/AISB/catalog/aisb.t3.055_csbrain.zh.yaml +184 -0
- package/AISB/catalog/aisb.t3.056_infosam.yaml +224 -0
- package/AISB/catalog/aisb.t3.056_infosam.zh.yaml +189 -0
- package/AISB/catalog/aisb.t3.057_mdreid.yaml +129 -0
- package/AISB/catalog/aisb.t3.057_mdreid.zh.yaml +117 -0
- package/AISB/catalog/aisb.t3.058_mindglitch.yaml +171 -0
- package/AISB/catalog/aisb.t3.058_mindglitch.zh.yaml +145 -0
- package/AISB/catalog/aisb.t3.059_selfsupervised.yaml +154 -0
- package/AISB/catalog/aisb.t3.059_selfsupervised.zh.yaml +125 -0
- package/AISB/catalog/aisb.t3.060_iaggad.yaml +121 -0
- package/AISB/catalog/aisb.t3.060_iaggad.zh.yaml +100 -0
- package/AISB/catalog/aisb.t3.061_hsgkn.yaml +136 -0
- package/AISB/catalog/aisb.t3.061_hsgkn.zh.yaml +113 -0
- package/AISB/catalog/aisb.t3.062_visionts.yaml +237 -0
- package/AISB/catalog/aisb.t3.062_visionts.zh.yaml +216 -0
- package/AISB/catalog/aisb.t3.063_tsrag.yaml +162 -0
- package/AISB/catalog/aisb.t3.063_tsrag.zh.yaml +138 -0
- package/AISB/catalog/aisb.t3.064_pir.yaml +221 -0
- package/AISB/catalog/aisb.t3.064_pir.zh.yaml +197 -0
- package/AISB/catalog/aisb.t3.065_proteinbinding.yaml +234 -0
- package/AISB/catalog/aisb.t3.065_proteinbinding.zh.yaml +167 -0
- package/AISB/catalog/aisb.t3.066_tropicalattention.yaml +267 -0
- package/AISB/catalog/aisb.t3.066_tropicalattention.zh.yaml +229 -0
- package/AISB/catalog/aisb.t3.067_kanad.yaml +193 -0
- package/AISB/catalog/aisb.t3.067_kanad.zh.yaml +167 -0
- package/AISB/catalog/aisb.t3.068_sempo.yaml +187 -0
- package/AISB/catalog/aisb.t3.068_sempo.zh.yaml +148 -0
- package/AISB/catalog/aisb.t3.069_treehfd.yaml +129 -0
- package/AISB/catalog/aisb.t3.069_treehfd.zh.yaml +111 -0
- package/AISB/catalog/aisb.t3.070_certifiedunlearning.yaml +224 -0
- package/AISB/catalog/aisb.t3.070_certifiedunlearning.zh.yaml +171 -0
- package/AISB/catalog/aisb.t3.071_neuralmjd.yaml +142 -0
- package/AISB/catalog/aisb.t3.071_neuralmjd.zh.yaml +120 -0
- package/AISB/catalog/aisb.t3.072_fedgmt.yaml +181 -0
- package/AISB/catalog/aisb.t3.072_fedgmt.zh.yaml +158 -0
- package/AISB/catalog/aisb.t3.073_rld.yaml +161 -0
- package/AISB/catalog/aisb.t3.073_rld.zh.yaml +129 -0
- package/AISB/catalog/aisb.t3.074_lsvi.yaml +163 -0
- package/AISB/catalog/aisb.t3.074_lsvi.zh.yaml +129 -0
- package/AISB/catalog/aisb.t3.075_treeslicedentropy.yaml +201 -0
- package/AISB/catalog/aisb.t3.075_treeslicedentropy.zh.yaml +148 -0
- package/AISB/catalog/aisb.t3.076_aanet.yaml +169 -0
- package/AISB/catalog/aisb.t3.076_aanet.zh.yaml +129 -0
- package/AISB/catalog/aisb.t3.077_cmnn.yaml +199 -0
- package/AISB/catalog/aisb.t3.077_cmnn.zh.yaml +165 -0
- package/AISB/catalog/aisb.t3.078_conformalanomaly.yaml +146 -0
- package/AISB/catalog/aisb.t3.078_conformalanomaly.zh.yaml +117 -0
- package/AISB/catalog/aisb.t3.079_dpfkmeans.yaml +131 -0
- package/AISB/catalog/aisb.t3.079_dpfkmeans.zh.yaml +104 -0
- package/AISB/catalog/aisb.t3.080_latentscorereweight.yaml +169 -0
- package/AISB/catalog/aisb.t3.080_latentscorereweight.zh.yaml +123 -0
- package/AISB/catalog/aisb.t3.081_qmamba.yaml +150 -0
- package/AISB/catalog/aisb.t3.081_qmamba.zh.yaml +117 -0
- package/AISB/catalog/aisb.t3.082_onlinellmrouting.yaml +160 -0
- package/AISB/catalog/aisb.t3.082_onlinellmrouting.zh.yaml +133 -0
- package/AISB/catalog/aisb.t3.083_starformer.yaml +178 -0
- package/AISB/catalog/aisb.t3.083_starformer.zh.yaml +140 -0
- package/AISB/catalog/aisb.t3.084_ift.yaml +139 -0
- package/AISB/catalog/aisb.t3.084_ift.zh.yaml +111 -0
- package/AISB/catalog/aisb.t3.085_neuralsurv.yaml +183 -0
- package/AISB/catalog/aisb.t3.085_neuralsurv.zh.yaml +143 -0
- package/AISB/catalog/aisb.t3.086_stella.yaml +197 -0
- package/AISB/catalog/aisb.t3.086_stella.zh.yaml +142 -0
- package/AISB/catalog/aisb.t3.087_moses.yaml +167 -0
- package/AISB/catalog/aisb.t3.087_moses.zh.yaml +132 -0
- package/AISB/catalog/aisb.t3.088_channelnorm.yaml +140 -0
- package/AISB/catalog/aisb.t3.088_channelnorm.zh.yaml +109 -0
- package/AISB/catalog/aisb.t3.089_causalvelocity.yaml +730 -0
- package/AISB/catalog/aisb.t3.089_causalvelocity.zh.yaml +668 -0
- package/AISB/catalog/aisb.t3.090_rstib.yaml +144 -0
- package/AISB/catalog/aisb.t3.090_rstib.zh.yaml +109 -0
- package/AISB/catalog/aisb.t3.091_timeawarecausal.yaml +132 -0
- package/AISB/catalog/aisb.t3.091_timeawarecausal.zh.yaml +107 -0
- package/AISB/catalog/aisb.t3.092_kmeanslocalopt.yaml +138 -0
- package/AISB/catalog/aisb.t3.092_kmeanslocalopt.zh.yaml +110 -0
- package/AISB/catalog/aisb.t3.093_fedwmsam.yaml +134 -0
- package/AISB/catalog/aisb.t3.093_fedwmsam.zh.yaml +106 -0
- package/AISB/catalog/aisb.t3.094_boundre.yaml +147 -0
- package/AISB/catalog/aisb.t3.094_boundre.zh.yaml +114 -0
- package/AISB/catalog/aisb.t3.095_fastfeaturecp.yaml +153 -0
- package/AISB/catalog/aisb.t3.095_fastfeaturecp.zh.yaml +118 -0
- package/AISB/catalog/aisb.t3.096_m3svm.yaml +189 -0
- package/AISB/catalog/aisb.t3.096_m3svm.zh.yaml +149 -0
- package/AISB/catalog/aisb.t3.097_wassersteintl.yaml +212 -0
- package/AISB/catalog/aisb.t3.097_wassersteintl.zh.yaml +169 -0
- package/AISB/catalog/aisb.t3.098_xmahalanobis.yaml +171 -0
- package/AISB/catalog/aisb.t3.098_xmahalanobis.zh.yaml +127 -0
- package/AISB/catalog/aisb.t3.099_ollalanding.yaml +248 -0
- package/AISB/catalog/aisb.t3.099_ollalanding.zh.yaml +182 -0
- package/AISB/catalog/aisb.t3.100_invmissingdata.yaml +179 -0
- package/AISB/catalog/aisb.t3.100_invmissingdata.zh.yaml +150 -0
- package/AISB/catalog/aisb.t3.101_acia.yaml +164 -0
- package/AISB/catalog/aisb.t3.101_acia.zh.yaml +109 -0
- package/AISB/catalog/aisb.t3.102_stochasticff.yaml +178 -0
- package/AISB/catalog/aisb.t3.102_stochasticff.zh.yaml +130 -0
- package/AISB/catalog/aisb.t3.103_qdcp.yaml +150 -0
- package/AISB/catalog/aisb.t3.103_qdcp.zh.yaml +116 -0
- package/AISB/catalog/aisb.t3.104_balancedactiveinf.yaml +137 -0
- package/AISB/catalog/aisb.t3.104_balancedactiveinf.zh.yaml +104 -0
- package/AISB/catalog/aisb.t3.105_binaryclasseval.yaml +161 -0
- package/AISB/catalog/aisb.t3.105_binaryclasseval.zh.yaml +130 -0
- package/AISB/image/001_aisb.t3.001_savvy.jpg +0 -0
- package/AISB/image/002_aisb.t3.002_pinet.jpg +0 -0
- package/AISB/image/003_aisb.t3.003_dmsqd.jpg +0 -0
- package/AISB/image/004_aisb.t3.004_decentralattn.jpg +0 -0
- package/AISB/image/005_aisb.t3.005_tsae.jpg +0 -0
- package/AISB/image/006_aisb.t3.006_physense.jpg +0 -0
- package/AISB/image/007_aisb.t3.007_reasoningiqa.jpg +0 -0
- package/AISB/image/008_aisb.t3.008_meanflows.jpg +0 -0
- package/AISB/image/009_aisb.t3.009_scoremissing.jpg +0 -0
- package/AISB/image/010_aisb.t3.010_suitabilityfilter.jpg +0 -0
- package/AISB/image/011_aisb.t3.011_osd.jpg +0 -0
- package/AISB/image/012_aisb.t3.012_efficientqat.jpg +0 -0
- package/AISB/image/013_aisb.t3.013_appl.jpg +0 -0
- package/AISB/image/014_aisb.t3.014_piguard.jpg +0 -0
- package/AISB/image/015_aisb.t3.015_frspec.jpg +0 -0
- package/AISB/image/016_aisb.t3.016_mathfusion.jpg +0 -0
- package/AISB/image/017_aisb.t3.017_multimodalglp.jpg +0 -0
- package/AISB/image/018_aisb.t3.018_cotsynth.jpg +0 -0
- package/AISB/image/019_aisb.t3.019_dyscaleut.jpg +0 -0
- package/AISB/image/020_aisb.t3.020_aristotle.jpg +0 -0
- package/AISB/image/021_aisb.t3.021_tokenrecycling.jpg +0 -0
- package/AISB/image/022_aisb.t3.022_chainofreasoning.jpg +0 -0
- package/AISB/image/023_aisb.t3.023_guidedembed.jpg +0 -0
- package/AISB/image/024_aisb.t3.024_outputcentric.jpg +0 -0
- package/AISB/image/025_aisb.t3.025_deeper.jpg +0 -0
- package/AISB/image/026_aisb.t3.026_gartkg.jpg +0 -0
- package/AISB/image/027_aisb.t3.027_citeeval.jpg +0 -0
- package/AISB/image/028_aisb.t3.028_sbam.jpg +0 -0
- package/AISB/image/029_aisb.t3.029_cdqgeoembed.jpg +0 -0
- package/AISB/image/030_aisb.t3.030_processrm.jpg +0 -0
- package/AISB/image/031_aisb.t3.031_circuitstability.jpg +0 -0
- package/AISB/image/032_aisb.t3.032_ptsolver.jpg +0 -0
- package/AISB/image/033_aisb.t3.033_gcse.jpg +0 -0
- package/AISB/image/034_aisb.t3.034_ensemblewm.jpg +0 -0
- package/AISB/image/035_aisb.t3.035_moralvalueswa.jpg +0 -0
- package/AISB/image/036_aisb.t3.036_weakstrongpref.jpg +0 -0
- package/AISB/image/037_aisb.t3.037_dementiamask.jpg +0 -0
- package/AISB/image/038_aisb.t3.038_tinysam.jpg +0 -0
- package/AISB/image/039_aisb.t3.039_calf.jpg +0 -0
- package/AISB/image/040_aisb.t3.040_graniteguardian.jpg +0 -0
- package/AISB/image/041_aisb.t3.041_amdm.jpg +0 -0
- package/AISB/image/042_aisb.t3.042_xpatch.jpg +0 -0
- package/AISB/image/043_aisb.t3.043_vhm.jpg +0 -0
- package/AISB/image/044_aisb.t3.044_rgvi.jpg +0 -0
- package/AISB/image/045_aisb.t3.045_pslstm.jpg +0 -0
- package/AISB/image/046_aisb.t3.046_nonstatts.jpg +0 -0
- package/AISB/image/047_aisb.t3.047_timepfn.jpg +0 -0
- package/AISB/image/048_aisb.t3.048_proxyspex.jpg +0 -0
- package/AISB/image/049_aisb.t3.049_hogwildinference.jpg +0 -0
- package/AISB/image/050_aisb.t3.050_causalpfn.jpg +0 -0
- package/AISB/image/051_aisb.t3.051_flashtp.jpg +0 -0
- package/AISB/image/052_aisb.t3.052_nsdiff.jpg +0 -0
- package/AISB/image/053_aisb.t3.053_k2vae.jpg +0 -0
- package/AISB/image/054_aisb.t3.054_timebase.jpg +0 -0
- package/AISB/image/055_aisb.t3.055_csbrain.jpg +0 -0
- package/AISB/image/056_aisb.t3.056_infosam.jpg +0 -0
- package/AISB/image/057_aisb.t3.057_mdreid.jpg +0 -0
- package/AISB/image/058_aisb.t3.058_mindglitch.jpg +0 -0
- package/AISB/image/059_aisb.t3.059_selfsupervised.jpg +0 -0
- package/AISB/image/060_aisb.t3.060_iaggad.jpg +0 -0
- package/AISB/image/061_aisb.t3.061_hsgkn.jpg +0 -0
- package/AISB/image/062_aisb.t3.062_visionts.jpg +0 -0
- package/AISB/image/063_aisb.t3.063_tsrag.jpg +0 -0
- package/AISB/image/064_aisb.t3.064_pir.jpg +0 -0
- package/AISB/image/065_aisb.t3.065_proteinbinding.jpg +0 -0
- package/AISB/image/066_aisb.t3.066_tropicalattention.jpg +0 -0
- package/AISB/image/067_aisb.t3.067_kanad.jpg +0 -0
- package/AISB/image/068_aisb.t3.068_sempo.jpg +0 -0
- package/AISB/image/069_aisb.t3.069_treehfd.jpg +0 -0
- package/AISB/image/070_aisb.t3.070_certifiedunlearning.jpg +0 -0
- package/AISB/image/071_aisb.t3.071_neuralmjd.jpg +0 -0
- package/AISB/image/072_aisb.t3.072_fedgmt.jpg +0 -0
- package/AISB/image/073_aisb.t3.073_rld.jpg +0 -0
- package/AISB/image/074_aisb.t3.074_lsvi.jpg +0 -0
- package/AISB/image/075_aisb.t3.075_treeslicedentropy.jpg +0 -0
- package/AISB/image/076_aisb.t3.076_aanet.jpg +0 -0
- package/AISB/image/077_aisb.t3.077_cmnn.jpg +0 -0
- package/AISB/image/078_aisb.t3.078_conformalanomaly.jpg +0 -0
- package/AISB/image/079_aisb.t3.079_dpfkmeans.jpg +0 -0
- package/AISB/image/080_aisb.t3.080_latentscorereweight.jpg +0 -0
- package/AISB/image/081_aisb.t3.081_qmamba.jpg +0 -0
- package/AISB/image/082_aisb.t3.082_onlinellmrouting.jpg +0 -0
- package/AISB/image/083_aisb.t3.083_starformer.jpg +0 -0
- package/AISB/image/084_aisb.t3.084_ift.jpg +0 -0
- package/AISB/image/085_aisb.t3.085_neuralsurv.jpg +0 -0
- package/AISB/image/086_aisb.t3.086_stella.jpg +0 -0
- package/AISB/image/087_aisb.t3.087_moses.jpg +0 -0
- package/AISB/image/088_aisb.t3.088_channelnorm.jpg +0 -0
- package/AISB/image/089_aisb.t3.089_causalvelocity.jpg +0 -0
- package/AISB/image/090_aisb.t3.090_rstib.jpg +0 -0
- package/AISB/image/091_aisb.t3.091_timeawarecausal.jpg +0 -0
- package/AISB/image/092_aisb.t3.092_kmeanslocalopt.jpg +0 -0
- package/AISB/image/093_aisb.t3.093_fedwmsam.jpg +0 -0
- package/AISB/image/094_aisb.t3.094_boundre.jpg +0 -0
- package/AISB/image/095_aisb.t3.095_fastfeaturecp.jpg +0 -0
- package/AISB/image/096_aisb.t3.096_m3svm.jpg +0 -0
- package/AISB/image/097_aisb.t3.097_wassersteintl.jpg +0 -0
- package/AISB/image/098_aisb.t3.098_xmahalanobis.jpg +0 -0
- package/AISB/image/099_aisb.t3.099_ollalanding.jpg +0 -0
- package/AISB/image/100_aisb.t3.100_invmissingdata.jpg +0 -0
- package/AISB/image/101_aisb.t3.101_acia.jpg +0 -0
- package/AISB/image/102_aisb.t3.102_stochasticff.jpg +0 -0
- package/AISB/image/103_aisb.t3.103_qdcp.jpg +0 -0
- package/AISB/image/104_aisb.t3.104_balancedactiveinf.jpg +0 -0
- package/AISB/image/105_aisb.t3.105_binaryclasseval.jpg +0 -0
- package/AISB/image/106_aisb.t1.reasoning_lite.jpg +0 -0
- package/AISB/image/107_aisb.t2.paper_audit.jpg +0 -0
- package/AISB/image/108_aisb.t3.multi_gpu_search.jpg +0 -0
- package/AISB/image/109_aisb.t3.tdc_admet.jpg +0 -0
- package/AISB/image/aisb.b1.agentic_coding.svg +16 -0
- package/AISB/image/aisb.b10.climate_earth.svg +16 -0
- package/AISB/image/aisb.b11.model_efficiency.svg +16 -0
- package/AISB/image/aisb.b12.embodied_ai.svg +16 -0
- package/AISB/image/aisb.b2.agent_systems.svg +16 -0
- package/AISB/image/aisb.b3.self_evolving_rl.svg +16 -0
- package/AISB/image/aisb.b4.lm_reasoning.svg +16 -0
- package/AISB/image/aisb.b5.math_proof.svg +16 -0
- package/AISB/image/aisb.b6.research_process.svg +16 -0
- package/AISB/image/aisb.b7.multimodal_fusion.svg +16 -0
- package/AISB/image/aisb.b8.lifesci_drug.svg +16 -0
- package/AISB/image/aisb.b9.material_science.svg +16 -0
- package/README.md +196 -32
- package/bin/ds.js +924 -66
- package/docs/en/00_QUICK_START.md +195 -18
- package/docs/en/01_SETTINGS_REFERENCE.md +468 -96
- package/docs/en/02_START_RESEARCH_GUIDE.md +26 -5
- package/docs/en/03_QQ_CONNECTOR_GUIDE.md +14 -3
- package/docs/en/04_LINGZHU_CONNECTOR_GUIDE.md +2 -0
- package/docs/en/05_TUI_GUIDE.md +171 -2
- package/docs/en/07_MEMORY_AND_MCP.md +38 -2
- package/docs/en/09_DOCTOR.md +78 -7
- package/docs/en/10_WEIXIN_CONNECTOR_GUIDE.md +38 -1
- package/docs/en/11_LICENSE_AND_RISK.md +4 -0
- package/docs/en/12_GUIDED_WORKFLOW_TOUR.md +15 -0
- package/docs/en/14_PROMPT_SKILLS_AND_MCP_GUIDE.md +9 -0
- package/docs/en/15_CODEX_PROVIDER_SETUP.md +624 -180
- package/docs/en/16_TELEGRAM_CONNECTOR_GUIDE.md +14 -0
- package/docs/en/17_WHATSAPP_CONNECTOR_GUIDE.md +14 -0
- package/docs/en/18_FEISHU_CONNECTOR_GUIDE.md +14 -0
- package/docs/en/21_LOCAL_MODEL_BACKENDS_GUIDE.md +386 -0
- package/docs/en/22_BENCHSTORE_YAML_REFERENCE.md +469 -0
- package/docs/en/23_BENCHSTORE_GITHUB_RELEASES_SPEC.md +316 -0
- package/docs/en/24_CLAUDE_CODE_PROVIDER_SETUP.md +469 -0
- package/docs/en/25_OPENCODE_PROVIDER_SETUP.md +653 -0
- package/docs/en/26_CITATION_AND_ATTRIBUTION.md +119 -0
- package/docs/en/27_KIMI_CODE_PROVIDER_SETUP.md +180 -0
- package/docs/en/28_DISCORD_CONNECTOR_GUIDE.md +61 -0
- package/docs/en/29_SLACK_CONNECTOR_GUIDE.md +60 -0
- package/docs/en/30_SETTINGS_CONTROL_CENTER_GUIDE.md +371 -0
- package/docs/en/{19_LOCAL_BROWSER_AUTH.md → 31_LOCAL_BROWSER_AUTH.md} +1 -1
- package/docs/en/32_WINDOWS_WSL2_DEPLOYMENT_GUIDE.md +273 -0
- package/docs/en/33_WORKSPACE_EXPLORER_QA.md +121 -0
- package/docs/en/91_DEVELOPMENT.md +266 -0
- package/docs/en/99_ACKNOWLEDGEMENTS.md +24 -19
- package/docs/en/README.md +48 -7
- package/docs/images/admin/admin-connectors-health-en.png +0 -0
- package/docs/images/admin/admin-controllers-en.png +0 -0
- package/docs/images/admin/admin-diagnostics-en.png +0 -0
- package/docs/images/admin/admin-errors-en.png +0 -0
- package/docs/images/admin/admin-issues-en.png +0 -0
- package/docs/images/admin/admin-logs-en.png +0 -0
- package/docs/images/admin/admin-quest-detail-en.png +0 -0
- package/docs/images/admin/admin-quests-en.png +0 -0
- package/docs/images/admin/admin-repairs-en.png +0 -0
- package/docs/images/admin/admin-runtime-en.png +0 -0
- package/docs/images/admin/admin-search-en.png +0 -0
- package/docs/images/admin/admin-stats-en.png +0 -0
- package/docs/images/admin/admin-summary-en.png +0 -0
- package/docs/images/connectors/connector-discord-en.png +0 -0
- package/docs/images/connectors/connector-feishu-en.png +0 -0
- package/docs/images/connectors/connector-lingzhu-en.png +0 -0
- package/docs/images/connectors/connector-qq-en.png +0 -0
- package/docs/images/connectors/connector-slack-en.png +0 -0
- package/docs/images/connectors/connector-telegram-en.png +0 -0
- package/docs/images/connectors/connector-weixin-en.png +0 -0
- package/docs/images/connectors/connector-whatsapp-en.png +0 -0
- package/docs/images/settings/settings-baselines-en.png +0 -0
- package/docs/images/settings/settings-config-en.png +0 -0
- package/docs/images/settings/settings-connectors-overview-en.png +0 -0
- package/docs/images/settings/settings-deepxiv-en.png +0 -0
- package/docs/images/settings/settings-mcp-servers-en.png +0 -0
- package/docs/images/settings/settings-plugins-en.png +0 -0
- package/docs/images/settings/settings-runners-en.png +0 -0
- package/docs/zh/00_QUICK_START.md +142 -18
- package/docs/zh/01_SETTINGS_REFERENCE.md +219 -98
- package/docs/zh/02_START_RESEARCH_GUIDE.md +26 -5
- package/docs/zh/05_TUI_GUIDE.md +171 -2
- package/docs/zh/07_MEMORY_AND_MCP.md +29 -2
- package/docs/zh/09_DOCTOR.md +54 -8
- package/docs/zh/10_WEIXIN_CONNECTOR_GUIDE.md +24 -1
- package/docs/zh/11_LICENSE_AND_RISK.md +4 -0
- package/docs/zh/12_GUIDED_WORKFLOW_TOUR.md +15 -0
- package/docs/zh/14_PROMPT_SKILLS_AND_MCP_GUIDE.md +9 -0
- package/docs/zh/15_CODEX_PROVIDER_SETUP.md +552 -181
- package/docs/zh/21_LOCAL_MODEL_BACKENDS_GUIDE.md +384 -0
- package/docs/zh/22_BENCHSTORE_YAML_REFERENCE.md +459 -0
- package/docs/zh/23_BENCHSTORE_GITHUB_RELEASES_SPEC.md +287 -0
- package/docs/zh/23_CLAUDE_RUNNER_GUIDE.md +103 -0
- package/docs/zh/24_CLAUDE_CODE_PROVIDER_SETUP.md +460 -0
- package/docs/zh/25_OPENCODE_PROVIDER_SETUP.md +660 -0
- package/docs/zh/26_CITATION_AND_ATTRIBUTION.md +102 -0
- package/docs/zh/27_KIMI_CODE_PROVIDER_SETUP.md +51 -0
- package/docs/zh/{19_LOCAL_BROWSER_AUTH.md → 31_LOCAL_BROWSER_AUTH.md} +1 -1
- package/docs/zh/32_WINDOWS_WSL2_DEPLOYMENT_GUIDE.md +264 -0
- package/docs/zh/33_WORKSPACE_EXPLORER_QA.md +127 -0
- package/docs/zh/99_ACKNOWLEDGEMENTS.md +23 -19
- package/docs/zh/README.md +33 -7
- package/install.sh +168 -20
- package/package.json +5 -1
- package/pyproject.toml +2 -1
- package/src/deepscientist/__init__.py +1 -1
- package/src/deepscientist/acp/envelope.py +13 -0
- package/src/deepscientist/admin/__init__.py +3 -0
- package/src/deepscientist/admin/charts.py +681 -0
- package/src/deepscientist/admin/logs.py +119 -0
- package/src/deepscientist/admin/repairs.py +217 -0
- package/src/deepscientist/admin/service.py +1310 -0
- package/src/deepscientist/admin/system_info.py +700 -0
- package/src/deepscientist/admin/tasks.py +465 -0
- package/src/deepscientist/admin/tool_metrics.py +600 -0
- package/src/deepscientist/artifact/guidance.py +8 -4
- package/src/deepscientist/artifact/schemas.py +115 -0
- package/src/deepscientist/artifact/service.py +4268 -260
- package/src/deepscientist/bash_exec/monitor.py +30 -3
- package/src/deepscientist/bash_exec/service.py +134 -1
- package/src/deepscientist/benchstore/__init__.py +4 -0
- package/src/deepscientist/benchstore/prompt_builder.py +224 -0
- package/src/deepscientist/benchstore/service.py +1716 -0
- package/src/deepscientist/bridges/connectors.py +8 -2
- package/src/deepscientist/channels/weixin_ilink.py +8 -1
- package/src/deepscientist/cli.py +92 -17
- package/src/deepscientist/codex_cli_compat.py +187 -74
- package/src/deepscientist/config/models.py +82 -11
- package/src/deepscientist/config/service.py +1077 -93
- package/src/deepscientist/connector/weixin_support.py +48 -17
- package/src/deepscientist/daemon/api/handlers.py +827 -235
- package/src/deepscientist/daemon/api/router.py +81 -1
- package/src/deepscientist/daemon/app.py +1512 -85
- package/src/deepscientist/diagnostics/__init__.py +6 -0
- package/src/deepscientist/diagnostics/runner_failures.py +277 -0
- package/src/deepscientist/doctor.py +407 -56
- package/src/deepscientist/evidence_packets.py +590 -0
- package/src/deepscientist/home.py +52 -4
- package/src/deepscientist/kimi_cli_compat.py +50 -0
- package/src/deepscientist/latex_runtime.py +2 -2
- package/src/deepscientist/mcp/context.py +2 -0
- package/src/deepscientist/mcp/schemas.py +114 -0
- package/src/deepscientist/mcp/server.py +1566 -126
- package/src/deepscientist/memory/service.py +203 -16
- package/src/deepscientist/process_control.py +8 -1
- package/src/deepscientist/prompts/builder.py +850 -88
- package/src/deepscientist/quest/__init__.py +2 -2
- package/src/deepscientist/quest/layout.py +12 -1
- package/src/deepscientist/quest/node_traces.py +10 -0
- package/src/deepscientist/quest/service.py +1852 -161
- package/src/deepscientist/quest/stage_views.py +1 -1
- package/src/deepscientist/runners/__init__.py +18 -0
- package/src/deepscientist/runners/base.py +89 -1
- package/src/deepscientist/runners/builtins.py +13 -1
- package/src/deepscientist/runners/claude.py +391 -0
- package/src/deepscientist/runners/codex.py +480 -35
- package/src/deepscientist/runners/codex_telemetry.py +127 -0
- package/src/deepscientist/runners/kimi.py +334 -0
- package/src/deepscientist/runners/metadata.py +68 -0
- package/src/deepscientist/runners/opencode.py +414 -0
- package/src/deepscientist/runners/runtime_overrides.py +100 -0
- package/src/deepscientist/runners/simple_cli.py +538 -0
- package/src/deepscientist/runtime_storage.py +303 -0
- package/src/deepscientist/shared.py +80 -16
- package/src/deepscientist/skills/installer.py +37 -0
- package/src/deepscientist/skills/registry.py +2 -0
- package/src/deepscientist/tinytex.py +2 -2
- package/src/deepscientist/tui.py +10 -3
- package/src/prompts/benchstore/system.md +77 -0
- package/src/prompts/connectors/qq.md +33 -2
- package/src/prompts/connectors/weixin.md +208 -23
- package/src/prompts/contracts/admin_ops.md +74 -0
- package/src/prompts/contracts/admin_ops_knowledge.md +138 -0
- package/src/prompts/contracts/shared_interaction.md +5 -10
- package/src/prompts/start_setup/system.md +422 -0
- package/src/prompts/system.md +411 -304
- package/src/prompts/system_copilot.md +89 -0
- package/src/skills/analysis-campaign/SKILL.md +239 -578
- package/src/skills/analysis-campaign/references/artifact-flow-examples.md +102 -0
- package/src/skills/analysis-campaign/references/boundary-cases.md +98 -0
- package/src/skills/analysis-campaign/references/campaign-checklist-template.md +39 -24
- package/src/skills/analysis-campaign/references/campaign-design.md +26 -10
- package/src/skills/analysis-campaign/references/campaign-plan-template.md +53 -54
- package/src/skills/analysis-campaign/references/operational-guidance.md +97 -0
- package/src/skills/analysis-campaign/references/writing-facing-slice-examples.md +10 -20
- package/src/skills/baseline/SKILL.md +183 -461
- package/src/skills/baseline/references/artifact-flow-examples.md +106 -0
- package/src/skills/baseline/references/artifact-payload-examples.md +1 -1
- package/src/skills/baseline/references/baseline-checklist-template.md +27 -35
- package/src/skills/baseline/references/baseline-plan-template.md +37 -76
- package/src/skills/baseline/references/boundary-cases.md +86 -0
- package/src/skills/baseline/references/codebase-audit-checklist.md +2 -6
- package/src/skills/baseline/references/comparability-contract.md +7 -12
- package/src/skills/baseline/references/operational-guidance.md +56 -0
- package/src/skills/baseline/references/route-selection.md +5 -25
- package/src/skills/decision/SKILL.md +113 -306
- package/src/skills/decision/references/checkpoint-memory-template.md +47 -0
- package/src/skills/decision/references/operational-guidance.md +94 -0
- package/src/skills/decision/references/research-route-criteria.md +7 -8
- package/src/skills/decision/references/strategic-decision-template.md +13 -26
- package/src/skills/experiment/SKILL.md +132 -670
- package/src/skills/experiment/references/execution-playbook.md +374 -0
- package/src/skills/experiment/references/main-experiment-checklist-template.md +26 -2
- package/src/skills/experiment/references/main-experiment-plan-template.md +28 -17
- package/src/skills/experiment/references/operational-guidance.md +108 -0
- package/src/skills/finalize/SKILL.md +62 -0
- package/src/skills/finalize/references/checkpoint-memory-template.md +49 -0
- package/src/skills/finalize/references/resume-packet-template.md +7 -0
- package/src/skills/idea/SKILL.md +228 -15
- package/src/skills/idea/references/controlled-brainstorming-playbook.md +78 -0
- package/src/skills/idea/references/current-board-packet-template.md +61 -0
- package/src/skills/idea/references/high-value-idea-sourcing.md +119 -0
- package/src/skills/idea/references/idea-generation-playbook.md +21 -0
- package/src/skills/idea/references/idea-thinking-flow.md +6 -0
- package/src/skills/idea/references/literature-survey-template.md +3 -0
- package/src/skills/idea/references/objective-contract-template.md +54 -0
- package/src/skills/idea/references/outline-seeding-example.md +56 -0
- package/src/skills/idea/references/pre-idea-draft-template.md +105 -0
- package/src/skills/idea/references/related-work-playbook.md +75 -2
- package/src/skills/idea/references/research-history-playbook.md +114 -0
- package/src/skills/idea/references/selection-gate.md +58 -6
- package/src/skills/intake-audit/SKILL.md +43 -2
- package/src/skills/intake-audit/references/state-audit-template.md +10 -0
- package/src/skills/nature-data/SKILL.md +128 -0
- package/src/skills/nature-data/UPSTREAM_LICENSE.txt +21 -0
- package/src/skills/nature-data/agents/openai.yaml +4 -0
- package/src/skills/nature-data/references/chinese-author-alignment.md +84 -0
- package/src/skills/nature-data/references/fair-metadata-checklist.md +105 -0
- package/src/skills/nature-data/references/policy-principles.md +103 -0
- package/src/skills/nature-data/references/repository-and-identifiers.md +96 -0
- package/src/skills/nature-data/references/source-basis.md +54 -0
- package/src/skills/nature-data/references/statement-patterns.md +153 -0
- package/src/skills/nature-figure/SKILL.md +197 -0
- package/src/skills/nature-figure/UPSTREAM_LICENSE.txt +21 -0
- package/src/skills/nature-figure/agents/openai.yaml +4 -0
- package/src/skills/nature-figure/evals/evals.json +37 -0
- package/src/skills/nature-figure/references/api.md +428 -0
- package/src/skills/nature-figure/references/backend-selection.md +100 -0
- package/src/skills/nature-figure/references/chart-types.md +281 -0
- package/src/skills/nature-figure/references/common-patterns.md +349 -0
- package/src/skills/nature-figure/references/design-theory.md +436 -0
- package/src/skills/nature-figure/references/figure-contract.md +93 -0
- package/src/skills/nature-figure/references/nature-2026-observations.md +112 -0
- package/src/skills/nature-figure/references/qa-contract.md +119 -0
- package/src/skills/nature-figure/references/r-template-index.md +66 -0
- package/src/skills/nature-figure/references/r-workflow.md +161 -0
- package/src/skills/nature-figure/references/tutorials.md +250 -0
- package/src/skills/nature-paper2ppt/SKILL.md +507 -0
- package/src/skills/nature-paper2ppt/UPSTREAM_LICENSE.txt +21 -0
- package/src/skills/nature-paper2ppt/agents/openai.yaml +4 -0
- package/src/skills/nature-polishing/SKILL.md +385 -0
- package/src/skills/nature-polishing/UPSTREAM_LICENSE.txt +21 -0
- package/src/skills/nature-polishing/agents/openai.yaml +4 -0
- package/src/skills/nature-polishing/references/phrasebank-playbook.md +162 -0
- package/src/skills/nature-polishing/references/section-moves.md +240 -0
- package/src/skills/nature-polishing/references/style-guardrails.md +94 -0
- package/src/skills/nature-polishing/references/writing-strategy.md +148 -0
- package/src/skills/optimize/SKILL.md +177 -1568
- package/src/skills/optimize/references/brief-shaping-playbook.md +95 -0
- package/src/skills/optimize/references/candidate-board-template.md +13 -0
- package/src/skills/optimize/references/candidate-ranking-template.md +51 -0
- package/src/skills/optimize/references/codegen-route-playbook.md +50 -0
- package/src/skills/optimize/references/debug-response-template.md +29 -0
- package/src/skills/optimize/references/frontier-review-template.md +32 -0
- package/src/skills/optimize/references/fusion-playbook.md +36 -0
- package/src/skills/optimize/references/method-brief-template.md +73 -0
- package/src/skills/optimize/references/operational-guidance.md +621 -0
- package/src/skills/optimize/references/optimization-memory-template.md +30 -0
- package/src/skills/optimize/references/optimize-checklist-template.md +18 -0
- package/src/skills/optimize/references/plateau-response-playbook.md +28 -0
- package/src/skills/optimize/references/prompt-patterns.md +49 -0
- package/src/skills/paper-outline/SKILL.md +227 -0
- package/src/skills/paper-outline/references/outline-patterns.md +87 -0
- package/src/skills/paper-plot/SKILL.md +79 -0
- package/src/skills/paper-plot/agents/openai.yaml +4 -0
- package/src/skills/paper-plot/references/bar_grouped_hatch.md +96 -0
- package/src/skills/paper-plot/references/bar_paired_delta.md +72 -0
- package/src/skills/paper-plot/references/line_confidence_band.md +75 -0
- package/src/skills/paper-plot/references/line_loss_with_inset.md +65 -0
- package/src/skills/paper-plot/references/line_training_curve.md +44 -0
- package/src/skills/paper-plot/references/radar_dual_series.md +59 -0
- package/src/skills/paper-plot/references/scatter_broken_axis.md +59 -0
- package/src/skills/paper-plot/references/scatter_tsne_cluster.md +72 -0
- package/src/skills/paper-plot/scripts/bar_memevolve.py +109 -0
- package/src/skills/paper-plot/scripts/bar_spice.py +166 -0
- package/src/skills/paper-plot/scripts/line_aime.py +94 -0
- package/src/skills/paper-plot/scripts/line_loss_inset.py +157 -0
- package/src/skills/paper-plot/scripts/line_selfdistill.py +168 -0
- package/src/skills/paper-plot/scripts/radar_dora.py +151 -0
- package/src/skills/paper-plot/scripts/scatter_break.py +169 -0
- package/src/skills/paper-plot/scripts/scatter_tsne.py +133 -0
- package/src/skills/rebuttal/SKILL.md +9 -0
- package/src/skills/references/tool-usage-by-stage.md +438 -0
- package/src/skills/review/SKILL.md +105 -7
- package/src/skills/science/PROVENANCE.md +44 -0
- package/src/skills/science/SKILL.md +137 -0
- package/src/skills/science/references/artifact-science-tool.md +110 -0
- package/src/skills/science/references/claim-type-discipline.md +56 -0
- package/src/skills/science/references/domain-index.md +422 -0
- package/src/skills/science/references/hpc-via-bash-exec.md +42 -0
- package/src/skills/science/references/package-check-playbook.md +64 -0
- package/src/skills/science/references/package-index.min.json +3616 -0
- package/src/skills/science/references/packages/abinit.md +80 -0
- package/src/skills/science/references/packages/acts.md +73 -0
- package/src/skills/science/references/packages/aiida-core.md +80 -0
- package/src/skills/science/references/packages/alamode.md +80 -0
- package/src/skills/science/references/packages/amuse.md +88 -0
- package/src/skills/science/references/packages/anndata.md +88 -0
- package/src/skills/science/references/packages/arbor.md +80 -0
- package/src/skills/science/references/packages/arc.md +73 -0
- package/src/skills/science/references/packages/astropy.md +88 -0
- package/src/skills/science/references/packages/astroquery.md +88 -0
- package/src/skills/science/references/packages/atomate2.md +80 -0
- package/src/skills/science/references/packages/atomsmltr.md +73 -0
- package/src/skills/science/references/packages/awkward.md +73 -0
- package/src/skills/science/references/packages/batman.md +88 -0
- package/src/skills/science/references/packages/biopython.md +88 -0
- package/src/skills/science/references/packages/bloqade.md +73 -0
- package/src/skills/science/references/packages/brian2.md +73 -0
- package/src/skills/science/references/packages/bullet3.md +73 -0
- package/src/skills/science/references/packages/calculix.md +80 -0
- package/src/skills/science/references/packages/cantera.md +73 -0
- package/src/skills/science/references/packages/cavity-md-ipi.md +80 -0
- package/src/skills/science/references/packages/ccdproc.md +88 -0
- package/src/skills/science/references/packages/celerite2.md +88 -0
- package/src/skills/science/references/packages/cellrank.md +73 -0
- package/src/skills/science/references/packages/cesm.md +80 -0
- package/src/skills/science/references/packages/chemicals.md +73 -0
- package/src/skills/science/references/packages/chempy.md +73 -0
- package/src/skills/science/references/packages/cirq.md +73 -0
- package/src/skills/science/references/packages/coffea.md +73 -0
- package/src/skills/science/references/packages/cp2k.md +88 -0
- package/src/skills/science/references/packages/custodian.md +80 -0
- package/src/skills/science/references/packages/dart.md +73 -0
- package/src/skills/science/references/packages/datamol.md +88 -0
- package/src/skills/science/references/packages/dd4hep.md +73 -0
- package/src/skills/science/references/packages/dealii.md +80 -0
- package/src/skills/science/references/packages/deepchem.md +88 -0
- package/src/skills/science/references/packages/delphes.md +73 -0
- package/src/skills/science/references/packages/devito.md +80 -0
- package/src/skills/science/references/packages/dftb.md +88 -0
- package/src/skills/science/references/packages/dftd4.md +88 -0
- package/src/skills/science/references/packages/dftk-jl.md +80 -0
- package/src/skills/science/references/packages/dolfinx.md +80 -0
- package/src/skills/science/references/packages/drake.md +73 -0
- package/src/skills/science/references/packages/dumux.md +73 -0
- package/src/skills/science/references/packages/elk.md +80 -0
- package/src/skills/science/references/packages/elmerfem.md +80 -0
- package/src/skills/science/references/packages/enzo-e.md +88 -0
- package/src/skills/science/references/packages/espresso.md +80 -0
- package/src/skills/science/references/packages/exoplanet.md +88 -0
- package/src/skills/science/references/packages/fairroot.md +73 -0
- package/src/skills/science/references/packages/fbpic.md +80 -0
- package/src/skills/science/references/packages/fdtdbath-meep.md +80 -0
- package/src/skills/science/references/packages/geant4.md +73 -0
- package/src/skills/science/references/packages/geosx.md +80 -0
- package/src/skills/science/references/packages/gprmax.md +80 -0
- package/src/skills/science/references/packages/gromacs.md +80 -0
- package/src/skills/science/references/packages/gwaslab.md +73 -0
- package/src/skills/science/references/packages/gz-sim.md +73 -0
- package/src/skills/science/references/packages/hail.md +88 -0
- package/src/skills/science/references/packages/hiphive.md +80 -0
- package/src/skills/science/references/packages/hoomd-blue.md +80 -0
- package/src/skills/science/references/packages/itensor.md +73 -0
- package/src/skills/science/references/packages/itensors-jl.md +73 -0
- package/src/skills/science/references/packages/jdftx.md +73 -0
- package/src/skills/science/references/packages/jobflow.md +80 -0
- package/src/skills/science/references/packages/kadanoffbaym-jl.md +73 -0
- package/src/skills/science/references/packages/kite.md +80 -0
- package/src/skills/science/references/packages/kratos.md +80 -0
- package/src/skills/science/references/packages/kwant.md +73 -0
- package/src/skills/science/references/packages/lammps.md +80 -0
- package/src/skills/science/references/packages/lightkurve.md +88 -0
- package/src/skills/science/references/packages/limix.md +73 -0
- package/src/skills/science/references/packages/maxwelllink.md +80 -0
- package/src/skills/science/references/packages/mcdc.md +73 -0
- package/src/skills/science/references/packages/meep.md +80 -0
- package/src/skills/science/references/packages/mfem.md +80 -0
- package/src/skills/science/references/packages/mitgcm.md +73 -0
- package/src/skills/science/references/packages/modflow6.md +73 -0
- package/src/skills/science/references/packages/molecool.md +73 -0
- package/src/skills/science/references/packages/mom6.md +73 -0
- package/src/skills/science/references/packages/moose.md +80 -0
- package/src/skills/science/references/packages/mpas-model.md +73 -0
- package/src/skills/science/references/packages/mujoco.md +73 -0
- package/src/skills/science/references/packages/mumax3.md +73 -0
- package/src/skills/science/references/packages/nekrs.md +80 -0
- package/src/skills/science/references/packages/nessi.md +73 -0
- package/src/skills/science/references/packages/nest-simulator.md +73 -0
- package/src/skills/science/references/packages/netket.md +73 -0
- package/src/skills/science/references/packages/neuron.md +73 -0
- package/src/skills/science/references/packages/nextflow.md +88 -0
- package/src/skills/science/references/packages/nwchem.md +88 -0
- package/src/skills/science/references/packages/openbabel.md +88 -0
- package/src/skills/science/references/packages/openems.md +80 -0
- package/src/skills/science/references/packages/openff-toolkit.md +88 -0
- package/src/skills/science/references/packages/openfoam-dev.md +80 -0
- package/src/skills/science/references/packages/openmc.md +73 -0
- package/src/skills/science/references/packages/openmm.md +80 -0
- package/src/skills/science/references/packages/openmoc.md +73 -0
- package/src/skills/science/references/packages/openmx.md +80 -0
- package/src/skills/science/references/packages/opensees.md +80 -0
- package/src/skills/science/references/packages/opensn.md +80 -0
- package/src/skills/science/references/packages/opm-simulators.md +73 -0
- package/src/skills/science/references/packages/oqupy.md +73 -0
- package/src/skills/science/references/packages/packmol.md +80 -0
- package/src/skills/science/references/packages/palabos.md +80 -0
- package/src/skills/science/references/packages/parflow.md +80 -0
- package/src/skills/science/references/packages/pennylane.md +88 -0
- package/src/skills/science/references/packages/perceval.md +73 -0
- package/src/skills/science/references/packages/phono3py.md +73 -0
- package/src/skills/science/references/packages/phonopy.md +73 -0
- package/src/skills/science/references/packages/photutils.md +88 -0
- package/src/skills/science/references/packages/picongpu.md +80 -0
- package/src/skills/science/references/packages/plink-ng.md +88 -0
- package/src/skills/science/references/packages/precice.md +73 -0
- package/src/skills/science/references/packages/psc.md +80 -0
- package/src/skills/science/references/packages/psi4.md +88 -0
- package/src/skills/science/references/packages/pybinding.md +73 -0
- package/src/skills/science/references/packages/pyfr.md +80 -0
- package/src/skills/science/references/packages/pyhf.md +73 -0
- package/src/skills/science/references/packages/pyiron_base.md +80 -0
- package/src/skills/science/references/packages/pylcp.md +73 -0
- package/src/skills/science/references/packages/pylith.md +80 -0
- package/src/skills/science/references/packages/pynbody.md +88 -0
- package/src/skills/science/references/packages/pysam.md +88 -0
- package/src/skills/science/references/packages/pyscf.md +88 -0
- package/src/skills/science/references/packages/q-e.md +73 -0
- package/src/skills/science/references/packages/qibo.md +73 -0
- package/src/skills/science/references/packages/qiskit.md +73 -0
- package/src/skills/science/references/packages/quantica-jl.md +73 -0
- package/src/skills/science/references/packages/quantumoptics-jl.md +73 -0
- package/src/skills/science/references/packages/quimb.md +73 -0
- package/src/skills/science/references/packages/qulacs.md +73 -0
- package/src/skills/science/references/packages/qutip.md +73 -0
- package/src/skills/science/references/packages/rdkit.md +88 -0
- package/src/skills/science/references/packages/rmg-py.md +73 -0
- package/src/skills/science/references/packages/root.md +73 -0
- package/src/skills/science/references/packages/scanpy.md +88 -0
- package/src/skills/science/references/packages/scikit-allel.md +88 -0
- package/src/skills/science/references/packages/scikit-bio.md +88 -0
- package/src/skills/science/references/packages/scqubits.md +73 -0
- package/src/skills/science/references/packages/scuff-em.md +80 -0
- package/src/skills/science/references/packages/scvi-tools.md +73 -0
- package/src/skills/science/references/packages/seissol.md +73 -0
- package/src/skills/science/references/packages/sfepy.md +80 -0
- package/src/skills/science/references/packages/sisl.md +73 -0
- package/src/skills/science/references/packages/smilei.md +80 -0
- package/src/skills/science/references/packages/snakemake.md +88 -0
- package/src/skills/science/references/packages/specfem3d-globe.md +80 -0
- package/src/skills/science/references/packages/specutils.md +88 -0
- package/src/skills/science/references/packages/spglib.md +80 -0
- package/src/skills/science/references/packages/squidpy.md +88 -0
- package/src/skills/science/references/packages/starry.md +88 -0
- package/src/skills/science/references/packages/strawberryfields.md +73 -0
- package/src/skills/science/references/packages/su2.md +80 -0
- package/src/skills/science/references/packages/sunny-jl.md +73 -0
- package/src/skills/science/references/packages/sw4.md +73 -0
- package/src/skills/science/references/packages/swift.md +88 -0
- package/src/skills/science/references/packages/tdnegf.md +73 -0
- package/src/skills/science/references/packages/tenpy.md +73 -0
- package/src/skills/science/references/packages/thermo.md +73 -0
- package/src/skills/science/references/packages/tkwant.md +73 -0
- package/src/skills/science/references/packages/tvb-root.md +73 -0
- package/src/skills/science/references/packages/uproot5.md +73 -0
- package/src/skills/science/references/packages/vampire.md +80 -0
- package/src/skills/science/references/packages/wannier_tools.md +73 -0
- package/src/skills/science/references/packages/warpx.md +80 -0
- package/src/skills/science/references/packages/wrf.md +73 -0
- package/src/skills/science/references/packages/xtb.md +88 -0
- package/src/skills/science/references/packages/yt.md +73 -0
- package/src/skills/science/references/science-task-brief-template.md +71 -0
- package/src/skills/scout/SKILL.md +83 -425
- package/src/skills/scout/references/literature-scout-template.md +5 -24
- package/src/skills/scout/references/operational-guidance.md +191 -0
- package/src/skills/scout/references/paper-triage-playbook.md +11 -35
- package/src/skills/write/SKILL.md +744 -1246
- package/src/skills/write/references/experiments_analysis_patterns.md +129 -0
- package/src/skills/write/references/oral_package_patterns.md +252 -0
- package/src/skills/write/references/oral_writing_principles.md +291 -0
- package/src/skills/write/references/section_rewrite_checklist.md +234 -0
- package/src/tui/dist/app/AppContainer.js +1314 -27
- package/src/tui/dist/components/Composer.js +26 -1
- package/src/tui/dist/components/ConfigScreen.js +2 -1
- package/src/tui/dist/components/InputPrompt.js +25 -9
- package/src/tui/dist/components/MainContent.js +18 -3
- package/src/tui/dist/components/QuestScreen.js +3 -2
- package/src/tui/dist/components/UtilityScreen.js +37 -0
- package/src/tui/dist/hooks/useSafeInput.js +10 -0
- package/src/tui/dist/index.js +13 -1
- package/src/tui/dist/layouts/DefaultAppLayout.js +11 -8
- package/src/tui/dist/lib/api.js +89 -1
- package/src/tui/package.json +1 -1
- package/src/ui/dist/assets/{AnalysisPlugin-DnSm0GZn.js → AnalysisPlugin-CA94NGmI.js} +1 -1
- package/src/ui/dist/assets/CliPlugin-DHBzphZU.js +79 -0
- package/src/ui/dist/assets/CodeEditorPlugin-BOFwD2rn.js +2 -0
- package/src/ui/dist/assets/{CodeViewerPlugin-itb0tltR.js → CodeViewerPlugin-CqDpgjik.js} +4 -4
- package/src/ui/dist/assets/{DocViewerPlugin-DqKkiCI6.js → DocViewerPlugin-UDBgt8-4.js} +3 -3
- package/src/ui/dist/assets/GitCommitViewerPlugin-BmHtZ0bZ.js +6 -0
- package/src/ui/dist/assets/{GitDiffViewerPlugin-DxL2ezFG.js → GitDiffViewerPlugin-CAxjNorQ.js} +2 -2
- package/src/ui/dist/assets/{GitSnapshotViewer-B_RQm1YZ.js → GitSnapshotViewer-CweA6VON.js} +2 -2
- package/src/ui/dist/assets/{ImageViewerPlugin-tHqlXY3n.js → ImageViewerPlugin-C8wHGvGN.js} +5 -5
- package/src/ui/dist/assets/LabPlugin-COyyLUol.js +32 -0
- package/src/ui/dist/assets/{LatexPlugin-B495DTXC.js → LatexPlugin-BQjAaA5J.js} +4 -4
- package/src/ui/dist/assets/{MarkdownViewerPlugin-DG28-61B.js → MarkdownViewerPlugin-Dy1NE2dI.js} +3 -3
- package/src/ui/dist/assets/{MarketplacePlugin-BiOGT-Kj.js → MarketplacePlugin-DMIZtEJ2.js} +2 -2
- package/src/ui/dist/assets/NotebookEditor-CFHMq_Qt.js +91 -0
- package/src/ui/dist/assets/{NotebookEditor-CVsj8h_T.js → NotebookEditor-WFyd8Ybt.js} +23 -23
- package/src/ui/dist/assets/{PdfLoader-CASDQmxJ.js → PdfLoader-CLE5u5TS.js} +3 -3
- package/src/ui/dist/assets/{PdfMarkdownPlugin-BFhwoKsY.js → PdfMarkdownPlugin-_iNK_H83.js} +1 -1
- package/src/ui/dist/assets/PdfViewerPlugin-DgWsbInT.js +22 -0
- package/src/ui/dist/assets/SearchPlugin-DrZmn5iw.js +11 -0
- package/src/ui/dist/assets/{TextViewerPlugin-CB4DYfWO.js → TextViewerPlugin-D1-T3aC7.js} +4 -4
- package/src/ui/dist/assets/branding/runner-claude.svg +107 -0
- package/src/ui/dist/assets/branding/runner-codex.svg +10 -0
- package/src/ui/dist/assets/branding/runner-kimi.svg +14 -0
- package/src/ui/dist/assets/branding/runner-opencode.svg +7 -0
- package/src/ui/dist/assets/cli-store-CoZ-x5Ip.js +1 -0
- package/src/ui/dist/assets/{code-DLC6G24T.js → code-DbsmSd3Y.js} +1 -1
- package/src/ui/dist/assets/file-diff-panel-DsvyRz47.js +1 -0
- package/src/ui/dist/assets/{wrap-text-CwMn-iqb.js → file-jump-queue-DeQBikaw.js} +3 -3
- package/src/ui/dist/assets/{file-socket-Cu4Qln7Y.js → file-socket-DA5XIx88.js} +1 -1
- package/src/ui/dist/assets/fonts/ds-fonts.css +50 -4
- package/src/ui/dist/assets/images/deepxiv/register-guide.png +0 -0
- package/src/ui/dist/assets/index-39vY9LmZ.js +1 -0
- package/src/ui/dist/assets/{index-wQ7RIIRd.js → index-BsO46tJA.js} +1 -1
- package/src/ui/dist/assets/index-CHzJ2xtB.js +3530 -0
- package/src/ui/dist/assets/index-DH-zxoZ3.css +33 -0
- package/src/ui/dist/assets/{plugin-notebook-HbW2K-1c.js → plugin-notebook-JRhysCqj.js} +2 -2
- package/src/ui/dist/assets/{project-sync-CsX08Qno.js → project-sync-DPmWKmKD.js} +1 -1
- package/src/ui/dist/assets/{zoom-out-R-GWEhzS.js → zoom-out-DAukFWen.js} +3 -3
- package/src/ui/dist/index.html +3 -3
- package/src/skills/analysis-campaign/references/artifact-orchestration.md +0 -58
- package/src/skills/baseline/references/memory-playbook.md +0 -40
- package/src/skills/baseline/references/publishable-baseline-package.md +0 -30
- package/src/skills/write/references/outline-evidence-contract-example.md +0 -107
- package/src/skills/write/references/paper-experiment-matrix-template.md +0 -131
- package/src/skills/write/references/paper-section-playbook.md +0 -64
- package/src/skills/write/references/reviewer-first-writing.md +0 -64
- package/src/skills/write/references/revision-checklist.md +0 -70
- package/src/skills/write/references/section-contracts.md +0 -82
- package/src/skills/write/references/sentence-level-proofing.md +0 -49
- package/src/ui/dist/assets/AiManusChatView-COFACy7V.js +0 -204
- package/src/ui/dist/assets/CliPlugin-CvwCmDQ5.js +0 -109
- package/src/ui/dist/assets/CodeEditorPlugin-cOqSa0xq.js +0 -2
- package/src/ui/dist/assets/GitCommitViewerPlugin-DVgNHBCS.js +0 -1
- package/src/ui/dist/assets/LabCopilotPanel-ClMbq5Yu.js +0 -14
- package/src/ui/dist/assets/LabPlugin-L_SuE8ow.js +0 -22
- package/src/ui/dist/assets/NotebookEditor-C-4Kt1p9.js +0 -81
- package/src/ui/dist/assets/PdfViewerPlugin-DcOzU9vd.js +0 -17
- package/src/ui/dist/assets/SearchPlugin-CHj7M58O.js +0 -16
- package/src/ui/dist/assets/VNCViewer-CjlbyCB3.js +0 -11
- package/src/ui/dist/assets/bot-CFkZY-JP.js +0 -6
- package/src/ui/dist/assets/chevron-up-Dq5ofbht.js +0 -6
- package/src/ui/dist/assets/file-content-Dv4LoZec.js +0 -1
- package/src/ui/dist/assets/file-diff-panel-Denq-lC3.js +0 -1
- package/src/ui/dist/assets/file-jump-queue-DA-SdG__.js +0 -1
- package/src/ui/dist/assets/git-commit-horizontal-BUh6G52n.js +0 -6
- package/src/ui/dist/assets/image-B9HUUddG.js +0 -6
- package/src/ui/dist/assets/index-B2B1sg-M.js +0 -1
- package/src/ui/dist/assets/index-Cgla8biy.css +0 -33
- package/src/ui/dist/assets/index-DRyx7vAc.js +0 -1
- package/src/ui/dist/assets/index-Gbl53BNp.js +0 -2496
- package/src/ui/dist/assets/pdf-effect-queue-ZtnHFCAi.js +0 -6
- package/src/ui/dist/assets/popover-DL6h35vr.js +0 -1
- package/src/ui/dist/assets/select-DvmXt1yY.js +0 -11
- package/src/ui/dist/assets/sigma-7jpXazui.js +0 -6
- package/src/ui/dist/assets/trash-xA7kFt8i.js +0 -11
- package/src/ui/dist/assets/useCliAccess-DsMwDjOp.js +0 -1
- package/src/ui/dist/assets/useFileDiffOverlay-FuhcnKiw.js +0 -1
|
@@ -0,0 +1,206 @@
|
|
|
1
|
+
schema_version: 1
|
|
2
|
+
id: aisb.t3.018_cotsynth
|
|
3
|
+
name: 'CoT-based Synthesizer: Enhancing LLM Performance through Answer Synthesis'
|
|
4
|
+
version: 0.1.0
|
|
5
|
+
one_line: 'Inference-time answer-synthesis benchmark: generate diverse CoT candidate
|
|
6
|
+
responses from 8B-scale policy models, then synthesize a superior answer via a trained
|
|
7
|
+
Synthesizer-8B, evaluating accuracy on GSM8k, MATH500, and WikiTQ and ROUGE-L recall on FeTaQA.
|
|
8
|
+
|
|
9
|
+
'
|
|
10
|
+
task_description: 'This benchmark reproduces the CoT-based Synthesizer paper (ACL
|
|
11
|
+
2025). The core workflow is: (1) use a policy model (e.g. Llama3-8B-Instruct) to
|
|
12
|
+
sample N diverse candidate responses per question via temperature/top-p decoding,
|
|
13
|
+
(2) feed candidates into a Synthesizer model that performs CoT-based analysis and
|
|
14
|
+
synthesis to produce a refined final answer, and (3) evaluate on four benchmarks
|
|
15
|
+
— GSM8k (exact-match accuracy), MATH500 (exact-match accuracy via DART-Math evaluator),
|
|
16
|
+
WikiTQ (exact-match accuracy), and FeTaQA (ROUGE-L recall). The snapshot includes
|
|
17
|
+
bundled evaluation test data in the data/ folder, inference code using vLLM, evaluation
|
|
18
|
+
scripts, a fine-tuning pipeline, and a data-generation pipeline for creating synthesis
|
|
19
|
+
training data. A pre-trained Synthesizer-8B checkpoint and training data are available
|
|
20
|
+
on Hugging Face. The workflow is inference-heavy: each evaluation requires serving
|
|
21
|
+
an 8B model via vLLM and running multi-sample generation. MATH evaluation additionally
|
|
22
|
+
requires installing the dart-math package from GitHub. The benchmark compares against
|
|
23
|
+
Self-Consistency, Universal Self-Consistency, Best-of-N with reward models, and
|
|
24
|
+
LMCOR baselines.
|
|
25
|
+
|
|
26
|
+
'
|
|
27
|
+
capability_tags:
|
|
28
|
+
- research_code_optimization
|
|
29
|
+
- large_language_models
|
|
30
|
+
- answer_synthesis
|
|
31
|
+
- mathematical_reasoning
|
|
32
|
+
- table_qa
|
|
33
|
+
- chain_of_thought
|
|
34
|
+
- inference_scaling
|
|
35
|
+
aisb_direction: T3
|
|
36
|
+
track_fit:
|
|
37
|
+
- paper_track
|
|
38
|
+
- benchmark_track
|
|
39
|
+
task_mode: evaluation_driven
|
|
40
|
+
requires_execution: true
|
|
41
|
+
requires_paper: true
|
|
42
|
+
integrity_level: cas_plus_canary
|
|
43
|
+
snapshot_status: runnable
|
|
44
|
+
support_level: advanced
|
|
45
|
+
cost_band: high
|
|
46
|
+
time_band: 1d+
|
|
47
|
+
difficulty: hard
|
|
48
|
+
data_access: public
|
|
49
|
+
primary_outputs:
|
|
50
|
+
- accuracy_gsm8k
|
|
51
|
+
- accuracy_math500
|
|
52
|
+
- accuracy_wikitq
|
|
53
|
+
- rouge_l_fetaqa
|
|
54
|
+
- synthesized_answers
|
|
55
|
+
- vote_report
|
|
56
|
+
launch_profiles:
|
|
57
|
+
- id: quick_check
|
|
58
|
+
label: Quick Check
|
|
59
|
+
description: 'Run synthesis inference on a single dataset (e.g. MATH500 with pre-generated
|
|
60
|
+
candidate responses) and evaluate accuracy. Verifies that the vLLM serving and
|
|
61
|
+
evaluation pipeline work end-to-end.
|
|
62
|
+
|
|
63
|
+
'
|
|
64
|
+
- id: synthesis_eval
|
|
65
|
+
label: Full Synthesis Evaluation
|
|
66
|
+
description: 'Run the complete answer-synthesis and evaluation workflow across all
|
|
67
|
+
four benchmarks (GSM8k, MATH500, WikiTQ, FeTaQA) using the Synthesizer-8B model
|
|
68
|
+
or Llama3.1-70B as the synthesizer. Requires generating candidate responses from
|
|
69
|
+
each policy model first.
|
|
70
|
+
|
|
71
|
+
'
|
|
72
|
+
- id: data_pipeline
|
|
73
|
+
label: Data Generation Pipeline
|
|
74
|
+
description: 'Run the two-stage data generation pipeline (sampling.py → synthesizer.py
|
|
75
|
+
→ filter) to produce training data for Synthesizer-8B. Requires a 70B response
|
|
76
|
+
LLM and is compute-intensive.
|
|
77
|
+
|
|
78
|
+
'
|
|
79
|
+
dataset_download:
|
|
80
|
+
primary_method: mixed
|
|
81
|
+
sources:
|
|
82
|
+
- kind: bundled
|
|
83
|
+
url: null
|
|
84
|
+
access: public
|
|
85
|
+
note: 'Test sets for GSM8k, MATH500, WikiTQ, and FeTaQA are included in the data/
|
|
86
|
+
folder of the snapshot archive.
|
|
87
|
+
|
|
88
|
+
'
|
|
89
|
+
- kind: huggingface
|
|
90
|
+
url: https://huggingface.co/datasets/BoHanMint/Synthesizer-8B-math-train-data
|
|
91
|
+
access: public
|
|
92
|
+
note: 'Pre-generated synthesis training data (295k MATH, 87k WikiTQ). Only needed
|
|
93
|
+
if re-training Synthesizer-8B.
|
|
94
|
+
|
|
95
|
+
'
|
|
96
|
+
- kind: huggingface
|
|
97
|
+
url: https://huggingface.co/BoHanMint/Synthesizer-8B-math
|
|
98
|
+
access: public
|
|
99
|
+
note: 'Pre-trained Synthesizer-8B-math checkpoint. Required for inference unless
|
|
100
|
+
training from scratch.
|
|
101
|
+
|
|
102
|
+
'
|
|
103
|
+
- kind: github
|
|
104
|
+
url: https://github.com/hkust-nlp/dart-math
|
|
105
|
+
access: public
|
|
106
|
+
note: 'DART-Math evaluation library required for MATH500 exact-match scoring.
|
|
107
|
+
Must be pip-installed separately.
|
|
108
|
+
|
|
109
|
+
'
|
|
110
|
+
notes:
|
|
111
|
+
- Bundled test data is small (a few MB). Model checkpoints are ~16 GB for the 8B
|
|
112
|
+
model.
|
|
113
|
+
- If running baselines with Llama3.1-70B as synthesizer, ~140 GB of model weights
|
|
114
|
+
are needed.
|
|
115
|
+
credential_requirements:
|
|
116
|
+
mode: optional
|
|
117
|
+
items:
|
|
118
|
+
- Hugging Face token (if gated model access is needed for Llama3 weights)
|
|
119
|
+
- OpenAI API key (only if evaluating GPT-4o as a policy model)
|
|
120
|
+
- GLM-4-Plus API key (only if evaluating GLM-4-Plus as a policy model)
|
|
121
|
+
notes:
|
|
122
|
+
- Core evaluation with open-source models requires no credentials.
|
|
123
|
+
- API keys are only needed to reproduce the full paper results with API-based policy
|
|
124
|
+
models.
|
|
125
|
+
resources:
|
|
126
|
+
minimum:
|
|
127
|
+
cpu_cores: 16
|
|
128
|
+
ram_gb: 64
|
|
129
|
+
disk_gb: 150
|
|
130
|
+
gpu_count: 1
|
|
131
|
+
gpu_vram_gb: 24
|
|
132
|
+
recommended:
|
|
133
|
+
cpu_cores: 32
|
|
134
|
+
ram_gb: 128
|
|
135
|
+
disk_gb: 300
|
|
136
|
+
gpu_count: 2
|
|
137
|
+
gpu_vram_gb: 48
|
|
138
|
+
environment:
|
|
139
|
+
python: '3.11'
|
|
140
|
+
cuda: null
|
|
141
|
+
pytorch: null
|
|
142
|
+
flash_attn: null
|
|
143
|
+
key_packages:
|
|
144
|
+
- deepspeed==0.15.2
|
|
145
|
+
- vllm==0.5.3
|
|
146
|
+
- transformers==4.43.1
|
|
147
|
+
- rouge-score
|
|
148
|
+
- dart-math (pip install from GitHub)
|
|
149
|
+
notes:
|
|
150
|
+
- vLLM is the primary inference engine; synthesis_infer.py defaults to tensor_parallel_size=1
|
|
151
|
+
and max_model_len=8192.
|
|
152
|
+
- MATH evaluation requires dart-math installed via `pip install -e .` from the dart-math
|
|
153
|
+
repo clone.
|
|
154
|
+
- FeTaQA evaluation requires the rouge-score package.
|
|
155
|
+
- The code hardcodes CUDA_VISIBLE_DEVICES in several scripts; adjust for your GPU
|
|
156
|
+
topology.
|
|
157
|
+
- See requirements.txt in the snapshot for the full dependency set.
|
|
158
|
+
risk_flags:
|
|
159
|
+
- external_package_dependency
|
|
160
|
+
- large_model_weights
|
|
161
|
+
- hardcoded_gpu_ids
|
|
162
|
+
risk_notes:
|
|
163
|
+
- MATH evaluation depends on the external dart-math package (GitHub clone + pip install
|
|
164
|
+
-e). If unavailable, MATH500 scoring will fail.
|
|
165
|
+
- Scripts hardcode CUDA_VISIBLE_DEVICES (e.g. "0" in inference, "3" in eval). Must
|
|
166
|
+
be adjusted for multi-GPU or different hardware.
|
|
167
|
+
- Full paper reproduction requires serving 70B models for candidate generation baselines,
|
|
168
|
+
which needs ≥2×80GB GPUs.
|
|
169
|
+
- API-based policy model evaluation (GPT-4o, GLM-4-Plus) incurs real monetary cost
|
|
170
|
+
and requires API credentials.
|
|
171
|
+
- No runtime execution was performed during packaging; metric values are not yet validated.
|
|
172
|
+
recommended_when: 'Use this benchmark when you want an inference-heavy LLM evaluation
|
|
173
|
+
task focused on answer-synthesis strategies for mathematical reasoning and table
|
|
174
|
+
QA. Good fit for studying inference-time scaling, multi-response aggregation, and
|
|
175
|
+
CoT-based post-processing with 8B-class models. All evaluation data is bundled and
|
|
176
|
+
pre-trained checkpoints are publicly available on Hugging Face.
|
|
177
|
+
|
|
178
|
+
'
|
|
179
|
+
not_recommended_when: 'Do not use this if you cannot serve 8B-class models on GPU
|
|
180
|
+
(minimum 24 GB VRAM), if you need a benchmark without model-serving overhead, or
|
|
181
|
+
if you need fully self-contained evaluation without any external package dependencies
|
|
182
|
+
(dart-math is required for MATH scoring).
|
|
183
|
+
|
|
184
|
+
'
|
|
185
|
+
paper:
|
|
186
|
+
title: 'CoT-based Synthesizer: Enhancing LLM Performance through Answer Synthesis'
|
|
187
|
+
venue: ACL 2025
|
|
188
|
+
year: 2025
|
|
189
|
+
url: https://arxiv.org/abs/2501.01668
|
|
190
|
+
download:
|
|
191
|
+
url: https://github.com/ResearAI/DeepScientist/releases/download/aisb-v0.0.1/aisb.t3.018_cotsynth.zip
|
|
192
|
+
archive_type: zip
|
|
193
|
+
local_dir_name: paper-18-CoTSynth
|
|
194
|
+
provider: github_release
|
|
195
|
+
repo: ResearAI/DeepScientist
|
|
196
|
+
tag: aisb-v0.0.1
|
|
197
|
+
asset_name: aisb.t3.018_cotsynth.zip
|
|
198
|
+
sha256: 245a9c52e66e83cc77844c2375cf61a65cb6823fd84bdaf9142687559498a885
|
|
199
|
+
size_bytes: 925436
|
|
200
|
+
commercial:
|
|
201
|
+
annual_fee: null
|
|
202
|
+
display:
|
|
203
|
+
palette_seed: teal-gold-synthesis
|
|
204
|
+
art_style: benchmark-notebook
|
|
205
|
+
accent_priority: high
|
|
206
|
+
image_path: ../image/018_aisb.t3.018_cotsynth.jpg
|
|
@@ -0,0 +1,162 @@
|
|
|
1
|
+
schema_version: 1
|
|
2
|
+
id: aisb.t3.018_cotsynth
|
|
3
|
+
name: 'CoT合成器:通过答案合成提升大语言模型性能'
|
|
4
|
+
version: 0.1.0
|
|
5
|
+
one_line: '推理时答案合成基准测试:从8B规模策略模型生成多样化的CoT候选响应,然后通过训练好的Synthesizer-8B合成更优答案,在GSM8k、MATH500、WikiTQ和FeTaQA上评估准确率。
|
|
6
|
+
|
|
7
|
+
'
|
|
8
|
+
task_description: '该基准测试复现了CoT合成器论文(ACL 2025)。核心工作流程为:(1)使用策略模型(如Llama3-8B-Instruct)通过温度/top-p解码为每个问题采样N个多样化候选响应,(2)将候选响应输入合成器模型,执行基于CoT的分析与合成以产生精炼的最终答案,(3)在四个基准测试上评估——GSM8k(精确匹配准确率)、MATH500(通过DART-Math评估器的精确匹配准确率)、WikiTQ(精确匹配准确率)和FeTaQA(ROUGE-L召回率)。快照包含data/文件夹中的捆绑评估测试数据、使用vLLM的推理代码、评估脚本、微调流程以及用于创建合成训练数据的数据生成流程。预训练的Synthesizer-8B检查点及训练数据可在Hugging Face获取。该工作流程为推理密集型:每次评估需要通过vLLM服务8B模型并运行多样本生成。MATH评估还需从GitHub安装dart-math包。该基准测试与自洽性、通用自洽性、基于奖励模型的Best-of-N以及LMCOR基线进行比较。
|
|
9
|
+
|
|
10
|
+
'
|
|
11
|
+
capability_tags:
|
|
12
|
+
- research_code_optimization
|
|
13
|
+
- large_language_models
|
|
14
|
+
- answer_synthesis
|
|
15
|
+
- mathematical_reasoning
|
|
16
|
+
- table_qa
|
|
17
|
+
- chain_of_thought
|
|
18
|
+
- inference_scaling
|
|
19
|
+
aisb_direction: T3
|
|
20
|
+
track_fit:
|
|
21
|
+
- paper_track
|
|
22
|
+
- benchmark_track
|
|
23
|
+
task_mode: evaluation_driven
|
|
24
|
+
requires_execution: true
|
|
25
|
+
requires_paper: true
|
|
26
|
+
integrity_level: cas_plus_canary
|
|
27
|
+
snapshot_status: runnable
|
|
28
|
+
support_level: advanced
|
|
29
|
+
cost_band: high
|
|
30
|
+
time_band: 1d+
|
|
31
|
+
difficulty: hard
|
|
32
|
+
data_access: public
|
|
33
|
+
primary_outputs:
|
|
34
|
+
- accuracy_gsm8k
|
|
35
|
+
- accuracy_math500
|
|
36
|
+
- accuracy_wikitq
|
|
37
|
+
- rouge_l_fetaqa
|
|
38
|
+
- synthesized_answers
|
|
39
|
+
- vote_report
|
|
40
|
+
launch_profiles:
|
|
41
|
+
- id: quick_check
|
|
42
|
+
label: 快速检查
|
|
43
|
+
description: '在单个数据集(如使用预生成候选响应的MATH500)上运行合成推理并评估准确率,以验证vLLM服务与评估流程能够端到端正常运行。
|
|
44
|
+
|
|
45
|
+
'
|
|
46
|
+
- id: synthesis_eval
|
|
47
|
+
label: 完整合成评估
|
|
48
|
+
description: '使用Synthesizer-8B模型或Llama3.1-70B作为合成器,在所有四个基准测试(GSM8k、MATH500、WikiTQ、FeTaQA)上运行完整的答案合成与评估工作流程。需要先从每个策略模型生成候选响应。
|
|
49
|
+
|
|
50
|
+
'
|
|
51
|
+
- id: data_pipeline
|
|
52
|
+
label: 数据生成流程
|
|
53
|
+
description: '运行两阶段数据生成流程(sampling.py → synthesizer.py → filter)以生成Synthesizer-8B的训练数据。需要70B响应LLM且计算密集。
|
|
54
|
+
|
|
55
|
+
'
|
|
56
|
+
dataset_download:
|
|
57
|
+
primary_method: mixed
|
|
58
|
+
sources:
|
|
59
|
+
- kind: bundled
|
|
60
|
+
url: null
|
|
61
|
+
access: public
|
|
62
|
+
note: 'GSM8k、MATH500、WikiTQ和FeTaQA的测试集包含在快照压缩包的data/文件夹中。
|
|
63
|
+
|
|
64
|
+
'
|
|
65
|
+
- kind: huggingface
|
|
66
|
+
url: https://huggingface.co/datasets/BoHanMint/Synthesizer-8B-math-train-data
|
|
67
|
+
access: public
|
|
68
|
+
note: '预生成的合成训练数据(295k MATH、87k WikiTQ)。仅在重新训练Synthesizer-8B时需要。
|
|
69
|
+
|
|
70
|
+
'
|
|
71
|
+
- kind: huggingface
|
|
72
|
+
url: https://huggingface.co/BoHanMint/Synthesizer-8B-math
|
|
73
|
+
access: public
|
|
74
|
+
note: '预训练的Synthesizer-8B-math检查点。除非从头训练,否则推理时必需。
|
|
75
|
+
|
|
76
|
+
'
|
|
77
|
+
- kind: github
|
|
78
|
+
url: https://github.com/hkust-nlp/dart-math
|
|
79
|
+
access: public
|
|
80
|
+
note: 'MATH500精确匹配评分所需的DART-Math评估库。必须单独pip安装。
|
|
81
|
+
|
|
82
|
+
'
|
|
83
|
+
notes:
|
|
84
|
+
- 捆绑测试数据很小(仅几MB)。模型检查点约16 GB(8B模型)。
|
|
85
|
+
- 如使用Llama3.1-70B作为合成器运行基线,需要约140 GB的模型权重。
|
|
86
|
+
credential_requirements:
|
|
87
|
+
mode: optional
|
|
88
|
+
items:
|
|
89
|
+
- HuggingFace token(如需访问门控模型以获取Llama3权重)
|
|
90
|
+
- OpenAI API key(仅在评估GPT-4o作为策略模型时需要)
|
|
91
|
+
- GLM-4-Plus API key(仅在评估GLM-4-Plus作为策略模型时需要)
|
|
92
|
+
notes:
|
|
93
|
+
- 使用开源模型的核心评估无需凭据。
|
|
94
|
+
- 仅在复现基于API的策略模型的完整论文结果时需要API密钥。
|
|
95
|
+
resources:
|
|
96
|
+
minimum:
|
|
97
|
+
cpu_cores: 16
|
|
98
|
+
ram_gb: 64
|
|
99
|
+
disk_gb: 150
|
|
100
|
+
gpu_count: 1
|
|
101
|
+
gpu_vram_gb: 24
|
|
102
|
+
recommended:
|
|
103
|
+
cpu_cores: 32
|
|
104
|
+
ram_gb: 128
|
|
105
|
+
disk_gb: 300
|
|
106
|
+
gpu_count: 2
|
|
107
|
+
gpu_vram_gb: 48
|
|
108
|
+
environment:
|
|
109
|
+
python: '3.11'
|
|
110
|
+
cuda: null
|
|
111
|
+
pytorch: null
|
|
112
|
+
flash_attn: null
|
|
113
|
+
key_packages:
|
|
114
|
+
- deepspeed==0.15.2
|
|
115
|
+
- vllm==0.5.3
|
|
116
|
+
- transformers==4.43.1
|
|
117
|
+
- rouge-score
|
|
118
|
+
- dart-math (pip install from GitHub)
|
|
119
|
+
notes:
|
|
120
|
+
- vLLM是主要推理引擎;synthesis_infer.py默认tensor_parallel_size=1,max_model_len=8192。
|
|
121
|
+
- MATH评估需要先克隆dart-math仓库,再通过`pip install -e .`进行安装。
|
|
122
|
+
- FeTaQA评估需要rouge-score包。
|
|
123
|
+
- 代码在多个脚本中硬编码了CUDA_VISIBLE_DEVICES;请根据您的GPU拓扑进行调整。
|
|
124
|
+
- 快照中的requirements.txt包含完整的依赖列表。
|
|
125
|
+
risk_flags:
|
|
126
|
+
- external_package_dependency
|
|
127
|
+
- large_model_weights
|
|
128
|
+
- hardcoded_gpu_ids
|
|
129
|
+
risk_notes:
|
|
130
|
+
- MATH评估依赖外部dart-math包(GitHub克隆 + pip install -e)。如不可用,MATH500评分将失败。
|
|
131
|
+
- 脚本硬编码了CUDA_VISIBLE_DEVICES(如推理中为"0",评估中为"3")。在多GPU或不同硬件上必须调整。
|
|
132
|
+
- 完整论文复现需要服务70B模型以生成候选响应基线,需要≥2×80GB GPU。
|
|
133
|
+
- 基于API的策略模型评估(GPT-4o、GLM-4-Plus)会产生实际货币成本,需要API凭据。
|
|
134
|
+
- 打包过程中未执行运行时验证;指标值尚未确认。
|
|
135
|
+
recommended_when: '当您需要一个推理密集型的LLM评估任务,重点关注数学推理和表格问答的答案合成策略时使用此基准测试。非常适合研究推理时扩展、多响应聚合以及基于CoT的8B类模型后处理。所有评估数据已捆绑,预训练检查点可在Hugging Face公开获取。
|
|
136
|
+
|
|
137
|
+
'
|
|
138
|
+
not_recommended_when: '如果无法在GPU上服务8B类模型(最低24 GB显存)、需要无模型服务开销的基准测试,或需要完全自包含、不依赖任何外部包的评估(MATH评分必须使用dart-math),请勿使用此基准测试。
|
|
139
|
+
|
|
140
|
+
'
|
|
141
|
+
paper:
|
|
142
|
+
title: 'CoT-based Synthesizer: Enhancing LLM Performance through Answer Synthesis'
|
|
143
|
+
venue: ACL 2025
|
|
144
|
+
year: 2025
|
|
145
|
+
url: https://arxiv.org/abs/2501.01668
|
|
146
|
+
download:
|
|
147
|
+
url: https://github.com/ResearAI/DeepScientist/releases/download/aisb-v0.0.1/aisb.t3.018_cotsynth.zip
|
|
148
|
+
archive_type: zip
|
|
149
|
+
local_dir_name: paper-18-CoTSynth
|
|
150
|
+
provider: github_release
|
|
151
|
+
repo: ResearAI/DeepScientist
|
|
152
|
+
tag: aisb-v0.0.1
|
|
153
|
+
asset_name: aisb.t3.018_cotsynth.zip
|
|
154
|
+
sha256: 245a9c52e66e83cc77844c2375cf61a65cb6823fd84bdaf9142687559498a885
|
|
155
|
+
size_bytes: 925436
|
|
156
|
+
commercial:
|
|
157
|
+
annual_fee: null
|
|
158
|
+
display:
|
|
159
|
+
palette_seed: teal-gold-synthesis
|
|
160
|
+
art_style: benchmark-notebook
|
|
161
|
+
accent_priority: high
|
|
162
|
+
image_path: ../image/018_aisb.t3.018_cotsynth.jpg
|
|
@@ -0,0 +1,211 @@
|
|
|
1
|
+
schema_version: 1
|
|
2
|
+
id: aisb.t3.019_dyscaleut
|
|
3
|
+
name: Dynamic Scaling of Unit Tests for Code Reward Modeling
|
|
4
|
+
version: 0.1.0
|
|
5
|
+
one_line: 'Generate and dynamically scale LLM-produced unit tests as reward signals
|
|
6
|
+
for best-of-N code solution selection, evaluated via pass@1 on HumanEval Plus, MBPP
|
|
7
|
+
Plus, and LiveCodeBench.
|
|
8
|
+
|
|
9
|
+
'
|
|
10
|
+
task_description: 'This benchmark reproduces the CodeRM pipeline for improving code
|
|
11
|
+
generation through scaled unit-test reward modeling. The core workflow is: (1) a
|
|
12
|
+
policy LLM generates N candidate code solutions per programming problem, (2) a reward
|
|
13
|
+
LLM (or the fine-tuned CodeRM-8B unit test generator) produces M unit tests per
|
|
14
|
+
problem, (3) unit tests are executed against solutions inside a Docker sandbox,
|
|
15
|
+
and (4) the best solution is selected via variance-weighted majority voting over
|
|
16
|
+
execution results. A dynamic scaling mechanism allocates more unit tests to harder
|
|
17
|
+
problems using a trained difficulty classifier. The primary metric is pass@1 (best-of-N
|
|
18
|
+
accuracy) computed by evaluation/calculate_result.py. The local snapshot includes
|
|
19
|
+
the evaluation/execution code, bundled benchmark data, pre-computed inference results,
|
|
20
|
+
and a Docker image specification for sandboxed code execution. The inference and
|
|
21
|
+
preprocessing steps (generating new solutions/unit tests from scratch with policy/reward
|
|
22
|
+
LLMs) require external model access and are partially covered by bundled scripts
|
|
23
|
+
but may need adaptation. A pre-computed output.tar.gz from Google Drive can substitute
|
|
24
|
+
for the inference+execution steps.
|
|
25
|
+
|
|
26
|
+
'
|
|
27
|
+
capability_tags:
|
|
28
|
+
- research_code_optimization
|
|
29
|
+
- code_generation
|
|
30
|
+
- reward_modeling
|
|
31
|
+
- unit_test_generation
|
|
32
|
+
- evaluation
|
|
33
|
+
aisb_direction: T3
|
|
34
|
+
track_fit:
|
|
35
|
+
- paper_track
|
|
36
|
+
- benchmark_track
|
|
37
|
+
task_mode: experiment_driven
|
|
38
|
+
requires_execution: true
|
|
39
|
+
requires_paper: true
|
|
40
|
+
integrity_level: cas_plus_canary
|
|
41
|
+
snapshot_status: partial
|
|
42
|
+
support_level: advanced
|
|
43
|
+
cost_band: high
|
|
44
|
+
time_band: 1d+
|
|
45
|
+
difficulty: hard
|
|
46
|
+
data_access: public
|
|
47
|
+
primary_outputs:
|
|
48
|
+
- pass_at_1
|
|
49
|
+
- scaled_unit_tests
|
|
50
|
+
- reward_scores
|
|
51
|
+
launch_profiles:
|
|
52
|
+
- id: quick_check
|
|
53
|
+
label: Quick Check
|
|
54
|
+
description: 'Run evaluation/calculate_result.py on the bundled or downloaded pre-computed
|
|
55
|
+
execution results to verify pass@1 on a single benchmark/model combination. No
|
|
56
|
+
GPU or Docker required.
|
|
57
|
+
|
|
58
|
+
'
|
|
59
|
+
- id: code_reward_eval
|
|
60
|
+
label: Code Reward Eval
|
|
61
|
+
description: 'Pull the Docker sandbox image, execute unit tests against candidate
|
|
62
|
+
solutions using evaluation/evaluate.py, then compute pass@1 with evaluation/calculate_result.py.
|
|
63
|
+
Requires Docker and moderate compute. Uses bundled benchmark data and pre-generated
|
|
64
|
+
solutions/unit tests; does not require LLM inference.
|
|
65
|
+
|
|
66
|
+
'
|
|
67
|
+
- id: full_pipeline
|
|
68
|
+
label: Full Pipeline (Inference + Eval)
|
|
69
|
+
description: 'Run end-to-end: multi-process LLM inference to generate solutions
|
|
70
|
+
and unit tests, preprocessing/merging, Docker-based execution, and final pass@1
|
|
71
|
+
calculation. Requires GPU access for LLM inference (CodeRM-8B or larger models)
|
|
72
|
+
and Docker for execution.
|
|
73
|
+
|
|
74
|
+
'
|
|
75
|
+
dataset_download:
|
|
76
|
+
primary_method: mixed
|
|
77
|
+
sources:
|
|
78
|
+
- kind: huggingface
|
|
79
|
+
url: https://huggingface.co/datasets/KAKA22/CodeRM-UnitTest
|
|
80
|
+
access: public
|
|
81
|
+
note: 60k synthetic Python unit tests used to train CodeRM-8B.
|
|
82
|
+
- kind: huggingface
|
|
83
|
+
url: https://huggingface.co/KAKA22/CodeRM-8B
|
|
84
|
+
access: public
|
|
85
|
+
note: Fine-tuned 8B unit test generator model weights.
|
|
86
|
+
- kind: google_drive
|
|
87
|
+
url: https://drive.google.com/drive/folders/1-wUvy9Ox49V5CY38TMjCr5RlLysapyyj?usp=sharing
|
|
88
|
+
access: public
|
|
89
|
+
note: 'Pre-computed execution output (output.tar.gz) that can replace Steps 1-3
|
|
90
|
+
of the pipeline.
|
|
91
|
+
|
|
92
|
+
'
|
|
93
|
+
- kind: bundled
|
|
94
|
+
url: null
|
|
95
|
+
access: local
|
|
96
|
+
note: 'Benchmark data (HumanEval Plus, MBPP Plus, LiveCodeBench) and pre-generated
|
|
97
|
+
inference results are included under data/benchmark/ and data/result/ in the
|
|
98
|
+
snapshot.
|
|
99
|
+
|
|
100
|
+
'
|
|
101
|
+
notes:
|
|
102
|
+
- Training dataset is ~60k examples; model weights are ~16 GB; pre-computed outputs
|
|
103
|
+
vary by benchmark.
|
|
104
|
+
- Benchmarks themselves (HumanEval Plus, MBPP Plus, LiveCodeBench) are public.
|
|
105
|
+
credential_requirements:
|
|
106
|
+
mode: none
|
|
107
|
+
items: []
|
|
108
|
+
notes:
|
|
109
|
+
- No API keys required if using bundled data and CodeRM-8B for inference.
|
|
110
|
+
- If replicating GPT-4o-mini or GPT-3.5 policy/reward experiments, OpenAI API keys
|
|
111
|
+
are needed.
|
|
112
|
+
resources:
|
|
113
|
+
minimum:
|
|
114
|
+
cpu_cores: 16
|
|
115
|
+
ram_gb: 64
|
|
116
|
+
disk_gb: 150
|
|
117
|
+
gpu_count: 1
|
|
118
|
+
gpu_vram_gb: 24
|
|
119
|
+
recommended:
|
|
120
|
+
cpu_cores: 32
|
|
121
|
+
ram_gb: 128
|
|
122
|
+
disk_gb: 300
|
|
123
|
+
gpu_count: 2
|
|
124
|
+
gpu_vram_gb: 48
|
|
125
|
+
environment:
|
|
126
|
+
python: null
|
|
127
|
+
cuda: null
|
|
128
|
+
pytorch: null
|
|
129
|
+
flash_attn: null
|
|
130
|
+
key_packages:
|
|
131
|
+
- vllm
|
|
132
|
+
- transformers
|
|
133
|
+
- docker
|
|
134
|
+
notes:
|
|
135
|
+
- 'The snapshot includes a dedicated Docker execution environment (kaka0605/exec_unit_test:24.12.30)
|
|
136
|
+
for sandboxed large-scale code execution of generated unit tests.
|
|
137
|
+
|
|
138
|
+
'
|
|
139
|
+
- See docker_source/Dockerfile and docker_source/requirements.txt for the sandbox
|
|
140
|
+
dependencies.
|
|
141
|
+
- See bundled README and requirements for the host-side dependency set.
|
|
142
|
+
- Inference uses multi-process Python (inference/inference_mp.py) and likely requires
|
|
143
|
+
vLLM or similar for efficient serving.
|
|
144
|
+
risk_flags:
|
|
145
|
+
- docker_required
|
|
146
|
+
- partial_snapshot
|
|
147
|
+
- external_model_for_full_replication
|
|
148
|
+
- code_execution_sandbox
|
|
149
|
+
risk_notes:
|
|
150
|
+
- 'The Docker sandbox (kaka0605/exec_unit_test:24.12.30) must be pulled or built locally
|
|
151
|
+
before running evaluation/evaluate.py. Without Docker, only the final calculate_result.py
|
|
152
|
+
step works on pre-computed outputs.
|
|
153
|
+
|
|
154
|
+
'
|
|
155
|
+
- 'The preprocessing scripts (preprocess/) are present but the full inference pipeline
|
|
156
|
+
requires serving a policy LLM and a reward LLM, which is not fully automated in
|
|
157
|
+
the snapshot.
|
|
158
|
+
|
|
159
|
+
'
|
|
160
|
+
- 'exec_main.py executes arbitrary generated Python code with a timeout mechanism
|
|
161
|
+
via signal.SIGALRM; this should only be run inside the provided Docker sandbox or
|
|
162
|
+
an equivalent isolated environment.
|
|
163
|
+
|
|
164
|
+
'
|
|
165
|
+
- 'No benchmark execution was performed during the packaging pass; metric values are
|
|
166
|
+
not yet validated.
|
|
167
|
+
|
|
168
|
+
'
|
|
169
|
+
recommended_when: 'Use this benchmark when you want to study how scaling LLM-generated
|
|
170
|
+
unit tests improves code reward signal quality and best-of-N code selection, or
|
|
171
|
+
when you need a code-generation task with real unit-test execution in the evaluation
|
|
172
|
+
loop. Also suitable for evaluating lightweight unit test generators against larger
|
|
173
|
+
teacher models.
|
|
174
|
+
|
|
175
|
+
'
|
|
176
|
+
not_recommended_when: 'Do not use this if you cannot provide Docker-based containerized
|
|
177
|
+
code execution, if you need a text-only reward-model benchmark without code execution,
|
|
178
|
+
or if you lack GPU resources for LLM inference. If you only need a quick metric check,
|
|
179
|
+
use the quick_check profile with pre-computed outputs instead.
|
|
180
|
+
|
|
181
|
+
'
|
|
182
|
+
paper:
|
|
183
|
+
title: Dynamic Scaling of Unit Tests for Code Reward Modeling
|
|
184
|
+
authors:
|
|
185
|
+
- Zeyao Ma
|
|
186
|
+
- Xiaokang Zhang
|
|
187
|
+
- Jing Zhang
|
|
188
|
+
- Jifan Yu
|
|
189
|
+
- Sijia Luo
|
|
190
|
+
- Jie Tang
|
|
191
|
+
venue: ACL 2025
|
|
192
|
+
year: 2025
|
|
193
|
+
url: https://arxiv.org/abs/2501.01054
|
|
194
|
+
homepage: https://code-reward-model.github.io/
|
|
195
|
+
download:
|
|
196
|
+
url: https://github.com/ResearAI/DeepScientist/releases/download/aisb-v0.0.1/aisb.t3.019_dyscaleut.zip
|
|
197
|
+
archive_type: zip
|
|
198
|
+
local_dir_name: paper-19-DyScaleUT
|
|
199
|
+
provider: github_release
|
|
200
|
+
repo: ResearAI/DeepScientist
|
|
201
|
+
tag: aisb-v0.0.1
|
|
202
|
+
asset_name: aisb.t3.019_dyscaleut.zip
|
|
203
|
+
sha256: 2eee5573353ade5e13c254f7372a3294b71459ee7c668205f27f2852347c141f
|
|
204
|
+
size_bytes: 60766
|
|
205
|
+
commercial:
|
|
206
|
+
annual_fee: null
|
|
207
|
+
display:
|
|
208
|
+
palette_seed: olive-ink-runtime
|
|
209
|
+
art_style: code-lab
|
|
210
|
+
accent_priority: high
|
|
211
|
+
image_path: ../image/019_aisb.t3.019_dyscaleut.jpg
|