@researai/deepscientist 1.5.17 → 1.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/AGENTS.md +309 -130
- package/AISB/catalog/aisb.b1.agentic_coding.yaml +244 -0
- package/AISB/catalog/aisb.b10.climate_earth.yaml +235 -0
- package/AISB/catalog/aisb.b11.model_efficiency.yaml +231 -0
- package/AISB/catalog/aisb.b12.embodied_ai.yaml +238 -0
- package/AISB/catalog/aisb.b2.agent_systems.yaml +229 -0
- package/AISB/catalog/aisb.b3.self_evolving_rl.yaml +237 -0
- package/AISB/catalog/aisb.b4.lm_reasoning.yaml +240 -0
- package/AISB/catalog/aisb.b5.math_proof.yaml +235 -0
- package/AISB/catalog/aisb.b6.research_process.yaml +243 -0
- package/AISB/catalog/aisb.b7.multimodal_fusion.yaml +232 -0
- package/AISB/catalog/aisb.b8.lifesci_drug.yaml +275 -0
- package/AISB/catalog/aisb.b9.material_science.yaml +237 -0
- package/AISB/catalog/aisb.t3.001_savvy.yaml +159 -0
- package/AISB/catalog/aisb.t3.001_savvy.zh.yaml +121 -0
- package/AISB/catalog/aisb.t3.002_pinet.yaml +189 -0
- package/AISB/catalog/aisb.t3.002_pinet.zh.yaml +130 -0
- package/AISB/catalog/aisb.t3.004_decentralattn.yaml +184 -0
- package/AISB/catalog/aisb.t3.004_decentralattn.zh.yaml +153 -0
- package/AISB/catalog/aisb.t3.005_tsae.yaml +193 -0
- package/AISB/catalog/aisb.t3.005_tsae.zh.yaml +139 -0
- package/AISB/catalog/aisb.t3.006_physense.yaml +194 -0
- package/AISB/catalog/aisb.t3.006_physense.zh.yaml +118 -0
- package/AISB/catalog/aisb.t3.007_reasoningiqa.yaml +169 -0
- package/AISB/catalog/aisb.t3.007_reasoningiqa.zh.yaml +133 -0
- package/AISB/catalog/aisb.t3.008_meanflows.yaml +188 -0
- package/AISB/catalog/aisb.t3.008_meanflows.zh.yaml +140 -0
- package/AISB/catalog/aisb.t3.009_scoremissing.yaml +179 -0
- package/AISB/catalog/aisb.t3.009_scoremissing.zh.yaml +119 -0
- package/AISB/catalog/aisb.t3.010_suitabilityfilter.yaml +221 -0
- package/AISB/catalog/aisb.t3.010_suitabilityfilter.zh.yaml +141 -0
- package/AISB/catalog/aisb.t3.011_osd.yaml +206 -0
- package/AISB/catalog/aisb.t3.011_osd.zh.yaml +163 -0
- package/AISB/catalog/aisb.t3.012_efficientqat.yaml +206 -0
- package/AISB/catalog/aisb.t3.012_efficientqat.zh.yaml +159 -0
- package/AISB/catalog/aisb.t3.013_appl.yaml +152 -0
- package/AISB/catalog/aisb.t3.013_appl.zh.yaml +126 -0
- package/AISB/catalog/aisb.t3.014_piguard.yaml +207 -0
- package/AISB/catalog/aisb.t3.014_piguard.zh.yaml +164 -0
- package/AISB/catalog/aisb.t3.015_frspec.yaml +209 -0
- package/AISB/catalog/aisb.t3.015_frspec.zh.yaml +163 -0
- package/AISB/catalog/aisb.t3.016_mathfusion.yaml +166 -0
- package/AISB/catalog/aisb.t3.016_mathfusion.zh.yaml +145 -0
- package/AISB/catalog/aisb.t3.017_multimodalglp.yaml +171 -0
- package/AISB/catalog/aisb.t3.017_multimodalglp.zh.yaml +122 -0
- package/AISB/catalog/aisb.t3.018_cotsynth.yaml +206 -0
- package/AISB/catalog/aisb.t3.018_cotsynth.zh.yaml +162 -0
- package/AISB/catalog/aisb.t3.019_dyscaleut.yaml +211 -0
- package/AISB/catalog/aisb.t3.019_dyscaleut.zh.yaml +148 -0
- package/AISB/catalog/aisb.t3.020_aristotle.yaml +173 -0
- package/AISB/catalog/aisb.t3.020_aristotle.zh.yaml +119 -0
- package/AISB/catalog/aisb.t3.021_tokenrecycling.yaml +160 -0
- package/AISB/catalog/aisb.t3.021_tokenrecycling.zh.yaml +129 -0
- package/AISB/catalog/aisb.t3.022_chainofreasoning.yaml +204 -0
- package/AISB/catalog/aisb.t3.022_chainofreasoning.zh.yaml +161 -0
- package/AISB/catalog/aisb.t3.023_guidedembed.yaml +211 -0
- package/AISB/catalog/aisb.t3.023_guidedembed.zh.yaml +189 -0
- package/AISB/catalog/aisb.t3.024_outputcentric.yaml +148 -0
- package/AISB/catalog/aisb.t3.024_outputcentric.zh.yaml +131 -0
- package/AISB/catalog/aisb.t3.025_deeper.yaml +143 -0
- package/AISB/catalog/aisb.t3.025_deeper.zh.yaml +116 -0
- package/AISB/catalog/aisb.t3.026_gartkg.yaml +195 -0
- package/AISB/catalog/aisb.t3.026_gartkg.zh.yaml +127 -0
- package/AISB/catalog/aisb.t3.027_citeeval.yaml +182 -0
- package/AISB/catalog/aisb.t3.027_citeeval.zh.yaml +135 -0
- package/AISB/catalog/aisb.t3.028_sbam.yaml +206 -0
- package/AISB/catalog/aisb.t3.028_sbam.zh.yaml +166 -0
- package/AISB/catalog/aisb.t3.029_cdqgeoembed.yaml +224 -0
- package/AISB/catalog/aisb.t3.029_cdqgeoembed.zh.yaml +142 -0
- package/AISB/catalog/aisb.t3.030_processrm.yaml +211 -0
- package/AISB/catalog/aisb.t3.030_processrm.zh.yaml +166 -0
- package/AISB/catalog/aisb.t3.031_circuitstability.yaml +172 -0
- package/AISB/catalog/aisb.t3.031_circuitstability.zh.yaml +134 -0
- package/AISB/catalog/aisb.t3.032_ptsolver.yaml +169 -0
- package/AISB/catalog/aisb.t3.032_ptsolver.zh.yaml +135 -0
- package/AISB/catalog/aisb.t3.033_gcse.yaml +144 -0
- package/AISB/catalog/aisb.t3.033_gcse.zh.yaml +126 -0
- package/AISB/catalog/aisb.t3.034_ensemblewm.yaml +183 -0
- package/AISB/catalog/aisb.t3.034_ensemblewm.zh.yaml +146 -0
- package/AISB/catalog/aisb.t3.035_moralvalueswa.yaml +207 -0
- package/AISB/catalog/aisb.t3.035_moralvalueswa.zh.yaml +165 -0
- package/AISB/catalog/aisb.t3.036_weakstrongpref.yaml +210 -0
- package/AISB/catalog/aisb.t3.036_weakstrongpref.zh.yaml +194 -0
- package/AISB/catalog/aisb.t3.037_dementiamask.yaml +172 -0
- package/AISB/catalog/aisb.t3.037_dementiamask.zh.yaml +132 -0
- package/AISB/catalog/aisb.t3.038_tinysam.yaml +284 -0
- package/AISB/catalog/aisb.t3.038_tinysam.zh.yaml +240 -0
- package/AISB/catalog/aisb.t3.039_calf.yaml +224 -0
- package/AISB/catalog/aisb.t3.039_calf.zh.yaml +194 -0
- package/AISB/catalog/aisb.t3.040_graniteguardian.yaml +199 -0
- package/AISB/catalog/aisb.t3.040_graniteguardian.zh.yaml +174 -0
- package/AISB/catalog/aisb.t3.041_amdm.yaml +149 -0
- package/AISB/catalog/aisb.t3.041_amdm.zh.yaml +137 -0
- package/AISB/catalog/aisb.t3.042_xpatch.yaml +216 -0
- package/AISB/catalog/aisb.t3.042_xpatch.zh.yaml +182 -0
- package/AISB/catalog/aisb.t3.043_vhm.yaml +268 -0
- package/AISB/catalog/aisb.t3.043_vhm.zh.yaml +193 -0
- package/AISB/catalog/aisb.t3.044_rgvi.yaml +224 -0
- package/AISB/catalog/aisb.t3.044_rgvi.zh.yaml +176 -0
- package/AISB/catalog/aisb.t3.045_pslstm.yaml +203 -0
- package/AISB/catalog/aisb.t3.045_pslstm.zh.yaml +179 -0
- package/AISB/catalog/aisb.t3.046_nonstatts.yaml +208 -0
- package/AISB/catalog/aisb.t3.046_nonstatts.zh.yaml +194 -0
- package/AISB/catalog/aisb.t3.047_timepfn.yaml +156 -0
- package/AISB/catalog/aisb.t3.047_timepfn.zh.yaml +124 -0
- package/AISB/catalog/aisb.t3.048_proxyspex.yaml +148 -0
- package/AISB/catalog/aisb.t3.048_proxyspex.zh.yaml +125 -0
- package/AISB/catalog/aisb.t3.049_hogwildinference.yaml +183 -0
- package/AISB/catalog/aisb.t3.049_hogwildinference.zh.yaml +138 -0
- package/AISB/catalog/aisb.t3.050_causalpfn.yaml +214 -0
- package/AISB/catalog/aisb.t3.050_causalpfn.zh.yaml +190 -0
- package/AISB/catalog/aisb.t3.051_flashtp.yaml +169 -0
- package/AISB/catalog/aisb.t3.051_flashtp.zh.yaml +124 -0
- package/AISB/catalog/aisb.t3.052_nsdiff.yaml +155 -0
- package/AISB/catalog/aisb.t3.052_nsdiff.zh.yaml +138 -0
- package/AISB/catalog/aisb.t3.053_k2vae.yaml +158 -0
- package/AISB/catalog/aisb.t3.053_k2vae.zh.yaml +132 -0
- package/AISB/catalog/aisb.t3.054_timebase.yaml +178 -0
- package/AISB/catalog/aisb.t3.054_timebase.zh.yaml +158 -0
- package/AISB/catalog/aisb.t3.055_csbrain.yaml +238 -0
- package/AISB/catalog/aisb.t3.055_csbrain.zh.yaml +184 -0
- package/AISB/catalog/aisb.t3.056_infosam.yaml +224 -0
- package/AISB/catalog/aisb.t3.056_infosam.zh.yaml +189 -0
- package/AISB/catalog/aisb.t3.057_mdreid.yaml +129 -0
- package/AISB/catalog/aisb.t3.057_mdreid.zh.yaml +117 -0
- package/AISB/catalog/aisb.t3.058_mindglitch.yaml +171 -0
- package/AISB/catalog/aisb.t3.058_mindglitch.zh.yaml +145 -0
- package/AISB/catalog/aisb.t3.059_selfsupervised.yaml +154 -0
- package/AISB/catalog/aisb.t3.059_selfsupervised.zh.yaml +125 -0
- package/AISB/catalog/aisb.t3.060_iaggad.yaml +121 -0
- package/AISB/catalog/aisb.t3.060_iaggad.zh.yaml +100 -0
- package/AISB/catalog/aisb.t3.061_hsgkn.yaml +136 -0
- package/AISB/catalog/aisb.t3.061_hsgkn.zh.yaml +113 -0
- package/AISB/catalog/aisb.t3.062_visionts.yaml +237 -0
- package/AISB/catalog/aisb.t3.062_visionts.zh.yaml +216 -0
- package/AISB/catalog/aisb.t3.063_tsrag.yaml +162 -0
- package/AISB/catalog/aisb.t3.063_tsrag.zh.yaml +138 -0
- package/AISB/catalog/aisb.t3.064_pir.yaml +221 -0
- package/AISB/catalog/aisb.t3.064_pir.zh.yaml +197 -0
- package/AISB/catalog/aisb.t3.065_proteinbinding.yaml +234 -0
- package/AISB/catalog/aisb.t3.065_proteinbinding.zh.yaml +167 -0
- package/AISB/catalog/aisb.t3.066_tropicalattention.yaml +267 -0
- package/AISB/catalog/aisb.t3.066_tropicalattention.zh.yaml +229 -0
- package/AISB/catalog/aisb.t3.067_kanad.yaml +193 -0
- package/AISB/catalog/aisb.t3.067_kanad.zh.yaml +167 -0
- package/AISB/catalog/aisb.t3.068_sempo.yaml +187 -0
- package/AISB/catalog/aisb.t3.068_sempo.zh.yaml +148 -0
- package/AISB/catalog/aisb.t3.069_treehfd.yaml +129 -0
- package/AISB/catalog/aisb.t3.069_treehfd.zh.yaml +111 -0
- package/AISB/catalog/aisb.t3.070_certifiedunlearning.yaml +224 -0
- package/AISB/catalog/aisb.t3.070_certifiedunlearning.zh.yaml +171 -0
- package/AISB/catalog/aisb.t3.071_neuralmjd.yaml +142 -0
- package/AISB/catalog/aisb.t3.071_neuralmjd.zh.yaml +120 -0
- package/AISB/catalog/aisb.t3.072_fedgmt.yaml +181 -0
- package/AISB/catalog/aisb.t3.072_fedgmt.zh.yaml +158 -0
- package/AISB/catalog/aisb.t3.073_rld.yaml +161 -0
- package/AISB/catalog/aisb.t3.073_rld.zh.yaml +129 -0
- package/AISB/catalog/aisb.t3.074_lsvi.yaml +163 -0
- package/AISB/catalog/aisb.t3.074_lsvi.zh.yaml +129 -0
- package/AISB/catalog/aisb.t3.075_treeslicedentropy.yaml +201 -0
- package/AISB/catalog/aisb.t3.075_treeslicedentropy.zh.yaml +148 -0
- package/AISB/catalog/aisb.t3.076_aanet.yaml +169 -0
- package/AISB/catalog/aisb.t3.076_aanet.zh.yaml +129 -0
- package/AISB/catalog/aisb.t3.077_cmnn.yaml +199 -0
- package/AISB/catalog/aisb.t3.077_cmnn.zh.yaml +165 -0
- package/AISB/catalog/aisb.t3.078_conformalanomaly.yaml +146 -0
- package/AISB/catalog/aisb.t3.078_conformalanomaly.zh.yaml +117 -0
- package/AISB/catalog/aisb.t3.079_dpfkmeans.yaml +131 -0
- package/AISB/catalog/aisb.t3.079_dpfkmeans.zh.yaml +104 -0
- package/AISB/catalog/aisb.t3.080_latentscorereweight.yaml +169 -0
- package/AISB/catalog/aisb.t3.080_latentscorereweight.zh.yaml +123 -0
- package/AISB/catalog/aisb.t3.081_qmamba.yaml +150 -0
- package/AISB/catalog/aisb.t3.081_qmamba.zh.yaml +117 -0
- package/AISB/catalog/aisb.t3.082_onlinellmrouting.yaml +160 -0
- package/AISB/catalog/aisb.t3.082_onlinellmrouting.zh.yaml +133 -0
- package/AISB/catalog/aisb.t3.083_starformer.yaml +178 -0
- package/AISB/catalog/aisb.t3.083_starformer.zh.yaml +140 -0
- package/AISB/catalog/aisb.t3.084_ift.yaml +139 -0
- package/AISB/catalog/aisb.t3.084_ift.zh.yaml +111 -0
- package/AISB/catalog/aisb.t3.085_neuralsurv.yaml +183 -0
- package/AISB/catalog/aisb.t3.085_neuralsurv.zh.yaml +143 -0
- package/AISB/catalog/aisb.t3.086_stella.yaml +197 -0
- package/AISB/catalog/aisb.t3.086_stella.zh.yaml +142 -0
- package/AISB/catalog/aisb.t3.087_moses.yaml +167 -0
- package/AISB/catalog/aisb.t3.087_moses.zh.yaml +132 -0
- package/AISB/catalog/aisb.t3.088_channelnorm.yaml +140 -0
- package/AISB/catalog/aisb.t3.088_channelnorm.zh.yaml +109 -0
- package/AISB/catalog/aisb.t3.089_causalvelocity.yaml +730 -0
- package/AISB/catalog/aisb.t3.089_causalvelocity.zh.yaml +668 -0
- package/AISB/catalog/aisb.t3.090_rstib.yaml +144 -0
- package/AISB/catalog/aisb.t3.090_rstib.zh.yaml +109 -0
- package/AISB/catalog/aisb.t3.091_timeawarecausal.yaml +132 -0
- package/AISB/catalog/aisb.t3.091_timeawarecausal.zh.yaml +107 -0
- package/AISB/catalog/aisb.t3.092_kmeanslocalopt.yaml +138 -0
- package/AISB/catalog/aisb.t3.092_kmeanslocalopt.zh.yaml +110 -0
- package/AISB/catalog/aisb.t3.093_fedwmsam.yaml +134 -0
- package/AISB/catalog/aisb.t3.093_fedwmsam.zh.yaml +106 -0
- package/AISB/catalog/aisb.t3.094_boundre.yaml +147 -0
- package/AISB/catalog/aisb.t3.094_boundre.zh.yaml +114 -0
- package/AISB/catalog/aisb.t3.095_fastfeaturecp.yaml +153 -0
- package/AISB/catalog/aisb.t3.095_fastfeaturecp.zh.yaml +118 -0
- package/AISB/catalog/aisb.t3.096_m3svm.yaml +189 -0
- package/AISB/catalog/aisb.t3.096_m3svm.zh.yaml +149 -0
- package/AISB/catalog/aisb.t3.097_wassersteintl.yaml +212 -0
- package/AISB/catalog/aisb.t3.097_wassersteintl.zh.yaml +169 -0
- package/AISB/catalog/aisb.t3.098_xmahalanobis.yaml +171 -0
- package/AISB/catalog/aisb.t3.098_xmahalanobis.zh.yaml +127 -0
- package/AISB/catalog/aisb.t3.099_ollalanding.yaml +248 -0
- package/AISB/catalog/aisb.t3.099_ollalanding.zh.yaml +182 -0
- package/AISB/catalog/aisb.t3.100_invmissingdata.yaml +179 -0
- package/AISB/catalog/aisb.t3.100_invmissingdata.zh.yaml +150 -0
- package/AISB/catalog/aisb.t3.101_acia.yaml +164 -0
- package/AISB/catalog/aisb.t3.101_acia.zh.yaml +109 -0
- package/AISB/catalog/aisb.t3.102_stochasticff.yaml +178 -0
- package/AISB/catalog/aisb.t3.102_stochasticff.zh.yaml +130 -0
- package/AISB/catalog/aisb.t3.103_qdcp.yaml +150 -0
- package/AISB/catalog/aisb.t3.103_qdcp.zh.yaml +116 -0
- package/AISB/catalog/aisb.t3.104_balancedactiveinf.yaml +137 -0
- package/AISB/catalog/aisb.t3.104_balancedactiveinf.zh.yaml +104 -0
- package/AISB/catalog/aisb.t3.105_binaryclasseval.yaml +161 -0
- package/AISB/catalog/aisb.t3.105_binaryclasseval.zh.yaml +130 -0
- package/AISB/image/001_aisb.t3.001_savvy.jpg +0 -0
- package/AISB/image/002_aisb.t3.002_pinet.jpg +0 -0
- package/AISB/image/003_aisb.t3.003_dmsqd.jpg +0 -0
- package/AISB/image/004_aisb.t3.004_decentralattn.jpg +0 -0
- package/AISB/image/005_aisb.t3.005_tsae.jpg +0 -0
- package/AISB/image/006_aisb.t3.006_physense.jpg +0 -0
- package/AISB/image/007_aisb.t3.007_reasoningiqa.jpg +0 -0
- package/AISB/image/008_aisb.t3.008_meanflows.jpg +0 -0
- package/AISB/image/009_aisb.t3.009_scoremissing.jpg +0 -0
- package/AISB/image/010_aisb.t3.010_suitabilityfilter.jpg +0 -0
- package/AISB/image/011_aisb.t3.011_osd.jpg +0 -0
- package/AISB/image/012_aisb.t3.012_efficientqat.jpg +0 -0
- package/AISB/image/013_aisb.t3.013_appl.jpg +0 -0
- package/AISB/image/014_aisb.t3.014_piguard.jpg +0 -0
- package/AISB/image/015_aisb.t3.015_frspec.jpg +0 -0
- package/AISB/image/016_aisb.t3.016_mathfusion.jpg +0 -0
- package/AISB/image/017_aisb.t3.017_multimodalglp.jpg +0 -0
- package/AISB/image/018_aisb.t3.018_cotsynth.jpg +0 -0
- package/AISB/image/019_aisb.t3.019_dyscaleut.jpg +0 -0
- package/AISB/image/020_aisb.t3.020_aristotle.jpg +0 -0
- package/AISB/image/021_aisb.t3.021_tokenrecycling.jpg +0 -0
- package/AISB/image/022_aisb.t3.022_chainofreasoning.jpg +0 -0
- package/AISB/image/023_aisb.t3.023_guidedembed.jpg +0 -0
- package/AISB/image/024_aisb.t3.024_outputcentric.jpg +0 -0
- package/AISB/image/025_aisb.t3.025_deeper.jpg +0 -0
- package/AISB/image/026_aisb.t3.026_gartkg.jpg +0 -0
- package/AISB/image/027_aisb.t3.027_citeeval.jpg +0 -0
- package/AISB/image/028_aisb.t3.028_sbam.jpg +0 -0
- package/AISB/image/029_aisb.t3.029_cdqgeoembed.jpg +0 -0
- package/AISB/image/030_aisb.t3.030_processrm.jpg +0 -0
- package/AISB/image/031_aisb.t3.031_circuitstability.jpg +0 -0
- package/AISB/image/032_aisb.t3.032_ptsolver.jpg +0 -0
- package/AISB/image/033_aisb.t3.033_gcse.jpg +0 -0
- package/AISB/image/034_aisb.t3.034_ensemblewm.jpg +0 -0
- package/AISB/image/035_aisb.t3.035_moralvalueswa.jpg +0 -0
- package/AISB/image/036_aisb.t3.036_weakstrongpref.jpg +0 -0
- package/AISB/image/037_aisb.t3.037_dementiamask.jpg +0 -0
- package/AISB/image/038_aisb.t3.038_tinysam.jpg +0 -0
- package/AISB/image/039_aisb.t3.039_calf.jpg +0 -0
- package/AISB/image/040_aisb.t3.040_graniteguardian.jpg +0 -0
- package/AISB/image/041_aisb.t3.041_amdm.jpg +0 -0
- package/AISB/image/042_aisb.t3.042_xpatch.jpg +0 -0
- package/AISB/image/043_aisb.t3.043_vhm.jpg +0 -0
- package/AISB/image/044_aisb.t3.044_rgvi.jpg +0 -0
- package/AISB/image/045_aisb.t3.045_pslstm.jpg +0 -0
- package/AISB/image/046_aisb.t3.046_nonstatts.jpg +0 -0
- package/AISB/image/047_aisb.t3.047_timepfn.jpg +0 -0
- package/AISB/image/048_aisb.t3.048_proxyspex.jpg +0 -0
- package/AISB/image/049_aisb.t3.049_hogwildinference.jpg +0 -0
- package/AISB/image/050_aisb.t3.050_causalpfn.jpg +0 -0
- package/AISB/image/051_aisb.t3.051_flashtp.jpg +0 -0
- package/AISB/image/052_aisb.t3.052_nsdiff.jpg +0 -0
- package/AISB/image/053_aisb.t3.053_k2vae.jpg +0 -0
- package/AISB/image/054_aisb.t3.054_timebase.jpg +0 -0
- package/AISB/image/055_aisb.t3.055_csbrain.jpg +0 -0
- package/AISB/image/056_aisb.t3.056_infosam.jpg +0 -0
- package/AISB/image/057_aisb.t3.057_mdreid.jpg +0 -0
- package/AISB/image/058_aisb.t3.058_mindglitch.jpg +0 -0
- package/AISB/image/059_aisb.t3.059_selfsupervised.jpg +0 -0
- package/AISB/image/060_aisb.t3.060_iaggad.jpg +0 -0
- package/AISB/image/061_aisb.t3.061_hsgkn.jpg +0 -0
- package/AISB/image/062_aisb.t3.062_visionts.jpg +0 -0
- package/AISB/image/063_aisb.t3.063_tsrag.jpg +0 -0
- package/AISB/image/064_aisb.t3.064_pir.jpg +0 -0
- package/AISB/image/065_aisb.t3.065_proteinbinding.jpg +0 -0
- package/AISB/image/066_aisb.t3.066_tropicalattention.jpg +0 -0
- package/AISB/image/067_aisb.t3.067_kanad.jpg +0 -0
- package/AISB/image/068_aisb.t3.068_sempo.jpg +0 -0
- package/AISB/image/069_aisb.t3.069_treehfd.jpg +0 -0
- package/AISB/image/070_aisb.t3.070_certifiedunlearning.jpg +0 -0
- package/AISB/image/071_aisb.t3.071_neuralmjd.jpg +0 -0
- package/AISB/image/072_aisb.t3.072_fedgmt.jpg +0 -0
- package/AISB/image/073_aisb.t3.073_rld.jpg +0 -0
- package/AISB/image/074_aisb.t3.074_lsvi.jpg +0 -0
- package/AISB/image/075_aisb.t3.075_treeslicedentropy.jpg +0 -0
- package/AISB/image/076_aisb.t3.076_aanet.jpg +0 -0
- package/AISB/image/077_aisb.t3.077_cmnn.jpg +0 -0
- package/AISB/image/078_aisb.t3.078_conformalanomaly.jpg +0 -0
- package/AISB/image/079_aisb.t3.079_dpfkmeans.jpg +0 -0
- package/AISB/image/080_aisb.t3.080_latentscorereweight.jpg +0 -0
- package/AISB/image/081_aisb.t3.081_qmamba.jpg +0 -0
- package/AISB/image/082_aisb.t3.082_onlinellmrouting.jpg +0 -0
- package/AISB/image/083_aisb.t3.083_starformer.jpg +0 -0
- package/AISB/image/084_aisb.t3.084_ift.jpg +0 -0
- package/AISB/image/085_aisb.t3.085_neuralsurv.jpg +0 -0
- package/AISB/image/086_aisb.t3.086_stella.jpg +0 -0
- package/AISB/image/087_aisb.t3.087_moses.jpg +0 -0
- package/AISB/image/088_aisb.t3.088_channelnorm.jpg +0 -0
- package/AISB/image/089_aisb.t3.089_causalvelocity.jpg +0 -0
- package/AISB/image/090_aisb.t3.090_rstib.jpg +0 -0
- package/AISB/image/091_aisb.t3.091_timeawarecausal.jpg +0 -0
- package/AISB/image/092_aisb.t3.092_kmeanslocalopt.jpg +0 -0
- package/AISB/image/093_aisb.t3.093_fedwmsam.jpg +0 -0
- package/AISB/image/094_aisb.t3.094_boundre.jpg +0 -0
- package/AISB/image/095_aisb.t3.095_fastfeaturecp.jpg +0 -0
- package/AISB/image/096_aisb.t3.096_m3svm.jpg +0 -0
- package/AISB/image/097_aisb.t3.097_wassersteintl.jpg +0 -0
- package/AISB/image/098_aisb.t3.098_xmahalanobis.jpg +0 -0
- package/AISB/image/099_aisb.t3.099_ollalanding.jpg +0 -0
- package/AISB/image/100_aisb.t3.100_invmissingdata.jpg +0 -0
- package/AISB/image/101_aisb.t3.101_acia.jpg +0 -0
- package/AISB/image/102_aisb.t3.102_stochasticff.jpg +0 -0
- package/AISB/image/103_aisb.t3.103_qdcp.jpg +0 -0
- package/AISB/image/104_aisb.t3.104_balancedactiveinf.jpg +0 -0
- package/AISB/image/105_aisb.t3.105_binaryclasseval.jpg +0 -0
- package/AISB/image/106_aisb.t1.reasoning_lite.jpg +0 -0
- package/AISB/image/107_aisb.t2.paper_audit.jpg +0 -0
- package/AISB/image/108_aisb.t3.multi_gpu_search.jpg +0 -0
- package/AISB/image/109_aisb.t3.tdc_admet.jpg +0 -0
- package/AISB/image/aisb.b1.agentic_coding.svg +16 -0
- package/AISB/image/aisb.b10.climate_earth.svg +16 -0
- package/AISB/image/aisb.b11.model_efficiency.svg +16 -0
- package/AISB/image/aisb.b12.embodied_ai.svg +16 -0
- package/AISB/image/aisb.b2.agent_systems.svg +16 -0
- package/AISB/image/aisb.b3.self_evolving_rl.svg +16 -0
- package/AISB/image/aisb.b4.lm_reasoning.svg +16 -0
- package/AISB/image/aisb.b5.math_proof.svg +16 -0
- package/AISB/image/aisb.b6.research_process.svg +16 -0
- package/AISB/image/aisb.b7.multimodal_fusion.svg +16 -0
- package/AISB/image/aisb.b8.lifesci_drug.svg +16 -0
- package/AISB/image/aisb.b9.material_science.svg +16 -0
- package/README.md +132 -11
- package/bin/ds.js +376 -49
- package/docs/en/00_QUICK_START.md +135 -18
- package/docs/en/01_SETTINGS_REFERENCE.md +468 -96
- package/docs/en/02_START_RESEARCH_GUIDE.md +26 -5
- package/docs/en/03_QQ_CONNECTOR_GUIDE.md +14 -3
- package/docs/en/04_LINGZHU_CONNECTOR_GUIDE.md +2 -0
- package/docs/en/05_TUI_GUIDE.md +171 -2
- package/docs/en/07_MEMORY_AND_MCP.md +38 -2
- package/docs/en/09_DOCTOR.md +64 -4
- package/docs/en/10_WEIXIN_CONNECTOR_GUIDE.md +38 -1
- package/docs/en/11_LICENSE_AND_RISK.md +4 -0
- package/docs/en/12_GUIDED_WORKFLOW_TOUR.md +15 -0
- package/docs/en/14_PROMPT_SKILLS_AND_MCP_GUIDE.md +9 -0
- package/docs/en/15_CODEX_PROVIDER_SETUP.md +622 -187
- package/docs/en/16_TELEGRAM_CONNECTOR_GUIDE.md +14 -0
- package/docs/en/17_WHATSAPP_CONNECTOR_GUIDE.md +14 -0
- package/docs/en/18_FEISHU_CONNECTOR_GUIDE.md +14 -0
- package/docs/en/21_LOCAL_MODEL_BACKENDS_GUIDE.md +105 -2
- package/docs/en/22_BENCHSTORE_YAML_REFERENCE.md +469 -0
- package/docs/en/23_BENCHSTORE_GITHUB_RELEASES_SPEC.md +316 -0
- package/docs/en/24_CLAUDE_CODE_PROVIDER_SETUP.md +469 -0
- package/docs/en/25_OPENCODE_PROVIDER_SETUP.md +653 -0
- package/docs/en/26_CITATION_AND_ATTRIBUTION.md +119 -0
- package/docs/en/27_KIMI_CODE_PROVIDER_SETUP.md +180 -0
- package/docs/en/28_DISCORD_CONNECTOR_GUIDE.md +61 -0
- package/docs/en/29_SLACK_CONNECTOR_GUIDE.md +60 -0
- package/docs/en/30_SETTINGS_CONTROL_CENTER_GUIDE.md +371 -0
- package/docs/en/{19_LOCAL_BROWSER_AUTH.md → 31_LOCAL_BROWSER_AUTH.md} +1 -1
- package/docs/en/32_WINDOWS_WSL2_DEPLOYMENT_GUIDE.md +273 -0
- package/docs/en/33_WORKSPACE_EXPLORER_QA.md +121 -0
- package/docs/en/91_DEVELOPMENT.md +29 -0
- package/docs/en/99_ACKNOWLEDGEMENTS.md +24 -19
- package/docs/en/README.md +44 -7
- package/docs/images/admin/admin-connectors-health-en.png +0 -0
- package/docs/images/admin/admin-controllers-en.png +0 -0
- package/docs/images/admin/admin-diagnostics-en.png +0 -0
- package/docs/images/admin/admin-errors-en.png +0 -0
- package/docs/images/admin/admin-issues-en.png +0 -0
- package/docs/images/admin/admin-logs-en.png +0 -0
- package/docs/images/admin/admin-quest-detail-en.png +0 -0
- package/docs/images/admin/admin-quests-en.png +0 -0
- package/docs/images/admin/admin-repairs-en.png +0 -0
- package/docs/images/admin/admin-runtime-en.png +0 -0
- package/docs/images/admin/admin-search-en.png +0 -0
- package/docs/images/admin/admin-stats-en.png +0 -0
- package/docs/images/admin/admin-summary-en.png +0 -0
- package/docs/images/connectors/connector-discord-en.png +0 -0
- package/docs/images/connectors/connector-feishu-en.png +0 -0
- package/docs/images/connectors/connector-lingzhu-en.png +0 -0
- package/docs/images/connectors/connector-qq-en.png +0 -0
- package/docs/images/connectors/connector-slack-en.png +0 -0
- package/docs/images/connectors/connector-telegram-en.png +0 -0
- package/docs/images/connectors/connector-weixin-en.png +0 -0
- package/docs/images/connectors/connector-whatsapp-en.png +0 -0
- package/docs/images/settings/settings-baselines-en.png +0 -0
- package/docs/images/settings/settings-config-en.png +0 -0
- package/docs/images/settings/settings-connectors-overview-en.png +0 -0
- package/docs/images/settings/settings-deepxiv-en.png +0 -0
- package/docs/images/settings/settings-mcp-servers-en.png +0 -0
- package/docs/images/settings/settings-plugins-en.png +0 -0
- package/docs/images/settings/settings-runners-en.png +0 -0
- package/docs/zh/00_QUICK_START.md +92 -17
- package/docs/zh/01_SETTINGS_REFERENCE.md +219 -98
- package/docs/zh/02_START_RESEARCH_GUIDE.md +26 -5
- package/docs/zh/05_TUI_GUIDE.md +171 -2
- package/docs/zh/07_MEMORY_AND_MCP.md +29 -2
- package/docs/zh/09_DOCTOR.md +39 -4
- package/docs/zh/10_WEIXIN_CONNECTOR_GUIDE.md +24 -1
- package/docs/zh/11_LICENSE_AND_RISK.md +4 -0
- package/docs/zh/12_GUIDED_WORKFLOW_TOUR.md +15 -0
- package/docs/zh/14_PROMPT_SKILLS_AND_MCP_GUIDE.md +9 -0
- package/docs/zh/15_CODEX_PROVIDER_SETUP.md +550 -188
- package/docs/zh/21_LOCAL_MODEL_BACKENDS_GUIDE.md +105 -2
- package/docs/zh/22_BENCHSTORE_YAML_REFERENCE.md +459 -0
- package/docs/zh/23_BENCHSTORE_GITHUB_RELEASES_SPEC.md +287 -0
- package/docs/zh/23_CLAUDE_RUNNER_GUIDE.md +103 -0
- package/docs/zh/24_CLAUDE_CODE_PROVIDER_SETUP.md +460 -0
- package/docs/zh/25_OPENCODE_PROVIDER_SETUP.md +660 -0
- package/docs/zh/26_CITATION_AND_ATTRIBUTION.md +102 -0
- package/docs/zh/27_KIMI_CODE_PROVIDER_SETUP.md +51 -0
- package/docs/zh/{19_LOCAL_BROWSER_AUTH.md → 31_LOCAL_BROWSER_AUTH.md} +1 -1
- package/docs/zh/32_WINDOWS_WSL2_DEPLOYMENT_GUIDE.md +264 -0
- package/docs/zh/33_WORKSPACE_EXPLORER_QA.md +127 -0
- package/docs/zh/99_ACKNOWLEDGEMENTS.md +23 -19
- package/docs/zh/README.md +29 -7
- package/install.sh +122 -16
- package/package.json +4 -1
- package/pyproject.toml +2 -1
- package/src/deepscientist/__init__.py +1 -1
- package/src/deepscientist/acp/envelope.py +13 -0
- package/src/deepscientist/admin/__init__.py +3 -0
- package/src/deepscientist/admin/charts.py +681 -0
- package/src/deepscientist/admin/logs.py +119 -0
- package/src/deepscientist/admin/repairs.py +217 -0
- package/src/deepscientist/admin/service.py +1310 -0
- package/src/deepscientist/admin/system_info.py +700 -0
- package/src/deepscientist/admin/tasks.py +465 -0
- package/src/deepscientist/admin/tool_metrics.py +600 -0
- package/src/deepscientist/artifact/guidance.py +8 -4
- package/src/deepscientist/artifact/schemas.py +115 -0
- package/src/deepscientist/artifact/service.py +4268 -260
- package/src/deepscientist/bash_exec/monitor.py +30 -3
- package/src/deepscientist/bash_exec/service.py +134 -1
- package/src/deepscientist/benchstore/__init__.py +4 -0
- package/src/deepscientist/benchstore/prompt_builder.py +224 -0
- package/src/deepscientist/benchstore/service.py +1716 -0
- package/src/deepscientist/channels/weixin_ilink.py +8 -1
- package/src/deepscientist/cli.py +92 -17
- package/src/deepscientist/codex_cli_compat.py +2 -2
- package/src/deepscientist/config/models.py +82 -11
- package/src/deepscientist/config/service.py +927 -91
- package/src/deepscientist/connector/weixin_support.py +48 -17
- package/src/deepscientist/daemon/api/handlers.py +697 -210
- package/src/deepscientist/daemon/api/router.py +76 -1
- package/src/deepscientist/daemon/app.py +1054 -51
- package/src/deepscientist/diagnostics/runner_failures.py +147 -0
- package/src/deepscientist/doctor.py +212 -65
- package/src/deepscientist/evidence_packets.py +590 -0
- package/src/deepscientist/home.py +52 -4
- package/src/deepscientist/kimi_cli_compat.py +50 -0
- package/src/deepscientist/latex_runtime.py +2 -2
- package/src/deepscientist/mcp/context.py +2 -0
- package/src/deepscientist/mcp/schemas.py +114 -0
- package/src/deepscientist/mcp/server.py +1566 -126
- package/src/deepscientist/memory/service.py +203 -16
- package/src/deepscientist/process_control.py +8 -1
- package/src/deepscientist/prompts/builder.py +836 -92
- package/src/deepscientist/quest/__init__.py +2 -2
- package/src/deepscientist/quest/layout.py +12 -1
- package/src/deepscientist/quest/node_traces.py +10 -0
- package/src/deepscientist/quest/service.py +1430 -139
- package/src/deepscientist/quest/stage_views.py +1 -1
- package/src/deepscientist/runners/__init__.py +18 -0
- package/src/deepscientist/runners/base.py +89 -1
- package/src/deepscientist/runners/builtins.py +13 -1
- package/src/deepscientist/runners/claude.py +391 -0
- package/src/deepscientist/runners/codex.py +421 -21
- package/src/deepscientist/runners/codex_telemetry.py +127 -0
- package/src/deepscientist/runners/kimi.py +334 -0
- package/src/deepscientist/runners/metadata.py +68 -0
- package/src/deepscientist/runners/opencode.py +414 -0
- package/src/deepscientist/runners/runtime_overrides.py +100 -0
- package/src/deepscientist/runners/simple_cli.py +538 -0
- package/src/deepscientist/runtime_storage.py +303 -0
- package/src/deepscientist/shared.py +61 -16
- package/src/deepscientist/skills/installer.py +37 -0
- package/src/deepscientist/skills/registry.py +2 -0
- package/src/deepscientist/tinytex.py +2 -2
- package/src/deepscientist/tui.py +10 -3
- package/src/prompts/benchstore/system.md +77 -0
- package/src/prompts/connectors/qq.md +33 -2
- package/src/prompts/connectors/weixin.md +208 -23
- package/src/prompts/contracts/admin_ops.md +74 -0
- package/src/prompts/contracts/admin_ops_knowledge.md +138 -0
- package/src/prompts/contracts/shared_interaction.md +5 -11
- package/src/prompts/start_setup/system.md +422 -0
- package/src/prompts/system.md +409 -315
- package/src/prompts/system_copilot.md +88 -12
- package/src/skills/analysis-campaign/SKILL.md +239 -578
- package/src/skills/analysis-campaign/references/artifact-flow-examples.md +102 -0
- package/src/skills/analysis-campaign/references/boundary-cases.md +98 -0
- package/src/skills/analysis-campaign/references/campaign-checklist-template.md +39 -24
- package/src/skills/analysis-campaign/references/campaign-design.md +26 -10
- package/src/skills/analysis-campaign/references/campaign-plan-template.md +53 -54
- package/src/skills/analysis-campaign/references/operational-guidance.md +97 -0
- package/src/skills/analysis-campaign/references/writing-facing-slice-examples.md +10 -20
- package/src/skills/baseline/SKILL.md +183 -461
- package/src/skills/baseline/references/artifact-flow-examples.md +106 -0
- package/src/skills/baseline/references/artifact-payload-examples.md +1 -1
- package/src/skills/baseline/references/baseline-checklist-template.md +27 -35
- package/src/skills/baseline/references/baseline-plan-template.md +37 -76
- package/src/skills/baseline/references/boundary-cases.md +86 -0
- package/src/skills/baseline/references/codebase-audit-checklist.md +2 -6
- package/src/skills/baseline/references/comparability-contract.md +7 -12
- package/src/skills/baseline/references/operational-guidance.md +56 -0
- package/src/skills/baseline/references/route-selection.md +5 -25
- package/src/skills/decision/SKILL.md +113 -306
- package/src/skills/decision/references/checkpoint-memory-template.md +47 -0
- package/src/skills/decision/references/operational-guidance.md +94 -0
- package/src/skills/decision/references/research-route-criteria.md +7 -8
- package/src/skills/decision/references/strategic-decision-template.md +13 -26
- package/src/skills/experiment/SKILL.md +132 -670
- package/src/skills/experiment/references/execution-playbook.md +374 -0
- package/src/skills/experiment/references/main-experiment-checklist-template.md +26 -2
- package/src/skills/experiment/references/main-experiment-plan-template.md +28 -17
- package/src/skills/experiment/references/operational-guidance.md +108 -0
- package/src/skills/finalize/SKILL.md +62 -0
- package/src/skills/finalize/references/checkpoint-memory-template.md +49 -0
- package/src/skills/finalize/references/resume-packet-template.md +7 -0
- package/src/skills/idea/SKILL.md +228 -15
- package/src/skills/idea/references/controlled-brainstorming-playbook.md +78 -0
- package/src/skills/idea/references/current-board-packet-template.md +61 -0
- package/src/skills/idea/references/high-value-idea-sourcing.md +119 -0
- package/src/skills/idea/references/idea-generation-playbook.md +21 -0
- package/src/skills/idea/references/idea-thinking-flow.md +6 -0
- package/src/skills/idea/references/literature-survey-template.md +3 -0
- package/src/skills/idea/references/objective-contract-template.md +54 -0
- package/src/skills/idea/references/outline-seeding-example.md +56 -0
- package/src/skills/idea/references/pre-idea-draft-template.md +105 -0
- package/src/skills/idea/references/related-work-playbook.md +75 -2
- package/src/skills/idea/references/research-history-playbook.md +114 -0
- package/src/skills/idea/references/selection-gate.md +58 -6
- package/src/skills/intake-audit/SKILL.md +43 -2
- package/src/skills/intake-audit/references/state-audit-template.md +10 -0
- package/src/skills/nature-data/SKILL.md +128 -0
- package/src/skills/nature-data/UPSTREAM_LICENSE.txt +21 -0
- package/src/skills/nature-data/agents/openai.yaml +4 -0
- package/src/skills/nature-data/references/chinese-author-alignment.md +84 -0
- package/src/skills/nature-data/references/fair-metadata-checklist.md +105 -0
- package/src/skills/nature-data/references/policy-principles.md +103 -0
- package/src/skills/nature-data/references/repository-and-identifiers.md +96 -0
- package/src/skills/nature-data/references/source-basis.md +54 -0
- package/src/skills/nature-data/references/statement-patterns.md +153 -0
- package/src/skills/nature-figure/SKILL.md +197 -0
- package/src/skills/nature-figure/UPSTREAM_LICENSE.txt +21 -0
- package/src/skills/nature-figure/agents/openai.yaml +4 -0
- package/src/skills/nature-figure/evals/evals.json +37 -0
- package/src/skills/nature-figure/references/api.md +428 -0
- package/src/skills/nature-figure/references/backend-selection.md +100 -0
- package/src/skills/nature-figure/references/chart-types.md +281 -0
- package/src/skills/nature-figure/references/common-patterns.md +349 -0
- package/src/skills/nature-figure/references/design-theory.md +436 -0
- package/src/skills/nature-figure/references/figure-contract.md +93 -0
- package/src/skills/nature-figure/references/nature-2026-observations.md +112 -0
- package/src/skills/nature-figure/references/qa-contract.md +119 -0
- package/src/skills/nature-figure/references/r-template-index.md +66 -0
- package/src/skills/nature-figure/references/r-workflow.md +161 -0
- package/src/skills/nature-figure/references/tutorials.md +250 -0
- package/src/skills/nature-paper2ppt/SKILL.md +507 -0
- package/src/skills/nature-paper2ppt/UPSTREAM_LICENSE.txt +21 -0
- package/src/skills/nature-paper2ppt/agents/openai.yaml +4 -0
- package/src/skills/nature-polishing/SKILL.md +385 -0
- package/src/skills/nature-polishing/UPSTREAM_LICENSE.txt +21 -0
- package/src/skills/nature-polishing/agents/openai.yaml +4 -0
- package/src/skills/nature-polishing/references/phrasebank-playbook.md +162 -0
- package/src/skills/nature-polishing/references/section-moves.md +240 -0
- package/src/skills/nature-polishing/references/style-guardrails.md +94 -0
- package/src/skills/nature-polishing/references/writing-strategy.md +148 -0
- package/src/skills/optimize/SKILL.md +177 -1568
- package/src/skills/optimize/references/brief-shaping-playbook.md +95 -0
- package/src/skills/optimize/references/candidate-board-template.md +13 -0
- package/src/skills/optimize/references/candidate-ranking-template.md +51 -0
- package/src/skills/optimize/references/codegen-route-playbook.md +50 -0
- package/src/skills/optimize/references/debug-response-template.md +29 -0
- package/src/skills/optimize/references/frontier-review-template.md +32 -0
- package/src/skills/optimize/references/fusion-playbook.md +36 -0
- package/src/skills/optimize/references/method-brief-template.md +73 -0
- package/src/skills/optimize/references/operational-guidance.md +621 -0
- package/src/skills/optimize/references/optimization-memory-template.md +30 -0
- package/src/skills/optimize/references/optimize-checklist-template.md +18 -0
- package/src/skills/optimize/references/plateau-response-playbook.md +28 -0
- package/src/skills/optimize/references/prompt-patterns.md +49 -0
- package/src/skills/paper-outline/SKILL.md +227 -0
- package/src/skills/paper-outline/references/outline-patterns.md +87 -0
- package/src/skills/paper-plot/SKILL.md +79 -0
- package/src/skills/paper-plot/agents/openai.yaml +4 -0
- package/src/skills/paper-plot/references/bar_grouped_hatch.md +96 -0
- package/src/skills/paper-plot/references/bar_paired_delta.md +72 -0
- package/src/skills/paper-plot/references/line_confidence_band.md +75 -0
- package/src/skills/paper-plot/references/line_loss_with_inset.md +65 -0
- package/src/skills/paper-plot/references/line_training_curve.md +44 -0
- package/src/skills/paper-plot/references/radar_dual_series.md +59 -0
- package/src/skills/paper-plot/references/scatter_broken_axis.md +59 -0
- package/src/skills/paper-plot/references/scatter_tsne_cluster.md +72 -0
- package/src/skills/paper-plot/scripts/bar_memevolve.py +109 -0
- package/src/skills/paper-plot/scripts/bar_spice.py +166 -0
- package/src/skills/paper-plot/scripts/line_aime.py +94 -0
- package/src/skills/paper-plot/scripts/line_loss_inset.py +157 -0
- package/src/skills/paper-plot/scripts/line_selfdistill.py +168 -0
- package/src/skills/paper-plot/scripts/radar_dora.py +151 -0
- package/src/skills/paper-plot/scripts/scatter_break.py +169 -0
- package/src/skills/paper-plot/scripts/scatter_tsne.py +133 -0
- package/src/skills/rebuttal/SKILL.md +9 -0
- package/src/skills/references/tool-usage-by-stage.md +438 -0
- package/src/skills/review/SKILL.md +105 -7
- package/src/skills/science/PROVENANCE.md +44 -0
- package/src/skills/science/SKILL.md +137 -0
- package/src/skills/science/references/artifact-science-tool.md +110 -0
- package/src/skills/science/references/claim-type-discipline.md +56 -0
- package/src/skills/science/references/domain-index.md +422 -0
- package/src/skills/science/references/hpc-via-bash-exec.md +42 -0
- package/src/skills/science/references/package-check-playbook.md +64 -0
- package/src/skills/science/references/package-index.min.json +3616 -0
- package/src/skills/science/references/packages/abinit.md +80 -0
- package/src/skills/science/references/packages/acts.md +73 -0
- package/src/skills/science/references/packages/aiida-core.md +80 -0
- package/src/skills/science/references/packages/alamode.md +80 -0
- package/src/skills/science/references/packages/amuse.md +88 -0
- package/src/skills/science/references/packages/anndata.md +88 -0
- package/src/skills/science/references/packages/arbor.md +80 -0
- package/src/skills/science/references/packages/arc.md +73 -0
- package/src/skills/science/references/packages/astropy.md +88 -0
- package/src/skills/science/references/packages/astroquery.md +88 -0
- package/src/skills/science/references/packages/atomate2.md +80 -0
- package/src/skills/science/references/packages/atomsmltr.md +73 -0
- package/src/skills/science/references/packages/awkward.md +73 -0
- package/src/skills/science/references/packages/batman.md +88 -0
- package/src/skills/science/references/packages/biopython.md +88 -0
- package/src/skills/science/references/packages/bloqade.md +73 -0
- package/src/skills/science/references/packages/brian2.md +73 -0
- package/src/skills/science/references/packages/bullet3.md +73 -0
- package/src/skills/science/references/packages/calculix.md +80 -0
- package/src/skills/science/references/packages/cantera.md +73 -0
- package/src/skills/science/references/packages/cavity-md-ipi.md +80 -0
- package/src/skills/science/references/packages/ccdproc.md +88 -0
- package/src/skills/science/references/packages/celerite2.md +88 -0
- package/src/skills/science/references/packages/cellrank.md +73 -0
- package/src/skills/science/references/packages/cesm.md +80 -0
- package/src/skills/science/references/packages/chemicals.md +73 -0
- package/src/skills/science/references/packages/chempy.md +73 -0
- package/src/skills/science/references/packages/cirq.md +73 -0
- package/src/skills/science/references/packages/coffea.md +73 -0
- package/src/skills/science/references/packages/cp2k.md +88 -0
- package/src/skills/science/references/packages/custodian.md +80 -0
- package/src/skills/science/references/packages/dart.md +73 -0
- package/src/skills/science/references/packages/datamol.md +88 -0
- package/src/skills/science/references/packages/dd4hep.md +73 -0
- package/src/skills/science/references/packages/dealii.md +80 -0
- package/src/skills/science/references/packages/deepchem.md +88 -0
- package/src/skills/science/references/packages/delphes.md +73 -0
- package/src/skills/science/references/packages/devito.md +80 -0
- package/src/skills/science/references/packages/dftb.md +88 -0
- package/src/skills/science/references/packages/dftd4.md +88 -0
- package/src/skills/science/references/packages/dftk-jl.md +80 -0
- package/src/skills/science/references/packages/dolfinx.md +80 -0
- package/src/skills/science/references/packages/drake.md +73 -0
- package/src/skills/science/references/packages/dumux.md +73 -0
- package/src/skills/science/references/packages/elk.md +80 -0
- package/src/skills/science/references/packages/elmerfem.md +80 -0
- package/src/skills/science/references/packages/enzo-e.md +88 -0
- package/src/skills/science/references/packages/espresso.md +80 -0
- package/src/skills/science/references/packages/exoplanet.md +88 -0
- package/src/skills/science/references/packages/fairroot.md +73 -0
- package/src/skills/science/references/packages/fbpic.md +80 -0
- package/src/skills/science/references/packages/fdtdbath-meep.md +80 -0
- package/src/skills/science/references/packages/geant4.md +73 -0
- package/src/skills/science/references/packages/geosx.md +80 -0
- package/src/skills/science/references/packages/gprmax.md +80 -0
- package/src/skills/science/references/packages/gromacs.md +80 -0
- package/src/skills/science/references/packages/gwaslab.md +73 -0
- package/src/skills/science/references/packages/gz-sim.md +73 -0
- package/src/skills/science/references/packages/hail.md +88 -0
- package/src/skills/science/references/packages/hiphive.md +80 -0
- package/src/skills/science/references/packages/hoomd-blue.md +80 -0
- package/src/skills/science/references/packages/itensor.md +73 -0
- package/src/skills/science/references/packages/itensors-jl.md +73 -0
- package/src/skills/science/references/packages/jdftx.md +73 -0
- package/src/skills/science/references/packages/jobflow.md +80 -0
- package/src/skills/science/references/packages/kadanoffbaym-jl.md +73 -0
- package/src/skills/science/references/packages/kite.md +80 -0
- package/src/skills/science/references/packages/kratos.md +80 -0
- package/src/skills/science/references/packages/kwant.md +73 -0
- package/src/skills/science/references/packages/lammps.md +80 -0
- package/src/skills/science/references/packages/lightkurve.md +88 -0
- package/src/skills/science/references/packages/limix.md +73 -0
- package/src/skills/science/references/packages/maxwelllink.md +80 -0
- package/src/skills/science/references/packages/mcdc.md +73 -0
- package/src/skills/science/references/packages/meep.md +80 -0
- package/src/skills/science/references/packages/mfem.md +80 -0
- package/src/skills/science/references/packages/mitgcm.md +73 -0
- package/src/skills/science/references/packages/modflow6.md +73 -0
- package/src/skills/science/references/packages/molecool.md +73 -0
- package/src/skills/science/references/packages/mom6.md +73 -0
- package/src/skills/science/references/packages/moose.md +80 -0
- package/src/skills/science/references/packages/mpas-model.md +73 -0
- package/src/skills/science/references/packages/mujoco.md +73 -0
- package/src/skills/science/references/packages/mumax3.md +73 -0
- package/src/skills/science/references/packages/nekrs.md +80 -0
- package/src/skills/science/references/packages/nessi.md +73 -0
- package/src/skills/science/references/packages/nest-simulator.md +73 -0
- package/src/skills/science/references/packages/netket.md +73 -0
- package/src/skills/science/references/packages/neuron.md +73 -0
- package/src/skills/science/references/packages/nextflow.md +88 -0
- package/src/skills/science/references/packages/nwchem.md +88 -0
- package/src/skills/science/references/packages/openbabel.md +88 -0
- package/src/skills/science/references/packages/openems.md +80 -0
- package/src/skills/science/references/packages/openff-toolkit.md +88 -0
- package/src/skills/science/references/packages/openfoam-dev.md +80 -0
- package/src/skills/science/references/packages/openmc.md +73 -0
- package/src/skills/science/references/packages/openmm.md +80 -0
- package/src/skills/science/references/packages/openmoc.md +73 -0
- package/src/skills/science/references/packages/openmx.md +80 -0
- package/src/skills/science/references/packages/opensees.md +80 -0
- package/src/skills/science/references/packages/opensn.md +80 -0
- package/src/skills/science/references/packages/opm-simulators.md +73 -0
- package/src/skills/science/references/packages/oqupy.md +73 -0
- package/src/skills/science/references/packages/packmol.md +80 -0
- package/src/skills/science/references/packages/palabos.md +80 -0
- package/src/skills/science/references/packages/parflow.md +80 -0
- package/src/skills/science/references/packages/pennylane.md +88 -0
- package/src/skills/science/references/packages/perceval.md +73 -0
- package/src/skills/science/references/packages/phono3py.md +73 -0
- package/src/skills/science/references/packages/phonopy.md +73 -0
- package/src/skills/science/references/packages/photutils.md +88 -0
- package/src/skills/science/references/packages/picongpu.md +80 -0
- package/src/skills/science/references/packages/plink-ng.md +88 -0
- package/src/skills/science/references/packages/precice.md +73 -0
- package/src/skills/science/references/packages/psc.md +80 -0
- package/src/skills/science/references/packages/psi4.md +88 -0
- package/src/skills/science/references/packages/pybinding.md +73 -0
- package/src/skills/science/references/packages/pyfr.md +80 -0
- package/src/skills/science/references/packages/pyhf.md +73 -0
- package/src/skills/science/references/packages/pyiron_base.md +80 -0
- package/src/skills/science/references/packages/pylcp.md +73 -0
- package/src/skills/science/references/packages/pylith.md +80 -0
- package/src/skills/science/references/packages/pynbody.md +88 -0
- package/src/skills/science/references/packages/pysam.md +88 -0
- package/src/skills/science/references/packages/pyscf.md +88 -0
- package/src/skills/science/references/packages/q-e.md +73 -0
- package/src/skills/science/references/packages/qibo.md +73 -0
- package/src/skills/science/references/packages/qiskit.md +73 -0
- package/src/skills/science/references/packages/quantica-jl.md +73 -0
- package/src/skills/science/references/packages/quantumoptics-jl.md +73 -0
- package/src/skills/science/references/packages/quimb.md +73 -0
- package/src/skills/science/references/packages/qulacs.md +73 -0
- package/src/skills/science/references/packages/qutip.md +73 -0
- package/src/skills/science/references/packages/rdkit.md +88 -0
- package/src/skills/science/references/packages/rmg-py.md +73 -0
- package/src/skills/science/references/packages/root.md +73 -0
- package/src/skills/science/references/packages/scanpy.md +88 -0
- package/src/skills/science/references/packages/scikit-allel.md +88 -0
- package/src/skills/science/references/packages/scikit-bio.md +88 -0
- package/src/skills/science/references/packages/scqubits.md +73 -0
- package/src/skills/science/references/packages/scuff-em.md +80 -0
- package/src/skills/science/references/packages/scvi-tools.md +73 -0
- package/src/skills/science/references/packages/seissol.md +73 -0
- package/src/skills/science/references/packages/sfepy.md +80 -0
- package/src/skills/science/references/packages/sisl.md +73 -0
- package/src/skills/science/references/packages/smilei.md +80 -0
- package/src/skills/science/references/packages/snakemake.md +88 -0
- package/src/skills/science/references/packages/specfem3d-globe.md +80 -0
- package/src/skills/science/references/packages/specutils.md +88 -0
- package/src/skills/science/references/packages/spglib.md +80 -0
- package/src/skills/science/references/packages/squidpy.md +88 -0
- package/src/skills/science/references/packages/starry.md +88 -0
- package/src/skills/science/references/packages/strawberryfields.md +73 -0
- package/src/skills/science/references/packages/su2.md +80 -0
- package/src/skills/science/references/packages/sunny-jl.md +73 -0
- package/src/skills/science/references/packages/sw4.md +73 -0
- package/src/skills/science/references/packages/swift.md +88 -0
- package/src/skills/science/references/packages/tdnegf.md +73 -0
- package/src/skills/science/references/packages/tenpy.md +73 -0
- package/src/skills/science/references/packages/thermo.md +73 -0
- package/src/skills/science/references/packages/tkwant.md +73 -0
- package/src/skills/science/references/packages/tvb-root.md +73 -0
- package/src/skills/science/references/packages/uproot5.md +73 -0
- package/src/skills/science/references/packages/vampire.md +80 -0
- package/src/skills/science/references/packages/wannier_tools.md +73 -0
- package/src/skills/science/references/packages/warpx.md +80 -0
- package/src/skills/science/references/packages/wrf.md +73 -0
- package/src/skills/science/references/packages/xtb.md +88 -0
- package/src/skills/science/references/packages/yt.md +73 -0
- package/src/skills/science/references/science-task-brief-template.md +71 -0
- package/src/skills/scout/SKILL.md +83 -425
- package/src/skills/scout/references/literature-scout-template.md +5 -24
- package/src/skills/scout/references/operational-guidance.md +191 -0
- package/src/skills/scout/references/paper-triage-playbook.md +11 -35
- package/src/skills/write/SKILL.md +744 -1246
- package/src/skills/write/references/experiments_analysis_patterns.md +129 -0
- package/src/skills/write/references/oral_package_patterns.md +252 -0
- package/src/skills/write/references/oral_writing_principles.md +291 -0
- package/src/skills/write/references/section_rewrite_checklist.md +234 -0
- package/src/tui/dist/app/AppContainer.js +1314 -27
- package/src/tui/dist/components/Composer.js +26 -1
- package/src/tui/dist/components/ConfigScreen.js +2 -1
- package/src/tui/dist/components/InputPrompt.js +25 -9
- package/src/tui/dist/components/MainContent.js +18 -3
- package/src/tui/dist/components/QuestScreen.js +3 -2
- package/src/tui/dist/components/UtilityScreen.js +37 -0
- package/src/tui/dist/hooks/useSafeInput.js +10 -0
- package/src/tui/dist/index.js +13 -1
- package/src/tui/dist/layouts/DefaultAppLayout.js +11 -8
- package/src/tui/dist/lib/api.js +89 -1
- package/src/tui/package.json +1 -1
- package/src/ui/dist/assets/{AnalysisPlugin-BCKAfjba.js → AnalysisPlugin-CA94NGmI.js} +1 -1
- package/src/ui/dist/assets/CliPlugin-DHBzphZU.js +79 -0
- package/src/ui/dist/assets/CodeEditorPlugin-BOFwD2rn.js +2 -0
- package/src/ui/dist/assets/{CodeViewerPlugin-CbaFRrUU.js → CodeViewerPlugin-CqDpgjik.js} +4 -4
- package/src/ui/dist/assets/{DocViewerPlugin-DAjLVeQD.js → DocViewerPlugin-UDBgt8-4.js} +3 -3
- package/src/ui/dist/assets/GitCommitViewerPlugin-BmHtZ0bZ.js +6 -0
- package/src/ui/dist/assets/{GitDiffViewerPlugin-CQACjoAA.js → GitDiffViewerPlugin-CAxjNorQ.js} +2 -2
- package/src/ui/dist/assets/{GitSnapshotViewer-0r4nLPke.js → GitSnapshotViewer-CweA6VON.js} +2 -2
- package/src/ui/dist/assets/{ImageViewerPlugin-nBOmI2v_.js → ImageViewerPlugin-C8wHGvGN.js} +5 -5
- package/src/ui/dist/assets/LabPlugin-COyyLUol.js +32 -0
- package/src/ui/dist/assets/{LatexPlugin-ZwtV8pIp.js → LatexPlugin-BQjAaA5J.js} +4 -4
- package/src/ui/dist/assets/{MarkdownViewerPlugin-DKqVfKyW.js → MarkdownViewerPlugin-Dy1NE2dI.js} +3 -3
- package/src/ui/dist/assets/{MarketplacePlugin-BwxStZ9D.js → MarketplacePlugin-DMIZtEJ2.js} +2 -2
- package/src/ui/dist/assets/NotebookEditor-CFHMq_Qt.js +91 -0
- package/src/ui/dist/assets/{NotebookEditor-DB9N_T9q.js → NotebookEditor-WFyd8Ybt.js} +3 -3
- package/src/ui/dist/assets/{PdfLoader-eWBONbQP.js → PdfLoader-CLE5u5TS.js} +3 -3
- package/src/ui/dist/assets/{PdfMarkdownPlugin-D22YOZL3.js → PdfMarkdownPlugin-_iNK_H83.js} +1 -1
- package/src/ui/dist/assets/PdfViewerPlugin-DgWsbInT.js +22 -0
- package/src/ui/dist/assets/SearchPlugin-DrZmn5iw.js +11 -0
- package/src/ui/dist/assets/{TextViewerPlugin-C5xqeeUH.js → TextViewerPlugin-D1-T3aC7.js} +4 -4
- package/src/ui/dist/assets/branding/runner-claude.svg +107 -0
- package/src/ui/dist/assets/branding/runner-codex.svg +10 -0
- package/src/ui/dist/assets/branding/runner-kimi.svg +14 -0
- package/src/ui/dist/assets/branding/runner-opencode.svg +7 -0
- package/src/ui/dist/assets/cli-store-CoZ-x5Ip.js +1 -0
- package/src/ui/dist/assets/{code-WlFHE7z_.js → code-DbsmSd3Y.js} +1 -1
- package/src/ui/dist/assets/file-diff-panel-DsvyRz47.js +1 -0
- package/src/ui/dist/assets/{wrap-text-BC-Hltpd.js → file-jump-queue-DeQBikaw.js} +3 -3
- package/src/ui/dist/assets/{file-socket-CfQPKQKj.js → file-socket-DA5XIx88.js} +1 -1
- package/src/ui/dist/assets/fonts/ds-fonts.css +50 -4
- package/src/ui/dist/assets/images/deepxiv/register-guide.png +0 -0
- package/src/ui/dist/assets/index-39vY9LmZ.js +1 -0
- package/src/ui/dist/assets/{index-CwNu1aH4.js → index-BsO46tJA.js} +1 -1
- package/src/ui/dist/assets/index-CHzJ2xtB.js +3530 -0
- package/src/ui/dist/assets/index-DH-zxoZ3.css +33 -0
- package/src/ui/dist/assets/{plugin-notebook-HbW2K-1c.js → plugin-notebook-JRhysCqj.js} +2 -2
- package/src/ui/dist/assets/{project-sync-C9IdzdZW.js → project-sync-DPmWKmKD.js} +1 -1
- package/src/ui/dist/assets/{zoom-out-E_gaeAxL.js → zoom-out-DAukFWen.js} +3 -3
- package/src/ui/dist/index.html +3 -3
- package/src/skills/analysis-campaign/references/artifact-orchestration.md +0 -58
- package/src/skills/baseline/references/memory-playbook.md +0 -40
- package/src/skills/baseline/references/publishable-baseline-package.md +0 -30
- package/src/skills/write/references/outline-evidence-contract-example.md +0 -107
- package/src/skills/write/references/paper-experiment-matrix-template.md +0 -131
- package/src/skills/write/references/paper-section-playbook.md +0 -64
- package/src/skills/write/references/reviewer-first-writing.md +0 -64
- package/src/skills/write/references/revision-checklist.md +0 -70
- package/src/skills/write/references/section-contracts.md +0 -82
- package/src/skills/write/references/sentence-level-proofing.md +0 -49
- package/src/ui/dist/assets/AiManusChatView-Bv-Z8YpU.js +0 -204
- package/src/ui/dist/assets/CliPlugin-BCKcpc35.js +0 -109
- package/src/ui/dist/assets/CodeEditorPlugin-DbOfSJ8K.js +0 -2
- package/src/ui/dist/assets/GitCommitViewerPlugin-CIUqbUDO.js +0 -1
- package/src/ui/dist/assets/LabCopilotPanel-BHxOxF4z.js +0 -14
- package/src/ui/dist/assets/LabPlugin-BKoZGs95.js +0 -22
- package/src/ui/dist/assets/NotebookEditor-BEQhaQbt.js +0 -81
- package/src/ui/dist/assets/PdfViewerPlugin-c-RK9DLM.js +0 -17
- package/src/ui/dist/assets/SearchPlugin-CxF9ytAx.js +0 -16
- package/src/ui/dist/assets/VNCViewer-BoLGLnHz.js +0 -11
- package/src/ui/dist/assets/bot-DREQOxzP.js +0 -6
- package/src/ui/dist/assets/chevron-up-C9Qpx4DE.js +0 -6
- package/src/ui/dist/assets/file-content-BZMz3RYp.js +0 -1
- package/src/ui/dist/assets/file-diff-panel-CQhw0jS2.js +0 -1
- package/src/ui/dist/assets/file-jump-queue-DA-SdG__.js +0 -1
- package/src/ui/dist/assets/git-commit-horizontal-DxZ8DCZh.js +0 -6
- package/src/ui/dist/assets/image-Bgl4VIyx.js +0 -6
- package/src/ui/dist/assets/index-BpV6lusQ.css +0 -33
- package/src/ui/dist/assets/index-CBNVuWcP.js +0 -2496
- package/src/ui/dist/assets/index-DrUnlf6K.js +0 -1
- package/src/ui/dist/assets/index-NW-h8VzN.js +0 -1
- package/src/ui/dist/assets/pdf-effect-queue-J8OnM0jE.js +0 -6
- package/src/ui/dist/assets/popover-CLc0pPP8.js +0 -1
- package/src/ui/dist/assets/select-Cs2PmzwL.js +0 -11
- package/src/ui/dist/assets/sigma-ClKcHAXm.js +0 -6
- package/src/ui/dist/assets/trash-DwpbFr3w.js +0 -11
- package/src/ui/dist/assets/useCliAccess-NQ8m0Let.js +0 -1
- package/src/ui/dist/assets/useFileDiffOverlay-FuhcnKiw.js +0 -1
|
@@ -0,0 +1,166 @@
|
|
|
1
|
+
schema_version: 1
|
|
2
|
+
id: aisb.t3.030_processrm
|
|
3
|
+
name: EpicPRM – 过程监督奖励模型数据构建
|
|
4
|
+
version: 0.1.0
|
|
5
|
+
one_line: '通过基于困惑度的蒙特卡洛估计和自适应二分搜索构建高质量过程监督数据(Epic50k),在Qwen2-math-1.5B上训练PRM,并在ProcessBench(GSM8K、MATH、OlympiadBench、OmniMATH)上评估步骤级F1分数。
|
|
6
|
+
|
|
7
|
+
'
|
|
8
|
+
task_description: '此基准测试复现EpicPRM框架,用于构建数学推理的过程监督奖励模型。流水线包含三个阶段:(1)从多个LLM(LLaMA3-8B-Instruct、LLaMA3.1-8B-Instruct、Qwen2-7B-Instruct)在MATH数据集问题上生成链式思维推理链;(2)使用基于困惑度加权的蒙特卡洛估计和自适应二分搜索算法对中间推理步骤进行标注,该算法根据问题难度动态调整起始位置和采样数量;(3)在生成的Epic50k数据集(50k个标注步骤)上使用Qwen2-math-1.5B-base和二分类头训练过程奖励模型(PRM),然后通过ProcessBench严格协议评估步骤级正确性预测。评估脚本(eval_strict.py)计算F1作为acc_correct(正确识别的正确解决方案比例)和acc_error(预测的首个错误与真实值匹配的错误解决方案比例)的调和均值,阈值在GSM8K上调整后应用于MATH、OlympiadBench和OmniMATH。数据构建阶段(generate_train_data.py)需要运行多个7-8B参数LLM作为补全器进行rollout采样,这是计算最密集的阶段。PRM训练本身使用DeepSpeed ZeRO(提供stage 1/2/3配置)。快照包含所有阶段的可执行代码,但不含预生成的数据或预训练检查点;Epic50k和模型可在HuggingFace上获取。
|
|
9
|
+
|
|
10
|
+
'
|
|
11
|
+
capability_tags:
|
|
12
|
+
- research_code_optimization
|
|
13
|
+
- process_reward_modeling
|
|
14
|
+
- mathematical_reasoning
|
|
15
|
+
- large_language_models
|
|
16
|
+
- data_construction
|
|
17
|
+
- monte_carlo_estimation
|
|
18
|
+
aisb_direction: T3
|
|
19
|
+
track_fit:
|
|
20
|
+
- paper_track
|
|
21
|
+
- benchmark_track
|
|
22
|
+
task_mode: experiment_driven
|
|
23
|
+
requires_execution: true
|
|
24
|
+
requires_paper: true
|
|
25
|
+
integrity_level: cas_plus_canary
|
|
26
|
+
snapshot_status: runnable
|
|
27
|
+
support_level: advanced
|
|
28
|
+
cost_band: high
|
|
29
|
+
time_band: 1d+
|
|
30
|
+
difficulty: hard
|
|
31
|
+
data_access: public
|
|
32
|
+
primary_outputs:
|
|
33
|
+
- f1_gsm8k
|
|
34
|
+
- f1_math
|
|
35
|
+
- f1_olympiadbench
|
|
36
|
+
- f1_omnimath
|
|
37
|
+
- process_reward_dataset
|
|
38
|
+
- verifier_checkpoint
|
|
39
|
+
launch_profiles:
|
|
40
|
+
- id: quick_eval
|
|
41
|
+
label: 快速评估
|
|
42
|
+
description: '从HuggingFace下载预训练的PRM检查点和Epic50k,然后在ProcessBench数据集(GSM8K、MATH、OlympiadBench、OmniMATH)上运行eval_strict.py以复现F1指标。推理1.5B模型需要至少24GB显存的单块GPU。
|
|
43
|
+
|
|
44
|
+
'
|
|
45
|
+
- id: train_and_eval
|
|
46
|
+
label: 训练+评估
|
|
47
|
+
description: '使用DeepSpeed在Epic50k上从头训练PRM,然后通过eval_strict.py进行评估。通过使用已发布的数据集跳过数据构建阶段。
|
|
48
|
+
|
|
49
|
+
'
|
|
50
|
+
- id: full_pipeline
|
|
51
|
+
label: 完整流水线
|
|
52
|
+
description: '运行所有三个阶段:通过generate_train_data.py生成CoT链并标注步骤(需要通过vLLM运行多个7-8B LLM)、使用DeepSpeed训练PRM,然后进行评估。这是计算最密集的配置,需要多GPU进行rollout采样。
|
|
53
|
+
|
|
54
|
+
'
|
|
55
|
+
dataset_download:
|
|
56
|
+
primary_method: huggingface
|
|
57
|
+
sources:
|
|
58
|
+
- kind: huggingface
|
|
59
|
+
url: https://huggingface.co/datasets/SunW7777/EpicPRM
|
|
60
|
+
access: public
|
|
61
|
+
note: 'Epic50k数据集(50k个标注的中间推理步骤)及相关模型。也链接自https://github.com/xiaolizh1/EpicPRM。
|
|
62
|
+
|
|
63
|
+
'
|
|
64
|
+
- kind: external
|
|
65
|
+
url: https://github.com/openai/prm800k
|
|
66
|
+
access: public
|
|
67
|
+
note: 'PRM800k数据集,用于论文中的对比和阈值分析。
|
|
68
|
+
|
|
69
|
+
'
|
|
70
|
+
- kind: external
|
|
71
|
+
url: https://github.com/peiyi9979/Math-Shepherd
|
|
72
|
+
access: public
|
|
73
|
+
note: 'Math-Shepherd数据集,用于基线对比。
|
|
74
|
+
|
|
75
|
+
'
|
|
76
|
+
notes:
|
|
77
|
+
- Epic50k相对较小(~50k步骤);下载量不大。
|
|
78
|
+
- '如果运行完整流水线,需要MATH数据集(Hendrycks等人,2021)用于CoT生成。eval_strict.py需要ProcessBench评估数据。
|
|
79
|
+
|
|
80
|
+
'
|
|
81
|
+
- '训练和数据构建需要从HuggingFace下载基础模型(Qwen2-math-1.5B-base、LLaMA3-8B-Instruct、LLaMA3.1-8B-Instruct、Qwen2-7B-Instruct)。
|
|
82
|
+
|
|
83
|
+
'
|
|
84
|
+
credential_requirements:
|
|
85
|
+
mode: none
|
|
86
|
+
items: []
|
|
87
|
+
notes:
|
|
88
|
+
- 下载LLaMA3系列等受限访问(gated)模型可能需要HuggingFace账号。
|
|
89
|
+
resources:
|
|
90
|
+
minimum:
|
|
91
|
+
cpu_cores: 16
|
|
92
|
+
ram_gb: 64
|
|
93
|
+
disk_gb: 150
|
|
94
|
+
gpu_count: 1
|
|
95
|
+
gpu_vram_gb: 24
|
|
96
|
+
recommended:
|
|
97
|
+
cpu_cores: 32
|
|
98
|
+
ram_gb: 128
|
|
99
|
+
disk_gb: 300
|
|
100
|
+
gpu_count: 2
|
|
101
|
+
gpu_vram_gb: 48
|
|
102
|
+
environment:
|
|
103
|
+
python: '3.10'
|
|
104
|
+
cuda: '11.8'
|
|
105
|
+
pytorch: 2.1.0
|
|
106
|
+
key_packages:
|
|
107
|
+
- deepspeed==0.15.4
|
|
108
|
+
- transformers
|
|
109
|
+
- vllm
|
|
110
|
+
- scikit-learn
|
|
111
|
+
- numpy
|
|
112
|
+
- torch
|
|
113
|
+
notes:
|
|
114
|
+
- 'DeepSpeed ZeRO配置(stage 1/2/3,bf16)捆绑在deepspeed_config/中。典型2-GPU训练使用ds_config_bf16_zero2.json。
|
|
115
|
+
|
|
116
|
+
'
|
|
117
|
+
- 'vLLM用于加速数据构建阶段(generate_train_data.py)的rollout采样。评估或仅训练配置文件不需要。
|
|
118
|
+
|
|
119
|
+
'
|
|
120
|
+
- 参见捆绑的requirements和scripts/获取具体启动命令。
|
|
121
|
+
risk_flags:
|
|
122
|
+
- large_model_downloads
|
|
123
|
+
- multi_model_dependency
|
|
124
|
+
- compute_intensive_data_construction
|
|
125
|
+
risk_notes:
|
|
126
|
+
- '完整数据构建流水线需要同时运行多个7-8B参数LLM作为补全器,每个问题采样大量rollout。此阶段可能需要多个GPU天。
|
|
127
|
+
|
|
128
|
+
'
|
|
129
|
+
- '打包期间未执行基准测试;论文中的指标值尚未对此快照进行独立验证。
|
|
130
|
+
|
|
131
|
+
'
|
|
132
|
+
- 'eval_strict.py脚本实现了ProcessBench评估协议;如果未捆绑则需单独获取ProcessBench数据文件。
|
|
133
|
+
|
|
134
|
+
'
|
|
135
|
+
- '训练需要基础模型权重(Qwen2-math-1.5B-base)。LLaMA3模型可能需要在HuggingFace上接受许可协议。
|
|
136
|
+
|
|
137
|
+
'
|
|
138
|
+
recommended_when: '当您想研究高效数据构建的过程监督奖励模型训练、比较标注策略(基于困惑度与基于计数的蒙特卡洛估计)、或评估PRM在数学推理步骤正确性预测上的质量时使用此基准测试。对关注数据高效PRM训练的研究人员特别有帮助——Epic50k的规模不到PRM800k的10%,但能达到相当或更优的结果。
|
|
139
|
+
|
|
140
|
+
'
|
|
141
|
+
not_recommended_when: '如果您无法支持基于DeepSpeed的奖励模型训练、仅需要提示级评估而无需步骤级标注、或无法使用至少24GB显存的GPU,请勿使用。完整数据构建流水线需要更多计算资源(多个7-8B LLM进行rollout采样)。
|
|
142
|
+
|
|
143
|
+
'
|
|
144
|
+
paper:
|
|
145
|
+
title: 'An Efficient and Precise Training Data Construction Framework for Process-supervised
|
|
146
|
+
Reward Model in Mathematical Reasoning
|
|
147
|
+
|
|
148
|
+
'
|
|
149
|
+
venue: ACL 2025
|
|
150
|
+
year: 2025
|
|
151
|
+
url: https://aclanthology.org/2025.acl-long.216/
|
|
152
|
+
download:
|
|
153
|
+
url: https://github.com/ResearAI/DeepScientist/releases/download/aisb-v0.0.1/aisb.t3.030_processrm.zip
|
|
154
|
+
archive_type: zip
|
|
155
|
+
local_dir_name: paper-30-ProcessRM
|
|
156
|
+
provider: github_release
|
|
157
|
+
repo: ResearAI/DeepScientist
|
|
158
|
+
tag: aisb-v0.0.1
|
|
159
|
+
asset_name: aisb.t3.030_processrm.zip
|
|
160
|
+
sha256: e34d67b264044c51d0bcdaa9d6dc4d7b9cb59f9c2285b4a629b3ac02af8c725a
|
|
161
|
+
size_bytes: 54360
|
|
162
|
+
display:
|
|
163
|
+
palette_seed: bronze-indigo-verifier
|
|
164
|
+
art_style: math-lab
|
|
165
|
+
accent_priority: high
|
|
166
|
+
image_path: ../image/030_aisb.t3.030_processrm.jpg
|
|
@@ -0,0 +1,172 @@
|
|
|
1
|
+
schema_version: 1
|
|
2
|
+
id: aisb.t3.031_circuitstability
|
|
3
|
+
name: Circuit Stability Characterizes Language Model Generalization
|
|
4
|
+
version: 0.1.0
|
|
5
|
+
one_line: 'Extract soft circuits from language models (gemma-2-2b and others) via
|
|
6
|
+
edge-ablation attribution, measure circuit stability across arithmetic/Boolean/sports
|
|
7
|
+
subtasks, and correlate stability with generalization performance.
|
|
8
|
+
|
|
9
|
+
'
|
|
10
|
+
task_description: 'This benchmark evaluates whether the internal circuits a language
|
|
11
|
+
model uses remain stable across structured subtask partitions, and whether that
|
|
12
|
+
stability predicts generalization. The workflow has three phases: (1) run baseline
|
|
13
|
+
evaluation of gemma-2-2b on arithmetic subtasks (digit-pair combinations from 1×1
|
|
14
|
+
to 8×8) using few-shot prompting, producing per-subtask exact-match accuracy; (2)
|
|
15
|
+
perform soft-circuit discovery via noisy-to-clean edge-ablation (EAP-IG) over the
|
|
16
|
+
model''s computational graph for each subtask, yielding real-valued importance scores
|
|
17
|
+
for every edge; (3) compute Spearman ρ between soft circuits of different subtasks,
|
|
18
|
+
apply α-equivalence clustering, and correlate circuit stability with accuracy fluctuations.
|
|
19
|
+
The paper extends this to Boolean expression evaluation and sports understanding.
|
|
20
|
+
The primary executable entry point is src/eval_arith_baseline.sh, which evaluates
|
|
21
|
+
four key digit combinations (1-1, 1-8, 8-1, 8-8) and computes accuracy from pickled
|
|
22
|
+
outputs. Circuit discovery is implemented in src/experiments/circuit_discovery.py
|
|
23
|
+
and explored interactively in notebooks/circuit-discovery.ipynb. The benchmark is
|
|
24
|
+
self-contained: data is generated procedurally (no external dataset download), models
|
|
25
|
+
are loaded via HuggingFace transformers/TransformerLens, and evaluation is local.
|
|
26
|
+
|
|
27
|
+
'
|
|
28
|
+
capability_tags:
|
|
29
|
+
- research_code_optimization
|
|
30
|
+
- mechanistic_interpretability
|
|
31
|
+
- large_language_models
|
|
32
|
+
- circuit_analysis
|
|
33
|
+
- evaluation
|
|
34
|
+
aisb_direction: T3
|
|
35
|
+
track_fit:
|
|
36
|
+
- paper_track
|
|
37
|
+
- benchmark_track
|
|
38
|
+
task_mode: evaluation_driven
|
|
39
|
+
requires_execution: true
|
|
40
|
+
requires_paper: true
|
|
41
|
+
integrity_level: cas_plus_canary
|
|
42
|
+
snapshot_status: runnable
|
|
43
|
+
support_level: advanced
|
|
44
|
+
cost_band: high
|
|
45
|
+
time_band: 1d+
|
|
46
|
+
difficulty: hard
|
|
47
|
+
data_access: public
|
|
48
|
+
primary_outputs:
|
|
49
|
+
- accuracy_11
|
|
50
|
+
- accuracy_18
|
|
51
|
+
- accuracy_81
|
|
52
|
+
- accuracy_88
|
|
53
|
+
- circuit_stability_report
|
|
54
|
+
- generalization_analysis
|
|
55
|
+
launch_profiles:
|
|
56
|
+
- id: quick_eval
|
|
57
|
+
label: Quick Eval (Arithmetic Baseline)
|
|
58
|
+
description: 'Run src/eval_arith_baseline.sh to evaluate gemma-2-2b on four arithmetic
|
|
59
|
+
subtask digit combinations (1-1, 1-8, 8-1, 8-8) with 1000 samples each. Requires
|
|
60
|
+
one A100 80 GB GPU. Produces per-subtask accuracy scores from pickled model outputs.
|
|
61
|
+
|
|
62
|
+
'
|
|
63
|
+
- id: full_analysis
|
|
64
|
+
label: Full Circuit Stability Analysis
|
|
65
|
+
description: 'Run the complete workflow: baseline evaluation across all 64 subtask
|
|
66
|
+
pairs, soft-circuit discovery via EAP-IG for each subtask, α-equivalence clustering,
|
|
67
|
+
Spearman ρ computation, t-SNE visualization, and generalization correlation analysis.
|
|
68
|
+
Case Studies I and II need one A100 80 GB; Case Study III requires two A100 80
|
|
69
|
+
GB GPUs. Expect 12-24+ hours depending on GPU count and subtask coverage.
|
|
70
|
+
|
|
71
|
+
'
|
|
72
|
+
dataset_download:
|
|
73
|
+
primary_method: procedural
|
|
74
|
+
sources: []
|
|
75
|
+
notes:
|
|
76
|
+
- All evaluation data (arithmetic, Boolean, sports) is generated procedurally at
|
|
77
|
+
runtime.
|
|
78
|
+
- No external dataset download is required.
|
|
79
|
+
- Model weights (gemma-2-2b) are fetched from HuggingFace Hub or loaded from a local
|
|
80
|
+
path via --model_path.
|
|
81
|
+
credential_requirements:
|
|
82
|
+
mode: optional
|
|
83
|
+
items:
|
|
84
|
+
- HuggingFace token (only if gemma-2-2b requires gated access)
|
|
85
|
+
notes:
|
|
86
|
+
- If model weights are pre-downloaded to a local path, no credentials are needed.
|
|
87
|
+
- Set --model_path in eval scripts to use local weights.
|
|
88
|
+
resources:
|
|
89
|
+
minimum:
|
|
90
|
+
cpu_cores: 16
|
|
91
|
+
ram_gb: 64
|
|
92
|
+
disk_gb: 150
|
|
93
|
+
gpu_count: 1
|
|
94
|
+
gpu_vram_gb: 80
|
|
95
|
+
recommended:
|
|
96
|
+
cpu_cores: 32
|
|
97
|
+
ram_gb: 128
|
|
98
|
+
disk_gb: 300
|
|
99
|
+
gpu_count: 2
|
|
100
|
+
gpu_vram_gb: 80
|
|
101
|
+
environment:
|
|
102
|
+
python: 3.10.16
|
|
103
|
+
cuda: null
|
|
104
|
+
pytorch: 2.4.1
|
|
105
|
+
key_packages:
|
|
106
|
+
- jax==0.5.0
|
|
107
|
+
- jaxlib==0.5.0
|
|
108
|
+
- transformers==4.44.2
|
|
109
|
+
- transformer-lens==2.11.0
|
|
110
|
+
- pygraphviz==1.14
|
|
111
|
+
- accelerate==0.34.2
|
|
112
|
+
- einops==0.8.0
|
|
113
|
+
- datasets==3.0.0
|
|
114
|
+
notes:
|
|
115
|
+
- Graphviz system package is a hard prerequisite for circuit visualization (pygraphviz).
|
|
116
|
+
- Install via install.sh or manually (brew install graphviz / apt-get install graphviz).
|
|
117
|
+
- Full dependency list in environment.yml (conda) or pip fallback documented in
|
|
118
|
+
README.
|
|
119
|
+
- CUDA version is not pinned; JAX and PyTorch will select an appropriate CUDA backend.
|
|
120
|
+
risk_flags:
|
|
121
|
+
- gpu_intensive
|
|
122
|
+
- long_running
|
|
123
|
+
- model_download_required
|
|
124
|
+
risk_notes:
|
|
125
|
+
- Case Study III (sports understanding with larger models) requires two A100 80 GB
|
|
126
|
+
GPUs; single-GPU setups cannot run it.
|
|
127
|
+
- Full 64-subtask circuit discovery is computationally expensive (edge ablation over
|
|
128
|
+
the entire computational graph per subtask).
|
|
129
|
+
- No benchmark execution was performed during the packaging pass; metric values are
|
|
130
|
+
code-backed but unverified at runtime.
|
|
131
|
+
- The --model_path flag expects pre-downloaded weights; without it, the script attempts
|
|
132
|
+
HuggingFace Hub download which may fail for gated models.
|
|
133
|
+
- pygraphviz installation can fail if Graphviz headers are not available on the system.
|
|
134
|
+
recommended_when: 'Use this benchmark when you want a mechanistic-interpretability
|
|
135
|
+
task that extracts soft circuits via edge-ablation, clusters subtasks by circuit
|
|
136
|
+
similarity, and quantitatively links circuit stability to generalization. It is
|
|
137
|
+
well-suited for studying arithmetic, Boolean, or compositional reasoning in transformer
|
|
138
|
+
models and for testing whether optimization improvements preserve circuit-level
|
|
139
|
+
structure.
|
|
140
|
+
|
|
141
|
+
'
|
|
142
|
+
not_recommended_when: 'Do not use this if you cannot provision at least one A100 80
|
|
143
|
+
GB GPU, if you need a lightweight fine-tuning benchmark, or if you are looking for
|
|
144
|
+
a task with externally provided static evaluation datasets rather than procedurally
|
|
145
|
+
generated inputs.
|
|
146
|
+
|
|
147
|
+
'
|
|
148
|
+
paper:
|
|
149
|
+
title: Circuit Stability Characterizes Language Model Generalization
|
|
150
|
+
authors:
|
|
151
|
+
- Alan Sun
|
|
152
|
+
venue: ACL 2025
|
|
153
|
+
year: 2025
|
|
154
|
+
url: https://openreview.net/forum?id=2914j8175g
|
|
155
|
+
arxiv: https://arxiv.org/abs/2505.24731
|
|
156
|
+
download:
|
|
157
|
+
url: https://github.com/ResearAI/DeepScientist/releases/download/aisb-v0.0.1/aisb.t3.031_circuitstability.zip
|
|
158
|
+
archive_type: zip
|
|
159
|
+
local_dir_name: paper-31-CircuitStability
|
|
160
|
+
provider: github_release
|
|
161
|
+
repo: ResearAI/DeepScientist
|
|
162
|
+
tag: aisb-v0.0.1
|
|
163
|
+
asset_name: aisb.t3.031_circuitstability.zip
|
|
164
|
+
sha256: 549af47824f5e0af3f66e650ca1b91d28b02dc534b23ac71ddd4c84a70a1ee7e
|
|
165
|
+
size_bytes: 2108204
|
|
166
|
+
display:
|
|
167
|
+
palette_seed: graphite-gold-circuit
|
|
168
|
+
art_style: interpretability-lab
|
|
169
|
+
accent_priority: high
|
|
170
|
+
image_path: ../image/031_aisb.t3.031_circuitstability.jpg
|
|
171
|
+
commercial:
|
|
172
|
+
annual_fee: null
|
|
@@ -0,0 +1,134 @@
|
|
|
1
|
+
schema_version: 1
|
|
2
|
+
id: aisb.t3.031_circuitstability
|
|
3
|
+
name: 电路稳定性刻画语言模型泛化能力
|
|
4
|
+
version: 0.1.0
|
|
5
|
+
one_line: '从语言模型(gemma-2-2b及其他)中通过边消融归因提取软电路,测量算术/布尔/体育子任务的电路稳定性,并将稳定性与泛化性能相关联。
|
|
6
|
+
|
|
7
|
+
'
|
|
8
|
+
task_description: '该基准测试评估语言模型内部电路在结构化子任务划分下是否保持稳定,以及该稳定性是否可预测泛化能力。工作流程分为三个阶段:(1)对算术子任务(1×1到8×8的数字对组合)运行gemma-2-2b基线评估,使用少样本提示,产生每个子任务的精确匹配准确率;(2)通过噪声到清晰的边消融(EAP-IG)对每个子任务在模型的计算图上进行软电路发现,为每条边产生实值重要性分数;(3)计算不同子任务软电路之间的Spearman ρ,应用α等价聚类,并将电路稳定性与准确率波动进行相关性分析。该论文将此方法扩展到布尔表达式评估和体育理解。主要可执行入口是src/eval_arith_baseline.sh,评估四个关键数字组合(1-1、1-8、8-1、8-8)并从pickle输出中计算准确率。电路发现在src/experiments/circuit_discovery.py中实现,并在notebooks/circuit-discovery.ipynb中进行交互式探索。该基准测试是自包含的:数据在运行时程序化生成(无需外部数据集下载),模型通过HuggingFace transformers/TransformerLens加载,评估在本地进行。
|
|
9
|
+
|
|
10
|
+
'
|
|
11
|
+
capability_tags:
|
|
12
|
+
- research_code_optimization
|
|
13
|
+
- mechanistic_interpretability
|
|
14
|
+
- large_language_models
|
|
15
|
+
- circuit_analysis
|
|
16
|
+
- evaluation
|
|
17
|
+
aisb_direction: T3
|
|
18
|
+
track_fit:
|
|
19
|
+
- paper_track
|
|
20
|
+
- benchmark_track
|
|
21
|
+
task_mode: evaluation_driven
|
|
22
|
+
requires_execution: true
|
|
23
|
+
requires_paper: true
|
|
24
|
+
integrity_level: cas_plus_canary
|
|
25
|
+
snapshot_status: runnable
|
|
26
|
+
support_level: advanced
|
|
27
|
+
cost_band: high
|
|
28
|
+
time_band: 1d+
|
|
29
|
+
difficulty: hard
|
|
30
|
+
data_access: public
|
|
31
|
+
primary_outputs:
|
|
32
|
+
- accuracy_11
|
|
33
|
+
- accuracy_18
|
|
34
|
+
- accuracy_81
|
|
35
|
+
- accuracy_88
|
|
36
|
+
- circuit_stability_report
|
|
37
|
+
- generalization_analysis
|
|
38
|
+
launch_profiles:
|
|
39
|
+
- id: quick_eval
|
|
40
|
+
label: 快速评估(算术基线)
|
|
41
|
+
description: '运行src/eval_arith_baseline.sh对四个算术子任务数字组合(1-1、1-8、8-1、8-8)评估gemma-2-2b,每个组合1000个样本。需要一块A100 80 GB GPU。从pickle模型输出中产生每个子任务的准确率分数。
|
|
42
|
+
|
|
43
|
+
'
|
|
44
|
+
- id: full_analysis
|
|
45
|
+
label: 完整电路稳定性分析
|
|
46
|
+
description: '运行完整工作流程:所有64个子任务对的基线评估、每个子任务的EAP-IG软电路发现、α等价聚类、Spearman ρ计算、t-SNE可视化和泛化相关性分析。案例研究I和II需要一块A100 80 GB;案例研究III需要两块A100 80 GB GPU。根据GPU数量和子任务覆盖范围预计需要12-24小时以上。
|
|
47
|
+
|
|
48
|
+
'
|
|
49
|
+
dataset_download:
|
|
50
|
+
primary_method: procedural
|
|
51
|
+
sources: []
|
|
52
|
+
notes:
|
|
53
|
+
- 所有评估数据(算术、布尔、体育)在运行时程序化生成。
|
|
54
|
+
- 无需外部数据集下载。
|
|
55
|
+
- 模型权重(gemma-2-2b)从HuggingFace Hub获取,或通过--model_path从本地路径加载。
|
|
56
|
+
credential_requirements:
|
|
57
|
+
mode: optional
|
|
58
|
+
items:
|
|
59
|
+
- HuggingFace令牌(仅当gemma-2-2b需要门控访问时)
|
|
60
|
+
notes:
|
|
61
|
+
- 如果模型权重已预下载到本地路径,则无需凭据。
|
|
62
|
+
- 在评估脚本中设置--model_path以使用本地权重。
|
|
63
|
+
resources:
|
|
64
|
+
minimum:
|
|
65
|
+
cpu_cores: 16
|
|
66
|
+
ram_gb: 64
|
|
67
|
+
disk_gb: 150
|
|
68
|
+
gpu_count: 1
|
|
69
|
+
gpu_vram_gb: 80
|
|
70
|
+
recommended:
|
|
71
|
+
cpu_cores: 32
|
|
72
|
+
ram_gb: 128
|
|
73
|
+
disk_gb: 300
|
|
74
|
+
gpu_count: 2
|
|
75
|
+
gpu_vram_gb: 80
|
|
76
|
+
environment:
|
|
77
|
+
python: 3.10.16
|
|
78
|
+
cuda: null
|
|
79
|
+
pytorch: 2.4.1
|
|
80
|
+
key_packages:
|
|
81
|
+
- jax==0.5.0
|
|
82
|
+
- jaxlib==0.5.0
|
|
83
|
+
- transformers==4.44.2
|
|
84
|
+
- transformer-lens==2.11.0
|
|
85
|
+
- pygraphviz==1.14
|
|
86
|
+
- accelerate==0.34.2
|
|
87
|
+
- einops==0.8.0
|
|
88
|
+
- datasets==3.0.0
|
|
89
|
+
notes:
|
|
90
|
+
- Graphviz系统包是电路可视化的硬性前提条件(pygraphviz)。
|
|
91
|
+
- 通过install.sh或手动安装(brew install graphviz / apt-get install graphviz)。
|
|
92
|
+
- 完整依赖列表在environment.yml(conda)或README中记录的pip备选方案中。
|
|
93
|
+
- CUDA版本未固定;JAX和PyTorch将选择合适的CUDA后端。
|
|
94
|
+
risk_flags:
|
|
95
|
+
- gpu_intensive
|
|
96
|
+
- long_running
|
|
97
|
+
- model_download_required
|
|
98
|
+
risk_notes:
|
|
99
|
+
- 案例研究III(使用更大模型的体育理解)需要两块A100 80 GB GPU;单GPU设置无法运行。
|
|
100
|
+
- 完整的64子任务电路发现计算成本高昂(每个子任务在完整计算图上进行边消融)。
|
|
101
|
+
- 打包过程中未执行基准测试;指标值有代码支持但未在运行时验证。
|
|
102
|
+
- --model_path标志期望预下载的权重;未提供该标志时,脚本会尝试从HuggingFace Hub下载,对于门控模型可能会失败。
|
|
103
|
+
- 如果系统上缺少Graphviz头文件,pygraphviz安装可能失败。
|
|
104
|
+
recommended_when: '当您需要进行机械可解释性任务,通过边消融提取软电路、按电路相似性对子任务进行聚类,并定量地将电路稳定性与泛化能力关联时,使用此基准测试。它非常适合研究Transformer模型中的算术、布尔或组合推理,以及测试优化改进是否保留电路级结构。
|
|
105
|
+
|
|
106
|
+
'
|
|
107
|
+
not_recommended_when: '如果无法配置至少一块A100 80 GB GPU、需要轻量级微调基准测试,或正在寻找具有外部提供的静态评估数据集而非程序化生成输入的任务,请勿使用此基准测试。
|
|
108
|
+
|
|
109
|
+
'
|
|
110
|
+
paper:
|
|
111
|
+
title: Circuit Stability Characterizes Language Model Generalization
|
|
112
|
+
authors:
|
|
113
|
+
- Alan Sun
|
|
114
|
+
venue: ACL 2025
|
|
115
|
+
year: 2025
|
|
116
|
+
url: https://openreview.net/forum?id=2914j8175g
|
|
117
|
+
arxiv: https://arxiv.org/abs/2505.24731
|
|
118
|
+
download:
|
|
119
|
+
url: https://github.com/ResearAI/DeepScientist/releases/download/aisb-v0.0.1/aisb.t3.031_circuitstability.zip
|
|
120
|
+
archive_type: zip
|
|
121
|
+
local_dir_name: paper-31-CircuitStability
|
|
122
|
+
provider: github_release
|
|
123
|
+
repo: ResearAI/DeepScientist
|
|
124
|
+
tag: aisb-v0.0.1
|
|
125
|
+
asset_name: aisb.t3.031_circuitstability.zip
|
|
126
|
+
sha256: 549af47824f5e0af3f66e650ca1b91d28b02dc534b23ac71ddd4c84a70a1ee7e
|
|
127
|
+
size_bytes: 2108204
|
|
128
|
+
display:
|
|
129
|
+
palette_seed: graphite-gold-circuit
|
|
130
|
+
art_style: interpretability-lab
|
|
131
|
+
accent_priority: high
|
|
132
|
+
image_path: ../image/031_aisb.t3.031_circuitstability.jpg
|
|
133
|
+
commercial:
|
|
134
|
+
annual_fee: null
|
|
@@ -0,0 +1,169 @@
|
|
|
1
|
+
schema_version: 1
|
|
2
|
+
id: aisb.t3.032_ptsolver
|
|
3
|
+
name: 'Personal Travel Solver: RealTravel Data Foundation'
|
|
4
|
+
version: 0.1.0
|
|
5
|
+
one_line: 'Data-only snapshot of the RealTravel dataset (1,000 test + 155 validation
|
|
6
|
+
samples across 77 US cities) with LLM-driven user-preference extraction scripts;
|
|
7
|
+
the full PTS planner and SCIP solver are not bundled.
|
|
8
|
+
|
|
9
|
+
'
|
|
10
|
+
task_description: 'This benchmark packages the RealTravel dataset—an extension of
|
|
11
|
+
the TravelPlanner benchmark that incorporates real user reviews and POI metadata
|
|
12
|
+
from Google Local for 77 US cities. The snapshot contains structured travel queries,
|
|
13
|
+
user review histories, POI databases (restaurants, attractions, accommodations),
|
|
14
|
+
and four Python data-processing scripts that use LLM APIs (GPT-4o / DeepSeek) to
|
|
15
|
+
extract user preferences, summarize profiles, standardize preference tags, and generate
|
|
16
|
+
POI pro/con descriptions. The full Personal Travel Solver (PTS) system—including
|
|
17
|
+
the Translator, Search, Re-rank (SASRec), and Planning (SCIP solver) modules—is
|
|
18
|
+
NOT included in this snapshot. The primary metric (pass_rate) is blocked because
|
|
19
|
+
no executable evaluation or planning code is present. To use this benchmark as a
|
|
20
|
+
runnable evaluation, the downstream PTS planner must be restored from the paper''s
|
|
21
|
+
external code release.
|
|
22
|
+
|
|
23
|
+
'
|
|
24
|
+
capability_tags:
|
|
25
|
+
- research_code_optimization
|
|
26
|
+
- travel_planning
|
|
27
|
+
- llm_tooling
|
|
28
|
+
- constraint_solving
|
|
29
|
+
- personalization
|
|
30
|
+
aisb_direction: T3
|
|
31
|
+
track_fit:
|
|
32
|
+
- paper_track
|
|
33
|
+
- benchmark_track
|
|
34
|
+
task_mode: evaluation_driven
|
|
35
|
+
requires_execution: true
|
|
36
|
+
requires_paper: true
|
|
37
|
+
integrity_level: cas_plus_canary
|
|
38
|
+
snapshot_status: data_only
|
|
39
|
+
support_level: recovery
|
|
40
|
+
cost_band: low
|
|
41
|
+
time_band: 1-2h
|
|
42
|
+
difficulty: medium
|
|
43
|
+
data_access: public
|
|
44
|
+
primary_outputs:
|
|
45
|
+
- user_profiles
|
|
46
|
+
- poi_summaries
|
|
47
|
+
- structured_travel_data
|
|
48
|
+
- preference_tags
|
|
49
|
+
launch_profiles:
|
|
50
|
+
- id: data_prep
|
|
51
|
+
label: Data Preparation
|
|
52
|
+
description: 'Run the four bundled data-processing scripts to extract user preferences,
|
|
53
|
+
summarize profiles, standardize tags, and generate POI pro/con descriptions. Requires
|
|
54
|
+
LLM API credentials (OpenAI or DeepSeek). No GPU needed.
|
|
55
|
+
|
|
56
|
+
'
|
|
57
|
+
- id: planner_restore
|
|
58
|
+
label: Planner Restore
|
|
59
|
+
description: 'Restore the full PTS pipeline (Translator, Search, Re-rank with SASRec,
|
|
60
|
+
Planning with SCIP solver) from the paper''s external code release before treating
|
|
61
|
+
this benchmark as a runnable end-to-end evaluation.
|
|
62
|
+
|
|
63
|
+
'
|
|
64
|
+
dataset_download:
|
|
65
|
+
primary_method: bundled
|
|
66
|
+
sources:
|
|
67
|
+
- kind: archive
|
|
68
|
+
url: https://deepscientist.cc/AISB/032_ptsolver
|
|
69
|
+
access: public
|
|
70
|
+
note: 'ZIP archive containing RealTravel dataset (1,000 test / 155 validation
|
|
71
|
+
samples), POI databases for 77 cities, user review data, and data-processing
|
|
72
|
+
scripts.
|
|
73
|
+
|
|
74
|
+
'
|
|
75
|
+
notes:
|
|
76
|
+
- The dataset is derived from Google Local (Yan et al., 2023) and TravelPlanner
|
|
77
|
+
(Xie et al., 2024).
|
|
78
|
+
- Database subdirectories contain accommodations, attractions, restaurants, and
|
|
79
|
+
background data.
|
|
80
|
+
- Total disk footprint is modest (under 20 GB uncompressed).
|
|
81
|
+
credential_requirements:
|
|
82
|
+
mode: api_keys
|
|
83
|
+
items:
|
|
84
|
+
- OpenAI API key (GPT-4o) or DeepSeek API key for running data-processing scripts
|
|
85
|
+
- Optional ChatGLM / SiliconFlow / Yi API keys referenced in attraction-pro-con.py
|
|
86
|
+
notes:
|
|
87
|
+
- Scripts read keys from environment variables (Gpt_API_KEY, DEEPSEEK_API_KEY, CHATGLM_API_KEY).
|
|
88
|
+
- Some hardcoded API keys appear in the source code; treat them as leaked credentials (possibly expired or invalid) and supply your own keys instead.
|
|
89
|
+
- No credentials needed if you only inspect the pre-existing dataset files.
|
|
90
|
+
resources:
|
|
91
|
+
minimum:
|
|
92
|
+
cpu_cores: 8
|
|
93
|
+
ram_gb: 16
|
|
94
|
+
disk_gb: 20
|
|
95
|
+
gpu_count: 0
|
|
96
|
+
gpu_vram_gb: 0
|
|
97
|
+
recommended:
|
|
98
|
+
cpu_cores: 16
|
|
99
|
+
ram_gb: 32
|
|
100
|
+
disk_gb: 50
|
|
101
|
+
gpu_count: 0
|
|
102
|
+
gpu_vram_gb: 0
|
|
103
|
+
environment:
|
|
104
|
+
python: '3.10'
|
|
105
|
+
cuda: null
|
|
106
|
+
pytorch: null
|
|
107
|
+
flash_attn: null
|
|
108
|
+
key_packages:
|
|
109
|
+
- langchain
|
|
110
|
+
- openai
|
|
111
|
+
- tqdm
|
|
112
|
+
notes:
|
|
113
|
+
- CPU-only execution is sufficient for all bundled scripts.
|
|
114
|
+
- The data-processing scripts use langchain ChatOpenAI and openai client libraries.
|
|
115
|
+
- Full PTS system (not bundled) would additionally require PySCIPOpt, SASRec, BGE
|
|
116
|
+
embeddings, and scikit-learn for PCA.
|
|
117
|
+
- See bundled requirements files for the complete dependency set.
|
|
118
|
+
risk_flags:
|
|
119
|
+
- blocked_metric
|
|
120
|
+
- incomplete_pipeline
|
|
121
|
+
- api_key_exposure
|
|
122
|
+
- external_code_dependency
|
|
123
|
+
risk_notes:
|
|
124
|
+
- The primary metric (pass_rate) is blocked—no evaluation or planning code is in the
|
|
125
|
+
snapshot.
|
|
126
|
+
- The full PTS system (5 modules) is not bundled; only data-processing scripts are
|
|
127
|
+
present.
|
|
128
|
+
- Hardcoded API keys appear in attraction-pro-con.py; these may be leaked credentials
|
|
129
|
+
and should not be reused.
|
|
130
|
+
- Running data-processing scripts will incur LLM API costs proportional to the number
|
|
131
|
+
of users/POIs processed.
|
|
132
|
+
- The Re-rank module (SASRec + BGE embeddings) and Planning module (SCIP solver) must
|
|
133
|
+
be restored externally.
|
|
134
|
+
recommended_when: 'Use this benchmark when you want a realistic travel-planning dataset
|
|
135
|
+
grounded in real user reviews and POI data, for tasks that mix symbolic constraint
|
|
136
|
+
satisfaction with user preference modeling, without requiring heavyweight GPU training.
|
|
137
|
+
Suitable for evaluating LLM-based preference extraction, user profiling, or as a
|
|
138
|
+
data foundation for building constraint-based travel planners.
|
|
139
|
+
|
|
140
|
+
'
|
|
141
|
+
not_recommended_when: 'Do not use this if you need a fully self-contained, end-to-end
|
|
142
|
+
runnable benchmark. The planning and evaluation pipeline is not bundled. Also not
|
|
143
|
+
suitable if you need a benchmark focused on large-scale model fine-tuning, multimodal
|
|
144
|
+
data, or non-US travel destinations.
|
|
145
|
+
|
|
146
|
+
'
|
|
147
|
+
paper:
|
|
148
|
+
title: 'Personal Travel Solver: A Preference-Driven LLM-Solver System for Travel
|
|
149
|
+
Planning'
|
|
150
|
+
venue: ACL 2025
|
|
151
|
+
year: 2025
|
|
152
|
+
url: https://aclanthology.org/2025.acl-long.1339/
|
|
153
|
+
download:
|
|
154
|
+
url: https://github.com/ResearAI/DeepScientist/releases/download/aisb-v0.0.1/aisb.t3.032_ptsolver.zip
|
|
155
|
+
archive_type: zip
|
|
156
|
+
local_dir_name: paper-32-PTSolver
|
|
157
|
+
provider: github_release
|
|
158
|
+
repo: ResearAI/DeepScientist
|
|
159
|
+
tag: aisb-v0.0.1
|
|
160
|
+
asset_name: aisb.t3.032_ptsolver.zip
|
|
161
|
+
sha256: 26f7f39e12eb28552ada092809b289f77090a229d0b745e9dfbcbb7b7b4f9d5c
|
|
162
|
+
size_bytes: 38141816
|
|
163
|
+
commercial:
|
|
164
|
+
annual_fee: null
|
|
165
|
+
display:
|
|
166
|
+
palette_seed: sand-teal-itinerary
|
|
167
|
+
art_style: trip-planner
|
|
168
|
+
accent_priority: medium
|
|
169
|
+
image_path: ../image/032_aisb.t3.032_ptsolver.jpg
|