@researai/deepscientist 1.5.16 → 1.6.0
This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- package/AGENTS.md +309 -130
- package/AISB/catalog/aisb.b1.agentic_coding.yaml +244 -0
- package/AISB/catalog/aisb.b10.climate_earth.yaml +235 -0
- package/AISB/catalog/aisb.b11.model_efficiency.yaml +231 -0
- package/AISB/catalog/aisb.b12.embodied_ai.yaml +238 -0
- package/AISB/catalog/aisb.b2.agent_systems.yaml +229 -0
- package/AISB/catalog/aisb.b3.self_evolving_rl.yaml +237 -0
- package/AISB/catalog/aisb.b4.lm_reasoning.yaml +240 -0
- package/AISB/catalog/aisb.b5.math_proof.yaml +235 -0
- package/AISB/catalog/aisb.b6.research_process.yaml +243 -0
- package/AISB/catalog/aisb.b7.multimodal_fusion.yaml +232 -0
- package/AISB/catalog/aisb.b8.lifesci_drug.yaml +275 -0
- package/AISB/catalog/aisb.b9.material_science.yaml +237 -0
- package/AISB/catalog/aisb.t3.001_savvy.yaml +159 -0
- package/AISB/catalog/aisb.t3.001_savvy.zh.yaml +121 -0
- package/AISB/catalog/aisb.t3.002_pinet.yaml +189 -0
- package/AISB/catalog/aisb.t3.002_pinet.zh.yaml +130 -0
- package/AISB/catalog/aisb.t3.004_decentralattn.yaml +184 -0
- package/AISB/catalog/aisb.t3.004_decentralattn.zh.yaml +153 -0
- package/AISB/catalog/aisb.t3.005_tsae.yaml +193 -0
- package/AISB/catalog/aisb.t3.005_tsae.zh.yaml +139 -0
- package/AISB/catalog/aisb.t3.006_physense.yaml +194 -0
- package/AISB/catalog/aisb.t3.006_physense.zh.yaml +118 -0
- package/AISB/catalog/aisb.t3.007_reasoningiqa.yaml +169 -0
- package/AISB/catalog/aisb.t3.007_reasoningiqa.zh.yaml +133 -0
- package/AISB/catalog/aisb.t3.008_meanflows.yaml +188 -0
- package/AISB/catalog/aisb.t3.008_meanflows.zh.yaml +140 -0
- package/AISB/catalog/aisb.t3.009_scoremissing.yaml +179 -0
- package/AISB/catalog/aisb.t3.009_scoremissing.zh.yaml +119 -0
- package/AISB/catalog/aisb.t3.010_suitabilityfilter.yaml +221 -0
- package/AISB/catalog/aisb.t3.010_suitabilityfilter.zh.yaml +141 -0
- package/AISB/catalog/aisb.t3.011_osd.yaml +206 -0
- package/AISB/catalog/aisb.t3.011_osd.zh.yaml +163 -0
- package/AISB/catalog/aisb.t3.012_efficientqat.yaml +206 -0
- package/AISB/catalog/aisb.t3.012_efficientqat.zh.yaml +159 -0
- package/AISB/catalog/aisb.t3.013_appl.yaml +152 -0
- package/AISB/catalog/aisb.t3.013_appl.zh.yaml +126 -0
- package/AISB/catalog/aisb.t3.014_piguard.yaml +207 -0
- package/AISB/catalog/aisb.t3.014_piguard.zh.yaml +164 -0
- package/AISB/catalog/aisb.t3.015_frspec.yaml +209 -0
- package/AISB/catalog/aisb.t3.015_frspec.zh.yaml +163 -0
- package/AISB/catalog/aisb.t3.016_mathfusion.yaml +166 -0
- package/AISB/catalog/aisb.t3.016_mathfusion.zh.yaml +145 -0
- package/AISB/catalog/aisb.t3.017_multimodalglp.yaml +171 -0
- package/AISB/catalog/aisb.t3.017_multimodalglp.zh.yaml +122 -0
- package/AISB/catalog/aisb.t3.018_cotsynth.yaml +206 -0
- package/AISB/catalog/aisb.t3.018_cotsynth.zh.yaml +162 -0
- package/AISB/catalog/aisb.t3.019_dyscaleut.yaml +211 -0
- package/AISB/catalog/aisb.t3.019_dyscaleut.zh.yaml +148 -0
- package/AISB/catalog/aisb.t3.020_aristotle.yaml +173 -0
- package/AISB/catalog/aisb.t3.020_aristotle.zh.yaml +119 -0
- package/AISB/catalog/aisb.t3.021_tokenrecycling.yaml +160 -0
- package/AISB/catalog/aisb.t3.021_tokenrecycling.zh.yaml +129 -0
- package/AISB/catalog/aisb.t3.022_chainofreasoning.yaml +204 -0
- package/AISB/catalog/aisb.t3.022_chainofreasoning.zh.yaml +161 -0
- package/AISB/catalog/aisb.t3.023_guidedembed.yaml +211 -0
- package/AISB/catalog/aisb.t3.023_guidedembed.zh.yaml +189 -0
- package/AISB/catalog/aisb.t3.024_outputcentric.yaml +148 -0
- package/AISB/catalog/aisb.t3.024_outputcentric.zh.yaml +131 -0
- package/AISB/catalog/aisb.t3.025_deeper.yaml +143 -0
- package/AISB/catalog/aisb.t3.025_deeper.zh.yaml +116 -0
- package/AISB/catalog/aisb.t3.026_gartkg.yaml +195 -0
- package/AISB/catalog/aisb.t3.026_gartkg.zh.yaml +127 -0
- package/AISB/catalog/aisb.t3.027_citeeval.yaml +182 -0
- package/AISB/catalog/aisb.t3.027_citeeval.zh.yaml +135 -0
- package/AISB/catalog/aisb.t3.028_sbam.yaml +206 -0
- package/AISB/catalog/aisb.t3.028_sbam.zh.yaml +166 -0
- package/AISB/catalog/aisb.t3.029_cdqgeoembed.yaml +224 -0
- package/AISB/catalog/aisb.t3.029_cdqgeoembed.zh.yaml +142 -0
- package/AISB/catalog/aisb.t3.030_processrm.yaml +211 -0
- package/AISB/catalog/aisb.t3.030_processrm.zh.yaml +166 -0
- package/AISB/catalog/aisb.t3.031_circuitstability.yaml +172 -0
- package/AISB/catalog/aisb.t3.031_circuitstability.zh.yaml +134 -0
- package/AISB/catalog/aisb.t3.032_ptsolver.yaml +169 -0
- package/AISB/catalog/aisb.t3.032_ptsolver.zh.yaml +135 -0
- package/AISB/catalog/aisb.t3.033_gcse.yaml +144 -0
- package/AISB/catalog/aisb.t3.033_gcse.zh.yaml +126 -0
- package/AISB/catalog/aisb.t3.034_ensemblewm.yaml +183 -0
- package/AISB/catalog/aisb.t3.034_ensemblewm.zh.yaml +146 -0
- package/AISB/catalog/aisb.t3.035_moralvalueswa.yaml +207 -0
- package/AISB/catalog/aisb.t3.035_moralvalueswa.zh.yaml +165 -0
- package/AISB/catalog/aisb.t3.036_weakstrongpref.yaml +210 -0
- package/AISB/catalog/aisb.t3.036_weakstrongpref.zh.yaml +194 -0
- package/AISB/catalog/aisb.t3.037_dementiamask.yaml +172 -0
- package/AISB/catalog/aisb.t3.037_dementiamask.zh.yaml +132 -0
- package/AISB/catalog/aisb.t3.038_tinysam.yaml +284 -0
- package/AISB/catalog/aisb.t3.038_tinysam.zh.yaml +240 -0
- package/AISB/catalog/aisb.t3.039_calf.yaml +224 -0
- package/AISB/catalog/aisb.t3.039_calf.zh.yaml +194 -0
- package/AISB/catalog/aisb.t3.040_graniteguardian.yaml +199 -0
- package/AISB/catalog/aisb.t3.040_graniteguardian.zh.yaml +174 -0
- package/AISB/catalog/aisb.t3.041_amdm.yaml +149 -0
- package/AISB/catalog/aisb.t3.041_amdm.zh.yaml +137 -0
- package/AISB/catalog/aisb.t3.042_xpatch.yaml +216 -0
- package/AISB/catalog/aisb.t3.042_xpatch.zh.yaml +182 -0
- package/AISB/catalog/aisb.t3.043_vhm.yaml +268 -0
- package/AISB/catalog/aisb.t3.043_vhm.zh.yaml +193 -0
- package/AISB/catalog/aisb.t3.044_rgvi.yaml +224 -0
- package/AISB/catalog/aisb.t3.044_rgvi.zh.yaml +176 -0
- package/AISB/catalog/aisb.t3.045_pslstm.yaml +203 -0
- package/AISB/catalog/aisb.t3.045_pslstm.zh.yaml +179 -0
- package/AISB/catalog/aisb.t3.046_nonstatts.yaml +208 -0
- package/AISB/catalog/aisb.t3.046_nonstatts.zh.yaml +194 -0
- package/AISB/catalog/aisb.t3.047_timepfn.yaml +156 -0
- package/AISB/catalog/aisb.t3.047_timepfn.zh.yaml +124 -0
- package/AISB/catalog/aisb.t3.048_proxyspex.yaml +148 -0
- package/AISB/catalog/aisb.t3.048_proxyspex.zh.yaml +125 -0
- package/AISB/catalog/aisb.t3.049_hogwildinference.yaml +183 -0
- package/AISB/catalog/aisb.t3.049_hogwildinference.zh.yaml +138 -0
- package/AISB/catalog/aisb.t3.050_causalpfn.yaml +214 -0
- package/AISB/catalog/aisb.t3.050_causalpfn.zh.yaml +190 -0
- package/AISB/catalog/aisb.t3.051_flashtp.yaml +169 -0
- package/AISB/catalog/aisb.t3.051_flashtp.zh.yaml +124 -0
- package/AISB/catalog/aisb.t3.052_nsdiff.yaml +155 -0
- package/AISB/catalog/aisb.t3.052_nsdiff.zh.yaml +138 -0
- package/AISB/catalog/aisb.t3.053_k2vae.yaml +158 -0
- package/AISB/catalog/aisb.t3.053_k2vae.zh.yaml +132 -0
- package/AISB/catalog/aisb.t3.054_timebase.yaml +178 -0
- package/AISB/catalog/aisb.t3.054_timebase.zh.yaml +158 -0
- package/AISB/catalog/aisb.t3.055_csbrain.yaml +238 -0
- package/AISB/catalog/aisb.t3.055_csbrain.zh.yaml +184 -0
- package/AISB/catalog/aisb.t3.056_infosam.yaml +224 -0
- package/AISB/catalog/aisb.t3.056_infosam.zh.yaml +189 -0
- package/AISB/catalog/aisb.t3.057_mdreid.yaml +129 -0
- package/AISB/catalog/aisb.t3.057_mdreid.zh.yaml +117 -0
- package/AISB/catalog/aisb.t3.058_mindglitch.yaml +171 -0
- package/AISB/catalog/aisb.t3.058_mindglitch.zh.yaml +145 -0
- package/AISB/catalog/aisb.t3.059_selfsupervised.yaml +154 -0
- package/AISB/catalog/aisb.t3.059_selfsupervised.zh.yaml +125 -0
- package/AISB/catalog/aisb.t3.060_iaggad.yaml +121 -0
- package/AISB/catalog/aisb.t3.060_iaggad.zh.yaml +100 -0
- package/AISB/catalog/aisb.t3.061_hsgkn.yaml +136 -0
- package/AISB/catalog/aisb.t3.061_hsgkn.zh.yaml +113 -0
- package/AISB/catalog/aisb.t3.062_visionts.yaml +237 -0
- package/AISB/catalog/aisb.t3.062_visionts.zh.yaml +216 -0
- package/AISB/catalog/aisb.t3.063_tsrag.yaml +162 -0
- package/AISB/catalog/aisb.t3.063_tsrag.zh.yaml +138 -0
- package/AISB/catalog/aisb.t3.064_pir.yaml +221 -0
- package/AISB/catalog/aisb.t3.064_pir.zh.yaml +197 -0
- package/AISB/catalog/aisb.t3.065_proteinbinding.yaml +234 -0
- package/AISB/catalog/aisb.t3.065_proteinbinding.zh.yaml +167 -0
- package/AISB/catalog/aisb.t3.066_tropicalattention.yaml +267 -0
- package/AISB/catalog/aisb.t3.066_tropicalattention.zh.yaml +229 -0
- package/AISB/catalog/aisb.t3.067_kanad.yaml +193 -0
- package/AISB/catalog/aisb.t3.067_kanad.zh.yaml +167 -0
- package/AISB/catalog/aisb.t3.068_sempo.yaml +187 -0
- package/AISB/catalog/aisb.t3.068_sempo.zh.yaml +148 -0
- package/AISB/catalog/aisb.t3.069_treehfd.yaml +129 -0
- package/AISB/catalog/aisb.t3.069_treehfd.zh.yaml +111 -0
- package/AISB/catalog/aisb.t3.070_certifiedunlearning.yaml +224 -0
- package/AISB/catalog/aisb.t3.070_certifiedunlearning.zh.yaml +171 -0
- package/AISB/catalog/aisb.t3.071_neuralmjd.yaml +142 -0
- package/AISB/catalog/aisb.t3.071_neuralmjd.zh.yaml +120 -0
- package/AISB/catalog/aisb.t3.072_fedgmt.yaml +181 -0
- package/AISB/catalog/aisb.t3.072_fedgmt.zh.yaml +158 -0
- package/AISB/catalog/aisb.t3.073_rld.yaml +161 -0
- package/AISB/catalog/aisb.t3.073_rld.zh.yaml +129 -0
- package/AISB/catalog/aisb.t3.074_lsvi.yaml +163 -0
- package/AISB/catalog/aisb.t3.074_lsvi.zh.yaml +129 -0
- package/AISB/catalog/aisb.t3.075_treeslicedentropy.yaml +201 -0
- package/AISB/catalog/aisb.t3.075_treeslicedentropy.zh.yaml +148 -0
- package/AISB/catalog/aisb.t3.076_aanet.yaml +169 -0
- package/AISB/catalog/aisb.t3.076_aanet.zh.yaml +129 -0
- package/AISB/catalog/aisb.t3.077_cmnn.yaml +199 -0
- package/AISB/catalog/aisb.t3.077_cmnn.zh.yaml +165 -0
- package/AISB/catalog/aisb.t3.078_conformalanomaly.yaml +146 -0
- package/AISB/catalog/aisb.t3.078_conformalanomaly.zh.yaml +117 -0
- package/AISB/catalog/aisb.t3.079_dpfkmeans.yaml +131 -0
- package/AISB/catalog/aisb.t3.079_dpfkmeans.zh.yaml +104 -0
- package/AISB/catalog/aisb.t3.080_latentscorereweight.yaml +169 -0
- package/AISB/catalog/aisb.t3.080_latentscorereweight.zh.yaml +123 -0
- package/AISB/catalog/aisb.t3.081_qmamba.yaml +150 -0
- package/AISB/catalog/aisb.t3.081_qmamba.zh.yaml +117 -0
- package/AISB/catalog/aisb.t3.082_onlinellmrouting.yaml +160 -0
- package/AISB/catalog/aisb.t3.082_onlinellmrouting.zh.yaml +133 -0
- package/AISB/catalog/aisb.t3.083_starformer.yaml +178 -0
- package/AISB/catalog/aisb.t3.083_starformer.zh.yaml +140 -0
- package/AISB/catalog/aisb.t3.084_ift.yaml +139 -0
- package/AISB/catalog/aisb.t3.084_ift.zh.yaml +111 -0
- package/AISB/catalog/aisb.t3.085_neuralsurv.yaml +183 -0
- package/AISB/catalog/aisb.t3.085_neuralsurv.zh.yaml +143 -0
- package/AISB/catalog/aisb.t3.086_stella.yaml +197 -0
- package/AISB/catalog/aisb.t3.086_stella.zh.yaml +142 -0
- package/AISB/catalog/aisb.t3.087_moses.yaml +167 -0
- package/AISB/catalog/aisb.t3.087_moses.zh.yaml +132 -0
- package/AISB/catalog/aisb.t3.088_channelnorm.yaml +140 -0
- package/AISB/catalog/aisb.t3.088_channelnorm.zh.yaml +109 -0
- package/AISB/catalog/aisb.t3.089_causalvelocity.yaml +730 -0
- package/AISB/catalog/aisb.t3.089_causalvelocity.zh.yaml +668 -0
- package/AISB/catalog/aisb.t3.090_rstib.yaml +144 -0
- package/AISB/catalog/aisb.t3.090_rstib.zh.yaml +109 -0
- package/AISB/catalog/aisb.t3.091_timeawarecausal.yaml +132 -0
- package/AISB/catalog/aisb.t3.091_timeawarecausal.zh.yaml +107 -0
- package/AISB/catalog/aisb.t3.092_kmeanslocalopt.yaml +138 -0
- package/AISB/catalog/aisb.t3.092_kmeanslocalopt.zh.yaml +110 -0
- package/AISB/catalog/aisb.t3.093_fedwmsam.yaml +134 -0
- package/AISB/catalog/aisb.t3.093_fedwmsam.zh.yaml +106 -0
- package/AISB/catalog/aisb.t3.094_boundre.yaml +147 -0
- package/AISB/catalog/aisb.t3.094_boundre.zh.yaml +114 -0
- package/AISB/catalog/aisb.t3.095_fastfeaturecp.yaml +153 -0
- package/AISB/catalog/aisb.t3.095_fastfeaturecp.zh.yaml +118 -0
- package/AISB/catalog/aisb.t3.096_m3svm.yaml +189 -0
- package/AISB/catalog/aisb.t3.096_m3svm.zh.yaml +149 -0
- package/AISB/catalog/aisb.t3.097_wassersteintl.yaml +212 -0
- package/AISB/catalog/aisb.t3.097_wassersteintl.zh.yaml +169 -0
- package/AISB/catalog/aisb.t3.098_xmahalanobis.yaml +171 -0
- package/AISB/catalog/aisb.t3.098_xmahalanobis.zh.yaml +127 -0
- package/AISB/catalog/aisb.t3.099_ollalanding.yaml +248 -0
- package/AISB/catalog/aisb.t3.099_ollalanding.zh.yaml +182 -0
- package/AISB/catalog/aisb.t3.100_invmissingdata.yaml +179 -0
- package/AISB/catalog/aisb.t3.100_invmissingdata.zh.yaml +150 -0
- package/AISB/catalog/aisb.t3.101_acia.yaml +164 -0
- package/AISB/catalog/aisb.t3.101_acia.zh.yaml +109 -0
- package/AISB/catalog/aisb.t3.102_stochasticff.yaml +178 -0
- package/AISB/catalog/aisb.t3.102_stochasticff.zh.yaml +130 -0
- package/AISB/catalog/aisb.t3.103_qdcp.yaml +150 -0
- package/AISB/catalog/aisb.t3.103_qdcp.zh.yaml +116 -0
- package/AISB/catalog/aisb.t3.104_balancedactiveinf.yaml +137 -0
- package/AISB/catalog/aisb.t3.104_balancedactiveinf.zh.yaml +104 -0
- package/AISB/catalog/aisb.t3.105_binaryclasseval.yaml +161 -0
- package/AISB/catalog/aisb.t3.105_binaryclasseval.zh.yaml +130 -0
- package/AISB/image/001_aisb.t3.001_savvy.jpg +0 -0
- package/AISB/image/002_aisb.t3.002_pinet.jpg +0 -0
- package/AISB/image/003_aisb.t3.003_dmsqd.jpg +0 -0
- package/AISB/image/004_aisb.t3.004_decentralattn.jpg +0 -0
- package/AISB/image/005_aisb.t3.005_tsae.jpg +0 -0
- package/AISB/image/006_aisb.t3.006_physense.jpg +0 -0
- package/AISB/image/007_aisb.t3.007_reasoningiqa.jpg +0 -0
- package/AISB/image/008_aisb.t3.008_meanflows.jpg +0 -0
- package/AISB/image/009_aisb.t3.009_scoremissing.jpg +0 -0
- package/AISB/image/010_aisb.t3.010_suitabilityfilter.jpg +0 -0
- package/AISB/image/011_aisb.t3.011_osd.jpg +0 -0
- package/AISB/image/012_aisb.t3.012_efficientqat.jpg +0 -0
- package/AISB/image/013_aisb.t3.013_appl.jpg +0 -0
- package/AISB/image/014_aisb.t3.014_piguard.jpg +0 -0
- package/AISB/image/015_aisb.t3.015_frspec.jpg +0 -0
- package/AISB/image/016_aisb.t3.016_mathfusion.jpg +0 -0
- package/AISB/image/017_aisb.t3.017_multimodalglp.jpg +0 -0
- package/AISB/image/018_aisb.t3.018_cotsynth.jpg +0 -0
- package/AISB/image/019_aisb.t3.019_dyscaleut.jpg +0 -0
- package/AISB/image/020_aisb.t3.020_aristotle.jpg +0 -0
- package/AISB/image/021_aisb.t3.021_tokenrecycling.jpg +0 -0
- package/AISB/image/022_aisb.t3.022_chainofreasoning.jpg +0 -0
- package/AISB/image/023_aisb.t3.023_guidedembed.jpg +0 -0
- package/AISB/image/024_aisb.t3.024_outputcentric.jpg +0 -0
- package/AISB/image/025_aisb.t3.025_deeper.jpg +0 -0
- package/AISB/image/026_aisb.t3.026_gartkg.jpg +0 -0
- package/AISB/image/027_aisb.t3.027_citeeval.jpg +0 -0
- package/AISB/image/028_aisb.t3.028_sbam.jpg +0 -0
- package/AISB/image/029_aisb.t3.029_cdqgeoembed.jpg +0 -0
- package/AISB/image/030_aisb.t3.030_processrm.jpg +0 -0
- package/AISB/image/031_aisb.t3.031_circuitstability.jpg +0 -0
- package/AISB/image/032_aisb.t3.032_ptsolver.jpg +0 -0
- package/AISB/image/033_aisb.t3.033_gcse.jpg +0 -0
- package/AISB/image/034_aisb.t3.034_ensemblewm.jpg +0 -0
- package/AISB/image/035_aisb.t3.035_moralvalueswa.jpg +0 -0
- package/AISB/image/036_aisb.t3.036_weakstrongpref.jpg +0 -0
- package/AISB/image/037_aisb.t3.037_dementiamask.jpg +0 -0
- package/AISB/image/038_aisb.t3.038_tinysam.jpg +0 -0
- package/AISB/image/039_aisb.t3.039_calf.jpg +0 -0
- package/AISB/image/040_aisb.t3.040_graniteguardian.jpg +0 -0
- package/AISB/image/041_aisb.t3.041_amdm.jpg +0 -0
- package/AISB/image/042_aisb.t3.042_xpatch.jpg +0 -0
- package/AISB/image/043_aisb.t3.043_vhm.jpg +0 -0
- package/AISB/image/044_aisb.t3.044_rgvi.jpg +0 -0
- package/AISB/image/045_aisb.t3.045_pslstm.jpg +0 -0
- package/AISB/image/046_aisb.t3.046_nonstatts.jpg +0 -0
- package/AISB/image/047_aisb.t3.047_timepfn.jpg +0 -0
- package/AISB/image/048_aisb.t3.048_proxyspex.jpg +0 -0
- package/AISB/image/049_aisb.t3.049_hogwildinference.jpg +0 -0
- package/AISB/image/050_aisb.t3.050_causalpfn.jpg +0 -0
- package/AISB/image/051_aisb.t3.051_flashtp.jpg +0 -0
- package/AISB/image/052_aisb.t3.052_nsdiff.jpg +0 -0
- package/AISB/image/053_aisb.t3.053_k2vae.jpg +0 -0
- package/AISB/image/054_aisb.t3.054_timebase.jpg +0 -0
- package/AISB/image/055_aisb.t3.055_csbrain.jpg +0 -0
- package/AISB/image/056_aisb.t3.056_infosam.jpg +0 -0
- package/AISB/image/057_aisb.t3.057_mdreid.jpg +0 -0
- package/AISB/image/058_aisb.t3.058_mindglitch.jpg +0 -0
- package/AISB/image/059_aisb.t3.059_selfsupervised.jpg +0 -0
- package/AISB/image/060_aisb.t3.060_iaggad.jpg +0 -0
- package/AISB/image/061_aisb.t3.061_hsgkn.jpg +0 -0
- package/AISB/image/062_aisb.t3.062_visionts.jpg +0 -0
- package/AISB/image/063_aisb.t3.063_tsrag.jpg +0 -0
- package/AISB/image/064_aisb.t3.064_pir.jpg +0 -0
- package/AISB/image/065_aisb.t3.065_proteinbinding.jpg +0 -0
- package/AISB/image/066_aisb.t3.066_tropicalattention.jpg +0 -0
- package/AISB/image/067_aisb.t3.067_kanad.jpg +0 -0
- package/AISB/image/068_aisb.t3.068_sempo.jpg +0 -0
- package/AISB/image/069_aisb.t3.069_treehfd.jpg +0 -0
- package/AISB/image/070_aisb.t3.070_certifiedunlearning.jpg +0 -0
- package/AISB/image/071_aisb.t3.071_neuralmjd.jpg +0 -0
- package/AISB/image/072_aisb.t3.072_fedgmt.jpg +0 -0
- package/AISB/image/073_aisb.t3.073_rld.jpg +0 -0
- package/AISB/image/074_aisb.t3.074_lsvi.jpg +0 -0
- package/AISB/image/075_aisb.t3.075_treeslicedentropy.jpg +0 -0
- package/AISB/image/076_aisb.t3.076_aanet.jpg +0 -0
- package/AISB/image/077_aisb.t3.077_cmnn.jpg +0 -0
- package/AISB/image/078_aisb.t3.078_conformalanomaly.jpg +0 -0
- package/AISB/image/079_aisb.t3.079_dpfkmeans.jpg +0 -0
- package/AISB/image/080_aisb.t3.080_latentscorereweight.jpg +0 -0
- package/AISB/image/081_aisb.t3.081_qmamba.jpg +0 -0
- package/AISB/image/082_aisb.t3.082_onlinellmrouting.jpg +0 -0
- package/AISB/image/083_aisb.t3.083_starformer.jpg +0 -0
- package/AISB/image/084_aisb.t3.084_ift.jpg +0 -0
- package/AISB/image/085_aisb.t3.085_neuralsurv.jpg +0 -0
- package/AISB/image/086_aisb.t3.086_stella.jpg +0 -0
- package/AISB/image/087_aisb.t3.087_moses.jpg +0 -0
- package/AISB/image/088_aisb.t3.088_channelnorm.jpg +0 -0
- package/AISB/image/089_aisb.t3.089_causalvelocity.jpg +0 -0
- package/AISB/image/090_aisb.t3.090_rstib.jpg +0 -0
- package/AISB/image/091_aisb.t3.091_timeawarecausal.jpg +0 -0
- package/AISB/image/092_aisb.t3.092_kmeanslocalopt.jpg +0 -0
- package/AISB/image/093_aisb.t3.093_fedwmsam.jpg +0 -0
- package/AISB/image/094_aisb.t3.094_boundre.jpg +0 -0
- package/AISB/image/095_aisb.t3.095_fastfeaturecp.jpg +0 -0
- package/AISB/image/096_aisb.t3.096_m3svm.jpg +0 -0
- package/AISB/image/097_aisb.t3.097_wassersteintl.jpg +0 -0
- package/AISB/image/098_aisb.t3.098_xmahalanobis.jpg +0 -0
- package/AISB/image/099_aisb.t3.099_ollalanding.jpg +0 -0
- package/AISB/image/100_aisb.t3.100_invmissingdata.jpg +0 -0
- package/AISB/image/101_aisb.t3.101_acia.jpg +0 -0
- package/AISB/image/102_aisb.t3.102_stochasticff.jpg +0 -0
- package/AISB/image/103_aisb.t3.103_qdcp.jpg +0 -0
- package/AISB/image/104_aisb.t3.104_balancedactiveinf.jpg +0 -0
- package/AISB/image/105_aisb.t3.105_binaryclasseval.jpg +0 -0
- package/AISB/image/106_aisb.t1.reasoning_lite.jpg +0 -0
- package/AISB/image/107_aisb.t2.paper_audit.jpg +0 -0
- package/AISB/image/108_aisb.t3.multi_gpu_search.jpg +0 -0
- package/AISB/image/109_aisb.t3.tdc_admet.jpg +0 -0
- package/AISB/image/aisb.b1.agentic_coding.svg +16 -0
- package/AISB/image/aisb.b10.climate_earth.svg +16 -0
- package/AISB/image/aisb.b11.model_efficiency.svg +16 -0
- package/AISB/image/aisb.b12.embodied_ai.svg +16 -0
- package/AISB/image/aisb.b2.agent_systems.svg +16 -0
- package/AISB/image/aisb.b3.self_evolving_rl.svg +16 -0
- package/AISB/image/aisb.b4.lm_reasoning.svg +16 -0
- package/AISB/image/aisb.b5.math_proof.svg +16 -0
- package/AISB/image/aisb.b6.research_process.svg +16 -0
- package/AISB/image/aisb.b7.multimodal_fusion.svg +16 -0
- package/AISB/image/aisb.b8.lifesci_drug.svg +16 -0
- package/AISB/image/aisb.b9.material_science.svg +16 -0
- package/README.md +196 -32
- package/bin/ds.js +924 -66
- package/docs/en/00_QUICK_START.md +195 -18
- package/docs/en/01_SETTINGS_REFERENCE.md +468 -96
- package/docs/en/02_START_RESEARCH_GUIDE.md +26 -5
- package/docs/en/03_QQ_CONNECTOR_GUIDE.md +14 -3
- package/docs/en/04_LINGZHU_CONNECTOR_GUIDE.md +2 -0
- package/docs/en/05_TUI_GUIDE.md +171 -2
- package/docs/en/07_MEMORY_AND_MCP.md +38 -2
- package/docs/en/09_DOCTOR.md +78 -7
- package/docs/en/10_WEIXIN_CONNECTOR_GUIDE.md +38 -1
- package/docs/en/11_LICENSE_AND_RISK.md +4 -0
- package/docs/en/12_GUIDED_WORKFLOW_TOUR.md +15 -0
- package/docs/en/14_PROMPT_SKILLS_AND_MCP_GUIDE.md +9 -0
- package/docs/en/15_CODEX_PROVIDER_SETUP.md +624 -180
- package/docs/en/16_TELEGRAM_CONNECTOR_GUIDE.md +14 -0
- package/docs/en/17_WHATSAPP_CONNECTOR_GUIDE.md +14 -0
- package/docs/en/18_FEISHU_CONNECTOR_GUIDE.md +14 -0
- package/docs/en/21_LOCAL_MODEL_BACKENDS_GUIDE.md +386 -0
- package/docs/en/22_BENCHSTORE_YAML_REFERENCE.md +469 -0
- package/docs/en/23_BENCHSTORE_GITHUB_RELEASES_SPEC.md +316 -0
- package/docs/en/24_CLAUDE_CODE_PROVIDER_SETUP.md +469 -0
- package/docs/en/25_OPENCODE_PROVIDER_SETUP.md +653 -0
- package/docs/en/26_CITATION_AND_ATTRIBUTION.md +119 -0
- package/docs/en/27_KIMI_CODE_PROVIDER_SETUP.md +180 -0
- package/docs/en/28_DISCORD_CONNECTOR_GUIDE.md +61 -0
- package/docs/en/29_SLACK_CONNECTOR_GUIDE.md +60 -0
- package/docs/en/30_SETTINGS_CONTROL_CENTER_GUIDE.md +371 -0
- package/docs/en/{19_LOCAL_BROWSER_AUTH.md → 31_LOCAL_BROWSER_AUTH.md} +1 -1
- package/docs/en/32_WINDOWS_WSL2_DEPLOYMENT_GUIDE.md +273 -0
- package/docs/en/33_WORKSPACE_EXPLORER_QA.md +121 -0
- package/docs/en/91_DEVELOPMENT.md +266 -0
- package/docs/en/99_ACKNOWLEDGEMENTS.md +24 -19
- package/docs/en/README.md +48 -7
- package/docs/images/admin/admin-connectors-health-en.png +0 -0
- package/docs/images/admin/admin-controllers-en.png +0 -0
- package/docs/images/admin/admin-diagnostics-en.png +0 -0
- package/docs/images/admin/admin-errors-en.png +0 -0
- package/docs/images/admin/admin-issues-en.png +0 -0
- package/docs/images/admin/admin-logs-en.png +0 -0
- package/docs/images/admin/admin-quest-detail-en.png +0 -0
- package/docs/images/admin/admin-quests-en.png +0 -0
- package/docs/images/admin/admin-repairs-en.png +0 -0
- package/docs/images/admin/admin-runtime-en.png +0 -0
- package/docs/images/admin/admin-search-en.png +0 -0
- package/docs/images/admin/admin-stats-en.png +0 -0
- package/docs/images/admin/admin-summary-en.png +0 -0
- package/docs/images/connectors/connector-discord-en.png +0 -0
- package/docs/images/connectors/connector-feishu-en.png +0 -0
- package/docs/images/connectors/connector-lingzhu-en.png +0 -0
- package/docs/images/connectors/connector-qq-en.png +0 -0
- package/docs/images/connectors/connector-slack-en.png +0 -0
- package/docs/images/connectors/connector-telegram-en.png +0 -0
- package/docs/images/connectors/connector-weixin-en.png +0 -0
- package/docs/images/connectors/connector-whatsapp-en.png +0 -0
- package/docs/images/settings/settings-baselines-en.png +0 -0
- package/docs/images/settings/settings-config-en.png +0 -0
- package/docs/images/settings/settings-connectors-overview-en.png +0 -0
- package/docs/images/settings/settings-deepxiv-en.png +0 -0
- package/docs/images/settings/settings-mcp-servers-en.png +0 -0
- package/docs/images/settings/settings-plugins-en.png +0 -0
- package/docs/images/settings/settings-runners-en.png +0 -0
- package/docs/zh/00_QUICK_START.md +142 -18
- package/docs/zh/01_SETTINGS_REFERENCE.md +219 -98
- package/docs/zh/02_START_RESEARCH_GUIDE.md +26 -5
- package/docs/zh/05_TUI_GUIDE.md +171 -2
- package/docs/zh/07_MEMORY_AND_MCP.md +29 -2
- package/docs/zh/09_DOCTOR.md +54 -8
- package/docs/zh/10_WEIXIN_CONNECTOR_GUIDE.md +24 -1
- package/docs/zh/11_LICENSE_AND_RISK.md +4 -0
- package/docs/zh/12_GUIDED_WORKFLOW_TOUR.md +15 -0
- package/docs/zh/14_PROMPT_SKILLS_AND_MCP_GUIDE.md +9 -0
- package/docs/zh/15_CODEX_PROVIDER_SETUP.md +552 -181
- package/docs/zh/21_LOCAL_MODEL_BACKENDS_GUIDE.md +384 -0
- package/docs/zh/22_BENCHSTORE_YAML_REFERENCE.md +459 -0
- package/docs/zh/23_BENCHSTORE_GITHUB_RELEASES_SPEC.md +287 -0
- package/docs/zh/23_CLAUDE_RUNNER_GUIDE.md +103 -0
- package/docs/zh/24_CLAUDE_CODE_PROVIDER_SETUP.md +460 -0
- package/docs/zh/25_OPENCODE_PROVIDER_SETUP.md +660 -0
- package/docs/zh/26_CITATION_AND_ATTRIBUTION.md +102 -0
- package/docs/zh/27_KIMI_CODE_PROVIDER_SETUP.md +51 -0
- package/docs/zh/{19_LOCAL_BROWSER_AUTH.md → 31_LOCAL_BROWSER_AUTH.md} +1 -1
- package/docs/zh/32_WINDOWS_WSL2_DEPLOYMENT_GUIDE.md +264 -0
- package/docs/zh/33_WORKSPACE_EXPLORER_QA.md +127 -0
- package/docs/zh/99_ACKNOWLEDGEMENTS.md +23 -19
- package/docs/zh/README.md +33 -7
- package/install.sh +168 -20
- package/package.json +5 -1
- package/pyproject.toml +2 -1
- package/src/deepscientist/__init__.py +1 -1
- package/src/deepscientist/acp/envelope.py +13 -0
- package/src/deepscientist/admin/__init__.py +3 -0
- package/src/deepscientist/admin/charts.py +681 -0
- package/src/deepscientist/admin/logs.py +119 -0
- package/src/deepscientist/admin/repairs.py +217 -0
- package/src/deepscientist/admin/service.py +1310 -0
- package/src/deepscientist/admin/system_info.py +700 -0
- package/src/deepscientist/admin/tasks.py +465 -0
- package/src/deepscientist/admin/tool_metrics.py +600 -0
- package/src/deepscientist/artifact/guidance.py +8 -4
- package/src/deepscientist/artifact/schemas.py +115 -0
- package/src/deepscientist/artifact/service.py +4268 -260
- package/src/deepscientist/bash_exec/monitor.py +30 -3
- package/src/deepscientist/bash_exec/service.py +134 -1
- package/src/deepscientist/benchstore/__init__.py +4 -0
- package/src/deepscientist/benchstore/prompt_builder.py +224 -0
- package/src/deepscientist/benchstore/service.py +1716 -0
- package/src/deepscientist/bridges/connectors.py +8 -2
- package/src/deepscientist/channels/weixin_ilink.py +8 -1
- package/src/deepscientist/cli.py +92 -17
- package/src/deepscientist/codex_cli_compat.py +187 -74
- package/src/deepscientist/config/models.py +82 -11
- package/src/deepscientist/config/service.py +1077 -93
- package/src/deepscientist/connector/weixin_support.py +48 -17
- package/src/deepscientist/daemon/api/handlers.py +827 -235
- package/src/deepscientist/daemon/api/router.py +81 -1
- package/src/deepscientist/daemon/app.py +1512 -85
- package/src/deepscientist/diagnostics/__init__.py +6 -0
- package/src/deepscientist/diagnostics/runner_failures.py +277 -0
- package/src/deepscientist/doctor.py +407 -56
- package/src/deepscientist/evidence_packets.py +590 -0
- package/src/deepscientist/home.py +52 -4
- package/src/deepscientist/kimi_cli_compat.py +50 -0
- package/src/deepscientist/latex_runtime.py +2 -2
- package/src/deepscientist/mcp/context.py +2 -0
- package/src/deepscientist/mcp/schemas.py +114 -0
- package/src/deepscientist/mcp/server.py +1566 -126
- package/src/deepscientist/memory/service.py +203 -16
- package/src/deepscientist/process_control.py +8 -1
- package/src/deepscientist/prompts/builder.py +850 -88
- package/src/deepscientist/quest/__init__.py +2 -2
- package/src/deepscientist/quest/layout.py +12 -1
- package/src/deepscientist/quest/node_traces.py +10 -0
- package/src/deepscientist/quest/service.py +1852 -161
- package/src/deepscientist/quest/stage_views.py +1 -1
- package/src/deepscientist/runners/__init__.py +18 -0
- package/src/deepscientist/runners/base.py +89 -1
- package/src/deepscientist/runners/builtins.py +13 -1
- package/src/deepscientist/runners/claude.py +391 -0
- package/src/deepscientist/runners/codex.py +480 -35
- package/src/deepscientist/runners/codex_telemetry.py +127 -0
- package/src/deepscientist/runners/kimi.py +334 -0
- package/src/deepscientist/runners/metadata.py +68 -0
- package/src/deepscientist/runners/opencode.py +414 -0
- package/src/deepscientist/runners/runtime_overrides.py +100 -0
- package/src/deepscientist/runners/simple_cli.py +538 -0
- package/src/deepscientist/runtime_storage.py +303 -0
- package/src/deepscientist/shared.py +80 -16
- package/src/deepscientist/skills/installer.py +37 -0
- package/src/deepscientist/skills/registry.py +2 -0
- package/src/deepscientist/tinytex.py +2 -2
- package/src/deepscientist/tui.py +10 -3
- package/src/prompts/benchstore/system.md +77 -0
- package/src/prompts/connectors/qq.md +33 -2
- package/src/prompts/connectors/weixin.md +208 -23
- package/src/prompts/contracts/admin_ops.md +74 -0
- package/src/prompts/contracts/admin_ops_knowledge.md +138 -0
- package/src/prompts/contracts/shared_interaction.md +5 -10
- package/src/prompts/start_setup/system.md +422 -0
- package/src/prompts/system.md +411 -304
- package/src/prompts/system_copilot.md +89 -0
- package/src/skills/analysis-campaign/SKILL.md +239 -578
- package/src/skills/analysis-campaign/references/artifact-flow-examples.md +102 -0
- package/src/skills/analysis-campaign/references/boundary-cases.md +98 -0
- package/src/skills/analysis-campaign/references/campaign-checklist-template.md +39 -24
- package/src/skills/analysis-campaign/references/campaign-design.md +26 -10
- package/src/skills/analysis-campaign/references/campaign-plan-template.md +53 -54
- package/src/skills/analysis-campaign/references/operational-guidance.md +97 -0
- package/src/skills/analysis-campaign/references/writing-facing-slice-examples.md +10 -20
- package/src/skills/baseline/SKILL.md +183 -461
- package/src/skills/baseline/references/artifact-flow-examples.md +106 -0
- package/src/skills/baseline/references/artifact-payload-examples.md +1 -1
- package/src/skills/baseline/references/baseline-checklist-template.md +27 -35
- package/src/skills/baseline/references/baseline-plan-template.md +37 -76
- package/src/skills/baseline/references/boundary-cases.md +86 -0
- package/src/skills/baseline/references/codebase-audit-checklist.md +2 -6
- package/src/skills/baseline/references/comparability-contract.md +7 -12
- package/src/skills/baseline/references/operational-guidance.md +56 -0
- package/src/skills/baseline/references/route-selection.md +5 -25
- package/src/skills/decision/SKILL.md +113 -306
- package/src/skills/decision/references/checkpoint-memory-template.md +47 -0
- package/src/skills/decision/references/operational-guidance.md +94 -0
- package/src/skills/decision/references/research-route-criteria.md +7 -8
- package/src/skills/decision/references/strategic-decision-template.md +13 -26
- package/src/skills/experiment/SKILL.md +132 -670
- package/src/skills/experiment/references/execution-playbook.md +374 -0
- package/src/skills/experiment/references/main-experiment-checklist-template.md +26 -2
- package/src/skills/experiment/references/main-experiment-plan-template.md +28 -17
- package/src/skills/experiment/references/operational-guidance.md +108 -0
- package/src/skills/finalize/SKILL.md +62 -0
- package/src/skills/finalize/references/checkpoint-memory-template.md +49 -0
- package/src/skills/finalize/references/resume-packet-template.md +7 -0
- package/src/skills/idea/SKILL.md +228 -15
- package/src/skills/idea/references/controlled-brainstorming-playbook.md +78 -0
- package/src/skills/idea/references/current-board-packet-template.md +61 -0
- package/src/skills/idea/references/high-value-idea-sourcing.md +119 -0
- package/src/skills/idea/references/idea-generation-playbook.md +21 -0
- package/src/skills/idea/references/idea-thinking-flow.md +6 -0
- package/src/skills/idea/references/literature-survey-template.md +3 -0
- package/src/skills/idea/references/objective-contract-template.md +54 -0
- package/src/skills/idea/references/outline-seeding-example.md +56 -0
- package/src/skills/idea/references/pre-idea-draft-template.md +105 -0
- package/src/skills/idea/references/related-work-playbook.md +75 -2
- package/src/skills/idea/references/research-history-playbook.md +114 -0
- package/src/skills/idea/references/selection-gate.md +58 -6
- package/src/skills/intake-audit/SKILL.md +43 -2
- package/src/skills/intake-audit/references/state-audit-template.md +10 -0
- package/src/skills/nature-data/SKILL.md +128 -0
- package/src/skills/nature-data/UPSTREAM_LICENSE.txt +21 -0
- package/src/skills/nature-data/agents/openai.yaml +4 -0
- package/src/skills/nature-data/references/chinese-author-alignment.md +84 -0
- package/src/skills/nature-data/references/fair-metadata-checklist.md +105 -0
- package/src/skills/nature-data/references/policy-principles.md +103 -0
- package/src/skills/nature-data/references/repository-and-identifiers.md +96 -0
- package/src/skills/nature-data/references/source-basis.md +54 -0
- package/src/skills/nature-data/references/statement-patterns.md +153 -0
- package/src/skills/nature-figure/SKILL.md +197 -0
- package/src/skills/nature-figure/UPSTREAM_LICENSE.txt +21 -0
- package/src/skills/nature-figure/agents/openai.yaml +4 -0
- package/src/skills/nature-figure/evals/evals.json +37 -0
- package/src/skills/nature-figure/references/api.md +428 -0
- package/src/skills/nature-figure/references/backend-selection.md +100 -0
- package/src/skills/nature-figure/references/chart-types.md +281 -0
- package/src/skills/nature-figure/references/common-patterns.md +349 -0
- package/src/skills/nature-figure/references/design-theory.md +436 -0
- package/src/skills/nature-figure/references/figure-contract.md +93 -0
- package/src/skills/nature-figure/references/nature-2026-observations.md +112 -0
- package/src/skills/nature-figure/references/qa-contract.md +119 -0
- package/src/skills/nature-figure/references/r-template-index.md +66 -0
- package/src/skills/nature-figure/references/r-workflow.md +161 -0
- package/src/skills/nature-figure/references/tutorials.md +250 -0
- package/src/skills/nature-paper2ppt/SKILL.md +507 -0
- package/src/skills/nature-paper2ppt/UPSTREAM_LICENSE.txt +21 -0
- package/src/skills/nature-paper2ppt/agents/openai.yaml +4 -0
- package/src/skills/nature-polishing/SKILL.md +385 -0
- package/src/skills/nature-polishing/UPSTREAM_LICENSE.txt +21 -0
- package/src/skills/nature-polishing/agents/openai.yaml +4 -0
- package/src/skills/nature-polishing/references/phrasebank-playbook.md +162 -0
- package/src/skills/nature-polishing/references/section-moves.md +240 -0
- package/src/skills/nature-polishing/references/style-guardrails.md +94 -0
- package/src/skills/nature-polishing/references/writing-strategy.md +148 -0
- package/src/skills/optimize/SKILL.md +177 -1568
- package/src/skills/optimize/references/brief-shaping-playbook.md +95 -0
- package/src/skills/optimize/references/candidate-board-template.md +13 -0
- package/src/skills/optimize/references/candidate-ranking-template.md +51 -0
- package/src/skills/optimize/references/codegen-route-playbook.md +50 -0
- package/src/skills/optimize/references/debug-response-template.md +29 -0
- package/src/skills/optimize/references/frontier-review-template.md +32 -0
- package/src/skills/optimize/references/fusion-playbook.md +36 -0
- package/src/skills/optimize/references/method-brief-template.md +73 -0
- package/src/skills/optimize/references/operational-guidance.md +621 -0
- package/src/skills/optimize/references/optimization-memory-template.md +30 -0
- package/src/skills/optimize/references/optimize-checklist-template.md +18 -0
- package/src/skills/optimize/references/plateau-response-playbook.md +28 -0
- package/src/skills/optimize/references/prompt-patterns.md +49 -0
- package/src/skills/paper-outline/SKILL.md +227 -0
- package/src/skills/paper-outline/references/outline-patterns.md +87 -0
- package/src/skills/paper-plot/SKILL.md +79 -0
- package/src/skills/paper-plot/agents/openai.yaml +4 -0
- package/src/skills/paper-plot/references/bar_grouped_hatch.md +96 -0
- package/src/skills/paper-plot/references/bar_paired_delta.md +72 -0
- package/src/skills/paper-plot/references/line_confidence_band.md +75 -0
- package/src/skills/paper-plot/references/line_loss_with_inset.md +65 -0
- package/src/skills/paper-plot/references/line_training_curve.md +44 -0
- package/src/skills/paper-plot/references/radar_dual_series.md +59 -0
- package/src/skills/paper-plot/references/scatter_broken_axis.md +59 -0
- package/src/skills/paper-plot/references/scatter_tsne_cluster.md +72 -0
- package/src/skills/paper-plot/scripts/bar_memevolve.py +109 -0
- package/src/skills/paper-plot/scripts/bar_spice.py +166 -0
- package/src/skills/paper-plot/scripts/line_aime.py +94 -0
- package/src/skills/paper-plot/scripts/line_loss_inset.py +157 -0
- package/src/skills/paper-plot/scripts/line_selfdistill.py +168 -0
- package/src/skills/paper-plot/scripts/radar_dora.py +151 -0
- package/src/skills/paper-plot/scripts/scatter_break.py +169 -0
- package/src/skills/paper-plot/scripts/scatter_tsne.py +133 -0
- package/src/skills/rebuttal/SKILL.md +9 -0
- package/src/skills/references/tool-usage-by-stage.md +438 -0
- package/src/skills/review/SKILL.md +105 -7
- package/src/skills/science/PROVENANCE.md +44 -0
- package/src/skills/science/SKILL.md +137 -0
- package/src/skills/science/references/artifact-science-tool.md +110 -0
- package/src/skills/science/references/claim-type-discipline.md +56 -0
- package/src/skills/science/references/domain-index.md +422 -0
- package/src/skills/science/references/hpc-via-bash-exec.md +42 -0
- package/src/skills/science/references/package-check-playbook.md +64 -0
- package/src/skills/science/references/package-index.min.json +3616 -0
- package/src/skills/science/references/packages/abinit.md +80 -0
- package/src/skills/science/references/packages/acts.md +73 -0
- package/src/skills/science/references/packages/aiida-core.md +80 -0
- package/src/skills/science/references/packages/alamode.md +80 -0
- package/src/skills/science/references/packages/amuse.md +88 -0
- package/src/skills/science/references/packages/anndata.md +88 -0
- package/src/skills/science/references/packages/arbor.md +80 -0
- package/src/skills/science/references/packages/arc.md +73 -0
- package/src/skills/science/references/packages/astropy.md +88 -0
- package/src/skills/science/references/packages/astroquery.md +88 -0
- package/src/skills/science/references/packages/atomate2.md +80 -0
- package/src/skills/science/references/packages/atomsmltr.md +73 -0
- package/src/skills/science/references/packages/awkward.md +73 -0
- package/src/skills/science/references/packages/batman.md +88 -0
- package/src/skills/science/references/packages/biopython.md +88 -0
- package/src/skills/science/references/packages/bloqade.md +73 -0
- package/src/skills/science/references/packages/brian2.md +73 -0
- package/src/skills/science/references/packages/bullet3.md +73 -0
- package/src/skills/science/references/packages/calculix.md +80 -0
- package/src/skills/science/references/packages/cantera.md +73 -0
- package/src/skills/science/references/packages/cavity-md-ipi.md +80 -0
- package/src/skills/science/references/packages/ccdproc.md +88 -0
- package/src/skills/science/references/packages/celerite2.md +88 -0
- package/src/skills/science/references/packages/cellrank.md +73 -0
- package/src/skills/science/references/packages/cesm.md +80 -0
- package/src/skills/science/references/packages/chemicals.md +73 -0
- package/src/skills/science/references/packages/chempy.md +73 -0
- package/src/skills/science/references/packages/cirq.md +73 -0
- package/src/skills/science/references/packages/coffea.md +73 -0
- package/src/skills/science/references/packages/cp2k.md +88 -0
- package/src/skills/science/references/packages/custodian.md +80 -0
- package/src/skills/science/references/packages/dart.md +73 -0
- package/src/skills/science/references/packages/datamol.md +88 -0
- package/src/skills/science/references/packages/dd4hep.md +73 -0
- package/src/skills/science/references/packages/dealii.md +80 -0
- package/src/skills/science/references/packages/deepchem.md +88 -0
- package/src/skills/science/references/packages/delphes.md +73 -0
- package/src/skills/science/references/packages/devito.md +80 -0
- package/src/skills/science/references/packages/dftb.md +88 -0
- package/src/skills/science/references/packages/dftd4.md +88 -0
- package/src/skills/science/references/packages/dftk-jl.md +80 -0
- package/src/skills/science/references/packages/dolfinx.md +80 -0
- package/src/skills/science/references/packages/drake.md +73 -0
- package/src/skills/science/references/packages/dumux.md +73 -0
- package/src/skills/science/references/packages/elk.md +80 -0
- package/src/skills/science/references/packages/elmerfem.md +80 -0
- package/src/skills/science/references/packages/enzo-e.md +88 -0
- package/src/skills/science/references/packages/espresso.md +80 -0
- package/src/skills/science/references/packages/exoplanet.md +88 -0
- package/src/skills/science/references/packages/fairroot.md +73 -0
- package/src/skills/science/references/packages/fbpic.md +80 -0
- package/src/skills/science/references/packages/fdtdbath-meep.md +80 -0
- package/src/skills/science/references/packages/geant4.md +73 -0
- package/src/skills/science/references/packages/geosx.md +80 -0
- package/src/skills/science/references/packages/gprmax.md +80 -0
- package/src/skills/science/references/packages/gromacs.md +80 -0
- package/src/skills/science/references/packages/gwaslab.md +73 -0
- package/src/skills/science/references/packages/gz-sim.md +73 -0
- package/src/skills/science/references/packages/hail.md +88 -0
- package/src/skills/science/references/packages/hiphive.md +80 -0
- package/src/skills/science/references/packages/hoomd-blue.md +80 -0
- package/src/skills/science/references/packages/itensor.md +73 -0
- package/src/skills/science/references/packages/itensors-jl.md +73 -0
- package/src/skills/science/references/packages/jdftx.md +73 -0
- package/src/skills/science/references/packages/jobflow.md +80 -0
- package/src/skills/science/references/packages/kadanoffbaym-jl.md +73 -0
- package/src/skills/science/references/packages/kite.md +80 -0
- package/src/skills/science/references/packages/kratos.md +80 -0
- package/src/skills/science/references/packages/kwant.md +73 -0
- package/src/skills/science/references/packages/lammps.md +80 -0
- package/src/skills/science/references/packages/lightkurve.md +88 -0
- package/src/skills/science/references/packages/limix.md +73 -0
- package/src/skills/science/references/packages/maxwelllink.md +80 -0
- package/src/skills/science/references/packages/mcdc.md +73 -0
- package/src/skills/science/references/packages/meep.md +80 -0
- package/src/skills/science/references/packages/mfem.md +80 -0
- package/src/skills/science/references/packages/mitgcm.md +73 -0
- package/src/skills/science/references/packages/modflow6.md +73 -0
- package/src/skills/science/references/packages/molecool.md +73 -0
- package/src/skills/science/references/packages/mom6.md +73 -0
- package/src/skills/science/references/packages/moose.md +80 -0
- package/src/skills/science/references/packages/mpas-model.md +73 -0
- package/src/skills/science/references/packages/mujoco.md +73 -0
- package/src/skills/science/references/packages/mumax3.md +73 -0
- package/src/skills/science/references/packages/nekrs.md +80 -0
- package/src/skills/science/references/packages/nessi.md +73 -0
- package/src/skills/science/references/packages/nest-simulator.md +73 -0
- package/src/skills/science/references/packages/netket.md +73 -0
- package/src/skills/science/references/packages/neuron.md +73 -0
- package/src/skills/science/references/packages/nextflow.md +88 -0
- package/src/skills/science/references/packages/nwchem.md +88 -0
- package/src/skills/science/references/packages/openbabel.md +88 -0
- package/src/skills/science/references/packages/openems.md +80 -0
- package/src/skills/science/references/packages/openff-toolkit.md +88 -0
- package/src/skills/science/references/packages/openfoam-dev.md +80 -0
- package/src/skills/science/references/packages/openmc.md +73 -0
- package/src/skills/science/references/packages/openmm.md +80 -0
- package/src/skills/science/references/packages/openmoc.md +73 -0
- package/src/skills/science/references/packages/openmx.md +80 -0
- package/src/skills/science/references/packages/opensees.md +80 -0
- package/src/skills/science/references/packages/opensn.md +80 -0
- package/src/skills/science/references/packages/opm-simulators.md +73 -0
- package/src/skills/science/references/packages/oqupy.md +73 -0
- package/src/skills/science/references/packages/packmol.md +80 -0
- package/src/skills/science/references/packages/palabos.md +80 -0
- package/src/skills/science/references/packages/parflow.md +80 -0
- package/src/skills/science/references/packages/pennylane.md +88 -0
- package/src/skills/science/references/packages/perceval.md +73 -0
- package/src/skills/science/references/packages/phono3py.md +73 -0
- package/src/skills/science/references/packages/phonopy.md +73 -0
- package/src/skills/science/references/packages/photutils.md +88 -0
- package/src/skills/science/references/packages/picongpu.md +80 -0
- package/src/skills/science/references/packages/plink-ng.md +88 -0
- package/src/skills/science/references/packages/precice.md +73 -0
- package/src/skills/science/references/packages/psc.md +80 -0
- package/src/skills/science/references/packages/psi4.md +88 -0
- package/src/skills/science/references/packages/pybinding.md +73 -0
- package/src/skills/science/references/packages/pyfr.md +80 -0
- package/src/skills/science/references/packages/pyhf.md +73 -0
- package/src/skills/science/references/packages/pyiron_base.md +80 -0
- package/src/skills/science/references/packages/pylcp.md +73 -0
- package/src/skills/science/references/packages/pylith.md +80 -0
- package/src/skills/science/references/packages/pynbody.md +88 -0
- package/src/skills/science/references/packages/pysam.md +88 -0
- package/src/skills/science/references/packages/pyscf.md +88 -0
- package/src/skills/science/references/packages/q-e.md +73 -0
- package/src/skills/science/references/packages/qibo.md +73 -0
- package/src/skills/science/references/packages/qiskit.md +73 -0
- package/src/skills/science/references/packages/quantica-jl.md +73 -0
- package/src/skills/science/references/packages/quantumoptics-jl.md +73 -0
- package/src/skills/science/references/packages/quimb.md +73 -0
- package/src/skills/science/references/packages/qulacs.md +73 -0
- package/src/skills/science/references/packages/qutip.md +73 -0
- package/src/skills/science/references/packages/rdkit.md +88 -0
- package/src/skills/science/references/packages/rmg-py.md +73 -0
- package/src/skills/science/references/packages/root.md +73 -0
- package/src/skills/science/references/packages/scanpy.md +88 -0
- package/src/skills/science/references/packages/scikit-allel.md +88 -0
- package/src/skills/science/references/packages/scikit-bio.md +88 -0
- package/src/skills/science/references/packages/scqubits.md +73 -0
- package/src/skills/science/references/packages/scuff-em.md +80 -0
- package/src/skills/science/references/packages/scvi-tools.md +73 -0
- package/src/skills/science/references/packages/seissol.md +73 -0
- package/src/skills/science/references/packages/sfepy.md +80 -0
- package/src/skills/science/references/packages/sisl.md +73 -0
- package/src/skills/science/references/packages/smilei.md +80 -0
- package/src/skills/science/references/packages/snakemake.md +88 -0
- package/src/skills/science/references/packages/specfem3d-globe.md +80 -0
- package/src/skills/science/references/packages/specutils.md +88 -0
- package/src/skills/science/references/packages/spglib.md +80 -0
- package/src/skills/science/references/packages/squidpy.md +88 -0
- package/src/skills/science/references/packages/starry.md +88 -0
- package/src/skills/science/references/packages/strawberryfields.md +73 -0
- package/src/skills/science/references/packages/su2.md +80 -0
- package/src/skills/science/references/packages/sunny-jl.md +73 -0
- package/src/skills/science/references/packages/sw4.md +73 -0
- package/src/skills/science/references/packages/swift.md +88 -0
- package/src/skills/science/references/packages/tdnegf.md +73 -0
- package/src/skills/science/references/packages/tenpy.md +73 -0
- package/src/skills/science/references/packages/thermo.md +73 -0
- package/src/skills/science/references/packages/tkwant.md +73 -0
- package/src/skills/science/references/packages/tvb-root.md +73 -0
- package/src/skills/science/references/packages/uproot5.md +73 -0
- package/src/skills/science/references/packages/vampire.md +80 -0
- package/src/skills/science/references/packages/wannier_tools.md +73 -0
- package/src/skills/science/references/packages/warpx.md +80 -0
- package/src/skills/science/references/packages/wrf.md +73 -0
- package/src/skills/science/references/packages/xtb.md +88 -0
- package/src/skills/science/references/packages/yt.md +73 -0
- package/src/skills/science/references/science-task-brief-template.md +71 -0
- package/src/skills/scout/SKILL.md +83 -425
- package/src/skills/scout/references/literature-scout-template.md +5 -24
- package/src/skills/scout/references/operational-guidance.md +191 -0
- package/src/skills/scout/references/paper-triage-playbook.md +11 -35
- package/src/skills/write/SKILL.md +744 -1246
- package/src/skills/write/references/experiments_analysis_patterns.md +129 -0
- package/src/skills/write/references/oral_package_patterns.md +252 -0
- package/src/skills/write/references/oral_writing_principles.md +291 -0
- package/src/skills/write/references/section_rewrite_checklist.md +234 -0
- package/src/tui/dist/app/AppContainer.js +1314 -27
- package/src/tui/dist/components/Composer.js +26 -1
- package/src/tui/dist/components/ConfigScreen.js +2 -1
- package/src/tui/dist/components/InputPrompt.js +25 -9
- package/src/tui/dist/components/MainContent.js +18 -3
- package/src/tui/dist/components/QuestScreen.js +3 -2
- package/src/tui/dist/components/UtilityScreen.js +37 -0
- package/src/tui/dist/hooks/useSafeInput.js +10 -0
- package/src/tui/dist/index.js +13 -1
- package/src/tui/dist/layouts/DefaultAppLayout.js +11 -8
- package/src/tui/dist/lib/api.js +89 -1
- package/src/tui/package.json +1 -1
- package/src/ui/dist/assets/{AnalysisPlugin-DnSm0GZn.js → AnalysisPlugin-CA94NGmI.js} +1 -1
- package/src/ui/dist/assets/CliPlugin-DHBzphZU.js +79 -0
- package/src/ui/dist/assets/CodeEditorPlugin-BOFwD2rn.js +2 -0
- package/src/ui/dist/assets/{CodeViewerPlugin-itb0tltR.js → CodeViewerPlugin-CqDpgjik.js} +4 -4
- package/src/ui/dist/assets/{DocViewerPlugin-DqKkiCI6.js → DocViewerPlugin-UDBgt8-4.js} +3 -3
- package/src/ui/dist/assets/GitCommitViewerPlugin-BmHtZ0bZ.js +6 -0
- package/src/ui/dist/assets/{GitDiffViewerPlugin-DxL2ezFG.js → GitDiffViewerPlugin-CAxjNorQ.js} +2 -2
- package/src/ui/dist/assets/{GitSnapshotViewer-B_RQm1YZ.js → GitSnapshotViewer-CweA6VON.js} +2 -2
- package/src/ui/dist/assets/{ImageViewerPlugin-tHqlXY3n.js → ImageViewerPlugin-C8wHGvGN.js} +5 -5
- package/src/ui/dist/assets/LabPlugin-COyyLUol.js +32 -0
- package/src/ui/dist/assets/{LatexPlugin-B495DTXC.js → LatexPlugin-BQjAaA5J.js} +4 -4
- package/src/ui/dist/assets/{MarkdownViewerPlugin-DG28-61B.js → MarkdownViewerPlugin-Dy1NE2dI.js} +3 -3
- package/src/ui/dist/assets/{MarketplacePlugin-BiOGT-Kj.js → MarketplacePlugin-DMIZtEJ2.js} +2 -2
- package/src/ui/dist/assets/NotebookEditor-CFHMq_Qt.js +91 -0
- package/src/ui/dist/assets/{NotebookEditor-CVsj8h_T.js → NotebookEditor-WFyd8Ybt.js} +23 -23
- package/src/ui/dist/assets/{PdfLoader-CASDQmxJ.js → PdfLoader-CLE5u5TS.js} +3 -3
- package/src/ui/dist/assets/{PdfMarkdownPlugin-BFhwoKsY.js → PdfMarkdownPlugin-_iNK_H83.js} +1 -1
- package/src/ui/dist/assets/PdfViewerPlugin-DgWsbInT.js +22 -0
- package/src/ui/dist/assets/SearchPlugin-DrZmn5iw.js +11 -0
- package/src/ui/dist/assets/{TextViewerPlugin-CB4DYfWO.js → TextViewerPlugin-D1-T3aC7.js} +4 -4
- package/src/ui/dist/assets/branding/runner-claude.svg +107 -0
- package/src/ui/dist/assets/branding/runner-codex.svg +10 -0
- package/src/ui/dist/assets/branding/runner-kimi.svg +14 -0
- package/src/ui/dist/assets/branding/runner-opencode.svg +7 -0
- package/src/ui/dist/assets/cli-store-CoZ-x5Ip.js +1 -0
- package/src/ui/dist/assets/{code-DLC6G24T.js → code-DbsmSd3Y.js} +1 -1
- package/src/ui/dist/assets/file-diff-panel-DsvyRz47.js +1 -0
- package/src/ui/dist/assets/{wrap-text-CwMn-iqb.js → file-jump-queue-DeQBikaw.js} +3 -3
- package/src/ui/dist/assets/{file-socket-Cu4Qln7Y.js → file-socket-DA5XIx88.js} +1 -1
- package/src/ui/dist/assets/fonts/ds-fonts.css +50 -4
- package/src/ui/dist/assets/images/deepxiv/register-guide.png +0 -0
- package/src/ui/dist/assets/index-39vY9LmZ.js +1 -0
- package/src/ui/dist/assets/{index-wQ7RIIRd.js → index-BsO46tJA.js} +1 -1
- package/src/ui/dist/assets/index-CHzJ2xtB.js +3530 -0
- package/src/ui/dist/assets/index-DH-zxoZ3.css +33 -0
- package/src/ui/dist/assets/{plugin-notebook-HbW2K-1c.js → plugin-notebook-JRhysCqj.js} +2 -2
- package/src/ui/dist/assets/{project-sync-CsX08Qno.js → project-sync-DPmWKmKD.js} +1 -1
- package/src/ui/dist/assets/{zoom-out-R-GWEhzS.js → zoom-out-DAukFWen.js} +3 -3
- package/src/ui/dist/index.html +3 -3
- package/src/skills/analysis-campaign/references/artifact-orchestration.md +0 -58
- package/src/skills/baseline/references/memory-playbook.md +0 -40
- package/src/skills/baseline/references/publishable-baseline-package.md +0 -30
- package/src/skills/write/references/outline-evidence-contract-example.md +0 -107
- package/src/skills/write/references/paper-experiment-matrix-template.md +0 -131
- package/src/skills/write/references/paper-section-playbook.md +0 -64
- package/src/skills/write/references/reviewer-first-writing.md +0 -64
- package/src/skills/write/references/revision-checklist.md +0 -70
- package/src/skills/write/references/section-contracts.md +0 -82
- package/src/skills/write/references/sentence-level-proofing.md +0 -49
- package/src/ui/dist/assets/AiManusChatView-COFACy7V.js +0 -204
- package/src/ui/dist/assets/CliPlugin-CvwCmDQ5.js +0 -109
- package/src/ui/dist/assets/CodeEditorPlugin-cOqSa0xq.js +0 -2
- package/src/ui/dist/assets/GitCommitViewerPlugin-DVgNHBCS.js +0 -1
- package/src/ui/dist/assets/LabCopilotPanel-ClMbq5Yu.js +0 -14
- package/src/ui/dist/assets/LabPlugin-L_SuE8ow.js +0 -22
- package/src/ui/dist/assets/NotebookEditor-C-4Kt1p9.js +0 -81
- package/src/ui/dist/assets/PdfViewerPlugin-DcOzU9vd.js +0 -17
- package/src/ui/dist/assets/SearchPlugin-CHj7M58O.js +0 -16
- package/src/ui/dist/assets/VNCViewer-CjlbyCB3.js +0 -11
- package/src/ui/dist/assets/bot-CFkZY-JP.js +0 -6
- package/src/ui/dist/assets/chevron-up-Dq5ofbht.js +0 -6
- package/src/ui/dist/assets/file-content-Dv4LoZec.js +0 -1
- package/src/ui/dist/assets/file-diff-panel-Denq-lC3.js +0 -1
- package/src/ui/dist/assets/file-jump-queue-DA-SdG__.js +0 -1
- package/src/ui/dist/assets/git-commit-horizontal-BUh6G52n.js +0 -6
- package/src/ui/dist/assets/image-B9HUUddG.js +0 -6
- package/src/ui/dist/assets/index-B2B1sg-M.js +0 -1
- package/src/ui/dist/assets/index-Cgla8biy.css +0 -33
- package/src/ui/dist/assets/index-DRyx7vAc.js +0 -1
- package/src/ui/dist/assets/index-Gbl53BNp.js +0 -2496
- package/src/ui/dist/assets/pdf-effect-queue-ZtnHFCAi.js +0 -6
- package/src/ui/dist/assets/popover-DL6h35vr.js +0 -1
- package/src/ui/dist/assets/select-DvmXt1yY.js +0 -11
- package/src/ui/dist/assets/sigma-7jpXazui.js +0 -6
- package/src/ui/dist/assets/trash-xA7kFt8i.js +0 -11
- package/src/ui/dist/assets/useCliAccess-DsMwDjOp.js +0 -1
- package/src/ui/dist/assets/useFileDiffOverlay-FuhcnKiw.js +0 -1
|
@@ -0,0 +1,182 @@
|
|
|
1
|
+
schema_version: 1
|
|
2
|
+
id: aisb.t3.027_citeeval
|
|
3
|
+
name: 'CiteEval: Principle-Driven Citation Evaluation for Source Attribution'
|
|
4
|
+
version: 0.1.0
|
|
5
|
+
one_line: 'Evaluate and optimize citation quality metrics on the CiteBench benchmark
|
|
6
|
+
using CiteEval-Auto, a principle-driven framework that assesses citations against
|
|
7
|
+
full retrieval context, user queries, and generated text — measuring statement-level
|
|
8
|
+
Pearson correlation with human judgments.
|
|
9
|
+
|
|
10
|
+
'
|
|
11
|
+
task_description: 'This packaged benchmark covers principle-driven citation evaluation
|
|
12
|
+
for retrieval-augmented generation (RAG) systems. The core task is to run the CiteEval-Auto
|
|
13
|
+
metric suite — comprising context attribution (CA), citation editing (CE), and citation
|
|
14
|
+
rating (CR via IterCoE and EditDist) modules — against the CiteBench dataset, which
|
|
15
|
+
contains multi-domain queries (ASQA, ELI5, MS MARCO, LFRQA) with statement-level
|
|
16
|
+
human annotations of citation quality on a 1-5 Likert scale. The primary metric
|
|
17
|
+
is statement-level Pearson correlation between predicted and human citation ratings.
|
|
18
|
+
The evaluation goes beyond simple NLI-based supportiveness by considering full retrieval
|
|
19
|
+
sources, user context, response context, and parametric knowledge. The benchmark
|
|
20
|
+
supports two evaluation scenarios: "Full" (all citable statements, uncited ones
|
|
21
|
+
penalized) and "Cited" (only cited statements evaluated). Execution requires an
|
|
22
|
+
LLM API (OpenAI or DeepSeek) for the CiteEval-Auto modules, which use prompted LLM
|
|
23
|
+
calls for context attribution and citation rating. The CiteBench dataset must be
|
|
24
|
+
downloaded separately from Google Drive. Pre-computed metric outputs for baselines
|
|
25
|
+
(AutoAIS, LQAC, AttriScore) are bundled in the snapshot.
|
|
26
|
+
|
|
27
|
+
'
|
|
28
|
+
capability_tags:
|
|
29
|
+
- research_code_optimization
|
|
30
|
+
- citation_evaluation
|
|
31
|
+
- retrieval_augmented_generation
|
|
32
|
+
- source_attribution
|
|
33
|
+
- evaluation
|
|
34
|
+
- meta_evaluation
|
|
35
|
+
aisb_direction: T3
|
|
36
|
+
track_fit:
|
|
37
|
+
- paper_track
|
|
38
|
+
- benchmark_track
|
|
39
|
+
task_mode: evaluation_driven
|
|
40
|
+
requires_execution: true
|
|
41
|
+
requires_paper: true
|
|
42
|
+
integrity_level: cas_plus_canary
|
|
43
|
+
snapshot_status: runnable
|
|
44
|
+
support_level: advanced
|
|
45
|
+
cost_band: medium
|
|
46
|
+
time_band: 2-6h
|
|
47
|
+
difficulty: medium
|
|
48
|
+
data_access: public
|
|
49
|
+
primary_outputs:
|
|
50
|
+
- statement_level_pearson
|
|
51
|
+
- context_attribution_correlation
|
|
52
|
+
- citation_rating_correlation
|
|
53
|
+
- citation_eval_report
|
|
54
|
+
- principle_judgments
|
|
55
|
+
launch_profiles:
|
|
56
|
+
- id: quick_check
|
|
57
|
+
label: Quick Check
|
|
58
|
+
description: 'Run the packaged CiteEval-Auto evaluation on a small example batch
|
|
59
|
+
or the pre-computed metric outputs to verify the pipeline and reproduce baseline
|
|
60
|
+
correlation numbers.
|
|
61
|
+
|
|
62
|
+
'
|
|
63
|
+
- id: full_eval
|
|
64
|
+
label: Full Metric Evaluation
|
|
65
|
+
description: 'Run the full CiteEval-Auto pipeline (CA, CE, CR modules) over the
|
|
66
|
+
CiteBench metric test set from scratch using an LLM API, then compute human correlation
|
|
67
|
+
metrics against statement-level and response-level human annotations.
|
|
68
|
+
|
|
69
|
+
'
|
|
70
|
+
- id: system_eval
|
|
71
|
+
label: System Evaluation
|
|
72
|
+
description: 'Evaluate citation quality of a custom RAG system''s outputs using
|
|
73
|
+
CiteEval-Auto. Requires converting system output to .citeeval format, running
|
|
74
|
+
the metric suite, and printing results with run_system_eval.sh. Supports --cited
|
|
75
|
+
and full scenarios.
|
|
76
|
+
|
|
77
|
+
'
|
|
78
|
+
dataset_download:
|
|
79
|
+
primary_method: manual
|
|
80
|
+
sources:
|
|
81
|
+
- kind: google_drive
|
|
82
|
+
url: https://drive.google.com/drive/folders/12Evj0f92wKz_7OGuuwq3KShTdSM8eu4v?usp=drive_link
|
|
83
|
+
access: public
|
|
84
|
+
note: 'CiteBench dataset including metric_dev, metric_test, full dev, and full
|
|
85
|
+
test splits. Must be downloaded manually and placed under data/ in the project
|
|
86
|
+
root.
|
|
87
|
+
|
|
88
|
+
'
|
|
89
|
+
notes:
|
|
90
|
+
- Pre-computed metric outputs for baseline metrics (AutoAIS, LQAC, AttriScore) are
|
|
91
|
+
bundled in data/metric_eval_outputs/.
|
|
92
|
+
- Dataset size is moderate (thousands of queries with annotations); the exact download
|
|
93
|
+
size is not documented but is expected to be under 1 GB.
|
|
94
|
+
credential_requirements:
|
|
95
|
+
mode: api_key
|
|
96
|
+
items:
|
|
97
|
+
- OPENAI_API_KEY (for GPT-4o-based CiteEval-Auto runs, or substitute with DeepSeek
|
|
98
|
+
API)
|
|
99
|
+
notes:
|
|
100
|
+
- The default config in run_citeeval.sh uses model=deepseek-chat; run_system_eval.sh
|
|
101
|
+
references gpt-4o.
|
|
102
|
+
- API costs depend on the number of statements evaluated and the chosen model. The
|
|
103
|
+
metric test set has ~1000 responses.
|
|
104
|
+
- CPU-only execution of the evaluation scripts is possible but still requires an
|
|
105
|
+
LLM API for the CiteEval-Auto modules.
|
|
106
|
+
resources:
|
|
107
|
+
minimum:
|
|
108
|
+
cpu_cores: 8
|
|
109
|
+
ram_gb: 32
|
|
110
|
+
disk_gb: 50
|
|
111
|
+
gpu_count: 0
|
|
112
|
+
gpu_vram_gb: 0
|
|
113
|
+
recommended:
|
|
114
|
+
cpu_cores: 16
|
|
115
|
+
ram_gb: 64
|
|
116
|
+
disk_gb: 100
|
|
117
|
+
gpu_count: 1
|
|
118
|
+
gpu_vram_gb: 16
|
|
119
|
+
environment:
|
|
120
|
+
python: '3.10'
|
|
121
|
+
cuda: '11.7'
|
|
122
|
+
pytorch: 1.13.0
|
|
123
|
+
key_packages:
|
|
124
|
+
- openai==1.16.2
|
|
125
|
+
- transformers==4.38.2
|
|
126
|
+
notes:
|
|
127
|
+
- CPU-only execution is plausible for the minimum route; GPU is only needed if running
|
|
128
|
+
local NLI models for baselines.
|
|
129
|
+
- The README misspells requirements.txt as requirments.txt — use the actual file
|
|
130
|
+
in the snapshot.
|
|
131
|
+
- CITEEVAL_ROOT and PYTHONPATH environment variables must be set as described in
|
|
132
|
+
README.
|
|
133
|
+
risk_flags:
|
|
134
|
+
- api_dependency
|
|
135
|
+
- external_dataset_download
|
|
136
|
+
- api_cost_variable
|
|
137
|
+
risk_notes:
|
|
138
|
+
- CiteEval-Auto modules require live LLM API calls (OpenAI or DeepSeek). Without an
|
|
139
|
+
API key, only pre-computed metric outputs can be evaluated.
|
|
140
|
+
- CiteBench data must be manually downloaded from Google Drive; it is not bundled
|
|
141
|
+
in the snapshot.
|
|
142
|
+
- API costs scale with the number of evaluated statements and the LLM model used.
|
|
143
|
+
The metric test set has ~1000 responses with multiple statements each.
|
|
144
|
+
- The default model in run_citeeval.sh is deepseek-chat, not GPT-4o as used in the
|
|
145
|
+
paper's main results. Reproducing paper numbers requires GPT-4o.
|
|
146
|
+
- No benchmark execution was performed during the packaging pass; metric values should
|
|
147
|
+
be verified at runtime.
|
|
148
|
+
recommended_when: 'Use this benchmark when you want a citation-quality metric task
|
|
149
|
+
that evaluates full retrieval context, user queries, and generated text rather than
|
|
150
|
+
simple NLI-based supportiveness proxies. Ideal for research on improving RAG citation
|
|
151
|
+
quality, developing better automatic citation metrics, or benchmarking citation
|
|
152
|
+
quality of new RAG systems against human judgments.
|
|
153
|
+
|
|
154
|
+
'
|
|
155
|
+
not_recommended_when: 'Do not use this if you need a fully self-contained benchmark
|
|
156
|
+
with no external API calls, if you lack access to OpenAI or DeepSeek APIs, or if
|
|
157
|
+
your task does not involve retrieval passages and source attribution. Not suitable
|
|
158
|
+
if you need a benchmark that can run entirely offline without any LLM API dependency.
|
|
159
|
+
|
|
160
|
+
'
|
|
161
|
+
paper:
|
|
162
|
+
title: 'CiteEval: Principle-Driven Citation Evaluation for Source Attribution'
|
|
163
|
+
venue: arXiv preprint
|
|
164
|
+
year: 2025
|
|
165
|
+
url: https://arxiv.org/abs/2506.01829
|
|
166
|
+
download:
|
|
167
|
+
url: https://github.com/ResearAI/DeepScientist/releases/download/aisb-v0.0.1/aisb.t3.027_citeeval.zip
|
|
168
|
+
archive_type: zip
|
|
169
|
+
local_dir_name: paper-27-CiteEval
|
|
170
|
+
provider: github_release
|
|
171
|
+
repo: ResearAI/DeepScientist
|
|
172
|
+
tag: aisb-v0.0.1
|
|
173
|
+
asset_name: aisb.t3.027_citeeval.zip
|
|
174
|
+
sha256: 5eb48e11b91ec4856d18461899c236f658b110dc49ff29487205908230ac8d0b
|
|
175
|
+
size_bytes: 83173
|
|
176
|
+
commercial:
|
|
177
|
+
annual_fee: null
|
|
178
|
+
display:
|
|
179
|
+
palette_seed: cream-navy-citation
|
|
180
|
+
art_style: reference-audit
|
|
181
|
+
accent_priority: high
|
|
182
|
+
image_path: ../image/027_aisb.t3.027_citeeval.jpg
|
|
@@ -0,0 +1,135 @@
|
|
|
1
|
+
schema_version: 1
|
|
2
|
+
id: aisb.t3.027_citeeval
|
|
3
|
+
name: 'CiteEval: 面向来源归属的原则驱动引用评估'
|
|
4
|
+
version: 0.1.0
|
|
5
|
+
one_line: '在CiteBench基准上使用CiteEval-Auto(一个原则驱动的框架)评估和优化引用质量指标,该框架根据完整检索上下文、用户查询和生成文本对引用进行评估——以与人工判断的语句级皮尔逊相关系数作为衡量标准。
|
|
6
|
+
|
|
7
|
+
'
|
|
8
|
+
task_description: '该打包基准测试涵盖用于检索增强生成(RAG)系统的原则驱动引用评估。核心任务是在CiteBench数据集上运行CiteEval-Auto指标套件——包括上下文归因(CA)、引用编辑(CE)和引用评分(CR,通过IterCoE和EditDist)模块。该数据集包含多领域查询(ASQA、ELI5、MS MARCO、LFRQA),带有引用质量的语句级人工标注(1-5李克特量表)。主要指标是预测引用评分与人工引用评分之间的语句级皮尔逊相关系数。该评估超越了简单的基于NLI的支持度考量,考虑了完整检索来源、用户上下文、响应上下文和参数知识。基准测试支持两种评估场景:"完整"(所有可引用语句,未引用者受罚)和"已引用"(仅评估已引用的语句)。执行需要LLM API(OpenAI或DeepSeek)来运行CiteEval-Auto模块,这些模块使用提示的LLM调用进行上下文归因和引用评分。CiteBench数据集需从Google Drive单独下载。基线(AutoAIS、LQAC、AttriScore)的预计算指标输出已捆绑在快照中。
|
|
9
|
+
|
|
10
|
+
'
|
|
11
|
+
capability_tags:
|
|
12
|
+
- research_code_optimization
|
|
13
|
+
- citation_evaluation
|
|
14
|
+
- retrieval_augmented_generation
|
|
15
|
+
- source_attribution
|
|
16
|
+
- evaluation
|
|
17
|
+
- meta_evaluation
|
|
18
|
+
aisb_direction: T3
|
|
19
|
+
track_fit:
|
|
20
|
+
- paper_track
|
|
21
|
+
- benchmark_track
|
|
22
|
+
task_mode: evaluation_driven
|
|
23
|
+
requires_execution: true
|
|
24
|
+
requires_paper: true
|
|
25
|
+
integrity_level: cas_plus_canary
|
|
26
|
+
snapshot_status: runnable
|
|
27
|
+
support_level: advanced
|
|
28
|
+
cost_band: medium
|
|
29
|
+
time_band: 2-6h
|
|
30
|
+
difficulty: medium
|
|
31
|
+
data_access: public
|
|
32
|
+
primary_outputs:
|
|
33
|
+
- statement_level_pearson
|
|
34
|
+
- context_attribution_correlation
|
|
35
|
+
- citation_rating_correlation
|
|
36
|
+
- citation_eval_report
|
|
37
|
+
- principle_judgments
|
|
38
|
+
launch_profiles:
|
|
39
|
+
- id: quick_check
|
|
40
|
+
label: 快速检查
|
|
41
|
+
description: '在小样本批次或预计算指标输出上运行打包的CiteEval-Auto评估,以验证流程并复现基线相关系数。
|
|
42
|
+
|
|
43
|
+
'
|
|
44
|
+
- id: full_eval
|
|
45
|
+
label: 完整指标评估
|
|
46
|
+
description: '使用LLM API从头开始对CiteBench指标测试集运行完整的CiteEval-Auto流程(CA、CE、CR模块),然后根据语句级和响应级人工标注计算与人工的相关性指标。
|
|
47
|
+
|
|
48
|
+
'
|
|
49
|
+
- id: system_eval
|
|
50
|
+
label: 系统评估
|
|
51
|
+
description: '使用CiteEval-Auto评估自定义RAG系统输出的引用质量。需要将系统输出转换为.citeeval格式,运行指标套件,并使用run_system_eval.sh打印结果。支持--cited和完整场景。
|
|
52
|
+
|
|
53
|
+
'
|
|
54
|
+
dataset_download:
|
|
55
|
+
primary_method: manual
|
|
56
|
+
sources:
|
|
57
|
+
- kind: google_drive
|
|
58
|
+
url: https://drive.google.com/drive/folders/12Evj0f92wKz_7OGuuwq3KShTdSM8eu4v?usp=drive_link
|
|
59
|
+
access: public
|
|
60
|
+
note: 'CiteBench数据集,包含metric_dev、metric_test、full dev和full test划分。必须手动下载并放置在项目根目录下的data/中。
|
|
61
|
+
|
|
62
|
+
'
|
|
63
|
+
notes:
|
|
64
|
+
- 基线指标(AutoAIS、LQAC、AttriScore)的预计算指标输出已捆绑在data/metric_eval_outputs/中。
|
|
65
|
+
- 数据集大小适中(数千个带标注的查询);确切下载大小未记录,但预计在1GB以下。
|
|
66
|
+
credential_requirements:
|
|
67
|
+
mode: api_key
|
|
68
|
+
items:
|
|
69
|
+
- OPENAI_API_KEY(用于基于GPT-4o的CiteEval-Auto运行,或使用DeepSeek API替代)
|
|
70
|
+
notes:
|
|
71
|
+
- run_citeeval.sh中的默认配置使用model=deepseek-chat;run_system_eval.sh引用gpt-4o。
|
|
72
|
+
- API成本取决于评估的语句数量和所选模型。指标测试集约有1000个响应。
|
|
73
|
+
- 评估脚本可以在仅CPU模式下执行,但仍需要LLM API来运行CiteEval-Auto模块。
|
|
74
|
+
resources:
|
|
75
|
+
minimum:
|
|
76
|
+
cpu_cores: 8
|
|
77
|
+
ram_gb: 32
|
|
78
|
+
disk_gb: 50
|
|
79
|
+
gpu_count: 0
|
|
80
|
+
gpu_vram_gb: 0
|
|
81
|
+
recommended:
|
|
82
|
+
cpu_cores: 16
|
|
83
|
+
ram_gb: 64
|
|
84
|
+
disk_gb: 100
|
|
85
|
+
gpu_count: 1
|
|
86
|
+
gpu_vram_gb: 16
|
|
87
|
+
environment:
|
|
88
|
+
python: '3.10'
|
|
89
|
+
cuda: '11.7'
|
|
90
|
+
pytorch: 1.13.0
|
|
91
|
+
key_packages:
|
|
92
|
+
- openai==1.16.2
|
|
93
|
+
- transformers==4.38.2
|
|
94
|
+
notes:
|
|
95
|
+
- 最低规格路线可以仅用CPU执行;仅在运行本地NLI模型进行基线评估时才需要GPU。
|
|
96
|
+
- requirements.txt文件在README中有拼写错误(requirments.txt)——请使用快照中的实际文件。
|
|
97
|
+
- 必须按README中的说明设置CITEEVAL_ROOT和PYTHONPATH环境变量。
|
|
98
|
+
risk_flags:
|
|
99
|
+
- api_dependency
|
|
100
|
+
- external_dataset_download
|
|
101
|
+
- api_cost_variable
|
|
102
|
+
risk_notes:
|
|
103
|
+
- CiteEval-Auto模块需要实时LLM API调用(OpenAI或DeepSeek)。如果没有API密钥,只能评估预计算的指标输出。
|
|
104
|
+
- CiteBench数据必须从Google Drive手动下载;未捆绑在快照中。
|
|
105
|
+
- API成本随评估的语句数量和使用的LLM模型而增加。指标测试集约有1000个响应,每个响应包含多个语句。
|
|
106
|
+
- run_citeeval.sh中的默认模型是deepseek-chat,而非论文主要结果中使用的GPT-4o。复现论文数据需要GPT-4o。
|
|
107
|
+
- 打包过程中未执行基准测试;指标值应在运行时验证。
|
|
108
|
+
recommended_when: '当您需要一个评估完整检索上下文、用户查询和生成文本的引用质量指标任务,而非简单的基于NLI的支持度代理时,使用此基准测试。非常适合研究改进RAG引用质量、开发更好的自动引用指标,或根据人工判断基准测试新RAG系统的引用质量。
|
|
109
|
+
|
|
110
|
+
'
|
|
111
|
+
not_recommended_when: '如果您需要一个完全自包含、无外部API调用的基准测试,或者您无法访问OpenAI或DeepSeek API,或者您的任务不涉及检索段落和来源归属,请勿使用此基准测试。如果您需要一个可以完全离线运行、没有任何LLM API依赖的基准测试,则不适合。
|
|
112
|
+
|
|
113
|
+
'
|
|
114
|
+
paper:
|
|
115
|
+
title: 'CiteEval: Principle-Driven Citation Evaluation for Source Attribution'
|
|
116
|
+
venue: arXiv preprint
|
|
117
|
+
year: 2025
|
|
118
|
+
url: https://arxiv.org/abs/2506.01829
|
|
119
|
+
download:
|
|
120
|
+
url: https://github.com/ResearAI/DeepScientist/releases/download/aisb-v0.0.1/aisb.t3.027_citeeval.zip
|
|
121
|
+
archive_type: zip
|
|
122
|
+
local_dir_name: paper-27-CiteEval
|
|
123
|
+
provider: github_release
|
|
124
|
+
repo: ResearAI/DeepScientist
|
|
125
|
+
tag: aisb-v0.0.1
|
|
126
|
+
asset_name: aisb.t3.027_citeeval.zip
|
|
127
|
+
sha256: 5eb48e11b91ec4856d18461899c236f658b110dc49ff29487205908230ac8d0b
|
|
128
|
+
size_bytes: 83173
|
|
129
|
+
commercial:
|
|
130
|
+
annual_fee: null
|
|
131
|
+
display:
|
|
132
|
+
palette_seed: cream-navy-citation
|
|
133
|
+
art_style: reference-audit
|
|
134
|
+
accent_priority: high
|
|
135
|
+
image_path: ../image/027_aisb.t3.027_citeeval.jpg
|
|
@@ -0,0 +1,206 @@
|
|
|
1
|
+
schema_version: 1
|
|
2
|
+
id: aisb.t3.028_sbam
|
|
3
|
+
name: Segment-Based Attention Masking for GPTs
|
|
4
|
+
version: 0.1.0
|
|
5
|
+
one_line: Fine-tune and evaluate Masked Attention by Segment (MAS) on Llama-3.2-1B
|
|
6
|
+
across eight commonsense reasoning datasets, comparing segment-aware prefill masking
|
|
7
|
+
against standard causal masking.
|
|
8
|
+
task_description: 'This benchmark evaluates the Masked Attention by Segment (MAS)
|
|
9
|
+
technique, which modifies the causal attention mask of decoder-only GPTs so that
|
|
10
|
+
tokens within each prefill segment (system prompt, user prompt) can attend bidirectionally,
|
|
11
|
+
while the autoregressive generation phase retains standard causal masking. The packaged
|
|
12
|
+
task uses a LoRA-adapted Llama-3.2-1B model and evaluates on eight commonsense reasoning
|
|
13
|
+
datasets: BoolQ, PIQA, SocialIQA, HellaSwag, WinoGrande, ARC-Challenge, ARC-Easy,
|
|
14
|
+
and OpenBookQA. A pre-trained MAS LoRA checkpoint is bundled (trained_models_and_results/Llama-3.2-1B_epoch3_MAS),
|
|
15
|
+
so evaluation can proceed without re-training. The evaluation script (run_eval_all.sh)
|
|
16
|
+
parallelizes across two GPUs and reports per-dataset accuracy plus an average. The
|
|
17
|
+
base Llama-3.2-1B model must be downloaded from HuggingFace (gated access). The
|
|
18
|
+
fine-tuning dataset (170k commonsense reasoning samples in chat-template format)
|
|
19
|
+
is referenced via ft_utils.py but is not bundled; re-training requires sourcing
|
|
20
|
+
it externally. The core evaluation route is self-contained given the bundled LoRA
|
|
21
|
+
weights and base model access.
|
|
22
|
+
|
|
23
|
+
'
|
|
24
|
+
capability_tags:
|
|
25
|
+
- research_code_optimization
|
|
26
|
+
- large_language_models
|
|
27
|
+
- transformers
|
|
28
|
+
- prompt_engineering
|
|
29
|
+
- attention_analysis
|
|
30
|
+
- commonsense_reasoning
|
|
31
|
+
aisb_direction: T3
|
|
32
|
+
track_fit:
|
|
33
|
+
- paper_track
|
|
34
|
+
- benchmark_track
|
|
35
|
+
task_mode: evaluation_driven
|
|
36
|
+
requires_execution: true
|
|
37
|
+
requires_paper: true
|
|
38
|
+
integrity_level: cas_plus_canary
|
|
39
|
+
snapshot_status: runnable
|
|
40
|
+
support_level: turnkey
|
|
41
|
+
cost_band: medium
|
|
42
|
+
time_band: 6-24h
|
|
43
|
+
difficulty: medium
|
|
44
|
+
data_access: public
|
|
45
|
+
primary_outputs:
|
|
46
|
+
- average_accuracy
|
|
47
|
+
- arc_c_accuracy
|
|
48
|
+
- obqa_accuracy
|
|
49
|
+
- boolq_accuracy
|
|
50
|
+
- piqa_accuracy
|
|
51
|
+
- siqa_accuracy
|
|
52
|
+
- hellaswag_accuracy
|
|
53
|
+
- winogrande_accuracy
|
|
54
|
+
- arc_e_accuracy
|
|
55
|
+
- masked_attention_report
|
|
56
|
+
- prompt_variant_scores
|
|
57
|
+
launch_profiles:
|
|
58
|
+
- id: quick_check
|
|
59
|
+
label: Quick Check
|
|
60
|
+
description: 'Run mas_eval.py on a single dataset (e.g. ARC-Challenge) with the
|
|
61
|
+
bundled LoRA checkpoint to verify the evaluation pipeline works end-to-end on
|
|
62
|
+
one GPU.
|
|
63
|
+
|
|
64
|
+
'
|
|
65
|
+
- id: full_eval
|
|
66
|
+
label: Full Eval
|
|
67
|
+
description: 'Run run_eval_all.sh to evaluate the bundled MAS LoRA checkpoint on
|
|
68
|
+
all eight commonsense reasoning datasets in parallel across two GPUs, producing
|
|
69
|
+
per-dataset and average accuracy.
|
|
70
|
+
|
|
71
|
+
'
|
|
72
|
+
- id: retrain_and_eval
|
|
73
|
+
label: Retrain + Eval
|
|
74
|
+
description: 'Re-run LoRA fine-tuning with MAS masking on the 170k commonsense dataset
|
|
75
|
+
(must be sourced externally), then evaluate. This reproduces the full paper pipeline.
|
|
76
|
+
|
|
77
|
+
'
|
|
78
|
+
dataset_download:
|
|
79
|
+
primary_method: mixed
|
|
80
|
+
sources:
|
|
81
|
+
- kind: huggingface
|
|
82
|
+
url: https://huggingface.co/meta-llama/Llama-3.2-1B
|
|
83
|
+
access: gated
|
|
84
|
+
note: 'Base Llama-3.2-1B model weights. Requires HuggingFace account and Meta
|
|
85
|
+
license acceptance. ~2.5 GB download.
|
|
86
|
+
|
|
87
|
+
'
|
|
88
|
+
- kind: bundled
|
|
89
|
+
url: null
|
|
90
|
+
access: local
|
|
91
|
+
note: 'Pre-trained MAS LoRA weights are bundled in trained_models_and_results/Llama-3.2-1B_epoch3_MAS.
|
|
92
|
+
A standard causal LoRA checkpoint is in Llama-3.2-1B_epoch3.
|
|
93
|
+
|
|
94
|
+
'
|
|
95
|
+
- kind: external
|
|
96
|
+
url: null
|
|
97
|
+
access: public
|
|
98
|
+
note: 'The 170k commonsense reasoning training dataset (referenced by ft_utils.py)
|
|
99
|
+
is not bundled. It follows the setup from Liu et al. (2024) / Hu et al. (2023).
|
|
100
|
+
Only needed if re-training.
|
|
101
|
+
|
|
102
|
+
'
|
|
103
|
+
- kind: external
|
|
104
|
+
url: null
|
|
105
|
+
access: public
|
|
106
|
+
note: 'Test splits for BoolQ, PIQA, SocialIQA, HellaSwag, WinoGrande, ARC-Challenge,
|
|
107
|
+
ARC-Easy, OpenBookQA are loaded via ft_utils.py at eval time. These are public
|
|
108
|
+
datasets typically fetched from HuggingFace datasets hub.
|
|
109
|
+
|
|
110
|
+
'
|
|
111
|
+
notes:
|
|
112
|
+
- Total disk usage with base model and evaluation datasets is approximately 10-20
|
|
113
|
+
GB.
|
|
114
|
+
- If re-training, the 170k training samples add several GB.
|
|
115
|
+
credential_requirements:
|
|
116
|
+
mode: api_key
|
|
117
|
+
items:
|
|
118
|
+
- HuggingFace token with Meta Llama license acceptance (for base model download)
|
|
119
|
+
notes:
|
|
120
|
+
- No other API keys required for evaluation.
|
|
121
|
+
- If base model is pre-cached locally, no credentials are needed at runtime.
|
|
122
|
+
resources:
|
|
123
|
+
minimum:
|
|
124
|
+
cpu_cores: 8
|
|
125
|
+
ram_gb: 32
|
|
126
|
+
disk_gb: 50
|
|
127
|
+
gpu_count: 1
|
|
128
|
+
gpu_vram_gb: 12
|
|
129
|
+
recommended:
|
|
130
|
+
cpu_cores: 16
|
|
131
|
+
ram_gb: 64
|
|
132
|
+
disk_gb: 120
|
|
133
|
+
gpu_count: 2
|
|
134
|
+
gpu_vram_gb: 24
|
|
135
|
+
environment:
|
|
136
|
+
python: '3.10'
|
|
137
|
+
cuda: '11.8'
|
|
138
|
+
pytorch: 2.1.0
|
|
139
|
+
flash_attn: null
|
|
140
|
+
key_packages:
|
|
141
|
+
- transformers==4.47.0
|
|
142
|
+
- peft
|
|
143
|
+
- torch
|
|
144
|
+
- tqdm
|
|
145
|
+
notes:
|
|
146
|
+
- MAS implementation uses eager attention (attn_implementation="eager"), not flash
|
|
147
|
+
attention.
|
|
148
|
+
- See bundled requirements.txt for full dependency set.
|
|
149
|
+
- CUDA is required; the code checks torch.cuda.is_available() and defaults to "cuda".
|
|
150
|
+
risk_flags:
|
|
151
|
+
- gated_model_dependency
|
|
152
|
+
- training_data_not_bundled
|
|
153
|
+
- eager_attention_only
|
|
154
|
+
risk_notes:
|
|
155
|
+
- Base Llama-3.2-1B requires gated HuggingFace access; evaluation cannot proceed without
|
|
156
|
+
it.
|
|
157
|
+
- The 170k commonsense training dataset is not bundled; only the eval route with pre-trained
|
|
158
|
+
LoRA is self-contained.
|
|
159
|
+
- MAS currently only supports Llama models with eager attention mode; flash_attn is
|
|
160
|
+
not compatible.
|
|
161
|
+
- run_eval_all.sh hardcodes paths (/models/Llama-3.2-1B, /repo/); these must be adjusted
|
|
162
|
+
to local layout.
|
|
163
|
+
- No benchmark execution was performed during the packaging pass; metric values are
|
|
164
|
+
not pre-validated.
|
|
165
|
+
- Batch size is set to 1 in run_eval_all.sh; full eval on 8 datasets takes several
|
|
166
|
+
hours.
|
|
167
|
+
recommended_when: 'Use this benchmark when you want a manageable, single-GPU evaluation
|
|
168
|
+
of a novel attention masking strategy for decoder-only LLMs on commonsense reasoning.
|
|
169
|
+
Good for studying how segment-level bidirectional attention during prefill affects
|
|
170
|
+
downstream accuracy compared to standard causal masking, with lightweight LoRA fine-tuning.
|
|
171
|
+
|
|
172
|
+
'
|
|
173
|
+
not_recommended_when: 'Do not use this if you need a large-scale multi-billion parameter
|
|
174
|
+
LLM training benchmark, a non-transformer architecture task, or a benchmark with
|
|
175
|
+
fully bundled training data. Also not suitable if you cannot obtain gated access
|
|
176
|
+
to Meta Llama models.
|
|
177
|
+
|
|
178
|
+
'
|
|
179
|
+
paper:
|
|
180
|
+
title: Segment-Based Attention Masking for GPTs
|
|
181
|
+
authors:
|
|
182
|
+
- Shahar Katz
|
|
183
|
+
- Liran Ringel
|
|
184
|
+
- Yaniv Romano
|
|
185
|
+
- Lior Wolf
|
|
186
|
+
venue: arXiv preprint
|
|
187
|
+
year: 2024
|
|
188
|
+
url: https://arxiv.org/abs/2412.18487
|
|
189
|
+
doi: null
|
|
190
|
+
download:
|
|
191
|
+
url: https://github.com/ResearAI/DeepScientist/releases/download/aisb-v0.0.1/aisb.t3.028_sbam.zip
|
|
192
|
+
archive_type: zip
|
|
193
|
+
local_dir_name: paper-28-SBAM
|
|
194
|
+
provider: github_release
|
|
195
|
+
repo: ResearAI/DeepScientist
|
|
196
|
+
tag: aisb-v0.0.1
|
|
197
|
+
asset_name: aisb.t3.028_sbam.zip
|
|
198
|
+
sha256: e6a4ea39b6fa60b49a9ed55f8386355bae8ed0b92188064bc351a5f71c8f9066
|
|
199
|
+
size_bytes: 104097
|
|
200
|
+
commercial:
|
|
201
|
+
annual_fee: null
|
|
202
|
+
display:
|
|
203
|
+
palette_seed: violet-graphite-mask
|
|
204
|
+
art_style: attention-map
|
|
205
|
+
accent_priority: medium
|
|
206
|
+
image_path: ../image/028_aisb.t3.028_sbam.jpg
|
|
@@ -0,0 +1,166 @@
|
|
|
1
|
+
schema_version: 1
|
|
2
|
+
id: aisb.t3.028_sbam
|
|
3
|
+
name: 面向GPT的基于分段的注意力掩码
|
|
4
|
+
version: 0.1.0
|
|
5
|
+
one_line: 在Llama-3.2-1B上对分段掩码注意力(MAS)进行微调和评估,在八个常识推理数据集上比较分段感知的预填充掩码与标准因果掩码。
|
|
6
|
+
task_description: '该基准测试评估分段掩码注意力(MAS)技术,该技术修改了解码器专用GPT的因果注意力掩码,使每个预填充分段(系统提示、用户提示)内的token能够进行双向注意力计算,而自回归生成阶段保留标准因果掩码。打包的任务使用LoRA适配的Llama-3.2-1B模型,并在八个常识推理数据集上进行评估:BoolQ、PIQA、SocialIQA、HellaSwag、WinoGrande、ARC-Challenge、ARC-Easy和OpenBookQA。预训练的MAS LoRA检查点已打包(trained_models_and_results/Llama-3.2-1B_epoch3_MAS),因此评估可以在不重新训练的情况下进行。评估脚本(run_eval_all.sh)在两个GPU上并行处理,并报告每个数据集的准确率及平均值。基础Llama-3.2-1B模型必须从HuggingFace下载(需要gated访问权限)。微调数据集(170k条聊天模板格式的常识推理样本)通过ft_utils.py引用,但未打包;如需重新训练,需从外部获取。核心评估流程在有打包的LoRA权重和基础模型访问权限的情况下是自包含的。
|
|
7
|
+
|
|
8
|
+
'
|
|
9
|
+
capability_tags:
|
|
10
|
+
- research_code_optimization
|
|
11
|
+
- large_language_models
|
|
12
|
+
- transformers
|
|
13
|
+
- prompt_engineering
|
|
14
|
+
- attention_analysis
|
|
15
|
+
- commonsense_reasoning
|
|
16
|
+
aisb_direction: T3
|
|
17
|
+
track_fit:
|
|
18
|
+
- paper_track
|
|
19
|
+
- benchmark_track
|
|
20
|
+
task_mode: evaluation_driven
|
|
21
|
+
requires_execution: true
|
|
22
|
+
requires_paper: true
|
|
23
|
+
integrity_level: cas_plus_canary
|
|
24
|
+
snapshot_status: runnable
|
|
25
|
+
support_level: turnkey
|
|
26
|
+
cost_band: medium
|
|
27
|
+
time_band: 6-24h
|
|
28
|
+
difficulty: medium
|
|
29
|
+
data_access: public
|
|
30
|
+
primary_outputs:
|
|
31
|
+
- average_accuracy
|
|
32
|
+
- arc_c_accuracy
|
|
33
|
+
- obqa_accuracy
|
|
34
|
+
- boolq_accuracy
|
|
35
|
+
- piqa_accuracy
|
|
36
|
+
- siqa_accuracy
|
|
37
|
+
- hellaswag_accuracy
|
|
38
|
+
- winogrande_accuracy
|
|
39
|
+
- arc_e_accuracy
|
|
40
|
+
- masked_attention_report
|
|
41
|
+
- prompt_variant_scores
|
|
42
|
+
launch_profiles:
|
|
43
|
+
- id: quick_check
|
|
44
|
+
label: 快速检查
|
|
45
|
+
description: '使用打包的LoRA检查点在单个数据集(例如ARC-Challenge)上运行mas_eval.py,以验证评估流程在单个GPU上端到端工作。
|
|
46
|
+
|
|
47
|
+
'
|
|
48
|
+
- id: full_eval
|
|
49
|
+
label: 完整评估
|
|
50
|
+
description: '运行run_eval_all.sh在两个GPU上并行评估打包的MAS LoRA检查点在所有八个常识推理数据集上的表现,生成每个数据集的准确率和平均准确率。
|
|
51
|
+
|
|
52
|
+
'
|
|
53
|
+
- id: retrain_and_eval
|
|
54
|
+
label: 重新训练 + 评估
|
|
55
|
+
description: '使用MAS掩码在170k常识数据集上重新运行LoRA微调(必须从外部获取),然后进行评估。这将重现完整的论文流程。
|
|
56
|
+
|
|
57
|
+
'
|
|
58
|
+
dataset_download:
|
|
59
|
+
primary_method: mixed
|
|
60
|
+
sources:
|
|
61
|
+
- kind: huggingface
|
|
62
|
+
url: https://huggingface.co/meta-llama/Llama-3.2-1B
|
|
63
|
+
access: gated
|
|
64
|
+
note: '基础Llama-3.2-1B模型权重。需要HuggingFace账户并接受Meta许可协议。下载量约2.5GB。
|
|
65
|
+
|
|
66
|
+
'
|
|
67
|
+
- kind: bundled
|
|
68
|
+
url: null
|
|
69
|
+
access: local
|
|
70
|
+
note: '预训练的MAS LoRA权重打包在trained_models_and_results/Llama-3.2-1B_epoch3_MAS中。标准因果LoRA检查点在Llama-3.2-1B_epoch3中。
|
|
71
|
+
|
|
72
|
+
'
|
|
73
|
+
- kind: external
|
|
74
|
+
url: null
|
|
75
|
+
access: public
|
|
76
|
+
note: '170k常识推理训练数据集(由ft_utils.py引用)未打包。它遵循Liu等人(2024)/Hu等人(2023)的设置。仅在重新训练时需要。
|
|
77
|
+
|
|
78
|
+
'
|
|
79
|
+
- kind: external
|
|
80
|
+
url: null
|
|
81
|
+
access: public
|
|
82
|
+
note: 'BoolQ、PIQA、SocialIQA、HellaSwag、WinoGrande、ARC-Challenge、ARC-Easy、OpenBookQA的测试集通过ft_utils.py在评估时加载。这些是通常从HuggingFace数据集中心获取的公共数据集。
|
|
83
|
+
|
|
84
|
+
'
|
|
85
|
+
notes:
|
|
86
|
+
- 包含基础模型和评估数据集的总磁盘使用量约为10-20GB。
|
|
87
|
+
- 如果重新训练,170k训练样本会增加数GB。
|
|
88
|
+
credential_requirements:
|
|
89
|
+
mode: api_key
|
|
90
|
+
items:
|
|
91
|
+
- 已接受Meta Llama许可协议的HuggingFace令牌(用于下载基础模型)
|
|
92
|
+
notes:
|
|
93
|
+
- 评估不需要其他API密钥。
|
|
94
|
+
- 如果基础模型已在本地缓存,运行时不需要凭据。
|
|
95
|
+
resources:
|
|
96
|
+
minimum:
|
|
97
|
+
cpu_cores: 8
|
|
98
|
+
ram_gb: 32
|
|
99
|
+
disk_gb: 50
|
|
100
|
+
gpu_count: 1
|
|
101
|
+
gpu_vram_gb: 12
|
|
102
|
+
recommended:
|
|
103
|
+
cpu_cores: 16
|
|
104
|
+
ram_gb: 64
|
|
105
|
+
disk_gb: 120
|
|
106
|
+
gpu_count: 2
|
|
107
|
+
gpu_vram_gb: 24
|
|
108
|
+
environment:
|
|
109
|
+
python: '3.10'
|
|
110
|
+
cuda: '11.8'
|
|
111
|
+
pytorch: 2.1.0
|
|
112
|
+
flash_attn: null
|
|
113
|
+
key_packages:
|
|
114
|
+
- transformers==4.47.0
|
|
115
|
+
- peft
|
|
116
|
+
- torch
|
|
117
|
+
- tqdm
|
|
118
|
+
notes:
|
|
119
|
+
- MAS实现使用eager注意力(attn_implementation="eager"),而非flash注意力。
|
|
120
|
+
- 有关完整的依赖项列表,请参阅打包的requirements.txt。
|
|
121
|
+
- 需要CUDA;代码检查torch.cuda.is_available()并默认为"cuda"。
|
|
122
|
+
risk_flags:
|
|
123
|
+
- gated_model_dependency
|
|
124
|
+
- training_data_not_bundled
|
|
125
|
+
- eager_attention_only
|
|
126
|
+
risk_notes:
|
|
127
|
+
- 基础Llama-3.2-1B需要gated HuggingFace访问;没有它评估无法进行。
|
|
128
|
+
- 170k常识训练数据集未打包;只有带有预训练LoRA的评估流程是自包含的。
|
|
129
|
+
- MAS目前仅支持使用eager注意力模式的Llama模型;flash_attn不兼容。
|
|
130
|
+
- run_eval_all.sh硬编码路径(/models/Llama-3.2-1B、/repo/);必须调整为本地布局。
|
|
131
|
+
- 打包过程中未执行基准测试;指标值未预先验证。
|
|
132
|
+
- run_eval_all.sh中批大小设置为1;在8个数据集上完整评估需要数小时。
|
|
133
|
+
recommended_when: '当您想要对解码器专用LLM的新型注意力掩码策略在常识推理上进行可管理的单GPU评估时使用此基准测试。适合研究预填充期间分段级双向注意力如何影响下游准确率(与标准因果掩码相比),并使用轻量级LoRA微调。
|
|
134
|
+
|
|
135
|
+
'
|
|
136
|
+
not_recommended_when: '如果需要数十亿参数LLM的大规模训练基准测试、非Transformer架构任务或完全打包训练数据的基准测试,请勿使用此基准测试。如果无法获得Meta Llama模型的gated访问权限,也不适用。
|
|
137
|
+
|
|
138
|
+
'
|
|
139
|
+
paper:
|
|
140
|
+
title: Segment-Based Attention Masking for GPTs
|
|
141
|
+
authors:
|
|
142
|
+
- Shahar Katz
|
|
143
|
+
- Liran Ringel
|
|
144
|
+
- Yaniv Romano
|
|
145
|
+
- Lior Wolf
|
|
146
|
+
venue: arXiv preprint
|
|
147
|
+
year: 2024
|
|
148
|
+
url: https://arxiv.org/abs/2412.18487
|
|
149
|
+
doi: null
|
|
150
|
+
download:
|
|
151
|
+
url: https://github.com/ResearAI/DeepScientist/releases/download/aisb-v0.0.1/aisb.t3.028_sbam.zip
|
|
152
|
+
archive_type: zip
|
|
153
|
+
local_dir_name: paper-28-SBAM
|
|
154
|
+
provider: github_release
|
|
155
|
+
repo: ResearAI/DeepScientist
|
|
156
|
+
tag: aisb-v0.0.1
|
|
157
|
+
asset_name: aisb.t3.028_sbam.zip
|
|
158
|
+
sha256: e6a4ea39b6fa60b49a9ed55f8386355bae8ed0b92188064bc351a5f71c8f9066
|
|
159
|
+
size_bytes: 104097
|
|
160
|
+
commercial:
|
|
161
|
+
annual_fee: null
|
|
162
|
+
display:
|
|
163
|
+
palette_seed: violet-graphite-mask
|
|
164
|
+
art_style: attention-map
|
|
165
|
+
accent_priority: medium
|
|
166
|
+
image_path: ../image/028_aisb.t3.028_sbam.jpg
|