academic-army 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.editorconfig +9 -0
- package/.github/workflows/publish.yml +44 -0
- package/.prettierrc.json +3 -0
- package/LICENSE +21 -0
- package/README.md +172 -0
- package/README.zh-CN.md +172 -0
- package/agent-forge.yaml +83 -0
- package/eslint.config.js +28 -0
- package/install_mcp.py +85 -0
- package/mcp-server/__main__.py +33 -0
- package/mcp-server/deepresearch/__init__.py +3 -0
- package/mcp-server/deepresearch/tools.py +33 -0
- package/mcp-server/requirements.txt +4 -0
- package/metaskills/README.md +131 -0
- package/metaskills/README.zh-CN.md +131 -0
- package/metaskills/academic-army-architect/METASKILL.md +91 -0
- package/metaskills/academic-army-architect/envolve.sh +9 -0
- package/metaskills/academic-army-coding-plan/ENVOLVETASK.md +1 -0
- package/metaskills/academic-army-coding-plan/METASKILL.md +118 -0
- package/metaskills/academic-army-coding-plan/envolve.sh +9 -0
- package/metaskills/academic-army-coding-style/METASKILL.md +292 -0
- package/metaskills/academic-army-experiment-plan/ENVOLVETASK.md +1 -0
- package/metaskills/academic-army-experiment-plan/METASKILL.md +82 -0
- package/metaskills/academic-army-experiment-plan/envolve.sh +9 -0
- package/metaskills/academic-army-repo-scaffold/ENVOLVETASK.md +1 -0
- package/metaskills/academic-army-repo-scaffold/METASKILL.md +223 -0
- package/metaskills/academic-army-repo-scaffold/envolve.sh +9 -0
- package/package.json +35 -0
- package/runs/develop-skill.sh +17 -0
- package/runs/develop.sh +16 -0
- package/skills/academic-army-architect/SKILL.md +336 -0
- package/skills/academic-army-architect/agents/openai.yaml +11 -0
- package/skills/academic-army-architect/references/blueprint-schema.md +345 -0
- package/skills/academic-army-coding-plan/SKILL.md +491 -0
- package/skills/academic-army-coding-plan/agents/openai.yaml +11 -0
- package/skills/academic-army-coding-style/SKILL.md +915 -0
- package/skills/academic-army-coding-style/agents/openai.yaml +11 -0
- package/skills/academic-army-experiment-plan/SKILL.md +517 -0
- package/skills/academic-army-experiment-plan/agents/openai.yaml +11 -0
- package/skills/academic-army-repo-scaffold/SKILL.md +756 -0
- package/skills/academic-army-repo-scaffold/agents/openai.yaml +10 -0
- package/src/README.md +79 -0
- package/src/README.zh-CN.md +79 -0
- package/src/cli.ts +55 -0
- package/src/developing/README.md +146 -0
- package/src/developing/README.zh-CN.md +146 -0
- package/src/developing/agents/developer.ts +40 -0
- package/src/developing/agents/factory.ts +11 -0
- package/src/developing/agents/index.ts +8 -0
- package/src/developing/agents/manager.ts +74 -0
- package/src/developing/agents/prompts.ts +12 -0
- package/src/developing/agents/reviewer.ts +44 -0
- package/src/developing/agents/trajectory-optimizer.ts +70 -0
- package/src/developing/agents/types.ts +41 -0
- package/src/developing/index.ts +2 -0
- package/src/developing/pipeline.ts +306 -0
- package/src/developing/pipelineskill.ts +169 -0
- package/src/evolve-skill/README.md +116 -0
- package/src/evolve-skill/README.zh-CN.md +116 -0
- package/src/evolve-skill/agents/evaluator.ts +28 -0
- package/src/evolve-skill/agents/factory.ts +11 -0
- package/src/evolve-skill/agents/index.ts +4 -0
- package/src/evolve-skill/agents/modifier.ts +27 -0
- package/src/evolve-skill/agents/runner.ts +19 -0
- package/src/evolve-skill/index.ts +1 -0
- package/src/evolve-skill/pipeline.ts +140 -0
- package/src/pipeline.ts +65 -0
- package/tsconfig.json +22 -0
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
interface:
|
|
2
|
+
display_name: "Academic Army Coding Style"
|
|
3
|
+
short_description: "Keep research-code changes clean and local"
|
|
4
|
+
default_prompt: "Use $academic-army-coding-style while making this code or framework change so the implementation stays readable, local, low-coupling, and consistent with the existing repository."
|
|
5
|
+
|
|
6
|
+
dependencies:
|
|
7
|
+
tools:
|
|
8
|
+
- type: "mcp"
|
|
9
|
+
value: "academic_army_mcp_tools"
|
|
10
|
+
description: "Provides academic_army_mcp_tools.deepresearch when unfamiliar ecosystem conventions, harness/test practices, framework organization, or open-source reuse decisions need current evidence."
|
|
11
|
+
transport: "stdio"
|
|
@@ -0,0 +1,517 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: academic-army-experiment-plan
|
|
3
|
+
description: >-
|
|
4
|
+
Create a strategic, evidence-driven academic experiment plan from a research
|
|
5
|
+
idea, paper_blueprint.md, paper claims, storytelling blueprint, target venue,
|
|
6
|
+
existing results, prior plans, or revision feedback. Produces exactly two
|
|
7
|
+
Markdown files: an English AI-facing experiment_plan.md and a Chinese
|
|
8
|
+
human-facing experiment_plan.explain.md. Uses
|
|
9
|
+
academic_army_mcp_tools.deepresearch for live target-venue, influential-paper,
|
|
10
|
+
baseline, dataset/workload, metric, artifact, autoresearch, and reviewer
|
|
11
|
+
expectation research before making claim-to-evidence planning choices.
|
|
12
|
+
---
|
|
13
|
+
|
|
14
|
+
# Academic Army Experiment Plan
|
|
15
|
+
|
|
16
|
+
## Purpose
|
|
17
|
+
|
|
18
|
+
Create a strategic experiment plan for an academic paper. The plan turns paper
|
|
19
|
+
claims into an evidence strategy that later AI skills can inherit when they
|
|
20
|
+
write code, run experiments, plan figures, and draft paper sections.
|
|
21
|
+
|
|
22
|
+
Own the experiment-strategy layer:
|
|
23
|
+
|
|
24
|
+
- paper thesis and claim-to-evidence mapping
|
|
25
|
+
- experiment objectives and their paper-story roles
|
|
26
|
+
- workload/dataset, metric, and baseline strategy
|
|
27
|
+
- reader interpretation and reviewer-concern coverage
|
|
28
|
+
- ablation, robustness, boundary, and artifact-readiness objectives
|
|
29
|
+
- Chinese rationale that lets the user judge whether the strategy is reasonable
|
|
30
|
+
|
|
31
|
+
Do not execute experiments, write code, prescribe shell commands, create exact
|
|
32
|
+
run matrices, fabricate results, or produce final figures.
|
|
33
|
+
|
|
34
|
+
## Output Contract
|
|
35
|
+
|
|
36
|
+
Create exactly two Markdown files in the requested output directory:
|
|
37
|
+
|
|
38
|
+
1. `experiment_plan.md`
|
|
39
|
+
- English.
|
|
40
|
+
- AI-facing.
|
|
41
|
+
- Contains only the strategic experiment plan.
|
|
42
|
+
- Uses stable experiment names, registries, and fields that downstream skills
|
|
43
|
+
can inherit.
|
|
44
|
+
|
|
45
|
+
2. `experiment_plan.explain.md`
|
|
46
|
+
- Chinese.
|
|
47
|
+
- Human-facing.
|
|
48
|
+
- Starts with the concrete user-provided and locally available planning
|
|
49
|
+
inputs used in this invocation.
|
|
50
|
+
- Explains why the experiment portfolio is reasonable for the paper.
|
|
51
|
+
- Explains choices from the paper thesis, blueprint, target venue, live
|
|
52
|
+
research, existing evidence, and storytelling needs.
|
|
53
|
+
|
|
54
|
+
Do not create extra output files. Do not put provenance, source summaries,
|
|
55
|
+
user-facing review notes, or skill-internal process comments inside
|
|
56
|
+
`experiment_plan.md`.
|
|
57
|
+
|
|
58
|
+
## Inputs To Use
|
|
59
|
+
|
|
60
|
+
Use supplied or available project context before planning. Prefer
|
|
61
|
+
`paper_blueprint.md` when present. Also use prior experiment plans,
|
|
62
|
+
preliminary results, revision feedback, target-venue notes, metaskill design
|
|
63
|
+
goals, or artifact summaries when they are part of the current task.
|
|
64
|
+
|
|
65
|
+
Extract or infer:
|
|
66
|
+
|
|
67
|
+
- paper goal, title, field, subfield, target venue, year, and track
|
|
68
|
+
- central research bet and novelty boundary
|
|
69
|
+
- main claims and expected reviewer concerns
|
|
70
|
+
- storytelling posture: motivation, method insight, main evidence, claim
|
|
71
|
+
boundary, and reader journey
|
|
72
|
+
- required or preferred datasets, workloads, baselines, metrics, hardware,
|
|
73
|
+
traces, artifacts, or deployment setting
|
|
74
|
+
- known constraints: compute, privacy, data access, unavailable baselines,
|
|
75
|
+
human-subject constraints, deadline, target track, or reproducibility needs
|
|
76
|
+
- existing evidence: preliminary numbers, pilot studies, prior figures, logs,
|
|
77
|
+
notes, old experiment plans, reviews, rebuttal feedback, or artifact feedback
|
|
78
|
+
|
|
79
|
+
Use live research and paper goals to infer nonblocking missing details. Ask for
|
|
80
|
+
additional user input only when a missing fact would materially change claim
|
|
81
|
+
coverage, workload scale, baseline fairness, ethics, or story placement and no
|
|
82
|
+
defensible default can be inferred.
|
|
83
|
+
|
|
84
|
+
## Required Deepresearch
|
|
85
|
+
|
|
86
|
+
Use `academic_army_mcp_tools.deepresearch` for every nontrivial plan or
|
|
87
|
+
substantive revision after project-specific context is understood.
|
|
88
|
+
|
|
89
|
+
- Server: `academic_army_mcp_tools`
|
|
90
|
+
- Tool: `deepresearch`
|
|
91
|
+
- Canonical MCP name when exposed:
|
|
92
|
+
`mcp__academic_army_mcp_tools__deepresearch`
|
|
93
|
+
|
|
94
|
+
Ask for concise planning lessons, not a literature review. The prompt should
|
|
95
|
+
cover multiple perspectives:
|
|
96
|
+
|
|
97
|
+
- current or recent target-venue experiment expectations
|
|
98
|
+
- high-impact or high-citation papers from the target venue and adjacent top
|
|
99
|
+
venues such as SIGGRAPH, CVPR, SIGCOMM, NSDI, INFOCOM, MMSys, CHI, NeurIPS,
|
|
100
|
+
ICML, ICLR, ACL, or domain-specific venues when relevant
|
|
101
|
+
- why those papers' experiments are persuasive: methods, datasets, baselines,
|
|
102
|
+
metrics, result presentation, artifacts, and claim boundaries
|
|
103
|
+
- recent methods, datasets, workloads, baselines, metrics, benchmarks,
|
|
104
|
+
artifacts, and result-presentation patterns in the paper's subfield
|
|
105
|
+
- motivation or design-insight experiment patterns that make the core intuition
|
|
106
|
+
visible before full-system evaluation
|
|
107
|
+
- autoresearch, scientific-discovery, paper-writing-agent, benchmark, prompt
|
|
108
|
+
template, and experiment-automation workflow lessons when they improve
|
|
109
|
+
planning or downstream handoff
|
|
110
|
+
|
|
111
|
+
The skill defines what to research, not the answer. Do not hardcode mutable
|
|
112
|
+
venue norms, current SOTA baselines, or dataset preferences into the skill body;
|
|
113
|
+
derive them at invocation time through deepresearch.
|
|
114
|
+
|
|
115
|
+
For each live-research anchor that changes the plan, record this in
|
|
116
|
+
`experiment_plan.explain.md`:
|
|
117
|
+
|
|
118
|
+
- `source`: title and link
|
|
119
|
+
- `date`: visible publication, submission, event, metadata, or page date
|
|
120
|
+
- `venue_status`: one of `official_proceedings`, `arxiv_only`,
|
|
121
|
+
`project_page_claim`, `secondary_metadata`, or `classic_background`
|
|
122
|
+
- `影响到的规划决定` (planning decision affected): the baseline, metric, workload, experiment placement,
|
|
123
|
+
evidence style, artifact expectation, or claim boundary it changed
|
|
124
|
+
|
|
125
|
+
Use `official_proceedings` only when the venue is confirmed by conference,
|
|
126
|
+
proceedings, publisher, DOI metadata, or official venue pages. Use
|
|
127
|
+
`arxiv_only` for arXiv records. Use `project_page_claim` for author, lab, or
|
|
128
|
+
project claims not confirmed elsewhere. Use `secondary_metadata` for
|
|
129
|
+
aggregators or institutional pages. Use `classic_background` for older
|
|
130
|
+
foundational precedents.
|
|
131
|
+
|
|
132
|
+
## Decision Sufficiency Policy
|
|
133
|
+
|
|
134
|
+
Make goal-oriented choices. Do not transfer obvious decisions to the user.
|
|
135
|
+
|
|
136
|
+
When user input, the blueprint, existing evidence, and deepresearch make one
|
|
137
|
+
choice clearly better for the paper, write that choice into `experiment_plan.md`
|
|
138
|
+
and explain the reasoning in Chinese in `experiment_plan.explain.md`.
|
|
139
|
+
|
|
140
|
+
Keep open variables only when all are true:
|
|
141
|
+
|
|
142
|
+
- the information cannot be inferred reliably from current inputs or live
|
|
143
|
+
research
|
|
144
|
+
- the choice materially changes experiment objectives, claim coverage, workload
|
|
145
|
+
scale, baseline fairness, ethics, or story placement
|
|
146
|
+
- downstream skills cannot proceed sensibly without inheriting the uncertainty
|
|
147
|
+
|
|
148
|
+
Do not create broad lists of questions. Represent nonblocking unknowns as
|
|
149
|
+
assumptions, dependencies, optional claim-expansion modules, or handoff notes.
|
|
150
|
+
As user-confirmed content and research accumulate, open variables should shrink.
|
|
151
|
+
|
|
152
|
+
## Planning Method
|
|
153
|
+
|
|
154
|
+
Build the plan around the paper story, not around a generic evaluation
|
|
155
|
+
checklist. The plan must be claim-derived, not template-derived: treat the
|
|
156
|
+
template as scaffolding for consistency, and include content only when it
|
|
157
|
+
advances a specific paper claim, reader doubt, storytelling need, or downstream
|
|
158
|
+
handoff.
|
|
159
|
+
|
|
160
|
+
1. Normalize the paper into an experimental thesis, primary comparison,
|
|
161
|
+
operating conditions, venue/story evidence posture, and paper-specific claim
|
|
162
|
+
verbs such as demonstrate, isolate, quantify, rule out, stress-test,
|
|
163
|
+
calibrate, validate, attribute, generalize, diagnose, contextualize, or
|
|
164
|
+
explain.
|
|
165
|
+
2. Build a claim-to-evidence map before writing individual objectives.
|
|
166
|
+
3. Define workload/dataset, metric, and baseline registries once.
|
|
167
|
+
4. Organize experiment objectives by evidence role: motivation, method insight,
|
|
168
|
+
main evaluation, mechanism/ablation, robustness, generalization, contribution
|
|
169
|
+
boundary, human/perceptual evidence, deployment realism,
|
|
170
|
+
cost/scalability, or reproducibility.
|
|
171
|
+
5. For each objective, specify the claim supported, reviewer concern answered,
|
|
172
|
+
story placement, evidence scope, workloads, metrics, comparators,
|
|
173
|
+
presentation intent, expected result pattern, reader takeaway,
|
|
174
|
+
claim-calibration output, downstream handoff, dependencies, and priority.
|
|
175
|
+
6. Merge objectives that do not have a distinct claim, story role, reader
|
|
176
|
+
takeaway, or primary evidence output. Represent secondary needs as metric
|
|
177
|
+
slices, reporting views, or shared protocols.
|
|
178
|
+
7. Put optional broader-scope ideas into claim-expansion modules with activation
|
|
179
|
+
conditions.
|
|
180
|
+
8. Explain the rationale in Chinese as a causal argument for the user, not as a
|
|
181
|
+
field-by-field translation.
|
|
182
|
+
|
|
183
|
+
Motivation and design-insight experiments should make the core intuition visible
|
|
184
|
+
early. Use them to show an existing-system defect or a core-mechanism feasibility
|
|
185
|
+
signal before full-system evaluation. Their planned result should be immediately
|
|
186
|
+
readable: a curve separation, small table, heatmap, qualitative grid, timeline,
|
|
187
|
+
breakdown, representative case, or before/after panel.
|
|
188
|
+
|
|
189
|
+
Use deepresearch-derived venue and paper patterns as experiment-design choices,
|
|
190
|
+
not as citations or literature review. Convert them into concrete baseline
|
|
191
|
+
families, metric choices, ablation styles, robustness checks, artifact
|
|
192
|
+
expectations, and result-presentation intent for the current paper.
|
|
193
|
+
|
|
194
|
+
Before writing, ask of each experiment: What paper sentence or claim will this
|
|
195
|
+
evidence support? What doubt does it remove? Why are these metrics, baselines,
|
|
196
|
+
workloads, and ablations the right ones for this claim? What should the reader
|
|
197
|
+
conclude? Where will the result appear in the paper? What downstream
|
|
198
|
+
writing/figure/table handoff does it enable?
|
|
199
|
+
|
|
200
|
+
## Strategic Plan Boundary
|
|
201
|
+
|
|
202
|
+
`experiment_plan.md` should include:
|
|
203
|
+
|
|
204
|
+
- experimental thesis, primary comparison, and operating conditions
|
|
205
|
+
- venue/storytelling evidence posture
|
|
206
|
+
- claim-to-evidence map
|
|
207
|
+
- workload or dataset registry
|
|
208
|
+
- metric registry
|
|
209
|
+
- baseline registry
|
|
210
|
+
- resource, cost, statistical, and reproducibility principles when relevant
|
|
211
|
+
- experiment objectives organized by evidence role
|
|
212
|
+
- ablation, sensitivity, robustness, and claim-boundary objectives when needed
|
|
213
|
+
- optional claim-expansion modules for broader scope
|
|
214
|
+
- main-paper versus supplemental presentation intent at a strategic level
|
|
215
|
+
- objective dependency graph
|
|
216
|
+
|
|
217
|
+
`experiment_plan.md` should not include:
|
|
218
|
+
|
|
219
|
+
- source summaries or literature review prose
|
|
220
|
+
- confirmed-input ledger or user-facing explanation
|
|
221
|
+
- implementation owners
|
|
222
|
+
- shell commands, scripts, exact run matrices, hyperparameter grids, or code
|
|
223
|
+
- concrete output paths, logging schemas, manifest fields, or final figure files
|
|
224
|
+
- fabricated numeric results or claims that experiments have succeeded
|
|
225
|
+
- user reminders, disclaimers, or sections such as `Assumptions to validate`,
|
|
226
|
+
`Artifact cautions`, or `Do not assume reviewers will run code`
|
|
227
|
+
|
|
228
|
+
Use logical handles for outputs, such as `substitution_surface`,
|
|
229
|
+
`main_qoe_table`, or `stress_regime_matrix`. Later skills choose concrete file
|
|
230
|
+
names, logging formats, implementation details, and plotting layouts.
|
|
231
|
+
|
|
232
|
+
Avoid generic plan content. If a section, heading, experiment name, metric
|
|
233
|
+
rationale, baseline choice, or ablation could apply unchanged to another paper,
|
|
234
|
+
make it more specific to the current paper's thesis or remove it.
|
|
235
|
+
|
|
236
|
+
## Registries
|
|
237
|
+
|
|
238
|
+
Define shared registries once and reference IDs in objectives.
|
|
239
|
+
|
|
240
|
+
### Workload or Dataset Registry
|
|
241
|
+
|
|
242
|
+
Separate:
|
|
243
|
+
|
|
244
|
+
- `Required workloads/datasets`: committed by user input, blueprint, existing
|
|
245
|
+
evidence, or live-research-selected venue protocol.
|
|
246
|
+
- `Scope-extension candidates`: broaden scene, data, benchmark, substrate,
|
|
247
|
+
device, deployment, user-study, or contention claims.
|
|
248
|
+
|
|
249
|
+
Name workload classes or dataset families unless exact datasets are confirmed or
|
|
250
|
+
venue norms make a dataset clearly required.
|
|
251
|
+
|
|
252
|
+
### Metric Registry
|
|
253
|
+
|
|
254
|
+
Group metrics by evidence role, for example:
|
|
255
|
+
|
|
256
|
+
- primary claim quality/effectiveness
|
|
257
|
+
- latency/deadline/responsiveness
|
|
258
|
+
- cost/resource/efficiency
|
|
259
|
+
- robustness/stress/generalization
|
|
260
|
+
- mechanism/control/action behavior
|
|
261
|
+
- statistical reporting
|
|
262
|
+
- human/perceptual signal, when relevant
|
|
263
|
+
|
|
264
|
+
Objectives reference metric IDs only. Do not repeat metric definitions inside
|
|
265
|
+
every objective.
|
|
266
|
+
|
|
267
|
+
### Baseline Registry
|
|
268
|
+
|
|
269
|
+
Use compact entries:
|
|
270
|
+
|
|
271
|
+
```markdown
|
|
272
|
+
- `baseline_id`:
|
|
273
|
+
- Burden: minimum | diagnostic | optional_expensive
|
|
274
|
+
- Baseline role: canonical | recent_strong | simple | ablated_self | status_quo | oracle | deployment
|
|
275
|
+
- Comparison purpose:
|
|
276
|
+
- Fairness principle:
|
|
277
|
+
```
|
|
278
|
+
|
|
279
|
+
Use baseline ladders:
|
|
280
|
+
|
|
281
|
+
- canonical baselines expected by reviewers
|
|
282
|
+
- recent strong baselines from live research
|
|
283
|
+
- simple baselines that test whether complexity is justified
|
|
284
|
+
- ablated self-baselines that isolate mechanism
|
|
285
|
+
- status-quo or deployment baselines for systems papers
|
|
286
|
+
- oracle or upper-bound baselines only when they clarify headroom
|
|
287
|
+
|
|
288
|
+
Objectives own baseline usage through their `Comparators` field. The registry
|
|
289
|
+
defines each baseline once.
|
|
290
|
+
|
|
291
|
+
## Positive Evidence Language
|
|
292
|
+
|
|
293
|
+
Write the main plan as a positive evidence specification. Use fields such as:
|
|
294
|
+
|
|
295
|
+
- `Evidence goal`
|
|
296
|
+
- `Evidence scope`
|
|
297
|
+
- `Evidence role`
|
|
298
|
+
- `Story placement`
|
|
299
|
+
- `Reviewer concern answered`
|
|
300
|
+
- `Presentation intent`
|
|
301
|
+
- `Reader takeaway`
|
|
302
|
+
- `Claim calibration output`
|
|
303
|
+
- `Expected evidence outputs`
|
|
304
|
+
- `Handled by later skills`
|
|
305
|
+
|
|
306
|
+
Use positive limitation language: `limitation regime`, `unsupported regime`,
|
|
307
|
+
`claim boundary`, `stress sensitivity`, and `adaptation attribution`. Use
|
|
308
|
+
`failure` only for explicit diagnostic objectives where a failure-analysis
|
|
309
|
+
artifact is part of the evidence.
|
|
310
|
+
|
|
311
|
+
For engineering papers, do not organize the plan around weak-result
|
|
312
|
+
contingencies. Plan how the core intuition should be shown and verified. Express
|
|
313
|
+
risks as dependencies, open variables, stress regimes, or claim-boundary
|
|
314
|
+
objectives.
|
|
315
|
+
|
|
316
|
+
## `experiment_plan.md` Template
|
|
317
|
+
|
|
318
|
+
```markdown
|
|
319
|
+
# Experiment Plan: <Paper/System Name>
|
|
320
|
+
|
|
321
|
+
## 1. Experimental Thesis
|
|
322
|
+
|
|
323
|
+
- Experimental thesis:
|
|
324
|
+
- Primary comparison:
|
|
325
|
+
- Operating conditions:
|
|
326
|
+
- Venue/story evidence posture:
|
|
327
|
+
|
|
328
|
+
## 2. Claim-to-Evidence Map
|
|
329
|
+
|
|
330
|
+
| Claim | Reviewer Concern | Evidence Objective | Story Placement | Expected Evidence Output |
|
|
331
|
+
|---|---|---|---|---|
|
|
332
|
+
|
|
333
|
+
## 3. Workload and Dataset Registry
|
|
334
|
+
|
|
335
|
+
- Required workloads/datasets:
|
|
336
|
+
- Scope-extension candidates:
|
|
337
|
+
|
|
338
|
+
## 4. Metric Registry
|
|
339
|
+
|
|
340
|
+
- `<metric_id>`:
|
|
341
|
+
|
|
342
|
+
## 5. Baseline Registry
|
|
343
|
+
|
|
344
|
+
- `<baseline_id>`:
|
|
345
|
+
- Burden:
|
|
346
|
+
- Baseline role:
|
|
347
|
+
- Comparison purpose:
|
|
348
|
+
- Fairness principle:
|
|
349
|
+
|
|
350
|
+
## 6. Resource, Cost, Statistical, and Reproducibility Principles
|
|
351
|
+
|
|
352
|
+
- Resource/cost reporting:
|
|
353
|
+
- Statistical reporting:
|
|
354
|
+
- Artifact/reproducibility principle:
|
|
355
|
+
|
|
356
|
+
## 7. Core Experiment Objectives
|
|
357
|
+
|
|
358
|
+
### <Experiment Name>
|
|
359
|
+
|
|
360
|
+
- Story placement:
|
|
361
|
+
- Evidence goal:
|
|
362
|
+
- Claims supported:
|
|
363
|
+
- Reviewer concern answered:
|
|
364
|
+
- Evidence scope:
|
|
365
|
+
- Evidence role:
|
|
366
|
+
- Workloads/datasets:
|
|
367
|
+
- Controlled factors:
|
|
368
|
+
- Comparators:
|
|
369
|
+
- Metrics:
|
|
370
|
+
- Presentation intent:
|
|
371
|
+
- Expected evidence outputs:
|
|
372
|
+
- Expected result pattern:
|
|
373
|
+
- Reader takeaway:
|
|
374
|
+
- Claim calibration output:
|
|
375
|
+
- Handled by later skills:
|
|
376
|
+
- Dependencies:
|
|
377
|
+
- Priority:
|
|
378
|
+
|
|
379
|
+
## 8. Optional Claim-Expansion Modules
|
|
380
|
+
|
|
381
|
+
### <Module Name>
|
|
382
|
+
|
|
383
|
+
- Module type: claim_expansion_module
|
|
384
|
+
- Scope expanded:
|
|
385
|
+
- Activation condition:
|
|
386
|
+
- Use objective fields only when the module is activated.
|
|
387
|
+
|
|
388
|
+
## 9. Objective Dependency Graph
|
|
389
|
+
|
|
390
|
+
- <experiment/output> -> <experiment/output>:
|
|
391
|
+
```
|
|
392
|
+
|
|
393
|
+
Omit empty sections. Keep identifiers natural and readable; avoid abstract ID
|
|
394
|
+
systems such as `c1`, `c2`, `b1`, or `m1` unless the source paper already uses
|
|
395
|
+
them.
|
|
396
|
+
|
|
397
|
+
## `experiment_plan.explain.md` Template
|
|
398
|
+
|
|
399
|
+
Write this file in natural Chinese. English paper titles, venue names, method
|
|
400
|
+
names, datasets, benchmarks, and technical terms may remain in English when that
|
|
401
|
+
is clearer.
|
|
402
|
+
|
|
403
|
+
```markdown
|
|
404
|
+
# 实验计划说明:<论文/系统名>
|
|
405
|
+
|
|
406
|
+
## 用户已经明确的内容
|
|
407
|
+
|
|
408
|
+
记录本轮实际使用的用户指令、论文蓝图、旧计划、反馈、已有结果、目标 venue、约束和实时调研入口。
|
|
409
|
+
|
|
410
|
+
## 论文核心出发点
|
|
411
|
+
|
|
412
|
+
解释这篇论文想让审稿人相信什么,以及为什么实验必须围绕这些论点组织。
|
|
413
|
+
|
|
414
|
+
## 实时调研如何影响实验取舍
|
|
415
|
+
|
|
416
|
+
| 来源 | 日期 | venue_status | 影响到的规划决定 |
|
|
417
|
+
|---|---:|---|---|
|
|
418
|
+
|
|
419
|
+
## 实验故事线
|
|
420
|
+
|
|
421
|
+
用自然语言说明 motivation、method insight、main evaluation、ablation、
|
|
422
|
+
robustness、boundary、artifact evidence 如何串起来。
|
|
423
|
+
|
|
424
|
+
## 为什么选择这些实验
|
|
425
|
+
|
|
426
|
+
逐个实验解释:它支撑哪个 claim、解决哪个 reviewer concern、放在论文哪个叙事位置、预期结果如何帮助读者理解核心思想。
|
|
427
|
+
|
|
428
|
+
## 为什么选择这些基线
|
|
429
|
+
|
|
430
|
+
说明 canonical、recent strong、simple、self-ablation、status quo、oracle 等基线各自排除哪个疑虑。
|
|
431
|
+
|
|
432
|
+
## 为什么选择这些指标和工作负载
|
|
433
|
+
|
|
434
|
+
解释指标和 workload 如何服务论文论点,不要只解释字段含义。
|
|
435
|
+
|
|
436
|
+
## 结果展示策略
|
|
437
|
+
|
|
438
|
+
说明哪些结果适合主文,哪些适合补充材料;只做战略层面的图表/表格/案例意图,不设计最终图。
|
|
439
|
+
|
|
440
|
+
## 仍需继承的开放变量
|
|
441
|
+
|
|
442
|
+
只列真正影响实验规模、claim 覆盖、伦理/数据访问、baseline 公平性或 story placement 的未知项,并说明为什么当前信息不足以决定。
|
|
443
|
+
```
|
|
444
|
+
|
|
445
|
+
The explanation is for user confirmation, not for downstream execution. It
|
|
446
|
+
should let the user identify whether a questionable experiment comes from the
|
|
447
|
+
core thesis, target-venue prior, live-research pattern, or an inference step.
|
|
448
|
+
It should not describe the Markdown template, generation process, or section
|
|
449
|
+
mechanics.
|
|
450
|
+
|
|
451
|
+
## Revision Behavior
|
|
452
|
+
|
|
453
|
+
When revising an existing experiment plan, revise from concrete artifact content
|
|
454
|
+
and concrete feedback. Classify feedback as substantive, file-contract,
|
|
455
|
+
over-defensive/open-question, generic/template-driven, language/filename,
|
|
456
|
+
evidence-linkage, or non-controlling.
|
|
457
|
+
|
|
458
|
+
- Substantive feedback may change experiments, baselines, workloads, metrics,
|
|
459
|
+
story placement, or claim boundaries.
|
|
460
|
+
- File-contract and language/filename feedback should change filenames, language split,
|
|
461
|
+
section boundaries, or lint compliance without inventing new experiment
|
|
462
|
+
content.
|
|
463
|
+
- Over-defensive/open-question feedback should reduce unnecessary open variables
|
|
464
|
+
and turn inferable choices into committed plan decisions with Chinese
|
|
465
|
+
rationale.
|
|
466
|
+
- Generic/template-driven feedback should replace checklist-like sections with
|
|
467
|
+
paper-specific experiment objectives, claim verbs, reader doubts, expected
|
|
468
|
+
result patterns, and downstream handoffs.
|
|
469
|
+
- Evidence-linkage feedback should strengthen the claim-to-evidence map,
|
|
470
|
+
objective fields, reader takeaways, and explanation logic.
|
|
471
|
+
- Non-controlling feedback, which identifies no artifact-content defect, should not trigger changes
|
|
472
|
+
to experiment objectives, baselines, workloads, metrics, or output schema.
|
|
473
|
+
- Non-inspecting evaluator feedback that only says artifacts were not examined
|
|
474
|
+
or asks for artifact availability is outside this skill's academic-planning
|
|
475
|
+
scope. Unless it cites a content-specific defect, preserve the academic design
|
|
476
|
+
and output contract.
|
|
477
|
+
|
|
478
|
+
Make the smallest change that addresses the feedback while preserving the
|
|
479
|
+
two-file contract.
|
|
480
|
+
|
|
481
|
+
## Quality Checks
|
|
482
|
+
|
|
483
|
+
Before finalizing, check:
|
|
484
|
+
|
|
485
|
+
- Exactly two Markdown files are produced: `experiment_plan.md` and
|
|
486
|
+
`experiment_plan.explain.md`.
|
|
487
|
+
- `experiment_plan.md` is English-only and contains only the strategic plan.
|
|
488
|
+
- `experiment_plan.explain.md` is Chinese-first and begins with actual planning
|
|
489
|
+
inputs used.
|
|
490
|
+
- The main plan defines workload/dataset, metric, and baseline registries once.
|
|
491
|
+
- Objectives reference registry IDs rather than redefining baselines or metrics.
|
|
492
|
+
- Every baseline has `Burden: minimum | diagnostic | optional_expensive`.
|
|
493
|
+
- Every objective has claim support, story placement, reviewer concern,
|
|
494
|
+
presentation intent, expected evidence output, expected result pattern, reader
|
|
495
|
+
takeaway, and priority.
|
|
496
|
+
- Motivation/design-insight experiments make the core intuition visible before
|
|
497
|
+
full evaluation when the paper needs them.
|
|
498
|
+
- Main-paper versus supplemental presentation intent is strategic, not a final
|
|
499
|
+
figure design.
|
|
500
|
+
- Open variables appear only in the explanation and only when they materially
|
|
501
|
+
affect plan quality.
|
|
502
|
+
- Facts resolved by user input, blueprint, existing evidence, live research, or
|
|
503
|
+
clear inference are not restated as user questions.
|
|
504
|
+
- Every live-research anchor in the explanation has source, date,
|
|
505
|
+
`venue_status`, and the planning decision it changed.
|
|
506
|
+
- Deepresearch-derived venue and paper patterns are converted into design
|
|
507
|
+
choices, not generic citations or literature-review summaries.
|
|
508
|
+
- No section, heading, experiment name, metric rationale, baseline choice, or
|
|
509
|
+
ablation could apply unchanged to an unrelated paper.
|
|
510
|
+
- The plan contains no source prose, literature review, user-facing warnings,
|
|
511
|
+
shell commands, code, fabricated results, concrete output paths, logging
|
|
512
|
+
schemas, manifest fields, implementation owners, exact run scripts, or
|
|
513
|
+
runtime environment mechanics.
|
|
514
|
+
- The explanation explains design reasoning and paper-story fit, not template
|
|
515
|
+
mechanics or generation process.
|
|
516
|
+
- Objectives with overlapping claims, workloads, metrics, comparators, and
|
|
517
|
+
outputs are merged or represented as reporting views.
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
interface:
|
|
2
|
+
display_name: "Academic Army Experiment Plan"
|
|
3
|
+
short_description: "Strategic claim-to-evidence experiment plan plus Chinese rationale"
|
|
4
|
+
default_prompt: "Create an English experiment_plan.md and Chinese experiment_plan.explain.md with $academic-army-experiment-plan. Use the available blueprint, prior plans or artifacts, metaskill context, existing evidence, and revision feedback; use academic_army_mcp_tools.deepresearch for live venue and recent-paper experiment patterns; make the plan claim-derived rather than template-derived, and make decisive planning choices when the inputs and research are sufficient."
|
|
5
|
+
|
|
6
|
+
dependencies:
|
|
7
|
+
tools:
|
|
8
|
+
- type: "mcp"
|
|
9
|
+
value: "academic_army_mcp_tools"
|
|
10
|
+
description: "Provides academic_army_mcp_tools.deepresearch for live recent-paper, target-venue, baseline, dataset, metric, benchmark, artifact, motivation-pattern, and reviewer-expectation evidence."
|
|
11
|
+
transport: "stdio"
|