academic-army 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68) hide show
  1. package/.editorconfig +9 -0
  2. package/.github/workflows/publish.yml +44 -0
  3. package/.prettierrc.json +3 -0
  4. package/LICENSE +21 -0
  5. package/README.md +172 -0
  6. package/README.zh-CN.md +172 -0
  7. package/agent-forge.yaml +83 -0
  8. package/eslint.config.js +28 -0
  9. package/install_mcp.py +85 -0
  10. package/mcp-server/__main__.py +33 -0
  11. package/mcp-server/deepresearch/__init__.py +3 -0
  12. package/mcp-server/deepresearch/tools.py +33 -0
  13. package/mcp-server/requirements.txt +4 -0
  14. package/metaskills/README.md +131 -0
  15. package/metaskills/README.zh-CN.md +131 -0
  16. package/metaskills/academic-army-architect/METASKILL.md +91 -0
  17. package/metaskills/academic-army-architect/envolve.sh +9 -0
  18. package/metaskills/academic-army-coding-plan/ENVOLVETASK.md +1 -0
  19. package/metaskills/academic-army-coding-plan/METASKILL.md +118 -0
  20. package/metaskills/academic-army-coding-plan/envolve.sh +9 -0
  21. package/metaskills/academic-army-coding-style/METASKILL.md +292 -0
  22. package/metaskills/academic-army-experiment-plan/ENVOLVETASK.md +1 -0
  23. package/metaskills/academic-army-experiment-plan/METASKILL.md +82 -0
  24. package/metaskills/academic-army-experiment-plan/envolve.sh +9 -0
  25. package/metaskills/academic-army-repo-scaffold/ENVOLVETASK.md +1 -0
  26. package/metaskills/academic-army-repo-scaffold/METASKILL.md +223 -0
  27. package/metaskills/academic-army-repo-scaffold/envolve.sh +9 -0
  28. package/package.json +35 -0
  29. package/runs/develop-skill.sh +17 -0
  30. package/runs/develop.sh +16 -0
  31. package/skills/academic-army-architect/SKILL.md +336 -0
  32. package/skills/academic-army-architect/agents/openai.yaml +11 -0
  33. package/skills/academic-army-architect/references/blueprint-schema.md +345 -0
  34. package/skills/academic-army-coding-plan/SKILL.md +491 -0
  35. package/skills/academic-army-coding-plan/agents/openai.yaml +11 -0
  36. package/skills/academic-army-coding-style/SKILL.md +915 -0
  37. package/skills/academic-army-coding-style/agents/openai.yaml +11 -0
  38. package/skills/academic-army-experiment-plan/SKILL.md +517 -0
  39. package/skills/academic-army-experiment-plan/agents/openai.yaml +11 -0
  40. package/skills/academic-army-repo-scaffold/SKILL.md +756 -0
  41. package/skills/academic-army-repo-scaffold/agents/openai.yaml +10 -0
  42. package/src/README.md +79 -0
  43. package/src/README.zh-CN.md +79 -0
  44. package/src/cli.ts +55 -0
  45. package/src/developing/README.md +146 -0
  46. package/src/developing/README.zh-CN.md +146 -0
  47. package/src/developing/agents/developer.ts +40 -0
  48. package/src/developing/agents/factory.ts +11 -0
  49. package/src/developing/agents/index.ts +8 -0
  50. package/src/developing/agents/manager.ts +74 -0
  51. package/src/developing/agents/prompts.ts +12 -0
  52. package/src/developing/agents/reviewer.ts +44 -0
  53. package/src/developing/agents/trajectory-optimizer.ts +70 -0
  54. package/src/developing/agents/types.ts +41 -0
  55. package/src/developing/index.ts +2 -0
  56. package/src/developing/pipeline.ts +306 -0
  57. package/src/developing/pipelineskill.ts +169 -0
  58. package/src/evolve-skill/README.md +116 -0
  59. package/src/evolve-skill/README.zh-CN.md +116 -0
  60. package/src/evolve-skill/agents/evaluator.ts +28 -0
  61. package/src/evolve-skill/agents/factory.ts +11 -0
  62. package/src/evolve-skill/agents/index.ts +4 -0
  63. package/src/evolve-skill/agents/modifier.ts +27 -0
  64. package/src/evolve-skill/agents/runner.ts +19 -0
  65. package/src/evolve-skill/index.ts +1 -0
  66. package/src/evolve-skill/pipeline.ts +140 -0
  67. package/src/pipeline.ts +65 -0
  68. package/tsconfig.json +22 -0
@@ -0,0 +1,11 @@
1
+ interface:
2
+ display_name: "Academic Army Coding Style"
3
+ short_description: "Keep research-code changes clean and local"
4
+ default_prompt: "Use $academic-army-coding-style while making this code or framework change so the implementation stays readable, local, low-coupling, and consistent with the existing repository."
5
+
6
+ dependencies:
7
+ tools:
8
+ - type: "mcp"
9
+ value: "academic_army_mcp_tools"
10
+ description: "Provides academic_army_mcp_tools.deepresearch when unfamiliar ecosystem conventions, harness/test practices, framework organization, or open-source reuse decisions need current evidence."
11
+ transport: "stdio"
@@ -0,0 +1,517 @@
1
+ ---
2
+ name: academic-army-experiment-plan
3
+ description: >-
4
+ Create a strategic, evidence-driven academic experiment plan from a research
5
+ idea, paper_blueprint.md, paper claims, storytelling blueprint, target venue,
6
+ existing results, prior plans, or revision feedback. Produces exactly two
7
+ Markdown files: an English AI-facing experiment_plan.md and a Chinese
8
+ human-facing experiment_plan.explain.md. Uses
9
+ academic_army_mcp_tools.deepresearch for live target-venue, influential-paper,
10
+ baseline, dataset/workload, metric, artifact, autoresearch, and reviewer
11
+ expectation research before making claim-to-evidence planning choices.
12
+ ---
13
+
14
+ # Academic Army Experiment Plan
15
+
16
+ ## Purpose
17
+
18
+ Create a strategic experiment plan for an academic paper. The plan turns paper
19
+ claims into an evidence strategy that later AI skills can inherit when they
20
+ write code, run experiments, plan figures, and draft paper sections.
21
+
22
+ Own the experiment-strategy layer:
23
+
24
+ - paper thesis and claim-to-evidence mapping
25
+ - experiment objectives and their paper-story roles
26
+ - workload/dataset, metric, and baseline strategy
27
+ - reader interpretation and reviewer-concern coverage
28
+ - ablation, robustness, boundary, and artifact-readiness objectives
29
+ - Chinese rationale that lets the user judge whether the strategy is reasonable
30
+
31
+ Do not execute experiments, write code, prescribe shell commands, create exact
32
+ run matrices, fabricate results, or produce final figures.
33
+
34
+ ## Output Contract
35
+
36
+ Create exactly two Markdown files in the requested output directory:
37
+
38
+ 1. `experiment_plan.md`
39
+ - English.
40
+ - AI-facing.
41
+ - Contains only the strategic experiment plan.
42
+ - Uses stable experiment names, registries, and fields that downstream skills
43
+ can inherit.
44
+
45
+ 2. `experiment_plan.explain.md`
46
+ - Chinese.
47
+ - Human-facing.
48
+ - Starts with the concrete user-provided and locally available planning
49
+ inputs used in this invocation.
50
+ - Explains why the experiment portfolio is reasonable for the paper.
51
+ - Explains choices from the paper thesis, blueprint, target venue, live
52
+ research, existing evidence, and storytelling needs.
53
+
54
+ Do not create extra output files. Do not put provenance, source summaries,
55
+ user-facing review notes, or skill-internal process comments inside
56
+ `experiment_plan.md`.
57
+
58
+ ## Inputs To Use
59
+
60
+ Use supplied or available project context before planning. Prefer
61
+ `paper_blueprint.md` when present. Also use prior experiment plans,
62
+ preliminary results, revision feedback, target-venue notes, metaskill design
63
+ goals, or artifact summaries when they are part of the current task.
64
+
65
+ Extract or infer:
66
+
67
+ - paper goal, title, field, subfield, target venue, year, and track
68
+ - central research bet and novelty boundary
69
+ - main claims and expected reviewer concerns
70
+ - storytelling posture: motivation, method insight, main evidence, claim
71
+ boundary, and reader journey
72
+ - required or preferred datasets, workloads, baselines, metrics, hardware,
73
+ traces, artifacts, or deployment setting
74
+ - known constraints: compute, privacy, data access, unavailable baselines,
75
+ human-subject constraints, deadline, target track, or reproducibility needs
76
+ - existing evidence: preliminary numbers, pilot studies, prior figures, logs,
77
+ notes, old experiment plans, reviews, rebuttal feedback, or artifact feedback
78
+
79
+ Use live research and paper goals to infer nonblocking missing details. Ask for
80
+ additional user input only when a missing fact would materially change claim
81
+ coverage, workload scale, baseline fairness, ethics, or story placement and no
82
+ defensible default can be inferred.
83
+
84
+ ## Required Deepresearch
85
+
86
+ Once project-specific context is understood, use
87
+ `academic_army_mcp_tools.deepresearch` for every nontrivial plan or substantive revision.
88
+
89
+ - Server: `academic_army_mcp_tools`
90
+ - Tool: `deepresearch`
91
+ - Canonical MCP name when exposed:
92
+ `mcp__academic_army_mcp_tools__deepresearch`
93
+
94
+ Ask deepresearch for concise planning lessons, not a literature review. The prompt should
95
+ cover multiple perspectives:
96
+
97
+ - current or recent target-venue experiment expectations
98
+ - high-impact or high-citation papers from the target venue and adjacent top
99
+ venues such as SIGGRAPH, CVPR, SIGCOMM, NSDI, INFOCOM, MMSys, CHI, NeurIPS,
100
+ ICML, ICLR, ACL, or domain-specific venues when relevant
101
+ - why those papers' experiments are persuasive: methods, datasets, baselines,
102
+ metrics, result presentation, artifacts, and claim boundaries
103
+ - recent methods, datasets, workloads, baselines, metrics, benchmarks,
104
+ artifacts, and result-presentation patterns in the paper's subfield
105
+ - motivation or design-insight experiment patterns that make the core intuition
106
+ visible before full-system evaluation
107
+ - autoresearch, scientific-discovery, paper-writing-agent, benchmark, prompt
108
+ template, and experiment-automation workflow lessons when they improve
109
+ planning or downstream handoff
110
+
111
+ The skill defines what to research, not the answer. Do not hardcode mutable
112
+ venue norms, current SOTA baselines, or dataset preferences into the skill body;
113
+ derive them at invocation time through deepresearch.
114
+
115
+ For each live-research anchor that changes the plan, record this in
116
+ `experiment_plan.explain.md`:
117
+
118
+ - `source` (table column `来源`): title and link
119
+ - `date` (table column `日期`): visible publication, submission, event, metadata, or page date
120
+ - `venue_status`: one of `official_proceedings`, `arxiv_only`,
121
+ `project_page_claim`, `secondary_metadata`, or `classic_background`
122
+ - `影响到的规划决定` (the planning decision affected): the baseline, metric, workload, experiment placement,
123
+ evidence style, artifact expectation, or claim boundary it changed
124
+
125
+ Use `official_proceedings` only when the venue is confirmed by conference,
126
+ proceedings, publisher, DOI metadata, or official venue pages. Use
127
+ `arxiv_only` for arXiv records. Use `project_page_claim` for author, lab, or
128
+ project claims not confirmed elsewhere. Use `secondary_metadata` for
129
+ aggregators or institutional pages. Use `classic_background` for older
130
+ foundational precedents.
131
+
132
+ ## Decision Sufficiency Policy
133
+
134
+ Make goal-oriented choices. Do not transfer obvious decisions to the user.
135
+
136
+ When user input, the blueprint, existing evidence, and deepresearch make one
137
+ choice clearly better for the paper, write that choice into `experiment_plan.md`
138
+ and explain the reasoning in Chinese in `experiment_plan.explain.md`.
139
+
140
+ Keep open variables only when all are true:
141
+
142
+ - the information cannot be inferred reliably from current inputs or live
143
+ research
144
+ - the choice materially changes experiment objectives, claim coverage, workload
145
+ scale, baseline fairness, ethics, or story placement
146
+ - downstream skills cannot proceed sensibly without inheriting the uncertainty
147
+
148
+ Do not create broad lists of questions. Represent nonblocking unknowns as
149
+ assumptions, dependencies, optional claim-expansion modules, or handoff notes.
150
+ As user-confirmed content and research accumulate, open variables should shrink.
151
+
152
+ ## Planning Method
153
+
154
+ Build the plan around the paper story, not around a generic evaluation
155
+ checklist. The plan must be claim-derived, not template-derived: treat the
156
+ template as scaffolding for consistency, and include content only when it
157
+ advances a specific paper claim, reader doubt, storytelling need, or downstream
158
+ handoff.
159
+
160
+ 1. Normalize the paper into an experimental thesis, primary comparison,
161
+ operating conditions, venue/story evidence posture, and paper-specific claim
162
+ verbs such as demonstrate, isolate, quantify, rule out, stress-test,
163
+ calibrate, validate, attribute, generalize, diagnose, contextualize, or
164
+ explain.
165
+ 2. Build a claim-to-evidence map before writing individual objectives.
166
+ 3. Define workload/dataset, metric, and baseline registries once.
167
+ 4. Organize experiment objectives by evidence role: motivation, method insight,
168
+ main evaluation, mechanism/ablation, robustness, generalization, contribution
169
+ boundary, human/perceptual evidence, deployment realism,
170
+ cost/scalability, or reproducibility.
171
+ 5. For each objective, specify the claim supported, reviewer concern answered,
172
+ story placement, evidence scope, workloads, metrics, comparators,
173
+ presentation intent, expected result pattern, reader takeaway,
174
+ claim-calibration output, downstream handoff, dependencies, and priority.
175
+ 6. Merge objectives that do not have a distinct claim, story role, reader
176
+ takeaway, or primary evidence output. Represent secondary needs as metric
177
+ slices, reporting views, or shared protocols.
178
+ 7. Put optional broader-scope ideas into claim-expansion modules with activation
179
+ conditions.
180
+ 8. Explain the rationale in Chinese as a causal argument for the user, not as a
181
+ field-by-field translation.
182
+
183
+ Motivation and design-insight experiments should make the core intuition visible
184
+ early. Use them to show an existing-system defect or a core-mechanism feasibility
185
+ signal before full-system evaluation. Their planned result should be immediately
186
+ readable: a curve separation, small table, heatmap, qualitative grid, timeline,
187
+ breakdown, representative case, or before/after panel.
188
+
189
+ Use deepresearch-derived venue and paper patterns as experiment-design choices,
190
+ not as citations or literature review. Convert them into concrete baseline
191
+ families, metric choices, ablation styles, robustness checks, artifact
192
+ expectations, and result-presentation intent for the current paper.
193
+
194
+ Before writing, ask of each experiment: What paper sentence or claim will this
195
+ evidence support? What doubt does it remove? Why are these metrics, baselines,
196
+ workloads, and ablations the right ones for this claim? What should the reader
197
+ conclude? Where will the result appear in the paper? What downstream
198
+ writing/figure/table handoff does it enable?
199
+
200
+ ## Strategic Plan Boundary
201
+
202
+ `experiment_plan.md` should include:
203
+
204
+ - experimental thesis, primary comparison, and operating conditions
205
+ - venue/storytelling evidence posture
206
+ - claim-to-evidence map
207
+ - workload or dataset registry
208
+ - metric registry
209
+ - baseline registry
210
+ - resource, cost, statistical, and reproducibility principles when relevant
211
+ - experiment objectives organized by evidence role
212
+ - ablation, sensitivity, robustness, and claim-boundary objectives when needed
213
+ - optional claim-expansion modules for broader scope
214
+ - main-paper versus supplemental presentation intent at a strategic level
215
+ - objective dependency graph
216
+
217
+ `experiment_plan.md` should not include:
218
+
219
+ - source summaries or literature review prose
220
+ - confirmed-input ledger or user-facing explanation
221
+ - implementation owners
222
+ - shell commands, scripts, exact run matrices, hyperparameter grids, or code
223
+ - concrete output paths, logging schemas, manifest fields, or final figure files
224
+ - fabricated numeric results or claims that experiments have succeeded
225
+ - user reminders, disclaimers, or sections such as `Assumptions to validate`,
226
+ `Artifact cautions`, or `Do not assume reviewers will run code`
227
+
228
+ Use logical handles for outputs, such as `substitution_surface`,
229
+ `main_qoe_table`, or `stress_regime_matrix`. Later skills choose concrete file
230
+ names, logging formats, implementation details, and plotting layouts.
231
+
232
+ Avoid generic plan content. If a section, heading, experiment name, metric
233
+ rationale, baseline choice, or ablation could apply unchanged to another paper,
234
+ make it more specific to the current paper's thesis or remove it.
235
+
236
+ ## Registries
237
+
238
+ Define shared registries once and reference IDs in objectives.
239
+
240
+ ### Workload or Dataset Registry
241
+
242
+ Separate:
243
+
244
+ - `Required workloads/datasets`: committed by user input, blueprint, existing
245
+ evidence, or live-research-selected venue protocol.
246
+ - `Scope-extension candidates`: broaden scene, data, benchmark, substrate,
247
+ device, deployment, user-study, or contention claims.
248
+
249
+ Name workload classes or dataset families unless exact datasets are confirmed or
250
+ venue norms make a dataset clearly required.
251
+
252
+ ### Metric Registry
253
+
254
+ Group metrics by evidence role, for example:
255
+
256
+ - primary claim quality/effectiveness
257
+ - latency/deadline/responsiveness
258
+ - cost/resource/efficiency
259
+ - robustness/stress/generalization
260
+ - mechanism/control/action behavior
261
+ - statistical reporting
262
+ - human/perceptual signal, when relevant
263
+
264
+ Objectives reference metric IDs only. Do not repeat metric definitions inside
265
+ every objective.
266
+
267
+ ### Baseline Registry
268
+
269
+ Use compact entries:
270
+
271
+ ```markdown
272
+ - `baseline_id`:
273
+ - Burden: minimum | diagnostic | optional_expensive
274
+ - Baseline role: canonical | recent_strong | simple | ablated_self | status_quo | oracle | deployment
275
+ - Comparison purpose:
276
+ - Fairness principle:
277
+ ```
278
+
279
+ Use baseline ladders:
280
+
281
+ - canonical baselines expected by reviewers
282
+ - recent strong baselines from live research
283
+ - simple baselines that test whether complexity is justified
284
+ - ablated self-baselines that isolate mechanism
285
+ - status-quo or deployment baselines for systems papers
286
+ - oracle or upper-bound baselines only when they clarify headroom
287
+
288
+ Objectives own baseline usage through their `Comparators` field. The registry
289
+ defines each baseline once.
290
+
291
+ ## Positive Evidence Language
292
+
293
+ Write the main plan as a positive evidence specification. Use fields such as:
294
+
295
+ - `Evidence goal`
296
+ - `Evidence scope`
297
+ - `Evidence role`
298
+ - `Story placement`
299
+ - `Reviewer concern answered`
300
+ - `Presentation intent`
301
+ - `Reader takeaway`
302
+ - `Claim calibration output`
303
+ - `Expected evidence outputs`
304
+ - `Handled by later skills`
305
+
306
+ Use positive limitation language: `limitation regime`, `unsupported regime`,
307
+ `claim boundary`, `stress sensitivity`, and `adaptation attribution`. Use
308
+ `failure` only for explicit diagnostic objectives where a failure-analysis
309
+ artifact is part of the evidence.
310
+
311
+ For engineering papers, do not organize the plan around weak-result
312
+ contingencies. Plan how the core intuition should be shown and verified. Express
313
+ risks as dependencies, stress regimes, or claim-boundary objectives, and keep
314
+ any remaining open variables in `experiment_plan.explain.md` only.
315
+
316
+ ## `experiment_plan.md` Template
317
+
318
+ ```markdown
319
+ # Experiment Plan: <Paper/System Name>
320
+
321
+ ## 1. Experimental Thesis
322
+
323
+ - Experimental thesis:
324
+ - Primary comparison:
325
+ - Operating conditions:
326
+ - Venue/story evidence posture:
327
+
328
+ ## 2. Claim-to-Evidence Map
329
+
330
+ | Claim | Reviewer Concern | Evidence Objective | Story Placement | Expected Evidence Output |
331
+ |---|---|---|---|---|
332
+
333
+ ## 3. Workload and Dataset Registry
334
+
335
+ - Required workloads/datasets:
336
+ - Scope-extension candidates:
337
+
338
+ ## 4. Metric Registry
339
+
340
+ - `<metric_id>`:
341
+
342
+ ## 5. Baseline Registry
343
+
344
+ - `<baseline_id>`:
345
+ - Burden:
346
+ - Baseline role:
347
+ - Comparison purpose:
348
+ - Fairness principle:
349
+
350
+ ## 6. Resource, Cost, and Reproducibility Principles
351
+
352
+ - Resource/cost reporting:
353
+ - Statistical reporting:
354
+ - Artifact/reproducibility principle:
355
+
356
+ ## 7. Core Experiment Objectives
357
+
358
+ ### <Experiment Name>
359
+
360
+ - Story placement:
361
+ - Evidence goal:
362
+ - Claims supported:
363
+ - Reviewer concern answered:
364
+ - Evidence scope:
365
+ - Evidence role:
366
+ - Workloads/datasets:
367
+ - Controlled factors:
368
+ - Comparators:
369
+ - Metrics:
370
+ - Presentation intent:
371
+ - Expected evidence outputs:
372
+ - Expected result pattern:
373
+ - Reader takeaway:
374
+ - Claim calibration output:
375
+ - Handled by later skills:
376
+ - Dependencies:
377
+ - Priority:
378
+
379
+ ## 8. Optional Claim-Expansion Modules
380
+
381
+ ### <Module Name>
382
+
383
+ - Module type: claim_expansion_module
384
+ - Scope expanded:
385
+ - Activation condition:
386
+ - Use objective fields only when the module is activated.
387
+
388
+ ## 9. Objective Dependency Graph
389
+
390
+ - <experiment/output> -> <experiment/output>:
391
+ ```
392
+
393
+ Omit empty sections. Keep identifiers natural and readable; avoid abstract ID
394
+ systems such as `c1`, `c2`, `b1`, or `m1` unless the source paper already uses
395
+ them.
396
+
397
+ ## `experiment_plan.explain.md` Template
398
+
399
+ Write this file in natural Chinese. English paper titles, venue names, method
400
+ names, datasets, benchmarks, and technical terms may remain in English when that
401
+ is clearer.
402
+
403
+ ```markdown
404
+ # 实验计划说明:<论文/系统名>
405
+
406
+ ## 用户已经明确的内容
407
+
408
+ 记录本轮实际使用的用户指令、论文蓝图、旧计划、反馈、已有结果、目标 venue、约束和实时调研入口。
409
+
410
+ ## 论文核心出发点
411
+
412
+ 解释这篇论文想让审稿人相信什么,以及为什么实验必须围绕这些论点组织。
413
+
414
+ ## 实时调研如何影响实验取舍
415
+
416
+ | 来源 | 日期 | venue_status | 影响到的规划决定 |
417
+ |---|---:|---|---|
418
+
419
+ ## 实验故事线
420
+
421
+ 用自然语言说明 motivation、method insight、main evaluation、ablation、
422
+ robustness、boundary、artifact evidence 如何串起来。
423
+
424
+ ## 为什么选择这些实验
425
+
426
+ 逐个实验解释:它支撑哪个 claim、解决哪个 reviewer concern、放在论文哪个叙事位置、预期结果如何帮助读者理解核心思想。
427
+
428
+ ## 为什么选择这些基线
429
+
430
+ 说明 canonical、recent strong、simple、self-ablation、status quo、oracle 等基线各自排除哪个疑虑。
431
+
432
+ ## 为什么选择这些指标和工作负载
433
+
434
+ 解释指标和 workload 如何服务论文论点,不要只解释字段含义。
435
+
436
+ ## 结果展示策略
437
+
438
+ 说明哪些结果适合主文,哪些适合补充材料;只做战略层面的图表/表格/案例意图,不设计最终图。
439
+
440
+ ## 仍需继承的开放变量
441
+
442
+ 只列真正影响实验规模、claim 覆盖、伦理/数据访问、baseline 公平性或 story placement 的未知项,并说明为什么当前信息不足以决定。
443
+ ```
444
+
445
+ The explanation is for user confirmation, not for downstream execution. It
446
+ should let the user identify whether a questionable experiment comes from the
447
+ core thesis, target-venue prior, live-research pattern, or an inference step.
448
+ It should not describe the Markdown template, generation process, or section
449
+ mechanics.
450
+
451
+ ## Revision Behavior
452
+
453
+ When revising an existing experiment plan, revise from concrete artifact content
454
+ and concrete feedback. Classify feedback as substantive, file-contract,
455
+ over-defensive/open-question, generic/template-driven, language/filename,
456
+ evidence-linkage, or non-controlling.
457
+
458
+ - Substantive feedback may change experiments, baselines, workloads, metrics,
459
+ story placement, or claim boundaries.
460
+ - File-contract and language/filename feedback should change filenames, language split,
461
+ section boundaries, or lint compliance without inventing new experiment
462
+ content.
463
+ - Over-defensive/open-question feedback should reduce unnecessary open variables
464
+ and turn inferable choices into committed plan decisions with Chinese
465
+ rationale.
466
+ - Generic/template-driven feedback should replace checklist-like sections with
467
+ paper-specific experiment objectives, claim verbs, reader doubts, expected
468
+ result patterns, and downstream handoffs.
469
+ - Evidence-linkage feedback should strengthen the claim-to-evidence map,
470
+ objective fields, reader takeaways, and explanation logic.
471
+ - Non-controlling feedback (feedback that identifies no artifact-content defect) should not trigger changes
472
+ to experiment objectives, baselines, workloads, metrics, or output schema.
473
+ - Evaluator feedback that inspected no artifact content and only says artifacts were not examined
474
+ or asks for artifact availability is outside this skill's academic-planning
475
+ scope. Unless it cites a content-specific defect, preserve the academic design
476
+ and output contract.
477
+
478
+ Make the smallest change that addresses the feedback while preserving the
479
+ two-file contract.
480
+
481
+ ## Quality Checks
482
+
483
+ Before finalizing, check:
484
+
485
+ - Exactly two Markdown files are produced: `experiment_plan.md` and
486
+ `experiment_plan.explain.md`.
487
+ - `experiment_plan.md` is English-only and contains only the strategic plan.
488
+ - `experiment_plan.explain.md` is Chinese-first and begins with actual planning
489
+ inputs used.
490
+ - The main plan defines workload/dataset, metric, and baseline registries once.
491
+ - Objectives reference registry IDs rather than redefining baselines or metrics.
492
+ - Every baseline has a `Burden` value of `minimum`, `diagnostic`, or `optional_expensive`.
493
+ - Every objective has claim support, story placement, reviewer concern,
494
+ presentation intent, expected evidence output, expected result pattern, reader
495
+ takeaway, and priority.
496
+ - Motivation/design-insight experiments make the core intuition visible before
497
+ full evaluation when the paper needs them.
498
+ - Main-paper versus supplemental presentation intent is strategic, not a final
499
+ figure design.
500
+ - Open variables appear only in the explanation and only when they materially
501
+ affect plan quality.
502
+ - Facts resolved by user input, blueprint, existing evidence, live research, or
503
+ clear inference are not restated as user questions.
504
+ - Every live-research anchor in the explanation has source, date,
505
+ `venue_status`, and the planning decision it changed.
506
+ - Deepresearch-derived venue and paper patterns are converted into design
507
+ choices, not generic citations or literature-review summaries.
508
+ - No section, heading, experiment name, metric rationale, baseline choice, or
509
+ ablation could apply unchanged to an unrelated paper.
510
+ - The plan contains no source prose, literature review, user-facing warnings,
511
+ shell commands, code, fabricated results, concrete output paths, logging
512
+ schemas, manifest fields, implementation owners, exact run scripts, or
513
+ runtime environment mechanics.
514
+ - The explanation explains design reasoning and paper-story fit, not template
515
+ mechanics or generation process.
516
+ - Objectives with overlapping claims, workloads, metrics, comparators, and
517
+ outputs are merged or represented as reporting views.
@@ -0,0 +1,11 @@
1
+ interface:
2
+ display_name: "Academic Army Experiment Plan"
3
+ short_description: "Strategic claim-to-evidence experiment plan plus Chinese rationale"
4
+ default_prompt: "Create an English experiment_plan.md and Chinese experiment_plan.explain.md with $academic-army-experiment-plan. Use the available blueprint, prior plans or artifacts, metaskill context, existing evidence, and revision feedback; use academic_army_mcp_tools.deepresearch for live venue and recent-paper experiment patterns; make the plan claim-derived rather than template-derived, and make decisive planning choices when the inputs and research are sufficient."
5
+
6
+ dependencies:
7
+ tools:
8
+ - type: "mcp"
9
+ value: "academic_army_mcp_tools"
10
+ description: "Provides academic_army_mcp_tools.deepresearch for live recent-paper, target-venue, baseline, dataset, metric, benchmark, artifact, motivation-pattern, and reviewer-expectation evidence."
11
+ transport: "stdio"