npm - @researai/deepscientist - Versions diffs - 1.5.1 → 1.5.2 - Mend

@researai/deepscientist 1.5.1 → 1.5.2

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (97)

package/src/deepscientist/quest/stage_views.py CHANGED Viewed

@@ -62,6 +62,38 @@ def _field(label: str, value: object, *, tone: str = "default") -> dict[str, Any
     }
+def _evaluation_summary(value: object) -> dict[str, Any]:
+    if not isinstance(value, dict):
+        return {}
+    normalized: dict[str, Any] = {}
+    for key in (
+        "takeaway",
+        "claim_update",
+        "baseline_relation",
+        "comparability",
+        "failure_mode",
+        "next_action",
+    ):
+        raw = value.get(key)
+        text = str(raw).strip() if raw is not None else ""
+        if text:
+            normalized[key] = text
+    return normalized
+def _evaluation_summary_fields(value: object, *, prefix: str = "Evaluation") -> list[dict[str, Any]]:
+    summary = _evaluation_summary(value)
+    labels = (
+        ("takeaway", f"{prefix} Takeaway"),
+        ("claim_update", f"{prefix} Claim Update"),
+        ("baseline_relation", f"{prefix} Baseline Relation"),
+        ("comparability", f"{prefix} Comparability"),
+        ("failure_mode", f"{prefix} Failure Mode"),
+        ("next_action", f"{prefix} Next Action"),
+    )
+    return [_field(label, summary[key]) for key, label in labels if summary.get(key)]
 def _artifact_sort_key(item: dict[str, Any]) -> tuple[str, str]:
     payload = item.get("payload") if isinstance(item.get("payload"), dict) else {}
     return (
@@ -814,6 +846,9 @@ class QuestStageViewBuilder:
         )
         latest_metrics_summary = latest_experiment_payload.get("metrics_summary") or latest_result_payload.get("metrics_summary") or {}
         latest_run_id = str(latest_experiment_payload.get("run_id") or "").strip() or None
+        latest_evaluation_summary = _evaluation_summary(
+            latest_experiment_payload.get("evaluation_summary") or latest_result_payload.get("evaluation_summary")
+        )
         analysis_manifests = self._analysis_manifests()
         analysis_manifest = next(
@@ -883,6 +918,7 @@ class QuestStageViewBuilder:
                 _field("Latest Metrics", latest_metrics_summary or "Not recorded"),
                 _field("Delta vs Baseline", latest_progress_eval.get("delta_vs_baseline") or "Not recorded"),
                 _field("Breakthrough", latest_progress_eval.get("breakthrough_level") or "Not recorded"),
+                *_evaluation_summary_fields(latest_evaluation_summary),
             ],
             key_files=self._dedupe_files(
                 [
@@ -940,6 +976,7 @@ class QuestStageViewBuilder:
                         "verdict": latest_experiment_payload.get("verdict"),
                         "metrics_summary": latest_metrics_summary,
                         "progress_eval": latest_progress_eval,
+                        "evaluation_summary": latest_evaluation_summary,
                         "run_md_path": latest_experiment_paths.get("run_md"),
                         "result_json_path": latest_experiment_paths.get("result_json"),
                     }
@@ -979,6 +1016,7 @@ class QuestStageViewBuilder:
         result_payload = read_json(Path(paths.get("result_json")), {}) if str(paths.get("result_json") or "").strip() else {}
         progress_eval = payload.get("progress_eval") or result_payload.get("progress_eval") or {}
         baseline_ref = payload.get("baseline_ref") or result_payload.get("baseline_ref") or {}
+        evaluation_summary = _evaluation_summary(payload.get("evaluation_summary") or result_payload.get("evaluation_summary"))
         run_id = str(payload.get("run_id") or "pending").strip() or "pending"
         note = (
             str(payload.get("summary") or result_payload.get("conclusion") or (progress_eval or {}).get("reason") or "").strip()
@@ -1028,6 +1066,7 @@ class QuestStageViewBuilder:
                 _field("Metrics Summary", metrics_summary or "Not recorded"),
                 _field("Delta vs Baseline", (progress_eval or {}).get("delta_vs_baseline") or "Not recorded"),
                 _field("Breakthrough Level", (progress_eval or {}).get("breakthrough_level") or "Not recorded"),
+                *_evaluation_summary_fields(evaluation_summary),
             ],
             key_files=key_files,
             history=self._artifact_history(experiment_items),
@@ -1040,6 +1079,7 @@ class QuestStageViewBuilder:
                     "baseline_ref": baseline_ref,
                     "metrics_summary": metrics_summary,
                     "progress_eval": progress_eval,
+                    "evaluation_summary": evaluation_summary,
                     "result_payload": result_payload,
                 }
             },
@@ -1141,6 +1181,9 @@ class QuestStageViewBuilder:
                     "reviewer_resolution": detail_payload.get("reviewer_resolution"),
                     "manuscript_update_hint": detail_payload.get("manuscript_update_hint"),
                     "next_recommendation": detail_payload.get("next_recommendation"),
+                    "evaluation_summary": _evaluation_summary(
+                        run_payload.get("evaluation_summary") or detail_payload.get("evaluation_summary")
+                    ),
                     "deviations": detail_payload.get("deviations") or [],
                     "evidence_paths": detail_payload.get("evidence_paths") or [],
                     "plan_path": item.get("plan_path"),
@@ -1233,8 +1276,11 @@ class QuestStageViewBuilder:
             self._file_entry("paper/writing_plan.md", label="Writing Plan", description="Paper writing plan."),
             self._file_entry("paper/references.bib", label="References", description="Bibliography file."),
             self._file_entry("paper/claim_evidence_map.json", label="Claim-Evidence Map", description="Claim to evidence mapping."),
+            self._file_entry("paper/baseline_inventory.json", label="Baseline Inventory", description="Canonical and supplementary baseline inventory for writing."),
             self._file_entry("paper/build/compile_report.json", label="Compile Report", description="Paper build/compile report."),
             self._file_entry("paper/paper_bundle_manifest.json", label="Bundle Manifest", description="Final paper bundle manifest."),
+            self._file_entry("release/open_source/manifest.json", label="Open Source Manifest", description="Open-source cleanup and release preparation manifest."),
+            self._file_entry("release/open_source/cleanup_plan.md", label="Open Source Cleanup Plan", description="Checklist for cleaning the paper branch into a public release."),
             self._file_entry(latex_root_rel, label="LaTeX Sources", description="LaTeX source folder.", expected_kind="directory"),
             self._file_entry(main_tex_rel, label="Main TeX", description="Primary TeX source file."),
         ]

package/src/deepscientist/runners/codex.py CHANGED Viewed

@@ -530,6 +530,7 @@ class CodexRunner:
         env = dict(**os.environ)
         env["CODEX_HOME"] = str(codex_home)
+        env["DEEPSCIENTIST_HOME"] = str(self.home)
         env["DS_HOME"] = str(self.home)
         env["DS_QUEST_ID"] = request.quest_id
         env["DS_QUEST_ROOT"] = str(request.quest_root)
@@ -846,6 +847,7 @@ class CodexRunner:
             tool_timeout_sec = None
         shared_env = {
+            "DEEPSCIENTIST_HOME": str(self.home),
             "DS_HOME": str(self.home),
             "DS_QUEST_ID": quest_id,
             "DS_QUEST_ROOT": str(quest_root),

package/src/deepscientist/shared.py CHANGED Viewed

@@ -71,7 +71,8 @@ def write_json(path: Path, payload: Any) -> None:
     )
-def read_json(path: Path, default: Any = None) -> Any:
+def read_json(path: Path | str, default: Any = None) -> Any:
+    path = Path(path)
     if not path.exists():
         return default
     payload = path.read_text(encoding="utf-8").strip()
@@ -155,35 +156,61 @@ def which(binary: str) -> str | None:
     return shutil.which(binary)
-def resolve_runner_binary(binary: str, *, runner_name: str | None = None) -> str | None:
-    normalized = str(binary or "").strip()
+def _resolve_executable_reference(reference: str) -> str | None:
+    normalized = str(reference or "").strip()
     if not normalized:
         return None
     candidate = Path(normalized).expanduser()
     if candidate.is_absolute() or os.path.sep in normalized or (os.path.altsep and os.path.altsep in normalized):
         return str(candidate) if candidate.exists() else None
+    return shutil.which(normalized)
+def _codex_repo_roots() -> list[Path]:
+    roots: list[Path] = []
+    configured = str(os.environ.get("DEEPSCIENTIST_REPO_ROOT") or "").strip()
+    if configured:
+        roots.append(Path(configured).expanduser().resolve())
+    roots.append(Path(__file__).resolve().parents[2])
+    deduped: list[Path] = []
+    seen: set[str] = set()
+    for root in roots:
+        key = str(root)
+        if key in seen:
+            continue
+        seen.add(key)
+        deduped.append(root)
+    return deduped
-    discovered = shutil.which(normalized)
-    if discovered:
-        return discovered
+def resolve_runner_binary(binary: str, *, runner_name: str | None = None) -> str | None:
+    normalized = str(binary or "").strip()
+    if not normalized:
+        return None
+    resolved_reference = _resolve_executable_reference(normalized)
+    candidate = Path(normalized).expanduser()
+    if candidate.is_absolute() or os.path.sep in normalized or (os.path.altsep and os.path.altsep in normalized):
+        return resolved_reference
     normalized_runner = str(runner_name or candidate.name or normalized).strip().lower()
     if normalized_runner != "codex":
-        return None
+        return resolved_reference
     for env_name in ("DEEPSCIENTIST_CODEX_BINARY", "DS_CODEX_BINARY"):
         override = os.environ.get(env_name)
         if override:
-            override_path = Path(override).expanduser()
-            if override_path.exists():
-                return str(override_path)
+            resolved_override = _resolve_executable_reference(override)
+            if resolved_override:
+                return resolved_override
-    repo_root = Path(__file__).resolve().parents[2]
-    node_bin_root = repo_root / "node_modules" / ".bin"
     names = ["codex.cmd", "codex.exe", "codex"] if sys.platform.startswith("win") else ["codex"]
-    for name in names:
-        package_local = node_bin_root / name
-        if package_local.exists():
-            return str(package_local)
-    return None
+    for root in _codex_repo_roots():
+        node_bin_root = root / "node_modules" / ".bin"
+        for name in names:
+            package_local = node_bin_root / name
+            if package_local.exists():
+                return str(package_local)
+    return resolved_reference

package/src/prompts/connectors/lingzhu.md CHANGED Viewed

@@ -10,3 +10,6 @@
 - lingzhu_progress_rule: for long-running work, your first substantive reply should contain either the direct answer or the first concrete checkpoint, not a duplicate transport acknowledgement
 - lingzhu_safety_rule: request only actions that are clearly justified by the current quest and understandable to the human user
 - lingzhu_text_rule: even when requesting `surface_actions`, always include a clear text explanation of what is happening and why
+- lingzhu_reply_style_rule: for Lingzhu-facing user-visible text sent through `artifact.interact(...)`, keep the message clear, concise, respectful, and high-information-density
+- lingzhu_reply_length_rule: for each Lingzhu-facing `artifact.interact(...)` message, normally answer in at most 2 to 3 sentences unless the user explicitly asks for more detail
+- lingzhu_summary_first_rule: in Lingzhu-facing `artifact.interact(...)` messages, usually give only the synopsis and key facts needed for the user's next decision or understanding; avoid long preambles, repetition, and low-signal detail

package/src/prompts/system.md CHANGED Viewed

@@ -433,12 +433,16 @@ If you must deviate, record the reason in an artifact report or decision.
 - `baselines/local/` (baseline code you maintain)
   - Baseline code that you are actively fixing, reproducing, or extending inside this quest.
+  - Supplementary analysis comparators still live here when they are reproduced inside the quest; do not create a parallel top-level baseline root.
   - Store durable baseline variants here when they must be committed and reviewed.
 - `artifacts/baselines/` (baseline records)
   - Baseline audit notes, metric contracts, reproduction notes, and baseline attachment records.
   - This is metadata and reporting, not the baseline code itself.
+- `release/open_source/` (public-release preparation)
+  - Use this for open-source cleanup manifests, include/exclude lists, and the final public-code pruning checklist after the paper bundle exists.
 - `experiments/main/` (main experiment workspace)
   - Main experiment scripts, configs, and durable outputs tied to the active idea branch.
@@ -891,10 +895,22 @@ Prefer these patterns:
   - do not use `mode='revise'` as the default way to start a new optimization round, even for documentation-only changes
 - use `artifact.record_main_experiment(...)` immediately after a real main experiment finishes on the active idea workspace
   - this call is the normal path to write `RUN.md` and `RESULT.json`
+  - include a compact `evaluation_summary` for every durable main-experiment result with exactly these fields:
+    - `takeaway`
+    - `claim_update`
+    - `baseline_relation`
+    - `comparability`
+    - `failure_mode`
+    - `next_action`
+  - do not omit `evaluation_summary` just because the result is weak, mixed, or not directly comparable
+  - if comparison is invalid or evidence is limited, express that explicitly through `baseline_relation`, `comparability`, and `failure_mode` instead of hiding the uncertainty in prose
+  - write it for a human reader who should understand the run outcome without opening logs, diffs, or file paths
+  - keep `takeaway` to one short sentence, keep `next_action` to one best immediate route, and do not include branch ids, paths, tool traces, or raw metric dumps
   - once a branch has a durable main-experiment result, treat that branch as a fixed historical research node
 - use `artifact.create_analysis_campaign(...)` whenever one or more extra experiments must branch from the current workspace/result node
 - even a single extra experiment should still become a one-slice analysis campaign instead of mutating the completed parent node in place
 - use `artifact.record_analysis_slice(...)` immediately after each analysis slice finishes
+  - include the same six-field `evaluation_summary` so later review, rebuttal, and route selection can read one stable summary instead of re-parsing long prose
 - use `artifact.prepare_branch(...)` only for compatibility or exceptional manual recovery; do not prefer it for the normal idea -> experiment -> analysis flow
 - use `artifact.confirm_baseline(...)` as the canonical baseline-stage gate after the accepted baseline root, variant, and metric contract are clear
 - use `artifact.waive_baseline(...)` only when the quest must explicitly continue without a baseline
@@ -968,7 +984,10 @@ For `artifact.interact(...)` specifically:
 - when requesting user input, include concrete options and an explicit reply format whenever possible
 - for a blocking `artifact.interact(kind='decision_request', ...)`, provide 1 to 3 concrete options, put the recommended option first, and explain each option's actual content, pros, cons, and expected consequence
 - for a blocking `artifact.interact(kind='decision_request', ...)`, state the reply format clearly and normally wait up to 1 day for the user unless the task or user already defined a shorter safe deadline
-- if that blocking decision request times out, choose the best option yourself from the stated options, record the evidence-backed reason, and notify the user of the chosen option before continuing
+- if the blocker is a user-supplied external credential or secret that you cannot safely obtain yourself, such as an API key, GitHub key/token, Hugging Face key/token, or similar account credential, always use `artifact.interact(kind='decision_request', reply_mode='blocking', ...)` to ask the user to provide it or choose an alternative route
+- for that credential-blocked case, do not fabricate placeholder credentials, do not silently skip the blocked step, and do not self-resolve by pretending the credential is optional unless the user explicitly chose an alternative route
+- if such a credential request remains unanswered, keep the quest waiting instead of forcing a route decision; if the runtime or tool loop resumes you without fresh credentials and no other work is possible, you may park with a long low-frequency wait such as `bash_exec(command='sleep 3600', mode='await', timeout_seconds=3700, ...)` rather than busy-looping
+- otherwise, if that blocking decision request times out, choose the best option yourself from the stated options, record the evidence-backed reason, and notify the user of the chosen option before continuing
 - prefer one blocking user request at a time unless true parallel ambiguity is unavoidable
 - if a threaded user reply arrives after a progress update, interpret it relative to that progress thread first before treating it as a new unrelated task
 - after sending a blocking request, treat the next unseen inbound user messages as higher-priority context than stale plan assumptions
@@ -1115,16 +1134,27 @@ For analysis campaigns specifically, the safest default sequence is:
 2. call `artifact.create_analysis_campaign(...)` with the full slice list
 3. move into the returned slice worktrees one by one
 4. emit `progress` during long-running slices
-5. call `artifact.record_analysis_slice(...)` after each slice with setup, execution, results, metrics, and any genuinely useful claim/update fields
+5. call `artifact.record_analysis_slice(...)` after each slice with setup, execution, results, metrics, and a six-field `evaluation_summary`
 6. after the last slice, return automatically to the parent idea branch and continue writing
+When writing `evaluation_summary`, use these semantics:
+- `takeaway`: one-sentence human-readable conclusion, starting with the outcome rather than the procedure
+- `claim_update`: only describe whether the core claim is strengthened, weakened, narrowed, or left neutral
+- `baseline_relation`: compare against the active baseline only when the comparison is methodologically valid; otherwise use `not_comparable`
+- `comparability`: use this as the explicit uncertainty channel when protocol drift, data mismatch, or incomplete runs reduce confidence
+- `failure_mode`: classify the dominant reason for failure or instability instead of reframing failures as support
+- `next_action`: choose one immediate route only; do not turn it into a wishlist
+Before planning further work, first read the most recent `evaluation_summary` blocks from the relevant main experiment and analysis slices; only drop to raw logs or long prose when the short judgment layer is still ambiguous.
 For a normal main experiment specifically, the safest default sequence is:
 1. stay in the active idea worktree returned by `artifact.submit_idea(...)`
 2. implement and run there
 3. verify that the metric keys still match the active baseline contract
-4. write the human-readable run log and structured result through `artifact.record_main_experiment(...)`
-5. use the returned baseline comparison and breakthrough signal before deciding whether to continue, launch analysis, or write
+4. write the human-readable run log and structured result through `artifact.record_main_experiment(...)`, including a six-field `evaluation_summary`
+5. use the returned baseline comparison, breakthrough signal, and `evaluation_summary` before deciding whether to continue, launch analysis, or write
 ### Startup-contract delivery mode
@@ -1524,6 +1554,7 @@ First ensure one selected outline exists, then bind the campaign to that outline
 If durable state exposes `active_baseline_metric_contract_json`, read that JSON file before defining slice success criteria or comparison tables.
 By default, use it as the campaign's baseline comparison contract unless a slice is explicitly designed to test a different evaluation contract and that deviation is recorded durably.
+If a slice needs an extra comparator baseline, reproduce or attach it under the normal `baselines/local/` or `baselines/imported/` quest roots, record that requirement in the campaign slice, and later submit the realized comparator through `record_analysis_slice(..., comparison_baselines=[...])` without replacing the canonical baseline gate unless the quest explicitly promotes it.
 Recommended tool discipline:
@@ -1668,7 +1699,7 @@ Before finalizing:
 - re-check the latest decisions, reports, and package inventory
 - re-check writing review / proofing / submission outputs when a paper bundle exists
-- when a paper bundle exists or should exist, verify `paper/paper_bundle_manifest.json` and its referenced `outline_path`, `draft_path`, `writing_plan_path`, `references_path`, `claim_evidence_map_path`, `compile_report_path`, `pdf_path`, and `latex_root_path`
+- when a paper bundle exists or should exist, verify `paper/paper_bundle_manifest.json` and its referenced `outline_path`, `draft_path`, `writing_plan_path`, `references_path`, `claim_evidence_map_path`, `baseline_inventory_path`, `compile_report_path`, `pdf_path`, `latex_root_path`, and any `open_source_manifest_path`
 - classify major claims as supported, partial, unsupported, or deferred
 - preserve important failures and downgrade history instead of hiding them
@@ -1762,6 +1793,7 @@ When summarizing long logs, campaigns, or multi-agent work:
 - Any shell-like command execution must go through `bash_exec`; this includes `curl`, `python`, `python3`, `bash`, `sh`, `node`, package managers, and similar CLI tools.
 - Do not execute shell commands through any non-`bash_exec` path.
 - Use `bash_exec(mode='detach', ...)` for long-running work, `bash_exec(mode='await', ...)` for bounded blocking checks, `bash_exec(mode='read', id=...)` to inspect saved logs, `bash_exec(mode='list')` to inspect active and finished sessions, and `bash_exec(mode='kill', id=...)` to stop a managed command.
+- Before using a bounded wait such as `bash_exec(mode='await', ...)`, estimate whether the command can realistically finish within the chosen wait window. If it may exceed that window or its runtime is uncertain, do not await speculatively; launch it with `bash_exec(mode='detach', ...)` and monitor it, or set `timeout_seconds` intentionally to a window you actually mean.
 - For important MCP calls, especially long-running `bash_exec`, include a structured `comment` that briefly states what you are doing, why now, and the next check or next action.
 - For a command that is likely to run for a long time, do not launch it and disappear. After `bash_exec(mode='detach', ...)`, keep monitoring it in the same turn through an explicit wait-and-check loop.
 - The default long-run monitoring cadence is:
@@ -1771,6 +1803,7 @@ When summarizing long logs, campaigns, or multi-agent work:
   - sleep about `600s`, then inspect again
   - sleep about `1800s`, then inspect again
   - if the run is still active, continue checking about every `1800s`
+- If the only blocker is a missing user-supplied external credential that has already been requested through a blocking interaction and no other useful work is possible, you may intentionally park with a much longer low-frequency wait such as `bash_exec(command='sleep 3600', mode='await', timeout_seconds=3700, ...)` to avoid busy-looping.
 - If the environment or tool surface makes direct shell waiting awkward, an equivalent bounded wait such as `bash_exec(mode='await', id=..., timeout_seconds=...)` is acceptable, but the behavior must stay the same: wait, inspect real logs, then continue.
 - Never stay silent across multiple sleep windows for an important long-running task.
 - After each sleep/await cycle finishes and you inspect the real logs again, send `artifact.interact(kind='progress', ...)` with:

package/src/skills/analysis-campaign/SKILL.md CHANGED Viewed

@@ -53,7 +53,7 @@ Do not invent a separate experiment system for those cases.
 - If the runtime starts an auto-continue turn with no new user message, resume from the current campaign state and active requirements instead of replaying the previous user turn.
 - Progress message templates are references only. Adapt to the actual context and vary wording so messages feel human, respectful, and non-robotic.
 - Use `reply_mode='blocking'` only for real user decisions that cannot be resolved from local evidence.
-- For any blocking decision request, provide 1 to 3 concrete options, put the recommended option first, explain each option's actual content plus pros and cons, wait up to 1 day when feasible, then choose the best option yourself and notify the user of the chosen option if the timeout expires.
+- For any blocking decision request, provide 1 to 3 concrete options, put the recommended option first, explain each option's actual content plus pros and cons, and wait up to 1 day when feasible. If the blocker is a missing external credential or secret that only the user can provide, keep the quest waiting, ask the user to supply it or choose an alternative, and do not self-resolve; if resumed without that credential and no other work is possible, a long low-frequency wait such as `bash_exec(command='sleep 3600', mode='await', timeout_seconds=3700)` is acceptable. Otherwise choose the best option yourself and notify the user of the chosen option if the timeout expires.
 - If a threaded user reply arrives, interpret it relative to the latest campaign progress update before assuming the task changed completely.
 ## Stage purpose
@@ -129,6 +129,8 @@ A campaign should usually leave behind:
 - a campaign identifier
 - a selected outline reference when the campaign is writing-facing
 - one directory per analysis run
+- any supplementary baseline reproduced for analysis under `baselines/local/<baseline_id>/` or attached under `baselines/imported/<baseline_id>/`
+- one quest-level supplementary baseline inventory at `artifacts/baselines/analysis_inventory.json`
 - one run artifact per analysis slice
 - one outline-bound todo manifest when the campaign is writing-facing
 - an aggregated campaign report
@@ -252,12 +254,21 @@ For each slice, define at minimum:
 - metric or observable
 - stop condition
 - evidence path expectations
+- `required_baselines` when the slice depends on an extra comparator that is not yet available in the quest
 Recommended extra per-slice fields:
 - `slice_id`
 - `run_kind`
 - `slice_class`, such as `auxiliary`, `claim-carrying`, or `supporting`
+- `required_baselines`, where each item records at least `baseline_id` plus the reason, benchmark, and split when known
+If a slice needs an extra comparator baseline:
+- reproduce it under `baselines/local/<baseline_id>/` unless it is attached under `baselines/imported/<baseline_id>/`
+- keep the usual durable baseline notes there, including `analysis_plan.md`, `setup.md`, `execution.md`, and `verification.md`
+- do not overwrite the canonical quest baseline gate just because an analysis slice needed a supplementary baseline
+- after the comparator is ready, record it back through `record_analysis_slice(..., comparison_baselines=[...])` with its `baseline_id`, path, benchmark/split, and metrics summary
 - `parent_run_id`
 - whether a code diff is required
 - whether an isolated branch/worktree is required
@@ -284,6 +295,17 @@ Treat `campaign_id` as system-owned, and treat `slice_id` / `todo_id` as agent-a
 Do not replace the normal campaign flow with repeated manual `artifact.prepare_branch(...)` calls.
 After each slice finishes, call `artifact.record_analysis_slice(...)` immediately so the result is mirrored back to the parent branch and the next slice can be activated.
 For slice recording, `deviations` and `evidence_paths` are optional context fields, not mandatory ceremony; include them only when they materially help explanation or auditability.
+Each `artifact.record_analysis_slice(...)` call should also include an `evaluation_summary` with exactly these six fields:
+- `takeaway`
+- `claim_update`
+- `baseline_relation`
+- `comparability`
+- `failure_mode`
+- `next_action`
+Use those six fields to keep each slice readable at a glance from Canvas, stage tabs, review, and rebuttal.
+The longer prose still matters, but the six-field summary is the stable routing summary.
 For writing-facing campaigns, prefer running `claim-carrying` slices before `supporting` slices unless an auxiliary check is required to make the main slice interpretable.
@@ -473,6 +495,7 @@ Stage-end requirement:
 - if the campaign produced a durable cross-slice lesson, failure pattern, or comparability caveat, write at least one `memory.write(...)` before leaving the stage
 The campaign’s main record belongs in run artifacts and the aggregated report.
+When synthesizing the campaign, read the per-slice `evaluation_summary` fields first, then expand into longer evidence only where the short summaries are still ambiguous.
 ## Artifact rules

package/src/skills/baseline/SKILL.md CHANGED Viewed

@@ -18,7 +18,7 @@ It absorbs the essential old DeepScientist reproducer discipline into one stage
 - Default to plain-language summaries. Do not mention file paths, artifact ids, branch/worktree ids, session ids, raw commands, or raw logs unless the user asks or needs them to act.
 - Message templates are references only. Adapt to the actual context and vary wording so updates feel natural and non-robotic.
 - Use `reply_mode='blocking'` only for real user decisions that cannot be resolved from local evidence.
-- For any blocking decision request, provide 1 to 3 concrete options, put the recommended option first, explain each option's actual content plus pros and cons, wait up to 1 day when feasible, then choose the best option yourself and notify the user of the chosen option if the timeout expires.
+- For any blocking decision request, provide 1 to 3 concrete options, put the recommended option first, explain each option's actual content plus pros and cons, and wait up to 1 day when feasible. If the blocker is a missing external credential or secret that only the user can provide, keep the quest waiting, ask the user to supply it or choose an alternative, and do not self-resolve; if resumed without that credential and no other work is possible, a long low-frequency wait such as `bash_exec(command='sleep 3600', mode='await', timeout_seconds=3700)` is acceptable. Otherwise choose the best option yourself and notify the user of the chosen option if the timeout expires.
 - If a threaded user reply arrives, interpret it relative to the latest baseline progress update before assuming the task changed completely.
 - Prefer `bash_exec` for setup, reproduction, and verification commands so each baseline action keeps a durable quest-local session id and log trail.
 - When the baseline route is durably chosen, confirmed, waived, or blocked with a clear next action, send one richer `artifact.interact(kind='milestone', reply_mode='threaded', ...)` update that says whether the baseline is trusted, blocked, or waived, why that matters, and what the next stage is.
@@ -204,6 +204,12 @@ Global reusable registry paths:
 Do not invent parallel durable locations when these runtime contracts already exist.
 Do not leave the authoritative metric contract only in chat, memory, or prose once the baseline is accepted.
+If a baseline is reproduced only because an analysis campaign needs an extra comparator:
+- still place it under `<quest_root>/baselines/local/<baseline_id>/` or `<quest_root>/baselines/imported/<baseline_id>/`
+- treat it as a supplementary analysis baseline unless the quest explicitly promotes it into the canonical gate
+- do not call `artifact.confirm_baseline(...)` for that supplementary case unless the quest truly intends to replace the canonical baseline
 ## Baseline id and variant rules
 Baseline identity should be stable and path-safe.

package/src/skills/decision/SKILL.md CHANGED Viewed

@@ -19,7 +19,7 @@ Use this skill whenever continuation is non-trivial.
 - If the runtime starts an auto-continue turn with no new user message, continue from the active requirements and durable quest state instead of replaying the previous user turn.
 - If `startup_contract.decision_policy = autonomous`, do not emit ordinary `artifact.interact(kind='decision_request', ...)` calls; decide the route yourself, record the reason, and continue.
 - Use `reply_mode='blocking'` for the actual decision request only when the user must choose before safe continuation and the quest contract still allows a user-gated decision.
-- For any blocking decision request, provide 1 to 3 concrete options, put the recommended option first, explain each option's actual content plus pros and cons, wait up to 1 day when feasible, then choose the best option yourself and notify the user of the chosen option if the timeout expires.
+- For any blocking decision request, provide 1 to 3 concrete options, put the recommended option first, explain each option's actual content plus pros and cons, and wait up to 1 day when feasible. If the blocker is a missing external credential or secret that only the user can provide, keep the quest waiting, ask the user to supply it or choose an alternative, and do not self-resolve; if the quest is resumed without that credential and no other work is possible, a long low-frequency wait such as `bash_exec(command='sleep 3600', mode='await', timeout_seconds=3700)` is acceptable. For any other blocker, if the timeout expires without a user reply, choose the best option yourself and notify the user of the chosen option.
 - If a threaded user reply arrives, interpret it relative to the latest decision or progress interaction before assuming the task changed completely.
 - Quest completion is a special terminal decision: first ask for explicit completion approval with `artifact.interact(kind='decision_request', reply_mode='blocking', reply_schema={'decision_type': 'quest_completion_approval'}, ...)`, and only after an explicit approval reply should you call `artifact.complete_quest(...)`.
@@ -319,7 +319,7 @@ When asking, use a structured decision request with:
 - tradeoffs, including the main pros and cons for each option
 - recommended option first
 - explicit reply format
-- a stated timeout window; normally wait up to 1 day before self-resolving if no user reply arrives
+- a stated timeout window; normally wait up to 1 day before self-resolving if no user reply arrives, but never self-resolve when the only blocker is a missing external credential or secret that only the user can provide
 ### 6. Record the decision durably
@@ -327,6 +327,7 @@ Use `artifact.record(kind='decision', ...)` for the final decision.
 If user input is needed, also use `artifact.interact(kind='decision_request', ...)`.
 If the timeout expires without a user reply, choose the best option yourself, record why, and notify the user of the chosen option before moving on.
+This timeout fallback does not apply when the only blocker is a missing external credential or secret that only the user can provide; in that case keep the interaction waiting and, if the quest is resumed without the credential, you may park with `bash_exec(command='sleep 3600', mode='await', timeout_seconds=3700)` instead of busy-looping.
 If `startup_contract.decision_policy = autonomous`, ordinary route ambiguity is not by itself grounds to request user input.
 In that mode, only explicit approval-style exceptions such as quest completion should normally become blocking user decisions.

package/src/skills/experiment/SKILL.md CHANGED Viewed

@@ -43,7 +43,7 @@ Use this skill for the main evidence-producing runs of the quest.
 - If the runtime starts an auto-continue turn with no new user message, continue from the current run state, logs, artifacts, and active requirements instead of replaying the previous user turn.
 - Progress message templates are references only. Adapt to the actual context and vary wording so messages feel human, respectful, and non-robotic.
 - Use `reply_mode='blocking'` only for real user decisions that cannot be resolved from local evidence.
-- For any blocking decision request, provide 1 to 3 concrete options, put the recommended option first, explain each option's actual content plus pros and cons, wait up to 1 day when feasible, then choose the best option yourself and notify the user of the chosen option if the timeout expires.
+- For any blocking decision request, provide 1 to 3 concrete options, put the recommended option first, explain each option's actual content plus pros and cons, and wait up to 1 day when feasible. If the blocker is a missing external credential or secret that only the user can provide, keep the quest waiting, ask the user to supply it or choose an alternative, and do not self-resolve; if the quest is resumed without that credential and no other work is possible, a long low-frequency wait such as `bash_exec(command='sleep 3600', mode='await', timeout_seconds=3700)` is acceptable. For any other blocker, if the timeout expires without a user reply, choose the best option yourself and notify the user of the chosen option.
 - If a threaded user reply arrives, interpret it relative to the latest experiment progress update before assuming the task changed completely.
 - Prefer `bash_exec` for experiment commands so each run gets a durable session id, quest-local log folder, and later `read/list/kill` control.
@@ -466,6 +466,22 @@ That call is responsible for writing:
 - evidence paths
 - changed files
 - relevant config paths when applicable
+- `evaluation_summary` with exactly these six fields:
+  - `takeaway`
+  - `claim_update`
+  - `baseline_relation`
+  - `comparability`
+  - `failure_mode`
+  - `next_action`
+Use `evaluation_summary` as the short structured judgment layer on top of the longer narrative fields:
+- `takeaway`: one sentence the next reader can reuse directly
+- `claim_update`: `strengthens`, `weakens`, `narrows`, or `neutral`
+- `baseline_relation`: `better`, `worse`, `mixed`, or `not_comparable`
+- `comparability`: `high`, `medium`, or `low`
+- `failure_mode`: `none`, `implementation`, `evaluation`, `environment`, or `direction`
+- `next_action`: the immediate route such as `continue`, `revise_idea`, `analysis_campaign`, `write`, or `stop`
 After `artifact.record_main_experiment(...)` succeeds, do not assume the same branch should absorb the next round by default.
 Interpret the measured result first, then either:

package/src/skills/finalize/SKILL.md CHANGED Viewed

@@ -17,7 +17,7 @@ Use this skill to close or pause a quest responsibly.
 - Default to plain-language summaries. Do not mention file paths, artifact ids, branch/worktree ids, session ids, raw commands, or raw logs unless the user asks or needs them to act.
 - If the runtime starts an auto-continue turn with no new user message, keep finalizing from the durable quest state and active requirements instead of replaying the previous user turn.
 - Use `reply_mode='blocking'` only for real user decisions that cannot be resolved from local evidence.
-- For any blocking decision request, provide 1 to 3 concrete options, put the recommended option first, explain each option's actual content plus pros and cons, wait up to 1 day when feasible, then choose the best option yourself and notify the user of the chosen option if the timeout expires.
+- For any blocking decision request, provide 1 to 3 concrete options, put the recommended option first, explain each option's actual content plus pros and cons, and wait up to 1 day when feasible. If the blocker is a missing external credential or secret that only the user can provide, keep the quest waiting, ask the user to supply it or choose an alternative, and do not self-resolve; if the quest is resumed without that credential and no other work is possible, a long low-frequency wait such as `bash_exec(command='sleep 3600', mode='await', timeout_seconds=3700)` is acceptable. For any other blocker, if the timeout expires without a user reply, choose the best option yourself and notify the user of the chosen option.
 - If a threaded user reply arrives, interpret it relative to the latest finalize progress update before assuming the task changed completely.
 - When finalize reaches a real closure state, pause-ready packet, or route-back decision, send one threaded `artifact.interact(kind='milestone', ...)` update that names the recommendation, why it is the right call, and any reopen condition that still matters.
 - True quest completion still requires explicit user approval through the runtime completion flow before calling `artifact.complete_quest(...)`.
@@ -124,9 +124,12 @@ When a paper bundle exists, verify the manifest inventory explicitly, including:
 - referenced `writing_plan_path`
 - referenced `references_path`
 - referenced `claim_evidence_map_path`
+- referenced `baseline_inventory_path`
 - referenced `compile_report_path`
 - referenced `pdf_path`
 - referenced `latex_root_path`
+- `release/open_source/manifest.json` when open-source preparation has started
+- `release/open_source/cleanup_plan.md` when the paper line is being prepared for a public code release
 ### 2. Build the final claim ledger

package/src/skills/idea/SKILL.md CHANGED Viewed

@@ -21,7 +21,7 @@ Use this skill to turn the current baseline and problem frame into concrete, lit
 - If the runtime starts an auto-continue turn with no new user message, keep advancing from the active requirements and current durable state instead of re-answering the previous user turn.
 - Message templates are references only. Adapt to the actual context and vary wording so updates feel natural and non-robotic.
 - Use `reply_mode='blocking'` only for real user decisions that cannot be resolved from local evidence.
-- For any blocking decision request, provide 1 to 3 concrete options, put the recommended option first, explain each option's actual content plus pros and cons, wait up to 1 day when feasible, then choose the best option yourself and notify the user of the chosen option if the timeout expires.
+- For any blocking decision request, provide 1 to 3 concrete options, put the recommended option first, explain each option's actual content plus pros and cons, and wait up to 1 day when feasible. If the blocker is a missing external credential or secret that only the user can provide, keep the quest waiting, ask the user to supply it or choose an alternative, and do not self-resolve; if the quest is resumed without that credential and no other work is possible, a long low-frequency wait such as `bash_exec(command='sleep 3600', mode='await', timeout_seconds=3700)` is acceptable. For any other blocker, if the timeout expires without a user reply, choose the best option yourself and notify the user of the chosen option.
 - If a threaded user reply arrives, interpret it relative to the latest idea progress update before assuming the task changed completely.
 ## Stage purpose

package/src/skills/intake-audit/SKILL.md CHANGED Viewed

@@ -17,7 +17,7 @@ Use this skill when the quest already has meaningful state and the first job is
 - Default to plain-language summaries. Do not mention file paths, artifact ids, branch/worktree ids, session ids, raw commands, or raw logs unless the user asks or needs them to act.
 - Message templates are references only. Adapt to the actual context and vary wording so updates feel natural and non-robotic.
 - Use `reply_mode='blocking'` only for real user decisions that cannot be resolved from local evidence.
-- For any blocking decision request, provide 1 to 3 concrete options, put the recommended option first, explain each option's actual content plus pros and cons, wait up to 1 day when feasible, then choose the best option yourself and notify the user of the chosen option if the timeout expires.
+- For any blocking decision request, provide 1 to 3 concrete options, put the recommended option first, explain each option's actual content plus pros and cons, and wait up to 1 day when feasible. If the blocker is a missing external credential or secret that only the user can provide, keep the quest waiting, ask the user to supply it or choose an alternative, and do not self-resolve; if the quest is resumed without that credential and no other work is possible, a long low-frequency wait such as `bash_exec(command='sleep 3600', mode='await', timeout_seconds=3700)` is acceptable. For any other blocker, if the timeout expires without a user reply, choose the best option yourself and notify the user of the chosen option.
 - If a threaded user reply arrives, interpret it relative to the latest intake-audit progress update before assuming the task changed completely.
 - When the audit reaches a durable route recommendation, send one richer `artifact.interact(kind='milestone', reply_mode='threaded', ...)` update that says what state is trusted, what still needs work, and which anchor should run next.

package/src/skills/rebuttal/SKILL.md CHANGED Viewed

@@ -21,7 +21,7 @@ The task is “respond to concrete reviewer pressure with the smallest honest se
 - Default to plain-language summaries. Do not mention file paths, artifact ids, branch/worktree ids, session ids, raw commands, or raw logs unless the user asks or needs them to act.
 - Message templates are references only. Adapt to the actual context and vary wording so updates feel natural and non-robotic.
 - Use `reply_mode='blocking'` only for real user decisions that cannot be resolved from local evidence.
-- For any blocking decision request, provide 1 to 3 concrete options, put the recommended option first, explain each option's actual content plus pros and cons, wait up to 1 day when feasible, then choose the best option yourself and notify the user of the chosen option if the timeout expires.
+- For any blocking decision request, provide 1 to 3 concrete options, put the recommended option first, explain each option's actual content plus pros and cons, and wait up to 1 day when feasible. If the blocker is a missing external credential or secret that only the user can provide, keep the quest waiting, ask the user to supply it or choose an alternative, and do not self-resolve; if the quest is resumed without that credential and no other work is possible, a long low-frequency wait such as `bash_exec(command='sleep 3600', mode='await', timeout_seconds=3700)` is acceptable. For any other blocker, if the timeout expires without a user reply, choose the best option yourself and notify the user of the chosen option.
 - If a threaded user reply arrives, interpret it relative to the latest rebuttal progress update before assuming the task changed completely.
 - When the rebuttal plan, the main supplementary-evidence package, or the final response bundle becomes durable, send one richer `artifact.interact(kind='milestone', reply_mode='threaded', ...)` update that says what reviewer concerns are now addressed, what still remains open, and what happens next.
@@ -87,11 +87,13 @@ Use, in roughly this order:
 - the current paper or draft
 - the selected outline if one exists
 - review comments, meta-review, or editor letter
+- the six-field `evaluation_summary` blocks from recent main experiments and analysis slices
 - recent main and analysis experiment results
 - prior decision and writing memory
 - existing figures, tables, and claim-evidence maps
 If the current paper/result state is still unclear, open `intake-audit` first before continuing the rebuttal workflow.
+Before launching any new supplementary experiment, read those structured `evaluation_summary` blocks first so the rebuttal plan starts from the already-recorded evidence state rather than from raw narrative memory.
 ## Core outputs

package/src/skills/review/SKILL.md CHANGED Viewed

@@ -23,7 +23,7 @@ It is also not the same as `rebuttal`.
 - Keep progress updates chat-like and easy to understand: say what changed, what it means, and what happens next.
 - Default to plain-language summaries. Do not mention file paths, artifact ids, branch/worktree ids, session ids, raw commands, or raw logs unless the user asks or needs them to act.
 - Use `reply_mode='blocking'` only for real user decisions that cannot be resolved from local evidence.
-- For any blocking decision request, provide 1 to 3 concrete options, put the recommended option first, explain each option's actual content plus pros and cons, wait up to 1 day when feasible, then choose the best option yourself and notify the user of the chosen option if the timeout expires.
+- For any blocking decision request, provide 1 to 3 concrete options, put the recommended option first, explain each option's actual content plus pros and cons, and wait up to 1 day when feasible. If the blocker is a missing external credential or secret that only the user can provide, keep the quest waiting, ask the user to supply it or choose an alternative, and do not self-resolve; if the quest is resumed without that credential and no other work is possible, a long low-frequency wait such as `bash_exec(command='sleep 3600', mode='await', timeout_seconds=3700)` is acceptable. For any other blocker, if the timeout expires without a user reply, choose the best option yourself and notify the user of the chosen option.
 - When the review report, revision plan, or follow-up experiment TODO list becomes durable, send a richer `artifact.interact(kind='milestone', reply_mode='threaded', ...)` update that says what the main risks are, what should be fixed next, and whether the next route is writing, experiment, or claim downgrade.
 ## Purpose
@@ -77,12 +77,14 @@ Use, in roughly this order:
 - the current paper or report draft
 - the selected outline if one exists
 - the claim-evidence map if one exists
+- the six-field `evaluation_summary` blocks from recent main experiments and analysis slices
 - recent main and analysis experiment results
 - figures, tables, and captions
 - prior self-review or reviewer-first notes as low-trust auxiliary input
 - nearby papers when novelty or comparison is unclear
 If the draft/result state is still unclear, open `intake-audit` first before continuing the review workflow.
+Before proposing extra experiments, read those structured `evaluation_summary` blocks first so you do not request work that the recorded evidence already resolved.
 ## Core outputs

package/src/skills/scout/SKILL.md CHANGED Viewed

@@ -17,7 +17,7 @@ Use this skill when the quest does not yet have a stable research frame.
 - Default to plain-language summaries. Do not mention file paths, artifact ids, branch/worktree ids, session ids, raw commands, or raw logs unless the user asks or needs them to act.
 - Message templates are references only. Adapt to the actual context and vary wording so updates feel natural and non-robotic.
 - Use `reply_mode='blocking'` only for real user decisions that cannot be resolved from local evidence.
-- For any blocking decision request, provide 1 to 3 concrete options, put the recommended option first, explain each option's actual content plus pros and cons, wait up to 1 day when feasible, then choose the best option yourself and notify the user of the chosen option if the timeout expires.
+- For any blocking decision request, provide 1 to 3 concrete options, put the recommended option first, explain each option's actual content plus pros and cons, and wait up to 1 day when feasible. If the blocker is a missing external credential or secret that only the user can provide, keep the quest waiting, ask the user to supply it or choose an alternative, and do not self-resolve; if the quest is resumed without that credential and no other work is possible, a long low-frequency wait such as `bash_exec(command='sleep 3600', mode='await', timeout_seconds=3700)` is acceptable. For any other blocker, if the timeout expires without a user reply, choose the best option yourself and notify the user of the chosen option.
 - If a threaded user reply arrives, interpret it relative to the latest scout progress update before assuming the task changed completely.
 - When scouting actually resolves the framing ambiguity, locks the evaluation contract, or makes the next anchor obvious, send one richer `artifact.interact(kind='milestone', reply_mode='threaded', ...)` update that says what is now clear, why it matters, and which stage should come next.