npm - @researai/deepscientist - Versions diffs - 1.5.8 → 1.5.11 - Mend

@researai/deepscientist 1.5.8 → 1.5.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (148)

package/src/deepscientist/registries/baseline.py CHANGED Viewed

@@ -22,26 +22,49 @@ class BaselineRegistry:
         self.reconcile_confirmed_quests()
         entry_files = sorted(self.entries_root.glob("*.yaml"))
         if entry_files:
-            return sorted((self._load_entry_file(path) for path in entry_files), key=self._entry_sort_key)
+            return sorted(
+                (
+                    entry
+                    for path in entry_files
+                    for entry in [self._load_entry_file(path)]
+                    if not self._is_deleted_entry(entry)
+                ),
+                key=self._entry_sort_key,
+            )
         latest_by_id: dict[str, dict] = {}
         for item in self._history_entries():
             baseline_id = str(item.get("baseline_id") or item.get("entry_id") or "").strip()
             if baseline_id:
                 latest_by_id[baseline_id] = item
-        return sorted(latest_by_id.values(), key=self._entry_sort_key)
+        return sorted(
+            (item for item in latest_by_id.values() if not self._is_deleted_entry(item)),
+            key=self._entry_sort_key,
+        )
-    def get(self, baseline_id: str) -> dict | None:
+    def get(self, baseline_id: str, *, include_deleted: bool = False) -> dict | None:
         normalized_id = self._normalize_identifier(baseline_id, field_name="Baseline id")
         path = self._entry_path(normalized_id)
         if path.exists():
-            return self._load_entry_file(path)
+            entry = self._load_entry_file(path)
+            if self._is_deleted_entry(entry) and not include_deleted:
+                return None
+            return entry
         latest_match = None
         for item in self._history_entries():
             if item.get("baseline_id") == normalized_id or item.get("entry_id") == normalized_id:
                 latest_match = item
+        if self._is_deleted_entry(latest_match) and not include_deleted:
+            return None
         return latest_match
+    def is_deleted(self, baseline_id: str) -> bool:
+        try:
+            entry = self.get(baseline_id, include_deleted=True)
+        except ValueError:
+            return False
+        return self._is_deleted_entry(entry)
     def publish(self, entry: dict) -> dict:
         timestamp = utc_now()
         baseline_id = self._normalize_identifier(
@@ -201,6 +224,8 @@ class BaselineRegistry:
             }
             existing = self._existing_entry(baseline_id)
+            if self._is_deleted_entry(existing):
+                continue
             if self._entry_needs_publish(existing, entry):
                 synchronized.append(self.publish(entry))
             elif existing:
@@ -244,6 +269,27 @@ class BaselineRegistry:
         write_yaml(attachment_root / "attachment.yaml", attachment)
         return attachment
+    def delete(self, baseline_id: str) -> dict:
+        normalized_id = self._normalize_identifier(baseline_id, field_name="Baseline id")
+        existing = self.get(normalized_id, include_deleted=True) or {}
+        timestamp = utc_now()
+        deleted_entry = {
+            **existing,
+            "registry_kind": "baseline",
+            "schema_version": 1,
+            "entry_id": normalized_id,
+            "baseline_id": normalized_id,
+            "status": "deleted",
+            "updated_at": timestamp,
+            "deleted_at": timestamp,
+            "summary": str(existing.get("summary") or "").strip(),
+        }
+        if not deleted_entry.get("created_at"):
+            deleted_entry["created_at"] = timestamp
+        write_yaml(self._entry_path(normalized_id), deleted_entry)
+        append_jsonl(self.index_path, deleted_entry)
+        return deleted_entry
     def _history_entries(self) -> list[dict]:
         return read_jsonl(self.index_path)
@@ -292,6 +338,12 @@ class BaselineRegistry:
         entry = self._load_entry_file(path)
         return entry if isinstance(entry, dict) and entry else None
+    @staticmethod
+    def _is_deleted_entry(entry: dict[str, Any] | None) -> bool:
+        if not isinstance(entry, dict):
+            return False
+        return str(entry.get("status") or "").strip().lower() == "deleted"
     @staticmethod
     def _entry_needs_publish(existing: dict[str, Any] | None, candidate: dict[str, Any]) -> bool:
         if not existing:

package/src/deepscientist/runners/codex.py CHANGED Viewed

@@ -35,6 +35,17 @@ def _compact_text(value: object, *, limit: int = 1200) -> str:
     return text[: limit - 1].rstrip() + "…"
+def _structured_text(value: object) -> str:
+    if value is None:
+        return ""
+    if isinstance(value, str):
+        return value.strip()
+    try:
+        return json.dumps(value, ensure_ascii=False, indent=2)
+    except TypeError:
+        return str(value)
 def _iter_event_texts(event: dict[str, Any]) -> list[str]:
     texts: list[str] = []
     for key in ("text", "content", "message"):
@@ -184,7 +195,24 @@ def _tool_name(event: dict[str, Any], item: dict[str, Any]) -> str:
     return "tool"
+def _is_bash_exec_item(event: dict[str, Any], item: dict[str, Any]) -> bool:
+    server = str(item.get("server") or event.get("server") or "").strip()
+    tool = str(item.get("tool") or event.get("tool") or "").strip()
+    return server == "bash_exec" and tool == "bash_exec"
 def _tool_args(event: dict[str, Any], item: dict[str, Any]) -> str:
+    if _is_bash_exec_item(event, item):
+        for value in (
+            item.get("arguments"),
+            event.get("arguments"),
+            item.get("input"),
+            event.get("input"),
+        ):
+            text = _structured_text(value)
+            if text:
+                return text
+        return ""
     for value in (
         item.get("command"),
         item.get("query"),
@@ -204,6 +232,21 @@ def _tool_args(event: dict[str, Any], item: dict[str, Any]) -> str:
 def _tool_output(event: dict[str, Any], item: dict[str, Any]) -> str:
+    if _is_bash_exec_item(event, item):
+        for value in (
+            item.get("result"),
+            item.get("output"),
+            item.get("content"),
+            event.get("result"),
+            event.get("output"),
+            event.get("content"),
+            item.get("aggregated_output"),
+            event.get("aggregated_output"),
+        ):
+            text = _structured_text(value)
+            if text:
+                return text
+        return ""
     for value in (
         item.get("aggregated_output"),
         item.get("changes"),
@@ -253,10 +296,12 @@ def _mcp_tool_metadata(
             metadata["workdir"] = arguments.get("workdir")
         if isinstance(arguments.get("mode"), str):
             metadata["mode"] = arguments.get("mode")
-        if isinstance(arguments.get("timeout_seconds"), int):
+        if arguments.get("timeout_seconds") is not None:
             metadata["timeout_seconds"] = arguments.get("timeout_seconds")
         if "comment" in arguments:
             metadata["comment"] = arguments.get("comment")
+        if server == "bash_exec" and tool == "bash_exec" and isinstance(arguments.get("id"), str):
+            metadata["bash_id"] = arguments.get("id")
     metadata["session_id"] = f"quest:{quest_id}"
     metadata["agent_id"] = "pi"
     metadata["agent_instance_id"] = run_id
@@ -266,12 +311,18 @@ def _mcp_tool_metadata(
         for key in (
             "bash_id",
             "status",
+            "command",
+            "workdir",
+            "cwd",
+            "kind",
+            "comment",
             "started_at",
             "finished_at",
             "exit_code",
             "stop_reason",
             "last_progress",
             "log_path",
+            "watchdog_after_seconds",
         ):
             if key in result_payload:
                 metadata[key] = result_payload.get(key)
@@ -758,6 +809,7 @@ class CodexRunner:
         workspace_root = request.worktree_root or request.quest_root
         resolved_binary = resolve_runner_binary(self.binary, runner_name="codex")
         resolved_runner_config = runner_config if isinstance(runner_config, dict) else self._load_runner_config()
+        normalized_model = str(request.model or "").strip()
         command = [
             resolved_binary or self.binary,
             "--search",
@@ -766,9 +818,9 @@ class CodexRunner:
             "--cd",
             str(workspace_root),
             "--skip-git-repo-check",
-            "--model",
-            request.model,
         ]
+        if normalized_model.lower() not in {"", "inherit", "default", "codex-default"}:
+            command.extend(["--model", normalized_model])
         if request.approval_policy:
             command.extend(["-c", f'approval_policy="{request.approval_policy}"'])
         reasoning_effort = request.reasoning_effort

package/src/deepscientist/weixin_support.py ADDED Viewed

@@ -0,0 +1 @@
+from .connector.weixin_support import * # noqa: F401,F403

package/src/prompts/connectors/lingzhu.md CHANGED Viewed

@@ -11,5 +11,7 @@
 - lingzhu_safety_rule: request only actions that are clearly justified by the current quest and understandable to the human user
 - lingzhu_text_rule: even when requesting `surface_actions`, always include a clear text explanation of what is happening and why
 - lingzhu_reply_style_rule: for Lingzhu-facing user-visible text sent through `artifact.interact(...)`, keep the message clear, concise, respectful, and high-information-density
-- lingzhu_reply_length_rule: for each Lingzhu-facing `artifact.interact(...)` message, normally answer in at most 2 to 3 sentences unless the user explicitly asks for more detail
+- lingzhu_reply_length_rule: for each Lingzhu-facing `artifact.interact(...)` message, normally keep the text within about 20 Chinese characters or one very short sentence unless the user explicitly asks for more detail
 - lingzhu_summary_first_rule: in Lingzhu-facing `artifact.interact(...)` messages, usually give only the synopsis and key facts needed for the user's next decision or understanding; avoid long preambles, repetition, and low-signal detail
+- lingzhu_task_gate_rule: only treat a Lingzhu user utterance as a new quest instruction when the text explicitly starts with `我现在的任务是`; otherwise assume the device is polling for queued progress or buffered replies
+- lingzhu_poll_rule: when Lingzhu is polling rather than giving a new task, return only the buffered progress checkpoints or the latest short status; do not reinterpret the poll text as a fresh instruction

package/src/prompts/connectors/weixin.md ADDED Viewed

@@ -0,0 +1,230 @@
+# Weixin Connector Contract
+- connector_contract_id: weixin
+- connector_contract_scope: loaded only when Weixin is the active or bound external connector for this quest
+- connector_contract_goal: use `artifact.interact(...)` as the main durable user-visible thread while respecting the Weixin iLink `context_token` reply model
+- weixin_runtime_ack_rule: the Weixin bridge itself emits the immediate transport-level receipt acknowledgement before the model turn starts
+- weixin_no_duplicate_ack_rule: do not waste your first model response or first `artifact.interact(...)` call on a second bare acknowledgement such as "received", "已收到", or "processing" when the bridge already sent that
+- weixin_reply_style_rule: keep Weixin replies concise, milestone-first, respectful, and easy to scan on a phone
+- weixin_reply_length_rule: for ordinary Weixin progress replies, normally use only 2 to 4 short sentences, or 3 short bullets at most
+- weixin_summary_first_rule: start with the user-facing conclusion, then what it means, then the next action
+- weixin_progress_shape_rule: make the current task, the main difficulty or latest real progress, and the next concrete measure explicit whenever possible
+- weixin_eta_rule: for important long-running phases such as baseline reproduction, main experiments, analysis, or paper packaging, include a rough ETA or next check-in window when you can
+- weixin_tool_call_keepalive_rule: for ordinary active work, prefer one concise Weixin progress update after roughly 10 tool calls when there is already a human-meaningful delta, and do not let work drift beyond roughly 20 tool calls or about 15 minutes without a user-visible checkpoint
+- weixin_internal_detail_rule: omit worker names, retry counters, pending/running/completed counts, low-level file listings, and monitor-window narration unless the user explicitly asked for them or they change the recommended action
+- weixin_translation_rule: translate internal execution and file-management work into user value instead of narrating tool or filesystem churn
+- weixin_preflight_rule: before sending a Weixin-facing progress update, rewrite it if it still reads like a monitor log, execution diary, or file inventory
+- weixin_operator_surface_rule: treat Weixin as an operator surface for concise coordination and milestone delivery, not as a full artifact browser
+- weixin_default_text_rule: plain text is the default and safest Weixin mode
+- weixin_context_token_rule: ordinary downstream replies rely on the runtime-managed `context_token`; do not invent your own reply token fields
+- weixin_media_rule: Weixin supports native image, video, and file delivery through structured attachments; request them through `artifact.interact(..., attachments=[...])` instead of inventing inline tag syntax
+- weixin_media_path_rule: when sending native Weixin media, prefer absolute local paths; remote URLs are allowed only when the bridge can download them safely
+- weixin_media_path_priority_rule: prefer quest-local files under `artifacts/`, `experiments/`, `paper/`, or `userfiles/` over arbitrary external URLs
+- weixin_media_hint_rule: when you need native Weixin media typing, set `connector_delivery={'weixin': {'media_kind': ...}}` on the attachment instead of relying only on filename suffixes
+- weixin_inbound_media_rule: inbound image, video, and file messages can now enter the quest as attachments, including media-only inbound turns
+- weixin_inbound_materialization_rule: inbound media is copied into quest-local `userfiles/weixin/...`; if the user sent media, read those quest-local files before continuing
+- weixin_audio_output_rule: there is no native Weixin voice-message output branch; audio files fall back to ordinary file delivery, not Weixin voice messages
+- weixin_partial_delivery_rule: the runtime now preflights native attachments before send and prefers a single combined Weixin message for text plus media, so do not assume text was already delivered if attachment preparation failed
+- weixin_failure_rule: if `artifact.interact(...)` returns `attachment_issues` or `delivery_results` errors, treat that as a real delivery failure and adapt before assuming the user received the media
+- weixin_first_followup_rule: after a new inbound Weixin message, your first substantive follow-up should either answer directly or give the first meaningful checkpoint and next action, not a second bare acknowledgement
+## Weixin Runtime Capabilities
+- always supported:
+  - concise plain-text Weixin replies through `artifact.interact(...)`
+  - ordinary threaded continuity through runtime-managed `context_token`
+  - automatic downstream reply-to-user behavior when a valid `context_token` has been seen for that user
+  - inbound text messages entering the quest as user turns
+  - inbound image, video, and file attachments being materialized into quest-local `userfiles/weixin/...`
+- supported when you attach one structured attachment with explicit delivery hints:
+  - native Weixin image delivery
+  - native Weixin video delivery
+  - native Weixin file delivery
+- do not assume:
+  - inline connector-specific tags in the message body
+  - arbitrary historical quote reconstruction beyond the active `context_token`
+  - device-side `surface_actions`
+  - native Weixin voice-message output
+## Structured Usage Rules
+- request native Weixin image delivery by attaching one structured attachment with:
+  - `connector_delivery={'weixin': {'media_kind': 'image'}}`
+- request native Weixin video delivery by attaching one structured attachment with:
+  - `connector_delivery={'weixin': {'media_kind': 'video'}}`
+- request native Weixin file delivery by attaching one structured attachment with:
+  - `connector_delivery={'weixin': {'media_kind': 'file'}}`
+- when you want native Weixin media delivery, make sure the attachment exposes at least one usable file reference such as:
+  - `path`
+  - `source_path`
+  - `output_path`
+  - `artifact_path`
+  - `url`
+- if no native media delivery is needed, omit `connector_delivery`
+- do not attach many files to Weixin by default; choose only the one highest-value image, video, or file for that milestone
+- if native delivery fails, fall back to a concise text update unless the missing media is essential
+- if the user sent media into Weixin, prefer the quest-local copied attachment path over connector cache or remote URL
+## Examples
+### 0. Bad vs good Weixin progress update
+Bad:
+```text
+我刚看完新的一轮监控窗，现在还是 12 pending / 3 running / 1 completed。retry 计数已经到第 4 次，workspace 里又多了几个 png 和 json。我接下来继续盯日志和文件变动，之后再看看是不是还要再补一轮。
+```
+Why bad:
+- it forces the user to infer the real conclusion from internal telemetry
+- it exposes retry counters, queue numbers, and file churn that usually do not help a phone-side operator
+- it reads like a monitor log, not a concise collaborator update
+Good:
+```text
+主实验还在继续推进，当前不需要您额外处理。最新进展是核心结果已经基本稳定，但还有一条对照线比较慢。接下来我会补完这条对照，预计 20 分钟左右给您下一次关键更新。
+```
+Why good:
+- it starts with the conclusion the user actually needs
+- it keeps the meaningful risk but removes low-level runtime chatter
+- it tells the user what happens next and when to expect the next checkpoint
+### 1. Plain-text Weixin progress update
+```python
+artifact.interact(
+    kind="progress",
+    message="主实验第一轮已经跑完，当前结果基本稳定。接下来我会继续补关键对照，确认这个提升是不是稳得住。预计下一次关键更新在 20 分钟左右。",
+    reply_mode="threaded",
+)
+```
+### 2. Continue the current Weixin thread normally
+Use the normal `artifact.interact(...)` call. The runtime keeps continuity through the latest `context_token` for that Weixin user.
+```python
+artifact.interact(
+    kind="progress",
+    message="我已经看完您刚才发来的材料，也确认了它和当前 baseline 的关键差异。接下来我会把真正影响路线判断的部分整理出来，再给您一个更完整的结论。",
+    reply_mode="threaded",
+)
+```
+### 3. Send one native Weixin image
+```python
+artifact.interact(
+    kind="milestone",
+    message="主实验已经完成。我发一张汇总图给您，方便直接在手机上看。",
+    reply_mode="threaded",
+    attachments=[
+        {
+            "kind": "path",
+            "path": "/absolute/path/to/main_summary.png",
+            "label": "main-summary",
+            "content_type": "image/png",
+            "connector_delivery": {"weixin": {"media_kind": "image"}},
+        }
+    ],
+)
+```
+### 4. Send one native Weixin video
+```python
+artifact.interact(
+    kind="milestone",
+    message="我把这段关键演示视频一起发给您。",
+    reply_mode="threaded",
+    attachments=[
+        {
+            "kind": "path",
+            "path": "/absolute/path/to/demo.mp4",
+            "label": "demo-video",
+            "content_type": "video/mp4",
+            "connector_delivery": {"weixin": {"media_kind": "video"}},
+        }
+    ],
+)
+```
+### 5. Send one native Weixin file
+```python
+artifact.interact(
+    kind="milestone",
+    message="论文初稿已经整理完成，我把 PDF 一并发给您。",
+    reply_mode="threaded",
+    attachments=[
+        {
+            "kind": "path",
+            "path": "/absolute/path/to/paper_draft.pdf",
+            "label": "paper-draft",
+            "content_type": "application/pdf",
+            "connector_delivery": {"weixin": {"media_kind": "file"}},
+        }
+    ],
+)
+```
+### 6. Send a native Weixin image from an artifact-style path field
+If the attachment is not using `path` but does expose a real quest-local file through `source_path`, `output_path`, or `artifact_path`, the runtime can still use it for native Weixin media delivery.
+```python
+artifact.interact(
+    kind="milestone",
+    message="我把这张结果图直接发给您。",
+    reply_mode="threaded",
+    attachments=[
+        {
+            "kind": "runner_result",
+            "source_path": "/absolute/path/to/result.png",
+            "content_type": "image/png",
+            "connector_delivery": {"weixin": {"media_kind": "image"}},
+        }
+    ],
+)
+```
+### 7. If the user sent Weixin media into the quest
+- inspect the current turn attachments
+- prefer the copied quest-local file under `userfiles/weixin/...`
+- reason over that local file instead of asking the user to resend unless the attachment is broken
+### 8. If delivery fails
+- inspect `attachment_issues`
+- inspect `delivery_results`
+- if native media failed, send a concise text-only fallback unless the missing media is essential
+Example fallback shape:
+```python
+result = artifact.interact(
+    kind="milestone",
+    message="我把汇总图发给您。",
+    reply_mode="threaded",
+    attachments=[
+        {
+            "kind": "path",
+            "path": "/absolute/path/to/main_summary.png",
+            "content_type": "image/png",
+            "connector_delivery": {"weixin": {"media_kind": "image"}},
+        }
+    ],
+)
+if result.get("attachment_issues") or any(not item.get("ok") for item in (result.get("delivery_results") or [])):
+    artifact.interact(
+        kind="progress",
+        message="图片这次没有成功送达。我先继续用文字给您同步结论，稍后再补发可用版本。",
+        reply_mode="threaded",
+    )
+```

package/src/prompts/system.md CHANGED Viewed

@@ -978,12 +978,15 @@ Prefer these patterns:
   - treat the resulting branch as one durable research round or route, not merely a temporary Git container
   - every accepted durable idea submission should normally create a new user-visible canvas node
   - before accepting an idea, unless strong durable evidence already narrows the route to one obvious serious option, run one bounded divergent -> convergent ideation pass instead of collapsing onto the first plausible route
+  - before writing or submitting the final selected idea, durably map at least 5 and usually 5 to 10 related and usable papers; prioritize direct task-modeling or mechanism-neighbor papers and only backfill with the closest adjacent translatable work when the direct pool is truly smaller
   - classify the current framing as `problem-first` or `solution-first`
   - generate a small but genuinely diverse candidate slate before ranking, then shrink it back to a serious frontier that is usually 2 to 3 alternatives and at most 5
   - if the candidates are all from the same mechanism family, widen once with distinct lenses such as abstraction ladder, tension hunting, analogy transfer, inversion, or adjacent-possible reasoning
   - require each serious candidate to answer `why now` / `what changed`
   - before `artifact.submit_idea(...)`, make the winner pass a two-sentence pitch and strongest-objection check
   - before calling it, first finish a concise but durable idea draft in Markdown that explains the route clearly enough for later implementation and review
+  - do not treat the literature floor as optional; if fewer than 5 usable papers are durably mapped, go back to search or record a blocked state instead of forcing the idea through
+  - that final idea draft must use one consistent standard citation format and include a `References` or `Bibliography` section for the survey-stage papers that actually shaped the idea
   - when available, pass that draft through `draft_markdown` so the branch keeps both a compact `idea.md` contract and a richer `draft.md`
   - `continue_line` means the new idea is a child of the current active branch
   - `branch_alternative` means the new idea is a sibling-like branch that starts from the current branch's parent foundation
@@ -1042,6 +1045,8 @@ Prefer these patterns:
 - use `artifact.checkpoint(...)` for meaningful code-state milestones
 - use `artifact.render_git_graph(...)` when the quest needs a refreshed Git history view
 - use `artifact.arxiv(paper_id=..., full_text=False)` to read an already identified arXiv paper
+- `artifact.arxiv(mode='read', paper_id=..., full_text=False)` is the preferred explicit form; it is local-first and will auto-persist the paper into the quest arXiv library when missing
+- use `artifact.arxiv(mode='list')` when you need to inspect the arXiv papers already saved for the current quest
 - keep paper discovery in web search; switch to `artifact.arxiv(..., full_text=True)` only when the full paper body is actually needed
 - use stage-significant artifact writes for progress, milestone, report, run, and decision updates
 - if the runtime exposes `artifact.interact(...)`, use it for structured progress updates, decision requests, and approval responses
@@ -1626,6 +1631,9 @@ If you choose a non-default foundation, record why.
 At the start of `idea`, if related-work coverage or novelty judgment is not already durable and explicit, also open `scout/SKILL.md` as a companion skill before final selection.
 At the start of a fresh or resumed `idea` pass, search quest/global memory first.
 If coverage is still incomplete or stale, actively use the runner's web/search tool for discovery and `artifact.arxiv(...)` for reading shortlisted arXiv papers before selecting a direction.
+Treat literature grounding as a hard gate: do not write or submit a final selected idea until the durable survey covers at least 5 and usually 5 to 10 related and usable papers.
+Those papers should be close enough to the task-modeling problem, failure mode, mechanism, or codebase translation question to justify the selected route with real evidence rather than intuition alone.
+If the direct neighborhood is genuinely smaller, document that shortage explicitly and use the closest adjacent translatable papers to finish the grounding.
 Expected outcomes:
@@ -1640,6 +1648,7 @@ Expected outcomes:
 - explicit mechanism and risk
 - cheapest falsification path
 - selected direction or rejection decision
+- a final idea draft that uses standard-format citations and a `References` or `Bibliography` section for the papers actually used
 - when the pass is substantial, a research-outline style note can be preferable to loose ideation prose; that note should usually cover:
   - executive summary
   - codebase analysis

package/src/skills/idea/SKILL.md CHANGED Viewed

@@ -103,6 +103,9 @@ Break ties primarily through careful reasoning over:
 - Do not select an idea before checking whether close prior work already did it.
 - Do not confuse "I can implement this" with "this is a publishable or useful research direction".
 - Do not treat a weak literature search as sufficient because the idea sounds elegant.
+- Do not write, promote, or submit a final idea until the durable survey covers at least `5` and usually `5-10` task-modeling-related, mechanism-relevant, or otherwise directly usable papers.
+- Treat that literature floor as a hard gate, not a suggestion.
+  If the direct task-modeling neighborhood truly contains fewer than `5` usable papers, record that evidence explicitly and fill the remaining slots with the closest adjacent papers whose mechanism can be translated into the current task and codebase.
 - Every fresh idea build or idea-refinement pass must begin with:
   - a memory sweep, and
   - an external literature sweep.
@@ -206,6 +209,8 @@ Before you choose a direction, perform a broad but bounded literature sweep.
 The sweep must be grounded in actual retrieval, not recall alone.
 If durable quest memory already contains a recent and explicit survey, reuse it first and search externally only for the missing buckets, newer papers, or unresolved overlaps.
+For a normal selected-idea decision, the durable sweep must end with at least `5` and usually `5-10` papers that are close enough to the task-modeling problem, failure mode, mechanism, or codebase translation question to inform the actual design.
+This floor exists to prevent thin novelty claims and under-motivated ideas, not to reward quota chasing.
 When tools allow it, combine:
@@ -240,6 +245,8 @@ For each promising idea, you must be able to answer:
 The goal is not to cite everything on Earth.
 The goal is to avoid fake novelty and to identify a direction that has credible research value.
+However, do not stop the sweep early once the first plausible argument appears.
+Keep going until the strongest obvious overlaps are mapped and the `5-10` usable-paper floor is durably satisfied.
 Recommended search outputs:
@@ -962,9 +969,15 @@ At minimum, preserve:
 - a `why now` statement
 - the code-level plan and minimal experiment
 - the literature relation and evidence pointers
+- inline citations or citation markers tied to the papers actually used in the idea rationale
+- a `References` or `Bibliography` section in a standard citation format
 - the strongest alternative hypothesis
 - the strongest likely objection
+The selected idea draft must cite the survey papers that actually shaped the mechanism, motivation, novelty check, or claim boundary.
+Use one consistent standard citation format throughout the draft, such as numbered references or author-year style.
+Do not mention paper titles casually in prose without giving them a proper citation entry.
 ## Idea quality rules
 Good ideas should be:
@@ -1135,6 +1148,7 @@ Preferred artifact choices:
 If the idea is selected and becomes the active route, immediately call `artifact.submit_idea(mode='create', lineage_intent='continue_line'|'branch_alternative', ...)`.
 Before that call, first finalize a concise but durable Markdown draft for the chosen route.
+Do not start writing that final draft until the literature survey has already met the hard minimum of at least `5` and usually `5-10` usable papers.
 That draft should usually cover:
 - executive summary
@@ -1148,9 +1162,11 @@ That draft should usually cover:
 - code-level change plan
 - evaluation or falsification plan
 - risks, caveats, and implementation notes
+- a citation-ready `References` or `Bibliography` section that lists the survey-stage papers actually used by the idea in a standard citation format
 Use the draft to think clearly first, then compress the accepted contract into the structured `artifact.submit_idea(...)` fields.
 When the MCP surface supports it, pass the final Markdown draft through `draft_markdown` so the branch records both `idea.md` and `draft.md`.
+Ensure the final draft carries appropriate citations for the closest prior work, direct inspirations, and any cross-domain papers that materially shaped the selected idea.
 Normal durable idea flow should create a new branch and a new canvas node every time an accepted idea package changes meaningfully, including documentation-only idea-package changes.
 Use `lineage_intent='continue_line'` when the new idea is a child of the current active branch.
 Use `lineage_intent='branch_alternative'` when the new idea should branch from the current branch's parent foundation as a sibling-like alternative.

package/src/skills/idea/references/literature-survey-template.md CHANGED Viewed

@@ -11,6 +11,11 @@ The purpose is to make related-work coverage durable, searchable, and reusable s
 - baseline id or method name
 - task / dataset / metric contract
 - current investigation target
+- survey minimum gate status
+  - related and usable papers found so far
+  - how many are direct task-modeling papers
+  - how many are adjacent but translatable papers
+  - whether the hard floor of at least `5` and usually `5-10` usable papers has been satisfied
 - why the survey is being run now
   - first idea build
   - idea refinement
@@ -66,9 +71,11 @@ For each paper, include:
 - year
 - identifier or arXiv id
 - URL
+- standard citation string or citation key
 - short mechanism summary
 - task / dataset / metric overlap
 - what it means for the current idea
+- whether it is directly usable for the current idea, only a novelty check, or only an adjacent inspiration
 - status:
   - `new_this_pass`
   - `known_before`
@@ -80,6 +87,7 @@ Recommended columns:
 - identifier
 - year
+- standard citation key
 - mechanism overlap
 - task overlap
 - dataset overlap
@@ -129,3 +137,19 @@ Close with:
 - the rejected ideas and why
 - what still needs more search before selection
 - whether the stage is ready for `idea` selection, more `scout`, or a user decision
+## 10. Citation-ready shortlist for the selected idea
+Before the final idea draft is written, extract the papers that materially support the winning idea.
+For each such paper, include:
+- standard citation entry in the format you plan to use later
+- what part of the idea it supports:
+  - problem motivation
+  - closest prior work
+  - mechanism inspiration
+  - claim boundary
+- whether it must appear inline in the idea draft or only in the references section
+The final selected idea should not be written or submitted until this shortlist is ready.

package/src/skills/idea/references/related-work-playbook.md CHANGED Viewed

@@ -52,6 +52,9 @@ Try to cover these buckets before final selection:
 - papers focused on the same failure mode
 - papers with the same task but different mechanism families
+For a normal selected-idea decision, the survey should durably cover at least `5` and usually `5-10` related and usable papers.
+Prefer direct task-modeling papers first; if that pool is truly small, fill the rest with the closest adjacent and translatable work instead of pretending the literature is empty.
 If the area is active, recent work matters a lot.
 If the area is stable, seminal work may matter more than recency.
@@ -132,4 +135,5 @@ The related-work search is good enough to stop when:
 - the strongest obvious nearby papers are mapped
 - the closest-prior-work table is complete enough to compare seriously
 - each top candidate has an explicit novelty or value verdict
+- the usable-paper floor for the selected idea has been satisfied or the shortage is explicitly documented
 - the remaining uncertainty is recorded rather than hidden