npm - @researai/deepscientist - Versions diffs - 1.5.9 → 1.5.12 - Mend

@researai/deepscientist 1.5.9 → 1.5.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (165) hide show

package/src/deepscientist/arxiv_library.py ADDED Viewed

@@ -0,0 +1,275 @@
+from __future__ import annotations
+import threading
+from pathlib import Path
+from typing import Any
+from urllib.request import Request, urlopen
+from .artifact.arxiv import USER_AGENT, normalize_arxiv_id
+from .shared import ensure_dir, read_json, utc_now, write_json
+class ArxivLibraryService:
+    """Maintain a per-quest arXiv library: a JSON manifest plus downloaded PDFs.
+
+    The manifest is stored at ``<quest_root>/literature/arxiv/index.json`` and
+    PDFs under ``<quest_root>/literature/arxiv/pdfs/``. The read-modify-write
+    cycle in :meth:`upsert_item` is serialized by ``_manifest_lock``; PDF
+    downloads run on daemon threads and are deduplicated per
+    (quest root, arXiv id) through ``_inflight_downloads``.
+    """
+    _SCHEMA_VERSION = 2
+    def __init__(self) -> None:
+        # Guards the load -> modify -> save cycle in upsert_item.
+        self._manifest_lock = threading.Lock()
+        # Guards _inflight_downloads.
+        self._download_lock = threading.Lock()
+        # (resolved quest root, arXiv id) pairs that currently have a worker thread.
+        self._inflight_downloads: set[tuple[str, str]] = set()
+    @staticmethod
+    def _root(quest_root: Path) -> Path:
+        """Return the library directory inside ``quest_root``."""
+        return quest_root / "literature" / "arxiv"
+    @classmethod
+    def _index_path(cls, quest_root: Path) -> Path:
+        """Return the path of the manifest file."""
+        return cls._root(quest_root) / "index.json"
+    @classmethod
+    def _pdf_dir(cls, quest_root: Path) -> Path:
+        """Return the directory that holds downloaded PDFs."""
+        return cls._root(quest_root) / "pdfs"
+    @staticmethod
+    def _pdf_file_name(arxiv_id: str) -> str:
+        """Return the PDF file name used for ``arxiv_id``."""
+        return f"{arxiv_id}.pdf"
+    @classmethod
+    def pdf_relative_path(cls, arxiv_id: str) -> str:
+        """Return the quest-relative POSIX path of the PDF for ``arxiv_id``."""
+        return f"literature/arxiv/pdfs/{cls._pdf_file_name(arxiv_id)}"
+    @classmethod
+    def pdf_path(cls, quest_root: Path, arxiv_id: str) -> Path:
+        """Return the filesystem path of the PDF for ``arxiv_id``."""
+        return cls._pdf_dir(quest_root) / cls._pdf_file_name(arxiv_id)
+    @classmethod
+    def _empty_payload(cls) -> dict[str, Any]:
+        """Return a fresh manifest with no items."""
+        return {
+            "schema_version": cls._SCHEMA_VERSION,
+            "updated_at": utc_now(),
+            "items": [],
+        }
+    def load_manifest(self, quest_root: Path) -> dict[str, Any]:
+        """Read the manifest, repairing its top-level shape.
+
+        A payload that is not a dict is replaced by an empty manifest, and an
+        ``items`` value that is not a list is replaced by ``[]``. Individual
+        entries are returned as stored; they are not validated here.
+        """
+        path = self._index_path(quest_root)
+        payload = read_json(path, default=None)
+        if not isinstance(payload, dict):
+            payload = self._empty_payload()
+        items = payload.get("items")
+        if not isinstance(items, list):
+            payload["items"] = []
+        payload["schema_version"] = self._SCHEMA_VERSION
+        payload["updated_at"] = str(payload.get("updated_at") or utc_now())
+        return payload
+    def save_manifest(self, quest_root: Path, payload: dict[str, Any]) -> dict[str, Any]:
+        """Write ``payload`` as the manifest and return the copy that was written.
+
+        The written copy always carries the current schema version, a fresh
+        ``updated_at`` and a list-valued ``items``. The caller's dict is not
+        mutated.
+        """
+        normalized = dict(payload or {})
+        normalized["schema_version"] = self._SCHEMA_VERSION
+        normalized["updated_at"] = utc_now()
+        if not isinstance(normalized.get("items"), list):
+            normalized["items"] = []
+        ensure_dir(self._root(quest_root))
+        write_json(self._index_path(quest_root), normalized)
+        return normalized
+    @staticmethod
+    def _normalize_item(item: dict[str, Any]) -> dict[str, Any]:
+        """Return a copy of ``item`` with every known field coerced to a canonical form.
+
+        Text fields are stripped, optional fields become ``None`` when blank,
+        list fields drop blank entries, and ``created_at`` / ``updated_at``
+        default to the current time. Unknown keys are carried over untouched.
+        """
+        normalized = dict(item or {})
+        normalized["arxiv_id"] = str(normalized.get("arxiv_id") or "").strip()
+        normalized["status"] = str(normalized.get("status") or "processing").strip() or "processing"
+        metadata_status = str(normalized.get("metadata_status") or "").strip()
+        if not metadata_status:
+            # Items that record a metadata source but no status count as ready.
+            metadata_status = "ready" if str(normalized.get("metadata_source") or "").strip() else ""
+        normalized["metadata_status"] = metadata_status or None
+        normalized["title"] = str(normalized.get("title") or normalized.get("display_name") or normalized["arxiv_id"]).strip()
+        normalized["display_name"] = str(
+            normalized.get("display_name") or normalized.get("title") or normalized["arxiv_id"]
+        ).strip()
+        normalized["abstract"] = str(normalized.get("abstract") or "").strip()
+        normalized["overview"] = str(normalized.get("overview") or "").strip()
+        normalized["overview_markdown"] = str(normalized.get("overview_markdown") or "").strip()
+        normalized["summary_source"] = str(normalized.get("summary_source") or "").strip() or None
+        normalized["overview_source"] = str(normalized.get("overview_source") or "").strip() or None
+        normalized["metadata_source"] = str(normalized.get("metadata_source") or "").strip() or None
+        normalized["published_at"] = str(normalized.get("published_at") or "").strip()
+        normalized["primary_class"] = str(normalized.get("primary_class") or "").strip()
+        bibtex = str(normalized.get("bibtex") or "").strip()
+        normalized["bibtex"] = bibtex or None
+        normalized["abs_url"] = str(normalized.get("abs_url") or "").strip() or None
+        normalized["pdf_url"] = str(normalized.get("pdf_url") or "").strip() or None
+        normalized["created_at"] = str(normalized.get("created_at") or utc_now()).strip()
+        normalized["updated_at"] = str(normalized.get("updated_at") or utc_now()).strip()
+        normalized["authors"] = [str(entry).strip() for entry in (normalized.get("authors") or []) if str(entry).strip()]
+        normalized["categories"] = [str(entry).strip() for entry in (normalized.get("categories") or []) if str(entry).strip()]
+        normalized["tags"] = [str(entry).strip() for entry in (normalized.get("tags") or []) if str(entry).strip()]
+        version = normalized.get("version")
+        # isdecimal() only accepts characters int() can parse; isdigit() also
+        # accepts e.g. "²", for which int() raises ValueError.
+        normalized["version"] = int(version) if isinstance(version, int) or str(version).isdecimal() else None
+        normalized["pdf_rel_path"] = str(normalized.get("pdf_rel_path") or "").strip() or None
+        normalized["error"] = str(normalized.get("error") or "").strip() or None
+        return normalized
+    def get_item(self, quest_root: Path, arxiv_id: str) -> dict[str, Any] | None:
+        """Return the materialized manifest entry for ``arxiv_id``, or ``None``.
+
+        ``None`` is returned when the id does not normalize or no entry matches.
+        Manifest entries that are not dicts are skipped.
+        """
+        normalized_id = normalize_arxiv_id(arxiv_id)
+        if not normalized_id:
+            return None
+        payload = self.load_manifest(quest_root)
+        for raw_item in payload.get("items") or []:
+            # A hand-edited or corrupted manifest may hold non-dict entries.
+            if not isinstance(raw_item, dict):
+                continue
+            if str(raw_item.get("arxiv_id") or "").strip() == normalized_id:
+                return self._materialize_item(quest_root, self._normalize_item(dict(raw_item)))
+        return None
+    def list_items(self, quest_root: Path) -> list[dict[str, Any]]:
+        """Return all materialized entries, most recently updated first.
+
+        Entries that are not dicts or that have a blank ``arxiv_id`` are skipped.
+        """
+        payload = self.load_manifest(quest_root)
+        items = [
+            self._materialize_item(quest_root, self._normalize_item(dict(raw_item)))
+            for raw_item in payload.get("items") or []
+            if isinstance(raw_item, dict) and str(raw_item.get("arxiv_id") or "").strip()
+        ]
+        return sorted(items, key=lambda entry: str(entry.get("updated_at") or ""), reverse=True)
+    def upsert_item(self, quest_root: Path, item: dict[str, Any]) -> dict[str, Any]:
+        """Insert ``item`` or merge it into the entry with the same ``arxiv_id``.
+
+        On update the normalized incoming fields override the stored ones,
+        ``updated_at`` is refreshed and a stored ``created_at`` is preserved.
+        New entries get the current time for both timestamps.
+
+        Returns the stored entry as seen by :meth:`get_item`.
+
+        Raises:
+            ValueError: if ``item`` has no ``arxiv_id``.
+        """
+        normalized = self._normalize_item(item)
+        if not normalized["arxiv_id"]:
+            raise ValueError("`arxiv_id` is required.")
+        with self._manifest_lock:
+            payload = self.load_manifest(quest_root)
+            items = [dict(existing) for existing in (payload.get("items") or []) if isinstance(existing, dict)]
+            now = utc_now()
+            updated = False
+            for index, existing in enumerate(items):
+                if str(existing.get("arxiv_id") or "").strip() != normalized["arxiv_id"]:
+                    continue
+                merged = {**existing, **normalized, "updated_at": now}
+                # _normalize_item always fills created_at (defaulting to "now"),
+                # so the stored creation time must be restored explicitly.
+                if existing.get("created_at"):
+                    merged["created_at"] = existing["created_at"]
+                items[index] = merged
+                updated = True
+                break
+            if not updated:
+                items.append({**normalized, "created_at": now, "updated_at": now})
+            payload["items"] = items
+            self.save_manifest(quest_root, payload)
+        return self.get_item(quest_root, normalized["arxiv_id"]) or normalized
+    def mark_processing(self, quest_root: Path, arxiv_id: str, *, display_name: str | None = None) -> dict[str, Any]:
+        """Set the entry's status to ``processing``, creating the entry if needed.
+
+        Clears ``error`` and resets ``pdf_rel_path`` to the canonical location.
+
+        Raises:
+            ValueError: if ``arxiv_id`` does not normalize to a valid id.
+        """
+        normalized_id = normalize_arxiv_id(arxiv_id)
+        if not normalized_id:
+            raise ValueError("Invalid arXiv id.")
+        current = self.get_item(quest_root, normalized_id) or {}
+        return self.upsert_item(
+            quest_root,
+            {
+                **current,
+                "arxiv_id": normalized_id,
+                "display_name": display_name or current.get("display_name") or normalized_id,
+                "status": "processing",
+                "pdf_rel_path": self.pdf_relative_path(normalized_id),
+                "error": None,
+            },
+        )
+    def mark_failed(self, quest_root: Path, arxiv_id: str, *, error: str) -> dict[str, Any]:
+        """Set the entry's status to ``failed`` and record ``error``.
+
+        Raises:
+            ValueError: if ``arxiv_id`` does not normalize to a valid id.
+        """
+        normalized_id = normalize_arxiv_id(arxiv_id)
+        if not normalized_id:
+            raise ValueError("Invalid arXiv id.")
+        current = self.get_item(quest_root, normalized_id) or {}
+        return self.upsert_item(
+            quest_root,
+            {
+                **current,
+                "arxiv_id": normalized_id,
+                "status": "failed",
+                "error": error,
+                "pdf_rel_path": current.get("pdf_rel_path") or self.pdf_relative_path(normalized_id),
+            },
+        )
+    def mark_ready(self, quest_root: Path, arxiv_id: str) -> dict[str, Any]:
+        """Set the entry's status to ``ready`` and clear any recorded error.
+
+        Raises:
+            ValueError: if ``arxiv_id`` does not normalize to a valid id.
+        """
+        normalized_id = normalize_arxiv_id(arxiv_id)
+        if not normalized_id:
+            raise ValueError("Invalid arXiv id.")
+        current = self.get_item(quest_root, normalized_id) or {}
+        return self.upsert_item(
+            quest_root,
+            {
+                **current,
+                "arxiv_id": normalized_id,
+                "status": "ready",
+                "error": None,
+                "pdf_rel_path": current.get("pdf_rel_path") or self.pdf_relative_path(normalized_id),
+            },
+        )
+    def _materialize_item(self, quest_root: Path, item: dict[str, Any]) -> dict[str, Any]:
+        """Normalize ``item`` and attach ``path`` / ``document_id`` for its PDF.
+
+        Both fields are ``None`` unless the PDF is an existing file whose
+        location can be expressed as a path inside ``quest_root``.
+        """
+        normalized = self._normalize_item(item)
+        arxiv_id = normalized["arxiv_id"]
+        pdf_rel_path = normalized.get("pdf_rel_path") or self.pdf_relative_path(arxiv_id)
+        normalized["pdf_rel_path"] = pdf_rel_path
+        pdf_path = quest_root / pdf_rel_path
+        relative: str | None = None
+        if pdf_path.is_file():
+            try:
+                candidate = pdf_path.relative_to(quest_root)
+            except ValueError:
+                # An absolute pdf_rel_path outside quest_root has no quest-relative form.
+                candidate = None
+            # relative_to() is purely lexical, so "../x" style escapes must be rejected too.
+            if candidate is not None and ".." not in candidate.parts:
+                relative = candidate.as_posix()
+        normalized["path"] = relative
+        normalized["document_id"] = f"questpath::{relative}" if relative is not None else None
+        return normalized
+    def queue_pdf_download(self, quest_root: Path, arxiv_id: str, *, pdf_url: str | None = None) -> bool:
+        """Start a background download of the PDF for ``arxiv_id`` if one is needed.
+
+        Returns ``True`` when a worker thread was started. Returns ``False``
+        when the id is invalid, when the PDF is already on disk (the entry is
+        then marked ready), or when a download for the same quest and paper is
+        already in flight. ``pdf_url`` defaults to the arxiv.org PDF URL.
+        """
+        normalized_id = normalize_arxiv_id(arxiv_id)
+        if not normalized_id:
+            return False
+        target_path = self.pdf_path(quest_root, normalized_id)
+        if target_path.is_file():
+            self.mark_ready(quest_root, normalized_id)
+            return False
+        target_url = str(pdf_url or "").strip() or f"https://arxiv.org/pdf/{normalized_id}.pdf"
+        inflight_key = (str(quest_root.resolve()), normalized_id)
+        with self._download_lock:
+            if inflight_key in self._inflight_downloads:
+                return False
+            self._inflight_downloads.add(inflight_key)
+        thread = threading.Thread(
+            target=self._download_pdf_worker,
+            kwargs={
+                "quest_root": quest_root,
+                "arxiv_id": normalized_id,
+                "pdf_url": target_url,
+                "inflight_key": inflight_key,
+            },
+            daemon=True,
+            name=f"deepscientist-arxiv-{normalized_id}",
+        )
+        try:
+            thread.start()
+        except RuntimeError:
+            # The worker never ran, so its `finally` cannot release the key;
+            # release it here or this paper could never be queued again.
+            with self._download_lock:
+                self._inflight_downloads.discard(inflight_key)
+            raise
+        return True
+    def _download_pdf_worker(
+        self,
+        *,
+        quest_root: Path,
+        arxiv_id: str,
+        pdf_url: str,
+        inflight_key: tuple[str, str],
+    ) -> None:
+        """Download ``pdf_url`` into the library and record the outcome.
+
+        The payload is written to a ``.tmp`` sibling and renamed into place, so
+        the final path never holds a partial file. Any failure is recorded via
+        :meth:`mark_failed`; the in-flight key is always released.
+        """
+        temp_path: Path | None = None
+        try:
+            ensure_dir(self._pdf_dir(quest_root))
+            target_path = self.pdf_path(quest_root, arxiv_id)
+            request = Request(
+                pdf_url,
+                headers={
+                    "User-Agent": USER_AGENT,
+                    "Accept": "application/pdf,*/*;q=0.8",
+                },
+            )
+            # NOTE(review): pdf_url is caller-supplied and its scheme is not
+            # validated; urlopen also accepts non-HTTP schemes such as file://.
+            # Confirm that every caller passes a trusted URL.
+            with urlopen(request, timeout=20) as response:  # noqa: S310
+                payload = response.read()
+            if not payload.startswith(b"%PDF"):
+                raise ValueError("Downloaded payload is not a PDF.")
+            temp_path = target_path.with_suffix(f"{target_path.suffix}.tmp")
+            temp_path.write_bytes(payload)
+            temp_path.replace(target_path)
+            self.mark_ready(quest_root, arxiv_id)
+        except Exception as exc:  # noqa: BLE001
+            self.mark_failed(quest_root, arxiv_id, error=str(exc).strip() or "download_failed")
+        finally:
+            try:
+                # Drop a temp file left behind by a failed write/rename; after a
+                # successful rename the path no longer exists and this is a no-op.
+                if temp_path is not None:
+                    temp_path.unlink(missing_ok=True)
+            finally:
+                with self._download_lock:
+                    self._inflight_downloads.discard(inflight_key)

package/src/deepscientist/bash_exec/monitor.py CHANGED Viewed

@@ -22,7 +22,7 @@ from .service import (
     _coerce_session_status,
     _parse_progress_marker,
 )
-from ..shared import append_jsonl, ensure_dir, read_json, read_jsonl, utc_now
+from ..shared import append_jsonl, ensure_dir, iter_jsonl, read_json, read_jsonl, utc_now
 DEFAULT_STOP_GRACE_SECONDS = 5
 TERMINAL_IO_POLL_SECONDS = 0.02
@@ -298,7 +298,7 @@ def run_monitor(session_dir: Path) -> int:
     log_path.touch(exist_ok=True)
     input_path.touch(exist_ok=True)
     if not input_cursor_path.exists():
-        _atomic_write_json(input_cursor_path, {"offset": len(read_jsonl(input_path)), "updated_at": utc_now()})
+        _atomic_write_json(input_cursor_path, {"offset": sum(1 for _ in iter_jsonl(input_path)), "updated_at": utc_now()})
     tool_env = os.environ.pop("DS_BASH_EXEC_TOOL_ENV", "")
     env_payload = os.environ.copy()
@@ -451,9 +451,11 @@ def run_monitor(session_dir: Path) -> int:
             if output_fd is not None and process.poll() is None:
                 cursor_payload = read_json(input_cursor_path, {}) or {}
                 offset = int(cursor_payload.get("offset") or 0)
-                input_entries = read_jsonl(input_path)
-                if offset < len(input_entries):
-                    for entry in input_entries[offset:]:
+                total_input_entries = sum(1 for _ in iter_jsonl(input_path))
+                if offset < total_input_entries:
+                    for index, entry in enumerate(iter_jsonl(input_path)):
+                        if index < offset:
+                            continue
                         raw_data = str(entry.get("data") or "")
                         if raw_data:
                             try:

package/src/deepscientist/bash_exec/service.py CHANGED Viewed

@@ -11,12 +11,13 @@ import sys
 import tempfile
 import threading
 import time
+from collections import deque
 from datetime import UTC, datetime
 from pathlib import Path
 from typing import Any
 from ..mcp.context import McpContext
-from ..shared import append_jsonl, ensure_dir, generate_id, read_json, read_jsonl, utc_now
+from ..shared import append_jsonl, ensure_dir, generate_id, iter_jsonl, read_json, read_jsonl, read_jsonl_tail, utc_now
 from .runtime import TerminalRuntimeManager
 BASH_STATUS_MARKER_PREFIX = "__DS_BASH_STATUS__"
@@ -24,6 +25,9 @@ BASH_CARRIAGE_RETURN_PREFIX = "__DS_BASH_CR__"
 BASH_PROGRESS_PREFIX = "__DS_PROGRESS__"
 BASH_TERMINAL_PROMPT_PREFIX = "__DS_TERMINAL_PROMPT__"
 DEFAULT_LOG_TAIL_LIMIT = 200
+DEFAULT_INLINE_BASH_LOG_LINE_LIMIT = 2000
+DEFAULT_INLINE_BASH_LOG_HEAD_LINES = 500
+DEFAULT_INLINE_BASH_LOG_TAIL_LINES = 1500
 DEFAULT_POLL_INTERVAL_SECONDS = 0.35
 TERMINAL_STATUSES = {"completed", "failed", "terminated"}
 DEFAULT_TERMINAL_SESSION_ID = "terminal-main"
@@ -46,6 +50,52 @@ def _atomic_write_json(path: Path, payload: Any) -> None:
     temp_path.replace(path)
+def _count_jsonl_records(path: Path) -> int:
+    """Return how many records ``iter_jsonl`` yields for ``path``.
+
+    Records are consumed one at a time and only counted; no list of them is
+    built here.
+    """
+    record_count = 0
+    for _record in iter_jsonl(path):
+        record_count += 1
+    return record_count
+def _build_terminal_log_preview_payload(
+    path: Path,
+    *,
+    head_line_limit: int | None = None,
+    tail_line_limit: int | None = None,
+    line_limit: int | None = None,
+) -> dict[str, Any]:
+    """Build the inline preview of a terminal log, eliding the middle of long logs.
+
+    The file is read once, line by line, keeping only the first
+    ``head_line_limit`` lines and the last ``tail_line_limit`` lines.
+
+    Args:
+        path: Log file to preview. A missing file yields an empty preview.
+        head_line_limit: Lines kept from the start of a truncated log.
+            Defaults to ``DEFAULT_INLINE_BASH_LOG_HEAD_LINES``.
+        tail_line_limit: Lines kept from the end of a truncated log.
+            Defaults to ``DEFAULT_INLINE_BASH_LOG_TAIL_LINES``.
+        line_limit: Largest log that is returned whole. Defaults to
+            ``DEFAULT_INLINE_BASH_LOG_LINE_LIMIT``; it is capped at
+            ``head_line_limit + tail_line_limit`` because that is all that
+            is retained while reading.
+
+    Returns:
+        A dict with ``log``, ``log_line_count`` and ``log_truncated``. When
+        truncated, ``log`` is head + an omission marker + tail, and the dict
+        also carries ``log_preview_head_lines``, ``log_preview_tail_lines``
+        and ``log_preview_omitted_lines``.
+    """
+    if not path.exists():
+        return {
+            "log": "",
+            "log_line_count": 0,
+            "log_truncated": False,
+        }
+    head_limit = DEFAULT_INLINE_BASH_LOG_HEAD_LINES if head_line_limit is None else max(0, head_line_limit)
+    tail_limit = DEFAULT_INLINE_BASH_LOG_TAIL_LINES if tail_line_limit is None else max(0, tail_line_limit)
+    inline_limit = DEFAULT_INLINE_BASH_LOG_LINE_LIMIT if line_limit is None else max(0, line_limit)
+    # A log longer than head + tail cannot be reconstructed without a gap.
+    inline_limit = min(inline_limit, head_limit + tail_limit)
+    head_lines: list[str] = []
+    tail_lines: deque[str] = deque(maxlen=tail_limit)
+    total = 0
+    with path.open("r", encoding="utf-8", errors="replace") as handle:
+        for raw_line in handle:
+            line = raw_line.rstrip("\n")
+            total += 1
+            if total <= head_limit:
+                head_lines.append(line)
+            tail_lines.append(line)
+    if total <= inline_limit:
+        # The tail window alone misses the first lines once the log outgrows
+        # it, so stitch head and tail together and skip the lines both hold.
+        overlap = len(head_lines) + len(tail_lines) - total
+        return {
+            "log": "\n".join(head_lines + list(tail_lines)[overlap:]),
+            "log_line_count": total,
+            "log_truncated": False,
+        }
+    omitted = max(0, total - head_limit - tail_limit)
+    marker = (
+        "[... omitted "
+        f"{omitted} lines from the middle of this log. "
+        "Use bash_exec(mode='read', id=..., start=..., tail=...) for a specific window.]"
+    )
+    return {
+        "log": "\n".join(head_lines + [marker] + list(tail_lines)),
+        "log_line_count": total,
+        "log_truncated": True,
+        "log_preview_head_lines": head_limit,
+        "log_preview_tail_lines": tail_limit,
+        "log_preview_omitted_lines": omitted,
+    }
 def _normalize_string(value: object) -> str:
     return str(value or "").strip()
@@ -67,6 +117,14 @@ def _session_sort_key(session: dict[str, Any]) -> tuple[str, str]:
 def _is_process_alive(pid: object) -> bool:
     if not isinstance(pid, int) or pid <= 0:
         return False
+    proc_stat_path = Path("/proc") / str(pid) / "stat"
+    if proc_stat_path.exists():
+        try:
+            parts = proc_stat_path.read_text(encoding="utf-8").split()
+        except OSError:
+            parts = []
+        if len(parts) >= 3 and parts[2] == "Z":
+            return False
     try:
         os.kill(pid, 0)
     except ProcessLookupError:
@@ -560,7 +618,8 @@ class BashExecService:
         if not self.meta_path(quest_root, bash_id).exists():
             raise FileNotFoundError(f"Unknown bash session `{bash_id}`.")
         deadline = time.monotonic() + 0.6
-        entries = read_jsonl(self.log_path(quest_root, bash_id))
+        path = self.log_path(quest_root, bash_id)
+        entries = read_jsonl_tail(path, max(1, limit))
         while time.monotonic() < deadline:
             if any(str(entry.get("stream") or "") not in {"system", "prompt"} for entry in entries):
                 break
@@ -572,24 +631,33 @@ class BashExecService:
                 time.sleep(0.05)
             else:
                 time.sleep(0.03)
-            entries = read_jsonl(self.log_path(quest_root, bash_id))
+            entries = read_jsonl_tail(path, max(1, limit))
         latest_seq = int(entries[-1].get("seq") or 0) if entries else 0
         normalized_before = before_seq if isinstance(before_seq, int) and before_seq > 0 else None
         normalized_after = after_seq if isinstance(after_seq, int) and after_seq >= 0 else None
-        if normalized_after is not None:
-            entries = [entry for entry in entries if int(entry.get("seq") or 0) > normalized_after]
-        if normalized_before is not None:
-            entries = [entry for entry in entries if int(entry.get("seq") or 0) < normalized_before]
-        selection_pool = entries
-        if prefer_visible:
-            visible_entries = [
-                entry for entry in entries if str(entry.get("stream") or "") not in {"system", "prompt"}
-            ]
-            if visible_entries:
-                selection_pool = visible_entries
         normalized_limit = max(1, limit)
-        truncated = len(selection_pool) > normalized_limit
-        selected = selection_pool[-normalized_limit:]
+        selection_pool: deque[dict[str, Any]] = deque(maxlen=normalized_limit)
+        visible_pool: deque[dict[str, Any]] = deque(maxlen=normalized_limit)
+        total_filtered = 0
+        for entry in iter_jsonl(path):
+            seq = int(entry.get("seq") or 0)
+            latest_seq = max(latest_seq, seq)
+            if normalized_after is not None and seq <= normalized_after:
+                continue
+            if normalized_before is not None and seq >= normalized_before:
+                continue
+            total_filtered += 1
+            selection_pool.append(entry)
+            if str(entry.get("stream") or "") not in {"system", "prompt"}:
+                visible_pool.append(entry)
+        selected_source: list[dict[str, Any]]
+        if prefer_visible and visible_pool:
+            selected_source = list(visible_pool)
+            truncated = total_filtered > len(visible_pool)
+        else:
+            selected_source = list(selection_pool)
+            truncated = total_filtered > len(selection_pool)
+        selected = selected_source[-normalized_limit:]
         if order == "desc":
             selected = list(reversed(selected))
         tail_start_seq = int(selected[0].get("seq") or 0) if selected else None
@@ -860,7 +928,7 @@ class BashExecService:
             "last_input_at": None,
             "last_prompt_at": None,
             "last_command": None,
-            "history_count": len(read_jsonl(self.history_path(quest_root, bash_id))),
+            "history_count": _count_jsonl_records(self.history_path(quest_root, bash_id)),
         }
     def ensure_terminal_session(
@@ -910,7 +978,7 @@ class BashExecService:
         self.prompt_events_path(resolved_quest_root, bash_id).touch()
         _atomic_write_json(
             self.input_cursor_path(resolved_quest_root, bash_id),
-            {"offset": len(read_jsonl(self.input_path(resolved_quest_root, bash_id))), "updated_at": utc_now()},
+            {"offset": _count_jsonl_records(self.input_path(resolved_quest_root, bash_id)), "updated_at": utc_now()},
         )
         _atomic_write_json(
             self.line_buffer_path(resolved_quest_root, bash_id),
@@ -1064,7 +1132,7 @@ class BashExecService:
                 append_jsonl(self.history_path(quest_root, bash_id), item)
             meta = read_json(self.meta_path(quest_root, bash_id), {})
             meta["last_command"] = completed[-1]["command"]
-            meta["history_count"] = len(read_jsonl(self.history_path(quest_root, bash_id)))
+            meta["history_count"] = _count_jsonl_records(self.history_path(quest_root, bash_id))
             meta["updated_at"] = utc_now()
             meta["last_input_at"] = utc_now()
             self._write_meta(quest_root, bash_id, meta)
@@ -1130,7 +1198,7 @@ class BashExecService:
             before_seq=None,
             order="asc",
         )
-        history = read_jsonl(self.history_path(quest_root, bash_id))
+        history = read_jsonl_tail(self.history_path(quest_root, bash_id), max(1, command_limit))
         latest_commands = [
             {
                 "command_id": item.get("command_id"),
@@ -1181,6 +1249,7 @@ class BashExecService:
             "label": session.get("label"),
             "command": session.get("command"),
             "workdir": session.get("workdir"),
+            "cwd": session.get("cwd"),
             "started_at": session.get("started_at"),
             "finished_at": session.get("finished_at"),
             "exit_code": session.get("exit_code"),
@@ -1199,7 +1268,7 @@ class BashExecService:
             "watchdog_overdue": session.get("watchdog_overdue"),
         }
         if include_log:
-            result["log"] = self.read_terminal_log(quest_root, str(session["bash_id"]))
+            result.update(self._log_preview_payload(quest_root, str(session["bash_id"])))
         if export_log or _normalize_string(export_log_to):
             cwd, _ = self.resolve_workdir(context, str(session.get("workdir") or ""))
             result.update(
@@ -1212,3 +1281,6 @@ class BashExecService:
                 )
             )
         return result
+    def _log_preview_payload(self, quest_root: Path, bash_id: str) -> dict[str, Any]:
+        """Build the inline log-preview fields for the terminal log of session ``bash_id``."""
+        terminal_log = self.terminal_log_path(quest_root, bash_id)
+        return _build_terminal_log_preview_payload(terminal_log)

package/src/deepscientist/bridges/builtins.py CHANGED Viewed

@@ -6,6 +6,7 @@ from .connectors import (
     QQConnectorBridge,
     SlackConnectorBridge,
     TelegramConnectorBridge,
+    WeixinConnectorBridge,
     WhatsAppConnectorBridge,
 )
 from .registry import register_connector_bridge
@@ -13,6 +14,7 @@ from .registry import register_connector_bridge
 def register_builtin_connector_bridges() -> None:
     register_connector_bridge("qq", QQConnectorBridge)
+    register_connector_bridge("weixin", WeixinConnectorBridge)
     register_connector_bridge("telegram", TelegramConnectorBridge)
     register_connector_bridge("discord", DiscordConnectorBridge)
     register_connector_bridge("slack", SlackConnectorBridge)