openclaw-diag-cli 0.1.3 → 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41) hide show
  1. package/README.md +83 -71
  2. package/bin/ocdiag +0 -1
  3. package/bin/openclaw-diag.js +65 -176
  4. package/diag/01_sys_health.py +0 -2
  5. package/diag/02_environment.py +32 -6
  6. package/diag/03_configuration.py +4 -1
  7. package/diag/04_gateway.py +30 -8
  8. package/diag/05_recent_errors.py +24 -14
  9. package/diag/06_cron_jobs.py +4 -41
  10. package/diag/07_performance.py +114 -42
  11. package/diag/08_sessions.py +2 -54
  12. package/diag/09_plugin_diag.py +52 -25
  13. package/diag/10_shell_history.py +28 -10
  14. package/lib/__pycache__/bundle.cpython-310.pyc +0 -0
  15. package/lib/bundle.py +6 -13
  16. package/ocdiag/__init__.py +1 -1
  17. package/ocdiag/__pycache__/__init__.cpython-310.pyc +0 -0
  18. package/ocdiag/__pycache__/cli.cpython-310.pyc +0 -0
  19. package/ocdiag/__pycache__/dispatcher.cpython-310.pyc +0 -0
  20. package/ocdiag/__pycache__/doctor.cpython-310.pyc +0 -0
  21. package/ocdiag/__pycache__/jsonlog.cpython-310.pyc +0 -0
  22. package/ocdiag/__pycache__/output.cpython-310.pyc +0 -0
  23. package/ocdiag/__pycache__/paths.cpython-310.pyc +0 -0
  24. package/ocdiag/__pycache__/recent_logs.cpython-310.pyc +0 -0
  25. package/ocdiag/__pycache__/sensitive.cpython-310.pyc +0 -0
  26. package/ocdiag/__pycache__/sessions.cpython-310.pyc +0 -0
  27. package/ocdiag/__pycache__/timeutil.cpython-310.pyc +0 -0
  28. package/ocdiag/__pycache__/tokens.cpython-310.pyc +0 -0
  29. package/ocdiag/cli.py +16 -1
  30. package/ocdiag/dispatcher.py +140 -53
  31. package/ocdiag/doctor.py +162 -0
  32. package/ocdiag/jsonlog.py +0 -5
  33. package/ocdiag/paths.py +0 -17
  34. package/ocdiag/recent_logs.py +0 -3
  35. package/ocdiag/sensitive.py +95 -1
  36. package/ocdiag/sessions.py +161 -0
  37. package/ocdiag/timeutil.py +0 -11
  38. package/ocdiag/tokens.py +0 -4
  39. package/package.json +2 -2
  40. package/tools/oc_session_extract.py +190 -67
  41. package/tools/oc_session_trace.py +48 -46
@@ -1,4 +1,23 @@
1
- """Mask sensitive config values (keys, secrets, tokens)."""
1
+ """Mask sensitive config values + sanitize free-form text.
2
+
3
+ Two layers:
4
+
5
+ 1. ``mask`` / ``safe_val`` / ``is_sensitive_key`` — used when we already know
6
+ we're looking at a config key/value pair (configuration flatten, env vars).
7
+ Masking is keyed off the *key name*.
8
+
9
+ 2. ``sanitize_text`` — used when scanning free-form text (shell history lines,
10
+ plugin error messages, systemd unit files, session message bodies). We don't
11
+ know the structure, so we run a pattern-based scrubber. Best-effort: the
12
+ patterns below cover the common token shapes (Anthropic/OpenAI sk-, GitHub
13
+ ghp_/gho_/ghs_/github_pat_, npm npm_, AWS AKIA, ``Bearer xxx``, URL
14
+ credentials, ``KEY=value`` with secret-ish key). It will miss bespoke or
15
+ obfuscated formats — callers who need stronger guarantees should mask the
16
+ whole field.
17
+
18
+ The ``--unmask`` flag, declared in ``ocdiag.cli``, propagates to call sites
19
+ that opt-in to honouring it (currently the session extract tool).
20
+ """
2
21
 
3
22
  from __future__ import annotations
4
23
 
@@ -39,3 +58,78 @@ def safe_val(key: str, val, max_len: int = 300) -> str:
39
58
  return mask(val) if val else '""'
40
59
  s = str(val)
41
60
  return s[:max_len] + "..." if len(s) > max_len else s
61
+
62
+
63
# ── sanitize_text ──

# Token shapes worth scrubbing by themselves (no key=value context).
# Each entry is ``(compiled pattern, replacement)``. The replacement is a fixed
# placeholder that names the kind of secret so the reader can still tell what
# was removed.
_TOKEN_PATTERNS = [
    # Anthropic / OpenAI style (`sk-...` / `sk-ant-...`)
    (re.compile(r"\b(sk-(?:ant-)?[A-Za-z0-9_\-]{16,})"), "sk-<***>"),
    # GitHub PAT family
    (re.compile(r"\b(gh[posu]_[A-Za-z0-9]{20,})"), "<gh-token>"),
    (re.compile(r"\b(github_pat_[A-Za-z0-9_]{20,})"), "<github_pat>"),
    # npm
    (re.compile(r"\b(npm_[A-Za-z0-9]{30,})"), "<npm_token>"),
    # AWS access key id
    (re.compile(r"\b(AKIA[0-9A-Z]{16})"), "<AKIA-***>"),
    # Authorization headers
    (re.compile(r"(Bearer\s+)([A-Za-z0-9_\-\.=]{8,})", re.IGNORECASE), r"\1<***>"),
    # URLs with embedded credentials: scheme://user:pass@host
    (re.compile(r"([a-zA-Z][a-zA-Z0-9+\-.]*://)([^/\s:@]+):([^/\s@]+)@"), r"\1<user>:<***>@"),
]

# KEY=VALUE / KEY: VALUE in free text where the key looks secret-ish.
# Three forms:
#   KEY=value     (env var, dotenv)
#   KEY="value"   (shell quoted)
#   KEY: value    (yaml-ish, HTTP header)
#
# The key fragment is shared by all three. The leading identifier part is
# optional so that keys which *start* with the keyword (``PASSWORD=...``,
# ``token: ...``, ``Authorization: ...``) are matched too; requiring a
# character before the keyword would let exactly those slip through.
_KV_KEY = (
    r"\b((?:[A-Za-z_][A-Za-z0-9_\-\.]*)?"
    r"(?:KEY|SECRET|TOKEN|PASSWORD|CREDENTIAL|AUTH|PRIVATE|SIGNING)[A-Za-z0-9_\-\.]*)"
)
_KV_BARE = re.compile(
    _KV_KEY + r"\s*=\s*([^\s\"';#]+)",
    re.IGNORECASE,
)
_KV_QUOTED = re.compile(
    _KV_KEY + r"\s*=\s*([\"'])([^\"']+)\2",
    re.IGNORECASE,
)
# In the colon form the value may be prefixed by an HTTP auth scheme
# (``Authorization: Bearer xyz``). The scheme is captured separately so the
# credential after it is what gets masked, not the scheme word. The separator
# is limited to spaces/tabs so a scheme-like word at end of line never pulls
# the next line in as its "credential".
_KV_COLON = re.compile(
    _KV_KEY + r"\s*:\s*(?:(Bearer|Basic|Digest|Token)[ \t]+)?([^\s\"';#,}\]]+)",
    re.IGNORECASE,
)


def sanitize_text(text: str, context: str = "generic") -> str:
    """Scrub well-known secret shapes from free-form text.

    Best-effort, not a guarantee: only the patterns declared above are
    recognised.

    Args:
        text: The text to scrub.
        context: Accepted for call-site compatibility; not read by this
            function.

    Returns:
        The scrubbed text. ``text`` is returned unchanged when it is not a
        ``str`` or is empty.
    """
    if not isinstance(text, str) or not text:
        return text

    # Order: longer/more-specific (KV with quotes) first, then bare KV, then
    # the colon form, then bare token shapes. KV passes also catch things like
    # `API_KEY=abc` where the value would not match a token pattern.
    def _kv_quoted_sub(m):
        # Keep the original quote character on both sides of the placeholder.
        return f"{m.group(1)}={m.group(2)}<***>{m.group(2)}"

    def _kv_bare_sub(m):
        return f"{m.group(1)}=<***>"

    def _kv_colon_sub(m):
        scheme = m.group(2)
        if scheme:
            # Preserve the auth scheme, mask only the credential behind it.
            return f"{m.group(1)}: {scheme} <***>"
        return f"{m.group(1)}: <***>"

    text = _KV_QUOTED.sub(_kv_quoted_sub, text)
    text = _KV_BARE.sub(_kv_bare_sub, text)
    text = _KV_COLON.sub(_kv_colon_sub, text)
    for pat, repl in _TOKEN_PATTERNS:
        text = pat.sub(repl, text)
    return text
@@ -0,0 +1,161 @@
1
+ """Shared session-file lookup utilities for trace/extract.
2
+
3
+ A "session" is identified by a UUID. On disk it can have multiple files:
4
+ <uuid>.jsonl — active
5
+ <uuid>.jsonl.lock — write lock (transient, filtered by default)
6
+ <uuid>.jsonl.deleted.<ts> — soft-deleted
7
+ <uuid>.jsonl.reset.<ts> — pre-reset snapshot
8
+ <uuid>.jsonl.bak-<pid> — backup snapshot
9
+
10
+ Sibling artifacts (NOT session content):
11
+ <uuid>.trajectory.jsonl, <uuid>.acp-stream.jsonl, <uuid>.json
12
+
13
+ Callers may pass a full UUID or a prefix of at least MIN_PREFIX_LEN chars.
14
+ """
15
+
16
+ from __future__ import annotations
17
+
18
+ import glob
19
+ import os
20
+ import re
21
+ from typing import Dict, List, Optional, Tuple
22
+
23
+ from . import paths
24
+
25
+
26
+ MIN_PREFIX_LEN = 8
27
+
28
+ _TRANSIENT_SUFFIXES = (".lock", ".tmp", ".swp")
29
+
30
+ _UUID_CHAR = re.compile(r"^[0-9a-fA-F-]+$")
31
+
32
+
33
def classify_state(filename: str) -> str:
    """Map a session-file basename to its lifecycle state.

    Infix markers (deleted / reset / backup) are checked before the suffix
    checks, so a name containing one of them is never reported as ``lock`` or
    ``active``. Returns ``"unknown"`` when nothing matches.
    """
    infix_states = (
        (".jsonl.deleted.", "deleted"),
        (".jsonl.reset.", "reset"),
        (".jsonl.bak-", "backup"),
    )
    for marker, state in infix_states:
        if marker in filename:
            return state

    # ".jsonl.lock" must be tested before the plain ".jsonl" suffix.
    suffix_states = (
        (".jsonl.lock", "lock"),
        (".jsonl", "active"),
    )
    for suffix, state in suffix_states:
        if filename.endswith(suffix):
            return state
    return "unknown"
46
+
47
+
48
+ def _session_uuid_of(filename: str) -> Optional[str]:
49
+ """Return the session UUID the file belongs to, or None for siblings."""
50
+ if ".trajectory" in filename or ".acp-stream" in filename:
51
+ return None
52
+ if filename.endswith(".json") and not filename.endswith(".jsonl"):
53
+ return None
54
+ idx = filename.find(".jsonl")
55
+ if idx <= 0:
56
+ return None
57
+ return filename[:idx]
58
+
59
+
60
def _is_transient(filename: str) -> bool:
    """Tell whether a basename is a transient file (lock, temp, swap, ``.bak``)."""
    # ``.jsonl.bak-<pid>`` snapshots are never transient, whatever they end with.
    if ".jsonl.bak-" in filename:
        return False
    for suffix in _TRANSIENT_SUFFIXES:
        if filename.endswith(suffix):
            return True
    return filename.endswith(".bak")
64
+
65
+
66
def is_valid_query(session_id: str) -> Tuple[bool, str]:
    """Validate a session id or prefix before it is used for lookup.

    Returns ``(True, "")`` when the query is usable, otherwise ``(False,
    message)``. A query is rejected when it is empty, shorter than
    ``MIN_PREFIX_LEN``, or does not match ``_UUID_CHAR`` (hex digits and
    hyphens).
    """
    if not session_id:
        return False, "session id 不能为空"

    length = len(session_id)
    if length < MIN_PREFIX_LEN:
        too_short = (
            f"session id 太短('{session_id}' 只有 {length} 字符),"
            f"至少需要 {MIN_PREFIX_LEN} 位 UUID 前缀"
        )
        return False, too_short

    if _UUID_CHAR.match(session_id) is None:
        return False, f"session id 含非法字符(仅允许十六进制和连字符): '{session_id}'"

    return True, ""
78
+
79
+
80
def resolve(
    session_id: str,
    base_dir: str = paths.SESSIONS_BASE,
    agent: Optional[str] = None,
    include_transient: bool = False,
) -> Tuple[List[Tuple[str, str]], List[str]]:
    """Find the on-disk files for a session id or id prefix.

    Scans ``<base_dir>/<agent>/sessions`` (every agent directory when
    ``agent`` is not given) for entries whose name starts with ``session_id``.

    Returns ``(files, candidates)``:
      - ``files``: ``[(path, state), ...]`` for the single matching session,
        ordered by lifecycle priority (active first) and then by path. Empty
        when nothing matched or when the prefix is ambiguous.
      - ``candidates``: the sorted session ids when more than one distinct
        session shares the prefix; otherwise empty.
    """
    if agent:
        roots = [os.path.join(base_dir, agent)]
    else:
        roots = sorted(glob.glob(os.path.join(base_dir, "*")))

    grouped: Dict[str, List[Tuple[str, str]]] = {}
    for root in roots:
        session_dir = os.path.join(root, "sessions")
        if not os.path.isdir(session_dir):
            continue
        try:
            names = os.listdir(session_dir)
        except OSError:
            # Unreadable directory: skip it rather than abort the whole lookup.
            continue
        for name in names:
            if not name.startswith(session_id):
                continue
            owner = _session_uuid_of(name)
            if owner is None:
                continue
            if _is_transient(name) and not include_transient:
                continue
            candidate_path = os.path.join(session_dir, name)
            if os.path.isfile(candidate_path):
                grouped.setdefault(owner, []).append(
                    (candidate_path, classify_state(name))
                )

    if not grouped:
        return [], []
    if len(grouped) > 1:
        # Ambiguous prefix: report the ids and let the caller ask for more.
        return [], sorted(grouped)

    (matches,) = grouped.values()
    rank = {"active": 0, "lock": 1, "deleted": 2, "reset": 3, "backup": 4, "unknown": 9}
    ordered = sorted(matches, key=lambda item: (rank.get(item[1], 9), item[0]))
    return ordered, []
132
+
133
+
134
def recent_session_ids(
    base_dir: str = paths.SESSIONS_BASE,
    limit: int = 5,
) -> List[str]:
    """Return up to ``limit`` active session ids, most recently modified first.

    Only files named exactly ``<id>.jsonl`` are considered; files whose
    modification time cannot be read are skipped.
    """
    stamped: List[Tuple[float, str]] = []
    for agent_dir in glob.glob(os.path.join(base_dir, "*")):
        session_dir = os.path.join(agent_dir, "sessions")
        if not os.path.isdir(session_dir):
            continue
        try:
            names = os.listdir(session_dir)
        except OSError:
            continue
        for name in names:
            if not name.endswith(".jsonl"):
                continue
            sid = _session_uuid_of(name)
            # Reject siblings and anything that is not exactly "<id>.jsonl".
            if sid is None or name != sid + ".jsonl":
                continue
            try:
                modified = os.path.getmtime(os.path.join(session_dir, name))
            except OSError:
                continue
            stamped.append((modified, sid))

    newest_first = sorted(stamped, reverse=True)
    return [sid for _modified, sid in newest_first[:limit]]
@@ -37,17 +37,6 @@ def fmt_duration(sec) -> str:
37
37
  return f"{s/3600:.1f}h"
38
38
 
39
39
 
40
- def fmt_duration_ms(ms) -> str:
41
- if ms is None:
42
- return "?"
43
- s = float(ms) / 1000.0
44
- if s < 60:
45
- return f"{s:.1f}s"
46
- if s < 3600:
47
- return f"{s/60:.1f}min"
48
- return f"{s/3600:.1f}h"
49
-
50
-
51
40
  def fmt_age(ms_delta) -> str:
52
41
  s = abs(float(ms_delta)) / 1000
53
42
  if s < 60:
package/ocdiag/tokens.py CHANGED
@@ -16,10 +16,6 @@ def fmt_tokens(n) -> str:
16
16
  return str(n)
17
17
 
18
18
 
19
- def fmt_k(n) -> str:
20
- return fmt_tokens(n)
21
-
22
-
23
19
  def percentile(sorted_list: List[float], p: float) -> Optional[float]:
24
20
  if not sorted_list:
25
21
  return None
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "openclaw-diag-cli",
3
- "version": "0.1.3",
4
- "description": "OpenClaw read-only diagnostic CLI. Zero-dependency Python scripts wrapped in Node for npx-friendly install.",
3
+ "version": "0.2.2",
4
+ "description": "OpenClaw observer-only diagnostic CLI. Zero-dependency Python scripts wrapped in Node for npx-friendly install.",
5
5
  "keywords": [
6
6
  "openclaw",
7
7
  "diagnostic",
@@ -4,16 +4,17 @@
4
4
  from __future__ import annotations
5
5
 
6
6
  import argparse
7
- import glob
8
7
  import json
9
8
  import os
10
9
  import sys
10
+ from datetime import datetime, timezone
11
11
  from pathlib import Path
12
- from typing import Iterator, List, Optional, TextIO, Tuple
12
+ from typing import Any, Dict, List, Optional, TextIO, Tuple
13
13
 
14
14
  sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
15
15
 
16
- from ocdiag import paths
16
+ from ocdiag import paths, sessions
17
+ from ocdiag.sensitive import sanitize_text
17
18
 
18
19
 
19
20
  DEFAULT_BASE_DIR = paths.SESSIONS_BASE
@@ -28,38 +29,6 @@ def human_size(n: int) -> str:
28
29
  return f"{n:.1f} PB"
29
30
 
30
31
 
31
- def classify_state(filename: str) -> str:
32
- if filename.endswith(".jsonl"):
33
- return "active"
34
- if ".jsonl.deleted." in filename:
35
- return "deleted"
36
- if ".jsonl.reset." in filename:
37
- return "reset"
38
- if ".jsonl.bak-" in filename:
39
- return "backup"
40
- return "unknown"
41
-
42
-
43
- def find_session_files(session_id, base_dir=DEFAULT_BASE_DIR, agent=None):
44
- if agent:
45
- agent_dirs = [os.path.join(base_dir, agent)]
46
- else:
47
- agent_dirs = sorted(glob.glob(os.path.join(base_dir, "*")))
48
- found = []
49
- for agent_dir in agent_dirs:
50
- sessions_dir = os.path.join(agent_dir, "sessions")
51
- if not os.path.isdir(sessions_dir):
52
- continue
53
- pattern = os.path.join(sessions_dir, f"{session_id}.jsonl*")
54
- for path in sorted(glob.glob(pattern)):
55
- name = os.path.basename(path)
56
- if ".trajectory" in name:
57
- continue
58
- state = classify_state(name)
59
- found.append((path, state))
60
- return found
61
-
62
-
63
32
  def stream_records(path):
64
33
  with open(path, "r", encoding="utf-8", errors="replace") as f:
65
34
  for i, line in enumerate(f, start=1):
@@ -85,23 +54,54 @@ def write_header(out, path, state):
85
54
  out.write(SEPARATOR + "\n\n")
86
55
 
87
56
 
88
- def extract_file(path, state, out, pretty=True, type_filter=None):
57
def _sanitize_record(obj):
    """Scrub the free-form text fields of one session record, in place.

    Only the record's ``message`` dict is touched: its ``content`` (a string,
    or a list of parts whose ``text`` / ``content`` strings are scrubbed) and
    its ``text`` / ``summary`` strings. Every other field is left as is.
    Non-dict input, and records without a dict ``message``, are returned
    unchanged. The same object that was passed in is returned.
    """
    if not isinstance(obj, dict):
        return obj
    message = obj.get("message")
    if not isinstance(message, dict):
        return obj

    def scrub(mapping, keys):
        # Replace each string value under `keys` with its sanitized form.
        for key in keys:
            value = mapping.get(key)
            if isinstance(value, str):
                mapping[key] = sanitize_text(value)

    body = message.get("content")
    if isinstance(body, str):
        message["content"] = sanitize_text(body)
    elif isinstance(body, list):
        for part in body:
            if isinstance(part, dict):
                scrub(part, ("text", "content"))

    scrub(message, ("text", "summary"))
    return obj
84
+
85
+
86
+ def extract_file(path, state, out, pretty=True, type_filter=None, sanitize=True):
89
87
  write_header(out, path, state)
90
88
  written = 0
91
89
  for line_no, obj, raw, err in stream_records(path):
92
90
  if err is not None:
93
91
  out.write(f"--- Record {line_no} [PARSE ERROR: {err}] ---\n")
94
- out.write(raw + "\n\n")
92
+ out.write((sanitize_text(raw) if sanitize else raw) + "\n\n")
95
93
  written += 1
96
94
  continue
97
95
  rtype = obj.get("type", "?") if isinstance(obj, dict) else "?"
98
96
  if type_filter is not None and rtype not in type_filter:
99
97
  continue
100
98
  out.write(f"--- Record {line_no} [type: {rtype}] ---\n")
99
+ if sanitize:
100
+ obj = _sanitize_record(obj)
101
101
  if pretty:
102
102
  out.write(json.dumps(obj, indent=2, ensure_ascii=False))
103
103
  else:
104
- out.write(raw)
104
+ out.write(json.dumps(obj, ensure_ascii=False) if sanitize else raw)
105
105
  out.write("\n\n")
106
106
  written += 1
107
107
  return written
@@ -109,7 +109,23 @@ def extract_file(path, state, out, pretty=True, type_filter=None):
109
109
 
110
110
  def summarize_file(path, state, out):
111
111
  write_header(out, path, state)
112
- counts: dict = {}
112
+ info = _collect_summary(path, sanitize=False)
113
+ out.write(f"Total records: {info['total_records']}\n")
114
+ if info["parse_errors"]:
115
+ out.write(f"Parse errors: {info['parse_errors']}\n")
116
+ out.write("By type:\n")
117
+ by_type = info["by_type"]
118
+ for k in sorted(by_type, key=lambda k: -by_type[k]):
119
+ out.write(f" {k}: {by_type[k]}\n")
120
+ tr = info["time_range"]
121
+ if tr["start"] or tr["end"]:
122
+ out.write(f"Time range: {tr['start'] or '?'} → {tr['end'] or '?'}\n")
123
+ out.write("\n")
124
+
125
+
126
+ def _collect_summary(path: str, sanitize: bool = True) -> Dict[str, Any]:
127
+ """Walk one file and produce a summary block (used by text + JSON mode)."""
128
+ by_type: Dict[str, int] = {}
113
129
  total = 0
114
130
  earliest: Optional[str] = None
115
131
  latest: Optional[str] = None
@@ -120,25 +136,40 @@ def summarize_file(path, state, out):
120
136
  parse_errors += 1
121
137
  continue
122
138
  if not isinstance(obj, dict):
123
- counts["<non-object>"] = counts.get("<non-object>", 0) + 1
139
+ by_type["<non-object>"] = by_type.get("<non-object>", 0) + 1
124
140
  continue
125
141
  rtype = obj.get("type", "<no-type>")
126
- counts[rtype] = counts.get(rtype, 0) + 1
142
+ by_type[rtype] = by_type.get(rtype, 0) + 1
127
143
  ts = obj.get("timestamp")
128
144
  if isinstance(ts, str):
129
145
  if earliest is None or ts < earliest:
130
146
  earliest = ts
131
147
  if latest is None or ts > latest:
132
148
  latest = ts
133
- out.write(f"Total records: {total}\n")
134
- if parse_errors:
135
- out.write(f"Parse errors: {parse_errors}\n")
136
- out.write("By type:\n")
137
- for k in sorted(counts, key=lambda k: -counts[k]):
138
- out.write(f" {k}: {counts[k]}\n")
139
- if earliest or latest:
140
- out.write(f"Time range: {earliest or '?'} → {latest or '?'}\n")
141
- out.write("\n")
149
+ return {
150
+ "total_records": total,
151
+ "parse_errors": parse_errors,
152
+ "by_type": by_type,
153
+ "time_range": {"start": earliest, "end": latest},
154
+ }
155
+
156
+
157
def _collect_records(path: str, type_filter, sanitize: bool) -> List[Dict]:
    """Collect the records of one session file for JSON output.

    Args:
        path: Session file to read via ``stream_records``.
        type_filter: Collection of record types to keep, or ``None`` to keep
            every type. Applied to dict records only.
        sanitize: When true, scrub secret-shaped text before it is returned.

    Returns:
        One entry per emitted line: the (optionally sanitized) record dict, a
        ``{"line", "parse_error", "raw"}`` dict for unparseable lines, or a
        ``{"line", "value"}`` dict for JSON values that are not objects.
    """
    out: List[Dict] = []
    for line_no, obj, raw, err in stream_records(path):
        if err is not None:
            # An unparseable line can still carry a pasted secret. Scrub it
            # exactly as the text extraction path does, so output labelled as
            # sanitized never contains the raw line.
            safe_raw = sanitize_text(raw) if sanitize else raw
            out.append({"line": line_no, "parse_error": err, "raw": safe_raw})
            continue
        if not isinstance(obj, dict):
            # sanitize_text returns non-string values unchanged, so only bare
            # JSON strings are affected here.
            value = sanitize_text(obj) if sanitize else obj
            out.append({"line": line_no, "value": value})
            continue
        rtype = obj.get("type", "?")
        if type_filter is not None and rtype not in type_filter:
            continue
        if sanitize:
            obj = _sanitize_record(obj)
        out.append(obj)
    return out
142
173
 
143
174
 
144
175
  def list_files(files, out):
@@ -176,32 +207,118 @@ def select_files(files, extract_all, _out):
176
207
  return []
177
208
 
178
209
 
210
def _resolve_or_die(session_id: str, base_dir: str, agent: Optional[str],
                    include_transient: bool) -> List[Tuple[str, str]]:
    """Resolve a session id to its files, or print an error and exit.

    Exits with status 2 when the id fails validation, and with status 1 when
    the prefix is ambiguous or nothing matches. On a miss, recent session ids
    are listed on stderr as suggestions. Returns the resolved
    ``[(path, state), ...]`` list otherwise.
    """
    emit = sys.stderr.write

    valid, reason = sessions.is_valid_query(session_id)
    if not valid:
        emit(f"Error: {reason}\n")
        sys.exit(2)

    files, candidates = sessions.resolve(
        session_id,
        base_dir=base_dir,
        agent=agent,
        include_transient=include_transient,
    )

    if candidates:
        emit(f"Error: 前缀 '{session_id}' 匹配多个 session(请补长前缀):\n")
        for candidate in candidates:
            emit(f" {candidate}\n")
        sys.exit(1)

    if files:
        return files

    agent_note = f" agent={agent}" if agent else ""
    emit(f"Error: 找不到 session '{session_id}'(在 {base_dir} 下)" + agent_note + "\n")
    recent = sessions.recent_session_ids(base_dir, limit=5)
    if recent:
        emit(" 最近的 5 个 session:\n")
        for recent_id in recent:
            emit(f" {recent_id}\n")
    # NOTE(review): the hint is assumed to print whether or not suggestions
    # exist; confirm against the original indentation.
    emit(" 提示:完整 UUID 或前缀(至少 8 位)都可。\n")
    sys.exit(1)
241
+
242
+
243
def _emit_json(session_id: str, selected: List[Tuple[str, str]],
               out_fp: TextIO, summary_only: bool, type_filter,
               sanitize: bool) -> None:
    """Write the selected session files to ``out_fp`` as one JSON document.

    Each file contributes an entry with ``path``, ``state`` and ``size_bytes``
    (0 when the size cannot be read), plus either its ``summary`` (when
    ``summary_only``) or its ``records``. In summary mode a top-level
    ``summary`` aggregates record totals, per-type counts and the overall
    time range across all files.
    """
    file_entries: List[Dict[str, Any]] = []
    total_records = 0
    type_counts: Dict[str, int] = {}
    first_ts: Optional[str] = None
    last_ts: Optional[str] = None

    for path, state in selected:
        try:
            size_bytes = os.path.getsize(path)
        except OSError:
            size_bytes = 0
        entry: Dict[str, Any] = {"path": path, "state": state, "size_bytes": size_bytes}

        if not summary_only:
            entry["records"] = _collect_records(path, type_filter, sanitize=sanitize)
        else:
            summary = _collect_summary(path, sanitize=sanitize)
            entry["summary"] = summary
            total_records += summary["total_records"]
            for rtype, count in summary["by_type"].items():
                type_counts[rtype] = type_counts.get(rtype, 0) + count
            span = summary["time_range"]
            # Timestamps are compared as strings, exactly as they were read.
            if span["start"] and (first_ts is None or span["start"] < first_ts):
                first_ts = span["start"]
            if span["end"] and (last_ts is None or span["end"] > last_ts):
                last_ts = span["end"]

        file_entries.append(entry)

    generated_at = datetime.now(tz=timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
    payload: Dict[str, Any] = {
        "session_id": session_id,
        "files": file_entries,
        "generated_at": generated_at,
        "sanitized": sanitize,
    }
    if summary_only:
        payload["summary"] = {
            "total_records": total_records,
            "by_type": type_counts,
            "time_range": {"start": first_ts, "end": last_ts},
        }

    out_fp.write(json.dumps(payload, ensure_ascii=False, indent=2))
    out_fp.write("\n")
290
+
291
+
179
292
  def main() -> int:
180
293
  p = argparse.ArgumentParser(
294
+ prog=os.environ.get("OPENCLAW_DIAG_PROG") or None,
181
295
  description="Extract OpenClaw session JSONL files into human-readable format.",
182
296
  formatter_class=argparse.ArgumentDefaultsHelpFormatter,
183
297
  )
184
- p.add_argument("session_id", help="Session UUID to extract")
298
+ p.add_argument("session_id", help="Session UUID (full or 8+ char prefix)")
185
299
  p.add_argument("-o", "--output", help="Write output to FILE instead of stdout")
186
300
  p.add_argument("-a", "--all", action="store_true",
187
- help="Extract all versions found (active + deleted + reset + backup)")
188
- p.add_argument("--list", action="store_true", help="List found files; do not extract")
301
+ help="Extract all versions (active + reset + deleted + backup + lock)")
302
+ p.add_argument("--list", action="store_true",
303
+ help="List all matching files (incl. .lock); do not extract")
189
304
  p.add_argument("--agent", help="Limit search to specific agent directory")
190
305
  p.add_argument("--base-dir", default=DEFAULT_BASE_DIR, help="Override base directory")
191
306
  p.add_argument("--no-pretty", action="store_true", help="Output raw JSON lines")
192
307
  p.add_argument("--types", help="Filter by record type (comma-separated, e.g. 'message,toolCall')")
193
308
  p.add_argument("--summary", action="store_true",
194
309
  help="Show record-count summary instead of full extraction")
310
+ p.add_argument("--json", action="store_true",
311
+ help="Emit structured JSON (compatible with state collectors' --json)")
312
+ p.add_argument("--unmask", action="store_true",
313
+ help="Disable default sanitization of secret-shaped substrings "
314
+ "in message content (off = scrubbed)")
195
315
  args = p.parse_args()
196
316
 
197
- files = find_session_files(args.session_id, args.base_dir, args.agent)
198
- if not files:
199
- sys.stderr.write(
200
- f"Error: no files found for session ID '{args.session_id}' under {args.base_dir}"
201
- + (f" (agent={args.agent})" if args.agent else "")
202
- + "\n"
203
- )
204
- return 1
317
+ # --list and --all see lock files; default mode hides them so non-interactive
318
+ # callers (cron, jq pipes) don't trip on a transient .jsonl.lock sibling.
319
+ include_transient = bool(args.all or args.list)
320
+ files = _resolve_or_die(args.session_id, args.base_dir, args.agent,
321
+ include_transient=include_transient)
205
322
 
206
323
  if args.list:
207
324
  list_files(files, sys.stdout)
@@ -226,12 +343,18 @@ def main() -> int:
226
343
  out_fp = sys.stdout
227
344
 
228
345
  try:
229
- for path, state in selected:
230
- if args.summary:
231
- summarize_file(path, state, out_fp)
232
- else:
233
- extract_file(path, state, out_fp, pretty=not args.no_pretty,
234
- type_filter=type_filter)
346
+ if args.json:
347
+ _emit_json(args.session_id, selected, out_fp,
348
+ summary_only=args.summary,
349
+ type_filter=type_filter,
350
+ sanitize=not args.unmask)
351
+ else:
352
+ for path, state in selected:
353
+ if args.summary:
354
+ summarize_file(path, state, out_fp)
355
+ else:
356
+ extract_file(path, state, out_fp, pretty=not args.no_pretty,
357
+ type_filter=type_filter, sanitize=not args.unmask)
235
358
  except BrokenPipeError:
236
359
  try:
237
360
  sys.stdout.flush()