openclaw-diag-cli 0.1.3 → 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +83 -71
- package/bin/ocdiag +0 -1
- package/bin/openclaw-diag.js +65 -176
- package/diag/01_sys_health.py +0 -2
- package/diag/02_environment.py +32 -6
- package/diag/03_configuration.py +4 -1
- package/diag/04_gateway.py +30 -8
- package/diag/05_recent_errors.py +24 -14
- package/diag/06_cron_jobs.py +4 -41
- package/diag/07_performance.py +114 -42
- package/diag/08_sessions.py +2 -54
- package/diag/09_plugin_diag.py +52 -25
- package/diag/10_shell_history.py +28 -10
- package/lib/__pycache__/bundle.cpython-310.pyc +0 -0
- package/lib/bundle.py +6 -13
- package/ocdiag/__init__.py +1 -1
- package/ocdiag/__pycache__/__init__.cpython-310.pyc +0 -0
- package/ocdiag/__pycache__/cli.cpython-310.pyc +0 -0
- package/ocdiag/__pycache__/dispatcher.cpython-310.pyc +0 -0
- package/ocdiag/__pycache__/doctor.cpython-310.pyc +0 -0
- package/ocdiag/__pycache__/jsonlog.cpython-310.pyc +0 -0
- package/ocdiag/__pycache__/output.cpython-310.pyc +0 -0
- package/ocdiag/__pycache__/paths.cpython-310.pyc +0 -0
- package/ocdiag/__pycache__/recent_logs.cpython-310.pyc +0 -0
- package/ocdiag/__pycache__/sensitive.cpython-310.pyc +0 -0
- package/ocdiag/__pycache__/sessions.cpython-310.pyc +0 -0
- package/ocdiag/__pycache__/timeutil.cpython-310.pyc +0 -0
- package/ocdiag/__pycache__/tokens.cpython-310.pyc +0 -0
- package/ocdiag/cli.py +16 -1
- package/ocdiag/dispatcher.py +140 -53
- package/ocdiag/doctor.py +162 -0
- package/ocdiag/jsonlog.py +0 -5
- package/ocdiag/paths.py +0 -17
- package/ocdiag/recent_logs.py +0 -3
- package/ocdiag/sensitive.py +95 -1
- package/ocdiag/sessions.py +161 -0
- package/ocdiag/timeutil.py +0 -11
- package/ocdiag/tokens.py +0 -4
- package/package.json +2 -2
- package/tools/oc_session_extract.py +190 -67
- package/tools/oc_session_trace.py +48 -46
package/ocdiag/sensitive.py
CHANGED
|
@@ -1,4 +1,23 @@
|
|
|
1
|
-
"""Mask sensitive config values
|
|
1
|
+
"""Mask sensitive config values + sanitize free-form text.
|
|
2
|
+
|
|
3
|
+
Two layers:
|
|
4
|
+
|
|
5
|
+
1. ``mask`` / ``safe_val`` / ``is_sensitive_key`` — used when we already know
|
|
6
|
+
we're looking at a config key/value pair (configuration flatten, env vars).
|
|
7
|
+
Masking is keyed off the *key name*.
|
|
8
|
+
|
|
9
|
+
2. ``sanitize_text`` — used when scanning free-form text (shell history lines,
|
|
10
|
+
plugin error messages, systemd unit files, session message bodies). We don't
|
|
11
|
+
know the structure, so we run a pattern-based scrubber. Best-effort: the
|
|
12
|
+
patterns below cover the common token shapes (Anthropic/OpenAI sk-, GitHub
|
|
13
|
+
ghp_/gho_/ghs_/github_pat_, npm npm_, AWS AKIA, ``Bearer xxx``, URL
|
|
14
|
+
credentials, ``KEY=value`` with secret-ish key). It will miss bespoke or
|
|
15
|
+
obfuscated formats — callers who need stronger guarantees should mask the
|
|
16
|
+
whole field.
|
|
17
|
+
|
|
18
|
+
The ``--unmask`` flag, declared in ``ocdiag.cli``, propagates to call sites
|
|
19
|
+
that opt-in to honouring it (currently the session extract tool).
|
|
20
|
+
"""
|
|
2
21
|
|
|
3
22
|
from __future__ import annotations
|
|
4
23
|
|
|
@@ -39,3 +58,78 @@ def safe_val(key: str, val, max_len: int = 300) -> str:
|
|
|
39
58
|
return mask(val) if val else '""'
|
|
40
59
|
s = str(val)
|
|
41
60
|
return s[:max_len] + "..." if len(s) > max_len else s
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
# ── sanitize_text ──
|
|
64
|
+
|
|
65
|
+
# Token shapes worth scrubbing by themselves (no key=value context).
|
|
66
|
+
# Each pattern matches the *whole* secret; we replace with `<***>` keeping
|
|
67
|
+
# the leading prefix so the reader can still tell what kind of secret it was.
|
|
68
|
+
_TOKEN_PATTERNS = [
    # Anthropic / OpenAI style (`sk-...` / `sk-ant-...`)
    (re.compile(r"\b(sk-(?:ant-)?[A-Za-z0-9_\-]{16,})"), "sk-<***>"),
    # GitHub PAT family
    (re.compile(r"\b(gh[posu]_[A-Za-z0-9]{20,})"), "<gh-token>"),
    (re.compile(r"\b(github_pat_[A-Za-z0-9_]{20,})"), "<github_pat>"),
    # npm
    (re.compile(r"\b(npm_[A-Za-z0-9]{30,})"), "<npm_token>"),
    # AWS access key id
    (re.compile(r"\b(AKIA[0-9A-Z]{16})"), "<AKIA-***>"),
    # Authorization headers
    (re.compile(r"(Bearer\s+)([A-Za-z0-9_\-\.=]{8,})", re.IGNORECASE), r"\1<***>"),
    # URLs with embedded credentials: scheme://user:pass@host
    (re.compile(r"([a-zA-Z][a-zA-Z0-9+\-.]*://)([^/\s:@]+):([^/\s@]+)@"), r"\1<user>:<***>@"),
]

# KEY=VALUE / KEY: VALUE in free text where the key looks secret-ish.
# Match value up to whitespace, quote, or end-of-line. Three forms:
#   KEY=value       (env var, dotenv)
#   KEY="value"     (shell quoted)
#   KEY: value      (yaml-ish)
#
# The identifier prefix in front of the secret keyword is optional, so keys
# that *are* the keyword or start with it (`PASSWORD=...`, `--token=...`,
# `secret: ...`) are masked as well as compound names such as `API_KEY`.
_KV_KEY = (
    r"\b((?:[A-Za-z_][A-Za-z0-9_\-\.]*)?"
    r"(?:KEY|SECRET|TOKEN|PASSWORD|CREDENTIAL|AUTH|PRIVATE|SIGNING)[A-Za-z0-9_\-\.]*)"
)
# An unquoted value stops at whitespace, so for `Authorization: Bearer <tok>`
# a KV pass would only swallow the word "Bearer" and leave the token itself
# behind. Skip such values here and let the Bearer token pattern mask them.
_KV_NOT_BEARER = r"(?!Bearer\s)"

_KV_BARE = re.compile(
    _KV_KEY + r"\s*=\s*" + _KV_NOT_BEARER + r"([^\s\"';#]+)",
    re.IGNORECASE,
)
_KV_QUOTED = re.compile(
    _KV_KEY + r"\s*=\s*([\"'])([^\"']+)\2",
    re.IGNORECASE,
)
_KV_COLON = re.compile(
    _KV_KEY + r"\s*:\s*" + _KV_NOT_BEARER + r"([^\s\"';#,}\]]+)",
    re.IGNORECASE,
)


def sanitize_text(text: str, context: str = "generic") -> str:
    """Scrub well-known secret shapes from free-form text.

    Best-effort, not a guarantee.

    Args:
        text: The text to scrub. Anything that is not a non-empty ``str`` is
            returned unchanged.
        context: Accepted for call-site compatibility; this implementation
            does not consult it.

    Returns:
        ``text`` with secret-looking values replaced by ``<***>``-style
        placeholders. The key name (and the quoting style, for quoted values)
        is kept so the reader can still tell what was removed.
    """
    if not isinstance(text, str) or not text:
        return text

    # Order: longer/more-specific (KV with quotes) first, then bare KV, then
    # bare token shapes. KV passes also catch things like `API_KEY=abc` where
    # the value would not match a token pattern.
    text = _KV_QUOTED.sub(r"\1=\2<***>\2", text)
    text = _KV_BARE.sub(r"\1=<***>", text)
    text = _KV_COLON.sub(r"\1: <***>", text)
    for pattern, replacement in _TOKEN_PATTERNS:
        text = pattern.sub(replacement, text)
    return text
|
|
@@ -0,0 +1,161 @@
|
|
|
1
|
+
"""Shared session-file lookup utilities for trace/extract.
|
|
2
|
+
|
|
3
|
+
A "session" is identified by a UUID. On disk it can have multiple files:
|
|
4
|
+
<uuid>.jsonl — active
|
|
5
|
+
<uuid>.jsonl.lock — write lock (transient, filtered by default)
|
|
6
|
+
<uuid>.jsonl.deleted.<ts> — soft-deleted
|
|
7
|
+
<uuid>.jsonl.reset.<ts> — pre-reset snapshot
|
|
8
|
+
<uuid>.jsonl.bak-<pid> — backup snapshot
|
|
9
|
+
|
|
10
|
+
Sibling artifacts (NOT session content):
|
|
11
|
+
<uuid>.trajectory.jsonl, <uuid>.acp-stream.jsonl, <uuid>.json
|
|
12
|
+
|
|
13
|
+
Callers may pass a full UUID or a prefix of at least MIN_PREFIX_LEN chars.
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
from __future__ import annotations
|
|
17
|
+
|
|
18
|
+
import glob
|
|
19
|
+
import os
|
|
20
|
+
import re
|
|
21
|
+
from typing import Dict, List, Optional, Tuple
|
|
22
|
+
|
|
23
|
+
from . import paths
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
# Shortest session-id query accepted by is_valid_query().
MIN_PREFIX_LEN = 8

# Filename endings treated as transient by _is_transient().
_TRANSIENT_SUFFIXES = (".lock", ".tmp", ".swp")

# A session-id query may consist only of hexadecimal digits and hyphens.
_UUID_CHAR = re.compile(r"^[0-9a-fA-F-]+$")
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def classify_state(filename: str) -> str:
    """Return the lifecycle state label for a session-file basename.

    The result is one of ``"deleted"``, ``"reset"``, ``"backup"``, ``"lock"``,
    ``"active"`` or ``"unknown"``.
    """
    # Snapshot markers may appear anywhere in the name and win over the
    # plain-suffix checks below.
    snapshot_markers = (
        (".jsonl.deleted.", "deleted"),
        (".jsonl.reset.", "reset"),
        (".jsonl.bak-", "backup"),
    )
    for marker, label in snapshot_markers:
        if marker in filename:
            return label

    # ".jsonl.lock" has to be tested before the bare ".jsonl" ending.
    for suffix, label in ((".jsonl.lock", "lock"), (".jsonl", "active")):
        if filename.endswith(suffix):
            return label
    return "unknown"
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def _session_uuid_of(filename: str) -> Optional[str]:
    """Return the session UUID a file belongs to, or None for siblings.

    Trajectory files, ACP stream files and plain ``.json`` files are sibling
    artifacts and yield ``None``. So does any name that has no ``.jsonl``
    marker or that starts with it.
    """
    is_sibling = (
        ".trajectory" in filename
        or ".acp-stream" in filename
        or filename.endswith(".json")
    )
    if is_sibling:
        return None
    # Everything before the first ".jsonl" is the UUID part.
    stem, marker, _rest = filename.partition(".jsonl")
    if not marker or not stem:
        return None
    return stem
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def _is_transient(filename: str) -> bool:
    """Tell whether a basename is a transient helper file.

    ``.jsonl.bak-<pid>`` backup snapshots are never transient. Otherwise a
    name is transient when it ends with one of ``_TRANSIENT_SUFFIXES`` or
    with ``.bak``.
    """
    if ".jsonl.bak-" in filename:
        return False
    # str.endswith accepts a tuple, which covers every suffix in one call.
    return filename.endswith(_TRANSIENT_SUFFIXES + (".bak",))
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def is_valid_query(session_id: str) -> Tuple[bool, str]:
    """Validate a session-id query before it is used for file lookup.

    Returns ``(True, "")`` for an acceptable query. Otherwise returns
    ``(False, message)`` where ``message`` explains the rejection: empty
    input, fewer than ``MIN_PREFIX_LEN`` characters, or characters other
    than hexadecimal digits and hyphens.
    """
    if not session_id:
        return False, "session id 不能为空"

    if len(session_id) < MIN_PREFIX_LEN:
        too_short = (
            f"session id 太短('{session_id}' 只有 {len(session_id)} 字符),"
            f"至少需要 {MIN_PREFIX_LEN} 位 UUID 前缀"
        )
        return False, too_short

    if _UUID_CHAR.match(session_id) is None:
        return False, f"session id 含非法字符(仅允许十六进制和连字符): '{session_id}'"

    return True, ""
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def resolve(
    session_id: str,
    base_dir: str = paths.SESSIONS_BASE,
    agent: Optional[str] = None,
    include_transient: bool = False,
) -> Tuple[List[Tuple[str, str]], List[str]]:
    """Resolve a UUID or UUID prefix to its on-disk session files.

    Args:
        session_id: Full UUID or prefix; only names starting with it match.
        base_dir: Directory whose children are the per-agent directories.
        agent: When given, only ``<base_dir>/<agent>`` is searched.
        include_transient: Keep files that ``_is_transient`` flags.

    Returns:
        ``(files, candidates)``:

        - ``files``: ``[(abs_path, state), ...]`` for the resolved session,
          ordered by lifecycle priority (active first) and then by path.
          Empty when nothing matches or the query is ambiguous.
        - ``candidates``: the sorted full UUIDs when more than one distinct
          session shares the prefix; otherwise empty.
    """
    if agent:
        search_roots = [os.path.join(base_dir, agent)]
    else:
        search_roots = sorted(glob.glob(os.path.join(base_dir, "*")))

    grouped: Dict[str, List[Tuple[str, str]]] = {}
    for root in search_roots:
        session_dir = os.path.join(root, "sessions")
        if not os.path.isdir(session_dir):
            continue
        try:
            names = os.listdir(session_dir)
        except OSError:
            # Unreadable directory: skip it and keep searching the others.
            continue
        for name in names:
            if not name.startswith(session_id):
                continue
            owner = _session_uuid_of(name)
            if owner is None:
                continue
            if _is_transient(name) and not include_transient:
                continue
            candidate = os.path.join(session_dir, name)
            if os.path.isfile(candidate):
                grouped.setdefault(owner, []).append((candidate, classify_state(name)))

    if len(grouped) > 1:
        # Ambiguous prefix: report the competing UUIDs instead of files.
        return [], sorted(grouped)
    if not grouped:
        return [], []

    (matches,) = grouped.values()
    rank = {"active": 0, "lock": 1, "deleted": 2, "reset": 3, "backup": 4, "unknown": 9}
    matches.sort(key=lambda item: (rank.get(item[1], 9), item[0]))
    return matches, []
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
def recent_session_ids(
    base_dir: str = paths.SESSIONS_BASE,
    limit: int = 5,
) -> List[str]:
    """Return the UUIDs of the most recently modified active session files.

    Only files named exactly ``<uuid>.jsonl`` count. Directories that cannot
    be listed and files whose mtime cannot be read are skipped. At most
    ``limit`` UUIDs are returned, newest first.
    """
    stamped: List[Tuple[float, str]] = []
    for agent_dir in glob.glob(os.path.join(base_dir, "*")):
        session_dir = os.path.join(agent_dir, "sessions")
        if not os.path.isdir(session_dir):
            continue
        try:
            names = os.listdir(session_dir)
        except OSError:
            continue
        for name in names:
            if not name.endswith(".jsonl"):
                continue
            owner = _session_uuid_of(name)
            # Keep only the plain active file, not "<uuid>.jsonl.x.jsonl" oddities.
            if owner is None or name != f"{owner}.jsonl":
                continue
            try:
                modified = os.path.getmtime(os.path.join(session_dir, name))
            except OSError:
                continue
            stamped.append((modified, owner))

    newest_first = sorted(stamped, reverse=True)
    return [owner for _, owner in newest_first[:limit]]
|
package/ocdiag/timeutil.py
CHANGED
|
@@ -37,17 +37,6 @@ def fmt_duration(sec) -> str:
|
|
|
37
37
|
return f"{s/3600:.1f}h"
|
|
38
38
|
|
|
39
39
|
|
|
40
|
-
def fmt_duration_ms(ms) -> str:
|
|
41
|
-
if ms is None:
|
|
42
|
-
return "?"
|
|
43
|
-
s = float(ms) / 1000.0
|
|
44
|
-
if s < 60:
|
|
45
|
-
return f"{s:.1f}s"
|
|
46
|
-
if s < 3600:
|
|
47
|
-
return f"{s/60:.1f}min"
|
|
48
|
-
return f"{s/3600:.1f}h"
|
|
49
|
-
|
|
50
|
-
|
|
51
40
|
def fmt_age(ms_delta) -> str:
|
|
52
41
|
s = abs(float(ms_delta)) / 1000
|
|
53
42
|
if s < 60:
|
package/ocdiag/tokens.py
CHANGED
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "openclaw-diag-cli",
|
|
3
|
-
"version": "0.1.3",
|
|
4
|
-
"description": "OpenClaw
|
|
3
|
+
"version": "0.2.2",
|
|
4
|
+
"description": "OpenClaw observer-only diagnostic CLI. Zero-dependency Python scripts wrapped in Node for npx-friendly install.",
|
|
5
5
|
"keywords": [
|
|
6
6
|
"openclaw",
|
|
7
7
|
"diagnostic",
|
|
@@ -4,16 +4,17 @@
|
|
|
4
4
|
from __future__ import annotations
|
|
5
5
|
|
|
6
6
|
import argparse
|
|
7
|
-
import glob
|
|
8
7
|
import json
|
|
9
8
|
import os
|
|
10
9
|
import sys
|
|
10
|
+
from datetime import datetime, timezone
|
|
11
11
|
from pathlib import Path
|
|
12
|
-
from typing import
|
|
12
|
+
from typing import Any, Dict, List, Optional, TextIO, Tuple
|
|
13
13
|
|
|
14
14
|
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
|
|
15
15
|
|
|
16
|
-
from ocdiag import paths
|
|
16
|
+
from ocdiag import paths, sessions
|
|
17
|
+
from ocdiag.sensitive import sanitize_text
|
|
17
18
|
|
|
18
19
|
|
|
19
20
|
DEFAULT_BASE_DIR = paths.SESSIONS_BASE
|
|
@@ -28,38 +29,6 @@ def human_size(n: int) -> str:
|
|
|
28
29
|
return f"{n:.1f} PB"
|
|
29
30
|
|
|
30
31
|
|
|
31
|
-
def classify_state(filename: str) -> str:
|
|
32
|
-
if filename.endswith(".jsonl"):
|
|
33
|
-
return "active"
|
|
34
|
-
if ".jsonl.deleted." in filename:
|
|
35
|
-
return "deleted"
|
|
36
|
-
if ".jsonl.reset." in filename:
|
|
37
|
-
return "reset"
|
|
38
|
-
if ".jsonl.bak-" in filename:
|
|
39
|
-
return "backup"
|
|
40
|
-
return "unknown"
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
def find_session_files(session_id, base_dir=DEFAULT_BASE_DIR, agent=None):
|
|
44
|
-
if agent:
|
|
45
|
-
agent_dirs = [os.path.join(base_dir, agent)]
|
|
46
|
-
else:
|
|
47
|
-
agent_dirs = sorted(glob.glob(os.path.join(base_dir, "*")))
|
|
48
|
-
found = []
|
|
49
|
-
for agent_dir in agent_dirs:
|
|
50
|
-
sessions_dir = os.path.join(agent_dir, "sessions")
|
|
51
|
-
if not os.path.isdir(sessions_dir):
|
|
52
|
-
continue
|
|
53
|
-
pattern = os.path.join(sessions_dir, f"{session_id}.jsonl*")
|
|
54
|
-
for path in sorted(glob.glob(pattern)):
|
|
55
|
-
name = os.path.basename(path)
|
|
56
|
-
if ".trajectory" in name:
|
|
57
|
-
continue
|
|
58
|
-
state = classify_state(name)
|
|
59
|
-
found.append((path, state))
|
|
60
|
-
return found
|
|
61
|
-
|
|
62
|
-
|
|
63
32
|
def stream_records(path):
|
|
64
33
|
with open(path, "r", encoding="utf-8", errors="replace") as f:
|
|
65
34
|
for i, line in enumerate(f, start=1):
|
|
@@ -85,23 +54,54 @@ def write_header(out, path, state):
|
|
|
85
54
|
out.write(SEPARATOR + "\n\n")
|
|
86
55
|
|
|
87
56
|
|
|
88
|
-
def
|
|
57
|
+
def _sanitize_record(obj):
    """Scrub the free-form text fields of one session record, in place.

    Only ``message.content`` (a string, or a list of parts carrying ``text``
    / ``content`` strings) and the message-level ``text`` / ``summary``
    strings are passed through ``sanitize_text``. Everything else in the
    record is left untouched. The same object is returned; non-dict input
    comes back unchanged.
    """
    message = obj.get("message") if isinstance(obj, dict) else None
    if not isinstance(message, dict):
        return obj

    body = message.get("content")
    if isinstance(body, str):
        message["content"] = sanitize_text(body)
    elif isinstance(body, list):
        for part in body:
            if not isinstance(part, dict):
                continue
            for field in ("text", "content"):
                value = part.get(field)
                if isinstance(value, str):
                    part[field] = sanitize_text(value)

    for field in ("text", "summary"):
        value = message.get(field)
        if isinstance(value, str):
            message[field] = sanitize_text(value)
    return obj
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def extract_file(path, state, out, pretty=True, type_filter=None, sanitize=True):
    """Write every record of one session file to ``out`` in readable form.

    A header is written first, then one ``--- Record N [...] ---`` section
    per record. Unparsable lines are emitted as parse-error sections and are
    counted. Records whose type is not in ``type_filter`` (when given) are
    skipped. With ``sanitize`` on, text is scrubbed before being written.

    Returns the number of sections written.
    """
    write_header(out, path, state)
    count = 0
    for line_no, obj, raw, err in stream_records(path):
        if err is not None:
            shown = sanitize_text(raw) if sanitize else raw
            out.write(f"--- Record {line_no} [PARSE ERROR: {err}] ---\n")
            out.write(shown + "\n\n")
            count += 1
            continue

        rtype = obj.get("type", "?") if isinstance(obj, dict) else "?"
        if type_filter is not None and rtype not in type_filter:
            continue

        out.write(f"--- Record {line_no} [type: {rtype}] ---\n")
        if sanitize:
            obj = _sanitize_record(obj)

        if pretty:
            body = json.dumps(obj, indent=2, ensure_ascii=False)
        elif sanitize:
            # The raw line would bypass scrubbing, so re-serialize instead.
            body = json.dumps(obj, ensure_ascii=False)
        else:
            body = raw
        out.write(body)
        out.write("\n\n")
        count += 1
    return count
|
|
@@ -109,7 +109,23 @@ def extract_file(path, state, out, pretty=True, type_filter=None):
|
|
|
109
109
|
|
|
110
110
|
def summarize_file(path, state, out):
    """Write a text summary of one session file to ``out``.

    After the header, the output lists the total record count, the
    parse-error count (only when non-zero), per-type counts in descending
    order, and the time range when at least one bound is known.
    """
    write_header(out, path, state)
    info = _collect_summary(path, sanitize=False)

    out.write(f"Total records: {info['total_records']}\n")
    errors = info["parse_errors"]
    if errors:
        out.write(f"Parse errors: {errors}\n")

    out.write("By type:\n")
    # Stable sort: types with equal counts keep their first-seen order.
    for rtype, count in sorted(info["by_type"].items(), key=lambda kv: -kv[1]):
        out.write(f" {rtype}: {count}\n")

    start = info["time_range"]["start"]
    end = info["time_range"]["end"]
    if start or end:
        out.write(f"Time range: {start or '?'} → {end or '?'}\n")
    out.write("\n")
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
def _collect_summary(path: str, sanitize: bool = True) -> Dict[str, Any]:
|
|
127
|
+
"""Walk one file and produce a summary block (used by text + JSON mode)."""
|
|
128
|
+
by_type: Dict[str, int] = {}
|
|
113
129
|
total = 0
|
|
114
130
|
earliest: Optional[str] = None
|
|
115
131
|
latest: Optional[str] = None
|
|
@@ -120,25 +136,40 @@ def summarize_file(path, state, out):
|
|
|
120
136
|
parse_errors += 1
|
|
121
137
|
continue
|
|
122
138
|
if not isinstance(obj, dict):
|
|
123
|
-
|
|
139
|
+
by_type["<non-object>"] = by_type.get("<non-object>", 0) + 1
|
|
124
140
|
continue
|
|
125
141
|
rtype = obj.get("type", "<no-type>")
|
|
126
|
-
|
|
142
|
+
by_type[rtype] = by_type.get(rtype, 0) + 1
|
|
127
143
|
ts = obj.get("timestamp")
|
|
128
144
|
if isinstance(ts, str):
|
|
129
145
|
if earliest is None or ts < earliest:
|
|
130
146
|
earliest = ts
|
|
131
147
|
if latest is None or ts > latest:
|
|
132
148
|
latest = ts
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
149
|
+
return {
|
|
150
|
+
"total_records": total,
|
|
151
|
+
"parse_errors": parse_errors,
|
|
152
|
+
"by_type": by_type,
|
|
153
|
+
"time_range": {"start": earliest, "end": latest},
|
|
154
|
+
}
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
def _collect_records(path: str, type_filter, sanitize: bool) -> List[Dict]:
    """Collect the records of one session file for JSON output.

    Args:
        path: Session file to read via ``stream_records``.
        type_filter: Optional collection of record types to keep; ``None``
            keeps every type. Parse errors and non-object values are always
            kept.
        sanitize: When true, message text is scrubbed with
            ``_sanitize_record`` and the raw text of unparsable lines is
            scrubbed with ``sanitize_text``.

    Returns:
        A list with one entry per kept line: the record dict itself,
        ``{"line", "value"}`` for non-object JSON values, or
        ``{"line", "parse_error", "raw"}`` for lines that failed to parse.
    """
    records: List[Dict] = []
    for line_no, obj, raw, err in stream_records(path):
        if err is not None:
            # Unparsable lines can still hold pasted secrets; scrub them the
            # same way extract_file does so "sanitized": true stays honest.
            shown = sanitize_text(raw) if sanitize else raw
            records.append({"line": line_no, "parse_error": err, "raw": shown})
            continue
        if not isinstance(obj, dict):
            records.append({"line": line_no, "value": obj})
            continue
        rtype = obj.get("type", "?")
        if type_filter is not None and rtype not in type_filter:
            continue
        if sanitize:
            obj = _sanitize_record(obj)
        records.append(obj)
    return records
|
|
142
173
|
|
|
143
174
|
|
|
144
175
|
def list_files(files, out):
|
|
@@ -176,32 +207,118 @@ def select_files(files, extract_all, _out):
|
|
|
176
207
|
return []
|
|
177
208
|
|
|
178
209
|
|
|
210
|
+
def _resolve_or_die(session_id: str, base_dir: str, agent: Optional[str],
                    include_transient: bool) -> List[Tuple[str, str]]:
    """Resolve a session query to its files or exit with a diagnostic.

    Exits with status 2 when the query is malformed, and with status 1 when
    the prefix is ambiguous or nothing matches. In the not-found case the
    most recent session ids are printed as suggestions. All messages go to
    stderr. On success the ``(path, state)`` list is returned.
    """
    report = sys.stderr.write

    valid, reason = sessions.is_valid_query(session_id)
    if not valid:
        report(f"Error: {reason}\n")
        sys.exit(2)

    files, candidates = sessions.resolve(
        session_id,
        base_dir=base_dir,
        agent=agent,
        include_transient=include_transient,
    )

    if candidates:
        report(f"Error: 前缀 '{session_id}' 匹配多个 session(请补长前缀):\n")
        for sid in candidates:
            report(f" {sid}\n")
        sys.exit(1)

    if files:
        return files

    agent_note = f" agent={agent}" if agent else ""
    report(f"Error: 找不到 session '{session_id}'(在 {base_dir} 下)" + agent_note + "\n")
    suggestions = sessions.recent_session_ids(base_dir, limit=5)
    if suggestions:
        report(" 最近的 5 个 session:\n")
        for sid in suggestions:
            report(f" {sid}\n")
    report(" 提示:完整 UUID 或前缀(至少 8 位)都可。\n")
    sys.exit(1)
    return files
|
|
241
|
+
|
|
242
|
+
|
|
243
|
+
def _emit_json(session_id: str, selected: List[Tuple[str, str]],
               out_fp: TextIO, summary_only: bool, type_filter,
               sanitize: bool) -> None:
    """Write the selected session files to ``out_fp`` as one JSON document.

    Each file contributes ``path``, ``state`` and ``size_bytes`` plus either
    its ``summary`` (``summary_only``) or its ``records``. In summary mode a
    top-level ``summary`` aggregates record totals, per-type counts and the
    overall time range across all files. The document also carries the
    session id, a UTC ``generated_at`` stamp and the ``sanitized`` flag.
    """
    file_entries: List[Dict[str, Any]] = []
    total_records = 0
    type_counts: Dict[str, int] = {}
    first_ts: Optional[str] = None
    last_ts: Optional[str] = None

    for path, state in selected:
        try:
            size = os.path.getsize(path)
        except OSError:
            # Size is informational only; report 0 when it cannot be read.
            size = 0
        entry: Dict[str, Any] = {"path": path, "state": state, "size_bytes": size}

        if not summary_only:
            entry["records"] = _collect_records(path, type_filter, sanitize=sanitize)
            file_entries.append(entry)
            continue

        summary = _collect_summary(path, sanitize=sanitize)
        entry["summary"] = summary
        total_records += summary["total_records"]
        for rtype, count in summary["by_type"].items():
            type_counts[rtype] = type_counts.get(rtype, 0) + count
        span = summary["time_range"]
        start, end = span["start"], span["end"]
        if start and (first_ts is None or start < first_ts):
            first_ts = start
        if end and (last_ts is None or end > last_ts):
            last_ts = end
        file_entries.append(entry)

    payload: Dict[str, Any] = {
        "session_id": session_id,
        "files": file_entries,
        "generated_at": datetime.now(tz=timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
        "sanitized": sanitize,
    }
    if summary_only:
        payload["summary"] = {
            "total_records": total_records,
            "by_type": type_counts,
            "time_range": {"start": first_ts, "end": last_ts},
        }
    out_fp.write(json.dumps(payload, ensure_ascii=False, indent=2))
    out_fp.write("\n")
|
|
290
|
+
|
|
291
|
+
|
|
179
292
|
def main() -> int:
|
|
180
293
|
p = argparse.ArgumentParser(
|
|
294
|
+
prog=os.environ.get("OPENCLAW_DIAG_PROG") or None,
|
|
181
295
|
description="Extract OpenClaw session JSONL files into human-readable format.",
|
|
182
296
|
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
|
|
183
297
|
)
|
|
184
|
-
p.add_argument("session_id", help="Session UUID
|
|
298
|
+
p.add_argument("session_id", help="Session UUID (full or 8+ char prefix)")
|
|
185
299
|
p.add_argument("-o", "--output", help="Write output to FILE instead of stdout")
|
|
186
300
|
p.add_argument("-a", "--all", action="store_true",
|
|
187
|
-
help="Extract all versions
|
|
188
|
-
p.add_argument("--list", action="store_true",
|
|
301
|
+
help="Extract all versions (active + reset + deleted + backup + lock)")
|
|
302
|
+
p.add_argument("--list", action="store_true",
|
|
303
|
+
help="List all matching files (incl. .lock); do not extract")
|
|
189
304
|
p.add_argument("--agent", help="Limit search to specific agent directory")
|
|
190
305
|
p.add_argument("--base-dir", default=DEFAULT_BASE_DIR, help="Override base directory")
|
|
191
306
|
p.add_argument("--no-pretty", action="store_true", help="Output raw JSON lines")
|
|
192
307
|
p.add_argument("--types", help="Filter by record type (comma-separated, e.g. 'message,toolCall')")
|
|
193
308
|
p.add_argument("--summary", action="store_true",
|
|
194
309
|
help="Show record-count summary instead of full extraction")
|
|
310
|
+
p.add_argument("--json", action="store_true",
|
|
311
|
+
help="Emit structured JSON (compatible with state collectors' --json)")
|
|
312
|
+
p.add_argument("--unmask", action="store_true",
|
|
313
|
+
help="Disable default sanitization of secret-shaped substrings "
|
|
314
|
+
"in message content (off = scrubbed)")
|
|
195
315
|
args = p.parse_args()
|
|
196
316
|
|
|
197
|
-
files
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
+ "\n"
|
|
203
|
-
)
|
|
204
|
-
return 1
|
|
317
|
+
# --list and --all see lock files; default mode hides them so non-interactive
|
|
318
|
+
# callers (cron, jq pipes) don't trip on a transient .jsonl.lock sibling.
|
|
319
|
+
include_transient = bool(args.all or args.list)
|
|
320
|
+
files = _resolve_or_die(args.session_id, args.base_dir, args.agent,
|
|
321
|
+
include_transient=include_transient)
|
|
205
322
|
|
|
206
323
|
if args.list:
|
|
207
324
|
list_files(files, sys.stdout)
|
|
@@ -226,12 +343,18 @@ def main() -> int:
|
|
|
226
343
|
out_fp = sys.stdout
|
|
227
344
|
|
|
228
345
|
try:
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
346
|
+
if args.json:
|
|
347
|
+
_emit_json(args.session_id, selected, out_fp,
|
|
348
|
+
summary_only=args.summary,
|
|
349
|
+
type_filter=type_filter,
|
|
350
|
+
sanitize=not args.unmask)
|
|
351
|
+
else:
|
|
352
|
+
for path, state in selected:
|
|
353
|
+
if args.summary:
|
|
354
|
+
summarize_file(path, state, out_fp)
|
|
355
|
+
else:
|
|
356
|
+
extract_file(path, state, out_fp, pretty=not args.no_pretty,
|
|
357
|
+
type_filter=type_filter, sanitize=not args.unmask)
|
|
235
358
|
except BrokenPipeError:
|
|
236
359
|
try:
|
|
237
360
|
sys.stdout.flush()
|