@zhushanwen/pi-evolve-daily 0.1.0 → 0.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md ADDED
@@ -0,0 +1,47 @@
1
+ # evolve-daily
2
+
3
+ 每日进化数据采集器 — 每天首次 session 自动运行 Python 分析器,生成使用报告。
4
+
5
+ ## 功能
6
+
7
+ - **自动采集**:每天首次启动 Pi 时自动运行 `analyze.py` 分析 session 数据
8
+ - **JSON 报告**:输出到 `~/.pi/agent/evolution-data/daily-reports/YYYY-MM-DD.json`
9
+ - **配套 skills**:内置 `/evolve`、`/evolve-apply`、`/evolve-report` 三个 skill
10
+
11
+ ## 安装
12
+
13
+ ```bash
14
+ # symlink 方式(开发推荐)
15
+ ln -s /path/to/xyz-pi-extensions-workspace/main/packages/evolve-daily \
16
+ ~/.pi/agent/extensions/evolve-daily
17
+
18
+ # npm 方式(正式)
19
+ pi install npm:@zhushanwen/pi-evolve-daily
20
+ ```
21
+
22
+ ## 使用
23
+
24
+ 安装后自动生效,无需手动操作。
25
+
26
+ | Skill | 说明 |
27
+ |-------|------|
28
+ | `/evolve` | 分析使用模式,生成进化建议 |
29
+ | `/evolve-apply` | 应用/跳过/回滚进化建议 |
30
+ | `/evolve-report` | 查看每日报告和使用统计 |
31
+
32
+ ## 依赖
33
+
34
+ - Python 3 + `analyze.py`(位于 `~/.pi/agent/scripts/pi-session-analyzer/`)
35
+
36
+ ## 文件结构
37
+
38
+ ```
39
+ evolve-daily/
40
+ ├── index.ts
41
+ ├── src/
42
+ │ └── index.ts # 入口 — session_start 事件中触发分析
43
+ └── skills/
44
+ ├── evolve/
45
+ ├── evolve-apply/
46
+ └── evolve-report/
47
+ ```
package/package.json CHANGED
@@ -1,17 +1,33 @@
1
1
  {
2
2
  "name": "@zhushanwen/pi-evolve-daily",
3
- "version": "0.1.0",
3
+ "version": "0.1.4",
4
4
  "description": "Daily evolution data collector — runs Python analyzer on first session of the day.",
5
+ "type": "module",
5
6
  "main": "src/index.ts",
7
+ "pi": {
8
+ "extensions": [
9
+ "./src/index.ts"
10
+ ],
11
+ "skills": [
12
+ "./skills"
13
+ ]
14
+ },
15
+ "keywords": [
16
+ "pi-package"
17
+ ],
6
18
  "files": [
7
19
  "src/",
8
20
  "index.ts",
9
- "skills/"
21
+ "skills/",
22
+ "scripts/**/*.py"
10
23
  ],
11
24
  "peerDependencies": {
12
25
  "@mariozechner/pi-coding-agent": "*"
13
26
  },
14
27
  "scripts": {
15
28
  "typecheck": "npx tsc --noEmit"
29
+ },
30
+ "devDependencies": {
31
+ "@types/node": "^24.0.0"
16
32
  }
17
33
  }
@@ -0,0 +1,186 @@
1
+ """analyze.py — CLI 入口,编排 parser → extractors → miner → reporter pipeline。"""
2
+
3
+ from __future__ import annotations
4
+
5
+ import argparse
6
+ import json
7
+ import random
8
+ import sys
9
+ from datetime import datetime, timezone
10
+ from pathlib import Path
11
+
12
+ # 将脚本目录加入 sys.path 以支持直接运行
13
+ _SCRIPT_DIR = Path(__file__).resolve().parent
14
+ sys.path.insert(0, str(_SCRIPT_DIR))
15
+
16
+ from config import SESSIONS_DIR, REPORTS_DIR # type: ignore[import-not-found]
17
+ from parser import parse_all_sessions # type: ignore[import-not-found]
18
+ from extractors.tools import analyze_tool_usage # type: ignore[import-not-found]
19
+ from extractors.tokens import analyze_token_usage # type: ignore[import-not-found]
20
+ from extractors.errors import analyze_errors # type: ignore[import-not-found]
21
+ from extractors.users import analyze_user_patterns # type: ignore[import-not-found]
22
+ from extractors.skills import analyze_skill_usage # type: ignore[import-not-found]
23
+ from extractors.cross_project import analyze_cross_project # type: ignore[import-not-found]
24
+ from extractors.satisfaction import analyze_satisfaction # type: ignore[import-not-found]
25
+ from extractors.skill_state import analyze_skill_state # type: ignore[import-not-found]
26
+ from miner import mine_patterns # type: ignore[import-not-found]
27
+ from reporter import to_markdown, to_json_string # type: ignore[import-not-found]
28
+
29
+ # Extractor 失败时的空结果降级
30
+ _EMPTY_TOOL = {"total_calls": 0, "by_tool": {}, "edit_retry_rate": 0,
31
+ "duplicate_reads": [], "bash_command_types": {}, "tool_sequences": []}
32
+ _EMPTY_TOKEN = {"total_input": 0, "total_output": 0, "total_cache_read": 0,
33
+ "by_project": [], "by_model": [], "hotspots": [], "cost_total": 0}
34
+ _EMPTY_ERROR = {"total_errors": 0, "by_tool": {}, "bash_failure_rate": 0,
35
+ "edit_match_failure_rate": 0, "top_error_patterns": [],
36
+ "self_correction_rate": 0, "by_project": [], "failure_refs": []}
37
+ _EMPTY_USER = {"total_user_messages": 0, "avg_per_session": 0,
38
+ "corrections": {"total": 0, "by_keyword": {}},
39
+ "repeated_requests": [], "supplementary_instructions": {"total": 0}}
40
+ _EMPTY_SKILL = {"installed_skills": 0, "triggered_skills": {}, "never_triggered": [],
41
+ "skill_file_sizes": {}, "total_skill_reads": 0, "by_project": {}}
42
+ _EMPTY_CROSS = {"project_count": 0, "projects": [],
43
+ "common_tool_sequences": [], "project_type_distribution": {}}
44
+ _EMPTY_SAT = {"total_sessions": 0, "single_turn_completion_rate": 0,
45
+ "avg_turns_per_session": 0, "avg_tool_calls_per_session": 0,
46
+ "session_duration_stats": {}, "by_project": []}
47
+ _EMPTY_SKILL_STATE = {"total_tracked": 0, "unique_skills": 0,
48
+ "by_skill": {}, "slow_skills": [], "error_skills": []}
49
+
50
+ # users extractor 文本聚类在大 session 集上的性能限制
51
+ _USERS_EXTRACTOR_SESSION_LIMIT = 200
52
+
53
+
54
+ def _build_argparser() -> argparse.ArgumentParser:
55
+ p = argparse.ArgumentParser(
56
+ description="Pi Session Analyzer — 离线分析 Pi Agent session 数据",
57
+ )
58
+ p.add_argument("--since", default="7d", help="起始时间 (ISO 格式或 Nd,默认 7d)")
59
+ p.add_argument("--until", default=None, help="结束时间 (ISO 格式,默认 now)")
60
+ p.add_argument("--project", default=None, help="项目名过滤 (子串匹配目录名)")
61
+ p.add_argument("--sample", type=int, default=None, help="抽样模式: 随机取 N 个 session")
62
+ p.add_argument("--output", default=None, help="输出文件路径 (默认 stdout)")
63
+ p.add_argument("--format", choices=["markdown", "json"], default="markdown",
64
+ dest="fmt", help="输出格式 (默认 markdown)")
65
+ p.add_argument("--verbose", action="store_true", help="打印进度信息到 stderr")
66
+ return p
67
+
68
+
69
+ def _verbose(msg: str, verbose: bool) -> None:
70
+ if verbose:
71
+ print(f"[analyze] {msg}", file=sys.stderr)
72
+
73
+
74
+ def _safe_run(label: str, fn, fallback):
75
+ """运行 extractor,失败时打印 warning 并返回空结果。"""
76
+ try:
77
+ return fn()
78
+ except Exception as exc:
79
+ print(f"[analyze] Warning: {label} extractor 失败: {exc}", file=sys.stderr)
80
+ return fallback
81
+
82
+
83
def _resolve_sessions(args, verbose: bool) -> tuple[list, bool, int | None]:
    """Parse the sessions selected by the CLI arguments and optionally sample them.

    Args:
        args: Parsed CLI namespace; reads ``since``, ``until``, ``project``
            and ``sample``.
        verbose: Whether progress messages are printed to stderr.

    Returns:
        ``(sessions, is_sample, sample_size)``. ``is_sample`` is True only
        when random sampling was actually applied; otherwise ``sample_size``
        is None and ``sessions`` is the full parsed list.
    """
    sessions = parse_all_sessions(since=args.since, until=args.until, project=args.project)
    _verbose(f"解析完成: {len(sessions)} sessions", verbose)

    is_sample, sample_size = False, None
    requested = args.sample
    if requested is not None:
        if requested < 0:
            # random.sample() raises ValueError for a negative size; degrade
            # to a full analysis the same way an oversized request does.
            print(f"[analyze] Warning: --sample {requested} 无效 (必须 >= 0), "
                  "降级为全量分析", file=sys.stderr)
        elif requested > len(sessions):
            print(f"[analyze] Warning: --sample {requested} > 可用 sessions {len(sessions)}, "
                  "降级为全量分析", file=sys.stderr)
        else:
            is_sample, sample_size = True, requested
            sessions = random.sample(sessions, requested)
            _verbose(f"抽样: {sample_size} sessions", verbose)

    if not sessions:
        print("[analyze] 无匹配 session,输出空报告", file=sys.stderr)

    return sessions, is_sample, sample_size
103
+
104
+
105
+ def _build_session_time_map(sessions: list) -> dict[str, str]:
106
+ """建立 session_id → start_time 映射(供 miner DORMANT 时间判定)。"""
107
+ time_map: dict[str, str] = {}
108
+ for s in sessions:
109
+ if hasattr(s, "session_id") and hasattr(s, "start_time") and s.start_time:
110
+ time_map[s.session_id] = s.start_time
111
+ return time_map
112
+
113
+
114
def _run_extractors(sessions: list, verbose: bool) -> tuple[dict, ...]:
    """Run the eight extractors, each guarded by _safe_run.

    Returns:
        An 8-tuple in this fixed order: tools, tokens, errors, users, skills,
        cross_project, satisfaction, skill_state.
    """
    _verbose("运行 extractors...", verbose)

    def on_all(label, extractor, empty):
        # Run an extractor over the full session list with its own fallback.
        return _safe_run(label, lambda: extractor(sessions), empty)

    tools = on_all("tools", analyze_tool_usage, _EMPTY_TOOL)
    tokens = on_all("tokens", analyze_token_usage, _EMPTY_TOKEN)
    errors = on_all("errors", analyze_errors, _EMPTY_ERROR)

    # The users extractor's text clustering is slow on large session sets, so
    # it gets a random subset once the session count exceeds the limit.
    oversized = len(sessions) > _USERS_EXTRACTOR_SESSION_LIMIT
    users_input = (
        random.sample(sessions, _USERS_EXTRACTOR_SESSION_LIMIT) if oversized else sessions
    )
    if oversized:
        _verbose(f"Users extractor: 使用 {len(users_input)}/{len(sessions)} sessions (性能优化)",
                 verbose)
    users = _safe_run("users", lambda: analyze_user_patterns(users_input), _EMPTY_USER)

    skills = on_all("skills", analyze_skill_usage, _EMPTY_SKILL)
    cross = on_all("cross_project", analyze_cross_project, _EMPTY_CROSS)
    satisfaction = on_all("satisfaction", analyze_satisfaction, _EMPTY_SAT)
    skill_state = on_all("skill_state", analyze_skill_state, _EMPTY_SKILL_STATE)
    _verbose("Extractors 完成", verbose)

    return (tools, tokens, errors, users, skills, cross, satisfaction, skill_state)
139
+
140
+
141
+ def _write_output(text: str, output_path: str | None, verbose: bool) -> None:
142
+ """写入输出文件或 stdout。"""
143
+ if output_path:
144
+ out = Path(output_path)
145
+ out.parent.mkdir(parents=True, exist_ok=True)
146
+ out.write_text(text, encoding="utf-8")
147
+ _verbose(f"报告写入: {out}", verbose)
148
+ else:
149
+ print(text)
150
+
151
+
152
def main(argv: list[str] | None = None) -> None:
    """CLI entry point: parse sessions, run extractors, mine patterns, emit a report.

    Args:
        argv: Argument list for argparse; None lets argparse read sys.argv.

    Exits with status 1 when the sessions directory does not exist.
    """
    args = _build_argparser().parse_args(argv)
    chatty = args.verbose

    # Nothing can be analysed without the sessions directory.
    if not Path(SESSIONS_DIR).exists():
        print(f"错误: session 目录不存在: {SESSIONS_DIR}", file=sys.stderr)
        sys.exit(1)

    _verbose(f"解析 sessions (since={args.since}, until={args.until}, project={args.project})...",
             chatty)
    sessions, is_sample, sample_size = _resolve_sessions(args, chatty)
    time_map = _build_session_time_map(sessions)
    results = _run_extractors(sessions, chatty)

    # Without --until, label the report with today's UTC date (YYYY-MM-DD).
    if args.until:
        until_label = args.until
    else:
        until_label = datetime.now(timezone.utc).date().isoformat()

    _verbose("聚合分析...", chatty)
    # The first seven extractor results are positional; skill_state (the
    # eighth) is passed by keyword.
    *positional, skill_state = results
    aggregated = mine_patterns(
        *positional,
        skill_state=skill_state,
        is_sample=is_sample,
        sample_size=sample_size,
        total_sessions=len(sessions),
        since=args.since,
        until=until_label,
        session_time_map=time_map,
    )
    _verbose("聚合完成", chatty)

    render = to_json_string if args.fmt == "json" else to_markdown
    _write_output(render(aggregated), args.output, chatty)
183
+
184
+
185
+ if __name__ == "__main__":
186
+ main()
@@ -0,0 +1,50 @@
1
+ """配置:路径、阈值、常量。"""
2
+
3
+ import os
4
+ from pathlib import Path
5
+ from datetime import timedelta
6
+
7
# ── Paths ─────────────────────────────────────────────
PI_AGENT_DIR = Path(os.path.expanduser("~/.pi/agent"))
SESSIONS_DIR = PI_AGENT_DIR.joinpath("sessions")
EVOLUTION_DATA_DIR = PI_AGENT_DIR.joinpath("evolution-data")
REPORTS_DIR = EVOLUTION_DATA_DIR.joinpath("reports")
DAILY_DIR = EVOLUTION_DATA_DIR.joinpath("daily")

# ── Signal-extraction thresholds ──────────────────────
# Signal 1: tool usage
# Reading the same file more than this many times counts as a duplicate read.
DUPLICATE_READ_THRESHOLD = 3

# Signal 2: tokens
# Token consumption at or above this percentile is treated as a hotspot.
TOKEN_HOTSPOT_PERCENTILE = 90

# Signal 3: errors
ERROR_KEYWORDS = [
    "error",
    "fail",
    "failed",
    "exception",
    "Could not find the exact text",
    "ENOENT",
    "permission denied",
    "non-zero exit code",
]

# Signal 4: repeated / corrective user instructions
USER_CORRECTION_KEYWORDS = [
    "不对",
    "不要",
    "别",
    "取消",
    "错了",
    "不是这样",
    "no,",
    "wrong",
    "not like this",
    "don't",
    "重新",
    "重来",
    "换个",
    "换一种",
]

# Signal 5: skills
SKILLS_DIR = PI_AGENT_DIR.joinpath("skills")
SKILL_FILE_NAME = "SKILL.md"

# Signal 7: implicit satisfaction signals
# A session with at most this many user+assistant messages counts as
# completed in a single turn.
SINGLE_TURN_MAX_MESSAGES = 3

# ── Reporting ─────────────────────────────────────────
TOP_N_PROBLEMS = 10
TOP_N_PATTERNS = 10
TOP_N_SKILLS = 20

# ── Performance ───────────────────────────────────────
MAX_FILES_PARALLEL = 8  # number of files parsed in parallel
BATCH_SIZE = 50  # number of sessions processed per batch
@@ -0,0 +1,350 @@
1
+ """extract_context.py — 根据 failure_ref 从 JSONL 中提取完整上下文。
2
+
3
+ CLI 工具,供 evolve skill 通过 bash 调用:
4
+ python3 extract_context.py --session-id SID --tool-call-id TID [--context 5]
5
+
6
+ 也支持批量提取某个 error pattern 的典型案例:
7
+ python3 extract_context.py --pattern "Timeout" --from-report REPORT_JSON [--limit 2]
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import argparse
13
+ import json
14
+ import sys
15
+ from pathlib import Path
16
+
17
+ _SCRIPT_DIR = Path(__file__).resolve().parent
18
+ sys.path.insert(0, str(_SCRIPT_DIR))
19
+
20
+ from config import SESSIONS_DIR
21
+
22
+
23
+ # ── JSONL 定位 ────────────────────────────────────────
24
+
25
+ def _find_session_file(session_id: str) -> Path | None:
26
+ """根据 session_id 定位 JSONL 文件。session_id 嵌在文件名中。
27
+
28
+ 如果匹配到多个文件(罕见情况),按 mtime 降序取最新的。
29
+ """
30
+ if not SESSIONS_DIR.exists():
31
+ return None
32
+
33
+ candidates: list[Path] = []
34
+ for project_dir in SESSIONS_DIR.iterdir():
35
+ if not project_dir.is_dir():
36
+ continue
37
+ # 跳过可能的符号链接循环
38
+ if project_dir.is_symlink() and project_dir.is_dir():
39
+ continue
40
+ for jsonl_file in project_dir.glob(f"*{session_id}*.jsonl"):
41
+ candidates.append(jsonl_file)
42
+
43
+ if not candidates:
44
+ return None
45
+ if len(candidates) == 1:
46
+ return candidates[0]
47
+
48
+ # 多个匹配,按 mtime 降序取最新的
49
+ candidates.sort(key=lambda p: p.stat().st_mtime, reverse=True)
50
+ return candidates[0]
51
+
52
+
53
+ # ── JSONL 上下文提取 ──────────────────────────────────
54
+
55
+ def _extract_tool_call_from_content(content, tool_call_id: str) -> dict | None:
56
+ """从 assistant message 的 content list 中查找 toolCall。"""
57
+ if not isinstance(content, list):
58
+ return None
59
+ for item in content:
60
+ if (isinstance(item, dict)
61
+ and item.get("type") == "toolCall"
62
+ and item.get("id") == tool_call_id):
63
+ return {
64
+ "name": item.get("name", ""),
65
+ "arguments": item.get("arguments", {}),
66
+ }
67
+ return None
68
+
69
+
70
def extract_tool_call_context(
    jsonl_path: Path,
    tool_call_id: str,
    context_entries: int = 5,
) -> dict | None:
    """Extract the call, result and surrounding context for one tool call.

    Args:
        jsonl_path: Session JSONL file to read.
        tool_call_id: Id of the tool call to locate.
        context_entries: How many entries to inspect before and after the
            anchor entry.

    Returns:
        None when the file yields no entries or the id appears in neither a
        toolCall nor a tool result. Otherwise a dict shaped like:
        {
            "session_id": "...",
            "tool_call": {"name": "...", "arguments": {...}} or None,
            "tool_result": {"tool_name": "...", "is_error": bool,
                            "content": "..."} or None,
            "before_context": [...entry summaries...],
            "after_context": [...entry summaries...],
        }
    """
    # Tolerate JSONL lines that decode to non-objects (lists, numbers, ...).
    entries = [e for e in _parse_entries(jsonl_path) if isinstance(e, dict)]
    if not entries:
        return None

    session_id = _extract_session_id(entries)

    call_idx = None
    result_idx = None
    tool_call_info = None

    for i, entry in enumerate(entries):
        msg = entry.get("message")
        if not isinstance(msg, dict):
            # Non-message entries, or "message": null, carry nothing to match.
            continue
        role = msg.get("role", "")

        if role == "assistant":
            # Reuse the shared helper rather than duplicating the item scan.
            found = _extract_tool_call_from_content(msg.get("content", []), tool_call_id)
            if found is not None:
                call_idx, tool_call_info = i, found
        elif role in ("tool", "toolResult") and msg.get("toolCallId") == tool_call_id:
            result_idx = i

    if call_idx is None and result_idx is None:
        return None

    # Anchor on the result when present, otherwise on the call itself.
    anchor_idx = result_idx if result_idx is not None else call_idx

    tool_result_info = None
    if result_idx is not None:
        result_msg = entries[result_idx]["message"]
        tool_result_info = {
            "tool_name": result_msg.get("toolName", ""),
            "is_error": bool(result_msg.get("isError", False)),
            # Cap the result text so one huge output cannot bloat the report.
            "content": _extract_text(result_msg.get("content"))[:2000],
        }

    return {
        "session_id": session_id,
        "tool_call": tool_call_info,
        "tool_result": tool_result_info,
        "before_context": _extract_context_before(entries, anchor_idx, context_entries),
        "after_context": _extract_context_after(entries, anchor_idx + 1, context_entries),
    }
154
+
155
+
156
+ # ── 内部辅助 ──────────────────────────────────────────
157
+
158
+ def _parse_entries(jsonl_path: Path) -> list[dict]:
159
+ """解析 JSONL 文件的所有行。"""
160
+ entries = []
161
+ try:
162
+ with open(jsonl_path, "r", encoding="utf-8") as f:
163
+ for line in f:
164
+ line = line.strip()
165
+ if not line:
166
+ continue
167
+ try:
168
+ entries.append(json.loads(line))
169
+ except json.JSONDecodeError:
170
+ print(
171
+ f"[extract_context] Warning: 忽略损坏的 JSON 行: "
172
+ f"{line[:80]}...",
173
+ file=sys.stderr,
174
+ )
175
+ continue
176
+ except OSError:
177
+ pass
178
+ return entries
179
+
180
+
181
+ def _extract_session_id(entries: list[dict]) -> str:
182
+ """从 session 类型的 entry 中提取 session ID。"""
183
+ for entry in entries:
184
+ if entry.get("type") == "session":
185
+ return entry.get("id", "")
186
+ return ""
187
+
188
+
189
+ def _extract_text(content) -> str:
190
+ """从 message.content 提取文本。"""
191
+ if isinstance(content, str):
192
+ return content
193
+ if isinstance(content, list):
194
+ parts = []
195
+ for item in content:
196
+ if isinstance(item, dict) and item.get("type") == "text":
197
+ parts.append(item.get("text", ""))
198
+ return "\n".join(parts)
199
+ return ""
200
+
201
+
202
+ def _summarize_entry(entry: dict) -> dict | None:
203
+ """将单条 entry 转为摘要 dict。返回 None 如果是非 message 类型。"""
204
+ msg = entry.get("message", {})
205
+ role = msg.get("role", "")
206
+ entry_type = entry.get("type", "")
207
+
208
+ if entry_type != "message":
209
+ return None
210
+
211
+ summary: dict = {"role": role}
212
+
213
+ if role == "user":
214
+ text = _extract_text(msg.get("content"))
215
+ summary["text"] = text[:300]
216
+ elif role == "assistant":
217
+ content = msg.get("content", [])
218
+ if isinstance(content, list):
219
+ for item in content:
220
+ if isinstance(item, dict) and item.get("type") == "text":
221
+ summary["text"] = item.get("text", "")[:300]
222
+ break
223
+ # 列出 toolCalls(不包含完整 arguments)
224
+ tc_names = [
225
+ item.get("name", "")
226
+ for item in content
227
+ if isinstance(item, dict) and item.get("type") == "toolCall"
228
+ ]
229
+ if tc_names:
230
+ summary["tool_calls"] = tc_names
231
+ elif role in ("tool", "toolResult"):
232
+ summary["tool_name"] = msg.get("toolName", "")
233
+ summary["is_error"] = bool(msg.get("isError", False))
234
+ text = _extract_text(msg.get("content"))
235
+ summary["text"] = text[:200]
236
+
237
+ return summary
238
+
239
+
240
+ def _extract_context_before(entries: list[dict], anchor: int, n: int) -> list[dict]:
241
+ """提取 anchor 位置之前的 N 条上下文。"""
242
+ begin = max(0, anchor - n)
243
+ result = []
244
+ for i in range(begin, anchor):
245
+ summary = _summarize_entry(entries[i])
246
+ if summary is not None:
247
+ result.append(summary)
248
+ return result
249
+
250
+
251
+ def _extract_context_after(entries: list[dict], anchor: int, n: int) -> list[dict]:
252
+ """提取 anchor 位置之后的 N 条上下文。"""
253
+ end = min(len(entries), anchor + n)
254
+ result = []
255
+ for i in range(anchor, end):
256
+ summary = _summarize_entry(entries[i])
257
+ if summary is not None:
258
+ result.append(summary)
259
+ return result
260
+
261
+
262
+ # ── 批量提取 ──────────────────────────────────────────
263
+
264
def extract_pattern_cases(
    report_path: str,
    pattern: str,
    limit: int = 2,
) -> list[dict]:
    """Extract context for typical cases of one error pattern from a report.

    Reads ``error_stats.failure_refs`` from the report JSON, keeps the refs
    whose ``pattern`` equals ``pattern`` (cases not self-corrected first),
    and resolves up to ``limit`` of them to their session context.

    Args:
        report_path: Path to the report JSON file.
        pattern: Exact error pattern to match.
        limit: Maximum number of cases to return; values below 0 act as 0.

    Returns:
        One dict per selected ref: ``{"ref": ..., "context": ...}`` on
        success or ``{"ref": ..., "error": ...}`` on failure. When the report
        cannot be read or is not a JSON object, a single-element list
        ``[{"error": ...}]`` is returned.
    """
    try:
        with open(report_path, "r", encoding="utf-8") as f:
            report = json.load(f)
    except (OSError, json.JSONDecodeError) as e:
        return [{"error": f"无法读取报告 {report_path}: {e}"}]

    if not isinstance(report, dict):
        return [{"error": f"无法读取报告 {report_path}: 顶层不是 JSON 对象"}]

    # Each level may be missing or null in a partial report.
    error_stats = report.get("error_stats")
    refs = error_stats.get("failure_refs") if isinstance(error_stats, dict) else None
    if not isinstance(refs, list):
        refs = []
    matched = [r for r in refs if isinstance(r, dict) and r.get("pattern") == pattern]

    # Prefer cases that were not self-corrected (more useful to analyse).
    # bool() keeps the key comparable even if the stored value is null.
    matched.sort(key=lambda r: bool(r.get("self_corrected", False)))

    results = []
    # Clamp so a negative limit cannot turn into an "all but the last" slice.
    for ref in matched[:max(limit, 0)]:
        sid = ref.get("session_id", "")
        tcid = ref.get("tool_call_id", "")

        # Without a session id there is nothing to look up.
        jsonl_path = _find_session_file(sid) if sid else None
        if not jsonl_path:
            results.append({
                "ref": ref,
                "error": f"session file not found for {sid}",
            })
            continue

        ctx = extract_tool_call_context(jsonl_path, tcid)
        if ctx:
            results.append({"ref": ref, "context": ctx})
        else:
            results.append({
                "ref": ref,
                "error": f"tool_call_id {tcid} not found in session",
            })

    return results
305
+
306
+
307
+ # ── CLI ───────────────────────────────────────────────
308
+
309
def main() -> None:
    """CLI entry point for single-case and batch context extraction.

    Batch mode runs when both --pattern and --from-report are given;
    otherwise single mode requires --session-id and --tool-call-id. Results
    are printed to stdout as JSON; failures print to stderr and exit with
    status 1.
    """
    cli = argparse.ArgumentParser(
        description="从 Pi session JSONL 中提取 tool call 失败的完整上下文",
    )
    cli.add_argument("--session-id", help="Session ID (UUID)")
    cli.add_argument("--tool-call-id", help="Tool call ID")
    cli.add_argument("--context", type=int, default=5,
                     help="前后各取 N 条 entries 作为上下文 (默认 5)")
    cli.add_argument("--pattern", help="Error pattern,批量模式")
    cli.add_argument("--from-report", help="daily-report JSON 路径,批量模式用")
    cli.add_argument("--limit", type=int, default=2,
                     help="批量模式下每种 pattern 最多取几个案例 (默认 2)")
    opts = cli.parse_args()

    def emit(payload) -> None:
        # Keep non-ASCII text readable in the JSON output.
        print(json.dumps(payload, ensure_ascii=False, indent=2))

    def fail(*lines: str) -> None:
        for text in lines:
            print(text, file=sys.stderr)
        sys.exit(1)

    # Batch mode.
    if opts.pattern and opts.from_report:
        emit(extract_pattern_cases(opts.from_report, opts.pattern, opts.limit))
        return

    # Single-case mode.
    if not (opts.session_id and opts.tool_call_id):
        fail(
            "错误: 单条模式需要 --session-id 和 --tool-call-id",
            "批量模式需要 --pattern 和 --from-report",
        )

    session_file = _find_session_file(opts.session_id)
    if not session_file:
        fail(f"错误: 未找到 session {opts.session_id} 的 JSONL 文件")

    context = extract_tool_call_context(session_file, opts.tool_call_id, opts.context)
    if not context:
        fail(f"错误: 未在 session 中找到 tool_call_id={opts.tool_call_id}")

    emit(context)
347
+
348
+
349
+ if __name__ == "__main__":
350
+ main()
File without changes