@zhushanwen/pi-evolve-daily 0.1.4 → 0.1.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/analyzer/__init__.py +5 -0
- package/analyzer/analyze.py +203 -0
- package/analyzer/extractors/__init__.py +61 -0
- package/analyzer/extractors/compact.py +64 -0
- package/analyzer/extractors/context.py +139 -0
- package/analyzer/extractors/goal_quality.py +163 -0
- package/analyzer/extractors/subagent.py +112 -0
- package/analyzer/extractors/tool_errors.py +168 -0
- package/analyzer/extractors/tracker.py +124 -0
- package/analyzer/extractors/workflow.py +237 -0
- package/analyzer/rules/__init__.py +48 -0
- package/analyzer/rules/compact_early_trigger.py +30 -0
- package/analyzer/rules/compact_high_frequency.py +27 -0
- package/analyzer/rules/context_high_utilization.py +30 -0
- package/analyzer/rules/edit_match_failure.py +36 -0
- package/analyzer/rules/goal_low_completion.py +31 -0
- package/analyzer/rules/goal_low_evidence.py +47 -0
- package/analyzer/rules/goal_stall_frequent.py +30 -0
- package/analyzer/rules/low_self_correction.py +28 -0
- package/analyzer/rules/param_error_rate.py +29 -0
- package/analyzer/rules/subagent_failure_rate.py +28 -0
- package/analyzer/rules/subagent_high_retry.py +28 -0
- package/analyzer/rules/todo_high_abandon.py +30 -0
- package/analyzer/rules/workflow_gate_retry.py +35 -0
- package/analyzer/rules/workflow_slow_phase.py +40 -0
- package/package.json +15 -3
- package/src/index.ts +2 -2
- package/src/trackers/core.ts +4 -6
|
@@ -0,0 +1,203 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Evolve Daily Analyzer - 使用新的 extractors 和 rules 分析 session JSONL。
|
|
3
|
+
|
|
4
|
+
用法:
|
|
5
|
+
python3 analyze.py --since 1d --format json --output report.json
|
|
6
|
+
python3 analyze.py --input session.jsonl --format json --output report.json
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import argparse
|
|
10
|
+
import json
|
|
11
|
+
import sys
|
|
12
|
+
from datetime import datetime, timedelta
|
|
13
|
+
from pathlib import Path
|
|
14
|
+
from typing import Any
|
|
15
|
+
|
|
16
|
+
# 添加当前目录到 Python 路径,以便导入 extractors 和 rules
|
|
17
|
+
sys.path.insert(0, str(Path(__file__).parent))
|
|
18
|
+
|
|
19
|
+
from extractors import run_extractors
|
|
20
|
+
from rules import run_rules
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def load_sessions(since_days: int = 1, input_file: str | None = None) -> list[dict]:
|
|
24
|
+
"""加载 session JSONL 数据。
|
|
25
|
+
|
|
26
|
+
Args:
|
|
27
|
+
since_days: 加载最近 N 天的数据。
|
|
28
|
+
input_file: 指定输入文件路径(优先级高于 since_days)。
|
|
29
|
+
|
|
30
|
+
Returns:
|
|
31
|
+
session 列表。
|
|
32
|
+
"""
|
|
33
|
+
if input_file:
|
|
34
|
+
return _load_from_file(input_file)
|
|
35
|
+
|
|
36
|
+
# 从默认目录加载
|
|
37
|
+
sessions_dir = Path.home() / ".pi" / "agent" / "sessions"
|
|
38
|
+
if not sessions_dir.exists():
|
|
39
|
+
print(f"[evolve] Warning: Sessions directory not found: {sessions_dir}")
|
|
40
|
+
return []
|
|
41
|
+
|
|
42
|
+
cutoff = datetime.now() - timedelta(days=since_days)
|
|
43
|
+
sessions = []
|
|
44
|
+
|
|
45
|
+
for session_file in sessions_dir.glob("*.jsonl"):
|
|
46
|
+
try:
|
|
47
|
+
# 从文件名解析日期
|
|
48
|
+
file_date = datetime.fromisoformat(session_file.stem[:10])
|
|
49
|
+
if file_date < cutoff:
|
|
50
|
+
continue
|
|
51
|
+
except (ValueError, IndexError):
|
|
52
|
+
# 文件名不是日期格式,跳过
|
|
53
|
+
continue
|
|
54
|
+
|
|
55
|
+
session_data = _load_session_file(session_file)
|
|
56
|
+
if session_data:
|
|
57
|
+
sessions.append(session_data)
|
|
58
|
+
|
|
59
|
+
return sessions
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def _load_from_file(input_file: str) -> list[dict]:
|
|
63
|
+
"""从单个文件加载 session 数据。"""
|
|
64
|
+
path = Path(input_file)
|
|
65
|
+
if not path.exists():
|
|
66
|
+
print(f"[evolve] Warning: Input file not found: {input_file}")
|
|
67
|
+
return []
|
|
68
|
+
|
|
69
|
+
if path.suffix == ".jsonl":
|
|
70
|
+
session_data = _load_session_file(path)
|
|
71
|
+
return [session_data] if session_data else []
|
|
72
|
+
elif path.suffix == ".json":
|
|
73
|
+
try:
|
|
74
|
+
with open(path, "r", encoding="utf-8") as f:
|
|
75
|
+
data = json.load(f)
|
|
76
|
+
return [data] if isinstance(data, dict) else data
|
|
77
|
+
except Exception as e:
|
|
78
|
+
print(f"[evolve] Warning: Failed to load JSON file {input_file}: {e}")
|
|
79
|
+
return []
|
|
80
|
+
else:
|
|
81
|
+
print(f"[evolve] Warning: Unsupported file format: {path.suffix}")
|
|
82
|
+
return []
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def _load_session_file(file_path: Path) -> dict | None:
|
|
86
|
+
"""加载单个 JSONL session 文件。"""
|
|
87
|
+
try:
|
|
88
|
+
messages = []
|
|
89
|
+
with open(file_path, "r", encoding="utf-8") as f:
|
|
90
|
+
for line in f:
|
|
91
|
+
line = line.strip()
|
|
92
|
+
if line:
|
|
93
|
+
try:
|
|
94
|
+
messages.append(json.loads(line))
|
|
95
|
+
except json.JSONDecodeError:
|
|
96
|
+
continue
|
|
97
|
+
|
|
98
|
+
if not messages:
|
|
99
|
+
return None
|
|
100
|
+
|
|
101
|
+
return {
|
|
102
|
+
"session_id": file_path.stem,
|
|
103
|
+
"messages": messages,
|
|
104
|
+
"file_path": str(file_path),
|
|
105
|
+
}
|
|
106
|
+
except Exception as e:
|
|
107
|
+
print(f"[evolve] Warning: Failed to load session file {file_path}: {e}")
|
|
108
|
+
return None
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
def generate_report(sessions: list[dict], format: str = "json") -> dict:
    """Run every extractor and rule over the sessions and assemble the report.

    Args:
        sessions: Session dicts to analyze.
        format: Output format (json). The body does not branch on it.

    Returns:
        A dict with the generation timestamp, session count, raw extractor
        results, detected issues, and a per-severity issue summary.
    """
    # The current working directory is handed to the extractors as project_root.
    extractor_results = run_extractors(sessions, project_root=str(Path.cwd()))
    issues = run_rules(extractor_results)

    def _tally(level: str) -> int:
        """Count the issues whose "severity" equals level."""
        return len([issue for issue in issues if issue.get("severity") == level])

    severity_summary = {
        "total_issues": len(issues),
        "high_severity": _tally("high"),
        "medium_severity": _tally("medium"),
        "low_severity": _tally("low"),
    }

    return {
        "generated_at": datetime.now().isoformat(),
        "session_count": len(sessions),
        "extractors": extractor_results,
        "issues": issues,
        "summary": severity_summary,
    }
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
def main():
    """Command-line entry point: load sessions, analyze them, emit a JSON report.

    The report is written to ``--output`` when given, otherwise to stdout.
    All warnings, errors, and ``--verbose`` progress messages go to stderr so
    that stdout stays a clean, parseable JSON document.

    Exits with status 1 when ``--since`` is not of the form ``<int>`` or
    ``<int>d``.
    """

    def _log(message: str) -> None:
        """Emit a diagnostic line without touching the report stream."""
        print(message, file=sys.stderr)

    parser = argparse.ArgumentParser(description="Evolve Daily Analyzer")
    parser.add_argument("--since", type=str, default="1d", help="分析最近 N 天的数据(如 1d, 7d)")
    parser.add_argument("--input", type=str, help="指定输入文件路径")
    parser.add_argument("--format", type=str, default="json", choices=["json"], help="输出格式")
    parser.add_argument("--output", type=str, help="输出文件路径")
    parser.add_argument("--verbose", action="store_true", help="详细输出")

    args = parser.parse_args()

    # Parse "--since": strip exactly one trailing "d" (rstrip would also accept
    # malformed values such as "7dd").
    since_str = args.since.strip().lower().removesuffix("d")
    try:
        since_days = int(since_str)
    except ValueError:
        _log(f"[evolve] Error: Invalid --since value: {args.since}")
        sys.exit(1)

    if args.verbose:
        _log(f"[evolve] Loading sessions (since {since_days} days)...")
    sessions = load_sessions(since_days=since_days, input_file=args.input)

    if not sessions:
        _log("[evolve] Warning: No sessions found")
        # Still emit a well-formed, empty report so consumers need no special case.
        report = {
            "generated_at": datetime.now().isoformat(),
            "session_count": 0,
            "extractors": {},
            "issues": [],
            "summary": {
                "total_issues": 0,
                "high_severity": 0,
                "medium_severity": 0,
                "low_severity": 0,
            },
        }
    else:
        if args.verbose:
            _log(f"[evolve] Found {len(sessions)} sessions")
            _log("[evolve] Running extractors...")
        report = generate_report(sessions, format=args.format)

    if args.output:
        output_path = Path(args.output)
        output_path.parent.mkdir(parents=True, exist_ok=True)
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(report, f, ensure_ascii=False, indent=2)
        if args.verbose:
            _log(f"[evolve] Report saved to {args.output}")
    else:
        print(json.dumps(report, ensure_ascii=False, indent=2))


if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
"""Extractor 自动发现机制。
|
|
2
|
+
|
|
3
|
+
通过 pkgutil.iter_modules 自动发现 extractors/ 目录下的所有模块,
|
|
4
|
+
每个模块必须实现 extract(sessions: list[dict]) -> dict 接口。
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import pkgutil
|
|
8
|
+
import importlib
|
|
9
|
+
from typing import Protocol, runtime_checkable
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
@runtime_checkable
class BaseExtractor(Protocol):
    """Structural protocol for extractors.

    Any object exposing an ``extract`` member satisfies a runtime
    ``isinstance`` check against this protocol.
    """

    def extract(self, sessions: list[dict]) -> dict:
        """Return the statistics extracted from ``sessions``."""
        ...
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def discover_extractors() -> dict[str, object]:
    """Discover every extractor module in this package.

    Iterates the package's sub-modules, skips names starting with ``_``,
    imports each one, and keeps those exposing a callable ``extract``.
    A module that fails to import is reported on stderr and skipped.

    Returns:
        dict[str, module]: Mapping of module name to module object.
    """
    import sys

    extractors: dict[str, object] = {}
    for _importer, modname, _ispkg in pkgutil.iter_modules(__path__):
        if modname.startswith("_"):
            continue
        try:
            module = importlib.import_module(f".{modname}", __package__)
        except Exception as exc:
            # Plugin boundary: one broken extractor must not stop discovery.
            # Reported on stderr because stdout may carry the JSON report.
            print(
                f"[evolve] Warning: Failed to load extractor {modname}: {exc}",
                file=sys.stderr,
            )
            continue
        # Require a callable, not just an attribute named "extract"; otherwise
        # the module would be registered and only fail later when invoked.
        if callable(getattr(module, "extract", None)):
            extractors[modname] = module
    return extractors
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def run_extractors(sessions: list[dict], project_root: str = "") -> dict:
    """Run every discovered extractor independently and merge the results.

    An extractor that raises is reported on stderr and contributes an empty
    dict, so one failure never aborts the whole analysis.

    Args:
        sessions: Parsed session dicts.
        project_root: Project root path (optional). It is forwarded only to
            extractors whose ``extract`` declares a ``project_root`` parameter.

    Returns:
        Merged results keyed by ``"{extractor_name}_stats"``.
    """
    import inspect
    import sys

    results: dict = {}
    for name, extractor in discover_extractors().items():
        key = f"{name}_stats"
        try:
            extract_fn = extractor.extract  # type: ignore[attr-defined]
            # Pass project_root only when the extractor's signature accepts it.
            accepts_root = "project_root" in inspect.signature(extract_fn).parameters
            extra = {"project_root": project_root} if accepts_root else {}
            results[key] = extract_fn(sessions, **extra)
        except Exception as exc:
            # Plugin boundary: isolate failures. Reported on stderr because
            # stdout may carry the JSON report.
            print(f"[evolve] Warning: Extractor {name} failed: {exc}", file=sys.stderr)
            results[key] = {}
    return results
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
"""统计 session 中的 compactionSummary 消息。"""
|
|
2
|
+
|
|
3
|
+
from typing import Any
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def extract(sessions: list[dict]) -> dict:
    """Collect compaction statistics from the given sessions.

    A message counts as a compaction when its "type" is "compaction". The
    turn index of a compaction is approximated as message index // 2.

    Args:
        sessions: Parsed session dicts.

    Returns:
        Total compaction count, per-session average/max/distribution, the
        approximate turn index of each compaction, how many happened before
        turn 5, and how many sessions had at least one compaction.
    """
    total_sessions = len(sessions)
    per_session_counts: list[int] = []
    compact_turn_indices: list[int] = []

    for session in sessions:
        positions = [
            position
            for position, entry in enumerate(session.get("messages", []))
            if entry.get("type") == "compaction"
        ]
        per_session_counts.append(len(positions))
        # Rough estimate: two messages per turn.
        compact_turn_indices.extend(position // 2 for position in positions)

    total_compacts = sum(per_session_counts)

    # Buckets: [0, 1, 2, 3, 4, 5, 6 or more] compactions per session.
    distribution = [0] * 7
    for count in per_session_counts:
        distribution[min(count, 6)] += 1

    return {
        "total_compacts": total_compacts,
        "compacts_per_session": {
            "avg": total_compacts / max(total_sessions, 1),
            "max": max(per_session_counts, default=0),
            "distribution": distribution,
        },
        "compact_turn_indices": compact_turn_indices,
        # Early trigger: a compaction that happened before turn 5.
        "early_trigger_count": len([t for t in compact_turn_indices if t < 5]),
        "sessions_with_compact": len([c for c in per_session_counts if c > 0]),
        "total_sessions": total_sessions,
    }
|
|
@@ -0,0 +1,139 @@
|
|
|
1
|
+
"""计算估算的上下文窗口利用率。"""
|
|
2
|
+
|
|
3
|
+
from typing import Any
|
|
4
|
+
|
|
5
|
+
# Context-window limits for the known models, keyed by model id and grouped
# by the limit they share.
MODEL_CONTEXT_LIMITS: dict[str, int] = {
    **dict.fromkeys(("claude-sonnet-4", "claude-haiku-3.5"), 200_000),
    **dict.fromkeys(("deepseek-v3", "deepseek-r1"), 64_000),
    **dict.fromkeys(("gpt-4o", "gpt-4o-mini"), 128_000),
}
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def estimate_tokens_from_chars(char_count: int, text_sample: str = "") -> int:
    """Roughly estimate a token count from a character count.

    With a text sample, the share of characters in U+4E00..U+9FFF weights the
    estimate between 1.5 tokens/char (that range) and 0.25 tokens/char
    (everything else). Without a sample a flat 0.5 tokens/char is used.

    Args:
        char_count: Total number of characters.
        text_sample: Sample text used to measure the character mix.

    Returns:
        The estimated token count, truncated to an int.
    """
    if char_count == 0:
        return 0

    if not text_sample:
        # No sample available: fall back to the flat mixed ratio.
        return int(char_count * 0.5)

    cjk_total = len([ch for ch in text_sample if "\u4e00" <= ch <= "\u9fff"])
    cjk_share = cjk_total / len(text_sample)
    tokens_per_char = cjk_share * 1.5 + (1 - cjk_share) * 0.25
    return int(char_count * tokens_per_char)
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def _extract_content_length(msg: dict) -> int:
|
|
41
|
+
"""提取消息内容的字符数。"""
|
|
42
|
+
# 处理嵌套的消息格式 (msg.message.content)
|
|
43
|
+
message = msg.get("message", msg)
|
|
44
|
+
content = message.get("content", "")
|
|
45
|
+
if isinstance(content, str):
|
|
46
|
+
return len(content)
|
|
47
|
+
if isinstance(content, list):
|
|
48
|
+
return sum(
|
|
49
|
+
len(item.get("text", ""))
|
|
50
|
+
for item in content
|
|
51
|
+
if isinstance(item, dict) and "text" in item
|
|
52
|
+
)
|
|
53
|
+
return 0
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def extract(sessions: list[dict]) -> dict:
|
|
57
|
+
"""从 session 列表中提取上下文利用率统计。
|
|
58
|
+
|
|
59
|
+
通过累积消息字符数估算上下文使用量,结合模型 context limit 计算利用率。
|
|
60
|
+
|
|
61
|
+
Args:
|
|
62
|
+
sessions: session JSONL 解析后的字典列表。
|
|
63
|
+
|
|
64
|
+
Returns:
|
|
65
|
+
包含上下文利用率分布、峰值、模型映射等统计信息。
|
|
66
|
+
"""
|
|
67
|
+
models_used: set[str] = set()
|
|
68
|
+
context_limits: dict[str, int] = {}
|
|
69
|
+
utilization_samples: list[float] = []
|
|
70
|
+
compact_at_high_utilization = 0
|
|
71
|
+
total_compacts = 0
|
|
72
|
+
|
|
73
|
+
for session in sessions:
|
|
74
|
+
messages = session.get("messages", [])
|
|
75
|
+
current_model: str | None = None
|
|
76
|
+
cumulative_chars = 0
|
|
77
|
+
|
|
78
|
+
for msg in messages:
|
|
79
|
+
# 检查 model_change 事件
|
|
80
|
+
if msg.get("type") == "model_change":
|
|
81
|
+
model_id = msg.get("modelId", "")
|
|
82
|
+
if model_id:
|
|
83
|
+
current_model = model_id
|
|
84
|
+
models_used.add(model_id)
|
|
85
|
+
if model_id in MODEL_CONTEXT_LIMITS:
|
|
86
|
+
context_limits[model_id] = MODEL_CONTEXT_LIMITS[model_id]
|
|
87
|
+
|
|
88
|
+
# 累积消息字符数
|
|
89
|
+
cumulative_chars += _extract_content_length(msg)
|
|
90
|
+
|
|
91
|
+
# compact 事件
|
|
92
|
+
if msg.get("type") == "compaction":
|
|
93
|
+
total_compacts += 1
|
|
94
|
+
if current_model and current_model in MODEL_CONTEXT_LIMITS:
|
|
95
|
+
limit = MODEL_CONTEXT_LIMITS[current_model]
|
|
96
|
+
estimated_tokens = estimate_tokens_from_chars(cumulative_chars)
|
|
97
|
+
utilization = estimated_tokens / limit
|
|
98
|
+
if utilization >= 0.7:
|
|
99
|
+
compact_at_high_utilization += 1
|
|
100
|
+
utilization_samples.append(utilization)
|
|
101
|
+
# compact 后重置累积
|
|
102
|
+
cumulative_chars = 0
|
|
103
|
+
|
|
104
|
+
# session 结束时记录最终利用率
|
|
105
|
+
if (
|
|
106
|
+
current_model
|
|
107
|
+
and current_model in MODEL_CONTEXT_LIMITS
|
|
108
|
+
and cumulative_chars > 0
|
|
109
|
+
):
|
|
110
|
+
limit = MODEL_CONTEXT_LIMITS[current_model]
|
|
111
|
+
estimated_tokens = estimate_tokens_from_chars(cumulative_chars)
|
|
112
|
+
utilization = estimated_tokens / limit
|
|
113
|
+
utilization_samples.append(utilization)
|
|
114
|
+
|
|
115
|
+
# 计算统计
|
|
116
|
+
avg_utilization = sum(utilization_samples) / max(len(utilization_samples), 1)
|
|
117
|
+
peak_utilization = max(utilization_samples) if utilization_samples else 0.0
|
|
118
|
+
|
|
119
|
+
# 分布桶
|
|
120
|
+
distribution = {"0-30%": 0, "30-60%": 0, "60-90%": 0, "90%+": 0}
|
|
121
|
+
for u in utilization_samples:
|
|
122
|
+
if u < 0.3:
|
|
123
|
+
distribution["0-30%"] += 1
|
|
124
|
+
elif u < 0.6:
|
|
125
|
+
distribution["30-60%"] += 1
|
|
126
|
+
elif u < 0.9:
|
|
127
|
+
distribution["60-90%"] += 1
|
|
128
|
+
else:
|
|
129
|
+
distribution["90%+"] += 1
|
|
130
|
+
|
|
131
|
+
return {
|
|
132
|
+
"models_used": sorted(models_used),
|
|
133
|
+
"context_limits": context_limits,
|
|
134
|
+
"avg_estimated_utilization": avg_utilization,
|
|
135
|
+
"peak_estimated_utilization": peak_utilization,
|
|
136
|
+
"utilization_distribution": distribution,
|
|
137
|
+
"compact_at_high_utilization": compact_at_high_utilization,
|
|
138
|
+
"total_compacts": total_compacts,
|
|
139
|
+
}
|
|
@@ -0,0 +1,163 @@
|
|
|
1
|
+
"""分析 Goal 任务拆分质量和 Todo 使用质量。"""
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
from typing import Any
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def score_evidence(evidence: str) -> float:
    """Score the quality of a task's evidence text between 0.0 and 1.0.

    Weights: length >= 20 chars (0.3), contains a path separator (0.2),
    mentions test/spec/check (0.2), mentions pass/fail/success/error (0.2),
    contains a digit (0.1). Keyword matches are case-insensitive substrings.

    Args:
        evidence: The evidence text. A non-string value is scored on its
            ``str()`` form instead of raising.

    Returns:
        The quality score, an exact multiple of 0.1 in [0.0, 1.0].
    """
    if not evidence:
        return 0.0
    text = evidence if isinstance(evidence, str) else str(evidence)

    # Accumulate integer tenths: summing 0.3 + 0.2 + 0.2 + 0.2 + 0.1 as floats
    # gives 0.9999999999999999 rather than 1.0.
    tenths = 0
    if len(text) >= 20:
        tenths += 3
    if re.search(r"[/\\]", text):
        tenths += 2
    if re.search(r"test|spec|check", text, re.I):
        tenths += 2
    if re.search(r"pass|fail|success|error", text, re.I):
        tenths += 2
    if re.search(r"\d+", text):
        tenths += 1
    return min(tenths, 10) / 10
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def _extract_text_from_content(content: Any) -> str:
|
|
35
|
+
"""从消息 content 中提取纯文本。"""
|
|
36
|
+
if isinstance(content, str):
|
|
37
|
+
return content
|
|
38
|
+
if isinstance(content, list):
|
|
39
|
+
return " ".join(
|
|
40
|
+
item.get("text", "")
|
|
41
|
+
for item in content
|
|
42
|
+
if isinstance(item, dict) and "text" in item
|
|
43
|
+
)
|
|
44
|
+
return ""
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def extract(sessions: list[dict]) -> dict:
    """Collect Goal and Todo quality statistics from the given sessions.

    Goal data comes from entries whose "customType" is "goal-state" (status,
    tasks, evidence, stallCount, tokensUsed under "data"). Todo data comes
    from tool-result messages of the "todo" tool, classified by keywords in
    the result text.

    Args:
        sessions: Parsed session dicts.

    Returns:
        Goal counts and completion rate, plus nested "task_stats",
        "evidence_stats", "stall_stats", "token_stats" and "todo_stats".
    """
    goals_total = 0
    goals_completed = 0
    goals_budget_limited = 0
    goals_cancelled = 0
    goals_with_stall = 0
    all_tasks: list[dict] = []
    all_evidence: list[str] = []
    stall_count = 0
    total_tokens = 0

    todo_total = 0
    todo_completed = 0
    todo_abandoned = 0

    for session in sessions:
        for msg in session.get("messages", []):
            # NOTE(review): every goal-state entry is counted as one goal; if
            # the same goal is persisted repeatedly, this over-counts — confirm.
            if msg.get("customType") == "goal-state":
                goals_total += 1
                # "or" guards against explicit nulls in the recorded state.
                state = msg.get("data") or {}
                status = state.get("status", "")

                if status == "complete":
                    goals_completed += 1
                elif status == "budget_limited":
                    goals_budget_limited += 1
                elif status == "cancelled":
                    goals_cancelled += 1

                for task in state.get("tasks") or []:
                    all_tasks.append(task)
                    evidence = task.get("evidence", "")
                    if evidence:
                        all_evidence.append(evidence)

                goal_stalls = state.get("stallCount") or 0
                stall_count += goal_stalls
                # Count stalled goals individually rather than collapsing the
                # total into a single 0/1 flag.
                if goal_stalls > 0:
                    goals_with_stall += 1
                total_tokens += state.get("tokensUsed") or 0

            # Tool results may be flat or nested under "message".
            message = msg.get("message")
            if not isinstance(message, dict):
                message = msg
            if message.get("role") == "toolResult" and message.get("toolName") == "todo":
                text = _extract_text_from_content(message.get("content", ""))
                lowered = text.lower()

                # Keyword-based classification of the todo operation.
                if "add" in lowered or "添加" in text:
                    todo_total += 1
                if "completed" in lowered or "完成" in text:
                    todo_completed += 1
                if "delete" in lowered or "删除" in text:
                    todo_abandoned += 1

    # Task statistics.
    total_tasks = len(all_tasks)
    completed_tasks = sum(1 for t in all_tasks if t.get("status") == "completed")
    cancelled_tasks = sum(1 for t in all_tasks if t.get("status") == "cancelled")
    pending_tasks = sum(1 for t in all_tasks if t.get("status") == "pending")

    # Evidence statistics.
    tasks_with_evidence = len(all_evidence)
    evidence_scores = [score_evidence(e) for e in all_evidence]
    avg_evidence_score = sum(evidence_scores) / max(len(evidence_scores), 1)
    low_quality_count = sum(1 for s in evidence_scores if s < 0.4)

    # max(..., 1) keeps every rate at 0 instead of dividing by zero.
    goal_divisor = max(goals_total, 1)
    task_divisor = max(total_tasks, 1)
    todo_divisor = max(todo_total, 1)

    return {
        "goals_total": goals_total,
        "goals_completed": goals_completed,
        "goals_budget_limited": goals_budget_limited,
        "goals_cancelled": goals_cancelled,
        "completion_rate": goals_completed / goal_divisor,
        "avg_tasks_per_goal": total_tasks / goal_divisor,
        "task_stats": {
            "total": total_tasks,
            "completed": completed_tasks,
            "cancelled": cancelled_tasks,
            "pending": pending_tasks,
            "completion_rate": completed_tasks / task_divisor,
            "cancel_rate": cancelled_tasks / task_divisor,
        },
        "evidence_stats": {
            "tasks_with_evidence": tasks_with_evidence,
            "evidence_rate": tasks_with_evidence / task_divisor,
            "avg_evidence_score": avg_evidence_score,
            "low_quality_evidence_count": low_quality_count,
        },
        "stall_stats": {
            "goals_with_stall": goals_with_stall,
            "stall_rate": goals_with_stall / goal_divisor,
            "avg_stall_count": stall_count / goal_divisor,
        },
        "token_stats": {
            "avg_tokens_per_goal": total_tokens / goal_divisor,
            "avg_tokens_per_task": total_tokens / task_divisor,
        },
        "todo_stats": {
            "total_todos": todo_total,
            "completed": todo_completed,
            "abandoned": todo_abandoned,
            "completion_rate": todo_completed / todo_divisor,
            "abandon_rate": todo_abandoned / todo_divisor,
        },
    }
|