arbiter-lite 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
arbiter_doctor.py ADDED
@@ -0,0 +1,283 @@
1
+ #!/usr/bin/env python3
2
+ """arbiter doctor — 多Agent系统诊断工具
3
+ 扫描项目目录,找出上下文、故障、状态三类的潜在问题。
4
+ 不修任何东西。只告诉你哪里会崩、为什么。
5
+ """
6
+
7
+ import os, re, sys
8
+ from collections import defaultdict
9
+
10
+ # 诊断规则
11
# Diagnostic rules. Every entry is a dict with:
#   id          - stable rule identifier
#   severity    - 'high' or 'medium' (print_report also handles 'low')
#   category    - heading shown in the report
#   pattern     - regular expression searched for in each scanned .py file
#   description - one-line summary shown in the report
#   detail      - longer explanation printed under the summary
CHECKS = [
    # Shared-context risks
    {
        'id': 'context_no_quota',
        'severity': 'high',
        'category': '上下文共享风险',
        'pattern': r'(StateGraph|TypedDict|messages\s*=\s*\[)',
        'description': '多个Agent共用一个StateGraph,无配额隔离',
        'detail': '你的Agent在抢同一个上下文窗口,随时可能溢出。Arbiter用固定分区+即时回收解决这个问题。',
    },
    {
        'id': 'no_max_tokens',
        'severity': 'medium',
        'category': '上下文共享风险',
        'pattern': r'(max_tokens|max_context_length)\s*=\s*None',
        'description': 'Agent调用未设置max_tokens上限',
        'detail': '没有硬上限,一个Agent的上下文膨胀会挤掉所有其他Agent的空间。',
    },
    # Failure blind spots
    {
        'id': 'error_swallowed',
        'severity': 'high',
        'category': '故障盲区',
        # Matches an except clause (bare `except:` included) whose first body
        # line starts with pass / print / logger.warning. `[^\n:]*` keeps the
        # clause header on one line and stops at its colon.
        'pattern': r'except\b[^\n:]*:\s*\n\s*(pass|print|logger\.warning)',
        'description': 'Agent异常被静默吞掉',
        'detail': 'catch块只有pass或print,错误没进审计日志。你上周可能有几十次超时你完全不知道。Arbiter的AuditTrail记录每一次异常。',
    },
    {
        'id': 'no_timeout',
        'severity': 'medium',
        'category': '故障盲区',
        'pattern': r'\.invoke\(|\.run\(|\.call\(',
        'description': 'LLM调用未设置timeout',
        'detail': 'Agent卡住时整个管道停摆。Arbiter的guard熔断器自动止损。',
    },
    # State conflicts
    {
        'id': 'state_conflict',
        'severity': 'high',
        'category': '状态冲突',
        'pattern': r'(state\.messages\s*=|\bstate\[.+\]\s*=)',
        'description': '多个Agent同时写同一个state字段',
        'detail': 'Agent A和Agent B都在改state.messages,概率性数据丢失。你还没发现是因为并发不够高。Arbiter用项目级锁防冲突。',
    },
    # Token boundary problems
    {
        'id': 'context_overflow',
        'severity': 'high',
        'category': '上下文边界溢出',
        'pattern': r'(context_window|max_input_tokens)\s*=\s*(\d{5,6})',
        'description': '上下文窗口接近满载运行',
        'detail': '你的Agent上下文窗口接近满载,12%的调用可能触发截断。Arbiter的adapt配额策略提前分配,避免边界溢出。',
    },
]
65
+
66
def scan_directory(path):
    """Scan a project directory and run every rule in CHECKS on its .py files.

    Args:
        path: Directory to scan; converted to an absolute path first.

    Returns:
        A dict with the keys 'path', 'agent_count', 'framework', 'findings'
        and 'file_count'. Each finding is a copy of the matching CHECKS entry
        plus a 'files' list holding up to three 'basename:line' locations
        (the first match in each file).

    Exits the process with status 1 when path is not a directory and with
    status 0 when no .py file is found below it.
    """
    # NOTE: this function must rely on the module-level `sys`. A function-local
    # `import sys` makes the name local to the whole body, so the early
    # sys.exit() calls below would raise UnboundLocalError.
    path = os.path.abspath(path)
    if not os.path.isdir(path):
        print(f'错误: {path} 不是有效目录')
        sys.exit(1)

    # Collect all Python files, pruning vendored / generated directories.
    skipped_dirs = {'node_modules', '.git', '__pycache__', 'venv', '.venv'}
    py_files = []
    for root, dirs, files in os.walk(path):
        dirs[:] = [d for d in dirs if d not in skipped_dirs]
        py_files.extend(os.path.join(root, f) for f in files if f.endswith('.py'))

    if not py_files:
        print(f'未找到Python文件在 {path}')
        sys.exit(0)

    # Read every file once; unreadable files are reported and left out of the scan.
    sources = {}
    for fp in py_files:
        try:
            with open(fp, encoding='utf-8', errors='ignore') as f:
                sources[fp] = f.read()
        except OSError as e:
            print(f'[WARN] Failed to read {fp}: {e}', file=sys.stderr)

    all_code = ''.join(text + '\n' for text in sources.values())
    framework = detect_framework(all_code)
    agent_count = estimate_agent_count(all_code, py_files)

    # Run the rules against whole-file text so multi-line patterns can match.
    # re.DOTALL is deliberately not used: it lets '.*' / '.+' run across the
    # entire file and produces matches unrelated to the reported line.
    findings = []
    for check in CHECKS:
        rule = re.compile(check['pattern'], re.MULTILINE)
        locations = []
        for fp, content in sources.items():
            hit = rule.search(content)
            if hit:
                line_no = content.count('\n', 0, hit.start()) + 1
                locations.append(f'{os.path.basename(fp)}:{line_no}')
        if locations:
            findings.append({**check, 'files': locations[:3]})  # Max 3 locations

    return {
        'path': path,
        'agent_count': agent_count,
        'framework': framework,
        'findings': findings,
        'file_count': len(py_files),
    }
127
+
128
def detect_framework(code):
    """Guess the agent framework from marker substrings in the source text.

    Frameworks are probed in a fixed order and the first hit wins:
    LangGraph, CrewAI, AutoGen, LangChain, then direct OpenAI/Anthropic use.
    Returns 'Unknown' when no marker is present.
    """
    lowered = code.lower()
    # (label, case-insensitive marker, optional case-sensitive marker)
    probes = (
        ('LangGraph', 'langgraph', 'StateGraph'),
        ('CrewAI', 'crewai', 'Crew'),
        ('AutoGen', 'autogen', None),
        ('LangChain', 'langchain', None),
    )
    for label, loose_marker, exact_marker in probes:
        if loose_marker in lowered:
            return label
        if exact_marker is not None and exact_marker in code:
            return label
    if 'openai' in lowered or 'anthropic' in lowered:
        return 'Direct API'
    return 'Unknown'
140
+
141
def estimate_agent_count(code, files):
    """Estimate how many agents a project defines.

    Three heuristics are tried in order; the first that finds anything wins:
      1. distinct string names passed to add_node(...) in `code`;
      2. distinct class names ending in 'Agent' / 'agent' in `code`;
      3. number of `files` containing a LangGraph marker, clamped to 1..20.

    Args:
        code: Concatenated source text of the project.
        files: Paths of the project's Python files (read only by heuristic 3).
    """
    # Graph node definitions are the most reliable signal.
    node_names = set(re.findall(r'add_node\([\"\'](\w+)[\"\']', code))
    if node_names:
        return len(node_names)

    agent_classes = set(re.findall(r'class\s+(\w*[Aa]gent)\s*[(:]', code))
    if agent_classes:
        return len(agent_classes)

    # Fallback: count files that mention LangGraph constructs.
    hits = 0
    for file_path in files:
        try:
            with open(file_path, encoding='utf-8', errors='ignore') as handle:
                text = handle.read()
            if re.search(r'(from langgraph|import langgraph|StateGraph|add_node)', text):
                hits += 1
        except Exception as e:
            import sys
            print(f'[WARN] Failed to scan {file_path}: {e}', file=sys.stderr)
    return max(1, min(hits, 20))
167
+
168
def print_report(result):
    """Print the diagnosis report for a scan_directory() result to stdout.

    Writes a header line, then either an all-clear message or one entry per
    finding (severity tag, category, description, detail, locations),
    followed by per-severity totals and a link to the Arbiter project.
    """
    findings = result['findings']
    totals = {'high': 0, 'medium': 0, 'low': 0}
    for item in findings:
        level = item['severity']
        if level in totals:
            totals[level] += 1

    rule = '-' * 56
    agents = result['agent_count']
    framework = result['framework']
    file_total = result['file_count']

    print('')
    print(f' Diagnosis: {agents} Agents - {framework} - {file_total} files')
    print(' ' + rule)

    if not findings:
        print(' [OK] No obvious issues found.')
        return

    # Any severity other than high / medium is rendered with the LOW tag.
    tags = {'high': '[HIGH]', 'medium': '[MED]'}
    for item in findings:
        icon = tags.get(item['severity'], '[LOW]')
        cat, desc, detail = item['category'], item['description'], item['detail']

        print('')
        print(f' {icon} {cat}: {desc}')
        print(f' -> {detail}')
        for loc in item.get('files', []):
            print(f' @ {loc}')

    print('')
    print(' ' + rule)
    print(f' {totals["high"]} high, {totals["medium"]} medium, {totals["low"]} low')
    print('')
    print(' -> These problems are solved by Arbiter:')
    print(' https://github.com/qiushu-wq/arbiter')
212
+
213
def check_self(project_dirs=None, min_memory_files=8):
    """Check the current user's ~/.claude setup and expected project folders.

    Args:
        project_dirs: Optional mapping of display name -> path relative to
            the home directory; every path that is not a directory is
            reported as missing.
        min_memory_files: Minimum number of .md files expected anywhere
            under ~/.claude/projects.

    Returns:
        {'status': 'healthy' | 'degraded', 'findings': [...], 'details': {...}}.
        The status is 'healthy' only when there are no findings. A missing
        ~/.claude/CLAUDE.md is recorded in details['claude_md_ok'] but does
        not create a finding.
    """
    home = os.path.expanduser('~')
    claude_root = os.path.join(home, '.claude')
    details = {
        'claude_md_ok': os.path.exists(os.path.join(claude_root, 'CLAUDE.md')),
        'projects_ok': True,
        'missing': [],
        'memory_ok': False,
    }
    findings = []

    # Count markdown files below the memory directory (0 when it is absent).
    memory_dir = os.path.join(claude_root, 'projects')
    memory_count = 0
    if os.path.isdir(memory_dir):
        memory_count = sum(
            1
            for _root, _subdirs, names in os.walk(memory_dir)
            for name in names
            if name.endswith('.md')
        )
    details['memory_ok'] = memory_count >= min_memory_files
    details['memory_count'] = memory_count
    if not details['memory_ok']:
        findings.append({
            'severity': 'high', 'category': 'Memory Degraded',
            'description': f'Memory files: {memory_count} (min: {min_memory_files})',
            'detail': 'Memory system is below minimum threshold. Run arbiter-doctor --self to recheck.'
        })

    # Verify the caller-supplied project directories, if any.
    for name, path in (project_dirs or {}).items():
        if os.path.isdir(os.path.join(home, path)):
            continue
        details['missing'].append(name)
        details['projects_ok'] = False
        findings.append({
            'severity': 'high', 'category': 'Project Missing',
            'description': f'Directory missing: {name}',
            'detail': f'Expected at ~/{path}'
        })

    healthy = details['projects_ok'] and not findings
    return {'status': 'healthy' if healthy else 'degraded',
            'findings': findings, 'details': details}
251
+
252
def print_self_report(result):
    """Print the outcome of check_self() to stdout.

    Shows whether CLAUDE.md exists, how many project directories are
    missing, one line per finding (always tagged HIGH) and the overall
    status in upper case.
    """
    rule = '-' * 56
    print('')
    print(' Arbiter Self-Scan')
    print(' ' + rule)

    details = result['details']
    claude_state = 'OK' if details['claude_md_ok'] else 'MISSING'
    print(f' CLAUDE.md: {claude_state}')
    missing_total = len(details['missing'])
    print(f' Projects: {missing_total} missing')

    for finding in result.get('findings', []):
        print(f' [HIGH] {finding["category"]}: {finding["description"]}')

    print(f' Status: {result["status"].upper()}')
263
+
264
+
265
def main():
    """Command-line entry point for arbiter-doctor.

    `--self` as the first argument runs the environment self-check and exits
    with 0 when healthy, 1 otherwise. With no argument the usage text is
    printed and the process exits with 1. Any other first argument is taken
    as the project directory to scan and report on.
    """
    args = sys.argv[1:]

    if args and args[0] == '--self':
        outcome = check_self()
        print_self_report(outcome)
        sys.exit(0 if outcome['status'] == 'healthy' else 1)

    if not args:
        print('Usage:')
        print(' arbiter-doctor <project-dir> -- diagnose multi-agent project')
        print(' arbiter-doctor --self -- self-check environment')
        sys.exit(1)

    report = scan_directory(args[0])
    print_report(report)
280
+
281
+
282
+ if __name__ == '__main__':
283
+ main()
@@ -0,0 +1,12 @@
1
+ """Arbiter Lite — Multi-Agent Context Quota Manager (MIT)
2
+
3
+ Quota management + optional token compression (via Headroom integration).
4
+ Each agent gets a fixed partition. Idle agent quotas are reclaimed instantly.
5
+ With Headroom: 60-95% fewer tokens before context reaches the LLM.
6
+ """
7
+
8
+ from .quota import QuotaManager
9
+ from .compress import compress_context, get_compression_stats
10
+
11
# Must match the distribution version (wheel METADATA says "Version: 0.1.0").
__version__ = "0.1.0"
12
+ __all__ = ["QuotaManager", "compress_context", "get_compression_stats"]
@@ -0,0 +1,35 @@
1
+ """Arbiter + Headroom — Token compression integration
2
+ Optional pre-processor: compress context before LLM calls.
3
+ Saves 60-95% tokens. Falls back to pass-through if headroom not installed.
4
+ """
5
+ from typing import Any, Dict
6
+
7
+ try:
8
+ from headroom import compress as _headroom_compress
9
+ HAS_HEADROOM = True
10
+ except ImportError:
11
+ HAS_HEADROOM = False
12
+
13
+
14
def compress_context(context: Any) -> Any:
    """Compress a context with headroom before it is sent to an LLM.

    Returns `context` unchanged when headroom is not installed. Otherwise a
    str is compressed directly; for a list or dict a new container is
    returned in which every top-level str item/value is compressed and all
    other entries are kept as they are (nested containers are not
    descended into). Any other type is returned unchanged.
    """
    if not HAS_HEADROOM:
        return context

    def _squeeze(item):
        # Only strings are handed to headroom; everything else passes through.
        return _headroom_compress(item) if isinstance(item, str) else item

    if isinstance(context, str):
        return _headroom_compress(context)
    if isinstance(context, list):
        return [_squeeze(entry) for entry in context]
    if isinstance(context, dict):
        return {key: _squeeze(value) for key, value in context.items()}
    return context
27
+
28
+
29
def get_compression_stats() -> Dict:
    """Report whether headroom compression is available.

    Returns a dict with 'available' (the HAS_HEADROOM flag), 'install' (the
    pip command to run when headroom is missing, else None) and 'savings'
    (the advertised token reduction, or 'N/A' when headroom is missing).
    """
    install_hint = None if HAS_HEADROOM else 'pip install headroom-ai'
    savings = '60-95% fewer tokens' if HAS_HEADROOM else 'N/A'
    return {
        'available': HAS_HEADROOM,
        'install': install_hint,
        'savings': savings,
    }
arbiter_lite/quota.py ADDED
@@ -0,0 +1,47 @@
1
+ """Arbiter Lite — 多Agent配额管理 (MIT)
2
+ 40行代码,解决多Agent上下文争抢问题。
3
+ 每个Agent分配固定配额,闲置Agent的配额自动回收给活跃Agent。
4
+ """
5
+
6
+ from datetime import datetime, timedelta
7
+ from typing import Dict, List
8
+
9
+
10
+ class QuotaManager:
11
+ """固定分区 + 即时回收的配额管理器"""
12
+
13
+ def __init__(self, max_tokens: int, agent_names: List[str], idle_timeout: int = 300):
14
+ self.max_tokens = max_tokens
15
+ self.partition_size = max_tokens // max(len(agent_names), 1)
16
+ self.quotas: Dict[str, int] = {a: self.partition_size for a in agent_names}
17
+ self.last_active: Dict[str, datetime] = {a: datetime.now() for a in agent_names}
18
+ self.idle_timeout = idle_timeout # 默认5分钟
19
+
20
+ def request(self, agent: str, tokens_needed: int) -> int:
21
+ """请求配额。返回实际获得的token数。"""
22
+ now = datetime.now()
23
+ self.last_active[agent] = now
24
+
25
+ # 即时回收:闲置Agent的配额
26
+ reclaimed = 0
27
+ for a in self.quotas:
28
+ if a == agent:
29
+ continue
30
+ idle_seconds = (now - self.last_active[a]).total_seconds()
31
+ if idle_seconds > self.idle_timeout:
32
+ half = self.quotas[a] // 2
33
+ self.quotas[a] -= half
34
+ reclaimed += half
35
+
36
+ self.quotas[agent] = self.quotas.get(agent, self.partition_size) + reclaimed
37
+ granted = min(tokens_needed, self.quotas[agent])
38
+ self.quotas[agent] -= granted
39
+ return granted
40
+
41
+ def status(self) -> Dict:
42
+ now = datetime.now()
43
+ return {a: {
44
+ 'quota': q,
45
+ 'partition': self.partition_size,
46
+ 'idle_seconds': int((now - self.last_active[a]).total_seconds())
47
+ } for a, q in self.quotas.items()}
@@ -0,0 +1,121 @@
1
+ Metadata-Version: 2.4
2
+ Name: arbiter-lite
3
+ Version: 0.1.0
4
+ Summary: Multi-Agent Context Quota Manager + Diagnostic Tool — 40 lines, zero dependencies
5
+ Home-page: https://github.com/qiushu-wq/arbiter
6
+ Author: qiushu-wq
7
+ Keywords: agent,llm,context,quota,multi-agent,diagnostic
8
+ Classifier: Development Status :: 3 - Alpha
9
+ Classifier: Intended Audience :: Developers
10
+ Classifier: License :: OSI Approved :: MIT License
11
+ Classifier: Programming Language :: Python :: 3
12
+ Classifier: Topic :: Software Development :: Libraries
13
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
14
+ Requires-Python: >=3.9
15
+ Description-Content-Type: text/markdown
16
+ Dynamic: author
17
+ Dynamic: classifier
18
+ Dynamic: description
19
+ Dynamic: description-content-type
20
+ Dynamic: home-page
21
+ Dynamic: keywords
22
+ Dynamic: requires-python
23
+ Dynamic: summary
24
+
25
+ # Arbiter — 给多 Agent 装上安全带
26
+
27
+ > 不是 Agent 框架。是你的 Agent 崩了之后,3 秒定位、一键恢复的东西。
28
+
29
+ ## 为什么需要它
30
+
31
+ 6 个 Agent 同时跑,两个互相确认了 11 天没人发现。账单 4.7 万元。
32
+
33
+ 多 Agent 崩不是因为模型不够强,是因为没人管:
34
+ - 三个 Agent 抢一个上下文窗口,一个溢出全崩
35
+ - `except: pass` 吞了错误,37 次失败你不知道
36
+ - 两个 Agent 同时写同一个字段,数据静默丢失
37
+
38
+ **Arbiter 解决的不是"Agent 不够聪明",是"Agent 互相踩踏"。**
39
+
40
+ ## 两个开源工具
41
+
42
+ ### arbiter-doctor — 诊断你的 Agent 项目
43
+
44
+ ```bash
45
+ pip install git+https://github.com/qiushu-wq/arbiter.git
46
+ arbiter-doctor ./my-agent-project
47
+ ```
48
+
49
+ 输出:
50
+ ```
51
+ Diagnosis: 5 Agents - LangGraph - 23 files
52
+ --------------------------------------------------------
53
+ [HIGH] Context sharing: 3 agents share one StateGraph, no quota
54
+ -> Fix: Arbiter budget_policy="fixed_partition"
55
+ [MED] Swallowed errors: agent_timeout caught and print()'d
56
+ -> Fix: Arbiter audit trail records every call status
57
+ --------------------------------------------------------
58
+ 1 high, 1 medium, 0 low
59
+ ```
60
+
61
+ 不修任何东西,只告诉你哪里会崩、为什么。每个问题下面跟一行:Arbiter 已解决。
62
+
63
+ ### arbiter-lite — 40 行配额管理器
64
+
65
+ ```bash
66
+ pip install git+https://github.com/qiushu-wq/arbiter.git
67
+ ```
68
+
69
+ ```python
70
+ from arbiter_lite import QuotaManager
71
+
72
+ qm = QuotaManager(max_tokens=200000, agent_names=["market", "business", "chat"])
73
+ tokens = qm.request("market", 5000) # 请求配额,闲置 Agent 的配额自动回收
74
+ print(qm.status()) # 查看每个 Agent 配额使用情况
75
+ ```
76
+
77
+ - 固定分区 + 即时回收
78
+ - 5 分钟闲置 → 配额释放给活跃 Agent
79
+ - 零依赖,Python 3.9+
80
+
81
+ ## 产品分层
82
+
83
+ | 版本 | 价格 | 状态 | 说明 |
84
+ |------|------|------|------|
85
+ | **arbiter-doctor** | 免费 MIT | ✅ 可用 | 诊断工具,一行命令扫描项目 |
86
+ | **arbiter-lite** | 免费 MIT | ✅ 可用 | 40 行配额管理器,够管 3-10 Agent |
87
+ | **Arbiter Solo** | MIT | 🔒 闭源 | Agent Loop + 记忆 + 技能系统 |
88
+ | **Arbiter Cloud** | ¥99-299/月 | 🔒 闭源 | heal + guard + adapt + cap + drain |
89
+ | **Arbiter Enterprise** | ¥5,000-10,000/年 | 🔒 闭源 | SSO + RBAC + SLA + 私有部署 |
90
+
91
+ **什么时候需要升级?**
92
+ - Agent 超过 10 个 → 固定分区不够用,需要 adapt 自适应
93
+ - 开始频繁崩 → 需要 guard 熔断器
94
+ - 需要控制成本 → 需要 cap 硬上限
95
+
96
+ ## 安装
97
+
98
+ ```bash
99
+ # 一个命令,诊断工具 + 配额管理器全装上
100
+ pip install git+https://github.com/qiushu-wq/arbiter.git
101
+
102
+ # 诊断项目
103
+ arbiter-doctor ./my-agent-project
104
+
105
+ # 代码里用
106
+ python -c "from arbiter_lite import QuotaManager; print('OK')"
107
+ ```
108
+
109
+ ## 作者在生产中用
110
+
111
+ 作者的 6 个 Agent 在 7x24 小时跑:搜项目、发提案、回消息、盯差评。2742 个项目管道,36 条线索,跑了 5 天没崩。
112
+
113
+ ## 联系
114
+
115
+ - GitHub Issues: [提交 Issue](https://github.com/qiushu-wq/arbiter/issues)
116
+ - QQ: 2682289241
117
+ - 微信: liu147852012
118
+
119
+ ---
120
+
121
+ 一个人全职在搞。如果对你有用,留个 Star,或者在 Issue 里喷我代码写得烂。
@@ -0,0 +1,10 @@
1
+ arbiter_doctor.py,sha256=scxpkDku1YgfKgAGcJ413-r-i2lXfiz_nX-kJ4wmX-s,10392
2
+ stability_metrics.py,sha256=qacqTydnE7UpvIT5hqMpjlRbIWNuXchONp8qSS6dDYY,3471
3
+ arbiter_lite/__init__.py,sha256=ES5Cf6D4VPdnfyReslHNm19tipGfQ637uHEOkAqhM5U,486
4
+ arbiter_lite/compress.py,sha256=XRMIKWhgIqs3dFeKNB7oH8mclMx0vIb24YmCq9a0qAE,1215
5
+ arbiter_lite/quota.py,sha256=CG6GAYveEC7mYowfzEfOtDcahI1LGyVKYxNx6rddQrE,1865
6
+ arbiter_lite-0.1.0.dist-info/METADATA,sha256=iJTizGA0lPsGB1Wkeyuhhtb13eFeebn0PiHcQdkm2Uk,4163
7
+ arbiter_lite-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
8
+ arbiter_lite-0.1.0.dist-info/entry_points.txt,sha256=OkWrHFz6kevdgKNIjTrEoTG6BIEw4yUNdSCJOwKYW80,55
9
+ arbiter_lite-0.1.0.dist-info/top_level.txt,sha256=Ii8VKD-oMaaCuHoWqFOhjeEyzywYcrOIJJGsMJF8w4U,46
10
+ arbiter_lite-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (82.0.1)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ arbiter-doctor = arbiter_doctor:main
@@ -0,0 +1,3 @@
1
+ arbiter_doctor
2
+ arbiter_lite
3
+ stability_metrics
stability_metrics.py ADDED
@@ -0,0 +1,88 @@
1
+ """Arbiter Stability Metrics -- Multi-Agent Health Check (MIT)
2
+
3
+ Four hard thresholds:
4
+ - Agent failure rate < 5%
5
+ - Context loss rate < 0.1%
6
+ - State conflicts = 0
7
+ - Pipeline stall < 10 minutes (threshold is declared, but check() does not evaluate it yet)
8
+ """
9
+ import json, os, sys
10
+ from datetime import datetime, timedelta
11
+
12
+ # Cross-platform file locking
13
if sys.platform != 'win32':
    import fcntl

    def _lock(f):
        """Take an exclusive flock() lock on the open file object f."""
        fcntl.flock(f.fileno(), fcntl.LOCK_EX)

    def _unlock(f):
        """Release the flock() lock held on f."""
        fcntl.flock(f.fileno(), fcntl.LOCK_UN)
else:
    import msvcrt

    def _lock(f):
        """Lock a large byte range of f via msvcrt.locking."""
        # Lock a large region to cover the write (max file size)
        msvcrt.locking(f.fileno(), msvcrt.LK_LOCK, 0x7FFFFFFF)

    def _unlock(f):
        """Unlock the byte range taken by _lock."""
        msvcrt.locking(f.fileno(), msvcrt.LK_UNLCK, 0x7FFFFFFF)
26
+
27
+ THRESHOLDS = {
28
+ "agent_failure_rate": 0.05,
29
+ "context_loss_events": 0.001,
30
+ "state_conflicts": 0,
31
+ "pipeline_stall_minutes": 10,
32
+ }
33
+
34
def record(trace_id, agent_name, action_type, tokens_used, quota_pct, status,
           parent_id=None, metrics_file="stability_metrics.jsonl"):
    """Append one agent-call record to the JSONL metrics file and return it.

    Args:
        trace_id: Identifier of this call.
        agent_name: Name of the agent that made the call.
        action_type: Kind of action performed.
        tokens_used: Stored under the key "tokens_consumed".
        quota_pct: Stored under the key "quota_used_pct".
        status: Outcome string; check() treats "ok" as success.
        parent_id: Optional parent trace id; stored as "" when falsy.
        metrics_file: Path of the JSONL file to append to.

    Returns:
        The dict that was written. "ms_elapsed" is always 0 and "time" is
        the local timestamp in ISO format.
    """
    entry = {
        "trace_id": trace_id, "agent_name": agent_name,
        "action_type": action_type, "tokens_consumed": tokens_used,
        "quota_used_pct": quota_pct, "status": status,
        "parent_trace_id": parent_id or "", "ms_elapsed": 0,
        "time": datetime.now().isoformat(),
    }
    # Serialize first so an unserializable entry fails before the file is touched.
    line = json.dumps(entry, ensure_ascii=False) + "\n"
    with open(metrics_file, "a", encoding="utf-8") as f:
        # Acquire outside the try: if locking fails there is nothing to unlock.
        _lock(f)
        try:
            f.write(line)
            # Push the buffered line out while the lock is still held;
            # otherwise it is only written when the file closes, after unlock.
            f.flush()
        finally:
            _unlock(f)
    return entry
50
+
51
def check(hours=24, metrics_file="stability_metrics.jsonl"):
    """Summarize the recent records in the metrics file as a health report.

    Args:
        hours: Size of the look-back window; records whose "time" is older
            than now - hours are ignored.
        metrics_file: Path of the JSONL file written by record().

    Returns:
        {"status": "no_data", "detail": ...} when no usable record falls in
        the window (or the file does not exist). Otherwise a dict with
        "time", "total_calls", "agent_failure_rate", "context_loss_rate",
        "state_conflicts", "status" and "alerts". The status is "critical"
        when any conflict was recorded, else "degraded" when the failure or
        context-loss rate exceeds its threshold, else "healthy".

    Lines that are not valid JSON, are not JSON objects, or lack a string
    "time" or "status" are skipped instead of aborting the check.
    """
    # ISO timestamps of the same format compare chronologically as strings.
    cutoff = (datetime.now() - timedelta(hours=hours)).isoformat()
    records = []
    if os.path.exists(metrics_file):
        with open(metrics_file, encoding="utf-8") as f:
            for line in f:
                try:
                    r = json.loads(line)
                except json.JSONDecodeError:
                    continue
                if not isinstance(r, dict):
                    continue
                if not isinstance(r.get("time"), str) or not isinstance(r.get("status"), str):
                    continue
                if r["time"] >= cutoff:
                    records.append(r)
    total = len(records)
    if total == 0:
        return {"status": "no_data", "detail": "No agent calls in period"}

    statuses = [r["status"] for r in records]
    # Anything other than "ok" counts as a failure unless marked suppressed.
    failures = sum(1 for s in statuses if s != "ok" and "suppressed" not in s)
    context_loss = sum(1 for s in statuses if "quota_exceeded" in s)
    conflicts = sum(1 for s in statuses if "conflict" in s)
    report = {
        "time": datetime.now().isoformat(), "total_calls": total,
        "agent_failure_rate": round(failures / total, 4),
        "context_loss_rate": round(context_loss / total, 4),
        "state_conflicts": conflicts, "status": "healthy", "alerts": [],
    }
    if report["agent_failure_rate"] > THRESHOLDS["agent_failure_rate"]:
        report["status"] = "degraded"
        report["alerts"].append(f'failure_rate: {report["agent_failure_rate"]}')
    # The "context_loss_events" threshold is applied to the loss *rate*.
    if report["context_loss_rate"] > THRESHOLDS["context_loss_events"]:
        report["status"] = "degraded"
        report["alerts"].append(f'context_loss: {report["context_loss_rate"]}')
    # Checked last so that a conflict overrides "degraded" with "critical".
    if conflicts > THRESHOLDS["state_conflicts"]:
        report["status"] = "critical"
        report["alerts"].append(f"state_conflicts: {conflicts}")
    return report
86
+
87
+ if __name__ == "__main__":
88
+ print(json.dumps(check(), ensure_ascii=False, indent=2))