skill-self-evolution 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- skill_self_evolution/__init__.py +27 -0
- skill_self_evolution/ai_assisted_executor.py +244 -0
- skill_self_evolution/config.py +87 -0
- skill_self_evolution/config_loader.py +241 -0
- skill_self_evolution/context.py +22 -0
- skill_self_evolution/deepseek.py +168 -0
- skill_self_evolution/evolver.py +386 -0
- skill_self_evolution/executor.py +471 -0
- skill_self_evolution/fallback.py +129 -0
- skill_self_evolution/loader.py +201 -0
- skill_self_evolution/logger.py +84 -0
- skill_self_evolution/models.py +147 -0
- skill_self_evolution-0.2.0.dist-info/METADATA +12 -0
- skill_self_evolution-0.2.0.dist-info/RECORD +16 -0
- skill_self_evolution-0.2.0.dist-info/WHEEL +5 -0
- skill_self_evolution-0.2.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,168 @@
|
|
|
1
|
+
"""
|
|
2
|
+
DeepSeek 客户端封装 — 超时/重试/熔断,兼容 OpenAI Chat Completions API。
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import asyncio
|
|
6
|
+
import json
|
|
7
|
+
import logging
|
|
8
|
+
import time
|
|
9
|
+
from typing import Any
|
|
10
|
+
|
|
11
|
+
import httpx
|
|
12
|
+
|
|
13
|
+
from skill_self_evolution.config import get_deepseek_config
|
|
14
|
+
from skill_self_evolution.models import DeepSeekChatResponse
|
|
15
|
+
|
|
16
|
+
logger = logging.getLogger(__name__)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class CircuitBreaker:
    """Minimal circuit breaker: trips after N consecutive failures, then cools down for M seconds."""

    def __init__(self, threshold: int = 3, cooldown_seconds: float = 60.0):
        """Store the trip threshold and the cooldown length; the breaker starts closed."""
        self.threshold = threshold
        self.cooldown_seconds = cooldown_seconds
        self._failure_count = 0
        # Timestamp of the most recent failure, taken from time.monotonic().
        self._last_failure_time: float = 0.0
        self._open = False

    @property
    def is_open(self) -> bool:
        """Tell whether requests should currently be rejected.

        Reading this property has a side effect: once ``cooldown_seconds`` have
        elapsed since the last recorded failure, the breaker closes itself and
        the failure counter is reset.
        """
        if self._open:
            elapsed = time.monotonic() - self._last_failure_time
            if elapsed < self.cooldown_seconds:
                return True
            # Cooldown is over: close the breaker and start counting from zero.
            self._open = False
            self._failure_count = 0
            logger.info("熔断器冷却完毕,恢复请求")
        return False

    def record_failure(self) -> None:
        """Register one failure and trip the breaker when the threshold is reached."""
        self._failure_count += 1
        self._last_failure_time = time.monotonic()
        if self._open or self._failure_count < self.threshold:
            return
        self._open = True
        logger.warning("熔断器触发:连续 %d 次失败,冷却 %d 秒", self._failure_count, self.cooldown_seconds)

    def record_success(self) -> None:
        """Reset the failure counter and close the breaker if it was open."""
        was_open = self._open
        self._failure_count = 0
        self._open = False
        if was_open:
            logger.info("熔断器关闭(请求成功)")
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
class DeepSeekClient:
|
|
55
|
+
"""DeepSeek OpenAI 兼容客户端,内置超时/重试/熔断。
|
|
56
|
+
|
|
57
|
+
环境感知:local → api.deepseek.com, test/prod → api.modelarts-maas.com/v2
|
|
58
|
+
"""
|
|
59
|
+
|
|
60
|
+
def __init__(
    self,
    api_key: str = "",
    api_base: str = "",
    model: str = "",
    timeout: float = 60.0,
    max_retries: int = 1,
    circuit_breaker: CircuitBreaker | None = None,
):
    """Build a client from explicit arguments resolved through ``get_deepseek_config``.

    Args:
        api_key: API key handed to ``get_deepseek_config``.
        api_base: Base URL handed to ``get_deepseek_config``; a trailing ``/`` is stripped.
        model: Model name handed to ``get_deepseek_config``.
        timeout: Default per-request timeout, used when ``chat`` receives none.
        max_retries: Number of extra attempts ``chat`` makes after the first failure.
        circuit_breaker: Breaker to use; a default ``CircuitBreaker()`` is created when omitted.
    """
    resolved = get_deepseek_config(api_key=api_key, api_base=api_base, model=model)
    base_url = resolved.api_base.rstrip("/")

    self._api_key = resolved.api_key
    self._api_base = base_url
    self._model = resolved.model
    self._timeout = timeout
    self._max_retries = max_retries
    self._circuit_breaker = circuit_breaker or CircuitBreaker()
    # Endpoint path follows the OpenAI-compatible Chat Completions layout.
    self._chat_url = f"{base_url}/chat/completions"
|
|
77
|
+
|
|
78
|
+
@property
def circuit_breaker(self) -> CircuitBreaker:
    """Return the circuit breaker instance held by this client."""
    return self._circuit_breaker
|
|
81
|
+
|
|
82
|
+
async def chat(
    self,
    messages: list[dict[str, str]],
    temperature: float = 0.7,
    max_tokens: int = 2048,
    timeout: float | None = None,
) -> DeepSeekChatResponse:
    """Send a chat request with retry and circuit-breaker protection.

    Args:
        messages: Chat messages forwarded unchanged to ``_do_chat``.
        temperature: Sampling temperature forwarded to ``_do_chat``.
        max_tokens: Completion token limit forwarded to ``_do_chat``.
        timeout: Per-call timeout; a falsy value falls back to the client default.

    Returns:
        The response produced by ``_do_chat``.

    Raises:
        RuntimeError: If the breaker is open, or if every attempt failed with
            an httpx timeout / status / request error.
    """
    breaker = self._circuit_breaker
    if breaker.is_open:
        raise RuntimeError("熔断器已开启,拒绝请求")

    request_timeout = timeout or self._timeout
    total_attempts = self._max_retries + 1
    failure: Exception | None = None

    for attempt_no in range(1, total_attempts + 1):
        try:
            response = await self._do_chat(messages, temperature, max_tokens, request_timeout)
        except (httpx.TimeoutException, httpx.HTTPStatusError, httpx.RequestError) as exc:
            failure = exc
            logger.warning("DeepSeek 请求失败 (attempt %d/%d): %s", attempt_no, total_attempts, exc)
            if attempt_no < total_attempts:
                # Linear back-off: 0.5s, 1.0s, 1.5s, ...
                await asyncio.sleep(0.5 * attempt_no)
        else:
            breaker.record_success()
            return response

    # Only a fully exhausted call counts as one failure towards the breaker.
    breaker.record_failure()
    raise RuntimeError(f"DeepSeek 请求全部失败 (重试 {self._max_retries} 次): {failure}") from failure
|
|
109
|
+
|
|
110
|
+
async def _do_chat(
    self,
    messages: list[dict[str, str]],
    temperature: float,
    max_tokens: int,
    timeout: float,
) -> DeepSeekChatResponse:
    """Perform a single non-streaming POST to the chat-completions endpoint.

    Args:
        messages: Chat messages placed in the request body as-is.
        temperature: Sampling temperature for the request body.
        max_tokens: Completion token limit for the request body.
        timeout: Timeout handed to ``httpx.AsyncClient``.

    Returns:
        A ``DeepSeekChatResponse`` built from the first choice and the usage
        block. Missing or null fields degrade to an empty content string and
        ``None`` metadata instead of raising.

    Raises:
        httpx.HTTPStatusError: On a 4xx/5xx response (via ``raise_for_status``).
        httpx.RequestError: On transport-level failures, including timeouts.
    """
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {self._api_key}",
    }
    body = {
        "model": self._model,
        "messages": messages,
        "temperature": temperature,
        "max_tokens": max_tokens,
        "stream": False,
    }

    async with httpx.AsyncClient(timeout=timeout, http2=False, trust_env=False) as client:
        resp = await client.post(self._chat_url, headers=headers, json=body)
        resp.raise_for_status()
        data = resp.json()

    # `choices` can be absent, null or an empty list; `[{}]` as a .get() default
    # only covers the absent case, and indexing an empty list raised IndexError.
    choices = data.get("choices") or [{}]
    choice = choices[0] or {}
    # `message`, `content` and `usage` may be present but null in the payload.
    message = choice.get("message") or {}
    usage = data.get("usage") or {}

    return DeepSeekChatResponse(
        content=message.get("content") or "",
        finish_reason=choice.get("finish_reason"),
        prompt_tokens=usage.get("prompt_tokens"),
        completion_tokens=usage.get("completion_tokens"),
    )
|
|
143
|
+
|
|
144
|
+
async def chat_json(
|
|
145
|
+
self,
|
|
146
|
+
messages: list[dict[str, str]],
|
|
147
|
+
temperature: float = 0.3,
|
|
148
|
+
max_tokens: int = 2048,
|
|
149
|
+
timeout: float | None = None,
|
|
150
|
+
) -> dict[str, Any]:
|
|
151
|
+
"""发送请求并解析 JSON。支持 ```json...``` 包裹和纯文本 JSON。"""
|
|
152
|
+
resp = await self.chat(messages, temperature, max_tokens, timeout)
|
|
153
|
+
content = resp.content.strip()
|
|
154
|
+
|
|
155
|
+
# 尝试提取 JSON:先尝试整体解析,再提取 markdown 代码块
|
|
156
|
+
for candidate in [content]:
|
|
157
|
+
if candidate.startswith("```"):
|
|
158
|
+
lines = candidate.split("\n")
|
|
159
|
+
end = -1 if lines[-1].strip() == "```" else len(lines)
|
|
160
|
+
start = 1 if lines[0].startswith("```json") or lines[0].startswith("```") else 0
|
|
161
|
+
candidate = "\n".join(lines[start:end])
|
|
162
|
+
try:
|
|
163
|
+
return json.loads(candidate)
|
|
164
|
+
except json.JSONDecodeError:
|
|
165
|
+
continue
|
|
166
|
+
|
|
167
|
+
logger.warning("DeepSeek 响应非 JSON: %s", content[:200])
|
|
168
|
+
return {}
|
|
@@ -0,0 +1,386 @@
|
|
|
1
|
+
"""
|
|
2
|
+
EvoSkill 离线进化器 — 读 JSONL 日志 → DeepSeek 分析失败模式 → 生成优化提案 → 自动写入或发 PR。
|
|
3
|
+
|
|
4
|
+
流程:
|
|
5
|
+
1. 读昨日 JSONL → 筛选 is_failure=true
|
|
6
|
+
2. 若 is_failure 样本 < min_failure_samples → 跳过本次进化
|
|
7
|
+
3. 跑 benchmark()
|
|
8
|
+
4. DeepSeek 分析失败模式(优先 Skill 自定义 evolve_prompt.yaml)
|
|
9
|
+
5. 生成 YAML 优化提案
|
|
10
|
+
6. 按 evolve.toml 权限 → 自动写 MySQL 或发 PR
|
|
11
|
+
- rules_config / prompt 权限为 true → 自动写入(含历史归档)
|
|
12
|
+
- skill_md / scripts 权限为 false → 生成 PR 文件
|
|
13
|
+
7. benchmark 安全网:变更后重跑 benchmark,退化则自动回滚
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
import json
|
|
17
|
+
import logging
|
|
18
|
+
import os
|
|
19
|
+
import time
|
|
20
|
+
from datetime import datetime, timedelta, timezone
|
|
21
|
+
from pathlib import Path
|
|
22
|
+
from typing import Any
|
|
23
|
+
|
|
24
|
+
import yaml
|
|
25
|
+
|
|
26
|
+
from skill_self_evolution.deepseek import DeepSeekClient
|
|
27
|
+
from skill_self_evolution.loader import SkillLoader, SkillModule
|
|
28
|
+
from skill_self_evolution.logger import _get_log_dir
|
|
29
|
+
from skill_self_evolution.models import EvolveProposalModel
|
|
30
|
+
|
|
31
|
+
logger = logging.getLogger(__name__)
|
|
32
|
+
|
|
33
|
+
_BEIJING_TZ = timezone(timedelta(hours=8))
|
|
34
|
+
|
|
35
|
+
_FRAMEWORK_DEFAULTS = Path(__file__).resolve().parents[2] / "defaults"
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def _yesterday_str() -> str:
    """Return yesterday's date in the ``_BEIJING_TZ`` (UTC+8) zone as ``YYYY-MM-DD``."""
    now_local = datetime.now(_BEIJING_TZ)
    one_day_ago = now_local - timedelta(days=1)
    return one_day_ago.strftime("%Y-%m-%d")
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
class EvolveProposal:
    """Mutable holder for the outcome of one evolution run.

    Kept as a plain class for internal state; ``to_model`` converts it to the
    validated ``EvolveProposalModel`` for serialization and logging.
    """

    def __init__(self):
        """Start with an empty proposal: no changes, no texts, nothing applied."""
        # Structured change suggestions.
        self.rules_changes: dict[str, Any] = {}
        self.prompt_changes: dict[str, Any] = {}
        # Rendered configuration texts; None until generated.
        self.rules_text: str | None = None
        self.prompt_text: str | None = None
        self.analysis_raw: str = ""
        self.failure_count: int = 0
        # Benchmark results as (pass_count, total_count, failures_list).
        self.benchmark_before: tuple[int, int, list] = (0, 0, [])
        self.benchmark_after: tuple[int, int, list] = (0, 0, [])
        # Outcome flags.
        self.applied: bool = False
        self.rolled_back: bool = False

    def to_model(self) -> EvolveProposalModel:
        """Convert to the Pydantic model used for serialization and logging.

        The benchmark tuples are not forwarded to the model.
        """
        exported_fields = (
            "rules_changes",
            "prompt_changes",
            "rules_text",
            "prompt_text",
            "analysis_raw",
            "failure_count",
            "applied",
            "rolled_back",
        )
        payload = {name: getattr(self, name) for name in exported_fields}
        return EvolveProposalModel(**payload)
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
class Evolver:
|
|
72
|
+
"""离线进化引擎。"""
|
|
73
|
+
|
|
74
|
+
def __init__(
    self,
    skill_name: str,
    skill_base_dir: Path | None = None,
    deepseek: DeepSeekClient | None = None,
    config_version_manager=None,
):
    """Set up the evolver for one skill.

    Args:
        skill_name: Name of the skill.
        skill_base_dir: Root directory of the skills, handed to ``SkillLoader``.
        deepseek: DeepSeek client used to analyze failure patterns.
        config_version_manager: ConfigVersionManager instance used to read and
            write stored configuration.
    """
    self.skill_name = skill_name
    self._deepseek = deepseek
    self._version_mgr = config_version_manager
    self._loader = SkillLoader(skill_base_dir)
|
|
92
|
+
|
|
93
|
+
def _load_failure_logs(self, date_str: str | None = None) -> list[dict]:
    """Read the records flagged ``is_failure`` from one day's JSONL log.

    Args:
        date_str: Date as ``YYYY-MM-DD``; defaults to yesterday.

    Returns:
        The log entries whose ``is_failure`` field is truthy. Returns an empty
        list when the log file does not exist or cannot be read.
    """
    target_date = date_str or _yesterday_str()
    log_path = _get_log_dir(self.skill_name) / f"{target_date}.jsonl"

    if not log_path.exists():
        logger.info("EvoSkill [%s] 日志文件不存在: %s", self.skill_name, log_path)
        return []

    failures: list[dict] = []
    try:
        with open(log_path, "r", encoding="utf-8") as f:
            for raw_line in f:
                line = raw_line.strip()
                if not line:
                    continue
                try:
                    entry = json.loads(line)
                except json.JSONDecodeError:
                    # Skip malformed lines; one bad line must not lose the day.
                    continue
                # A line may be valid JSON without being an object (a list or a
                # number). Calling .get() on it raised AttributeError, which the
                # outer handler turned into discarding every record.
                if isinstance(entry, dict) and entry.get("is_failure", False):
                    failures.append(entry)
    except Exception as e:
        logger.warning("EvoSkill [%s] 日志读取失败: %s", self.skill_name, e)
        return []

    logger.info("EvoSkill [%s] 读取 %d 条 is_failure 记录", self.skill_name, len(failures))
    return failures
|
|
129
|
+
|
|
130
|
+
def _run_benchmark(self, skill: SkillModule) -> tuple[int, int, list]:
    """Run the skill's ``benchmark()`` and return its metrics.

    Returns:
        ``(pass_count, total_count, failures_list)``. If the benchmark raises,
        the result is ``(0, 0, [<error message>])`` and a warning is logged.
    """
    try:
        outcome = skill.benchmark(self)
    except Exception as exc:
        logger.warning("EvoSkill [%s] benchmark 执行失败: %s", self.skill_name, exc)
        outcome = (0, 0, [str(exc)])
    return outcome
|
|
141
|
+
|
|
142
|
+
async def _analyze_failures(
    self,
    skill: SkillModule,
    failures: list[dict],
    current_rules: str,
    current_prompt: str,
) -> dict:
    """Ask DeepSeek to analyze the failure patterns.

    Args:
        skill: Loaded skill; its ``evolve_prompt_yaml`` overrides the framework default prompt.
        failures: Failure log entries; only the first 20 are sent.
        current_rules: Current rules configuration text.
        current_prompt: Current prompt configuration text.

    Returns:
        The parsed response, expected to look like
        ``{"rules_changes": {...}, "prompt_changes": {...}}``; ``{}`` when no
        client is configured or the request fails.
    """
    client = self._deepseek
    if not client:
        logger.warning("EvoSkill [%s] DeepSeek 客户端未配置,无法分析", self.skill_name)
        return {}

    # Prompt source: the skill's own evolve_prompt.yaml first, framework default second.
    prompt_cfg = skill.evolve_prompt_yaml or {}
    if not prompt_cfg:
        defaults_path = _FRAMEWORK_DEFAULTS / "evolve_prompt.yaml"
        if defaults_path.exists():
            prompt_cfg = yaml.safe_load(defaults_path.read_text(encoding="utf-8")) or {}

    system = prompt_cfg.get("system", "你是配置优化专家。")
    template = prompt_cfg.get(
        "analyze_template",
        "分析以下失败案例:\n{{failure_logs}}\n输出优化建议 JSON。",
    )

    # Serialize a bounded number of failures, keeping only the relevant fields.
    failure_texts = [
        json.dumps(
            {
                "input_summary": record.get("input_summary", {}),
                "rule_output": record.get("rule_output", {}),
                "ai_validation": record.get("ai_validation"),
                "ai_reselection": record.get("ai_reselection"),
                "final_output": record.get("final_output", {}),
                "warnings": record.get("warnings", []),
            },
            ensure_ascii=False,
            indent=2,
        )
        for record in failures[:20]
    ]

    # Placeholders are substituted one after another, in this order.
    substitutions = (
        ("{{skill_name}}", self.skill_name),
        ("{{current_rules_config}}", current_rules or "(空)"),
        ("{{current_prompt}}", current_prompt or "(空)"),
        ("{{failure_logs}}", "\n---\n".join(failure_texts)),
    )
    user_message = template
    for placeholder, replacement in substitutions:
        user_message = user_message.replace(placeholder, replacement)

    try:
        return await client.chat_json(
            messages=[
                {"role": "system", "content": system},
                {"role": "user", "content": user_message},
            ],
            temperature=0.3,
            max_tokens=4096,
        )
    except Exception as exc:
        logger.warning("EvoSkill [%s] 分析请求失败: %s", self.skill_name, exc)
        return {}
|
|
203
|
+
|
|
204
|
+
async def evolve(
    self,
    min_failure_samples: int = 10,
    dry_run: bool = True,
    date_str: str | None = None,
) -> EvolveProposal | None:
    """Run one evolution round.

    Args:
        min_failure_samples: Minimum number of failure samples; the round is
            skipped when fewer are available.
        dry_run: True analyzes only; False also writes through the version manager.
        date_str: Log date as ``YYYY-MM-DD``; defaults to yesterday.

    Returns:
        The ``EvolveProposal`` describing what was found and done, or ``None``
        when the skill cannot be loaded or has ``ai_role == "enhancement"``.
        ``proposal.applied`` is True only when at least one configuration was
        actually written and not rolled back.
    """
    logger.info(
        "EvoSkill [%s] 开始进化分析 (min_failure_samples=%d, dry_run=%s)",
        self.skill_name,
        min_failure_samples,
        dry_run,
    )

    proposal = EvolveProposal()

    # 1. Load the skill module.
    try:
        skill = self._loader.load(self.skill_name)
    except Exception as e:
        logger.error("EvoSkill [%s] 加载失败: %s", self.skill_name, e)
        return None

    # 2. Skills in the "enhancement" role never trigger evolution.
    if skill.ai_role == "enhancement":
        logger.info("EvoSkill [%s] ai_role=enhancement,不触发进化", self.skill_name)
        return None

    # 3. Read the failure records from the JSONL log.
    failures = self._load_failure_logs(date_str)
    proposal.failure_count = len(failures)

    if len(failures) < min_failure_samples:
        logger.info(
            "EvoSkill [%s] 失败样本不足 (got=%d, need=%d),跳过进化",
            self.skill_name,
            len(failures),
            min_failure_samples,
        )
        return proposal

    # 4. Baseline benchmark before any change.
    logger.info("EvoSkill [%s] 运行进化前 benchmark...", self.skill_name)
    proposal.benchmark_before = self._run_benchmark(skill)
    logger.info(
        "EvoSkill [%s] 进化前 benchmark: %d/%d 通过",
        self.skill_name,
        proposal.benchmark_before[0],
        proposal.benchmark_before[1],
    )

    # 5. Load the current configuration texts.
    current_rules = ""
    current_prompt = ""
    if self._version_mgr:
        current_rules = self._version_mgr.load_raw(self.skill_name, "rules_config") or ""
        current_prompt = self._version_mgr.load_raw(self.skill_name, "prompt") or ""

    # 6. DeepSeek analysis. Anything that is not a non-empty dict is "no result".
    analysis = await self._analyze_failures(skill, failures, current_rules, current_prompt)
    if not analysis or not isinstance(analysis, dict):
        logger.warning("EvoSkill [%s] DeepSeek 分析未产出结果", self.skill_name)
        return proposal

    # The model may answer with null or a non-object for either field; later
    # steps iterate these as dicts, so normalize them here.
    rules_changes = analysis.get("rules_changes")
    prompt_changes = analysis.get("prompt_changes")
    proposal.rules_changes = rules_changes if isinstance(rules_changes, dict) else {}
    proposal.prompt_changes = prompt_changes if isinstance(prompt_changes, dict) else {}

    if not proposal.rules_changes and not proposal.prompt_changes:
        logger.info("EvoSkill [%s] DeepSeek 未提出任何优化建议", self.skill_name)
        return proposal

    # 7. Render the new configuration texts, gated by evolve.toml permissions.
    evolve_cfg = skill.evolve_toml
    auto_cfg = evolve_cfg.get("evolve", {}).get("auto_modify", {})
    rules_permission = auto_cfg.get("rules_config", False)
    prompt_permission = auto_cfg.get("prompt", False)

    if proposal.rules_changes and rules_permission:
        # The permission may be a plain boolean or a table carrying
        # max_change_percent; calling .get() on a boolean raised AttributeError.
        if isinstance(rules_permission, dict):
            max_pct = rules_permission.get("max_change_percent", 20)
        else:
            max_pct = 20
        proposal.rules_text = self._apply_rules_changes(current_rules, proposal.rules_changes, max_pct)

    if proposal.prompt_changes and prompt_permission:
        proposal.prompt_text = self._apply_prompt_changes(current_prompt, proposal.prompt_changes)

    # 8. Persist the new texts.
    if not dry_run and self._version_mgr:
        guard = evolve_cfg.get("evolve", {}).get("guard", {})
        require_bench = guard.get("require_benchmark_pass", True)

        # Track what is really written so `applied` cannot claim a change
        # when both texts were empty or not permitted.
        wrote_rules = bool(proposal.rules_text and rules_permission)
        wrote_prompt = bool(proposal.prompt_text and prompt_permission)

        if wrote_rules:
            self._version_mgr.save(self.skill_name, "rules_config", proposal.rules_text)
            logger.info("EvoSkill [%s] rules_config 已写入 MySQL", self.skill_name)
        if wrote_prompt:
            self._version_mgr.save(self.skill_name, "prompt", proposal.prompt_text)
            logger.info("EvoSkill [%s] prompt 已写入 MySQL", self.skill_name)

        # 9. Benchmark safety net: re-run and roll back on regression.
        if require_bench and (wrote_rules or wrote_prompt):
            logger.info("EvoSkill [%s] 运行进化后 benchmark...", self.skill_name)
            # Drop the cached skill so the new configuration takes effect.
            self._loader.invalidate_cache(self.skill_name)
            skill_after = self._loader.load(self.skill_name)
            proposal.benchmark_after = self._run_benchmark(skill_after)

            pass_before = proposal.benchmark_before[0]
            total_before = proposal.benchmark_before[1]
            pass_after = proposal.benchmark_after[0]

            if total_before > 0 and pass_after < pass_before:
                logger.warning(
                    "EvoSkill [%s] benchmark 退化 (%d/%d → %d/%d),自动回滚",
                    self.skill_name,
                    pass_before,
                    total_before,
                    pass_after,
                    proposal.benchmark_after[1],
                )
                if wrote_rules:
                    self._version_mgr.rollback(self.skill_name, "rules_config", 0)
                if wrote_prompt:
                    self._version_mgr.rollback(self.skill_name, "prompt", 0)
                proposal.rolled_back = True
                proposal.applied = False
            else:
                proposal.applied = True
        else:
            proposal.applied = wrote_rules or wrote_prompt

    return proposal
|
|
343
|
+
|
|
344
|
+
def _apply_rules_changes(self, current_yaml: str, changes: dict, max_change_percent: float) -> str:
    """Merge numeric ``rules_changes`` from DeepSeek into the existing YAML text.

    The YAML is edited as text so comments and layout survive. For every
    ``key: <number>`` line whose key appears in ``changes`` with a numeric
    value, the number is replaced and the line's indentation is kept.
    Non-numeric values (including booleans and nested dicts) are ignored.

    Args:
        current_yaml: Existing YAML text; when empty, ``changes`` is dumped as new YAML.
        changes: Mapping of key name to new value.
        max_change_percent: Accepted for interface compatibility.
            NOTE(review): this limit is not enforced here; whether it bounds
            the relative change of each value or something else is not visible
            in this file. Confirm the intended semantics before relying on it.

    Returns:
        The updated YAML text.
    """
    import re  # local import, as in the original; `re` is not imported at module level

    if not current_yaml:
        return yaml.dump(changes, allow_unicode=True, default_flow_style=False)

    # A complete numeric scalar: optional sign, decimal part and exponent.
    number = r"[-+]?(?:\d+(?:\.\d*)?|\.\d+)(?:[eE][-+]?\d+)?"

    modified = current_yaml
    for key, value in changes.items():
        # bool is a subclass of int; writing "key: True" over a number is not
        # a threshold adjustment, so booleans are skipped explicitly.
        if isinstance(value, bool) or not isinstance(value, (int, float)):
            continue

        # [ \t]* instead of \s* so the match cannot start on an earlier blank
        # line and swallow it. The lookahead requires the number to be the
        # whole scalar, optionally followed by a comment.
        pattern = re.compile(
            rf"^(?P<indent>[ \t]*){re.escape(str(key))}[ \t]*:[ \t]*{number}(?=[ \t]*(?:#|\r?$))",
            flags=re.MULTILINE,
        )
        new_entry = f"{key}: {value}"
        # A function replacement keeps the captured indentation and avoids
        # backslash interpretation in the replacement text.
        modified = pattern.sub(lambda m, text=new_entry: m.group("indent") + text, modified)

    return modified
|
|
363
|
+
|
|
364
|
+
def _apply_prompt_changes(self, current_yaml: str, changes: dict) -> str:
    """Merge ``prompt_changes`` from DeepSeek into the existing YAML text.

    The current YAML is parsed, ``changes`` is merged into it recursively with
    ``_deep_update``, and the result is dumped back to text.

    Args:
        current_yaml: Existing YAML text; when empty, ``changes`` is dumped as new YAML.
        changes: Nested mapping of values to set.

    Returns:
        The merged YAML text, or ``current_yaml`` unchanged when parsing or
        merging fails (a warning is logged in that case).
    """
    if not current_yaml:
        return yaml.dump(changes, allow_unicode=True, default_flow_style=False)

    try:
        cfg = yaml.safe_load(current_yaml) or {}
        if not isinstance(cfg, dict):
            # A scalar or list at the top level cannot take key-based updates.
            raise TypeError(f"top-level YAML node is {type(cfg).__name__}, expected a mapping")
        _deep_update(cfg, changes)
        return yaml.dump(cfg, allow_unicode=True, default_flow_style=False)
    except Exception as exc:
        # Still best-effort, but no longer silent: the caller otherwise cannot
        # tell that the proposed changes were dropped.
        logger.warning("EvoSkill [%s] prompt 变更合并失败,保留原配置: %s", self.skill_name, exc)
        return current_yaml
|
|
378
|
+
|
|
379
|
+
|
|
380
|
+
def _deep_update(base: dict, updates: dict) -> None:
    """Merge ``updates`` into ``base`` in place.

    Where both sides hold a dict under the same key the merge recurses;
    in every other case the value from ``updates`` replaces the one in ``base``.
    """
    for name in updates:
        incoming = updates[name]
        existing = base.get(name)
        if not (isinstance(incoming, dict) and isinstance(existing, dict)):
            base[name] = incoming
            continue
        _deep_update(existing, incoming)
|