agentic-rubric-runner 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1 @@
1
+ """Document evaluation pipeline — Phase 1 generation + Phase 2 rubric grading."""
aarrr_agent/agent.py ADDED
@@ -0,0 +1,242 @@
1
+ """Phase 1 Agent tool-use 循环(状态机约束)。"""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ from pathlib import Path
7
+ from typing import Any
8
+
9
+ from openai import OpenAI
10
+
11
+ from aarrr_agent.config import MAX_AGENT_TURNS
12
+ from aarrr_agent.errors import PipelineError
13
+ from aarrr_agent.llm import call_chat_completion
14
+ from aarrr_agent.phase1_state import WRITE_TOOLS, Phase1State
15
+ from aarrr_agent.tools import TOOLS, Phase1ToolContext, dispatch_tool, save_trace
16
+
17
+ SYSTEM_PROMPT = """你是一个专业的增长分析 Agent,在受控状态机下完成任务。
18
+
19
+ 必须按顺序调用工具(不可跳步、不可乱序):
20
+ 1. read_text — 读取 query.txt
21
+ 2. read_pdf — 读取附件 PDF
22
+ 3. extract_evidence_pack — 抽取证据包 evidence_pack.json
23
+ 4. (可选)self_check_report — 对报告草稿自检
24
+ 5. write_structured_report 或 write_pdf_report — 提交最终报告(必须最后调用)
25
+
26
+ 重要约束:
27
+ - 必须通过工具读取文件,不得假设内容
28
+ - Phase 1 只能使用 query.txt 与附件 PDF
29
+ - 报告中所有关键事实必须来自附件,并引用证据编号,如:次日留存率是核心指标。[E01]
30
+ - 优先使用 write_structured_report 提交 JSON 结构化报告
31
+ - 报告须覆盖:单一北极星指标、健康/诊断分层、AARRR 五阶段、目标值、红黄预警、周/月/季复盘机制
32
+ - write 工具调用后不得再调用任何工具
33
+
34
+ 结构化报告 JSON 字段类型(write_structured_report 的 report 参数):
35
+ - title: 字符串
36
+ - executive_summary: 对象,如 {"overview":"...", "priority_actions":["..."]}
37
+ - north_star_metric: 对象,如 {"name":"...", "reason":"...", "evidence_refs":["E01"]}
38
+ - aarrr_stages: 数组,每项含 stage / health_metric / diagnostic_metrics
39
+ - warning_rules: 数组,每项含 metric / yellow / red(不要用 red_alerts 嵌套对象)
40
+ - review_cadence: 对象,如 {"weekly":"...", "monthly":"...", "quarterly":"..."}
41
+ - action_plan: 字符串数组,如 ["行动1", "行动2"]
42
+ - evidence_refs: 字符串数组,如 ["E01", "E02"]
43
+ 若 JSON 结构不确定,请改用 write_pdf_report 提交 Markdown 报告。
44
+ 若附件领域与任务不完全一致,仍应尽力基于附件事实完成报告;Phase 2 会对离题附件自动压低得分。
45
+
46
+ 完成后输出 "PHASE1_DONE"(仅当 write 工具已成功执行)。"""
47
+
48
+
49
def _message_to_dict(msg: Any) -> dict[str, Any]:
    """Flatten an SDK chat message object into a plain dict for the history.

    ``role`` is always copied. ``content`` is copied when it is not ``None``;
    when it is ``None`` but the message carries tool calls, an explicit
    ``"content": None`` entry is written instead of omitting the key. Every
    tool call becomes
    ``{"id", "type": "function", "function": {"name", "arguments"}}``.
    """
    serialized: dict[str, Any] = {"role": msg.role}
    calls = msg.tool_calls

    # DeepSeek/OpenAI require an assistant message that carries tool_calls to
    # state content=null explicitly, so the key is dropped only when there is
    # neither content nor any tool call.
    if msg.content is not None or calls:
        serialized["content"] = msg.content

    if not calls:
        return serialized

    flattened = []
    for call in calls:
        fn = call.function
        flattened.append(
            {
                "id": call.id,
                "type": "function",
                "function": {"name": fn.name, "arguments": fn.arguments},
            }
        )
    serialized["tool_calls"] = flattened
    return serialized
69
+
70
+
71
def _report_content_from_tool(tool_name: str, tool_args: dict[str, Any]) -> str | None:
    """Recover the report text carried by a write-tool call, if any.

    * ``write_structured_report``: the ``report`` argument (``{}`` when
      absent) is validated as a ``StructuredReport`` and rendered through
      ``structured_to_markdown``.
    * ``write_pdf_report``: the ``content`` argument is returned as-is
      (``None`` when absent).
    * any other tool: ``None``.
    """
    if tool_name == "write_structured_report":
        # Imported lazily, exactly where it is needed.
        from aarrr_agent.structured_report import StructuredReport, structured_to_markdown

        raw_report = tool_args.get("report", {})
        validated = StructuredReport.model_validate(raw_report)
        return structured_to_markdown(validated)

    return tool_args.get("content") if tool_name == "write_pdf_report" else None
81
+
82
+
83
def _execute_tool_turn(
    msg: Any,
    messages: list[dict[str, Any]],
    trace: list[dict[str, Any]],
    ctx: Phase1ToolContext,
) -> str | None:
    """Run every tool call of one assistant turn and append the tool replies.

    Replies are appended to ``messages`` in call order, directly after the
    assistant message that requested them:
    assistant(tool_calls) -> tool x N, with no user/assistant message in
    between. Exactly one ``tool`` reply is appended per tool call, including
    when the call fails.

    Args:
        msg: Assistant message whose ``tool_calls`` are executed.
        messages: Conversation history; mutated in place.
        trace: Tool-call trace list forwarded to ``dispatch_tool``.
        ctx: Phase 1 tool context forwarded to ``dispatch_tool``.

    Returns:
        The report text of the last write-tool call whose dispatch did not
        raise, or ``None`` when no such call happened in this turn.
    """
    report_content: str | None = None

    for tc in msg.tool_calls:
        tool_name = tc.function.name
        try:
            tool_args = json.loads(tc.function.arguments or "{}")
        except json.JSONDecodeError as exc:
            messages.append(
                {
                    "role": "tool",
                    "tool_call_id": tc.id,
                    "content": f"[工具错误] 参数 JSON 无效: {exc}",
                }
            )
            continue

        # json.loads accepts any JSON value; a list/str/number would crash the
        # later ``.get`` lookups, so report it to the model like invalid JSON.
        if not isinstance(tool_args, dict):
            messages.append(
                {
                    "role": "tool",
                    "tool_call_id": tc.id,
                    "content": (
                        "[工具错误] 参数必须是 JSON 对象,"
                        f"实际为 {type(tool_args).__name__}"
                    ),
                }
            )
            continue

        dispatched = False
        try:
            tool_result = dispatch_tool(tool_name, tool_args, trace, ctx=ctx)
        except PipelineError as exc:
            tool_result = f"[{exc.code}] {exc.message}"
        except Exception as exc:
            tool_result = f"[工具错误] {type(exc).__name__}: {exc}"
        else:
            dispatched = True

        # Only a call that dispatched without raising may contribute report
        # text; a rejected write must not overwrite an earlier accepted one.
        if dispatched:
            try:
                written = _report_content_from_tool(tool_name, tool_args)
            except Exception as exc:
                # Rebuilding the text is best-effort: never abort the turn
                # (and leave the tool call unanswered) because of it.
                print(
                    f"[Agent] Could not rebuild report text from {tool_name} "
                    f"args: {type(exc).__name__}: {exc}"
                )
                written = None
            if written:
                report_content = written

        messages.append(
            {"role": "tool", "tool_call_id": tc.id, "content": tool_result}
        )

    return report_content
125
+
126
+
127
def _finalize_phase1(
    ctx: Phase1ToolContext,
    trace: list[dict[str, Any]],
    *,
    phase1_done: bool,
) -> None:
    """Check that Phase 1 ended properly; raise when it did not.

    Returns silently when the state machine is in ``Phase1State.DONE``.
    Otherwise raises ``PipelineError("E003", ...)`` when the caller reports
    completion, or when the trace holds a failed write-tool entry (the most
    recent failure's ``error`` text is quoted). If neither applies, the final
    verdict is delegated to ``ctx.state.assert_complete``.
    """
    if ctx.state.state == Phase1State.DONE:
        return

    # Collect failed write-tool entries in trace order.
    failed_writes = []
    for entry in trace:
        if entry.get("tool") in WRITE_TOOLS and entry.get("status") == "error":
            failed_writes.append(entry)

    if phase1_done:
        raise PipelineError(
            "E003",
            "Agent 声称完成但 write_pdf_report / write_structured_report 未成功执行。"
            "请查看工具调用日志中的错误详情。",
        )

    if failed_writes:
        last_failure = failed_writes[-1]
        raise PipelineError(
            "E003",
            f"报告写入失败:{last_failure.get('error', '未知错误')}",
        )

    ctx.state.assert_complete(phase1_done=phase1_done)
155
+
156
+
157
def run_phase1_agent(
    query_path: str,
    pdf_path: str,
    pdf_output_path: str,
    client: OpenAI,
    model: str,
    trace: list[dict[str, Any]],
    emergency_trace_path: str = "agent_trace_emergency.jsonl",
) -> str:
    """Drive the Phase 1 tool-use loop and return the generated report text.

    The model is called up to ``MAX_AGENT_TURNS`` times with ``TOOLS``
    available. Tool calls are executed via ``_execute_tool_turn``; the loop
    stops once the state machine reaches ``Phase1State.DONE``, or when the
    model answers without tool calls and with ``finish_reason == "stop"``.

    Args:
        query_path: Path of the task/query text file.
        pdf_path: Path of the attachment PDF.
        pdf_output_path: Path where the report PDF is to be written.
        client: OpenAI-compatible client passed to ``call_chat_completion``.
        model: Model name passed to ``call_chat_completion``.
        trace: Tool-call trace list; passed on to the tool layer.
        emergency_trace_path: Where the trace is saved if the model call
            raises ``PipelineError``.

    Returns:
        The report text captured from a write-tool call, or, when none was
        captured, the contents of the ``.md`` file next to ``pdf_output_path``.

    Raises:
        PipelineError: Re-raised from ``call_chat_completion`` (after saving
            the trace), or raised by ``_finalize_phase1``.
        RuntimeError: No report text was captured and no ``.md`` file exists.
    """
    ctx = Phase1ToolContext(
        query_path=Path(query_path),
        pdf_path=Path(pdf_path),
        pdf_output_path=Path(pdf_output_path),
    )

    # Initial conversation: system rules plus a user message listing the
    # input/output paths the tools will work with.
    messages: list[dict[str, Any]] = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {
            "role": "user",
            "content": (
                f"请完成任务。\n"
                f"任务文件:{ctx.query_path}\n"
                f"PDF 附件:{ctx.pdf_path}\n"
                f"PDF 输出:{pdf_output_path}\n"
                f"证据包输出:{ctx.evidence_path}"
            ),
        },
    ]

    report_content: str | None = None
    phase1_done = False

    for turn in range(MAX_AGENT_TURNS):
        print(f"[Agent] Turn {turn + 1}/{MAX_AGENT_TURNS} [state={ctx.state.state.value}]...")
        try:
            response = call_chat_completion(
                client,
                model=model,
                messages=messages,
                tools=TOOLS,
                tool_choice="auto",
            )
        except PipelineError:
            # Persist what has been traced so far before propagating.
            save_trace(trace, emergency_trace_path)
            raise

        msg = response.choices[0].message
        messages.append(_message_to_dict(msg))

        if msg.tool_calls:
            written = _execute_tool_turn(msg, messages, trace, ctx)
            if written:
                report_content = written
            # The state machine, not the model's wording, decides completion.
            if ctx.state.state == Phase1State.DONE:
                phase1_done = True
                break
            continue

        if msg.content and "PHASE1_DONE" in msg.content:
            if ctx.state.state == Phase1State.DONE:
                phase1_done = True
                break
            # The model claimed completion without reaching DONE: push back
            # and give it another turn.
            messages.append(
                {
                    "role": "user",
                    "content": (
                        "报告尚未成功写入。你必须先调用 write_pdf_report 或 write_structured_report "
                        "并收到成功回执后,才能输出 PHASE1_DONE。"
                    ),
                }
            )
            continue

        # Plain answer with no tool calls and no completion marker: stop
        # looping and let _finalize_phase1 judge the outcome.
        if response.choices[0].finish_reason == "stop":
            break

    _finalize_phase1(ctx, trace, phase1_done=phase1_done)

    if not report_content:
        # Fall back to a Markdown file sharing the PDF output's stem.
        md_path = Path(pdf_output_path).with_suffix(".md")
        if md_path.exists():
            report_content = md_path.read_text(encoding="utf-8")
        else:
            raise RuntimeError("Agent 未生成报告内容")

    return report_content
@@ -0,0 +1,224 @@
1
+ @page {
2
+ size: A4;
3
+ margin: 18mm 16mm 20mm 16mm;
4
+ @bottom-left {
5
+ content: "文档评审控制台";
6
+ color: #64748b;
7
+ font-size: 9px;
8
+ }
9
+ @bottom-right {
10
+ content: "第 " counter(page) " 页";
11
+ color: #64748b;
12
+ font-size: 9px;
13
+ }
14
+ }
15
+
16
+ @page :first {
17
+ margin-top: 22mm;
18
+ }
19
+
20
+ * {
21
+ box-sizing: border-box;
22
+ }
23
+
24
+ body {
25
+ font-family: "Microsoft YaHei", "Noto Sans CJK SC", "PingFang SC", sans-serif;
26
+ font-size: 10.5pt;
27
+ line-height: 1.55;
28
+ color: #1e293b;
29
+ margin: 0;
30
+ padding: 0;
31
+ }
32
+
33
+ .cover {
34
+ min-height: 240mm;
35
+ display: flex;
36
+ flex-direction: column;
37
+ justify-content: center;
38
+ page-break-after: always;
39
+ }
40
+
41
+ .cover .eyebrow {
42
+ font-size: 10pt;
43
+ letter-spacing: 0.12em;
44
+ text-transform: uppercase;
45
+ color: #2563eb;
46
+ margin-bottom: 12px;
47
+ }
48
+
49
+ .cover h1 {
50
+ font-size: 26pt;
51
+ font-weight: 700;
52
+ color: #0f172a;
53
+ margin: 0 0 12px 0;
54
+ line-height: 1.25;
55
+ }
56
+
57
+ .cover .subtitle {
58
+ font-size: 12pt;
59
+ color: #64748b;
60
+ margin: 0 0 32px 0;
61
+ }
62
+
63
+ .cover .meta {
64
+ font-size: 9pt;
65
+ color: #94a3b8;
66
+ display: flex;
67
+ flex-direction: column;
68
+ gap: 4px;
69
+ }
70
+
71
+ .section {
72
+ margin-bottom: 20px;
73
+ }
74
+
75
+ .section.page-break {
76
+ page-break-before: always;
77
+ }
78
+
79
+ h2 {
80
+ font-size: 15pt;
81
+ color: #1e3a5f;
82
+ border-bottom: 2px solid #2563eb;
83
+ padding-bottom: 6px;
84
+ margin: 0 0 14px 0;
85
+ }
86
+
87
+ h3 {
88
+ font-size: 11pt;
89
+ color: #1e40af;
90
+ margin: 14px 0 8px 0;
91
+ }
92
+
93
+ p {
94
+ margin: 0 0 8px 0;
95
+ text-align: justify;
96
+ }
97
+
98
+ ul, ol {
99
+ margin: 0 0 10px 0;
100
+ padding-left: 20px;
101
+ }
102
+
103
+ blockquote.quote {
104
+ border-left: 3px solid #2563eb;
105
+ margin: 8px 0;
106
+ padding: 8px 12px;
107
+ background: #f8fafc;
108
+ color: #475569;
109
+ }
110
+
111
+ .summary-grid {
112
+ display: grid;
113
+ grid-template-columns: repeat(3, 1fr);
114
+ gap: 12px;
115
+ margin-bottom: 16px;
116
+ }
117
+
118
+ .metric-card {
119
+ border: 1px solid #cbd5e1;
120
+ border-radius: 8px;
121
+ padding: 14px;
122
+ background: #f8fafc;
123
+ break-inside: avoid;
124
+ }
125
+
126
+ .metric-card.primary {
127
+ background: #eff6ff;
128
+ border-color: #93c5fd;
129
+ border-left: 4px solid #1e40af;
130
+ }
131
+
132
+ .metric-card span {
133
+ display: block;
134
+ font-size: 8.5pt;
135
+ color: #64748b;
136
+ margin-bottom: 6px;
137
+ }
138
+
139
+ .metric-card strong {
140
+ font-size: 13pt;
141
+ color: #0f172a;
142
+ }
143
+
144
+ .bullet-list li {
145
+ margin-bottom: 4px;
146
+ }
147
+
148
+ .aarrr-flow {
149
+ display: flex;
150
+ gap: 6px;
151
+ margin: 16px 0;
152
+ break-inside: avoid;
153
+ }
154
+
155
+ .aarrr-stage {
156
+ flex: 1;
157
+ border: 1px solid #cbd5e1;
158
+ border-radius: 6px;
159
+ padding: 10px 8px;
160
+ background: #fff;
161
+ text-align: center;
162
+ }
163
+
164
+ .aarrr-stage .name {
165
+ font-weight: 700;
166
+ color: #1e40af;
167
+ font-size: 11pt;
168
+ margin-bottom: 6px;
169
+ }
170
+
171
+ .aarrr-stage .metric {
172
+ font-size: 8.5pt;
173
+ color: #334155;
174
+ margin-bottom: 4px;
175
+ }
176
+
177
+ .aarrr-stage .diag {
178
+ font-size: 7.5pt;
179
+ color: #64748b;
180
+ }
181
+
182
+ table {
183
+ width: 100%;
184
+ border-collapse: collapse;
185
+ margin: 10px 0 16px 0;
186
+ font-size: 9pt;
187
+ }
188
+
189
+ thead {
190
+ display: table-header-group;
191
+ }
192
+
193
+ th {
194
+ background: #1e40af;
195
+ color: #fff;
196
+ padding: 8px;
197
+ text-align: left;
198
+ font-weight: 600;
199
+ }
200
+
201
+ td {
202
+ border: 1px solid #cbd5e1;
203
+ padding: 7px 8px;
204
+ vertical-align: top;
205
+ }
206
+
207
+ tbody tr:nth-child(even) {
208
+ background: #f8fafc;
209
+ }
210
+
211
+ .warn-green { background: #ecfdf5 !important; }
212
+ .warn-yellow { background: #fffbeb !important; }
213
+ .warn-red { background: #fef2f2 !important; }
214
+
215
+ .appendix {
216
+ font-size: 9pt;
217
+ color: #64748b;
218
+ }
219
+
220
+ code {
221
+ font-family: Consolas, monospace;
222
+ font-size: 9pt;
223
+ color: #0f766e;
224
+ }
@@ -0,0 +1,206 @@
1
+ """附件领域相关性检测与评分门控。"""
2
+
3
+ from __future__ import annotations
4
+
5
+ import re
6
+ from typing import Any
7
+
8
+ from aarrr_agent.schemas import GradingResult
9
+
10
# Growth/AARRR domain terms the task expects the attachment to contain
# (matched against the attachment body only).
# Entries must be unique: every entry found in the attachment counts as one
# hit toward the relevance threshold, so a repeated entry would be counted
# twice for a single term.
_DOMAIN_KEYWORDS = (
    "社交电商",
    "AARRR",
    "用户增长",
    "获客",
    "激活",
    "留存",
    "变现",
    "传播",
    "北极星",
    "GMV",
    "留存率",
    "病毒系数",
    "终身价值",
    "获客成本",
    "裂变",
    "分享推荐",
    "拼团",
)

# Signals that clearly fall outside the task domain (an attachment containing
# them is treated as an off-topic source document).
_OFF_DOMAIN_SIGNALS = (
    "樱桃",
    "栽培",
    "果树",
    "DNS",
    "dns",
    "中继",
    "中继服务器",
    "RCODE",
    "dig ",
    "FORMERR",
    "select()",
    "dnsrelay",
    "dnsperf",
    "计算机组成",
    "实验报告",
    "电路",
    "汇编",
    "农作物",
    "病虫害",
    "土壤",
    "G网",
    "路由表",
    "域名解析",
    "机器人",
    "智能机器人",
    "智能车",
    "实践训练",
    "训练指导书",
    "指导书",
    "嵌入式",
    "单片机",
    "STM32",
)

# Typical hallucinated wording seen when a report force-fits an off-topic
# attachment onto the task (matched case-insensitively).
_FORCED_ANALOGY = re.compile(
    r"DNS|select\s*\(|RCODE|dnsrelay|dnsperf|上游中继|本地解析|主循环|client_fd|upstream",
    re.I,
)
73
+
74
+
75
def assess_attachment_domain(attachment_text: str, query_text: str = "") -> dict[str, Any]:
    """Decide whether the attachment belongs to the domain the task requires.

    Only the first 120000 characters of the attachment body are inspected.
    The attachment counts as relevant when at least 3 distinct domain keywords
    occur in it and they outnumber the distinct off-domain signals found.

    Args:
        attachment_text: Extracted attachment text; ``None`` or ``""`` is
            treated as an empty body (never relevant).
        query_text: Accepted for interface compatibility only. It is
            deliberately not used, so that query wording cannot make an
            off-topic attachment (e.g. a DNS document) look relevant.

    Returns:
        A dict with ``relevant`` (bool), ``domain_hits`` and
        ``off_domain_hits`` (matched terms in declaration order) and their
        counts ``domain_hit_count`` / ``off_domain_hit_count``.
    """
    body = (attachment_text or "")[:120000]

    # dict.fromkeys drops repeated entries while keeping declaration order, so
    # a term listed twice in the keyword tuples is still counted once.
    domain_hits = [kw for kw in dict.fromkeys(_DOMAIN_KEYWORDS) if kw in body]
    off_hits = [kw for kw in dict.fromkeys(_OFF_DOMAIN_SIGNALS) if kw in body]

    # At least 3 domain terms, and strictly more of them than off-domain signals.
    relevant = len(domain_hits) >= 3 and len(domain_hits) > len(off_hits)

    return {
        "relevant": relevant,
        "domain_hits": domain_hits,
        "off_domain_hits": off_hits,
        "domain_hit_count": len(domain_hits),
        "off_domain_hit_count": len(off_hits),
    }
97
+
98
+
99
def format_e007_user_message(
    assessment: dict[str, Any] | None = None,
    *,
    filename: str = "",
) -> str:
    """Build the user-facing E007 explanation.

    The message stresses that this is an expected interception rather than a
    system fault. It always contains the domain-mismatch headline and the
    closing hint; the file name and up to 8 off-domain signals from
    ``assessment["off_domain_hits"]`` are inserted in between when available.
    Parts are joined with a single space.
    """
    hits = (assessment or {}).get("off_domain_hits", [])
    off_signals = ", ".join(hits[:8])

    headline = "附件与 query 要求的「社交电商 / AARRR 用户增长策略」领域不一致,无法据此生成指标方案。"
    closing = "可继续运行完整评审;Phase 2 会对离题附件自动压低得分。"

    details = []
    if filename:
        details.append(f"当前文件:{filename}")
    if off_signals:
        details.append(f"离题信号:{off_signals}")

    return " ".join([headline, *details, closing])
117
+
118
+
119
def preflight_attachment_pdf(pdf_path: str) -> dict[str, Any]:
    """Quickly assess the attachment's domain before Phase 1 runs.

    Intended to avoid spending API calls on an off-topic PDF. The PDF text is
    read with ``aarrr_agent.tools.read_pdf`` and handed to
    ``assess_attachment_domain``, whose result is returned unchanged.
    """
    from aarrr_agent.tools import read_pdf

    attachment_text = read_pdf(pdf_path)
    return assess_attachment_domain(attachment_text)
124
+
125
+
126
def detect_forced_analogy_report(report_text: str, attachment_text: str) -> bool:
    """Tell whether the report force-fits an off-topic attachment onto growth metrics.

    Returns ``True`` only when the attachment is assessed as not relevant and
    the report text matches the ``_FORCED_ANALOGY`` pattern (e.g. DNS-lab
    wording). A relevant attachment always yields ``False``.
    """
    attachment_is_relevant = assess_attachment_domain(attachment_text)["relevant"]
    return not attachment_is_relevant and bool(_FORCED_ANALOGY.search(report_text))
132
+
133
+
134
def h15_failed(result: GradingResult) -> bool:
    """Report whether H15 (key facts are traceable) failed.

    Looks at the first hard constraint whose id is ``"H15"`` and returns
    whether its score equals 0; returns ``False`` when no such constraint
    is present.
    """
    for constraint in result.hard_constraints:
        if constraint.id == "H15":
            return constraint.score == 0
    return False
138
+
139
+
140
def should_enforce_attachment_gate(
    result: GradingResult,
    assessment: dict[str, Any],
    *,
    forced_analogy: bool,
) -> bool:
    """Decide whether the programmatic score gate must fire.

    The gate fires when the report uses a forced analogy, when the attachment
    is assessed as off-topic (``assessment["relevant"]`` is falsy), or,
    failing both, when H15 did not pass for ``result``.
    """
    if forced_analogy or not assessment["relevant"]:
        return True
    return h15_failed(result)
152
+
153
+
154
def _zero_out_constraint(constraint: Any, missing_note: str, gate_reason: str) -> None:
    """Set one graded constraint to 0, add ``missing_note`` once, keep the old reason."""
    constraint.score = 0
    # dict.fromkeys de-duplicates while preserving the existing order.
    constraint.missing = list(dict.fromkeys([*constraint.missing, missing_note]))
    constraint.reason = f"{gate_reason} 原评审:{constraint.reason}"


def enforce_attachment_gate(
    result: GradingResult,
    rubrics: dict[str, Any],
    attachment_text: str,
    query_text: str = "",
    report_text: str = "",
) -> tuple[GradingResult, dict[str, Any]]:
    """Force scores down when the attachment does not fit the task domain.

    When the gate fires (off-topic attachment, forced analogy, or H15 failed),
    every hard constraint listed in the rubric except H01 (PDF format) is set
    to 0, as are all soft and optional constraints. ``result`` is mutated in
    place. When the gate does not fire, ``result`` is returned untouched.

    Args:
        result: Grading result to adjust.
        rubrics: Rubric document; ``rubrics["rubric"]["hard_constraints"]``
            determines which ``Hnn`` ids are considered.
        attachment_text: Extracted attachment text used for the domain check.
        query_text: Forwarded to ``assess_attachment_domain``.
        report_text: Report text checked for forced-analogy wording.

    Returns:
        ``(result, assessment)`` where ``assessment`` is the dict produced by
        ``assess_attachment_domain``.
    """
    assessment = assess_attachment_domain(attachment_text, query_text)
    forced_analogy = detect_forced_analogy_report(report_text, attachment_text)
    if not should_enforce_attachment_gate(result, assessment, forced_analogy=forced_analogy):
        return result, assessment

    rubric = rubrics["rubric"]
    if not assessment["relevant"] or forced_analogy:
        gate_reason = (
            f"程序门控:附件与社交电商/AARRR 增长领域不匹配"
            f"(附件领域词 {assessment['domain_hit_count']} 个,"
            f"离题信号 {assessment['off_domain_hit_count']} 个:"
            f"{', '.join(assessment['off_domain_hits'][:6]) or '无'})。"
        )
        if forced_analogy:
            gate_reason += " 报告将离题附件(如 DNS 实验)强行类比为增长指标,事实不可追溯。"
    else:
        gate_reason = (
            "程序门控:H15 未通过(关键事实无法追溯到附件),"
            "硬约束 H02-H15 与全部软/可选项归零。"
        )

    # Index graded hard constraints once; the first occurrence of an id wins.
    graded_hard: dict[str, Any] = {}
    for constraint in result.hard_constraints:
        graded_hard.setdefault(constraint.id, constraint)

    for i, _item in enumerate(rubric["hard_constraints"], 1):
        cid = f"H{i:02d}"
        # Off-topic attachment: only the PDF-format hard constraint may keep its score.
        if cid == "H01":
            continue
        hc = graded_hard.get(cid)
        if hc is None:
            # The grading result has no entry for this rubric item; there is
            # nothing to zero, and this must not abort the whole gate.
            continue
        _zero_out_constraint(hc, "附件领域不匹配或事实伪造", gate_reason)

    for sc in result.soft_constraints:
        _zero_out_constraint(sc, "离题附件不支持软约束给分", gate_reason)

    for oc in result.optional_constraints:
        _zero_out_constraint(oc, "离题附件", gate_reason)

    return result, assessment