PyPI - agentic-rubric-runner - Versions diffs - 0.5.0__py3-none-any.whl - Mend

agentic-rubric-runner 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (34) hide show

aarrr_agent/__init__.py +1 -0
aarrr_agent/agent.py +242 -0
aarrr_agent/assets/executive_report.css +224 -0
aarrr_agent/attachment_relevance.py +206 -0
aarrr_agent/benchmark.py +815 -0
aarrr_agent/cli.py +435 -0
aarrr_agent/config.py +33 -0
aarrr_agent/env.py +17 -0
aarrr_agent/errors.py +21 -0
aarrr_agent/evidence.py +141 -0
aarrr_agent/grader.py +253 -0
aarrr_agent/grading_calibration.py +35 -0
aarrr_agent/grading_report.py +298 -0
aarrr_agent/html_pdf.py +79 -0
aarrr_agent/html_report.py +28 -0
aarrr_agent/llm.py +47 -0
aarrr_agent/md_report_parser.py +310 -0
aarrr_agent/pdf_gen.py +437 -0
aarrr_agent/phase1_state.py +83 -0
aarrr_agent/pipeline.py +232 -0
aarrr_agent/report_models.py +45 -0
aarrr_agent/reporting.py +46 -0
aarrr_agent/retrieval.py +88 -0
aarrr_agent/schemas.py +96 -0
aarrr_agent/structured_report.py +341 -0
aarrr_agent/templates/executive_report.html +109 -0
aarrr_agent/tools.py +415 -0
aarrr_agent/validation.py +61 -0
aarrr_agent/web_app.py +548 -0
agentic_rubric_runner-0.5.0.dist-info/METADATA +596 -0
agentic_rubric_runner-0.5.0.dist-info/RECORD +34 -0
agentic_rubric_runner-0.5.0.dist-info/WHEEL +4 -0
agentic_rubric_runner-0.5.0.dist-info/entry_points.txt +3 -0
agentic_rubric_runner-0.5.0.dist-info/licenses/LICENSE +21 -0

aarrr_agent/__init__.py ADDED Viewed

	@@ -0,0 +1 @@
1	+ """Document evaluation pipeline — Phase 1 generation + Phase 2 rubric grading."""

aarrr_agent/agent.py ADDED Viewed

@@ -0,0 +1,242 @@
+"""Phase 1 Agent tool-use 循环（状态机约束）。"""
+from __future__ import annotations
+import json
+from pathlib import Path
+from typing import Any
+from openai import OpenAI
+from aarrr_agent.config import MAX_AGENT_TURNS
+from aarrr_agent.errors import PipelineError
+from aarrr_agent.llm import call_chat_completion
+from aarrr_agent.phase1_state import WRITE_TOOLS, Phase1State
+from aarrr_agent.tools import TOOLS, Phase1ToolContext, dispatch_tool, save_trace
+# System prompt sent as the first message of the Phase 1 conversation.
+# The text is runtime data for the model and is intentionally kept in Chinese.
+# It prescribes the tool order (read_text, read_pdf, extract_evidence_pack,
+# optional self_check_report, then one write tool), requires evidence
+# citations such as [E01], describes the JSON field shapes accepted by
+# write_structured_report, and defines the "PHASE1_DONE" completion marker
+# that run_phase1_agent searches for in the assistant's reply.
+SYSTEM_PROMPT = """你是一个专业的增长分析 Agent，在受控状态机下完成任务。
+必须按顺序调用工具（不可跳步、不可乱序）：
+1. read_text — 读取 query.txt
+2. read_pdf — 读取附件 PDF
+3. extract_evidence_pack — 抽取证据包 evidence_pack.json
+4. （可选）self_check_report — 对报告草稿自检
+5. write_structured_report 或 write_pdf_report — 提交最终报告（必须最后调用）
+重要约束：
+- 必须通过工具读取文件，不得假设内容
+- Phase 1 只能使用 query.txt 与附件 PDF
+- 报告中所有关键事实必须来自附件，并引用证据编号，如：次日留存率是核心指标。[E01]
+- 优先使用 write_structured_report 提交 JSON 结构化报告
+- 报告须覆盖：单一北极星指标、健康/诊断分层、AARRR 五阶段、目标值、红黄预警、周/月/季复盘机制
+- write 工具调用后不得再调用任何工具
+结构化报告 JSON 字段类型（write_structured_report 的 report 参数）：
+- title: 字符串
+- executive_summary: 对象，如 {"overview":"...", "priority_actions":["..."]}
+- north_star_metric: 对象，如 {"name":"...", "reason":"...", "evidence_refs":["E01"]}
+- aarrr_stages: 数组，每项含 stage / health_metric / diagnostic_metrics
+- warning_rules: 数组，每项含 metric / yellow / red（不要用 red_alerts 嵌套对象）
+- review_cadence: 对象，如 {"weekly":"...", "monthly":"...", "quarterly":"..."}
+- action_plan: 字符串数组，如 ["行动1", "行动2"]
+- evidence_refs: 字符串数组，如 ["E01", "E02"]
+若 JSON 结构不确定，请改用 write_pdf_report 提交 Markdown 报告。
+若附件领域与任务不完全一致，仍应尽力基于附件事实完成报告；Phase 2 会对离题附件自动压低得分。
+完成后输出 "PHASE1_DONE"（仅当 write 工具已成功执行）。"""
+def _message_to_dict(msg: Any) -> dict[str, Any]:
+    """Convert an SDK assistant message object into a plain history dict.
+
+    The result always has ``role``. ``content`` is included when the message
+    has text, or as an explicit ``None`` when the message carries tool calls.
+    ``tool_calls`` is included only when the message has any.
+    """
+    tool_calls = msg.tool_calls
+    payload: dict[str, Any] = {"role": msg.role}
+    # DeepSeek/OpenAI require an assistant message with tool_calls to state
+    # content explicitly, even when it is null; a message with neither text
+    # nor tool calls gets no content key at all.
+    if msg.content is not None or tool_calls:
+        payload["content"] = msg.content
+    if not tool_calls:
+        return payload
+    serialized = []
+    for call in tool_calls:
+        fn = call.function
+        serialized.append(
+            {
+                "id": call.id,
+                "type": "function",
+                "function": {"name": fn.name, "arguments": fn.arguments},
+            }
+        )
+    payload["tool_calls"] = serialized
+    return payload
+def _report_content_from_tool(tool_name: str, tool_args: dict[str, Any]) -> str | None:
+    """Return the report text implied by a write-tool call, if any.
+
+    For ``write_pdf_report`` this is the ``content`` argument as given. For
+    ``write_structured_report`` the ``report`` argument is validated as a
+    ``StructuredReport`` and rendered with ``structured_to_markdown``.
+    Any other tool name yields ``None``.
+    """
+    if tool_name == "write_structured_report":
+        # Function-scope import: only this branch needs the structured-report module.
+        from aarrr_agent.structured_report import StructuredReport, structured_to_markdown
+        report_model = StructuredReport.model_validate(tool_args.get("report", {}))
+        return structured_to_markdown(report_model)
+    if tool_name != "write_pdf_report":
+        return None
+    return tool_args.get("content")
+def _execute_tool_turn(
+    msg: Any,
+    messages: list[dict[str, Any]],
+    trace: list[dict[str, Any]],
+    ctx: Phase1ToolContext,
+) -> str | None:
+    """
+    Run every tool call of one assistant message and append the replies.
+
+    The history must read assistant(tool_calls) -> tool x N with no user or
+    assistant message in between, so exactly one ``tool`` message is appended
+    per tool call, including calls that fail.
+
+    Args:
+        msg: Assistant message whose ``tool_calls`` are executed in order.
+        messages: Conversation history; mutated in place.
+        trace: Tool-call trace, passed through to ``dispatch_tool``.
+        ctx: Phase 1 tool context, passed through to ``dispatch_tool``.
+
+    Returns:
+        The report text of the last write tool whose dispatch did not raise,
+        or ``None`` when no such call happened in this turn.
+    """
+    report_content: str | None = None
+    for tc in msg.tool_calls:
+        tool_name = tc.function.name
+        try:
+            tool_args = json.loads(tc.function.arguments or "{}")
+        except json.JSONDecodeError as exc:
+            messages.append(
+                {
+                    "role": "tool",
+                    "tool_call_id": tc.id,
+                    "content": f"[工具错误] 参数 JSON 无效: {exc}",
+                }
+            )
+            continue
+        if not isinstance(tool_args, dict):
+            # Valid JSON that is not an object (list, string, number) cannot
+            # be used as keyword-style arguments; answer the call with an error
+            # instead of letting a later .get() raise.
+            messages.append(
+                {
+                    "role": "tool",
+                    "tool_call_id": tc.id,
+                    "content": "[工具错误] 参数必须是 JSON 对象",
+                }
+            )
+            continue
+        dispatched = True
+        try:
+            tool_result = dispatch_tool(tool_name, tool_args, trace, ctx=ctx)
+        except PipelineError as exc:
+            dispatched = False
+            tool_result = f"[{exc.code}] {exc.message}"
+        except Exception as exc:
+            dispatched = False
+            tool_result = f"[工具错误] {type(exc).__name__}: {exc}"
+        if dispatched:
+            # Only a write that did not raise may define the report; a rejected
+            # write must not replace content from an earlier successful one.
+            try:
+                written = _report_content_from_tool(tool_name, tool_args)
+            except Exception:
+                # Rendering can fail (for example on report validation); that
+                # must not abort the turn before the tool reply is appended.
+                written = None
+            if written:
+                report_content = written
+        messages.append(
+            {"role": "tool", "tool_call_id": tc.id, "content": tool_result}
+        )
+    return report_content
+def _finalize_phase1(
+    ctx: Phase1ToolContext,
+    trace: list[dict[str, Any]],
+    *,
+    phase1_done: bool,
+) -> None:
+    """Check that Phase 1 ended properly and raise otherwise.
+
+    Returns silently when the state machine is already DONE. Otherwise raises
+    ``PipelineError("E003", ...)`` if the agent claimed completion or the trace
+    holds a failed write-tool entry, and finally defers to
+    ``ctx.state.assert_complete``.
+    """
+    if ctx.state.state == Phase1State.DONE:
+        return
+    # Collect trace entries of write tools that ended with an error status.
+    failed_writes = []
+    for entry in trace:
+        if entry.get("tool") in WRITE_TOOLS and entry.get("status") == "error":
+            failed_writes.append(entry)
+    if phase1_done:
+        raise PipelineError(
+            "E003",
+            "Agent 声称完成但 write_pdf_report / write_structured_report 未成功执行。"
+            "请查看工具调用日志中的错误详情。",
+        )
+    if failed_writes:
+        # Report the most recent write failure.
+        last_error = failed_writes[-1].get("error", "未知错误")
+        raise PipelineError("E003", f"报告写入失败：{last_error}")
+    ctx.state.assert_complete(phase1_done=phase1_done)
+def run_phase1_agent(
+    query_path: str,
+    pdf_path: str,
+    pdf_output_path: str,
+    client: OpenAI,
+    model: str,
+    trace: list[dict[str, Any]],
+    emergency_trace_path: str = "agent_trace_emergency.jsonl",
+) -> str:
+    """Drive the Phase 1 tool-use loop and return the report text.
+
+    Args:
+        query_path: Path of the task file named in the opening user message.
+        pdf_path: Path of the attachment PDF.
+        pdf_output_path: Path where the report PDF is to be written.
+        client: OpenAI-compatible client handed to ``call_chat_completion``.
+        model: Model name for the chat completion calls.
+        trace: Tool-call trace list shared with the tool layer.
+        emergency_trace_path: Where the trace is saved if a completion call
+            raises ``PipelineError``.
+
+    Returns:
+        The report content captured from a write-tool call, or the text of
+        the ``.md`` file next to ``pdf_output_path`` when none was captured.
+
+    Raises:
+        PipelineError: Re-raised from ``call_chat_completion`` (after saving
+            the trace) or raised by ``_finalize_phase1``.
+        RuntimeError: No report content was captured and no ``.md`` file exists.
+    """
+    ctx = Phase1ToolContext(
+        query_path=Path(query_path),
+        pdf_path=Path(pdf_path),
+        pdf_output_path=Path(pdf_output_path),
+    )
+    # Opening history: the system prompt plus one user message listing the paths.
+    messages: list[dict[str, Any]] = [
+        {"role": "system", "content": SYSTEM_PROMPT},
+        {
+            "role": "user",
+            "content": (
+                f"请完成任务。\n"
+                f"任务文件：{ctx.query_path}\n"
+                f"PDF 附件：{ctx.pdf_path}\n"
+                f"PDF 输出：{pdf_output_path}\n"
+                f"证据包输出：{ctx.evidence_path}"
+            ),
+        },
+    ]
+    report_content: str | None = None
+    phase1_done = False
+    # At most MAX_AGENT_TURNS completion calls are made.
+    for turn in range(MAX_AGENT_TURNS):
+        print(f"[Agent] Turn {turn + 1}/{MAX_AGENT_TURNS} [state={ctx.state.state.value}]...")
+        try:
+            response = call_chat_completion(
+                client,
+                model=model,
+                messages=messages,
+                tools=TOOLS,
+                tool_choice="auto",
+            )
+        except PipelineError:
+            # Persist what has been traced so far before propagating the failure.
+            save_trace(trace, emergency_trace_path)
+            raise
+        msg = response.choices[0].message
+        # The assistant message must enter the history before its tool replies.
+        messages.append(_message_to_dict(msg))
+        if msg.tool_calls:
+            written = _execute_tool_turn(msg, messages, trace, ctx)
+            if written:
+                report_content = written
+            # The state machine, not the model's text, decides completion.
+            if ctx.state.state == Phase1State.DONE:
+                phase1_done = True
+                break
+            continue
+        if msg.content and "PHASE1_DONE" in msg.content:
+            if ctx.state.state == Phase1State.DONE:
+                phase1_done = True
+                break
+            # The model claimed completion without a successful write: tell it
+            # to call a write tool first, then keep looping.
+            messages.append(
+                {
+                    "role": "user",
+                    "content": (
+                        "报告尚未成功写入。你必须先调用 write_pdf_report 或 write_structured_report "
+                        "并收到成功回执后，才能输出 PHASE1_DONE。"
+                    ),
+                }
+            )
+            continue
+        # Plain text answer with a normal stop and no completion marker: give up looping.
+        if response.choices[0].finish_reason == "stop":
+            break
+    _finalize_phase1(ctx, trace, phase1_done=phase1_done)
+    if not report_content:
+        # Fall back to the Markdown file sharing the PDF output's base name.
+        md_path = Path(pdf_output_path).with_suffix(".md")
+        if md_path.exists():
+            report_content = md_path.read_text(encoding="utf-8")
+        else:
+            raise RuntimeError("Agent 未生成报告内容")
+    return report_content

aarrr_agent/assets/executive_report.css ADDED Viewed

@@ -0,0 +1,224 @@
+/* Page setup: A4 sheets with a fixed label in the bottom-left margin box and
+   a page counter in the bottom-right margin box. */
+@page {
+  size: A4;
+  margin: 18mm 16mm 20mm 16mm;
+  @bottom-left {
+    content: "文档评审控制台";
+    color: #64748b;
+    font-size: 9px;
+  }
+  @bottom-right {
+    content: "第 " counter(page) " 页";
+    color: #64748b;
+    font-size: 9px;
+  }
+}
+/* The first page gets a larger top margin than the rest. */
+@page :first {
+  margin-top: 22mm;
+}
+/* Global reset: widths and heights include padding and border. */
+* {
+  box-sizing: border-box;
+}
+/* Base typography: CJK-capable font stack, dark slate text, no body margin
+   (page margins come from the @page rules). */
+body {
+  font-family: "Microsoft YaHei", "Noto Sans CJK SC", "PingFang SC", sans-serif;
+  font-size: 10.5pt;
+  line-height: 1.55;
+  color: #1e293b;
+  margin: 0;
+  padding: 0;
+}
+/* Cover page: vertically centred column that forces a page break after itself. */
+.cover {
+  min-height: 240mm;
+  display: flex;
+  flex-direction: column;
+  justify-content: center;
+  page-break-after: always;
+}
+/* Small upper-case label above the title. */
+.cover .eyebrow {
+  font-size: 10pt;
+  letter-spacing: 0.12em;
+  text-transform: uppercase;
+  color: #2563eb;
+  margin-bottom: 12px;
+}
+/* Report title. */
+.cover h1 {
+  font-size: 26pt;
+  font-weight: 700;
+  color: #0f172a;
+  margin: 0 0 12px 0;
+  line-height: 1.25;
+}
+/* Subtitle line under the title. */
+.cover .subtitle {
+  font-size: 12pt;
+  color: #64748b;
+  margin: 0 0 32px 0;
+}
+/* Stacked metadata lines at the bottom of the cover block. */
+.cover .meta {
+  font-size: 9pt;
+  color: #94a3b8;
+  display: flex;
+  flex-direction: column;
+  gap: 4px;
+}
+/* Content sections; add .page-break to start a section on a new page. */
+.section {
+  margin-bottom: 20px;
+}
+.section.page-break {
+  page-break-before: always;
+}
+/* Section heading with a blue underline. */
+h2 {
+  font-size: 15pt;
+  color: #1e3a5f;
+  border-bottom: 2px solid #2563eb;
+  padding-bottom: 6px;
+  margin: 0 0 14px 0;
+}
+/* Sub-heading. */
+h3 {
+  font-size: 11pt;
+  color: #1e40af;
+  margin: 14px 0 8px 0;
+}
+/* Body paragraphs are justified. */
+p {
+  margin: 0 0 8px 0;
+  text-align: justify;
+}
+ul, ol {
+  margin: 0 0 10px 0;
+  padding-left: 20px;
+}
+/* Quoted passage: blue left rule on a light background. */
+blockquote.quote {
+  border-left: 3px solid #2563eb;
+  margin: 8px 0;
+  padding: 8px 12px;
+  background: #f8fafc;
+  color: #475569;
+}
+/* Summary area: three equal-width columns of metric cards. */
+.summary-grid {
+  display: grid;
+  grid-template-columns: repeat(3, 1fr);
+  gap: 12px;
+  margin-bottom: 16px;
+}
+/* A single metric card; kept whole across page breaks. */
+.metric-card {
+  border: 1px solid #cbd5e1;
+  border-radius: 8px;
+  padding: 14px;
+  background: #f8fafc;
+  break-inside: avoid;
+}
+/* Highlighted variant of the card with a blue tint and a thick left border. */
+.metric-card.primary {
+  background: #eff6ff;
+  border-color: #93c5fd;
+  border-left: 4px solid #1e40af;
+}
+/* Card label (small, muted) shown on its own line. */
+.metric-card span {
+  display: block;
+  font-size: 8.5pt;
+  color: #64748b;
+  margin-bottom: 6px;
+}
+/* Card value (large, dark). */
+.metric-card strong {
+  font-size: 13pt;
+  color: #0f172a;
+}
+.bullet-list li {
+  margin-bottom: 4px;
+}
+/* AARRR funnel strip: a horizontal row of stage boxes, kept on one page. */
+.aarrr-flow {
+  display: flex;
+  gap: 6px;
+  margin: 16px 0;
+  break-inside: avoid;
+}
+/* One stage box; flex: 1 gives every stage the same width. */
+.aarrr-stage {
+  flex: 1;
+  border: 1px solid #cbd5e1;
+  border-radius: 6px;
+  padding: 10px 8px;
+  background: #fff;
+  text-align: center;
+}
+/* Stage name. */
+.aarrr-stage .name {
+  font-weight: 700;
+  color: #1e40af;
+  font-size: 11pt;
+  margin-bottom: 6px;
+}
+/* Metric line inside a stage box. */
+.aarrr-stage .metric {
+  font-size: 8.5pt;
+  color: #334155;
+  margin-bottom: 4px;
+}
+/* Smaller, muted secondary line inside a stage box. */
+.aarrr-stage .diag {
+  font-size: 7.5pt;
+  color: #64748b;
+}
+/* Tables: full width with collapsed borders. */
+table {
+  width: 100%;
+  border-collapse: collapse;
+  margin: 10px 0 16px 0;
+  font-size: 9pt;
+}
+/* table-header-group lets paged renderers repeat the header row on each page. */
+thead {
+  display: table-header-group;
+}
+th {
+  background: #1e40af;
+  color: #fff;
+  padding: 8px;
+  text-align: left;
+  font-weight: 600;
+}
+td {
+  border: 1px solid #cbd5e1;
+  padding: 7px 8px;
+  vertical-align: top;
+}
+/* Zebra striping on even body rows. */
+tbody tr:nth-child(even) {
+  background: #f8fafc;
+}
+/* Warning-level backgrounds; !important makes them win over the zebra striping. */
+.warn-green { background: #ecfdf5 !important; }
+.warn-yellow { background: #fffbeb !important; }
+.warn-red { background: #fef2f2 !important; }
+/* Appendix text is smaller and muted. */
+.appendix {
+  font-size: 9pt;
+  color: #64748b;
+}
+/* Inline code in a monospace face with a teal colour. */
+code {
+  font-family: Consolas, monospace;
+  font-size: 9pt;
+  color: #0f766e;
+}

aarrr_agent/attachment_relevance.py ADDED Viewed

@@ -0,0 +1,206 @@
+"""附件领域相关性检测与评分门控。"""
+from __future__ import annotations
+import re
+from typing import Any
+from aarrr_agent.schemas import GradingResult
+# Growth / AARRR domain terms the task expects to find in the attachment.
+# They are matched against the attachment body only. Each entry must appear
+# once: every entry found counts as one hit toward the relevance threshold,
+# so a repeated entry would count a single phrase twice.
+# NOTE(review): "留存" is a substring of "留存率", so a text containing the
+# longer term scores both entries; confirm that weighting is intended.
+_DOMAIN_KEYWORDS = (
+    "社交电商",
+    "AARRR",
+    "用户增长",
+    "获客",
+    "激活",
+    "留存",
+    "变现",
+    "传播",
+    "北极星",
+    "GMV",
+    "留存率",
+    "病毒系数",
+    "终身价值",
+    "获客成本",
+    "裂变",
+    "分享推荐",
+    "拼团",
+)
+# Signals that clearly fall outside the task domain; finding them in the
+# attachment marks it as an off-topic source document.
+# assess_attachment_domain matches these as case-sensitive substrings, which
+# is presumably why both "DNS" and "dns" are listed.
+# NOTE(review): some entries contain others ("中继" / "中继服务器",
+# "机器人" / "智能机器人", "指导书" / "训练指导书"), so the longer phrase
+# scores two hits; confirm that weighting is intended.
+_OFF_DOMAIN_SIGNALS = (
+    "樱桃",
+    "栽培",
+    "果树",
+    "DNS",
+    "dns",
+    "中继",
+    "中继服务器",
+    "RCODE",
+    "dig ",
+    "FORMERR",
+    "select()",
+    "dnsrelay",
+    "dnsperf",
+    "计算机组成",
+    "实验报告",
+    "电路",
+    "汇编",
+    "农作物",
+    "病虫害",
+    "土壤",
+    "G网",
+    "路由表",
+    "域名解析",
+    "机器人",
+    "智能机器人",
+    "智能车",
+    "实践训练",
+    "训练指导书",
+    "指导书",
+    "嵌入式",
+    "单片机",
+    "STM32",
+)
+# Typical hallucinated wording that shows up when a report force-fits an
+# off-topic attachment onto the task. Matched case-insensitively (re.I)
+# against the report text by detect_forced_analogy_report.
+_FORCED_ANALOGY = re.compile(
+    r"DNS|select\s*\(|RCODE|dnsrelay|dnsperf|上游中继|本地解析|主循环|client_fd|upstream",
+    re.I,
+)
+def assess_attachment_domain(attachment_text: str, query_text: str = "") -> dict[str, Any]:
+    """
+    Decide whether the attachment belongs to the domain the task requires.
+
+    Only the attachment body is inspected. Query keywords are deliberately
+    not mixed in, so that e.g. a DNS attachment is not judged relevant just
+    because the query mentions growth terms.
+
+    Args:
+        attachment_text: Extracted attachment text; only the first 120000
+            characters are scanned. ``None`` or empty is treated as no text.
+        query_text: Accepted for interface compatibility; not used in the
+            decision.
+
+    Returns:
+        A dict with ``relevant`` (bool), ``domain_hits`` and
+        ``off_domain_hits`` (matched keywords in declaration order), and the
+        two counts ``domain_hit_count`` / ``off_domain_hit_count``.
+    """
+    body = (attachment_text or "")[:120000]
+    # dict.fromkeys drops repeated entries while keeping order, so a keyword
+    # listed twice in a constant cannot count as two hits.
+    domain_hits = [kw for kw in dict.fromkeys(_DOMAIN_KEYWORDS) if kw in body]
+    off_hits = [kw for kw in dict.fromkeys(_OFF_DOMAIN_SIGNALS) if kw in body]
+    # Relevant means at least 3 domain terms and more domain terms than
+    # off-domain signals.
+    relevant = len(domain_hits) >= 3 and len(domain_hits) > len(off_hits)
+    # query_text is informational only and takes no part in the decision.
+    _ = query_text
+    return {
+        "relevant": relevant,
+        "domain_hits": domain_hits,
+        "off_domain_hits": off_hits,
+        "domain_hit_count": len(domain_hits),
+        "off_domain_hit_count": len(off_hits),
+    }
+def format_e007_user_message(
+    assessment: dict[str, Any] | None = None,
+    *,
+    filename: str = "",
+) -> str:
+    """Build the user-facing E007 explanation.
+
+    The message frames the mismatch as an expected interception rather than
+    a system fault. It always contains the opening and closing sentences; the
+    file name and up to eight off-domain signals are inserted between them
+    when available. Segments are joined with single spaces.
+    """
+    signals = assessment.get("off_domain_hits", []) if assessment else []
+    shown = ", ".join(signals[:8])
+    intro = "附件与 query 要求的「社交电商 / AARRR 用户增长策略」领域不一致，无法据此生成指标方案。"
+    outro = "可继续运行完整评审；Phase 2 会对离题附件自动压低得分。"
+    file_note = [f"当前文件：{filename}"] if filename else []
+    signal_note = [f"离题信号：{shown}"] if shown else []
+    return " ".join([intro, *file_note, *signal_note, outro])
+def preflight_attachment_pdf(pdf_path: str) -> dict[str, Any]:
+    """Assess the attachment's domain before Phase 1 runs.
+
+    Reads the PDF text with ``read_pdf`` and returns the result of
+    ``assess_attachment_domain`` for it, so an off-topic PDF can be spotted
+    before API calls are spent on it.
+    """
+    from aarrr_agent.tools import read_pdf
+    attachment_text = read_pdf(pdf_path)
+    return assess_attachment_domain(attachment_text)
+def detect_forced_analogy_report(report_text: str, attachment_text: str) -> bool:
+    """Tell whether the report force-fits an off-topic attachment.
+
+    Returns ``True`` only when the attachment is assessed as not relevant and
+    the report text matches the forced-analogy pattern (for example DNS lab
+    vocabulary presented as growth metrics).
+    """
+    on_topic = assess_attachment_domain(attachment_text)["relevant"]
+    return (not on_topic) and _FORCED_ANALOGY.search(report_text) is not None
+def h15_failed(result: GradingResult) -> bool:
+    """Tell whether H15 (key facts are traceable) was graded as failed.
+
+    Only the first hard constraint with id ``"H15"`` is considered; the
+    answer is ``False`` when no such constraint exists.
+    """
+    for constraint in result.hard_constraints:
+        if constraint.id == "H15":
+            return constraint.score == 0
+    return False
+def should_enforce_attachment_gate(
+    result: GradingResult,
+    assessment: dict[str, Any],
+    *,
+    forced_analogy: bool,
+) -> bool:
+    """Decide whether the programmatic score gate must fire.
+
+    The gate fires on a forced analogy, on an attachment assessed as not
+    relevant, or when H15 was not passed. The checks run in that order and
+    stop at the first one that fires.
+    """
+    if forced_analogy or not assessment["relevant"]:
+        return True
+    return h15_failed(result)
+def enforce_attachment_gate(
+    result: GradingResult,
+    rubrics: dict[str, Any],
+    attachment_text: str,
+    query_text: str = "",
+    report_text: str = "",
+) -> tuple[GradingResult, dict[str, Any]]:
+    """
+    Force scores down when the attachment does not match the task domain.
+
+    When the gate fires, every hard constraint named by the rubric except
+    H01 (PDF format) is set to 0, and all soft and optional constraints are
+    set to 0. Each affected item gets a ``missing`` tag and a ``reason``
+    prefixed with the gate explanation. ``result`` is modified in place.
+
+    Args:
+        result: Grading result to adjust.
+        rubrics: Rubric document; ``rubrics["rubric"]["hard_constraints"]``
+            determines which ``Hnn`` ids are considered.
+        attachment_text: Extracted attachment text.
+        query_text: Task query text, forwarded to the domain assessment.
+        report_text: Generated report text, checked for forced analogies.
+
+    Returns:
+        ``(result, assessment)`` where ``assessment`` is the domain
+        assessment of the attachment.
+    """
+    assessment = assess_attachment_domain(attachment_text, query_text)
+    forced_analogy = detect_forced_analogy_report(report_text, attachment_text)
+    if not should_enforce_attachment_gate(result, assessment, forced_analogy=forced_analogy):
+        return result, assessment
+    rubric = rubrics["rubric"]
+    if not assessment["relevant"] or forced_analogy:
+        gate_reason = (
+            f"程序门控：附件与社交电商/AARRR 增长领域不匹配"
+            f"（附件领域词 {assessment['domain_hit_count']} 个，"
+            f"离题信号 {assessment['off_domain_hit_count']} 个："
+            f"{', '.join(assessment['off_domain_hits'][:6]) or '无'}）。"
+        )
+        if forced_analogy:
+            gate_reason += " 报告将离题附件（如 DNS 实验）强行类比为增长指标，事实不可追溯。"
+    else:
+        gate_reason = (
+            "程序门控：H15 未通过（关键事实无法追溯到附件），"
+            "硬约束 H02-H15 与全部软/可选项归零。"
+        )
+    # Index graded hard constraints by id, keeping the first occurrence.
+    graded_by_id: dict[str, Any] = {}
+    for constraint in result.hard_constraints:
+        graded_by_id.setdefault(constraint.id, constraint)
+    for i, _item in enumerate(rubric["hard_constraints"], 1):
+        cid = f"H{i:02d}"
+        # Only the PDF-format hard constraint may keep its score.
+        if cid == "H01":
+            continue
+        hc = graded_by_id.get(cid)
+        if hc is None:
+            # The grader returned no entry for this rubric item, so there is
+            # nothing to zero; skip it rather than abort the whole gate.
+            continue
+        hc.score = 0
+        hc.missing = list(dict.fromkeys([*hc.missing, "附件领域不匹配或事实伪造"]))
+        hc.reason = f"{gate_reason} 原评审：{hc.reason}"
+    for sc in result.soft_constraints:
+        sc.score = 0
+        sc.missing = list(dict.fromkeys([*sc.missing, "离题附件不支持软约束给分"]))
+        sc.reason = f"{gate_reason} 原评审：{sc.reason}"
+    for oc in result.optional_constraints:
+        oc.score = 0
+        oc.missing = list(dict.fromkeys([*oc.missing, "离题附件"]))
+        oc.reason = f"{gate_reason} 原评审：{oc.reason}"
+    return result, assessment