npm - astron-eval - Versions diffs - 0.0.1 - Mend

astron-eval 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (96) hide show

package/skills/model-evaluation/scripts/eval_task.py ADDED Viewed

@@ -0,0 +1,324 @@
+#!/usr/bin/env python3
+"""评测任务管理：提交任务、查询状态、轮询结果"""
+import argparse
+import json
+import time
+import requests
+from pathlib import Path
+from utils import (
+    TERMINAL_STATES,
+    handle_cli_error,
+)
+from files import (
+    load_json,
+    save_json,
+    load_config_kv,
+)
+from clients import (
+    ApiClient,
+    TokenManager,
+)
+from eval_dimension import update_config, check_config
+# ============================================================================
+# 提交任务
+# ============================================================================
+def cmd_submit(args):
+    """提交评测任务"""
+    # 验证文件存在
+    for f, desc in [(args.eval_set, "评测集"), (args.eval_dimension, "评测维度"), (args.eval_judge, "评委配置")]:
+        if not Path(f).exists():
+            raise FileNotFoundError(f"{desc}文件不存在: {f}")
+    # 步骤1：自动填充 judge_id
+    update_result = update_config(args.eval_dimension, args.eval_judge, None)
+    if not update_result.get("success"):
+        raise ValueError(f"填充judge_id失败: {update_result.get('errors')}")
+    # 步骤2：校验维度配置
+    check_result = check_config(args.eval_dimension)
+    if not check_result.get("success"):
+        errors = check_result.get("errors", [])
+        raise ValueError(f"维度配置校验失败({len(errors)}个错误): {errors}")
+    # 步骤3：提交任务
+    config_result = load_config_kv(args.config)
+    if not config_result.get("success"):
+        raise ValueError(f"配置文件加载失败: {config_result.get('message')}")
+    config = config_result.get("data", {})
+    # 使用 TokenManager 和 ApiClient
+    token_manager = TokenManager(args.auth)
+    client = ApiClient(token_manager, config.get('base_url', 'http://127.0.0.1:8080'))
+    # 构建请求
+    evalset_result = load_json(args.eval_set)
+    if not evalset_result.get("success"):
+        raise ValueError(f"评测集文件加载失败: {evalset_result.get('message')}")
+    evalset_id = evalset_result.get("data", {}).get('dataset')
+    dimensions_result = load_json(args.eval_dimension)
+    if not dimensions_result.get("success"):
+        raise ValueError(f"维度配置加载失败: {dimensions_result.get('message')}")
+    dimensions = dimensions_result.get("data", {})
+    judges_result = load_json(args.eval_judge)
+    if not judges_result.get("success"):
+        raise ValueError(f"评委配置加载失败: {judges_result.get('message')}")
+    judges = judges_result.get("data", {})
+    payload = {
+        "apiVersion": "v1",
+        "models": [judges] if judges else [],
+        "agents": [],
+        "spec": {
+            "templates": [{
+                "name": "模型评测",
+                "type": "evaluation",
+                "parameters": {"evalset": evalset_id, "eval": dimensions.get("evals")}
+            }]
+        }
+    }
+    task_data = client.post("/open/api/v1/eval/tasks", json=payload)
+    save_json(args.output, {"task_id": task_data.get('id'), "evalset_id": evalset_id})
+    return {"task_id": task_data.get('id'), "status": task_data.get('status')}
+# ============================================================================
+# 查询状态
+# ============================================================================
+def check_status(task_id: str, client: ApiClient, output_file: str) -> dict:
+    """查询单次任务状态"""
+    task_data = client.get(f"/open/api/v1/eval/tasks/{task_id}")
+    status = task_data.get('status')
+    result = {"task_id": task_id, "status": status}
+    # 成功时下载报告
+    if status == 'Succeeded':
+        artifacts = {a['type']: a['url'] for a in task_data.get('artifacts', [])}
+        report_url = artifacts.get('report_file')
+        if report_url:
+            resp = requests.get(report_url)
+            resp.raise_for_status()
+            save_json(output_file, resp.json())
+        result["platform_url"] = artifacts.get('platform_page')
+        result["report_file"] = output_file if report_url else None
+    return result
+def cmd_status(args):
+    """查询任务状态"""
+    config_result = load_config_kv(args.config)
+    if not config_result.get("success"):
+        raise ValueError(f"配置文件加载失败: {config_result.get('message')}")
+    config = config_result.get("data", {})
+    # 使用 TokenManager 和 ApiClient
+    token_manager = TokenManager(args.auth)
+    client = ApiClient(token_manager, config.get('base_url', 'http://127.0.0.1:8080'))
+    evaltask_result = load_json(args.evaltask)
+    if not evaltask_result.get("success"):
+        raise ValueError(f"任务元信息加载失败: {evaltask_result.get('message')}")
+    task_id = evaltask_result.get("data", {}).get('task_id')
+    if not task_id:
+        raise ValueError("评测任务元信息文件中未找到task_id")
+    # 轮询模式
+    if args.poll:
+        start = time.time()
+        while True:
+            elapsed = time.time() - start
+            if elapsed > args.timeout:
+                return {"task_id": task_id, "status": "Timeout", "error": f"轮询超时（{args.timeout}秒）"}
+            result_obj = check_status(task_id, client, args.output)
+            if result_obj["status"] in TERMINAL_STATES:
+                return result_obj
+            print(json.dumps({"task_id": task_id, "status": result_obj["status"], "elapsed": int(elapsed),
+                              "message": f"任务执行中，{args.interval}秒后重试..."}, ensure_ascii=False), flush=True)
+            time.sleep(args.interval)
+    return check_status(task_id, client, args.output)
+# ============================================================================
+# 结果摘要
+# ============================================================================
+def extract_text_from_content(content: list) -> str:
+    """递归提取 content 中的文本"""
+    texts = []
+    for item in content:
+        if item.get('type') == 'paragraph' and item.get('text'):
+            texts.append(item['text'])
+        elif item.get('type') in ('section',) and item.get('content'):
+            texts.extend(extract_text_from_content(item['content']))
+    return '\n'.join(texts)
+def find_section_by_title(content: list, title: str) -> dict:
+    """根据标题查找 section"""
+    for item in content:
+        if item.get('type') == 'section':
+            if item.get('title') == title:
+                return item
+            if item.get('content'):
+                result = find_section_by_title(item['content'], title)
+                if result:
+                    return result
+    return None
+def find_table_by_title(content: list, title: str) -> list:
+    """根据标题查找表格数据"""
+    for item in content:
+        if item.get('type') == 'table' and title in item.get('title', ''):
+            return item.get('dataset', {}).get('source', [])
+        if item.get('type') == 'section' and item.get('content'):
+            result = find_table_by_title(item['content'], title)
+            if result:
+                return result
+    return []
+def cmd_summary(args):
+    """生成评测结果摘要"""
+    result_file = Path(args.result)
+    if not result_file.exists():
+        raise FileNotFoundError(f"评测结果文件不存在: {result_file}")
+    load_result = load_json(args.result)
+    if not load_result.get("success"):
+        raise ValueError(f"评测结果加载失败: {load_result.get('message')}")
+    data = load_result.get("data", {})
+    output = []
+    # 1. 综合得分 (从顶层 metric.aggregations 中提取)
+    aggregations = data.get('metric', {}).get('aggregations', [])
+    if aggregations:
+        output.append("## 综合得分")
+        output.append("| 模型 | 分类 | 综合得分 |")
+        output.append("|------|------|----------|")
+        for agg in aggregations:
+            if agg.get('name') == '综合得分':
+                for group in agg.get('groups', []):
+                    model = category = ""
+                    for g in group.get('group', []):
+                        if g.get('g') == 'model':
+                            model = g.get('v', '')
+                        elif g.get('g') == 'category':
+                            category = g.get('v', '')
+                    score = group.get('payload', {}).get('average', 0)
+                    output.append(f"| {model} | {category} | {score:.2f} |")
+        output.append("")
+    # 2. 各维度表现
+    summary = data.get('summary', {})
+    content = summary.get('content', [])
+    # 查找综合得分表格
+    score_table = find_table_by_title(content, '综合得分')
+    if score_table and len(score_table) > 1:
+        output.append("## 各维度得分")
+        headers = score_table[0]
+        rows = score_table[1:]
+        output.append("| " + " | ".join(headers[:5]) + " |")
+        output.append("| " + " | ".join(["---"] * min(5, len(headers))) + " |")
+        for row in rows[:10]:  # 限制显示前10行
+            output.append("| " + " | ".join(str(v) if isinstance(v, (int, float)) else v for v in row[:5]) + " |")
+        output.append("")
+    # 查找良好率表格
+    good_rate_table = find_table_by_title(content, '良好率')
+    if good_rate_table and len(good_rate_table) > 1:
+        output.append("## 良好率")
+        headers = good_rate_table[0]
+        rows = good_rate_table[1:]
+        output.append("| " + " | ".join(headers) + " |")
+        output.append("| " + " | ".join(["---"] * len(headers)) + " |")
+        for row in rows[:5]:  # 限制显示前5行
+            output.append("| " + " | ".join(f"{v:.1f}" if isinstance(v, float) else str(v) for v in row) + " |")
+        output.append("")
+    # 3. 改进建议
+    suggestion_section = find_section_by_title(content, '2.3 改进建议')
+    if suggestion_section:
+        suggestion_text = extract_text_from_content(suggestion_section.get('content', []))
+        if suggestion_text:
+            output.append("## 改进建议")
+            output.append(suggestion_text)
+            output.append("")
+    # 4. 在线报告链接
+    if args.platform_url:
+        output.append("## 在线报告")
+        output.append(args.platform_url)
+    return {"summary": "\n".join(output)}
+# ============================================================================
+# CLI 入口
+# ============================================================================
+def main():
+    parser = argparse.ArgumentParser(description='评测任务管理')
+    subparsers = parser.add_subparsers(dest='command', help='子命令')
+    # submit
+    p = subparsers.add_parser('submit', help='提交评测任务')
+    p.add_argument('--config', required=True, help='服务配置文件')
+    p.add_argument('--auth', required=True, help='鉴权信息文件')
+    p.add_argument('--eval_set', required=True, help='评测集标识文件')
+    p.add_argument('--eval_dimension', required=True, help='评测维度配置文件')
+    p.add_argument('--eval_judge', required=True, help='评委配置文件')
+    p.add_argument('--output', required=True, help='评测任务元信息输出文件')
+    p.set_defaults(func=cmd_submit)
+    # status
+    p = subparsers.add_parser('status', help='查询任务状态')
+    p.add_argument('--config', required=True, help='服务配置文件')
+    p.add_argument('--auth', required=True, help='鉴权信息文件')
+    p.add_argument('--evaltask', required=True, help='评测任务元信息文件')
+    p.add_argument('--output', required=True, help='评测报告输出路径')
+    p.add_argument('--poll', action='store_true', help='启用自动轮询模式')
+    p.add_argument('--interval', type=int, default=30, help='轮询间隔秒数')
+    p.add_argument('--timeout', type=int, default=3600, help='轮询超时秒数')
+    p.set_defaults(func=cmd_status)
+    # summary
+    p = subparsers.add_parser('summary', help='生成评测结果摘要')
+    p.add_argument('--result', required=True, help='评测结果文件(evaltask-result.json)')
+    p.add_argument('--platform_url', default='', help='在线报告链接(可选)')
+    p.set_defaults(func=cmd_summary)
+    args = parser.parse_args()
+    # Python 3.6 兼容：手动检查子命令
+    if args.command is None:
+        parser.error("请指定子命令: submit, status, summary")
+    try:
+        result_obj = args.func(args)
+        print(json.dumps(result_obj, ensure_ascii=False))
+    except Exception as e:
+        handle_cli_error(e)
+if __name__ == '__main__':
+    main()

package/skills/model-evaluation/scripts/files/__init__.py ADDED Viewed

@@ -0,0 +1,38 @@
+# -*- coding: utf-8 -*-
+"""
+File handling package.
+Re-exports the file helpers of its submodules:
+- load_json/save_json: read and write JSON files
+- load_config_yaml/load_config_kv: load configuration files
+- load_data: load data files in several formats
+- load_jsonl_stream/load_csv_stream: streaming readers
+- extract_fields/suggest_mapping: field-mapping helpers
+"""
+from .file_utils import (
+    load_json,
+    save_json,
+    load_config_yaml,
+    load_config_kv,
+    load_data,
+    extract_fields,
+    suggest_mapping,
+)
+from .streaming import (
+    load_jsonl_stream,
+    load_csv_stream,
+)
+# Public API of the package: exactly the names imported above.
+__all__ = [
+    'load_json',
+    'save_json',
+    'load_config_yaml',
+    'load_config_kv',
+    'load_data',
+    'extract_fields',
+    'suggest_mapping',
+    'load_jsonl_stream',
+    'load_csv_stream',
+]

package/skills/model-evaluation/scripts/files/file_utils.py ADDED Viewed

@@ -0,0 +1,330 @@
+# -*- coding: utf-8 -*-
+"""
+文件工具函数模块
+统一 JSON/YAML/配置文件的读写操作
+"""
+import json
+import csv
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Union
+from utils.constants import (
+    ERR_FILE_NOT_FOUND,
+    ERR_FILE_ENCODING,
+    ERR_FILE_PARSE,
+    ERR_CONFIG_INVALID,
+    REQUIRED_FIELDS,
+    OPTIONAL_FIELDS,
+    FIELD_PATTERNS,
+    CASE_ID_EXACT_MATCH,
+)
+from utils.errors import (
+    result,
+    FileEncodingError,
+    FileParseError,
+    ConfigError,
+)
+# ============================================================================
+# JSON 文件操作
+# ============================================================================
+def load_json(path: str, encoding: str = "utf-8") -> Dict[str, Any]:
+    """
+    加载 JSON 文件
+    Args:
+        path: 文件路径
+        encoding: 文件编码（默认 utf-8）
+    Returns:
+        成功: {"success": True, "data": {...}, ...}
+        失败: {"success": False, "code": ..., "message": ...}
+    注意: D-03 - 文件不存在或无效时返回错误字典，而非 None
+    """
+    p = Path(path)
+    if not p.exists():
+        return result("load", "not_found", f"文件不存在: {path}", code=ERR_FILE_NOT_FOUND)
+    try:
+        content = p.read_text(encoding=encoding)
+        data = json.loads(content)
+        return result("load", "loaded", f"成功加载: {path}", data=data)
+    except UnicodeDecodeError:
+        return result("load", "encoding_error",
+                     f"无法使用 {encoding} 编码读取文件: {path}",
+                     code=ERR_FILE_ENCODING)
+    except json.JSONDecodeError as e:
+        return result("load", "parse_error",
+                     f"JSON 解析失败: {path} - {e}",
+                     code=ERR_FILE_PARSE)
+def save_json(path: str, data: Any, encoding: str = "utf-8") -> Dict[str, Any]:
+    """
+    保存数据到 JSON 文件
+    Args:
+        path: 文件路径
+        data: 要保存的数据
+        encoding: 文件编码（默认 utf-8）
+    Returns:
+        {"success": True, "message": "保存成功", "path": ...}
+    """
+    p = Path(path)
+    try:
+        p.parent.mkdir(parents=True, exist_ok=True)
+        content = json.dumps(data, indent=2, ensure_ascii=False)
+        p.write_text(content, encoding=encoding)
+        return result("save", "saved", f"保存成功: {path}", data={"path": str(p)})
+    except Exception as e:
+        return result("save", "error", f"保存失败: {e}", success=False)
+# ============================================================================
+# 配置文件操作
+# ============================================================================
+def _parse_simple_yaml(content: str) -> Dict[str, Any]:
+    """
+    简单 YAML 解析器（仅支持 key: value 格式）
+    用于解析项目配置文件，避免依赖 pyyaml 库。
+    支持格式：
+    - key: value
+    - key: "quoted value"
+    - key: 'quoted value'
+    - # 注释行
+    - 空行
+    """
+    data = {}
+    for line in content.split('\n'):
+        line = line.strip()
+        # 跳过空行和注释
+        if not line or line.startswith('#'):
+            continue
+        # 解析 key: value
+        if ':' in line:
+            key, value = line.split(':', 1)
+            key = key.strip()
+            value = value.strip()
+            # 解析值类型
+            if not value:
+                data[key] = None
+            elif value.startswith('"') and value.endswith('"'):
+                data[key] = value[1:-1]
+            elif value.startswith("'") and value.endswith("'"):
+                data[key] = value[1:-1]
+            elif value.lower() == 'true':
+                data[key] = True
+            elif value.lower() == 'false':
+                data[key] = False
+            elif value.lower() == 'null':
+                data[key] = None
+            else:
+                # 尝试数字，否则作为字符串
+                try:
+                    if '.' in value:
+                        data[key] = float(value)
+                    else:
+                        data[key] = int(value)
+                except ValueError:
+                    data[key] = value
+    return data
+def load_config_yaml(path: str, encoding: str = "utf-8") -> Dict[str, Any]:
+    """
+    加载 YAML 配置文件
+    用于加载 YAML 格式的配置文件（如 eval-auth.cfg 可能是 YAML）。
+    使用内置解析器，无需 pyyaml 依赖。
+    """
+    p = Path(path)
+    if not p.exists():
+        return result("load_config", "not_found", f"配置文件不存在: {path}",
+                     code=ERR_FILE_NOT_FOUND)
+    try:
+        content = p.read_text(encoding=encoding)
+        data = _parse_simple_yaml(content)
+        if data is None:
+            data = {}
+        return result("load_config", "loaded", f"配置加载成功: {path}", data=data)
+    except Exception as e:
+        return result("load_config", "parse_error", f"配置解析失败: {e}",
+                     code=ERR_CONFIG_INVALID)
+    except UnicodeDecodeError:
+        return result("load_config", "encoding_error",
+                     f"无法使用 {encoding} 编码读取文件: {path}",
+                     code=ERR_FILE_ENCODING)
+def load_config_kv(path: str, encoding: str = "utf-8") -> Dict[str, Any]:
+    """
+    加载 key:value 格式的配置文件
+    用于加载服务配置文件（如 eval-server.cfg）。
+    格式为每行一个 key: value 对。
+    """
+    p = Path(path)
+    if not p.exists():
+        return result("load_config", "not_found", f"配置文件不存在: {path}",
+                     code=ERR_FILE_NOT_FOUND)
+    try:
+        config = {}
+        for line in p.read_text(encoding=encoding).splitlines():
+            if ':' in line:
+                key, value = line.split(':', 1)
+                config[key.strip()] = value.strip().strip('"')
+        return result("load_config", "loaded", f"配置加载成功: {path}", data=config)
+    except UnicodeDecodeError:
+        return result("load_config", "encoding_error",
+                     f"无法使用 {encoding} 编码读取文件: {path}",
+                     code=ERR_FILE_ENCODING)
+# ============================================================================
+# 数据文件操作
+# ============================================================================
+def load_data(path: str, encoding: str = "utf-8") -> Dict[str, Any]:
+    """
+    根据文件类型加载数据
+    支持: .json, .jsonl, .csv, .xlsx, .xls
+    Returns:
+        {"success": True, "data": [...], "format": "jsonl", ...}
+    """
+    p = Path(path)
+    if not p.exists():
+        return result("load_data", "not_found", f"数据文件不存在: {path}",
+                     code=ERR_FILE_NOT_FOUND)
+    suffix = p.suffix.lower()
+    try:
+        if suffix == '.jsonl':
+            lines = p.read_text(encoding=encoding).splitlines()
+            data = [json.loads(line) for line in lines if line.strip()]
+            return result("load_data", "loaded", f"成功加载 {len(data)} 条记录",
+                        data={"items": data, "format": "jsonl", "total": len(data)})
+        if suffix == '.json':
+            content = p.read_text(encoding=encoding)
+            data = json.loads(content)
+            if not isinstance(data, list):
+                data = [data]
+            return result("load_data", "loaded", f"成功加载 {len(data)} 条记录",
+                        data={"items": data, "format": "json", "total": len(data)})
+        if suffix == '.csv':
+            import csv
+            with open(path, 'r', encoding=encoding) as f:
+                reader = csv.DictReader(f)
+                data = list(reader)
+            return result("load_data", "loaded", f"成功加载 {len(data)} 条记录",
+                        data={"items": data, "format": "csv", "total": len(data)})
+        if suffix in ('.xlsx', '.xls'):
+            try:
+                import pandas as pd
+                data = pd.read_excel(path).to_dict('records')
+                return result("load_data", "loaded", f"成功加载 {len(data)} 条记录",
+                            data={"items": data, "format": "xlsx", "total": len(data)})
+            except ImportError:
+                return result("load_data", "error", "处理 Excel 文件需要安装 pandas",
+                             success=False)
+        return result("load_data", "error", f"不支持的文件格式: {suffix}", success=False)
+    except json.JSONDecodeError as e:
+        return result("load_data", "parse_error", f"JSON 解析失败: {e}",
+                     code=ERR_FILE_PARSE)
+    except UnicodeDecodeError:
+        return result("load_data", "encoding_error",
+                     f"无法使用 {encoding} 编码读取文件: {path}",
+                     code=ERR_FILE_ENCODING)
+# ============================================================================
+# 字段映射工具
+# ============================================================================
+def extract_fields(items: List[dict]) -> Dict[str, Any]:
+    """
+    从数据项提取字段信息
+    """
+    fields = {}
+    for item in items[:100]:
+        if not isinstance(item, dict):
+            continue
+        for key, value in item.items():
+            if key not in fields:
+                fields[key] = {"type": type(value).__name__}
+    return fields
+def suggest_mapping(fields: Dict) -> Dict[str, Dict]:
+    """
+    根据字段名建议映射
+    返回格式：
+    {
+        "question": {"source_field": "question", "default": null},
+        "answer": {"source_field": "answer", "default": null},
+        "model": {"source_field": null, "default": null},
+        "case_id": {"source_field": "id", "default": null}
+    }
+    匹配规则：
+    1. 精确匹配优先（字段名完全等于关键词）
+    2. 包含匹配次之（字段名包含关键词，按关键词长度降序优先）
+    特殊处理：
+    - 'id' 字段精确匹配到 case_id（避免 seq_id、user_id 误匹配）
+    """
+    mapping = {}
+    for field_name in fields:
+        field_lower = field_name.lower()
+        # 特殊处理：'id' 精确匹配到 case_id
+        if field_lower in CASE_ID_EXACT_MATCH and 'case_id' not in mapping:
+            mapping['case_id'] = {"source_field": field_name, "default": None}
+            continue
+        for target, keywords in FIELD_PATTERNS.items():
+            if target in mapping:
+                continue
+            # 1. 精确匹配
+            if field_lower in keywords:
+                mapping[target] = {"source_field": field_name, "default": None}
+                break
+            # 2. 包含匹配，按关键词长度降序优先匹配更精确的关键词
+            sorted_keywords = sorted([k for k in keywords if len(k) >= 3], key=len, reverse=True)
+            if any(k in field_lower for k in sorted_keywords):
+                mapping[target] = {"source_field": field_name, "default": None}
+                break
+    # 确保必填字段都有映射条目（即使 source_field 为 null）
+    for field in REQUIRED_FIELDS:
+        if field not in mapping:
+            mapping[field] = {"source_field": None, "default": None}
+    return mapping