vibeval 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vibeval-0.1.0/.gitignore +26 -0
- vibeval-0.1.0/CLAUDE.md +96 -0
- vibeval-0.1.0/LICENSE +21 -0
- vibeval-0.1.0/PKG-INFO +124 -0
- vibeval-0.1.0/README.md +105 -0
- vibeval-0.1.0/examples/meeting_app/.vibeval.yml +1 -0
- vibeval-0.1.0/examples/meeting_app/__init__.py +0 -0
- vibeval-0.1.0/examples/meeting_app/llm_client.py +24 -0
- vibeval-0.1.0/examples/meeting_app/summarizer.py +58 -0
- vibeval-0.1.0/examples/meeting_app/tests/vibeval/meeting_summary/datasets/meeting_summaries/manifest.yaml +31 -0
- vibeval-0.1.0/examples/meeting_app/tests/vibeval/meeting_summary/datasets/meeting_summaries/planning.json +8 -0
- vibeval-0.1.0/examples/meeting_app/tests/vibeval/meeting_summary/datasets/meeting_summaries/standup.json +8 -0
- vibeval-0.1.0/examples/meeting_app/tests/vibeval/meeting_summary/tests/__init__.py +0 -0
- vibeval-0.1.0/examples/meeting_app/tests/vibeval/meeting_summary/tests/conftest.py +110 -0
- vibeval-0.1.0/examples/meeting_app/tests/vibeval/meeting_summary/tests/pytest.ini +3 -0
- vibeval-0.1.0/examples/meeting_app/tests/vibeval/meeting_summary/tests/test_summarizer.py +112 -0
- vibeval-0.1.0/examples/meeting_app/transcript_api.py +25 -0
- vibeval-0.1.0/plugin/.claude-plugin/marketplace.json +13 -0
- vibeval-0.1.0/plugin/.claude-plugin/plugin.json +9 -0
- vibeval-0.1.0/plugin/commands/vibeval-analyze.md +138 -0
- vibeval-0.1.0/plugin/commands/vibeval-design.md +128 -0
- vibeval-0.1.0/plugin/commands/vibeval-generate.md +224 -0
- vibeval-0.1.0/plugin/commands/vibeval-run.md +102 -0
- vibeval-0.1.0/plugin/commands/vibeval-update.md +123 -0
- vibeval-0.1.0/plugin/skills/protocol/SKILL.md +47 -0
- vibeval-0.1.0/plugin/skills/protocol/references/00-philosophy.md +143 -0
- vibeval-0.1.0/plugin/skills/protocol/references/01-overview.md +48 -0
- vibeval-0.1.0/plugin/skills/protocol/references/02-dataset.md +126 -0
- vibeval-0.1.0/plugin/skills/protocol/references/03-judge-spec.md +180 -0
- vibeval-0.1.0/plugin/skills/protocol/references/04-result.md +124 -0
- vibeval-0.1.0/plugin/skills/protocol/references/05-comparison.md +55 -0
- vibeval-0.1.0/pyproject.toml +39 -0
- vibeval-0.1.0/src/vibeval/__init__.py +3 -0
- vibeval-0.1.0/src/vibeval/cli.py +287 -0
- vibeval-0.1.0/src/vibeval/compare.py +289 -0
- vibeval-0.1.0/src/vibeval/config.py +66 -0
- vibeval-0.1.0/src/vibeval/conversation.py +57 -0
- vibeval-0.1.0/src/vibeval/dataset.py +147 -0
- vibeval-0.1.0/src/vibeval/judge.py +113 -0
- vibeval-0.1.0/src/vibeval/llm.py +266 -0
- vibeval-0.1.0/src/vibeval/report.py +743 -0
- vibeval-0.1.0/src/vibeval/result.py +135 -0
- vibeval-0.1.0/src/vibeval/rules.py +254 -0
- vibeval-0.1.0/tests/test_compare.py +208 -0
- vibeval-0.1.0/tests/test_conversation.py +75 -0
- vibeval-0.1.0/tests/test_judge.py +184 -0
- vibeval-0.1.0/tests/test_report.py +218 -0
- vibeval-0.1.0/tests/test_result.py +90 -0
- vibeval-0.1.0/tests/test_rules.py +193 -0
- vibeval-0.1.0/tests/test_target.py +164 -0
- vibeval-0.1.0/uv.lock +225 -0
vibeval-0.1.0/.gitignore
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
# Python
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*.egg-info/
|
|
5
|
+
dist/
|
|
6
|
+
build/
|
|
7
|
+
|
|
8
|
+
# Virtual environment
|
|
9
|
+
.venv/
|
|
10
|
+
|
|
11
|
+
# pytest
|
|
12
|
+
.pytest_cache/
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
# vibeval results (generated at runtime)
|
|
16
|
+
.vibeval/
|
|
17
|
+
**/results/
|
|
18
|
+
**/comparisons/
|
|
19
|
+
|
|
20
|
+
# IDE
|
|
21
|
+
.vscode/
|
|
22
|
+
.idea/
|
|
23
|
+
*.swp
|
|
24
|
+
|
|
25
|
+
# OS
|
|
26
|
+
.DS_Store
|
vibeval-0.1.0/CLAUDE.md
ADDED
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
# vibeval 项目开发指南
|
|
2
|
+
|
|
3
|
+
## 项目概述
|
|
4
|
+
|
|
5
|
+
vibeval (Vibe Coding Eval) 是一个 AI 应用评测框架,由两部分组成:
|
|
6
|
+
- **vibeval CLI** (`src/vibeval/`) — 评测工具(judge、compare、simulate、report 等)
|
|
7
|
+
- **Claude Code Plugin** (`plugin/`) — VibeCoding 工作流(analyze、design、generate、run、update)
|
|
8
|
+
|
|
9
|
+
## 核心原则
|
|
10
|
+
|
|
11
|
+
### 1. 协议为本
|
|
12
|
+
|
|
13
|
+
`plugin/skills/protocol/references/` 下的协议文档是整个项目的核心设定(Source of Truth)。所有 plugin 命令、CLI 代码、文档都必须围绕和遵循协议。当任何地方的内容与协议矛盾时,以协议为准。编写文档或命令时,避免重复协议中已有的定义,改为引用协议文件。
|
|
14
|
+
|
|
15
|
+
协议文件:
|
|
16
|
+
- `00-philosophy.md` — 评测哲学(信息不对称 + 全局视野)
|
|
17
|
+
- `01-overview.md` — 目录结构、统一 turn 模型
|
|
18
|
+
- `02-dataset.md` — 数据集格式
|
|
19
|
+
- `03-judge-spec.md` — 评判规格(rule/llm、target、所有字段定义)
|
|
20
|
+
- `04-result.md` — 结果格式(trace turns/steps)
|
|
21
|
+
- `05-comparison.md` — 横评格式
|
|
22
|
+
|
|
23
|
+
### 2. 语言无关
|
|
24
|
+
|
|
25
|
+
所有设计必须避免语言耦合。vibeval CLI 提供通用的、可通过命令行直接执行的功能,不需要在代码层面做依赖。用户的测试代码不 import vibeval 包,而是通过 subprocess 调用 CLI(如 `vibeval simulate`),或直接按协议格式生成文件。CLI 既服务开发者,也服务 VibeCoding Agent。
|
|
26
|
+
|
|
27
|
+
### 3. CLI Help 是命令文档的唯一来源
|
|
28
|
+
|
|
29
|
+
CLI 的每个命令必须保持完整、最新的 `--help` 描述(包括用途、参数说明、使用示例)。Plugin 的命令文档和 SKILL.md 中不重复 CLI 的参数细节,而是提示通过 `vibeval --help` / `vibeval <command> --help` 查看。这样 CLI 和文档永远不会不同步。
|
|
30
|
+
|
|
31
|
+
### 4. 测试目录的区分
|
|
32
|
+
|
|
33
|
+
`tests/` 目录下是 vibeval CLI 工具和代码自身的单元测试、集成测试。`examples/` 下的项目(如 `meeting_app/`)是独立的示例应用,它们有自己的测试(在 `examples/meeting_app/tests/vibeval/` 下),演示的是用户使用 vibeval 的完整流程。两者不要混淆。
|
|
34
|
+
|
|
35
|
+
## 开发约定
|
|
36
|
+
|
|
37
|
+
### 运行测试
|
|
38
|
+
|
|
39
|
+
```bash
|
|
40
|
+
# vibeval 自身的测试
|
|
41
|
+
python -m pytest tests/ -v
|
|
42
|
+
|
|
43
|
+
# 示例应用的测试(独立运行)
|
|
44
|
+
cd examples/meeting_app/tests/vibeval/meeting_summary/tests
|
|
45
|
+
python -m pytest test_summarizer.py -v
|
|
46
|
+
|
|
47
|
+
# 示例应用的 judge
|
|
48
|
+
cd examples/meeting_app
|
|
49
|
+
vibeval judge meeting_summary latest
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
### 修改协议
|
|
53
|
+
|
|
54
|
+
修改协议文件后,检查以下是否需要同步更新:
|
|
55
|
+
- `src/vibeval/` 中的代码实现(rules.py、llm.py、judge.py、compare.py、result.py)
|
|
56
|
+
- `plugin/commands/` 中的命令文档(应引用协议而非重复内容)
|
|
57
|
+
- `plugin/skills/protocol/SKILL.md`(Quick Reference 摘要)
|
|
58
|
+
- CLI 的 `--help` 描述
|
|
59
|
+
|
|
60
|
+
### 修改 CLI
|
|
61
|
+
|
|
62
|
+
新增或修改 CLI 命令时:
|
|
63
|
+
- 在 `cli.py` 中提供完整的 `help` 和 `description` 文本
|
|
64
|
+
- 不需要同步更新 plugin 文档中的命令细节——plugin 文档应引导查看 `vibeval --help`
|
|
65
|
+
- 添加对应的测试到 `tests/`
|
|
66
|
+
|
|
67
|
+
### 版本管理
|
|
68
|
+
|
|
69
|
+
项目版本在三处维护,必须保持一致:
|
|
70
|
+
- `pyproject.toml` — `version` 字段
|
|
71
|
+
- `plugin/.claude-plugin/plugin.json` — `version` 字段
|
|
72
|
+
- `src/vibeval/__init__.py` — `__version__` 变量
|
|
73
|
+
|
|
74
|
+
发版或变更版本号时,三处同时更新。
|
|
75
|
+
|
|
76
|
+
### 修改 Plugin
|
|
77
|
+
|
|
78
|
+
修改 plugin 命令或 skill 时:
|
|
79
|
+
- 基础概念定义(数据格式、字段、规则名等)一律引用 `references/` 中的协议文件,不内联重复
|
|
80
|
+
- 操作指导(如何设计、如何生成)可以在命令文档中展开,但引用协议作为定义来源
|
|
81
|
+
- CLI 命令的参数和用法,引导使用 `vibeval --help` 查看
|
|
82
|
+
|
|
83
|
+
### 项目结构
|
|
84
|
+
|
|
85
|
+
```
|
|
86
|
+
vibeval/
|
|
87
|
+
├── src/vibeval/ # CLI 工具实现
|
|
88
|
+
├── plugin/ # Claude Code 插件
|
|
89
|
+
│ ├── commands/ # /vibeval-analyze, /vibeval-design, /vibeval-generate, /vibeval-run, /vibeval-update
|
|
90
|
+
│ └── skills/protocol/ # 数据协议(Source of Truth)
|
|
91
|
+
├── examples/ # 独立的示例应用
|
|
92
|
+
├── tests/ # vibeval 自身的测试
|
|
93
|
+
├── CLAUDE.md
|
|
94
|
+
├── README.md
|
|
95
|
+
└── pyproject.toml
|
|
96
|
+
```
|
vibeval-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 SandyKid
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
vibeval-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,124 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: vibeval
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: vibeval (Vibe Coding Eval) — AI application testing framework
|
|
5
|
+
Project-URL: Homepage, https://github.com/SandyKidYao/vibeval
|
|
6
|
+
Project-URL: Repository, https://github.com/SandyKidYao/vibeval
|
|
7
|
+
Project-URL: Issues, https://github.com/SandyKidYao/vibeval/issues
|
|
8
|
+
Author-email: SandyKid <yupengyao912@gmail.com>
|
|
9
|
+
License-Expression: MIT
|
|
10
|
+
License-File: LICENSE
|
|
11
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
12
|
+
Classifier: Operating System :: OS Independent
|
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Requires-Python: >=3.10
|
|
15
|
+
Requires-Dist: pyyaml>=6.0
|
|
16
|
+
Provides-Extra: dev
|
|
17
|
+
Requires-Dist: pytest>=8.0; extra == 'dev'
|
|
18
|
+
Description-Content-Type: text/markdown
|
|
19
|
+
|
|
20
|
+
# vibeval — Vibe Coding Eval
|
|
21
|
+
|
|
22
|
+
AI 应用的快速评测框架。只需安装 Claude Code,即可完成从代码分析、测试生成到评估的全闭环。
|
|
23
|
+
|
|
24
|
+
## 解决什么问题
|
|
25
|
+
|
|
26
|
+
传统软件测试框架无法评估 AI 输出的质量;传统 AI 评测平台依赖数据集构建,跟不上功能迭代速度。vibeval 在两者之间取得平衡:
|
|
27
|
+
|
|
28
|
+
- 通过 VibeCoding 分析你的代码,快速生成合成数据和测试用例
|
|
29
|
+
- 确定性规则 + LLM 语义评判,双重评估
|
|
30
|
+
- 跨版本横评,追踪质量变化
|
|
31
|
+
- 语言无关:生成的测试代码适配你的项目框架,不依赖 vibeval 包
|
|
32
|
+
|
|
33
|
+
## 前置要求
|
|
34
|
+
|
|
35
|
+
- [Claude Code](https://claude.ai/code)
|
|
36
|
+
- Python 3.10+
|
|
37
|
+
|
|
38
|
+
## 安装
|
|
39
|
+
|
|
40
|
+
```bash
|
|
41
|
+
# 安装 vibeval CLI
|
|
42
|
+
pip install vibeval
|
|
43
|
+
|
|
44
|
+
# 安装 Claude Code 插件(在 Claude Code 中执行)
|
|
45
|
+
/install-plugin /path/to/vibeval/plugin
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
## 使用方式
|
|
49
|
+
|
|
50
|
+
在 Claude Code 中执行,全程不需要离开对话:
|
|
51
|
+
|
|
52
|
+
```
|
|
53
|
+
/vibeval-analyze → /vibeval-design → /vibeval-generate → /vibeval-run
|
|
54
|
+
↑
|
|
55
|
+
代码变更 → /vibeval-update ──────┘
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
### 分析代码
|
|
59
|
+
|
|
60
|
+
```
|
|
61
|
+
/vibeval-analyze meeting_summary src/services/
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
分析源码,识别 AI 调用点、数据流、Mock 点,给出可评估性改进建议。
|
|
65
|
+
|
|
66
|
+
### 设计测试
|
|
67
|
+
|
|
68
|
+
```
|
|
69
|
+
/vibeval-design meeting_summary
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
设计合成数据规格、评判标准、测试结构。每一步产出可编辑的中间文件,可以 review 和修改。
|
|
73
|
+
|
|
74
|
+
### 生成代码和数据
|
|
75
|
+
|
|
76
|
+
```
|
|
77
|
+
/vibeval-generate meeting_summary
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
生成完整的测试套件(合成数据 + 测试代码),使用你项目的测试框架(pytest/vitest/jest/...)。
|
|
81
|
+
|
|
82
|
+
### 运行 + 评估
|
|
83
|
+
|
|
84
|
+
```
|
|
85
|
+
/vibeval-run meeting_summary
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
一步完成:执行测试 → 评估 → 诊断分析 → 生成报告。
|
|
89
|
+
|
|
90
|
+
### 代码变更后更新
|
|
91
|
+
|
|
92
|
+
```
|
|
93
|
+
/vibeval-update meeting_summary
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
检测代码变更,增量更新测试套件和数据。
|
|
97
|
+
|
|
98
|
+
### 跨版本对比
|
|
99
|
+
|
|
100
|
+
```bash
|
|
101
|
+
# 统计对比
|
|
102
|
+
vibeval diff meeting_summary run_a run_b
|
|
103
|
+
|
|
104
|
+
# LLM 深度横评
|
|
105
|
+
vibeval compare meeting_summary run_a run_b
|
|
106
|
+
```
|
|
107
|
+
|
|
108
|
+
### 可视化报告
|
|
109
|
+
|
|
110
|
+
```bash
|
|
111
|
+
vibeval report meeting_summary latest --open
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
生成自包含的 HTML 报告,涵盖测试设计、数据、过程 trace 和评判结果。
|
|
115
|
+
|
|
116
|
+
### 更多命令
|
|
117
|
+
|
|
118
|
+
```bash
|
|
119
|
+
vibeval --help
|
|
120
|
+
```
|
|
121
|
+
|
|
122
|
+
## License
|
|
123
|
+
|
|
124
|
+
MIT
|
vibeval-0.1.0/README.md
ADDED
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
# vibeval — Vibe Coding Eval
|
|
2
|
+
|
|
3
|
+
AI 应用的快速评测框架。只需安装 Claude Code,即可完成从代码分析、测试生成到评估的全闭环。
|
|
4
|
+
|
|
5
|
+
## 解决什么问题
|
|
6
|
+
|
|
7
|
+
传统软件测试框架无法评估 AI 输出的质量;传统 AI 评测平台依赖数据集构建,跟不上功能迭代速度。vibeval 在两者之间取得平衡:
|
|
8
|
+
|
|
9
|
+
- 通过 VibeCoding 分析你的代码,快速生成合成数据和测试用例
|
|
10
|
+
- 确定性规则 + LLM 语义评判,双重评估
|
|
11
|
+
- 跨版本横评,追踪质量变化
|
|
12
|
+
- 语言无关:生成的测试代码适配你的项目框架,不依赖 vibeval 包
|
|
13
|
+
|
|
14
|
+
## 前置要求
|
|
15
|
+
|
|
16
|
+
- [Claude Code](https://claude.ai/code)
|
|
17
|
+
- Python 3.10+
|
|
18
|
+
|
|
19
|
+
## 安装
|
|
20
|
+
|
|
21
|
+
```bash
|
|
22
|
+
# 安装 vibeval CLI
|
|
23
|
+
pip install vibeval
|
|
24
|
+
|
|
25
|
+
# 安装 Claude Code 插件(在 Claude Code 中执行)
|
|
26
|
+
/install-plugin /path/to/vibeval/plugin
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
## 使用方式
|
|
30
|
+
|
|
31
|
+
在 Claude Code 中执行,全程不需要离开对话:
|
|
32
|
+
|
|
33
|
+
```
|
|
34
|
+
/vibeval-analyze → /vibeval-design → /vibeval-generate → /vibeval-run
|
|
35
|
+
↑
|
|
36
|
+
代码变更 → /vibeval-update ──────┘
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
### 分析代码
|
|
40
|
+
|
|
41
|
+
```
|
|
42
|
+
/vibeval-analyze meeting_summary src/services/
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
分析源码,识别 AI 调用点、数据流、Mock 点,给出可评估性改进建议。
|
|
46
|
+
|
|
47
|
+
### 设计测试
|
|
48
|
+
|
|
49
|
+
```
|
|
50
|
+
/vibeval-design meeting_summary
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
设计合成数据规格、评判标准、测试结构。每一步产出可编辑的中间文件,可以 review 和修改。
|
|
54
|
+
|
|
55
|
+
### 生成代码和数据
|
|
56
|
+
|
|
57
|
+
```
|
|
58
|
+
/vibeval-generate meeting_summary
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
生成完整的测试套件(合成数据 + 测试代码),使用你项目的测试框架(pytest/vitest/jest/...)。
|
|
62
|
+
|
|
63
|
+
### 运行 + 评估
|
|
64
|
+
|
|
65
|
+
```
|
|
66
|
+
/vibeval-run meeting_summary
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
一步完成:执行测试 → 评估 → 诊断分析 → 生成报告。
|
|
70
|
+
|
|
71
|
+
### 代码变更后更新
|
|
72
|
+
|
|
73
|
+
```
|
|
74
|
+
/vibeval-update meeting_summary
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
检测代码变更,增量更新测试套件和数据。
|
|
78
|
+
|
|
79
|
+
### 跨版本对比
|
|
80
|
+
|
|
81
|
+
```bash
|
|
82
|
+
# 统计对比
|
|
83
|
+
vibeval diff meeting_summary run_a run_b
|
|
84
|
+
|
|
85
|
+
# LLM 深度横评
|
|
86
|
+
vibeval compare meeting_summary run_a run_b
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
### 可视化报告
|
|
90
|
+
|
|
91
|
+
```bash
|
|
92
|
+
vibeval report meeting_summary latest --open
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
生成自包含的 HTML 报告,涵盖测试设计、数据、过程 trace 和评判结果。
|
|
96
|
+
|
|
97
|
+
### 更多命令
|
|
98
|
+
|
|
99
|
+
```bash
|
|
100
|
+
vibeval --help
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
## License
|
|
104
|
+
|
|
105
|
+
MIT
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
vibeval_root: tests/vibeval
|
|
File without changes
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
"""LLM client — wraps the actual LLM API call.
|
|
2
|
+
|
|
3
|
+
In production this calls OpenAI/Anthropic. In tests this gets mocked.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def chat(prompt: str, system: str = "") -> str:
    """Send a prompt to the LLM and return the response text.

    This demo client is intentionally left unconfigured: in production it
    would call a real provider SDK, and in tests it gets mocked out.
    """
    # A real implementation would look like:
    #   client = openai.OpenAI()
    #   resp = client.chat.completions.create(
    #       model="gpt-4o",
    #       messages=[
    #           {"role": "system", "content": system},
    #           {"role": "user", "content": prompt},
    #       ],
    #   )
    #   return resp.choices[0].message.content
    message = (
        "LLM client not configured. "
        "Set OPENAI_API_KEY or mock this in tests."
    )
    raise RuntimeError(message)
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
"""Meeting summarizer — the core AI pipeline.
|
|
2
|
+
|
|
3
|
+
Pipeline:
|
|
4
|
+
1. Fetch transcript from API
|
|
5
|
+
2. Call LLM to generate summary
|
|
6
|
+
3. Call LLM to extract action items
|
|
7
|
+
4. Return structured result
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
from typing import Any
|
|
13
|
+
|
|
14
|
+
from . import llm_client, transcript_api
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def summarize_meeting(meeting_id: str) -> dict[str, Any]:
    """Run the full meeting summarization pipeline.

    Fetches the transcript for *meeting_id*, asks the LLM for a summary,
    then asks it to extract action items from that summary, and returns a
    structured dict with both results plus basic metadata.
    """
    # Step 1: fetch the raw transcript for this meeting.
    transcript = transcript_api.fetch_transcript(meeting_id)
    speakers = transcript.get("speakers", [])
    text = transcript.get("text", "")

    # Step 2: summarize the transcript.
    summary = llm_client.chat(
        prompt=(
            "Summarize the following meeting transcript.\n"
            f"Speakers: {', '.join(speakers)}\n\n"
            f"Transcript:\n{text}\n\n"
            "Requirements:\n"
            "- Include ALL mentioned times, dates, and deadlines\n"
            "- Attribute key points to the correct speaker\n"
            "- Be concise but complete"
        ),
        system="You are a meeting summarizer. Be thorough and accurate.",
    )

    # Step 3: derive action items from the generated summary (not the raw
    # transcript), so the two LLM calls form a chain.
    action_items = llm_client.chat(
        prompt=(
            "Extract action items from this meeting summary:\n\n"
            f"{summary}\n\n"
            "Format each as: - [Owner] Action (deadline if mentioned)"
        ),
        system="You extract action items from meeting summaries.",
    )

    # Step 4: package the structured result.
    return {
        "meeting_id": meeting_id,
        "speakers": speakers,
        "summary": summary,
        "action_items": action_items,
        "transcript_length": len(text),
    }
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
name: meeting_summaries
|
|
2
|
+
description: "Synthetic meeting transcripts for testing the summarization pipeline"
|
|
3
|
+
version: "1"
|
|
4
|
+
tags:
|
|
5
|
+
- meeting
|
|
6
|
+
- summarization
|
|
7
|
+
|
|
8
|
+
judge_specs:
|
|
9
|
+
- method: rule
|
|
10
|
+
rule: contains_all
|
|
11
|
+
args:
|
|
12
|
+
field: "outputs.summary"
|
|
13
|
+
values_from: "expected_times"
|
|
14
|
+
weight: gate
|
|
15
|
+
|
|
16
|
+
- method: rule
|
|
17
|
+
rule: length_between
|
|
18
|
+
args:
|
|
19
|
+
field: "outputs.action_items"
|
|
20
|
+
min: 10
|
|
21
|
+
max: 5000
|
|
22
|
+
|
|
23
|
+
- method: rule
|
|
24
|
+
rule: max_steps
|
|
25
|
+
args:
|
|
26
|
+
max: 20
|
|
27
|
+
|
|
28
|
+
- method: rule
|
|
29
|
+
rule: max_turns
|
|
30
|
+
args:
|
|
31
|
+
max: 1
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
{
|
|
2
|
+
"_id": "planning",
|
|
3
|
+
"meeting_id": "mtg-plan-001",
|
|
4
|
+
"_tags": ["planning", "deadlines"],
|
|
5
|
+
"speakers": ["Diana", "Eve"],
|
|
6
|
+
"text": "Diana: Let's go over the project timeline. Phase 1 needs to be completed by June 30th. The kickoff was on January 15th.\n\nEve: Right. The mid-project review is scheduled for April 15th at 10:00 AM. We also have weekly syncs every Tuesday at 3:30 PM.\n\nDiana: The final demo is on July 5th. We need to submit the report by end of day July 10th.\n\nEve: Got it. I'll set up the sprint planning for next Monday at 9:30 AM.",
|
|
7
|
+
"expected_times": ["June 30th", "January 15th", "April 15th", "10:00 AM", "Tuesday", "3:30 PM", "July 5th", "July 10th", "Monday", "9:30 AM"]
|
|
8
|
+
}
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
{
|
|
2
|
+
"_id": "standup",
|
|
3
|
+
"meeting_id": "mtg-standup-001",
|
|
4
|
+
"_tags": ["standup", "multiple-times"],
|
|
5
|
+
"speakers": ["Alice", "Bob", "Charlie"],
|
|
6
|
+
"text": "Alice: Good morning everyone. It's 9:00 AM, let's start our standup. Bob, what did you work on yesterday?\n\nBob: Yesterday I finished the API refactoring that was due by March 15th. Today I'll work on the database migration, which needs to be done before the 3:00 PM deployment window.\n\nAlice: Great. Charlie?\n\nCharlie: I have a meeting with the client at 2:30 PM today to discuss the Q2 roadmap. The proposal deadline is April 1st. I'll also review Bob's PR before 11:00 AM.\n\nAlice: Perfect. Remember the all-hands is at 4:00 PM today. Let's wrap up by 9:15 AM.",
|
|
7
|
+
"expected_times": ["9:00 AM", "March 15th", "3:00 PM", "2:30 PM", "April 1st", "11:00 AM", "4:00 PM", "9:15 AM"]
|
|
8
|
+
}
|
|
File without changes
|
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
"""Pytest configuration for meeting_app vibeval tests.
|
|
2
|
+
|
|
3
|
+
[Generated by VibeCoding] This conftest:
|
|
4
|
+
1. Loads vibeval datasets
|
|
5
|
+
2. Provides a fixture that creates tracked mock wrappers
|
|
6
|
+
3. Saves vibeval-format results after each test
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import json
|
|
10
|
+
import time
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
from typing import Any
|
|
13
|
+
from unittest.mock import patch
|
|
14
|
+
|
|
15
|
+
import pytest
|
|
16
|
+
import yaml
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
# --- vibeval dataset loading ---
|
|
20
|
+
|
|
21
|
+
FEATURE_DIR = Path(__file__).parent.parent # tests/vibeval/meeting_summary/
|
|
22
|
+
DATASETS_DIR = FEATURE_DIR / "datasets"
|
|
23
|
+
RESULTS_DIR = FEATURE_DIR / "results"
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def load_dataset(name: str) -> dict[str, Any]:
    """Load a dataset: the YAML manifest plus every JSON item file.

    Returns ``{"manifest": <parsed manifest>, "items": {item_id: data}}``.
    An item's id comes from its ``_id`` field (popped from the data),
    falling back to the file stem when absent.
    """
    ds_dir = DATASETS_DIR / name
    manifest = yaml.safe_load((ds_dir / "manifest.yaml").read_text())

    items: dict[str, Any] = {}
    for path in sorted(ds_dir.glob("*.json")):
        # Skip any manifest.* file; only data items belong in `items`.
        if path.name.startswith("manifest"):
            continue
        data = json.loads(path.read_text())
        items[data.pop("_id", path.stem)] = data

    return {"manifest": manifest, "items": items}
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
MEETING_DS = load_dataset("meeting_summaries")
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
@pytest.fixture(params=list(MEETING_DS["items"].keys()))
def meeting_item(request):
    """Parametrized fixture: yields each dataset item.

    Parametrized over every item id in the ``meeting_summaries`` dataset
    (loaded at import time into MEETING_DS), so any test that takes this
    fixture runs once per dataset item.

    Returns:
        A ``(item_id, item_data)`` tuple for the current parameter.
    """
    item_id = request.param
    return item_id, MEETING_DS["items"][item_id]
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
# --- vibeval result saving ---
|
|
54
|
+
|
|
55
|
+
class VibevalResultCollector:
    """Collects turn-based trace and saves vibeval-format results (v0.2).

    Usage: call ``begin_turn()``, record any number of ``step()`` calls,
    then ``end_turn()``; repeat per turn. Set ``outputs`` as needed and
    finally call ``save()`` to write the result JSON.
    """

    def __init__(self, test_name: str, dataset: str, item_id: str, item_data: dict):
        self.test_name = test_name
        self.dataset = dataset
        self.item_id = item_id
        self.item_data = item_data
        self.turns: list[dict] = []
        self.outputs: dict = {}
        self.start_time = time.time()
        # State for the turn currently being assembled.
        self._current_input: dict = {}
        self._current_steps: list[dict] = []

    def begin_turn(self, input_data: dict):
        """Start a new turn with the given input."""
        self._current_input = input_data
        self._current_steps = []

    def step(self, step_type: str, data: dict):
        """Record a processing step in the current turn."""
        entry = {
            "type": step_type,
            "data": data,
            "timestamp": time.time(),
        }
        self._current_steps.append(entry)

    def end_turn(self, output_data: dict):
        """Complete the current turn with the given output."""
        turn = {
            "turn": len(self.turns) + 1,  # 1-based turn numbering
            "input": self._current_input,
            "steps": self._current_steps,
            "output": output_data,
        }
        self.turns.append(turn)

    def save(self, run_id: str = "latest"):
        """Write the collected result under results/<run_id>/ and return the path."""
        result = {
            "test_name": self.test_name,
            "dataset": self.dataset,
            "item_id": self.item_id,
            "judge_results": [],  # populated later by the judge step
            "trace": {"turns": self.turns},
            # Every item field except expected_* counts as a test input.
            "inputs": {
                key: value
                for key, value in self.item_data.items()
                if not key.startswith("expected")
            },
            "outputs": self.outputs,
            "timestamp": self.start_time,
            "duration": time.time() - self.start_time,
        }

        run_dir = RESULTS_DIR / run_id
        run_dir.mkdir(parents=True, exist_ok=True)
        path = run_dir / f"{self.test_name}__{self.item_id}.result.json"
        path.write_text(json.dumps(result, indent=2, default=str, ensure_ascii=False))
        return path
|