vibeval 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51)
  1. vibeval-0.1.0/.gitignore +26 -0
  2. vibeval-0.1.0/CLAUDE.md +96 -0
  3. vibeval-0.1.0/LICENSE +21 -0
  4. vibeval-0.1.0/PKG-INFO +124 -0
  5. vibeval-0.1.0/README.md +105 -0
  6. vibeval-0.1.0/examples/meeting_app/.vibeval.yml +1 -0
  7. vibeval-0.1.0/examples/meeting_app/__init__.py +0 -0
  8. vibeval-0.1.0/examples/meeting_app/llm_client.py +24 -0
  9. vibeval-0.1.0/examples/meeting_app/summarizer.py +58 -0
  10. vibeval-0.1.0/examples/meeting_app/tests/vibeval/meeting_summary/datasets/meeting_summaries/manifest.yaml +31 -0
  11. vibeval-0.1.0/examples/meeting_app/tests/vibeval/meeting_summary/datasets/meeting_summaries/planning.json +8 -0
  12. vibeval-0.1.0/examples/meeting_app/tests/vibeval/meeting_summary/datasets/meeting_summaries/standup.json +8 -0
  13. vibeval-0.1.0/examples/meeting_app/tests/vibeval/meeting_summary/tests/__init__.py +0 -0
  14. vibeval-0.1.0/examples/meeting_app/tests/vibeval/meeting_summary/tests/conftest.py +110 -0
  15. vibeval-0.1.0/examples/meeting_app/tests/vibeval/meeting_summary/tests/pytest.ini +3 -0
  16. vibeval-0.1.0/examples/meeting_app/tests/vibeval/meeting_summary/tests/test_summarizer.py +112 -0
  17. vibeval-0.1.0/examples/meeting_app/transcript_api.py +25 -0
  18. vibeval-0.1.0/plugin/.claude-plugin/marketplace.json +13 -0
  19. vibeval-0.1.0/plugin/.claude-plugin/plugin.json +9 -0
  20. vibeval-0.1.0/plugin/commands/vibeval-analyze.md +138 -0
  21. vibeval-0.1.0/plugin/commands/vibeval-design.md +128 -0
  22. vibeval-0.1.0/plugin/commands/vibeval-generate.md +224 -0
  23. vibeval-0.1.0/plugin/commands/vibeval-run.md +102 -0
  24. vibeval-0.1.0/plugin/commands/vibeval-update.md +123 -0
  25. vibeval-0.1.0/plugin/skills/protocol/SKILL.md +47 -0
  26. vibeval-0.1.0/plugin/skills/protocol/references/00-philosophy.md +143 -0
  27. vibeval-0.1.0/plugin/skills/protocol/references/01-overview.md +48 -0
  28. vibeval-0.1.0/plugin/skills/protocol/references/02-dataset.md +126 -0
  29. vibeval-0.1.0/plugin/skills/protocol/references/03-judge-spec.md +180 -0
  30. vibeval-0.1.0/plugin/skills/protocol/references/04-result.md +124 -0
  31. vibeval-0.1.0/plugin/skills/protocol/references/05-comparison.md +55 -0
  32. vibeval-0.1.0/pyproject.toml +39 -0
  33. vibeval-0.1.0/src/vibeval/__init__.py +3 -0
  34. vibeval-0.1.0/src/vibeval/cli.py +287 -0
  35. vibeval-0.1.0/src/vibeval/compare.py +289 -0
  36. vibeval-0.1.0/src/vibeval/config.py +66 -0
  37. vibeval-0.1.0/src/vibeval/conversation.py +57 -0
  38. vibeval-0.1.0/src/vibeval/dataset.py +147 -0
  39. vibeval-0.1.0/src/vibeval/judge.py +113 -0
  40. vibeval-0.1.0/src/vibeval/llm.py +266 -0
  41. vibeval-0.1.0/src/vibeval/report.py +743 -0
  42. vibeval-0.1.0/src/vibeval/result.py +135 -0
  43. vibeval-0.1.0/src/vibeval/rules.py +254 -0
  44. vibeval-0.1.0/tests/test_compare.py +208 -0
  45. vibeval-0.1.0/tests/test_conversation.py +75 -0
  46. vibeval-0.1.0/tests/test_judge.py +184 -0
  47. vibeval-0.1.0/tests/test_report.py +218 -0
  48. vibeval-0.1.0/tests/test_result.py +90 -0
  49. vibeval-0.1.0/tests/test_rules.py +193 -0
  50. vibeval-0.1.0/tests/test_target.py +164 -0
  51. vibeval-0.1.0/uv.lock +225 -0
@@ -0,0 +1,26 @@
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *.egg-info/
5
+ dist/
6
+ build/
7
+
8
+ # Virtual environment
9
+ .venv/
10
+
11
+ # pytest
12
+ .pytest_cache/
13
+
14
+
15
+ # vibeval results (generated at runtime)
16
+ .vibeval/
17
+ **/results/
18
+ **/comparisons/
19
+
20
+ # IDE
21
+ .vscode/
22
+ .idea/
23
+ *.swp
24
+
25
+ # OS
26
+ .DS_Store
@@ -0,0 +1,96 @@
1
+ # vibeval 项目开发指南
2
+
3
+ ## 项目概述
4
+
5
+ vibeval (Vibe Coding Eval) 是一个 AI 应用评测框架,由两部分组成:
6
+ - **vibeval CLI** (`src/vibeval/`) — 评测工具(judge、compare、simulate、report 等)
7
+ - **Claude Code Plugin** (`plugin/`) — VibeCoding 工作流(analyze、design、generate、run、update)
8
+
9
+ ## 核心原则
10
+
11
+ ### 1. 协议为本
12
+
13
+ `plugin/skills/protocol/references/` 下的协议文档是整个项目的核心设定(Source of Truth)。所有 plugin 命令、CLI 代码、文档都必须围绕和遵循协议。当任何地方的内容与协议矛盾时,以协议为准。编写文档或命令时,避免重复协议中已有的定义,改为引用协议文件。
14
+
15
+ 协议文件:
16
+ - `00-philosophy.md` — 评测哲学(信息不对称 + 全局视野)
17
+ - `01-overview.md` — 目录结构、统一 turn 模型
18
+ - `02-dataset.md` — 数据集格式
19
+ - `03-judge-spec.md` — 评判规格(rule/llm、target、所有字段定义)
20
+ - `04-result.md` — 结果格式(trace turns/steps)
21
+ - `05-comparison.md` — 横评格式
22
+
23
+ ### 2. 语言无关
24
+
25
+ 所有设计必须避免语言耦合。vibeval CLI 提供通用的、可通过命令行直接执行的功能,不需要在代码层面做依赖。用户的测试代码不 import vibeval 包,而是通过 subprocess 调用 CLI(如 `vibeval simulate`),或直接按协议格式生成文件。CLI 既服务开发者,也服务 VibeCoding Agent。
26
+
27
+ ### 3. CLI Help 是命令文档的唯一来源
28
+
29
+ CLI 的每个命令必须保持完整、最新的 `--help` 描述(包括用途、参数说明、使用示例)。Plugin 的命令文档和 SKILL.md 中不重复 CLI 的参数细节,而是提示通过 `vibeval --help` / `vibeval <command> --help` 查看。这样 CLI 和文档永远不会不同步。
30
+
31
+ ### 4. 测试目录的区分
32
+
33
+ `tests/` 目录下是 vibeval CLI 工具和代码自身的单元测试、集成测试。`examples/` 下的项目(如 `meeting_app/`)是独立的示例应用,它们有自己的测试(在 `examples/meeting_app/tests/vibeval/` 下),演示的是用户使用 vibeval 的完整流程。两者不要混淆。
34
+
35
+ ## 开发约定
36
+
37
+ ### 运行测试
38
+
39
+ ```bash
40
+ # vibeval 自身的测试
41
+ python -m pytest tests/ -v
42
+
43
+ # 示例应用的测试(独立运行)
44
+ cd examples/meeting_app/tests/vibeval/meeting_summary/tests
45
+ python -m pytest test_summarizer.py -v
46
+
47
+ # 示例应用的 judge
48
+ cd examples/meeting_app
49
+ vibeval judge meeting_summary latest
50
+ ```
51
+
52
+ ### 修改协议
53
+
54
+ 修改协议文件后,检查以下是否需要同步更新:
55
+ - `src/vibeval/` 中的代码实现(rules.py、llm.py、judge.py、compare.py、result.py)
56
+ - `plugin/commands/` 中的命令文档(应引用协议而非重复内容)
57
+ - `plugin/skills/protocol/SKILL.md`(Quick Reference 摘要)
58
+ - CLI 的 `--help` 描述
59
+
60
+ ### 修改 CLI
61
+
62
+ 新增或修改 CLI 命令时:
63
+ - 在 `cli.py` 中提供完整的 `help` 和 `description` 文本
64
+ - 不需要同步更新 plugin 文档中的命令细节——plugin 文档应引导查看 `vibeval --help`
65
+ - 添加对应的测试到 `tests/`
66
+
67
+ ### 版本管理
68
+
69
+ 项目版本在三处维护,必须保持一致:
70
+ - `pyproject.toml` — `version` 字段
71
+ - `plugin/.claude-plugin/plugin.json` — `version` 字段
72
+ - `src/vibeval/__init__.py` — `__version__` 变量
73
+
74
+ 发版或变更版本号时,三处同时更新。
75
+
76
+ ### 修改 Plugin
77
+
78
+ 修改 plugin 命令或 skill 时:
79
+ - 基础概念定义(数据格式、字段、规则名等)一律引用 `references/` 中的协议文件,不内联重复
80
+ - 操作指导(如何设计、如何生成)可以在命令文档中展开,但引用协议作为定义来源
81
+ - CLI 命令的参数和用法,引导使用 `vibeval --help` 查看
82
+
83
+ ### 项目结构
84
+
85
+ ```
86
+ vibeval/
87
+ ├── src/vibeval/ # CLI 工具实现
88
+ ├── plugin/ # Claude Code 插件
89
+ │ ├── commands/ # /vibeval-analyze, /vibeval-design, /vibeval-generate, /vibeval-run, /vibeval-update
90
+ │ └── skills/protocol/ # 数据协议(Source of Truth)
91
+ ├── examples/ # 独立的示例应用
92
+ ├── tests/ # vibeval 自身的测试
93
+ ├── CLAUDE.md
94
+ ├── README.md
95
+ └── pyproject.toml
96
+ ```
vibeval-0.1.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 SandyKid
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
vibeval-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,124 @@
1
+ Metadata-Version: 2.4
2
+ Name: vibeval
3
+ Version: 0.1.0
4
+ Summary: vibeval (Vibe Coding Eval) — AI application testing framework
5
+ Project-URL: Homepage, https://github.com/SandyKidYao/vibeval
6
+ Project-URL: Repository, https://github.com/SandyKidYao/vibeval
7
+ Project-URL: Issues, https://github.com/SandyKidYao/vibeval/issues
8
+ Author-email: SandyKid <yupengyao912@gmail.com>
9
+ License-Expression: MIT
10
+ License-File: LICENSE
11
+ Classifier: License :: OSI Approved :: MIT License
12
+ Classifier: Operating System :: OS Independent
13
+ Classifier: Programming Language :: Python :: 3
14
+ Requires-Python: >=3.10
15
+ Requires-Dist: pyyaml>=6.0
16
+ Provides-Extra: dev
17
+ Requires-Dist: pytest>=8.0; extra == 'dev'
18
+ Description-Content-Type: text/markdown
19
+
20
+ # vibeval — Vibe Coding Eval
21
+
22
+ AI 应用的快速评测框架。只需安装 Claude Code,即可完成从代码分析、测试生成到评估的全闭环。
23
+
24
+ ## 解决什么问题
25
+
26
+ 传统软件测试框架无法评估 AI 输出的质量;传统 AI 评测平台依赖数据集构建,跟不上功能迭代速度。vibeval 在两者之间取得平衡:
27
+
28
+ - 通过 VibeCoding 分析你的代码,快速生成合成数据和测试用例
29
+ - 确定性规则 + LLM 语义评判,双重评估
30
+ - 跨版本横评,追踪质量变化
31
+ - 语言无关:生成的测试代码适配你的项目框架,不依赖 vibeval 包
32
+
33
+ ## 前置要求
34
+
35
+ - [Claude Code](https://claude.ai/code)
36
+ - Python 3.10+
37
+
38
+ ## 安装
39
+
40
+ ```bash
41
+ # 安装 vibeval CLI
42
+ pip install vibeval
43
+
44
+ # 安装 Claude Code 插件(在 Claude Code 中执行)
45
+ /install-plugin /path/to/vibeval/plugin
46
+ ```
47
+
48
+ ## 使用方式
49
+
50
+ 在 Claude Code 中执行,全程不需要离开对话:
51
+
52
+ ```
53
+ /vibeval-analyze → /vibeval-design → /vibeval-generate → /vibeval-run
54
+
55
+ 代码变更 → /vibeval-update ──────┘
56
+ ```
57
+
58
+ ### 分析代码
59
+
60
+ ```
61
+ /vibeval-analyze meeting_summary src/services/
62
+ ```
63
+
64
+ 分析源码,识别 AI 调用点、数据流、Mock 点,给出可评估性改进建议。
65
+
66
+ ### 设计测试
67
+
68
+ ```
69
+ /vibeval-design meeting_summary
70
+ ```
71
+
72
+ 设计合成数据规格、评判标准、测试结构。每一步产出可编辑的中间文件,可以 review 和修改。
73
+
74
+ ### 生成代码和数据
75
+
76
+ ```
77
+ /vibeval-generate meeting_summary
78
+ ```
79
+
80
+ 生成完整的测试套件(合成数据 + 测试代码),使用你项目的测试框架(pytest/vitest/jest/...)。
81
+
82
+ ### 运行 + 评估
83
+
84
+ ```
85
+ /vibeval-run meeting_summary
86
+ ```
87
+
88
+ 一步完成:执行测试 → 评估 → 诊断分析 → 生成报告。
89
+
90
+ ### 代码变更后更新
91
+
92
+ ```
93
+ /vibeval-update meeting_summary
94
+ ```
95
+
96
+ 检测代码变更,增量更新测试套件和数据。
97
+
98
+ ### 跨版本对比
99
+
100
+ ```bash
101
+ # 统计对比
102
+ vibeval diff meeting_summary run_a run_b
103
+
104
+ # LLM 深度横评
105
+ vibeval compare meeting_summary run_a run_b
106
+ ```
107
+
108
+ ### 可视化报告
109
+
110
+ ```bash
111
+ vibeval report meeting_summary latest --open
112
+ ```
113
+
114
+ 生成自包含的 HTML 报告,涵盖测试设计、数据、过程 trace 和评判结果。
115
+
116
+ ### 更多命令
117
+
118
+ ```bash
119
+ vibeval --help
120
+ ```
121
+
122
+ ## License
123
+
124
+ MIT
@@ -0,0 +1,105 @@
1
+ # vibeval — Vibe Coding Eval
2
+
3
+ AI 应用的快速评测框架。只需安装 Claude Code,即可完成从代码分析、测试生成到评估的全闭环。
4
+
5
+ ## 解决什么问题
6
+
7
+ 传统软件测试框架无法评估 AI 输出的质量;传统 AI 评测平台依赖数据集构建,跟不上功能迭代速度。vibeval 在两者之间取得平衡:
8
+
9
+ - 通过 VibeCoding 分析你的代码,快速生成合成数据和测试用例
10
+ - 确定性规则 + LLM 语义评判,双重评估
11
+ - 跨版本横评,追踪质量变化
12
+ - 语言无关:生成的测试代码适配你的项目框架,不依赖 vibeval 包
13
+
14
+ ## 前置要求
15
+
16
+ - [Claude Code](https://claude.ai/code)
17
+ - Python 3.10+
18
+
19
+ ## 安装
20
+
21
+ ```bash
22
+ # 安装 vibeval CLI
23
+ pip install vibeval
24
+
25
+ # 安装 Claude Code 插件(在 Claude Code 中执行)
26
+ /install-plugin /path/to/vibeval/plugin
27
+ ```
28
+
29
+ ## 使用方式
30
+
31
+ 在 Claude Code 中执行,全程不需要离开对话:
32
+
33
+ ```
34
+ /vibeval-analyze → /vibeval-design → /vibeval-generate → /vibeval-run
35
+
36
+ 代码变更 → /vibeval-update ──────┘
37
+ ```
38
+
39
+ ### 分析代码
40
+
41
+ ```
42
+ /vibeval-analyze meeting_summary src/services/
43
+ ```
44
+
45
+ 分析源码,识别 AI 调用点、数据流、Mock 点,给出可评估性改进建议。
46
+
47
+ ### 设计测试
48
+
49
+ ```
50
+ /vibeval-design meeting_summary
51
+ ```
52
+
53
+ 设计合成数据规格、评判标准、测试结构。每一步产出可编辑的中间文件,可以 review 和修改。
54
+
55
+ ### 生成代码和数据
56
+
57
+ ```
58
+ /vibeval-generate meeting_summary
59
+ ```
60
+
61
+ 生成完整的测试套件(合成数据 + 测试代码),使用你项目的测试框架(pytest/vitest/jest/...)。
62
+
63
+ ### 运行 + 评估
64
+
65
+ ```
66
+ /vibeval-run meeting_summary
67
+ ```
68
+
69
+ 一步完成:执行测试 → 评估 → 诊断分析 → 生成报告。
70
+
71
+ ### 代码变更后更新
72
+
73
+ ```
74
+ /vibeval-update meeting_summary
75
+ ```
76
+
77
+ 检测代码变更,增量更新测试套件和数据。
78
+
79
+ ### 跨版本对比
80
+
81
+ ```bash
82
+ # 统计对比
83
+ vibeval diff meeting_summary run_a run_b
84
+
85
+ # LLM 深度横评
86
+ vibeval compare meeting_summary run_a run_b
87
+ ```
88
+
89
+ ### 可视化报告
90
+
91
+ ```bash
92
+ vibeval report meeting_summary latest --open
93
+ ```
94
+
95
+ 生成自包含的 HTML 报告,涵盖测试设计、数据、过程 trace 和评判结果。
96
+
97
+ ### 更多命令
98
+
99
+ ```bash
100
+ vibeval --help
101
+ ```
102
+
103
+ ## License
104
+
105
+ MIT
@@ -0,0 +1 @@
1
+ vibeval_root: tests/vibeval
File without changes
@@ -0,0 +1,24 @@
1
+ """LLM client — wraps the actual LLM API call.
2
+
3
+ In production this calls OpenAI/Anthropic. In tests this gets mocked.
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+
9
def chat(prompt: str, system: str = "") -> str:
    """Send *prompt* (with optional *system* instructions) to the LLM and return the reply.

    This is a deliberate stub: production code would call a real provider
    here, while tests replace this function with a mock.  Calling the stub
    always raises RuntimeError.
    """
    # Real implementation sketch (kept for reference):
    #   client = openai.OpenAI()
    #   resp = client.chat.completions.create(
    #       model="gpt-4o",
    #       messages=[
    #           {"role": "system", "content": system},
    #           {"role": "user", "content": prompt},
    #       ],
    #   )
    #   return resp.choices[0].message.content
    message = (
        "LLM client not configured. "
        "Set OPENAI_API_KEY or mock this in tests."
    )
    raise RuntimeError(message)
@@ -0,0 +1,58 @@
1
+ """Meeting summarizer — the core AI pipeline.
2
+
3
+ Pipeline:
4
+ 1. Fetch transcript from API
5
+ 2. Call LLM to generate summary
6
+ 3. Call LLM to extract action items
7
+ 4. Return structured result
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ from typing import Any
13
+
14
+ from . import llm_client, transcript_api
15
+
16
+
17
def summarize_meeting(meeting_id: str) -> dict[str, Any]:
    """Run the full meeting summarization pipeline for *meeting_id*.

    Pipeline: fetch the transcript, LLM pass #1 produces a summary,
    LLM pass #2 extracts action items from that summary, and everything
    is returned as one plain dict.
    """
    # Fetch the raw transcript; missing fields fall back to empty defaults.
    transcript = transcript_api.fetch_transcript(meeting_id)
    speakers = transcript.get("speakers", [])
    text = transcript.get("text", "")

    # LLM pass #1 — summarize, insisting that every time/date survives
    # (the dataset judge checks outputs.summary for expected timestamps).
    speaker_list = ", ".join(speakers)
    summary_prompt = (
        f"Summarize the following meeting transcript.\n"
        f"Speakers: {speaker_list}\n\n"
        f"Transcript:\n{text}\n\n"
        "Requirements:\n"
        "- Include ALL mentioned times, dates, and deadlines\n"
        "- Attribute key points to the correct speaker\n"
        "- Be concise but complete"
    )
    summary = llm_client.chat(
        prompt=summary_prompt,
        system="You are a meeting summarizer. Be thorough and accurate.",
    )

    # LLM pass #2 — action items are derived from the summary, not the
    # raw transcript, so summary omissions propagate (by design).
    actions_prompt = (
        "Extract action items from this meeting summary:\n\n"
        f"{summary}\n\n"
        "Format each as: - [Owner] Action (deadline if mentioned)"
    )
    action_items = llm_client.chat(
        prompt=actions_prompt,
        system="You extract action items from meeting summaries.",
    )

    # Assemble the structured result; key order is kept stable for callers.
    return {
        "meeting_id": meeting_id,
        "speakers": speakers,
        "summary": summary,
        "action_items": action_items,
        "transcript_length": len(text),
    }
@@ -0,0 +1,31 @@
1
+ name: meeting_summaries
2
+ description: "Synthetic meeting transcripts for testing the summarization pipeline"
3
+ version: "1"
4
+ tags:
5
+ - meeting
6
+ - summarization
7
+
8
+ judge_specs:
9
+ - method: rule
10
+ rule: contains_all
11
+ args:
12
+ field: "outputs.summary"
13
+ values_from: "expected_times"
14
+ weight: gate
15
+
16
+ - method: rule
17
+ rule: length_between
18
+ args:
19
+ field: "outputs.action_items"
20
+ min: 10
21
+ max: 5000
22
+
23
+ - method: rule
24
+ rule: max_steps
25
+ args:
26
+ max: 20
27
+
28
+ - method: rule
29
+ rule: max_turns
30
+ args:
31
+ max: 1
@@ -0,0 +1,8 @@
1
+ {
2
+ "_id": "planning",
3
+ "meeting_id": "mtg-plan-001",
4
+ "_tags": ["planning", "deadlines"],
5
+ "speakers": ["Diana", "Eve"],
6
+ "text": "Diana: Let's go over the project timeline. Phase 1 needs to be completed by June 30th. The kickoff was on January 15th.\n\nEve: Right. The mid-project review is scheduled for April 15th at 10:00 AM. We also have weekly syncs every Tuesday at 3:30 PM.\n\nDiana: The final demo is on July 5th. We need to submit the report by end of day July 10th.\n\nEve: Got it. I'll set up the sprint planning for next Monday at 9:30 AM.",
7
+ "expected_times": ["June 30th", "January 15th", "April 15th", "10:00 AM", "Tuesday", "3:30 PM", "July 5th", "July 10th", "Monday", "9:30 AM"]
8
+ }
@@ -0,0 +1,8 @@
1
+ {
2
+ "_id": "standup",
3
+ "meeting_id": "mtg-standup-001",
4
+ "_tags": ["standup", "multiple-times"],
5
+ "speakers": ["Alice", "Bob", "Charlie"],
6
+ "text": "Alice: Good morning everyone. It's 9:00 AM, let's start our standup. Bob, what did you work on yesterday?\n\nBob: Yesterday I finished the API refactoring that was due by March 15th. Today I'll work on the database migration, which needs to be done before the 3:00 PM deployment window.\n\nAlice: Great. Charlie?\n\nCharlie: I have a meeting with the client at 2:30 PM today to discuss the Q2 roadmap. The proposal deadline is April 1st. I'll also review Bob's PR before 11:00 AM.\n\nAlice: Perfect. Remember the all-hands is at 4:00 PM today. Let's wrap up by 9:15 AM.",
7
+ "expected_times": ["9:00 AM", "March 15th", "3:00 PM", "2:30 PM", "April 1st", "11:00 AM", "4:00 PM", "9:15 AM"]
8
+ }
@@ -0,0 +1,110 @@
1
+ """Pytest configuration for meeting_app vibeval tests.
2
+
3
+ [Generated by VibeCoding] This conftest:
4
+ 1. Loads vibeval datasets
5
+ 2. Provides a fixture that creates tracked mock wrappers
6
+ 3. Saves vibeval-format results after each test
7
+ """
8
+
9
+ import json
10
+ import time
11
+ from pathlib import Path
12
+ from typing import Any
13
+ from unittest.mock import patch
14
+
15
+ import pytest
16
+ import yaml
17
+
18
+
19
+ # --- vibeval dataset loading ---
20
+
21
+ FEATURE_DIR = Path(__file__).parent.parent # tests/vibeval/meeting_summary/
22
+ DATASETS_DIR = FEATURE_DIR / "datasets"  # dataset manifests + item JSON files
23
+ RESULTS_DIR = FEATURE_DIR / "results"  # per-run *.result.json output root
24
+
25
+
26
def load_dataset(name: str) -> dict[str, Any]:
    """Load one dataset: its manifest.yaml plus every JSON item file.

    Returns ``{"manifest": <parsed yaml>, "items": {item_id: item_data}}``.
    The item id comes from the item's "_id" key when present, otherwise
    the file stem is used.
    """
    ds_dir = DATASETS_DIR / name
    manifest = yaml.safe_load((ds_dir / "manifest.yaml").read_text())

    items: dict[str, Any] = {}
    for entry in sorted(ds_dir.iterdir()):
        # Skip the manifest itself; only *.json files are dataset items.
        if entry.name.startswith("manifest") or entry.suffix != ".json":
            continue
        payload = json.loads(entry.read_text())
        items[payload.pop("_id", entry.stem)] = payload

    return {"manifest": manifest, "items": items}
41
+
42
+
43
# Loaded once at import time so fixture params are known at collection.
MEETING_DS = load_dataset("meeting_summaries")


@pytest.fixture(params=list(MEETING_DS["items"].keys()))
def meeting_item(request):
    """Parametrized fixture: yields one (item_id, item_data) pair per dataset item."""
    key = request.param
    return key, MEETING_DS["items"][key]
51
+
52
+
53
+ # --- vibeval result saving ---
54
+
55
class VibevalResultCollector:
    """Collects a turn-based trace and saves vibeval-format results (v0.2).

    Per test: call begin_turn() -> step()* -> end_turn() for each turn,
    set ``outputs``, then save() once.  ``judge_results`` is written empty
    here — presumably filled in later by `vibeval judge` (confirm against
    the CLI).
    """

    def __init__(self, test_name: str, dataset: str, item_id: str, item_data: dict):
        self.test_name = test_name
        self.dataset = dataset
        self.item_id = item_id
        self.item_data = item_data
        self.turns: list[dict] = []   # completed turns, in order
        self.outputs: dict = {}       # final outputs; set externally by the test
        self.start_time = time.time()
        # Builders for the turn currently in progress.
        self._current_input: dict = {}
        self._current_steps: list[dict] = []

    def begin_turn(self, input_data: dict):
        """Start a new turn with the given input."""
        self._current_input = input_data
        self._current_steps = []

    def step(self, step_type: str, data: dict):
        """Record a processing step in the current turn."""
        self._current_steps.append({
            "type": step_type,
            "data": data,
            "timestamp": time.time(),
        })

    def end_turn(self, output_data: dict):
        """Complete the current turn with the given output."""
        self.turns.append({
            "turn": len(self.turns) + 1,
            "input": self._current_input,
            "steps": self._current_steps,
            "output": output_data,
        })
        # Reset the builders: previously the recorded turn kept the live
        # steps list, so a stray step() after end_turn() silently mutated
        # an already-saved turn.  Rebinding here severs that alias.
        self._current_input = {}
        self._current_steps = []

    def save(self, run_id: str = "latest"):
        """Write the result JSON under results/<run_id>/ and return its path."""
        duration = time.time() - self.start_time
        result = {
            "test_name": self.test_name,
            "dataset": self.dataset,
            "item_id": self.item_id,
            "judge_results": [],
            "trace": {"turns": self.turns},
            # Inputs exclude expectation fields (keys starting "expected"),
            # which exist only for the judges.
            "inputs": {k: v for k, v in self.item_data.items() if not k.startswith("expected")},
            "outputs": self.outputs,
            "timestamp": self.start_time,
            "duration": duration,
        }

        run_dir = RESULTS_DIR / run_id
        run_dir.mkdir(parents=True, exist_ok=True)
        path = run_dir / f"{self.test_name}__{self.item_id}.result.json"
        # default=str: deliberate best-effort stringification of non-JSON
        # types (e.g. Path); ensure_ascii=False keeps CJK text readable.
        path.write_text(json.dumps(result, indent=2, default=str, ensure_ascii=False))
        return path
@@ -0,0 +1,3 @@
1
+ [pytest]
2
+ testpaths = .
3
+ pythonpath = ../../../../..