trainsieve 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- trainsieve-0.1.0/.gitignore +18 -0
- trainsieve-0.1.0/PKG-INFO +11 -0
- trainsieve-0.1.0/README.md +191 -0
- trainsieve-0.1.0/demo.gif +0 -0
- trainsieve-0.1.0/demo.tape +57 -0
- trainsieve-0.1.0/pyproject.toml +23 -0
- trainsieve-0.1.0/sieve/__init__.py +1 -0
- trainsieve-0.1.0/sieve/cli.py +175 -0
- trainsieve-0.1.0/sieve/curate/__init__.py +0 -0
- trainsieve-0.1.0/sieve/curate/dataset.py +60 -0
- trainsieve-0.1.0/sieve/db.py +24 -0
- trainsieve-0.1.0/sieve/ingest/__init__.py +0 -0
- trainsieve-0.1.0/sieve/ingest/base.py +10 -0
- trainsieve-0.1.0/sieve/ingest/jsonl.py +35 -0
- trainsieve-0.1.0/sieve/models.py +63 -0
- trainsieve-0.1.0/sieve/score/__init__.py +0 -0
- trainsieve-0.1.0/sieve/score/base.py +10 -0
- trainsieve-0.1.0/sieve/score/heuristic.py +46 -0
- trainsieve-0.1.0/sieve/trigger/__init__.py +0 -0
- trainsieve-0.1.0/sieve/trigger/axolotl.py +61 -0
- trainsieve-0.1.0/tests/__init__.py +0 -0
- trainsieve-0.1.0/tests/test_dataset.py +80 -0
- trainsieve-0.1.0/tests/test_ingest.py +49 -0
- trainsieve-0.1.0/tests/test_score.py +40 -0
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: trainsieve
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Production LLM traces → curated training datasets → fine-tune pipeline
|
|
5
|
+
Requires-Python: >=3.11
|
|
6
|
+
Requires-Dist: jsonlines>=4.0
|
|
7
|
+
Requires-Dist: pydantic>=2.0
|
|
8
|
+
Requires-Dist: pyyaml>=6.0
|
|
9
|
+
Requires-Dist: rich>=13.0
|
|
10
|
+
Requires-Dist: sqlalchemy>=2.0
|
|
11
|
+
Requires-Dist: typer>=0.12
|
|
@@ -0,0 +1,191 @@
|
|
|
1
|
+
# sieve
|
|
2
|
+
|
|
3
|
+
> Your model is only as good as the data you trained it on. Most teams never close that loop.
|
|
4
|
+
|
|
5
|
+
[License: Apache 2.0](LICENSE)
|
|
6
|
+
[Python 3.11+](https://www.python.org)
|
|
7
|
+
[]()
|
|
8
|
+
|
|
9
|
+
---
|
|
10
|
+
|
|
11
|
+
Frontier labs (Anthropic, OpenAI, Google) continuously improve their models by mining production interactions for training signal. They call it a **data flywheel**. Every real user query that gets a great response becomes training data for the next version.
|
|
12
|
+
|
|
13
|
+
Your team doesn't have that. You fine-tune once, ship, and hope.
|
|
14
|
+
|
|
15
|
+
**sieve** is the missing piece: a CLI pipeline that turns your production LLM logs into a versioned, high-quality training dataset — and hands it off to Axolotl to retrain (LLaMA-Factory support is on the roadmap).
|
|
16
|
+
|
|
17
|
+
```
|
|
18
|
+
production logs ──► score ──► filter ──► versioned dataset ──► fine-tune
|
|
19
|
+
```
|
|
20
|
+
|
|
21
|
+
No API keys. No SaaS. Runs entirely on your machine.
|
|
22
|
+
|
|
23
|
+

|
|
24
|
+
|
|
25
|
+
---
|
|
26
|
+
|
|
27
|
+
## The problem it solves
|
|
28
|
+
|
|
29
|
+
You've fine-tuned a Llama model on your support chat data. It works well. Three months later, your product has changed, your users are asking different questions, and the model is drifting. You know the answers to those new questions exist somewhere in your logs.
|
|
30
|
+
|
|
31
|
+
But finding them means:
|
|
32
|
+
|
|
33
|
+
1. Manually exporting thousands of log rows
|
|
34
|
+
2. Reading through them to find the good ones
|
|
35
|
+
3. Formatting them for your training framework
|
|
36
|
+
4. Remembering which examples you already used last time
|
|
37
|
+
|
|
38
|
+
**sieve automates all of that.**
|
|
39
|
+
|
|
40
|
+
---
|
|
41
|
+
|
|
42
|
+
## Quickstart
|
|
43
|
+
|
|
44
|
+
```bash
|
|
45
|
+
git clone https://github.com/nidhisebastian008/sieve
|
|
46
|
+
cd sieve
|
|
47
|
+
python3 -m venv .venv && source .venv/bin/activate
|
|
48
|
+
pip install -e .
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
```bash
|
|
52
|
+
# ingest your logs (supports OpenAI, prompt/response, and conversations formats)
|
|
53
|
+
sieve ingest production_logs.jsonl
|
|
54
|
+
|
|
55
|
+
# score every interaction — no API key needed
|
|
56
|
+
sieve score
|
|
57
|
+
|
|
58
|
+
# see what you've got
|
|
59
|
+
sieve stats
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
```
|
|
63
|
+
Sieve Pipeline Stats
|
|
64
|
+
┏━━━━━━━━━━━━━━━━━━━━┳━━━━━━━┓
|
|
65
|
+
┃ Metric ┃ Value ┃
|
|
66
|
+
┡━━━━━━━━━━━━━━━━━━━━╇━━━━━━━┩
|
|
67
|
+
│ Total interactions │ 8420 │
|
|
68
|
+
│ Scored │ 8420 │
|
|
69
|
+
│ Avg quality score │ 0.614 │
|
|
70
|
+
│ Dataset versions │ 1 │
|
|
71
|
+
│ Training runs │ 0 │
|
|
72
|
+
└────────────────────┴───────┘
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
```bash
|
|
76
|
+
# curate only the good stuff
|
|
77
|
+
sieve dataset create v1.0 --min-quality 0.7
|
|
78
|
+
|
|
79
|
+
# export for fine-tuning
|
|
80
|
+
sieve dataset export v1.0 --output train.jsonl
|
|
81
|
+
|
|
82
|
+
# generate training config + command
|
|
83
|
+
sieve train v1.0 --export-path train.jsonl --base-model meta-llama/Llama-3.2-3B-Instruct
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
```
|
|
87
|
+
✓ Training run recorded (id: 32bd15fb)
|
|
88
|
+
|
|
89
|
+
Run training with Axolotl:
|
|
90
|
+
pip install axolotl
|
|
91
|
+
axolotl train axolotl_config.yml
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
---
|
|
95
|
+
|
|
96
|
+
## Dataset versioning
|
|
97
|
+
|
|
98
|
+
The flywheel only works if you track what you've already trained on. sieve versions every dataset and supports incremental builds — next month's run only includes new interactions.
|
|
99
|
+
|
|
100
|
+
```bash
|
|
101
|
+
# next month: only new interactions not in v1.0
|
|
102
|
+
sieve dataset create v2.0 --min-quality 0.7 --diff v1.0
|
|
103
|
+
|
|
104
|
+
sieve dataset list
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
```
|
|
108
|
+
┏━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━━━━━━━━━┓
|
|
109
|
+
┃ Name ┃ Interactions ┃ Min Quality ┃ Parent ┃ Created ┃
|
|
110
|
+
┡━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━╇━━━━━━━━━━━━━━━━━━┩
|
|
111
|
+
│ v1.0 │ 3102 │ 0.70 │ — │ 2026-06-30 10:00 │
|
|
112
|
+
│ v2.0 │ 891 │ 0.70 │ v1.0 │ 2026-07-30 09:00 │
|
|
113
|
+
└──────┴──────────────┴─────────────┴────────┴──────────────────┘
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
Lineage is tracked in a local SQLite database at `~/.sieve/sieve.db`. You own your data.
|
|
117
|
+
|
|
118
|
+
---
|
|
119
|
+
|
|
120
|
+
## Input formats
|
|
121
|
+
|
|
122
|
+
sieve normalises all common LLM log formats automatically:
|
|
123
|
+
|
|
124
|
+
```jsonl
|
|
125
|
+
{"messages": [{"role": "user", "content": "..."}, {"role": "assistant", "content": "..."}]}
|
|
126
|
+
{"prompt": "...", "response": "..."}
|
|
127
|
+
{"conversations": [{"role": "user", "content": "..."}, {"role": "assistant", "content": "..."}]}
|
|
128
|
+
```
|
|
129
|
+
|
|
130
|
+
---
|
|
131
|
+
|
|
132
|
+
## Scorers
|
|
133
|
+
|
|
134
|
+
| Scorer | Needs | What it checks |
|
|
135
|
+
|---|---|---|
|
|
136
|
+
| **Heuristic** (default) | Nothing | Response length, refusal patterns, empty messages |
|
|
137
|
+
| **LLM-as-judge** (coming) | Ollama or API key | Helpfulness, factuality, relevance via model grading |
|
|
138
|
+
|
|
139
|
+
---
|
|
140
|
+
|
|
141
|
+
## Architecture
|
|
142
|
+
|
|
143
|
+
sieve is built around pluggable interfaces. Swap out any stage.
|
|
144
|
+
|
|
145
|
+
```
|
|
146
|
+
sieve/
|
|
147
|
+
├── ingest/ ← BaseIngester (JSONL ✓ · Langfuse coming · OpenTelemetry coming)
|
|
148
|
+
├── score/ ← BaseScorer (Heuristic ✓ · LLM judge coming)
|
|
149
|
+
├── curate/ ← versioning, lineage, diff, export
|
|
150
|
+
└── trigger/ ← Axolotl ✓ · LLaMA-Factory coming · Modal coming
|
|
151
|
+
```
|
|
152
|
+
|
|
153
|
+
Implement `BaseIngester` or `BaseScorer`, drop it in — no core changes needed.
|
|
154
|
+
|
|
155
|
+
---
|
|
156
|
+
|
|
157
|
+
## Who this is for
|
|
158
|
+
|
|
159
|
+
- Teams fine-tuning **open models** (Llama, Mistral, Qwen) on their own infrastructure
|
|
160
|
+
- Companies with **data privacy requirements** that can't send logs to a third-party platform
|
|
161
|
+
- Anyone doing **continuous fine-tuning** who is currently managing training data in spreadsheets
|
|
162
|
+
|
|
163
|
+
If you're using OpenAI or Anthropic APIs and never touching open models, sieve is not for you — yet.
|
|
164
|
+
|
|
165
|
+
---
|
|
166
|
+
|
|
167
|
+
## Roadmap
|
|
168
|
+
|
|
169
|
+
- [ ] Langfuse ingester — pull traces directly without manual export
|
|
170
|
+
- [ ] OpenTelemetry ingester
|
|
171
|
+
- [ ] LLM-as-judge scorer (Ollama + Anthropic/OpenAI)
|
|
172
|
+
- [ ] LLaMA-Factory trigger
|
|
173
|
+
- [ ] Modal cloud GPU trigger
|
|
174
|
+
- [ ] HuggingFace dataset push
|
|
175
|
+
- [ ] `sieve eval` — score base model vs fine-tuned on held-out examples
|
|
176
|
+
|
|
177
|
+
---
|
|
178
|
+
|
|
179
|
+
## Contributing
|
|
180
|
+
|
|
181
|
+
PRs welcome. The most impactful areas right now: new ingesters and new scorers.
|
|
182
|
+
|
|
183
|
+
```bash
|
|
184
|
+
pytest tests/ -v # 12 tests, all passing
|
|
185
|
+
```
|
|
186
|
+
|
|
187
|
+
---
|
|
188
|
+
|
|
189
|
+
## License
|
|
190
|
+
|
|
191
|
+
Apache 2.0
|
|
Binary file
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
Output demo.gif
|
|
2
|
+
|
|
3
|
+
Set Shell "bash"
|
|
4
|
+
Set FontSize 14
|
|
5
|
+
Set Width 900
|
|
6
|
+
Set Height 550
|
|
7
|
+
Set Theme "Dracula"
|
|
8
|
+
Set Padding 20
|
|
9
|
+
Set TypingSpeed 40ms
|
|
10
|
+
|
|
11
|
+
Hide
|
|
12
|
+
Type "cd ~/sieve && source .venv/bin/activate"
|
|
13
|
+
Enter
|
|
14
|
+
Sleep 500ms
|
|
15
|
+
Show
|
|
16
|
+
|
|
17
|
+
Type "# sieve — production traces → training data pipeline"
|
|
18
|
+
Sleep 1s
|
|
19
|
+
Enter
|
|
20
|
+
Sleep 300ms
|
|
21
|
+
|
|
22
|
+
Type "sieve ingest /tmp/sample.jsonl"
|
|
23
|
+
Sleep 500ms
|
|
24
|
+
Enter
|
|
25
|
+
Sleep 1.5s
|
|
26
|
+
|
|
27
|
+
Type "sieve score"
|
|
28
|
+
Sleep 500ms
|
|
29
|
+
Enter
|
|
30
|
+
Sleep 1.5s
|
|
31
|
+
|
|
32
|
+
Type "sieve stats"
|
|
33
|
+
Sleep 500ms
|
|
34
|
+
Enter
|
|
35
|
+
Sleep 2s
|
|
36
|
+
|
|
37
|
+
Type "sieve dataset create v1.0 --min-quality 0.6"
|
|
38
|
+
Sleep 500ms
|
|
39
|
+
Enter
|
|
40
|
+
Sleep 1.5s
|
|
41
|
+
|
|
42
|
+
Type "sieve dataset list"
|
|
43
|
+
Sleep 500ms
|
|
44
|
+
Enter
|
|
45
|
+
Sleep 2s
|
|
46
|
+
|
|
47
|
+
Type "sieve dataset export v1.0 --output train.jsonl"
|
|
48
|
+
Sleep 500ms
|
|
49
|
+
Enter
|
|
50
|
+
Sleep 1.5s
|
|
51
|
+
|
|
52
|
+
Type "sieve train v1.0 --export-path train.jsonl"
|
|
53
|
+
Sleep 500ms
|
|
54
|
+
Enter
|
|
55
|
+
Sleep 2s
|
|
56
|
+
|
|
57
|
+
Sleep 2s
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["hatchling"]
|
|
3
|
+
build-backend = "hatchling.build"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "trainsieve"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Production LLM traces → curated training datasets → fine-tune pipeline"
|
|
9
|
+
requires-python = ">=3.11"
|
|
10
|
+
dependencies = [
|
|
11
|
+
"typer>=0.12",
|
|
12
|
+
"sqlalchemy>=2.0",
|
|
13
|
+
"pydantic>=2.0",
|
|
14
|
+
"rich>=13.0",
|
|
15
|
+
"jsonlines>=4.0",
|
|
16
|
+
"pyyaml>=6.0",
|
|
17
|
+
]
|
|
18
|
+
|
|
19
|
+
[project.scripts]
|
|
20
|
+
sieve = "sieve.cli:app"
|
|
21
|
+
|
|
22
|
+
[tool.hatch.build.targets.wheel]
|
|
23
|
+
packages = ["sieve"]
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "0.1.0"
|
|
@@ -0,0 +1,175 @@
|
|
|
1
|
+
from datetime import datetime
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
from typing import Optional
|
|
4
|
+
|
|
5
|
+
import typer
|
|
6
|
+
from rich.console import Console
|
|
7
|
+
from rich.table import Table
|
|
8
|
+
|
|
9
|
+
# Root CLI application; running it without arguments prints the help text.
app = typer.Typer(
    help="sieve — production traces → fine-tuning data pipeline",
    no_args_is_help=True,
)

# Command group mounted as ``sieve dataset ...``.
dataset_app = typer.Typer(no_args_is_help=True)
app.add_typer(
    dataset_app,
    name="dataset",
    help="Dataset version management",
)

# Shared Rich console used by every command for user-facing output.
console = Console()
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def _session():
    """Return a database session after making sure the schema exists.

    The ``sieve.db`` import happens at call time rather than at module import.
    """
    from sieve.db import get_session as open_session
    from sieve.db import init_db as ensure_schema

    ensure_schema()
    return open_session()
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
@app.command()
def ingest(
    path: Path = typer.Argument(..., help="JSONL file to ingest"),
):
    """Ingest interactions from a JSONL file."""
    # The docstring above is what Typer shows as the command help, so it stays terse.
    from sieve.ingest.jsonl import JSONLIngester

    db = _session()
    records = JSONLIngester(path).ingest()

    # Every yielded interaction is staged; one commit happens after the whole
    # file has been read.
    ingested = 0
    for ingested, record in enumerate(records, start=1):
        db.add(record)
    db.commit()

    console.print(f"[green]✓[/green] Ingested [bold]{ingested}[/bold] interactions from {path}")
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
@app.command()
def score(
    min_len: int = typer.Option(50, help="Min assistant response character length"),
    max_len: int = typer.Option(4000, help="Max assistant response character length"),
    rescore: bool = typer.Option(False, "--rescore", help="Re-score already scored interactions"),
):
    """Score interactions with heuristic scorer."""
    # The docstring above is what Typer shows as the command help, so it stays terse.
    #
    # Without --rescore only interactions whose quality_score is still NULL are
    # scored; with it, every stored interaction is scored again.
    from datetime import timezone

    from sieve.models import Interaction
    from sieve.score.heuristic import HeuristicScorer

    session = _session()
    scorer = HeuristicScorer(min_len, max_len)

    query = session.query(Interaction)
    if not rescore:
        query = query.filter(Interaction.quality_score.is_(None))

    # datetime.utcnow() is deprecated since Python 3.12. Build the equivalent
    # naive UTC value so stored timestamps keep the same shape, and take it
    # once so the whole batch shares a single scoring time.
    scored_at = datetime.now(timezone.utc).replace(tzinfo=None)

    interactions = query.all()
    for interaction in interactions:
        interaction.quality_score = scorer.score(interaction)
        interaction.scored_at = scored_at
        interaction.scorer = "heuristic"
    session.commit()

    console.print(f"[green]✓[/green] Scored [bold]{len(interactions)}[/bold] interactions")
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
@app.command()
def stats():
    """Show pipeline stats."""
    # The docstring above is what Typer shows as the command help, so it stays terse.
    from sqlalchemy import func
    from sieve.models import Interaction, DatasetVersion, TrainingRun

    session = _session()

    is_scored = Interaction.quality_score.isnot(None)
    total = session.query(func.count(Interaction.id)).scalar()
    scored = session.query(func.count(Interaction.id)).filter(is_scored).scalar()
    avg = session.query(func.avg(Interaction.quality_score)).filter(is_scored).scalar()
    versions = session.query(func.count(DatasetVersion.id)).scalar()
    runs = session.query(func.count(TrainingRun.id)).scalar()

    # AVG() yields NULL/None only when nothing has been scored. A genuine
    # average of 0.0 is falsy, so test for None explicitly or it would be
    # rendered as "—".
    avg_text = f"{avg:.3f}" if avg is not None else "—"

    table = Table(title="Sieve Pipeline Stats")
    table.add_column("Metric", style="bold")
    table.add_column("Value")
    table.add_row("Total interactions", str(total))
    table.add_row("Scored", str(scored))
    table.add_row("Avg quality score", avg_text)
    table.add_row("Dataset versions", str(versions))
    table.add_row("Training runs", str(runs))
    console.print(table)
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
@dataset_app.command("create")
def dataset_create(
    name: str = typer.Argument(..., help="Version name, e.g. v1.0"),
    min_quality: float = typer.Option(0.6, help="Minimum quality score filter"),
    description: Optional[str] = typer.Option(None, help="Description"),
    diff: Optional[str] = typer.Option(None, help="Parent version — only include new interactions"),
):
    """Create a dataset version from scored interactions."""
    # The docstring above is what Typer shows as the command help, so it stays terse.
    from rich.markup import escape
    from sqlalchemy.exc import IntegrityError

    from sieve.curate.dataset import DatasetManager

    session = _session()
    mgr = DatasetManager(session)
    try:
        version = mgr.create_version(name, min_quality, description, diff)
    except IntegrityError as err:
        # Version names are unique in the schema, so re-using one fails at
        # commit time. Roll back and report it instead of dumping a traceback.
        session.rollback()
        console.print(
            f"[red]✗[/red] Could not create dataset {escape(repr(name))}: "
            "a version with this name already exists"
        )
        raise typer.Exit(code=1) from err
    except ValueError as err:
        # Validation errors raised by the manager are shown as plain CLI errors.
        console.print(f"[red]✗[/red] {escape(str(err))}")
        raise typer.Exit(code=1) from err

    console.print(
        f"[green]✓[/green] Dataset [bold]{name!r}[/bold] created "
        f"with [bold]{len(version.interactions)}[/bold] interactions "
        f"(min_quality={min_quality})"
    )
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
@dataset_app.command("export")
def dataset_export(
    name: str = typer.Argument(..., help="Version name to export"),
    output: Path = typer.Option(Path("./export.jsonl"), help="Output JSONL path"),
):
    """Export a dataset version to JSONL for fine-tuning."""
    # The docstring above is what Typer shows as the command help, so it stays terse.
    from rich.markup import escape

    from sieve.curate.dataset import DatasetManager

    session = _session()
    mgr = DatasetManager(session)
    try:
        count = mgr.export_jsonl(name, output)
    except ValueError as err:
        # export_jsonl raises ValueError for an unknown version name; show the
        # message and exit non-zero rather than printing a traceback.
        console.print(f"[red]✗[/red] {escape(str(err))}")
        raise typer.Exit(code=1) from err

    console.print(f"[green]✓[/green] Exported [bold]{count}[/bold] interactions to {output}")
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
@dataset_app.command("list")
def dataset_list():
    """List all dataset versions."""
    # The docstring above is what Typer shows as the command help, so it stays terse.
    from sieve.models import DatasetVersion

    db = _session()
    ordered = db.query(DatasetVersion).order_by(DatasetVersion.created_at).all()

    if len(ordered) == 0:
        console.print("[yellow]No dataset versions yet. Run: sieve dataset create[/yellow]")
        return

    table = Table(title="Dataset Versions")
    table.add_column("Name", style="bold")
    for heading in ("Interactions", "Min Quality", "Parent", "Created"):
        table.add_column(heading)

    for entry in ordered:
        # A missing threshold or parent is rendered as an em dash.
        if entry.min_quality_score is None:
            threshold = "—"
        else:
            threshold = f"{entry.min_quality_score:.2f}"
        parent = entry.parent_name or "—"
        created = entry.created_at.strftime("%Y-%m-%d %H:%M")
        table.add_row(entry.name, str(len(entry.interactions)), threshold, parent, created)

    console.print(table)
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
@app.command()
def train(
    version: str = typer.Argument(..., help="Dataset version name"),
    base_model: str = typer.Option("meta-llama/Llama-3.2-3B-Instruct", help="HuggingFace model ID"),
    output_dir: Path = typer.Option(Path("./output"), help="Training output directory"),
    export_path: Path = typer.Option(Path("./export.jsonl"), help="Path to exported JSONL dataset"),
):
    """Generate Axolotl config and show the training command."""
    # The docstring above is what Typer shows as the command help, so it stays terse.
    from sieve.trigger.axolotl import AxolotlTrigger

    db = _session()
    run = AxolotlTrigger(db).trigger(version, export_path, base_model, output_dir)
    config_path = run.config["config_path"]
    short_id = run.id[:8]

    # Nothing is launched here: the command only records the run and prints
    # the commands the user should execute.
    console.print(f"[green]✓[/green] Training run recorded [dim](id: {short_id})[/dim]")
    console.print("\n[bold]Run training with Axolotl:[/bold]")
    console.print(" pip install axolotl")
    console.print(f" axolotl train {config_path}")
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
if __name__ == "__main__":
|
|
175
|
+
app()
|
|
File without changes
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
from typing import Optional
|
|
3
|
+
|
|
4
|
+
import jsonlines
|
|
5
|
+
from sqlalchemy.orm import Session
|
|
6
|
+
|
|
7
|
+
from sieve.models import DatasetVersion, Interaction
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class DatasetManager:
    """Create named dataset versions from stored interactions and export them."""

    def __init__(self, session: Session):
        """Keep the SQLAlchemy session used for every query and commit."""
        self.session = session

    def create_version(
        self,
        name: str,
        min_quality: float = 0.0,
        description: Optional[str] = None,
        parent_name: Optional[str] = None,
    ) -> DatasetVersion:
        """Create, persist and return a new dataset version.

        Args:
            name: Name of the new version.
            min_quality: When greater than 0, only interactions whose
                ``quality_score`` is at least this value are included (unscored
                interactions are therefore excluded). When 0 or less, no
                quality filter is applied.
            description: Optional free-text description stored on the version.
            parent_name: Optional name of an existing version; interactions
                already linked to that version are left out of the new one.

        Returns:
            The committed ``DatasetVersion`` with its interactions attached.

        Raises:
            ValueError: If ``parent_name`` is given but no such version exists.
        """
        query = self.session.query(Interaction)

        if min_quality > 0:
            query = query.filter(
                Interaction.quality_score >= min_quality,
                Interaction.quality_score.isnot(None),
            )

        if parent_name:
            parent = self.session.query(DatasetVersion).filter_by(name=parent_name).first()
            if parent is None:
                # Silently ignoring an unknown parent would build a full
                # dataset while recording a lineage that does not exist.
                raise ValueError(f"Parent version {parent_name!r} not found")
            # NOT EXISTS against the association table: avoids loading every
            # parent interaction into memory and binding one SQL parameter per
            # id, which can exceed SQLite's bound-variable limit.
            query = query.filter(
                ~Interaction.dataset_versions.any(DatasetVersion.id == parent.id)
            )

        interactions = query.all()

        version = DatasetVersion(
            name=name,
            description=description,
            parent_name=parent_name,
            min_quality_score=min_quality,
        )
        version.interactions = interactions
        self.session.add(version)
        self.session.commit()
        return version

    def export_jsonl(self, version_name: str, output_path: Path) -> int:
        """Write a version's interactions to ``output_path`` as JSON Lines.

        Each line is an object of the form ``{"messages": [...]}``. Missing
        parent directories of ``output_path`` are created.

        Args:
            version_name: Name of the version to export.
            output_path: Destination file; overwritten if it already exists.

        Returns:
            The number of interactions written.

        Raises:
            ValueError: If no version named ``version_name`` exists.
        """
        version = self.session.query(DatasetVersion).filter_by(name=version_name).first()
        if not version:
            raise ValueError(f"Version {version_name!r} not found")

        output_path.parent.mkdir(parents=True, exist_ok=True)
        count = 0
        with jsonlines.open(output_path, mode="w") as writer:
            for interaction in version.interactions:
                writer.write({"messages": interaction.messages})
                count += 1
        return count
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
|
|
3
|
+
from sqlalchemy import create_engine
|
|
4
|
+
from sqlalchemy.orm import Session, sessionmaker
|
|
5
|
+
|
|
6
|
+
from sieve.models import Base
|
|
7
|
+
|
|
8
|
+
# Default SQLite database location: <home>/.sieve/sieve.db
_DEFAULT_DB = Path.home().joinpath(".sieve", "sieve.db")
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def get_engine(db_path: Path = _DEFAULT_DB):
    """Return a new SQLAlchemy engine for the SQLite file at ``db_path``.

    The file's parent directory is created first when it does not exist.
    """
    storage_dir = db_path.parent
    storage_dir.mkdir(exist_ok=True, parents=True)
    url = f"sqlite:///{db_path}"
    return create_engine(url, echo=False)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def init_db(db_path: Path = _DEFAULT_DB):
    """Create any missing tables in the database at ``db_path``; return the engine used."""
    db_engine = get_engine(db_path)
    Base.metadata.create_all(bind=db_engine)
    return db_engine
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def get_session(db_path: Path = _DEFAULT_DB) -> Session:
    """Open a new session bound to a freshly built engine for ``db_path``."""
    session_factory = sessionmaker(bind=get_engine(db_path))
    return session_factory()
|
|
File without changes
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
from typing import Iterator
|
|
3
|
+
|
|
4
|
+
import jsonlines
|
|
5
|
+
|
|
6
|
+
from sieve.ingest.base import BaseIngester
|
|
7
|
+
from sieve.models import Interaction
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class JSONLIngester(BaseIngester):
    """Read interactions from a JSON Lines file.

    Three row shapes are recognised:

    * ``{"messages": [...]}``
    * ``{"conversations": [...]}``
    * ``{"prompt": "...", "response": "..."}`` (converted to a user message
      followed by an assistant message)
    """

    def __init__(self, path: Path):
        """Remember the JSONL file to read; nothing is opened until ``ingest``."""
        self.path = path

    def ingest(self) -> Iterator[Interaction]:
        """Yield one ``Interaction`` per usable row of the file.

        Rows that are not JSON objects, and rows with no recognisable
        conversation content, are skipped. Every other key of the row is copied
        into the interaction's metadata together with ``source_file``; a row
        key named ``source_file`` takes precedence over the file path.
        """
        # Loop invariants: keys holding the conversation itself, and the path string.
        conversation_keys = {"messages", "conversations", "prompt", "response"}
        source_file = str(self.path)

        with jsonlines.open(self.path) as reader:
            for obj in reader:
                # A line may hold any JSON value (list, string, number, ...).
                # Only objects can carry a conversation, so skip the rest
                # instead of failing on ``.get``.
                if not isinstance(obj, dict):
                    continue

                messages = obj.get("messages") or obj.get("conversations") or []

                if not messages and "prompt" in obj and "response" in obj:
                    messages = [
                        {"role": "user", "content": obj["prompt"]},
                        {"role": "assistant", "content": obj["response"]},
                    ]

                if not messages:
                    continue

                meta = {k: v for k, v in obj.items() if k not in conversation_keys}

                yield Interaction(
                    source="jsonl",
                    messages=messages,
                    metadata_={"source_file": source_file, **meta},
                )
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
from datetime import datetime
|
|
2
|
+
from uuid import uuid4
|
|
3
|
+
|
|
4
|
+
from sqlalchemy import Column, DateTime, Float, ForeignKey, JSON, String, Table
|
|
5
|
+
from sqlalchemy.orm import DeclarativeBase, relationship
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class Base(DeclarativeBase):
    """Declarative base whose metadata collects every table defined in this module."""
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
# Association table linking interactions to dataset versions; it is the
# ``secondary`` table of the relationships declared on both model classes.
interaction_dataset = Table(
    "interaction_dataset",
    Base.metadata,
    Column("interaction_id", String, ForeignKey("interactions.id")),
    Column("dataset_version_id", String, ForeignKey("dataset_versions.id")),
)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class Interaction(Base):
    """One stored conversation, mapped to the ``interactions`` table."""

    __tablename__ = "interactions"

    # String primary key; defaults to a freshly generated UUID4.
    id = Column(String, primary_key=True, default=lambda: str(uuid4()))
    # Required label for where the interaction came from.
    source = Column(String, nullable=False)
    messages = Column(JSON, nullable=False)  # OpenAI chat format
    # The Python attribute is ``metadata_``; the database column is named "metadata".
    metadata_ = Column("metadata", JSON, default=dict)
    # Defaults to the naive UTC time returned by datetime.utcnow.
    ingested_at = Column(DateTime, default=datetime.utcnow)
    # Scoring fields are all nullable.
    quality_score = Column(Float, nullable=True)
    scored_at = Column(DateTime, nullable=True)
    scorer = Column(String, nullable=True)

    # Many-to-many link to the dataset versions that include this interaction.
    dataset_versions = relationship(
        "DatasetVersion", secondary=interaction_dataset, back_populates="interactions"
    )
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
class DatasetVersion(Base):
    """A named set of interactions, mapped to the ``dataset_versions`` table."""

    __tablename__ = "dataset_versions"

    # String primary key; defaults to a freshly generated UUID4.
    id = Column(String, primary_key=True, default=lambda: str(uuid4()))
    # Required and unique version name.
    name = Column(String, nullable=False, unique=True)
    # Defaults to the naive UTC time returned by datetime.utcnow.
    created_at = Column(DateTime, default=datetime.utcnow)
    description = Column(String, nullable=True)
    # Name of the parent version as a plain string; no foreign key is declared on it.
    parent_name = Column(String, nullable=True)
    min_quality_score = Column(Float, nullable=True)

    # Many-to-many link to the interactions that belong to this version.
    interactions = relationship(
        "Interaction", secondary=interaction_dataset, back_populates="dataset_versions"
    )
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
class TrainingRun(Base):
    """Record of one training run, mapped to the ``training_runs`` table."""

    __tablename__ = "training_runs"

    # String primary key; defaults to a freshly generated UUID4.
    id = Column(String, primary_key=True, default=lambda: str(uuid4()))
    # Foreign key to the name (not the id) of a dataset version.
    dataset_version_name = Column(String, ForeignKey("dataset_versions.name"))
    # Required name of the training backend.
    backend = Column(String, nullable=False)
    # JSON settings for the run; defaults to an empty dict.
    config = Column(JSON, default=dict)
    status = Column(String, default="triggered")
    # Defaults to the naive UTC time returned by datetime.utcnow.
    created_at = Column(DateTime, default=datetime.utcnow)
    completed_at = Column(DateTime, nullable=True)
    output_path = Column(String, nullable=True)
    notes = Column(String, nullable=True)
|
|
File without changes
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
from sieve.models import Interaction
|
|
2
|
+
from sieve.score.base import BaseScorer
|
|
3
|
+
|
|
4
|
+
# Lower-case phrases whose presence in an assistant reply lowers its score.
_BAD_PATTERNS = [
    "i cannot", "i'm unable", "i don't have access",
    "as an ai", "i apologize, but", "i'm sorry, but i can't",
]
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class HeuristicScorer(BaseScorer):
    """Score interactions from assistant-reply length and refusal phrases.

    Each assistant reply gets a length score in [0, 1], reduced by 0.15 for
    every phrase in ``_BAD_PATTERNS`` it contains and floored at 0. The
    interaction's score is the mean over its assistant replies.
    """

    def __init__(self, min_response_len: int = 50, max_response_len: int = 4000):
        """Store the character-length bounds of a full-score reply."""
        self.min_response_len = min_response_len
        self.max_response_len = max_response_len

    def score(self, interaction: Interaction) -> float:
        """Return the interaction's quality score.

        Returns 0.0 when the interaction lacks either a user message or an
        assistant message. Message entries that are not dicts are ignored.
        """
        # Skip malformed entries rather than crash on ``.get``.
        messages = [m for m in (interaction.messages or []) if isinstance(m, dict)]
        assistant_msgs = [m for m in messages if m.get("role") == "assistant"]
        user_msgs = [m for m in messages if m.get("role") == "user"]

        if not assistant_msgs or not user_msgs:
            return 0.0

        scores = [self._score_reply(msg.get("content", "")) for msg in assistant_msgs]
        return sum(scores) / len(scores)

    def _score_reply(self, content) -> float:
        """Score one assistant reply: length score minus refusal penalty, floored at 0."""
        if content is None:
            # ``"content": null`` means "no text"; str(None) would be scored
            # as the four-character reply "None".
            content = ""
        elif not isinstance(content, str):
            content = str(content)

        lowered = content.lower()
        penalty = sum(0.15 for p in _BAD_PATTERNS if p in lowered)
        return max(0.0, self._length_score(len(content)) - penalty)

    def _length_score(self, length: int) -> float:
        """Map a reply length to a score in [0, 1]."""
        if length < self.min_response_len:
            # Too short: scale linearly up to the minimum length.
            return length / self.min_response_len
        if length > self.max_response_len:
            # Too long: lose score in proportion to the overage, never below 0.5.
            overage = (length - self.max_response_len) / self.max_response_len
            return max(0.5, 1.0 - overage)
        return 1.0
|
|
File without changes
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
from typing import Optional
|
|
3
|
+
|
|
4
|
+
import yaml
|
|
5
|
+
from sqlalchemy.orm import Session
|
|
6
|
+
|
|
7
|
+
from sieve.models import TrainingRun
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class AxolotlTrigger:
    """Write an Axolotl training config and record the run in the database.

    Nothing is launched: ``trigger`` only writes the YAML file and stores a
    ``TrainingRun`` row.
    """

    def __init__(self, session: Session):
        """Keep the SQLAlchemy session used to persist training runs."""
        self.session = session

    def generate_config(
        self,
        dataset_path: Path,
        base_model: str,
        output_dir: Path,
        config_path: Optional[Path] = None,
    ) -> Path:
        """Write an Axolotl YAML config and return its path.

        Args:
            dataset_path: Exported JSONL dataset; written to the config as an
                absolute path.
            base_model: Model identifier placed in ``base_model``.
            output_dir: Training output directory; written as an absolute path.
            config_path: Where to write the config. Defaults to
                ``axolotl_config.yml`` next to ``dataset_path``.

        Returns:
            The path of the written config file.
        """
        config = {
            "base_model": base_model,
            # The exported file holds OpenAI-style {"messages": [{role, content}]}
            # rows, which is Axolotl's ``chat_template`` dataset layout, not the
            # ShareGPT ``conversations``/``from``/``value`` layout.
            # NOTE(review): confirm against the Axolotl version you target.
            "datasets": [
                {
                    "path": str(dataset_path.resolve()),
                    "type": "chat_template",
                    "field_messages": "messages",
                }
            ],
            "output_dir": str(output_dir.resolve()),
            "sequence_len": 2048,
            "micro_batch_size": 2,
            "num_epochs": 3,
            "learning_rate": 2e-4,
            # 4-bit loading plus LoRA hyper-parameters describe a QLoRA run;
            # the adapter has to be named for the LoRA settings to apply.
            # NOTE(review): confirm against the Axolotl version you target.
            "adapter": "qlora",
            "lora_r": 16,
            "lora_alpha": 32,
            "lora_dropout": 0.05,
            "lora_target_modules": ["q_proj", "v_proj"],
            "bf16": True,
            "load_in_4bit": True,
            "val_set_size": 0.05,
            "logging_steps": 10,
            "save_steps": 100,
        }

        out = config_path or dataset_path.parent / "axolotl_config.yml"
        # An explicit config_path may point into a directory that does not exist yet.
        out.parent.mkdir(parents=True, exist_ok=True)
        out.write_text(yaml.dump(config, default_flow_style=False), encoding="utf-8")
        return out

    def trigger(
        self,
        dataset_version_name: str,
        dataset_path: Path,
        base_model: str,
        output_dir: Path,
    ) -> TrainingRun:
        """Generate the config and persist a ``TrainingRun`` with status "triggered".

        Args:
            dataset_version_name: Name of the dataset version the run uses.
            dataset_path: Exported JSONL dataset passed to ``generate_config``.
            base_model: Model identifier passed to ``generate_config``.
            output_dir: Training output directory passed to ``generate_config``.

        Returns:
            The committed ``TrainingRun``; its ``config`` holds ``config_path``
            and ``base_model``.
        """
        config_path = self.generate_config(dataset_path, base_model, output_dir)

        run = TrainingRun(
            dataset_version_name=dataset_version_name,
            backend="axolotl",
            config={"config_path": str(config_path), "base_model": base_model},
            status="triggered",
        )
        self.session.add(run)
        self.session.commit()
        return run
|
|
File without changes
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
import tempfile
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
|
|
4
|
+
import pytest
|
|
5
|
+
from sqlalchemy import create_engine
|
|
6
|
+
from sqlalchemy.orm import sessionmaker
|
|
7
|
+
|
|
8
|
+
from sieve.curate.dataset import DatasetManager
|
|
9
|
+
from sieve.models import Base, Interaction
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
@pytest.fixture
def session():
    """Yield a session on a fresh in-memory SQLite database; close it on teardown."""
    memory_engine = create_engine("sqlite:///:memory:")
    Base.metadata.create_all(memory_engine)
    db_session = sessionmaker(bind=memory_engine)()
    yield db_session
    db_session.close()
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def _add_interaction(session, score: float, content: str = "test response " * 10):
    """Commit one user/assistant interaction with a preset quality score and return it."""
    conversation = [
        {"role": "user", "content": "question"},
        {"role": "assistant", "content": content},
    ]
    record = Interaction(source="test", messages=conversation, quality_score=score)
    session.add(record)
    session.commit()
    return record
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def test_create_version_filters_by_quality(session):
    """Only interactions scoring at or above the threshold end up in the version."""
    for value in (0.9, 0.3, 0.7):
        _add_interaction(session, score=value)

    created = DatasetManager(session).create_version("v1.0", min_quality=0.6)
    assert len(created.interactions) == 2
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def test_create_version_diff_excludes_parent(session):
    """A version created with a parent holds only interactions the parent lacks."""
    _add_interaction(session, score=0.9)
    _add_interaction(session, score=0.9)

    manager = DatasetManager(session)
    baseline = manager.create_version("v1.0", min_quality=0.5)
    assert len(baseline.interactions) == 2

    newcomer = _add_interaction(session, score=0.9)
    incremental = manager.create_version("v2.0", min_quality=0.5, parent_name="v1.0")
    assert len(incremental.interactions) == 1
    assert incremental.interactions[0].id == newcomer.id
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def test_export_jsonl(session, tmp_path):
    """Exporting a one-interaction version writes a single row with ``messages``."""
    _add_interaction(session, score=0.9)
    manager = DatasetManager(session)
    manager.create_version("v1.0", min_quality=0.5)

    target = tmp_path / "export.jsonl"
    exported = manager.export_jsonl("v1.0", target)
    assert exported == 1
    assert target.exists()

    # Read the file back to check what was actually written, not just the count.
    import jsonlines
    with jsonlines.open(target) as reader:
        records = list(reader)
    assert len(records) == 1
    assert "messages" in records[0]
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def test_export_unknown_version_raises(session, tmp_path):
    """Exporting a version name that was never created raises ``ValueError``."""
    destination = tmp_path / "out.jsonl"
    manager = DatasetManager(session)
    with pytest.raises(ValueError, match="not found"):
        manager.export_jsonl("nonexistent", destination)
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import tempfile
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
from sieve.ingest.jsonl import JSONLIngester
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def _write_jsonl(path: Path, rows: list) -> None:
|
|
9
|
+
with open(path, "w") as f:
|
|
10
|
+
for row in rows:
|
|
11
|
+
f.write(json.dumps(row) + "\n")
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def test_ingest_messages_format():
    """A ``messages`` row yields one interaction tagged with source ``jsonl``."""
    # TemporaryDirectory removes the fixture file on exit, so test runs leave
    # nothing behind in the system temp directory.
    with tempfile.TemporaryDirectory() as tmp_dir:
        path = Path(tmp_dir) / "traces.jsonl"
        _write_jsonl(path, [
            {"messages": [{"role": "user", "content": "hi"}, {"role": "assistant", "content": "hello"}]},
        ])
        # Materialize inside the block: the file is gone once the directory is cleaned up.
        results = list(JSONLIngester(path).ingest())

    assert len(results) == 1
    assert results[0].messages[0]["role"] == "user"
    assert results[0].source == "jsonl"
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def test_ingest_prompt_response_format():
    """A prompt/response row becomes messages: the prompt first, then an assistant turn."""
    # TemporaryDirectory removes the fixture file on exit, so test runs leave
    # nothing behind in the system temp directory.
    with tempfile.TemporaryDirectory() as tmp_dir:
        path = Path(tmp_dir) / "traces.jsonl"
        _write_jsonl(path, [
            {"prompt": "What is Python?", "response": "A programming language."},
        ])
        # Materialize inside the block: the file is gone once the directory is cleaned up.
        results = list(JSONLIngester(path).ingest())

    assert len(results) == 1
    assert results[0].messages[0]["content"] == "What is Python?"
    assert results[0].messages[1]["role"] == "assistant"
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def test_ingest_skips_empty_messages():
    """A row with no message data is dropped while a valid row is kept."""
    # TemporaryDirectory removes the fixture file on exit, so test runs leave
    # nothing behind in the system temp directory.
    with tempfile.TemporaryDirectory() as tmp_dir:
        path = Path(tmp_dir) / "traces.jsonl"
        _write_jsonl(path, [
            {"no_messages": True},
            {"messages": [{"role": "user", "content": "valid"}, {"role": "assistant", "content": "ok"}]},
        ])
        # Materialize inside the block: the file is gone once the directory is cleaned up.
        results = list(JSONLIngester(path).ingest())

    assert len(results) == 1
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
from sieve.models import Interaction
|
|
2
|
+
from sieve.score.heuristic import HeuristicScorer
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
def _make(user: str, assistant: str) -> Interaction:
    """Build an unsaved ``Interaction`` from one user turn and one assistant turn."""
    turns = [
        {"role": "user", "content": user},
        {"role": "assistant", "content": assistant},
    ]
    return Interaction(source="test", messages=turns)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
# Module-level scorer instance shared by the tests below.
scorer = HeuristicScorer()
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def test_good_response_scores_high():
    """A lengthy answer to a direct question scores at least 0.8."""
    answer = "Recursion is when a function calls itself. " * 10
    interaction = _make("Explain recursion", answer)
    assert scorer.score(interaction) >= 0.8
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def test_too_short_scores_low():
    """A one-character answer scores below 0.5."""
    result = scorer.score(_make("What is 2+2?", "4"))
    assert result < 0.5
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def test_bad_pattern_penalized():
    """A repeated "As an AI, I cannot..." answer scores no higher than 0.7."""
    refusal = "As an AI, I cannot assist with that. " * 5
    interaction = _make("Help me", refusal)
    assert scorer.score(interaction) <= 0.7
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def test_no_assistant_message_returns_zero():
    """An interaction containing only a user turn scores exactly 0.0."""
    user_only = [{"role": "user", "content": "hi"}]
    interaction = Interaction(source="test", messages=user_only)
    assert scorer.score(interaction) == 0.0
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def test_empty_messages_returns_zero():
    """An interaction with no messages at all scores exactly 0.0."""
    blank = Interaction(source="test", messages=[])
    result = scorer.score(blank)
    assert result == 0.0
|