synkro-0.4.5-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of synkro has been flagged as possibly problematic.
- synkro/__init__.py +165 -0
- synkro/cli.py +120 -0
- synkro/core/__init__.py +7 -0
- synkro/core/dataset.py +233 -0
- synkro/core/policy.py +337 -0
- synkro/errors.py +178 -0
- synkro/examples/__init__.py +148 -0
- synkro/factory.py +160 -0
- synkro/formatters/__init__.py +12 -0
- synkro/formatters/qa.py +85 -0
- synkro/formatters/sft.py +90 -0
- synkro/formatters/tool_call.py +127 -0
- synkro/generation/__init__.py +9 -0
- synkro/generation/generator.py +163 -0
- synkro/generation/planner.py +87 -0
- synkro/generation/responses.py +160 -0
- synkro/generation/scenarios.py +90 -0
- synkro/generation/tool_responses.py +370 -0
- synkro/generation/tool_simulator.py +114 -0
- synkro/llm/__init__.py +7 -0
- synkro/llm/client.py +235 -0
- synkro/llm/rate_limits.py +95 -0
- synkro/models/__init__.py +43 -0
- synkro/models/anthropic.py +26 -0
- synkro/models/google.py +19 -0
- synkro/models/openai.py +31 -0
- synkro/modes/__init__.py +15 -0
- synkro/modes/config.py +66 -0
- synkro/modes/qa.py +18 -0
- synkro/modes/sft.py +18 -0
- synkro/modes/tool_call.py +18 -0
- synkro/parsers.py +442 -0
- synkro/pipeline/__init__.py +20 -0
- synkro/pipeline/phases.py +237 -0
- synkro/pipeline/runner.py +198 -0
- synkro/pipelines.py +105 -0
- synkro/prompts/__init__.py +44 -0
- synkro/prompts/base.py +167 -0
- synkro/prompts/qa_templates.py +97 -0
- synkro/prompts/templates.py +281 -0
- synkro/prompts/tool_templates.py +201 -0
- synkro/quality/__init__.py +14 -0
- synkro/quality/grader.py +130 -0
- synkro/quality/refiner.py +137 -0
- synkro/quality/tool_grader.py +126 -0
- synkro/quality/tool_refiner.py +128 -0
- synkro/reporting.py +213 -0
- synkro/schemas.py +325 -0
- synkro/types/__init__.py +41 -0
- synkro/types/core.py +113 -0
- synkro/types/dataset_type.py +30 -0
- synkro/types/tool.py +94 -0
- synkro-0.4.5.data/data/examples/__init__.py +148 -0
- synkro-0.4.5.dist-info/METADATA +221 -0
- synkro-0.4.5.dist-info/RECORD +58 -0
- synkro-0.4.5.dist-info/WHEEL +4 -0
- synkro-0.4.5.dist-info/entry_points.txt +2 -0
- synkro-0.4.5.dist-info/licenses/LICENSE +21 -0
synkro/__init__.py
ADDED
@@ -0,0 +1,165 @@
"""
Synkro - Generate high-quality training datasets from any document.

Modular Usage (recommended):
    >>> from synkro.pipelines import create_pipeline
    >>> from synkro.models.openai import OpenAI
    >>> from synkro.types import DatasetType
    >>>
    >>> pipeline = create_pipeline(
    ...     model=OpenAI.GPT_5_MINI,
    ...     dataset_type=DatasetType.SFT,
    ... )
    >>> dataset = pipeline.generate("policy text", traces=50)
    >>> dataset.save("training.jsonl")

Simple Usage:
    >>> import synkro
    >>> dataset = synkro.generate("Your policy text...")
    >>> dataset.save("training.jsonl")

Silent Mode (for embedding/testing):
    >>> from synkro import SilentReporter, create_pipeline
    >>> pipeline = create_pipeline(reporter=SilentReporter())
    >>> dataset = pipeline.generate("policy text")  # No console output

Tool Call Dataset:
    >>> from synkro import create_pipeline, ToolDefinition, DatasetType
    >>> web_search = ToolDefinition(
    ...     name="web_search",
    ...     description="Search the web",
    ...     parameters={"type": "object", "properties": {"query": {"type": "string"}}}
    ... )
    >>> pipeline = create_pipeline(
    ...     dataset_type=DatasetType.TOOL_CALL,
    ...     tools=[web_search],
    ... )
    >>> dataset = pipeline.generate("Search guidelines", traces=50)
"""

from synkro.pipelines import create_pipeline
from synkro.models import OpenAI, Anthropic, Google
from synkro.types import DatasetType, Message, Scenario, Trace, GradeResult, Plan, Category
from synkro.types import ToolDefinition, ToolCall, ToolFunction, ToolResult
from synkro.core.policy import Policy
from synkro.core.dataset import Dataset
from synkro.llm.client import LLM
from synkro.generation.generator import Generator
from synkro.generation.scenarios import ScenarioGenerator
from synkro.generation.responses import ResponseGenerator
from synkro.generation.planner import Planner
from synkro.quality.grader import Grader
from synkro.quality.refiner import Refiner
from synkro.quality.tool_grader import ToolCallGrader
from synkro.quality.tool_refiner import ToolCallRefiner
from synkro.formatters.sft import SFTFormatter
from synkro.formatters.qa import QAFormatter
from synkro.formatters.tool_call import ToolCallFormatter
from synkro.prompts import SystemPrompt, ScenarioPrompt, ResponsePrompt, GradePrompt
from synkro.reporting import ProgressReporter, RichReporter, SilentReporter

__version__ = "0.4.5"

__all__ = [
    # Pipeline creation
    "create_pipeline",
    # Quick function
    "generate",
    # Dataset type enum
    "DatasetType",
    # Core classes
    "Policy",
    "Dataset",
    "Trace",
    "Scenario",
    "Message",
    "GradeResult",
    "Plan",
    "Category",
    # Tool types
    "ToolDefinition",
    "ToolCall",
    "ToolFunction",
    "ToolResult",
    # Generation
    "Generator",
    "ScenarioGenerator",
    "ResponseGenerator",
    "Planner",
    # Quality
    "Grader",
    "Refiner",
    "ToolCallGrader",
    "ToolCallRefiner",
    # LLM
    "LLM",
    # Prompts
    "SystemPrompt",
    "ScenarioPrompt",
    "ResponsePrompt",
    "GradePrompt",
    # Formatters
    "SFTFormatter",
    "QAFormatter",
    "ToolCallFormatter",
    # Reporters
    "ProgressReporter",
    "RichReporter",
    "SilentReporter",
    # Model enums (OpenAI, Anthropic, Google supported)
    "OpenAI",
    "Anthropic",
    "Google",
]


def generate(
    policy: str | Policy,
    traces: int = 20,
    dataset_type: DatasetType = DatasetType.SFT,
    generation_model: OpenAI | Anthropic | Google | str = OpenAI.GPT_5_MINI,
    grading_model: OpenAI | Anthropic | Google | str = OpenAI.GPT_52,
    max_iterations: int = 3,
    skip_grading: bool = False,
    reporter: ProgressReporter | None = None,
) -> Dataset:
    """
    Generate training traces from a policy document.

    This is a convenience function. For more control, use create_pipeline().

    Args:
        policy: Policy text or Policy object
        traces: Number of traces to generate (default: 20)
        dataset_type: Type of dataset - SFT (default) or QA
        generation_model: Model for generating (default: gpt-5-mini)
        grading_model: Model for grading (default: gpt-5.2)
        max_iterations: Max refinement iterations per trace (default: 3)
        skip_grading: Skip grading phase for faster generation (default: False)
        reporter: Progress reporter (default: RichReporter for console output)

    Returns:
        Dataset object with generated traces

    Example:
        >>> import synkro
        >>> dataset = synkro.generate("All expenses over $50 require approval")
        >>> dataset.save("training.jsonl")

        >>> # Silent mode
        >>> from synkro import SilentReporter
        >>> dataset = synkro.generate(policy, reporter=SilentReporter())
    """
    if isinstance(policy, str):
        policy = Policy(text=policy)

    generator = Generator(
        dataset_type=dataset_type,
        generation_model=generation_model,
        grading_model=grading_model,
        max_iterations=max_iterations,
        skip_grading=skip_grading,
        reporter=reporter,
    )

    return generator.generate(policy, traces=traces)
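For orientation, here is a minimal usage sketch of the convenience generate() function above. It is not part of the package; it assumes an OpenAI API key is configured in the environment, and the policy text, trace count, and output filename are illustrative.

    import synkro
    from synkro import SilentReporter

    # Hypothetical quick run: small trace count, grading skipped for speed,
    # console output suppressed via SilentReporter.
    dataset = synkro.generate(
        "All expenses over $50 require manager approval.",
        traces=5,
        skip_grading=True,
        reporter=SilentReporter(),
    )
    dataset.save("training.jsonl")  # SFT-format JSONL by default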
synkro/cli.py
ADDED
@@ -0,0 +1,120 @@
"""Synkro CLI - Generate training data from the command line."""

import typer
from pathlib import Path
from typing import Optional

app = typer.Typer(
    name="synkro",
    help="Generate training datasets from documents.",
    no_args_is_help=True,
)


@app.command()
def generate(
    source: str = typer.Argument(
        ...,
        help="Policy text, file path (.pdf, .docx, .txt, .md), folder path, or URL",
    ),
    output: Optional[Path] = typer.Option(
        None,
        "--output", "-o",
        help="Output file path (auto-generated if not specified)",
    ),
    traces: int = typer.Option(
        20,
        "--traces", "-n",
        help="Number of traces to generate",
    ),
    format: str = typer.Option(
        "sft",
        "--format", "-f",
        help="Output format: sft or qa",
    ),
    model: str = typer.Option(
        "gpt-4o-mini",
        "--model", "-m",
        help="Model for generation (e.g., gpt-4o-mini, claude-3-5-sonnet, gemini-2.5-flash)",
    ),
):
    """
    Generate training data from a policy document.

    Examples:

        synkro generate policy.pdf

        synkro generate policies/  # Load all files from folder

        synkro generate "All expenses over $50 need approval" --traces 50

        synkro generate handbook.docx -o training.jsonl -n 100
    """
    import synkro
    from synkro import Policy

    # Determine if source is text, file, or URL
    source_path = Path(source)

    if source_path.exists():
        # It's a file
        policy = Policy.from_file(source_path)
    elif source.startswith(("http://", "https://")):
        # It's a URL
        policy = Policy.from_url(source)
    else:
        # Treat as raw text
        policy = Policy(text=source)

    # Generate
    dataset = synkro.generate(
        policy,
        traces=traces,
        generation_model=model,
    )

    # Save
    if output:
        dataset.save(output, format=format)
    else:
        dataset.save(format=format)


@app.command()
def demo():
    """
    Run a quick demo with a built-in example policy.
    """
    import synkro
    from synkro.examples import EXPENSE_POLICY
    from rich.console import Console

    console = Console()
    console.print("\n[cyan]Running demo with built-in expense policy...[/cyan]\n")

    dataset = synkro.generate(EXPENSE_POLICY, traces=5)
    dataset.save("demo_output.jsonl")

    console.print("\n[green]Demo complete![/green]")
    console.print("[dim]Check demo_output.jsonl for the generated training data.[/dim]\n")


@app.command()
def version():
    """Show version information."""
    import synkro
    from rich.console import Console

    console = Console()
    console.print(f"[cyan]synkro[/cyan] v{synkro.__version__}")


def main():
    """Entry point for the CLI."""
    app()


if __name__ == "__main__":
    main()
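As a sketch of how the CLI above could be exercised programmatically (for example in tests), the Typer app can be driven with typer.testing.CliRunner. This is an illustration rather than shipped test code; the `version` command is used because it makes no LLM calls.

    from typer.testing import CliRunner

    from synkro.cli import app

    runner = CliRunner()

    # Invoke the `version` subcommand defined above; no API key needed.
    result = runner.invoke(app, ["version"])
    print(result.output)  # e.g. "synkro v0.4.5"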
synkro/core/__init__.py
ADDED
synkro/core/dataset.py
ADDED
@@ -0,0 +1,233 @@
"""Dataset class for managing generated traces."""

import json
from datetime import datetime
from pathlib import Path
from typing import Iterator

from pydantic import BaseModel, Field
from rich.console import Console

from synkro.types.core import Trace

console = Console()


class Dataset(BaseModel):
    """
    A collection of generated training traces.

    Provides methods for filtering, saving, and exporting traces
    in various formats.

    Examples:
        >>> dataset = generator.generate(policy, traces=100)

        >>> # Filter to only passing traces
        >>> passing = dataset.filter(passed=True)

        >>> # Save to JSONL
        >>> dataset.save("training.jsonl")

        >>> # Push to HuggingFace
        >>> dataset.to_huggingface().push_to_hub("my-org/dataset")
    """

    traces: list[Trace] = Field(default_factory=list)

    class Config:
        arbitrary_types_allowed = True

    def __len__(self) -> int:
        return len(self.traces)

    def __iter__(self) -> Iterator[Trace]:
        return iter(self.traces)

    def __getitem__(self, idx: int) -> Trace:
        return self.traces[idx]

    def filter(
        self,
        passed: bool | None = None,
        category: str | None = None,
        min_length: int | None = None,
    ) -> "Dataset":
        """
        Filter traces by criteria.

        Args:
            passed: Filter by grade pass/fail status
            category: Filter by scenario category
            min_length: Minimum response length in characters

        Returns:
            New Dataset with filtered traces
        """
        filtered = self.traces

        if passed is not None:
            filtered = [
                t for t in filtered if t.grade and t.grade.passed == passed
            ]

        if category is not None:
            filtered = [
                t for t in filtered if t.scenario.category == category
            ]

        if min_length is not None:
            filtered = [
                t for t in filtered if len(t.assistant_message) >= min_length
            ]

        return Dataset(traces=filtered)

    @property
    def passing_rate(self) -> float:
        """Get the percentage of traces that passed grading."""
        if not self.traces:
            return 0.0

        passed = sum(1 for t in self.traces if t.grade and t.grade.passed)
        return passed / len(self.traces)

    @property
    def categories(self) -> list[str]:
        """Get unique categories in the dataset."""
        return list(set(t.scenario.category for t in self.traces if t.scenario.category))

    def save(self, path: str | Path | None = None, format: str = "sft") -> "Dataset":
        """
        Save dataset to a JSONL file.

        Args:
            path: Output file path (auto-generated if not provided)
            format: Output format - "sft", "qa", or "tool_call"

        Returns:
            Self for method chaining

        Example:
            >>> dataset.save()  # Auto-names: synkro_sft_2024-01-15.jsonl
            >>> dataset.save("training.jsonl")
            >>> dataset.save("qa_data.jsonl", format="qa")
            >>> dataset.save("tools.jsonl", format="tool_call")
        """
        from synkro.formatters import SFTFormatter, QAFormatter, ToolCallFormatter

        # Auto-generate filename if not provided
        if path is None:
            timestamp = datetime.now().strftime("%Y-%m-%d_%H%M")
            path = f"synkro_{format}_{timestamp}.jsonl"

        path = Path(path)

        if format == "sft":
            SFTFormatter().save(self.traces, path)
        elif format == "qa":
            QAFormatter().save(self.traces, path)
        elif format == "tool_call":
            ToolCallFormatter().save(self.traces, path)
        else:
            raise ValueError(f"Unknown format: {format}. Use 'sft', 'qa', or 'tool_call'")

        # Print confirmation
        file_size = path.stat().st_size
        size_str = f"{file_size / 1024:.1f} KB" if file_size < 1024 * 1024 else f"{file_size / 1024 / 1024:.1f} MB"
        console.print(f"[green]📁 Saved:[/green] {path} ({size_str})")

        return self

    def to_jsonl(self, format: str = "sft") -> str:
        """
        Convert dataset to JSONL string.

        Args:
            format: Output format - "sft", "qa", or "tool_call"

        Returns:
            JSONL formatted string
        """
        from synkro.formatters import SFTFormatter, QAFormatter, ToolCallFormatter

        if format == "sft":
            return SFTFormatter().to_jsonl(self.traces)
        elif format == "qa":
            return QAFormatter().to_jsonl(self.traces)
        elif format == "tool_call":
            return ToolCallFormatter().to_jsonl(self.traces)
        else:
            raise ValueError(f"Unknown format: {format}. Use 'sft', 'qa', or 'tool_call'")

    def to_huggingface(self):
        """
        Convert to HuggingFace Dataset.

        Returns:
            HuggingFace Dataset object

        Example:
            >>> hf_dataset = dataset.to_huggingface()
            >>> hf_dataset.push_to_hub("my-org/policy-traces")
        """
        try:
            from datasets import Dataset as HFDataset

            # Convert to SFT format for HF
            from synkro.formatters import SFTFormatter

            examples = SFTFormatter(include_metadata=True).format(self.traces)
            return HFDataset.from_list(examples)
        except ImportError:
            raise ImportError(
                "datasets is required for HuggingFace export. "
                "Install with: pip install datasets"
            )

    def to_dict(self) -> dict:
        """
        Convert dataset to a dictionary.

        Returns:
            Dictionary with trace data
        """
        return {
            "traces": [t.model_dump() for t in self.traces],
            "stats": {
                "total": len(self.traces),
                "passing_rate": self.passing_rate,
                "categories": self.categories,
            },
        }

    def summary(self) -> str:
        """
        Get a summary of the dataset.

        Returns:
            Human-readable summary string
        """
        lines = [
            f"Dataset Summary",
            f"===============",
            f"Total traces: {len(self.traces)}",
            f"Passing rate: {self.passing_rate:.1%}",
            f"Categories: {len(self.categories)}",
        ]

        if self.categories:
            lines.append("")
            lines.append("By category:")
            for cat in self.categories:
                count = sum(1 for t in self.traces if t.scenario.category == cat)
                lines.append(f"  - {cat}: {count}")

        return "\n".join(lines)

    def __str__(self) -> str:
        return f"Dataset(traces={len(self.traces)}, passing={self.passing_rate:.1%})"

    def __repr__(self) -> str:
        return self.__str__()
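To make the Dataset API above concrete, here is a brief sketch of post-generation filtering and export. It assumes `dataset` came from a pipeline or synkro.generate() call; the filter thresholds and file names are illustrative.

    # Keep only traces that passed grading and have a non-trivial response.
    passing = dataset.filter(passed=True, min_length=50)

    print(passing.summary())                   # totals, passing rate, per-category counts
    passing.save("passing_sft.jsonl")          # SFT-format JSONL via SFTFormatter
    qa_jsonl = passing.to_jsonl(format="qa")   # or get a QA-format string in memory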