synkro 0.4.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of synkro might be problematic. Click here for more details.

Files changed (58) hide show
  1. synkro/__init__.py +165 -0
  2. synkro/cli.py +120 -0
  3. synkro/core/__init__.py +7 -0
  4. synkro/core/dataset.py +233 -0
  5. synkro/core/policy.py +337 -0
  6. synkro/errors.py +178 -0
  7. synkro/examples/__init__.py +148 -0
  8. synkro/factory.py +160 -0
  9. synkro/formatters/__init__.py +12 -0
  10. synkro/formatters/qa.py +85 -0
  11. synkro/formatters/sft.py +90 -0
  12. synkro/formatters/tool_call.py +127 -0
  13. synkro/generation/__init__.py +9 -0
  14. synkro/generation/generator.py +163 -0
  15. synkro/generation/planner.py +87 -0
  16. synkro/generation/responses.py +160 -0
  17. synkro/generation/scenarios.py +90 -0
  18. synkro/generation/tool_responses.py +370 -0
  19. synkro/generation/tool_simulator.py +114 -0
  20. synkro/llm/__init__.py +7 -0
  21. synkro/llm/client.py +235 -0
  22. synkro/llm/rate_limits.py +95 -0
  23. synkro/models/__init__.py +43 -0
  24. synkro/models/anthropic.py +26 -0
  25. synkro/models/google.py +19 -0
  26. synkro/models/openai.py +31 -0
  27. synkro/modes/__init__.py +15 -0
  28. synkro/modes/config.py +66 -0
  29. synkro/modes/qa.py +18 -0
  30. synkro/modes/sft.py +18 -0
  31. synkro/modes/tool_call.py +18 -0
  32. synkro/parsers.py +442 -0
  33. synkro/pipeline/__init__.py +20 -0
  34. synkro/pipeline/phases.py +237 -0
  35. synkro/pipeline/runner.py +198 -0
  36. synkro/pipelines.py +105 -0
  37. synkro/prompts/__init__.py +44 -0
  38. synkro/prompts/base.py +167 -0
  39. synkro/prompts/qa_templates.py +97 -0
  40. synkro/prompts/templates.py +281 -0
  41. synkro/prompts/tool_templates.py +201 -0
  42. synkro/quality/__init__.py +14 -0
  43. synkro/quality/grader.py +130 -0
  44. synkro/quality/refiner.py +137 -0
  45. synkro/quality/tool_grader.py +126 -0
  46. synkro/quality/tool_refiner.py +128 -0
  47. synkro/reporting.py +213 -0
  48. synkro/schemas.py +325 -0
  49. synkro/types/__init__.py +41 -0
  50. synkro/types/core.py +113 -0
  51. synkro/types/dataset_type.py +30 -0
  52. synkro/types/tool.py +94 -0
  53. synkro-0.4.5.data/data/examples/__init__.py +148 -0
  54. synkro-0.4.5.dist-info/METADATA +221 -0
  55. synkro-0.4.5.dist-info/RECORD +58 -0
  56. synkro-0.4.5.dist-info/WHEEL +4 -0
  57. synkro-0.4.5.dist-info/entry_points.txt +2 -0
  58. synkro-0.4.5.dist-info/licenses/LICENSE +21 -0
synkro/__init__.py ADDED
@@ -0,0 +1,165 @@
1
+ """
2
+ Synkro - Generate high-quality training datasets from any document.
3
+
4
+ Modular Usage (recommended):
5
+ >>> from synkro.pipelines import create_pipeline
6
+ >>> from synkro.models.openai import OpenAI
7
+ >>> from synkro.types import DatasetType
8
+ >>>
9
+ >>> pipeline = create_pipeline(
10
+ ... model=OpenAI.GPT_5_MINI,
11
+ ... dataset_type=DatasetType.SFT,
12
+ ... )
13
+ >>> dataset = pipeline.generate("policy text", traces=50)
14
+ >>> dataset.save("training.jsonl")
15
+
16
+ Simple Usage:
17
+ >>> import synkro
18
+ >>> dataset = synkro.generate("Your policy text...")
19
+ >>> dataset.save("training.jsonl")
20
+
21
+ Silent Mode (for embedding/testing):
22
+ >>> from synkro import SilentReporter, create_pipeline
23
+ >>> pipeline = create_pipeline(reporter=SilentReporter())
24
+ >>> dataset = pipeline.generate("policy text") # No console output
25
+
26
+ Tool Call Dataset:
27
+ >>> from synkro import create_pipeline, ToolDefinition, DatasetType
28
+ >>> web_search = ToolDefinition(
29
+ ... name="web_search",
30
+ ... description="Search the web",
31
+ ... parameters={"type": "object", "properties": {"query": {"type": "string"}}}
32
+ ... )
33
+ >>> pipeline = create_pipeline(
34
+ ... dataset_type=DatasetType.TOOL_CALL,
35
+ ... tools=[web_search],
36
+ ... )
37
+ >>> dataset = pipeline.generate("Search guidelines", traces=50)
38
+ """
39
+
40
+ from synkro.pipelines import create_pipeline
41
+ from synkro.models import OpenAI, Anthropic, Google
42
+ from synkro.types import DatasetType, Message, Scenario, Trace, GradeResult, Plan, Category
43
+ from synkro.types import ToolDefinition, ToolCall, ToolFunction, ToolResult
44
+ from synkro.core.policy import Policy
45
+ from synkro.core.dataset import Dataset
46
+ from synkro.llm.client import LLM
47
+ from synkro.generation.generator import Generator
48
+ from synkro.generation.scenarios import ScenarioGenerator
49
+ from synkro.generation.responses import ResponseGenerator
50
+ from synkro.generation.planner import Planner
51
+ from synkro.quality.grader import Grader
52
+ from synkro.quality.refiner import Refiner
53
+ from synkro.quality.tool_grader import ToolCallGrader
54
+ from synkro.quality.tool_refiner import ToolCallRefiner
55
+ from synkro.formatters.sft import SFTFormatter
56
+ from synkro.formatters.qa import QAFormatter
57
+ from synkro.formatters.tool_call import ToolCallFormatter
58
+ from synkro.prompts import SystemPrompt, ScenarioPrompt, ResponsePrompt, GradePrompt
59
+ from synkro.reporting import ProgressReporter, RichReporter, SilentReporter
60
+
61
__version__ = "0.4.5"

# Public API surface: everything importable as `from synkro import X`.
# Keep this list in sync with the imports above.
__all__ = [
    # Pipeline creation
    "create_pipeline",
    # Quick function
    "generate",
    # Dataset type enum
    "DatasetType",
    # Core classes
    "Policy",
    "Dataset",
    "Trace",
    "Scenario",
    "Message",
    "GradeResult",
    "Plan",
    "Category",
    # Tool types
    "ToolDefinition",
    "ToolCall",
    "ToolFunction",
    "ToolResult",
    # Generation
    "Generator",
    "ScenarioGenerator",
    "ResponseGenerator",
    "Planner",
    # Quality
    "Grader",
    "Refiner",
    "ToolCallGrader",
    "ToolCallRefiner",
    # LLM
    "LLM",
    # Prompts
    "SystemPrompt",
    "ScenarioPrompt",
    "ResponsePrompt",
    "GradePrompt",
    # Formatters
    "SFTFormatter",
    "QAFormatter",
    "ToolCallFormatter",
    # Reporters
    "ProgressReporter",
    "RichReporter",
    "SilentReporter",
    # Model enums (OpenAI, Anthropic, Google supported)
    "OpenAI",
    "Anthropic",
    "Google",
]
114
+
115
+
116
def generate(
    policy: str | Policy,
    traces: int = 20,
    dataset_type: DatasetType = DatasetType.SFT,
    generation_model: OpenAI | Anthropic | Google | str = OpenAI.GPT_5_MINI,
    grading_model: OpenAI | Anthropic | Google | str = OpenAI.GPT_52,
    max_iterations: int = 3,
    skip_grading: bool = False,
    reporter: ProgressReporter | None = None,
) -> Dataset:
    """
    Generate training traces from a policy document.

    This is a convenience function. For more control, use create_pipeline().

    Args:
        policy: Policy text or Policy object
        traces: Number of traces to generate (default: 20)
        dataset_type: Type of dataset - SFT (default) or QA
        generation_model: Model for generating (default: gpt-5-mini)
        grading_model: Model for grading (default: gpt-5.2)
        max_iterations: Max refinement iterations per trace (default: 3)
        skip_grading: Skip grading phase for faster generation (default: False)
        reporter: Progress reporter (default: RichReporter for console output)

    Returns:
        Dataset object with generated traces

    Example:
        >>> import synkro
        >>> dataset = synkro.generate("All expenses over $50 require approval")
        >>> dataset.save("training.jsonl")

        >>> # Silent mode
        >>> from synkro import SilentReporter
        >>> dataset = synkro.generate(policy, reporter=SilentReporter())
    """
    # Accept raw text for convenience and wrap it in a Policy object.
    if isinstance(policy, str):
        policy = Policy(text=policy)

    # Build a one-shot Generator and run it immediately; all knobs are
    # forwarded verbatim from this function's signature.
    return Generator(
        dataset_type=dataset_type,
        generation_model=generation_model,
        grading_model=grading_model,
        max_iterations=max_iterations,
        skip_grading=skip_grading,
        reporter=reporter,
    ).generate(policy, traces=traces)
synkro/cli.py ADDED
@@ -0,0 +1,120 @@
1
+ """Synkro CLI - Generate training data from the command line."""
2
+
3
+ import typer
4
+ from pathlib import Path
5
+ from typing import Optional
6
+
7
# Top-level Typer application. `no_args_is_help=True` prints usage when
# the CLI is invoked with no subcommand instead of exiting with an error.
app = typer.Typer(
    name="synkro",
    help="Generate training datasets from documents.",
    no_args_is_help=True,
)
12
+
13
+
14
@app.command()
def generate(
    source: str = typer.Argument(
        ...,
        help="Policy text, file path (.pdf, .docx, .txt, .md), folder path, or URL",
    ),
    output: Optional[Path] = typer.Option(
        None,
        "--output", "-o",
        help="Output file path (auto-generated if not specified)",
    ),
    traces: int = typer.Option(
        20,
        "--traces", "-n",
        help="Number of traces to generate",
    ),
    format: str = typer.Option(
        "sft",
        "--format", "-f",
        # Fixed: Dataset.save() also accepts "tool_call"; the old help
        # text only advertised sft and qa.
        help="Output format: sft, qa, or tool_call",
    ),
    model: str = typer.Option(
        "gpt-4o-mini",
        "--model", "-m",
        help="Model for generation (e.g., gpt-4o-mini, claude-3-5-sonnet, gemini-2.5-flash)",
    ),
):
    """
    Generate training data from a policy document.

    Examples:

        synkro generate policy.pdf

        synkro generate policies/  # Load all files from folder

        synkro generate "All expenses over $50 need approval" --traces 50

        synkro generate handbook.docx -o training.jsonl -n 100
    """
    import synkro
    from synkro import Policy

    # Fail fast on an unknown format. Previously a bad --format value was
    # only rejected inside Dataset.save(), i.e. AFTER the expensive
    # generation phase had already run.
    if format not in ("sft", "qa", "tool_call"):
        raise typer.BadParameter(
            f"Unknown format: {format}. Use 'sft', 'qa', or 'tool_call'",
            param_hint="--format",
        )

    # Determine if source is text, file, or URL. Local paths are checked
    # first so a file literally named "http:..." would still be loaded.
    source_path = Path(source)

    if source_path.exists():
        # It's a file (or folder) on disk
        policy = Policy.from_file(source_path)
    elif source.startswith(("http://", "https://")):
        # It's a URL
        policy = Policy.from_url(source)
    else:
        # Treat as raw text
        policy = Policy(text=source)

    # Generate
    dataset = synkro.generate(
        policy,
        traces=traces,
        generation_model=model,
    )

    # Save (filename is auto-generated when --output is omitted)
    if output:
        dataset.save(output, format=format)
    else:
        dataset.save(format=format)
82
+
83
+
84
@app.command()
def demo():
    """Run a quick demo with a built-in example policy."""
    import synkro
    from synkro.examples import EXPENSE_POLICY
    from rich.console import Console

    console = Console()
    console.print("\n[cyan]Running demo with built-in expense policy...[/cyan]\n")

    # Small run (5 traces) so the demo finishes quickly.
    demo_dataset = synkro.generate(EXPENSE_POLICY, traces=5)
    demo_dataset.save("demo_output.jsonl")

    console.print("\n[green]Demo complete![/green]")
    console.print("[dim]Check demo_output.jsonl for the generated training data.[/dim]\n")
101
+
102
+
103
@app.command()
def version():
    """Show version information."""
    import synkro
    from rich.console import Console

    # Single throwaway console; nothing else is printed by this command.
    Console().print(f"[cyan]synkro[/cyan] v{synkro.__version__}")
111
+
112
+
113
def main():
    """Entry point for the CLI."""
    # Hands control to the Typer app, which dispatches to the
    # @app.command() functions above.
    app()


# Script-style invocation guard (the installed console script calls main()
# directly; presumably via the package's entry_points — confirm in metadata).
if __name__ == "__main__":
    main()
120
+
@@ -0,0 +1,7 @@
1
+ """Core classes for policy and dataset management."""
2
+
3
+ from synkro.core.policy import Policy
4
+ from synkro.core.dataset import Dataset
5
+
6
+ __all__ = ["Policy", "Dataset"]
7
+
synkro/core/dataset.py ADDED
@@ -0,0 +1,233 @@
1
+ """Dataset class for managing generated traces."""
2
+
3
+ import json
4
+ from datetime import datetime
5
+ from pathlib import Path
6
+ from typing import Iterator
7
+
8
+ from pydantic import BaseModel, Field
9
+ from rich.console import Console
10
+
11
+ from synkro.types.core import Trace
12
+
13
# Module-level Rich console used by Dataset.save() to print the
# "Saved: <path>" confirmation line.
console = Console()
14
+
15
+
16
class Dataset(BaseModel):
    """
    A collection of generated training traces.

    Provides methods for filtering, saving, and exporting traces
    in various formats.

    Examples:
        >>> dataset = generator.generate(policy, traces=100)

        >>> # Filter to only passing traces
        >>> passing = dataset.filter(passed=True)

        >>> # Save to JSONL
        >>> dataset.save("training.jsonl")

        >>> # Push to HuggingFace
        >>> dataset.to_huggingface().push_to_hub("my-org/dataset")
    """

    traces: list[Trace] = Field(default_factory=list)

    class Config:
        # Pydantic v1-style config: permit Trace fields whose types are
        # not natively handled by pydantic.
        arbitrary_types_allowed = True

    def __len__(self) -> int:
        return len(self.traces)

    def __iter__(self) -> Iterator[Trace]:
        # NOTE(review): overrides pydantic BaseModel.__iter__ (which yields
        # (field_name, value) pairs) so that iterating a Dataset yields
        # Trace objects directly.
        return iter(self.traces)

    def __getitem__(self, idx: int) -> Trace:
        return self.traces[idx]

    def filter(
        self,
        passed: bool | None = None,
        category: str | None = None,
        min_length: int | None = None,
    ) -> "Dataset":
        """
        Filter traces by criteria. Criteria are ANDed together; None means
        "don't filter on this field".

        Args:
            passed: Filter by grade pass/fail status. Note: traces with no
                grade at all are excluded whenever this is given — even
                passed=False — because `t.grade and ...` is falsy for
                ungraded traces.
            category: Filter by scenario category (exact match)
            min_length: Minimum response length in characters

        Returns:
            New Dataset with filtered traces (the original is unchanged)
        """
        filtered = self.traces

        if passed is not None:
            filtered = [
                t for t in filtered if t.grade and t.grade.passed == passed
            ]

        if category is not None:
            filtered = [
                t for t in filtered if t.scenario.category == category
            ]

        if min_length is not None:
            # Length is measured on the assistant's message text.
            filtered = [
                t for t in filtered if len(t.assistant_message) >= min_length
            ]

        return Dataset(traces=filtered)

    @property
    def passing_rate(self) -> float:
        """Get the fraction (0.0-1.0) of traces that passed grading.

        Ungraded traces count toward the denominator but never the
        numerator; an empty dataset reports 0.0.
        """
        if not self.traces:
            return 0.0

        passed = sum(1 for t in self.traces if t.grade and t.grade.passed)
        return passed / len(self.traces)

    @property
    def categories(self) -> list[str]:
        """Get unique categories in the dataset (order not guaranteed;
        traces with a falsy category are skipped)."""
        return list(set(t.scenario.category for t in self.traces if t.scenario.category))

    def save(self, path: str | Path | None = None, format: str = "sft") -> "Dataset":
        """
        Save dataset to a JSONL file.

        Args:
            path: Output file path (auto-generated if not provided)
            format: Output format - "sft", "qa", or "tool_call"

        Returns:
            Self for method chaining

        Raises:
            ValueError: If `format` is not one of the supported names.

        Example:
            >>> dataset.save()  # Auto-names: synkro_sft_2024-01-15.jsonl
            >>> dataset.save("training.jsonl")
            >>> dataset.save("qa_data.jsonl", format="qa")
            >>> dataset.save("tools.jsonl", format="tool_call")
        """
        # Imported lazily to avoid a circular import at module load time
        # (formatters import Trace from the same package) — TODO confirm.
        from synkro.formatters import SFTFormatter, QAFormatter, ToolCallFormatter

        # Auto-generate filename if not provided
        if path is None:
            timestamp = datetime.now().strftime("%Y-%m-%d_%H%M")
            path = f"synkro_{format}_{timestamp}.jsonl"

        path = Path(path)

        if format == "sft":
            SFTFormatter().save(self.traces, path)
        elif format == "qa":
            QAFormatter().save(self.traces, path)
        elif format == "tool_call":
            ToolCallFormatter().save(self.traces, path)
        else:
            raise ValueError(f"Unknown format: {format}. Use 'sft', 'qa', or 'tool_call'")

        # Print confirmation with a human-readable file size (KB below 1 MB).
        file_size = path.stat().st_size
        size_str = f"{file_size / 1024:.1f} KB" if file_size < 1024 * 1024 else f"{file_size / 1024 / 1024:.1f} MB"
        console.print(f"[green]📁 Saved:[/green] {path} ({size_str})")

        return self

    def to_jsonl(self, format: str = "sft") -> str:
        """
        Convert dataset to JSONL string (in-memory counterpart of save()).

        Args:
            format: Output format - "sft", "qa", or "tool_call"

        Returns:
            JSONL formatted string

        Raises:
            ValueError: If `format` is not one of the supported names.
        """
        from synkro.formatters import SFTFormatter, QAFormatter, ToolCallFormatter

        if format == "sft":
            return SFTFormatter().to_jsonl(self.traces)
        elif format == "qa":
            return QAFormatter().to_jsonl(self.traces)
        elif format == "tool_call":
            return ToolCallFormatter().to_jsonl(self.traces)
        else:
            raise ValueError(f"Unknown format: {format}. Use 'sft', 'qa', or 'tool_call'")

    def to_huggingface(self):
        """
        Convert to HuggingFace Dataset.

        Requires the optional `datasets` package.

        Returns:
            HuggingFace Dataset object (built from the SFT-formatted traces,
            with metadata included)

        Raises:
            ImportError: If the `datasets` package is not installed.

        Example:
            >>> hf_dataset = dataset.to_huggingface()
            >>> hf_dataset.push_to_hub("my-org/policy-traces")
        """
        try:
            from datasets import Dataset as HFDataset

            # Convert to SFT format for HF
            from synkro.formatters import SFTFormatter

            examples = SFTFormatter(include_metadata=True).format(self.traces)
            return HFDataset.from_list(examples)
        except ImportError:
            raise ImportError(
                "datasets is required for HuggingFace export. "
                "Install with: pip install datasets"
            )

    def to_dict(self) -> dict:
        """
        Convert dataset to a dictionary.

        Returns:
            Dictionary with the fully-serialized traces plus summary stats
            (total count, passing rate, category list)
        """
        return {
            "traces": [t.model_dump() for t in self.traces],
            "stats": {
                "total": len(self.traces),
                "passing_rate": self.passing_rate,
                "categories": self.categories,
            },
        }

    def summary(self) -> str:
        """
        Get a summary of the dataset.

        Returns:
            Human-readable multi-line summary string with totals and a
            per-category breakdown
        """
        lines = [
            f"Dataset Summary",
            f"===============",
            f"Total traces: {len(self.traces)}",
            f"Passing rate: {self.passing_rate:.1%}",
            f"Categories: {len(self.categories)}",
        ]

        if self.categories:
            lines.append("")
            lines.append("By category:")
            for cat in self.categories:
                count = sum(1 for t in self.traces if t.scenario.category == cat)
                lines.append(f"  - {cat}: {count}")

        return "\n".join(lines)

    def __str__(self) -> str:
        return f"Dataset(traces={len(self.traces)}, passing={self.passing_rate:.1%})"

    def __repr__(self) -> str:
        return self.__str__()
233
+