PyPI - synkro - Versions diffs - 0.4.5__py3-none-any.whl - Mend

synkro 0.4.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of synkro might be problematic. Click here for more details.

Files changed (58) hide show

synkro/__init__.py +165 -0
synkro/cli.py +120 -0
synkro/core/__init__.py +7 -0
synkro/core/dataset.py +233 -0
synkro/core/policy.py +337 -0
synkro/errors.py +178 -0
synkro/examples/__init__.py +148 -0
synkro/factory.py +160 -0
synkro/formatters/__init__.py +12 -0
synkro/formatters/qa.py +85 -0
synkro/formatters/sft.py +90 -0
synkro/formatters/tool_call.py +127 -0
synkro/generation/__init__.py +9 -0
synkro/generation/generator.py +163 -0
synkro/generation/planner.py +87 -0
synkro/generation/responses.py +160 -0
synkro/generation/scenarios.py +90 -0
synkro/generation/tool_responses.py +370 -0
synkro/generation/tool_simulator.py +114 -0
synkro/llm/__init__.py +7 -0
synkro/llm/client.py +235 -0
synkro/llm/rate_limits.py +95 -0
synkro/models/__init__.py +43 -0
synkro/models/anthropic.py +26 -0
synkro/models/google.py +19 -0
synkro/models/openai.py +31 -0
synkro/modes/__init__.py +15 -0
synkro/modes/config.py +66 -0
synkro/modes/qa.py +18 -0
synkro/modes/sft.py +18 -0
synkro/modes/tool_call.py +18 -0
synkro/parsers.py +442 -0
synkro/pipeline/__init__.py +20 -0
synkro/pipeline/phases.py +237 -0
synkro/pipeline/runner.py +198 -0
synkro/pipelines.py +105 -0
synkro/prompts/__init__.py +44 -0
synkro/prompts/base.py +167 -0
synkro/prompts/qa_templates.py +97 -0
synkro/prompts/templates.py +281 -0
synkro/prompts/tool_templates.py +201 -0
synkro/quality/__init__.py +14 -0
synkro/quality/grader.py +130 -0
synkro/quality/refiner.py +137 -0
synkro/quality/tool_grader.py +126 -0
synkro/quality/tool_refiner.py +128 -0
synkro/reporting.py +213 -0
synkro/schemas.py +325 -0
synkro/types/__init__.py +41 -0
synkro/types/core.py +113 -0
synkro/types/dataset_type.py +30 -0
synkro/types/tool.py +94 -0
synkro-0.4.5.data/data/examples/__init__.py +148 -0
synkro-0.4.5.dist-info/METADATA +221 -0
synkro-0.4.5.dist-info/RECORD +58 -0
synkro-0.4.5.dist-info/WHEEL +4 -0
synkro-0.4.5.dist-info/entry_points.txt +2 -0
synkro-0.4.5.dist-info/licenses/LICENSE +21 -0

synkro/factory.py ADDED Viewed

@@ -0,0 +1,160 @@
+"""Component factory for dependency injection.
+This module provides a factory for creating pipeline components,
+enabling testability and flexible configuration.
+"""
+from typing import TYPE_CHECKING
+from synkro.llm.client import LLM
+from synkro.modes.config import ModeConfig
+from synkro.generation.planner import Planner
+from synkro.generation.scenarios import ScenarioGenerator
+from synkro.generation.responses import ResponseGenerator
+from synkro.quality.grader import Grader
+from synkro.quality.refiner import Refiner
+if TYPE_CHECKING:
+    from synkro.types.tool import ToolDefinition
+    from synkro.generation.tool_simulator import ToolSimulator
+    from synkro.generation.tool_responses import ToolCallResponseGenerator
+    from synkro.quality.tool_grader import ToolCallGrader
+    from synkro.quality.tool_refiner import ToolCallRefiner
+class ComponentFactory:
+    """
+    Factory for creating pipeline components with shared LLM clients.
+    This centralizes component creation and ensures consistent configuration
+    across the pipeline.
+    Examples:
+        >>> factory = ComponentFactory(gen_llm, grade_llm, mode_config)
+        >>> planner = factory.create_planner()
+        >>> grader = factory.create_grader()
+        >>> # With tools for tool_call dataset type
+        >>> factory = ComponentFactory(gen_llm, grade_llm, mode_config, tools=[...])
+        >>> simulator = factory.create_tool_simulator()
+    """
+    def __init__(
+        self,
+        generation_llm: LLM,
+        grading_llm: LLM,
+        mode_config: ModeConfig,
+        tools: list["ToolDefinition"] | None = None,
+    ):
+        """
+        Initialize the factory.
+        Args:
+            generation_llm: LLM client for generation tasks (scenarios, responses, refinement)
+            grading_llm: LLM client for grading and planning (typically stronger model)
+            mode_config: Configuration for the dataset type (prompts, etc.)
+            tools: Optional list of tool definitions for tool_call dataset type
+        """
+        self.generation_llm = generation_llm
+        self.grading_llm = grading_llm
+        self.mode_config = mode_config
+        self.tools = tools or []
+    def create_planner(self) -> Planner:
+        """Create a Planner instance."""
+        return Planner(llm=self.grading_llm)
+    def create_scenario_generator(self) -> ScenarioGenerator:
+        """Create a ScenarioGenerator with mode-specific prompts."""
+        gen = ScenarioGenerator(llm=self.generation_llm)
+        gen.prompt_template = self.mode_config.scenario_prompt
+        return gen
+    def create_response_generator(self) -> ResponseGenerator:
+        """Create a ResponseGenerator with mode-specific prompts."""
+        gen = ResponseGenerator(llm=self.generation_llm)
+        gen.prompt_template = self.mode_config.response_prompt
+        return gen
+    def create_grader(self) -> "Grader | ToolCallGrader":
+        """
+        Create a Grader with mode-specific prompts.
+        Auto-selects ToolCallGrader when tools are configured.
+        """
+        if self.has_tools:
+            from synkro.quality.tool_grader import ToolCallGrader
+            return ToolCallGrader(llm=self.grading_llm, tools=self.tools)
+        grader = Grader(llm=self.grading_llm)
+        grader.prompt_template = self.mode_config.grade_prompt
+        return grader
+    def create_refiner(self) -> "Refiner | ToolCallRefiner":
+        """
+        Create a Refiner with mode-specific prompts.
+        Auto-selects ToolCallRefiner when tools are configured.
+        This ensures tool_calls format is preserved during refinement.
+        """
+        if self.has_tools:
+            from synkro.quality.tool_refiner import ToolCallRefiner
+            simulator = self.create_tool_simulator()
+            return ToolCallRefiner(
+                llm=self.generation_llm,
+                tools=self.tools,
+                simulator=simulator,
+            )
+        refiner = Refiner(llm=self.generation_llm)
+        refiner.prompt_template = self.mode_config.refine_prompt
+        return refiner
+    def create_tool_simulator(self) -> "ToolSimulator":
+        """Create a ToolSimulator instance for tool_call dataset type."""
+        from synkro.generation.tool_simulator import ToolSimulator
+        if not self.tools:
+            raise ValueError("Cannot create ToolSimulator without tools")
+        return ToolSimulator(tools=self.tools, llm=self.generation_llm)
+    def create_tool_call_response_generator(self) -> "ToolCallResponseGenerator":
+        """
+        Create a ToolCallResponseGenerator for generating proper tool call traces.
+        This generator uses JSON mode to produce structured tool calls in
+        OpenAI function calling format.
+        """
+        from synkro.generation.tool_responses import ToolCallResponseGenerator
+        if not self.tools:
+            raise ValueError("Cannot create ToolCallResponseGenerator without tools")
+        # Create simulator for generating tool responses
+        simulator = self.create_tool_simulator()
+        return ToolCallResponseGenerator(
+            tools=self.tools,
+            llm=self.generation_llm,
+            simulator=simulator,
+        )
+    def get_tools_description(self) -> str:
+        """Get formatted description of all available tools."""
+        if not self.tools:
+            return "No tools available"
+        descriptions = []
+        for tool in self.tools:
+            descriptions.append(tool.to_system_prompt())
+        return "\n\n".join(descriptions)
+    @property
+    def has_tools(self) -> bool:
+        """Check if tools are configured."""
+        return bool(self.tools)
+__all__ = ["ComponentFactory"]  # the only name exported by a star-import of this module

synkro/formatters/__init__.py ADDED Viewed

@@ -0,0 +1,12 @@
+"""Output formatters for different training data formats."""
+from synkro.formatters.sft import SFTFormatter
+from synkro.formatters.qa import QAFormatter
+from synkro.formatters.tool_call import ToolCallFormatter
+__all__ = [  # formatter classes re-exported from the sft, qa and tool_call submodules
+    "SFTFormatter",
+    "QAFormatter",
+    "ToolCallFormatter",
+]

synkro/formatters/qa.py ADDED Viewed

@@ -0,0 +1,85 @@
+"""QA (Question-Answer) formatter."""
+import json
+from pathlib import Path
+from typing import TYPE_CHECKING
+if TYPE_CHECKING:
+    from synkro.types.core import Trace
+class QAFormatter:
+    """
+    Format traces for Question-Answer datasets.
+    QA format is simple question/answer pairs with optional context,
+    suitable for RAG training and knowledge extraction.
+    Example output:
+        {"question": "...", "answer": "...", "context": "..."}
+        {"question": "...", "answer": "...", "context": "..."}
+    """
+    def __init__(self, include_context: bool = True):
+        """
+        Initialize the QA formatter.
+        Args:
+            include_context: If True, include source context in output
+        """
+        self.include_context = include_context
+    def format(self, traces: list["Trace"]) -> list[dict]:
+        """
+        Format traces as QA pairs.
+        Args:
+            traces: List of traces to format
+        Returns:
+            List of QA examples (dicts with 'question', 'answer', optionally 'context')
+        """
+        examples = []
+        for trace in traces:
+            example = {
+                "question": trace.user_message,
+                "answer": trace.assistant_message,
+            }
+            if self.include_context:
+                # Use scenario context or the source section if available
+                example["context"] = trace.scenario.context or ""
+            examples.append(example)
+        return examples
+    def save(self, traces: list["Trace"], path: str | Path) -> None:
+        """
+        Save formatted traces to a JSONL file.
+        Args:
+            traces: List of traces to save
+            path: Output file path
+        """
+        path = Path(path)
+        examples = self.format(traces)
+        with open(path, "w") as f:
+            for example in examples:
+                f.write(json.dumps(example) + "\n")
+    def to_jsonl(self, traces: list["Trace"]) -> str:
+        """
+        Convert traces to JSONL string.
+        Args:
+            traces: List of traces to convert
+        Returns:
+            JSONL formatted string
+        """
+        examples = self.format(traces)
+        return "\n".join(json.dumps(e) for e in examples)

synkro/formatters/sft.py ADDED Viewed

@@ -0,0 +1,90 @@
+"""SFT (Supervised Fine-Tuning) formatter."""
+import json
+from pathlib import Path
+from typing import TYPE_CHECKING
+if TYPE_CHECKING:
+    from synkro.types.core import Trace
+class SFTFormatter:
+    """
+    Format traces for Supervised Fine-Tuning (SFT).
+    SFT format is a simple array of conversations, each with messages.
+    This is the standard format used by OpenAI, HuggingFace, and most
+    fine-tuning platforms.
+    Example output:
+        {"messages": [{"role": "system", "content": "..."}, ...]}
+        {"messages": [{"role": "system", "content": "..."}, ...]}
+    """
+    def __init__(self, include_metadata: bool = False):
+        """
+        Initialize the SFT formatter.
+        Args:
+            include_metadata: If True, include trace metadata in output
+        """
+        self.include_metadata = include_metadata
+    def format(self, traces: list["Trace"]) -> list[dict]:
+        """
+        Format traces as SFT training examples.
+        Args:
+            traces: List of traces to format
+        Returns:
+            List of SFT examples (dicts with 'messages' key)
+        """
+        examples = []
+        for trace in traces:
+            example = {
+                "messages": [
+                    {"role": m.role, "content": m.content} for m in trace.messages
+                ]
+            }
+            if self.include_metadata:
+                example["metadata"] = {
+                    "scenario": trace.scenario.description,
+                    "category": trace.scenario.category,
+                    "grade": trace.grade.model_dump() if trace.grade else None,
+                }
+            examples.append(example)
+        return examples
+    def save(self, traces: list["Trace"], path: str | Path) -> None:
+        """
+        Save formatted traces to a JSONL file.
+        Args:
+            traces: List of traces to save
+            path: Output file path (should end in .jsonl)
+        """
+        path = Path(path)
+        examples = self.format(traces)
+        with open(path, "w") as f:
+            for example in examples:
+                f.write(json.dumps(example) + "\n")
+    def to_jsonl(self, traces: list["Trace"]) -> str:
+        """
+        Convert traces to JSONL string.
+        Args:
+            traces: List of traces to convert
+        Returns:
+            JSONL formatted string
+        """
+        examples = self.format(traces)
+        return "\n".join(json.dumps(e) for e in examples)

synkro/formatters/tool_call.py ADDED Viewed

@@ -0,0 +1,127 @@
+"""Tool Call formatter for training data."""
+import json
+from pathlib import Path
+from typing import TYPE_CHECKING
+if TYPE_CHECKING:
+    from synkro.types.core import Trace
+class ToolCallFormatter:
+    """
+    Format traces with tool calls for fine-tuning.
+    Outputs OpenAI function calling format compatible with most fine-tuning platforms.
+    Example output:
+        {
+          "messages": [
+            {"role": "system", "content": "You have access to: web_search(query)"},
+            {"role": "user", "content": "What's the weather in NYC?"},
+            {"role": "assistant", "content": null, "tool_calls": [
+              {"id": "call_1", "type": "function", "function": {"name": "web_search", "arguments": "{\\"query\\": \\"weather NYC\\"}"}}
+            ]},
+            {"role": "tool", "tool_call_id": "call_1", "content": "NYC: 72°F, sunny"},
+            {"role": "assistant", "content": "The weather in NYC is currently 72°F and sunny."}
+          ]
+        }
+    """
+    def __init__(self, include_metadata: bool = False):
+        """
+        Initialize the ToolCallFormatter.
+        Args:
+            include_metadata: If True, include trace metadata in output
+        """
+        self.include_metadata = include_metadata
+    def format(self, traces: list["Trace"]) -> list[dict]:
+        """
+        Format traces as tool-calling training examples.
+        Args:
+            traces: List of traces to format
+        Returns:
+            List of formatted examples with tool calls
+        """
+        examples = []
+        for trace in traces:
+            messages = []
+            for m in trace.messages:
+                msg = {"role": m.role}
+                # Handle content (can be None for tool-calling assistant messages)
+                if m.content is not None:
+                    msg["content"] = m.content
+                elif m.role == "assistant" and m.tool_calls:
+                    msg["content"] = None
+                else:
+                    msg["content"] = ""
+                # Handle tool calls
+                if m.tool_calls:
+                    msg["tool_calls"] = [
+                        {
+                            "id": tc.id,
+                            "type": tc.type,
+                            "function": {
+                                "name": tc.function.name,
+                                "arguments": tc.function.arguments,
+                            }
+                        }
+                        for tc in m.tool_calls
+                    ]
+                # Handle tool response
+                if m.tool_call_id:
+                    msg["tool_call_id"] = m.tool_call_id
+                messages.append(msg)
+            example = {"messages": messages}
+            if self.include_metadata:
+                example["metadata"] = {
+                    "scenario": trace.scenario.description,
+                    "category": trace.scenario.category,
+                    "grade": trace.grade.model_dump() if trace.grade else None,
+                    "has_tool_calls": trace.has_tool_calls,
+                }
+            examples.append(example)
+        return examples
+    def save(self, traces: list["Trace"], path: str | Path) -> None:
+        """
+        Save formatted traces to a JSONL file.
+        Args:
+            traces: List of traces to save
+            path: Output file path (should end in .jsonl)
+        """
+        path = Path(path)
+        examples = self.format(traces)
+        with open(path, "w") as f:
+            for example in examples:
+                f.write(json.dumps(example) + "\n")
+    def to_jsonl(self, traces: list["Trace"]) -> str:
+        """
+        Convert traces to JSONL string.
+        Args:
+            traces: List of traces to convert
+        Returns:
+            JSONL formatted string
+        """
+        examples = self.format(traces)
+        return "\n".join(json.dumps(e) for e in examples)

synkro/generation/__init__.py ADDED Viewed

@@ -0,0 +1,9 @@
+"""Generation components for creating training data."""
+from synkro.generation.generator import Generator
+from synkro.generation.scenarios import ScenarioGenerator
+from synkro.generation.responses import ResponseGenerator
+from synkro.generation.planner import Planner
+__all__ = ["Generator", "ScenarioGenerator", "ResponseGenerator", "Planner"]  # classes re-exported from the generation submodules

synkro/generation/generator.py ADDED Viewed

@@ -0,0 +1,163 @@
+"""Main Generator class orchestrating the full trace generation pipeline."""
+import asyncio
+from enum import Enum
+from typing import TYPE_CHECKING
+from synkro.llm.client import LLM
+from synkro.llm.rate_limits import auto_workers
+from synkro.models import Model, OpenAI
+from synkro.types.dataset_type import DatasetType
+from synkro.core.policy import Policy
+from synkro.core.dataset import Dataset
+from synkro.modes.config import get_mode_config
+from synkro.errors import handle_error
+from synkro.factory import ComponentFactory
+from synkro.reporting import ProgressReporter, RichReporter
+from synkro.pipeline.runner import GenerationPipeline
+if TYPE_CHECKING:
+    from synkro.types.tool import ToolDefinition
+class Generator:
+    """
+    Main orchestrator for generating training datasets.
+    The Generator handles the full pipeline:
+    1. Plan: Analyze policy and create category distribution
+    2. Generate: Create scenarios and responses
+    3. Grade: Evaluate response quality
+    4. Refine: Fix failed responses
+    5. Return: Dataset of passing traces
+    Examples:
+        >>> generator = Generator()
+        >>> dataset = generator.generate(policy, traces=20)
+        >>> # QA dataset
+        >>> generator = Generator(dataset_type=DatasetType.QA)
+        >>> dataset = generator.generate(policy)
+        >>> # Silent mode (no console output)
+        >>> from synkro.reporting import SilentReporter
+        >>> generator = Generator(reporter=SilentReporter())
+        >>> dataset = generator.generate(policy)
+        >>> # Tool call dataset
+        >>> from synkro import ToolDefinition
+        >>> tools = [ToolDefinition(name="search", description="...", parameters={})]
+        >>> generator = Generator(dataset_type=DatasetType.TOOL_CALL, tools=tools)
+        >>> dataset = generator.generate("Usage guidelines", traces=20)
+    """
+    def __init__(
+        self,
+        dataset_type: DatasetType = DatasetType.SFT,
+        generation_model: Model = OpenAI.GPT_4O_MINI,
+        grading_model: Model = OpenAI.GPT_4O,
+        max_iterations: int = 1,
+        skip_grading: bool = False,
+        reporter: ProgressReporter | None = None,
+        tools: list["ToolDefinition"] | None = None,
+    ):
+        """
+        Initialize the Generator.
+        Args:
+            dataset_type: Type of dataset to generate (QA, SFT, or TOOL_CALL)
+            generation_model: Model for scenarios/responses (default: gpt-4o-mini)
+            grading_model: Model for grading (default: gpt-4o, recommend stronger)
+            max_iterations: Max refinement iterations per trace (default: 1, no retries)
+            skip_grading: Skip grading phase for faster generation (default: False)
+            reporter: Progress reporter (default: RichReporter for console output)
+            tools: List of ToolDefinition for TOOL_CALL dataset type
+        """
+        self.dataset_type = dataset_type
+        self.mode_config = get_mode_config(dataset_type)
+        self.max_iterations = max_iterations
+        self.skip_grading = skip_grading
+        self.tools = tools
+        # Validate tools for TOOL_CALL dataset type
+        if dataset_type == DatasetType.TOOL_CALL and not tools:
+            raise ValueError("TOOL_CALL dataset type requires tools parameter")
+        # Store model info for reporting
+        self.generation_model = generation_model
+        self.grading_model = grading_model
+        # Create LLM clients
+        self.generation_llm = LLM(model=generation_model)
+        self.grading_llm = LLM(model=grading_model)
+        # Create factory for component creation
+        self.factory = ComponentFactory(
+            generation_llm=self.generation_llm,
+            grading_llm=self.grading_llm,
+            mode_config=self.mode_config,
+            tools=tools,
+        )
+        # Reporter for progress output
+        self.reporter = reporter or RichReporter()
+        # Auto-scale workers based on provider
+        model_str = generation_model.value if isinstance(generation_model, Enum) else str(generation_model)
+        self.workers = auto_workers(model_str)
+        # Create pipeline
+        self.pipeline = GenerationPipeline(
+            factory=self.factory,
+            reporter=self.reporter,
+            workers=self.workers,
+            max_iterations=max_iterations,
+            skip_grading=skip_grading,
+        )
+    @handle_error
+    def generate(self, policy: Policy | str, traces: int = 20) -> Dataset:
+        """
+        Generate a training dataset from a policy.
+        Args:
+            policy: Policy object or text string
+            traces: Target number of traces to generate (default: 20)
+        Returns:
+            Dataset with generated traces
+        """
+        if isinstance(policy, str):
+            policy = Policy(text=policy)
+        # Validate policy has enough content
+        policy.validate_length()
+        return asyncio.run(self._generate_async(policy, traces))
+    async def _generate_async(self, policy: Policy, traces: int) -> Dataset:
+        """Async implementation of generation pipeline."""
+        model_str = self.generation_model.value if isinstance(self.generation_model, Enum) else str(self.generation_model)
+        return await self.pipeline.run(
+            policy=policy,
+            traces=traces,
+            model=model_str,
+            dataset_type=self.dataset_type.value,
+        )
+    async def generate_async(self, policy: Policy | str, traces: int = 20) -> Dataset:
+        """
+        Async version of generate for use in async contexts.
+        Args:
+            policy: Policy object or text string
+            traces: Target number of traces to generate (default: 20)
+        Returns:
+            Dataset with generated traces
+        """
+        if isinstance(policy, str):
+            policy = Policy(text=policy)
+        return await self._generate_async(policy, traces)