deepfabric-4.4.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71)
  1. deepfabric/__init__.py +70 -0
  2. deepfabric/__main__.py +6 -0
  3. deepfabric/auth.py +382 -0
  4. deepfabric/builders.py +303 -0
  5. deepfabric/builders_agent.py +1304 -0
  6. deepfabric/cli.py +1288 -0
  7. deepfabric/config.py +899 -0
  8. deepfabric/config_manager.py +251 -0
  9. deepfabric/constants.py +94 -0
  10. deepfabric/dataset_manager.py +534 -0
  11. deepfabric/error_codes.py +581 -0
  12. deepfabric/evaluation/__init__.py +47 -0
  13. deepfabric/evaluation/backends/__init__.py +32 -0
  14. deepfabric/evaluation/backends/ollama_backend.py +137 -0
  15. deepfabric/evaluation/backends/tool_call_parsers.py +409 -0
  16. deepfabric/evaluation/backends/transformers_backend.py +326 -0
  17. deepfabric/evaluation/evaluator.py +845 -0
  18. deepfabric/evaluation/evaluators/__init__.py +13 -0
  19. deepfabric/evaluation/evaluators/base.py +104 -0
  20. deepfabric/evaluation/evaluators/builtin/__init__.py +5 -0
  21. deepfabric/evaluation/evaluators/builtin/tool_calling.py +93 -0
  22. deepfabric/evaluation/evaluators/registry.py +66 -0
  23. deepfabric/evaluation/inference.py +155 -0
  24. deepfabric/evaluation/metrics.py +397 -0
  25. deepfabric/evaluation/parser.py +304 -0
  26. deepfabric/evaluation/reporters/__init__.py +13 -0
  27. deepfabric/evaluation/reporters/base.py +56 -0
  28. deepfabric/evaluation/reporters/cloud_reporter.py +195 -0
  29. deepfabric/evaluation/reporters/file_reporter.py +61 -0
  30. deepfabric/evaluation/reporters/multi_reporter.py +56 -0
  31. deepfabric/exceptions.py +67 -0
  32. deepfabric/factory.py +26 -0
  33. deepfabric/generator.py +1084 -0
  34. deepfabric/graph.py +545 -0
  35. deepfabric/hf_hub.py +214 -0
  36. deepfabric/kaggle_hub.py +219 -0
  37. deepfabric/llm/__init__.py +41 -0
  38. deepfabric/llm/api_key_verifier.py +534 -0
  39. deepfabric/llm/client.py +1206 -0
  40. deepfabric/llm/errors.py +105 -0
  41. deepfabric/llm/rate_limit_config.py +262 -0
  42. deepfabric/llm/rate_limit_detector.py +278 -0
  43. deepfabric/llm/retry_handler.py +270 -0
  44. deepfabric/metrics.py +212 -0
  45. deepfabric/progress.py +262 -0
  46. deepfabric/prompts.py +290 -0
  47. deepfabric/schemas.py +1000 -0
  48. deepfabric/spin/__init__.py +6 -0
  49. deepfabric/spin/client.py +263 -0
  50. deepfabric/spin/models.py +26 -0
  51. deepfabric/stream_simulator.py +90 -0
  52. deepfabric/tools/__init__.py +5 -0
  53. deepfabric/tools/defaults.py +85 -0
  54. deepfabric/tools/loader.py +87 -0
  55. deepfabric/tools/mcp_client.py +677 -0
  56. deepfabric/topic_manager.py +303 -0
  57. deepfabric/topic_model.py +20 -0
  58. deepfabric/training/__init__.py +35 -0
  59. deepfabric/training/api_key_prompt.py +302 -0
  60. deepfabric/training/callback.py +363 -0
  61. deepfabric/training/metrics_sender.py +301 -0
  62. deepfabric/tree.py +438 -0
  63. deepfabric/tui.py +1267 -0
  64. deepfabric/update_checker.py +166 -0
  65. deepfabric/utils.py +150 -0
  66. deepfabric/validation.py +143 -0
  67. deepfabric-4.4.0.dist-info/METADATA +702 -0
  68. deepfabric-4.4.0.dist-info/RECORD +71 -0
  69. deepfabric-4.4.0.dist-info/WHEEL +4 -0
  70. deepfabric-4.4.0.dist-info/entry_points.txt +2 -0
  71. deepfabric-4.4.0.dist-info/licenses/LICENSE +201 -0
deepfabric/generator.py
@@ -0,0 +1,1084 @@
+ import asyncio
+ import json
+ import logging
+ import math
+ import random
+
+ from collections.abc import AsyncGenerator
+ from typing import TYPE_CHECKING, Any, Literal
+
+ from datasets import Dataset as HFDataset
+ from pydantic import BaseModel, ConfigDict, Field, field_validator
+
+ from .builders import ConversationBuilderFactory
+ from .config import _normalize_reasoning_style
+ from .constants import (
+     API_ERROR_INDICATORS,
+     DEFAULT_MAX_RETRIES,
+     DEFAULT_REQUEST_TIMEOUT,
+     DEFAULT_SAMPLE_RETRIES,
+     ENGINE_DEFAULT_BATCH_SIZE,
+     ENGINE_DEFAULT_NUM_EXAMPLES,
+     ENGINE_DEFAULT_TEMPERATURE,
+     ERROR_CATEGORIES,
+     ERROR_DATASET_FILENAME,
+     INTERRUPTED_DATASET_FILENAME,
+ )
+ from .error_codes import classify_error
+ from .exceptions import DataSetGeneratorError
+ from .llm import LLMClient
+ from .metrics import trace
+ from .progress import ProgressReporter
+ from .prompts import (
+     AGENT_COT_MULTI_TURN_PROMPT,
+     AGENT_COT_TOOLS_PROMPT,
+     CONVERSATION_GENERATION_PROMPT,
+     FREETEXT_COT_PROMPT,
+     STRUCTURED_COT_PROMPT,
+     AgentPromptBuilder,
+ )
+ from .schemas import Conversation, ToolRegistry, get_conversation_schema
+ from .tools import BUILTIN_TOOL_REGISTRY
+ from .tools.loader import load_tools_from_dict, load_tools_from_endpoint
+ from .topic_model import TopicModel
+ from .utils import ensure_not_running_loop, is_validation_error
+
+ # Handle circular import for type hints
+ if TYPE_CHECKING:
+     from .topic_model import TopicModel
+
+ logger = logging.getLogger(__name__)
+
+
+ class DataSetGeneratorConfig(BaseModel):
+     """Configuration for the data engine."""
+
+     model_config = ConfigDict(arbitrary_types_allowed=True)
+
+     instructions: str = Field(default="", description="Additional instructions for data generation")
+     generation_system_prompt: str = Field(
+         ..., min_length=1, description="System prompt for content generation"
+     )
+     dataset_system_prompt: str | None = Field(
+         None,
+         description="System prompt that goes into the final dataset (falls back to generation_system_prompt if not provided)",
+     )
+     provider: str = Field(
+         ...,
+         min_length=1,
+         description="LLM provider (openai, anthropic, gemini, ollama)",
+     )
+     model_name: str = Field(..., min_length=1, description="Name of the model to use")
+     prompt_template: str | None = Field(default=None, description="Custom prompt template")
+     example_data: HFDataset | None = Field(
+         default=None, description="Example dataset for few-shot learning"
+     )
+     temperature: float = Field(
+         default=ENGINE_DEFAULT_TEMPERATURE,
+         ge=0.0,
+         le=2.0,
+         description="Temperature for model generation",
+     )
+     max_retries: int = Field(
+         default=DEFAULT_MAX_RETRIES,
+         ge=1,
+         le=10,
+         description="Maximum number of retries for failed requests (deprecated, use rate_limit config)",
+     )
+     max_tokens: int = Field(
+         default=2000,
+         ge=1,
+         description="Maximum tokens to generate in a single call to the llm",
+     )
+     default_batch_size: int = Field(
+         default=ENGINE_DEFAULT_BATCH_SIZE,
+         ge=1,
+         le=100,
+         description="Default batch size for generation",
+     )
+     default_num_examples: int = Field(
+         default=ENGINE_DEFAULT_NUM_EXAMPLES,
+         ge=0,
+         le=10,
+         description="Default number of examples to include",
+     )
+     request_timeout: int = Field(
+         default=DEFAULT_REQUEST_TIMEOUT,
+         ge=5,
+         le=300,
+         description="Request timeout in seconds",
+     )
+     sample_retries: int = Field(
+         default=DEFAULT_SAMPLE_RETRIES,
+         ge=0,
+         le=5,
+         description="Number of retries for individual sample validation failures",
+     )
+     sys_msg: bool = Field(default=True, description="Whether to include system message in dataset")
+     base_url: str | None = Field(
+         default=None,
+         description="Base URL for API endpoint (e.g., custom OpenAI-compatible servers)",
+     )
+
+     # Rate limiting configuration
+     rate_limit: dict[str, int | float | str | bool] | None = Field(
+         default=None,
+         description="Rate limiting and retry configuration (uses provider defaults if not specified)",
+     )
+
+     # Modular conversation configuration
+     conversation_type: Literal["basic", "chain_of_thought"] = Field(
+         default="basic",
+         description="Base conversation type: basic (simple chat), chain_of_thought (with reasoning traces)",
+     )
+
+     reasoning_style: Literal["freetext", "agent", "structured", "hybrid"] | None = Field(
+         default=None,
+         description="Reasoning style for chain_of_thought type: freetext (natural language) or agent (structured step-by-step for tool-calling). Note: 'structured' and 'hybrid' are deprecated.",
+     )
+
+     @field_validator("reasoning_style", mode="before")
+     @classmethod
+     def normalize_reasoning_style(cls, v: str | None) -> str | None:
+         """Normalize deprecated reasoning_style values."""
+         return _normalize_reasoning_style(v)
+
+     agent_mode: Literal["single_turn", "multi_turn"] | None = Field(
+         default=None,
+         description="Agent mode: single_turn (one-shot tool use), multi_turn (extended agent conversations). Requires tools to be configured.",
+     )
+
+     # Tool configuration (used when agent_mode is enabled or for tool_calling)
+     tool_components: dict[str, list[str]] = Field(
+         default_factory=dict,
+         description=(
+             "Map of component name to tool names. 'builtin' uses built-in tools "
+             "and routes to /vfs/execute. Other components load from tools_endpoint "
+             "and route to /{component}/execute."
+         ),
+     )
+     custom_tools: list[dict] = Field(
+         default_factory=list, description="Custom tool definitions as dictionaries"
+     )
+     max_tools_per_query: int = Field(
+         default=3, ge=1, le=10, description="Maximum number of tools per query/turn"
+     )
+     max_tools_strict: bool = Field(
+         default=True,
+         description="If True, discard samples exceeding max_tools_per_query. If False, keep sample but truncate executions to limit.",
+     )
+
+     # Spin integration for real tool execution
+     spin_endpoint: str | None = Field(
+         default=None,
+         description="Spin service URL for real tool execution (e.g., 'http://localhost:3000')",
+     )
+     scenario_seed: dict | None = Field(
+         default=None,
+         description="Initial state to seed into Spin VFS before generation (e.g., {'files': {'main.py': '...'}})",
+     )
+     max_agent_steps: int = Field(
+         default=5,
+         ge=1,
+         le=10,
+         description="Maximum ReAct reasoning steps per sample before forcing conclusion",
+     )
+
+     # MCP/Mock tool integration - load tools from HTTP endpoint instead of code
+     tools_endpoint: str | None = Field(
+         default=None,
+         description="HTTP endpoint to load tool definitions from (e.g., 'http://localhost:3000/mock/list-tools'). Tools are loaded in MCP format.",
+     )
+     tool_execute_path: str | None = Field(
+         default=None,
+         description="Path for tool execution when using tools_endpoint (e.g., '/mock/execute'). Combined with spin_endpoint.",
+     )
+
+     # Multi-turn configuration (used when agent_mode="multi_turn")
+     min_turns: int = Field(
+         default=2,
+         ge=1,
+         le=10,
+         description="Minimum number of conversation turns for multi-turn agent mode",
+     )
+     max_turns: int = Field(
+         default=4,
+         ge=1,
+         le=10,
+         description="Maximum number of conversation turns for multi-turn agent mode",
+     )
+     min_tool_calls: int = Field(
+         default=2,
+         ge=0,
+         le=20,
+         description="Minimum number of tool calls required before allowing early conversation conclusion",
+     )
+
+
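
Before the generator class itself, a quick orientation: DataSetGenerator below validates arbitrary kwargs against this config model, so construction is plain keyword arguments. A minimal sketch (the provider, model, and prompt values are illustrative, not package defaults):

```python
# Minimal construction sketch, assuming the import path of this module.
# Provider/model/prompt values are illustrative, not package defaults.
from deepfabric.generator import DataSetGenerator

generator = DataSetGenerator(
    generation_system_prompt="You are a careful synthetic-data writer.",
    provider="openai",              # openai, anthropic, gemini, or ollama
    model_name="gpt-4o-mini",       # any model name the provider accepts
    conversation_type="chain_of_thought",
    reasoning_style="freetext",
    temperature=0.7,
    sample_retries=2,               # retries for per-sample validation failures
)
```
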
+ class DataSetGenerator:
+     def __init__(self, **kwargs):
+         """Initialize DataSetGenerator with parameters."""
+         try:
+             self.config = DataSetGeneratorConfig.model_validate(kwargs)
+         except Exception as e:  # noqa: TRY003
+             raise DataSetGeneratorError(f"Invalid generator configuration: {str(e)}") from e
+
+         # Initialize from config
+         self.provider = self.config.provider
+         self.model_name = self.config.model_name
+         self._samples: list[dict] = []
+         self.failed_samples = []
+         self.failure_analysis = {category: [] for category in ERROR_CATEGORIES}
+
+         # Initialize LLM client with rate limiting configuration
+         llm_kwargs: dict[str, Any] = {"rate_limit_config": self.config.rate_limit}
+         if self.config.base_url:
+             llm_kwargs["base_url"] = self.config.base_url
+
+         self.llm_client = LLMClient(
+             provider=self.provider,
+             model_name=self.model_name,
+             **llm_kwargs,
+         )
+         trace(
+             "generator_created",
+             {
+                 "provider": self.provider,
+                 "model_name": self.model_name,
+                 "conversation_type": self.config.conversation_type,
+             },
+         )
+
+         # Store dataset system prompt for dataset inclusion (with fallback)
+         self.dataset_system_prompt = (
+             self.config.dataset_system_prompt or self.config.generation_system_prompt
+         )
+         # Store generation prompt for content generation
+         self.generation_prompt = self.config.generation_system_prompt
+
+         # Initialize tool registry when agent_mode is enabled or tools are configured
+         self.tool_registry = None
+         if (
+             self.config.agent_mode is not None
+             or self.config.tool_components
+             or self.config.custom_tools
+         ):
+             self._initialize_tool_registry()
+
+         # Progress reporter for streaming feedback (set by external callers)
+         self.progress_reporter: ProgressReporter | None = None
+
+     def _initialize_tool_registry(self):
+         """Initialize tool registry from component configuration.
+
+         Tools are loaded based on the tool_components mapping:
+         - 'builtin': Uses BUILTIN_TOOL_REGISTRY (read_file, write_file, etc.)
+         - Other components: Loads from tools_endpoint and sets component field
+
+         Each tool's component field determines routing (/{component}/execute).
+         """
+         try:
+             all_tools = []
+             endpoint_registry = None
+
+             # Load tools from endpoint if needed for non-builtin components
+             non_builtin_components = {
+                 k: v for k, v in self.config.tool_components.items() if k != "builtin"
+             }
+             if non_builtin_components:
+                 if not self.config.tools_endpoint:
+                     raise DataSetGeneratorError(
+                         f"Non-builtin components {list(non_builtin_components.keys())} require "
+                         "'tools_endpoint' to load tool definitions."
+                     )
+                 endpoint_registry = load_tools_from_endpoint(self.config.tools_endpoint)
+                 logger.info(
+                     "Loaded %d tools from endpoint: %s",
+                     len(endpoint_registry.tools),
+                     self.config.tools_endpoint,
+                 )
+
+             # Process each component
+             for component_name, tool_names in self.config.tool_components.items():
+                 if component_name == "builtin":
+                     # Filter from builtin registry
+                     for tool in BUILTIN_TOOL_REGISTRY.tools:
+                         if tool.name in tool_names:
+                             all_tools.append(tool)
+                 elif endpoint_registry:
+                     # Filter from endpoint registry and set component
+                     for tool in endpoint_registry.tools:
+                         if tool.name in tool_names:
+                             # Create copy with component set
+                             tool_copy = tool.model_copy(update={"component": component_name})
+                             all_tools.append(tool_copy)
+
+             # Add custom tools if provided
+             if self.config.custom_tools:
+                 custom_registry = load_tools_from_dict(self.config.custom_tools)
+                 all_tools.extend(custom_registry.tools)
+
+             self.tool_registry = ToolRegistry(tools=all_tools)
+             logger.info("Initialized tool registry with %d tools", len(all_tools))
+
+         except Exception as e:  # noqa: BLE001
+             raise DataSetGeneratorError(f"Failed to initialize tool registry: {str(e)}") from e
+
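
As the docstring above notes, 'builtin' names are filtered from BUILTIN_TOOL_REGISTRY while any other component must be resolvable via tools_endpoint. A hedged configuration sketch (the 'mock' component, its tool name, and the endpoint URL are hypothetical):

```python
# Hypothetical tool wiring: builtin tools come from BUILTIN_TOOL_REGISTRY;
# the "mock" component is fetched from tools_endpoint and later routed to
# /mock/execute. Component, tool, and endpoint names are assumptions.
generator = DataSetGenerator(
    generation_system_prompt="Generate realistic tool-use conversations.",
    provider="ollama",
    model_name="llama3.1",
    conversation_type="chain_of_thought",
    agent_mode="single_turn",
    tool_components={
        "builtin": ["read_file", "write_file"],
        "mock": ["get_weather"],
    },
    tools_endpoint="http://localhost:3000/mock/list-tools",
)
```
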
+     def _validate_create_data_params(
+         self,
+         num_steps: int,
+         batch_size: int,
+         topic_model: "TopicModel | None" = None,
+     ) -> None:
+         """Validate parameters for data creation."""
+         if num_steps is None or num_steps <= 0:
+             raise DataSetGeneratorError("num_steps must be a positive integer")
+
+         if batch_size <= 0:
+             raise DataSetGeneratorError("batch_size must be a positive integer")
+
+         if topic_model and len(topic_model.get_all_paths()) == 0:
+             raise DataSetGeneratorError(
+                 "Topic model has no paths. Ensure the topic tree was built successfully."
+             )
+
+     def _prepare_topic_paths(
+         self,
+         num_steps: int,
+         batch_size: int,
+         topic_model: "TopicModel | None" = None,
+     ) -> tuple[list | None, int]:
+         """Prepare and validate topic paths for data generation."""
+         topic_paths = None
+         if topic_model is not None:
+             topic_paths = topic_model.get_all_paths()
+             total_paths = len(topic_paths)
+             required_samples = num_steps * batch_size
+
+             if required_samples > total_paths:
+                 # Provide detailed error with recommendations
+                 max_steps_for_batch = total_paths // batch_size
+                 max_batch_for_steps = total_paths // num_steps if num_steps > 0 else total_paths
+
+                 error_msg = (
+                     f"Insufficient topic paths for dataset generation:\n"
+                     f"  • Available paths: {total_paths}\n"
+                     f"  • Requested samples: {required_samples} ({num_steps} steps × {batch_size} batch size)\n"
+                     f"  • Shortfall: {required_samples - total_paths} samples\n\n"
+                     f"Recommendations:\n"
+                     f"  • Reduce --num-steps to {max_steps_for_batch} (with current batch size {batch_size})\n"
+                     f"  • Reduce --batch-size to {max_batch_for_steps} (with current {num_steps} steps)\n"
+                     f"  • Increase topic tree/graph depth or degree to generate more paths"
+                 )
+                 raise DataSetGeneratorError(error_msg)
+
+             # Bandit: not a security function
+             topic_paths = random.sample(topic_paths, required_samples)  # nosec
+             num_steps = math.ceil(len(topic_paths) / batch_size)
+
+         return topic_paths, num_steps
+
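
The sizing check above is plain budget arithmetic: num_steps × batch_size must not exceed the number of available topic paths, and the error message's recommendations are the two integer divisions. A worked example:

```python
# Worked example of the budget check in _prepare_topic_paths.
total_paths = 25                              # len(topic_model.get_all_paths())
num_steps, batch_size = 10, 4
required_samples = num_steps * batch_size     # 40 > 25 -> DataSetGeneratorError
max_steps_for_batch = total_paths // batch_size   # 6: feasible --num-steps
max_batch_for_steps = total_paths // num_steps    # 2: feasible --batch-size
```
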
+     def _generate_batch_prompts(
+         self,
+         batch_size: int,
+         start_idx: int,
+         topic_paths: list,
+         data_creation_prompt: str,
+         num_example_demonstrations: int,
+     ) -> tuple[list[str], list[list[str] | None]]:
+         """Generate prompts for a batch and return the associated paths used.
+
+         Returns:
+             (prompts, used_paths) where used_paths aligns with prompts order.
+         """
+         prompts: list[str] = []
+         used_paths: list[list[str] | None] = []
+         for i in range(batch_size):
+             path = None
+             if topic_paths:
+                 current_idx = start_idx + i
+                 if current_idx < len(topic_paths):
+                     path = topic_paths[current_idx]
+                 else:
+                     break
+
+             sample_prompt = self.build_prompt(
+                 data_creation_prompt=data_creation_prompt,
+                 num_example_demonstrations=num_example_demonstrations,
+                 subtopics_list=path,
+             )
+             prompts.append(sample_prompt)
+             used_paths.append(path)
+         return prompts, used_paths
+
+     def _get_minimal_schema(self) -> type:
+         """Get the conversation schema for the current config."""
+         return get_conversation_schema(self.config.conversation_type)
+
+     def _emit_retry(
+         self,
+         sample_idx: int,
+         attempt: int,
+         max_attempts: int,
+         error: Exception | str,
+     ) -> None:
+         """Emit a retry event if a progress reporter is attached.
+
+         Args:
+             sample_idx: 0-based sample index (will be converted to 1-based)
+             attempt: 0-based attempt number (will be converted to 1-based)
+             max_attempts: Total number of attempts allowed
+             error: The error that triggered the retry
+         """
+         if self.progress_reporter:
+             self.progress_reporter.emit_retry(
+                 sample_idx=sample_idx + 1,
+                 attempt=attempt + 1,
+                 max_attempts=max_attempts,
+                 error_summary=str(error)[:100],
+             )
+
+     async def _generate_structured_samples_async(
+         self,
+         prompts: list[str],
+         include_sys_msg: bool,
+         start_sample_idx: int = 0,
+         paths_for_batch: list[list[str] | None] | None = None,
+     ) -> tuple[list, list]:
+         """Generate structured samples using builder pattern.
+
+         Args:
+             prompts: List of topic prompts to generate samples for
+             include_sys_msg: Whether to include system message in output
+             start_sample_idx: Starting sample index for progress reporting
+             paths_for_batch: Topic paths aligned with prompts, for progress events
+
+         Returns:
+             Tuple of (successful samples, failed responses)
+         """
+
+         samples = []
+         failed_responses = []
+
+         # Create config with overridden sys_msg if needed
+         config = self.config
+         if include_sys_msg != self.config.sys_msg:
+             # Create a copy of config with sys_msg overridden
+             config = self.config.model_copy(update={"sys_msg": include_sys_msg})
+
+         async def _generate_with_retry(
+             prompt: str, sample_idx: int, path_info: list[str] | None
+         ) -> tuple[bool, Exception | Conversation]:
+             """Generate a single sample with per-sample retry for validation errors.
+
+             Each parallel task gets its own builder instance to avoid Spin session
+             conflicts when running samples concurrently (batch_size > 1).
+             """
+             # Create a fresh builder for this sample to avoid session conflicts
+             # when running in parallel batches
+             builder = ConversationBuilderFactory.create(
+                 config=config,
+                 llm=self.llm_client,
+                 tool_registry=self.tool_registry,
+                 progress_reporter=self.progress_reporter,
+             )
+
+             last_error: Exception | None = None
+             error_feedback: str | None = None
+             max_attempts = self.config.sample_retries + 1
+             logger.debug(
+                 "Sample %d: max_attempts=%d (sample_retries=%d)",
+                 sample_idx + 1,
+                 max_attempts,
+                 self.config.sample_retries,
+             )
+
+             for attempt in range(max_attempts):
+                 # Notify progress reporter about which sample we're working on
+                 if self.progress_reporter:
+                     retry_suffix = f" (retry {attempt})" if attempt > 0 else ""
+                     self.progress_reporter.emit_step_start(
+                         f"Generating sample {sample_idx + 1}{retry_suffix}",
+                         sample_idx=sample_idx + 1,
+                         topic_path=path_info,
+                     )
+
+                 try:
+                     # Builder handles all generation complexity
+                     # Pass error feedback from previous attempt if this is a retry
+                     conversation = await builder.generate(prompt, error_feedback)
+                 except Exception as e:  # noqa: BLE001
+                     last_error = e
+                     is_validation = is_validation_error(e)
+                     can_retry = attempt < self.config.sample_retries
+                     logger.debug(
+                         "Sample %d error: is_validation=%s, can_retry=%s, attempt=%d/%d, error=%s",
+                         sample_idx + 1,
+                         is_validation,
+                         can_retry,
+                         attempt + 1,
+                         self.config.sample_retries + 1,
+                         str(e)[:200],
+                     )
+                     # Only retry validation errors, not API/network errors
+                     if is_validation and can_retry:
+                         # Extract error message for feedback to the model
+                         error_feedback = str(e)
+                         self._emit_retry(sample_idx, attempt, max_attempts, e)
+                         continue
+                     # Non-retryable error or exhausted retries
+                     return False, last_error or Exception("Sample generation failed")
+
+                 else:
+                     # Validate tool execution count for agent modes
+                     if self.config.agent_mode is not None:
+                         if (
+                             not conversation.tool_context
+                             or not conversation.tool_context.executions
+                         ):
+                             last_error = ValueError(
+                                 "Agent mode requires at least one tool execution"
+                             )
+                             if attempt < self.config.sample_retries:
+                                 self._emit_retry(sample_idx, attempt, max_attempts, last_error)
+                                 continue
+                             return False, last_error or Exception("Sample generation failed")
+
+                         num_executions = len(conversation.tool_context.executions)
+                         if num_executions > self.config.max_tools_per_query:
+                             if self.config.max_tools_strict:
+                                 # Strict mode: discard entire sample
+                                 last_error = ValueError(
+                                     f"Sample has {num_executions} tool executions, "
+                                     f"exceeds limit of {self.config.max_tools_per_query}"
+                                 )
+                                 if attempt < self.config.sample_retries:
+                                     self._emit_retry(sample_idx, attempt, max_attempts, last_error)
+                                     continue
+                                 return False, last_error or Exception("Sample generation failed")
+                             # Non-strict mode: truncate to limit and keep sample
+                             conversation.tool_context.executions = (
+                                 conversation.tool_context.executions[
+                                     : self.config.max_tools_per_query
+                                 ]
+                             )
+
+                     return True, conversation
+
+             return False, last_error or Exception("Sample generation failed")
+
+         # Generate all samples concurrently with sample indices
+         tasks = []
+         for idx, prompt in enumerate(prompts):
+             path_info = None
+             if paths_for_batch and idx < len(paths_for_batch):
+                 path_info = paths_for_batch[idx]
+             tasks.append(
+                 asyncio.create_task(_generate_with_retry(prompt, start_sample_idx + idx, path_info))
+             )
+         results = await asyncio.gather(*tasks)
+
+         for idx, (success, payload) in enumerate(results):
+             if success:
+                 samples.append(payload)
+             else:
+                 error = payload
+                 error_msg = f"Generation failed: {error}"
+                 # Build failure record with raw content if available
+                 failure_record = {"error": error_msg}
+                 if isinstance(error, Exception):
+                     context = getattr(error, "context", None)
+                     if isinstance(context, dict) and "raw_content" in context:
+                         failure_record["raw_content"] = context["raw_content"]
+                 failed_responses.append(failure_record)
+                 failure_type = self.analyze_failure(
+                     str(error), error=error if isinstance(error, Exception) else None
+                 )
+                 self.failure_analysis[failure_type].append(error_msg)
+
+                 # Classify and emit error to progress reporter
+                 classified = classify_error(
+                     error if isinstance(error, Exception) else str(error),
+                     provider=self.provider,
+                     context={"error_type": failure_type},
+                 )
+                 if self.progress_reporter:
+                     self.progress_reporter.emit_error(
+                         classified,
+                         sample_idx=start_sample_idx + idx,
+                     )
+
+         return samples, failed_responses
+
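
Two retry details in the method above are easy to misread: sample_retries counts retries rather than attempts, and only failures recognized by is_validation_error are retried (with the error text fed back to the model); API and network errors fail the sample immediately. In short:

```python
# Retry budget sketch for _generate_with_retry (values illustrative).
sample_retries = 2                      # config value
max_attempts = sample_retries + 1       # 3 total attempts per sample
# attempt 1: validation error -> error_feedback = str(e), retry
# attempt 2: validation error -> retry again
# attempt 3: any failure -> sample recorded in failed_responses
```
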
+     def analyze_failure(self, response_content: str, error: Exception | None = None) -> str:
+         """Analyze the failure reason for a sample."""
+         if error:
+             error_str = str(error)
+             if "schema" in error_str.lower():
+                 return "invalid_schema"
+             if any(api_err in error_str.lower() for api_err in API_ERROR_INDICATORS):
+                 return "api_errors"
+             return "other_errors"
+
+         if not response_content or response_content.isspace():
+             return "empty_responses"
+
+         # Check if response seems to be attempting JSON but failing
+         if any(char in response_content for char in "{}[]"):
+             return "json_parsing_errors"
+         return "malformed_responses"
+
+     def summarize_failures(self) -> dict:
+         """Generate a summary of all failures."""
+         summary = {
+             "total_failures": len(self.failed_samples),
+             "failure_types": {k: len(v) for k, v in self.failure_analysis.items()},
+             "failure_examples": {},
+         }
+
+         # Add example failures for each category, keyed by category name
+         # (print_failure_summary looks entries up with .get(failure_type))
+         for category, failures in self.failure_analysis.items():
+             if failures:
+                 # Get up to 3 examples for each category
+                 examples = failures[:3]
+                 summary["failure_examples"][category] = [
+                     str(ex)[:200] + "..."
+                     if len(str(ex)) > 200  # noqa: PLR2004
+                     else str(ex)
+                     for ex in examples
+                 ]
+         return summary
+
+     def create_data(
+         self,
+         num_steps: int | None = None,
+         num_example_demonstrations: int = 3,
+         batch_size: int = 10,
+         topic_model: TopicModel | None = None,
+         model_name: str | None = None,
+         sys_msg: bool | None = None,
+     ):
+         ensure_not_running_loop("DataSetGenerator.create_data")
+         return asyncio.run(
+             self.create_data_async(
+                 num_steps=num_steps,
+                 num_example_demonstrations=num_example_demonstrations,
+                 batch_size=batch_size,
+                 topic_model=topic_model,
+                 model_name=model_name,
+                 sys_msg=sys_msg,
+             )
+         )
+
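
A minimal synchronous usage sketch: create_data wraps create_data_async in asyncio.run, so it must be called outside any running event loop (that is what ensure_not_running_loop guards). The topic_model here is assumed to be an already-built tree or graph from this package:

```python
# Sketch: blocking generation run. Assumes `generator` from the earlier
# sketch and an already-built topic model (a TopicModel instance).
dataset = generator.create_data(
    num_steps=5,
    batch_size=2,
    topic_model=topic_model,
    sys_msg=True,               # include the dataset system prompt in samples
)
print(len(dataset))             # a HuggingFace Dataset of generated samples
generator.save_dataset("dataset.jsonl")
```
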
+     def create_data_with_events(
+         self,
+         num_steps: int | None = None,
+         num_example_demonstrations: int = 3,
+         batch_size: int = 10,
+         topic_model: TopicModel | None = None,
+         model_name: str | None = None,
+         sys_msg: bool | None = None,
+     ):
+         ensure_not_running_loop("DataSetGenerator.create_data_with_events")
+
+         async def _async_generator() -> AsyncGenerator[dict | HFDataset, None]:
+             async for event in self.create_data_with_events_async(
+                 num_steps=num_steps,
+                 num_example_demonstrations=num_example_demonstrations,
+                 batch_size=batch_size,
+                 topic_model=topic_model,
+                 model_name=model_name,
+                 sys_msg=sys_msg,
+             ):
+                 yield event
+
+         agen = _async_generator()
+
+         def _sync_generator():
+             loop = asyncio.new_event_loop()
+             try:
+                 while True:
+                     try:
+                         event = loop.run_until_complete(agen.__anext__())
+                     except StopAsyncIteration:
+                         break
+                     else:
+                         yield event
+             finally:
+                 loop.run_until_complete(agen.aclose())
+                 loop.close()
+
+         return _sync_generator()
+
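
Consuming the synchronous wrapper looks roughly like the sketch below: dict events report progress, and the final yield is the finished HuggingFace Dataset (event keys are those emitted by _run_generation_loop_async further down):

```python
# Sketch: driving generation with streamed progress events.
dataset = None
for event in generator.create_data_with_events(
    num_steps=5, batch_size=2, topic_model=topic_model
):
    if isinstance(event, dict):
        if event["event"] == "step_complete":
            print(f"step {event['step']}: {event['samples_generated']} samples")
    else:
        dataset = event         # the HFDataset is yielded last
```
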
+     async def create_data_async(
+         self,
+         num_steps: int | None = None,
+         num_example_demonstrations: int = 3,
+         batch_size: int = 10,
+         topic_model: TopicModel | None = None,
+         model_name: str | None = None,
+         sys_msg: bool | None = None,
+     ) -> HFDataset:
+         if num_steps is None:
+             num_steps = 1
+
+         self._validate_create_data_params(num_steps, batch_size, topic_model)
+
+         if model_name:
+             self.model_name = model_name.strip()
+
+         if not self.model_name:
+             raise DataSetGeneratorError("model_name must be a non-empty string")
+
+         include_sys_msg = sys_msg if sys_msg is not None else self.config.sys_msg
+
+         topic_paths, num_steps = self._prepare_topic_paths(num_steps, batch_size, topic_model)
+
+         total_samples = num_steps * batch_size
+         data_creation_prompt = self._get_cot_prompt_template()
+
+         final_result: HFDataset | dict | None = None
+         async for event in self._run_generation_loop_async(
+             num_steps=num_steps,
+             batch_size=batch_size,
+             total_samples=total_samples,
+             topic_paths=topic_paths or [],
+             data_creation_prompt=data_creation_prompt,
+             num_example_demonstrations=num_example_demonstrations,
+             include_sys_msg=include_sys_msg,
+         ):
+             final_result = event
+
+         if isinstance(final_result, HFDataset):
+             trace(
+                 "dataset_created",
+                 {
+                     "provider": self.provider,
+                     "model_name": self.model_name,
+                     "conversation_type": self.config.conversation_type,
+                     "samples_count": len(final_result),
+                     "failed_samples": len(self.failed_samples),
+                     "success": len(final_result) > 0,
+                 },
+             )
+             return final_result
+
+         msg = "Dataset generation failed"
+         raise DataSetGeneratorError(msg)
+
770
+ async def create_data_with_events_async(
771
+ self,
772
+ num_steps: int | None = None,
773
+ num_example_demonstrations: int = 3,
774
+ batch_size: int = 10,
775
+ topic_model: TopicModel | None = None,
776
+ model_name: str | None = None,
777
+ sys_msg: bool | None = None,
778
+ ) -> AsyncGenerator[dict | HFDataset, None]:
779
+ if num_steps is None:
780
+ num_steps = 1
781
+
782
+ self._validate_create_data_params(num_steps, batch_size, topic_model)
783
+
784
+ if model_name:
785
+ self.model_name = model_name.strip()
786
+
787
+ if not self.model_name:
788
+ raise DataSetGeneratorError("")
789
+
790
+ include_sys_msg = sys_msg if sys_msg is not None else self.config.sys_msg
791
+
792
+ topic_paths, num_steps = self._prepare_topic_paths(num_steps, batch_size, topic_model)
793
+
794
+ total_samples = num_steps * batch_size
795
+ data_creation_prompt = self._get_cot_prompt_template()
796
+
797
+ root_topic_prompt = None
798
+ topic_model_type = None
799
+ if topic_model is not None:
800
+ root_topic_prompt = getattr(topic_model, "topic_prompt", None)
801
+ topic_model_type = type(topic_model).__name__.lower()
802
+
803
+ async for event in self._run_generation_loop_async(
804
+ num_steps=num_steps,
805
+ batch_size=batch_size,
806
+ total_samples=total_samples,
807
+ topic_paths=topic_paths or [],
808
+ data_creation_prompt=data_creation_prompt,
809
+ num_example_demonstrations=num_example_demonstrations,
810
+ include_sys_msg=include_sys_msg,
811
+ root_topic_prompt=root_topic_prompt,
812
+ topic_model_type=topic_model_type,
813
+ ):
814
+ yield event
815
+
+     async def _run_generation_loop_async(  # noqa: PLR0912
+         self,
+         num_steps: int,
+         batch_size: int,
+         total_samples: int,
+         topic_paths: list,
+         data_creation_prompt: str,
+         num_example_demonstrations: int,
+         include_sys_msg: bool,
+         root_topic_prompt: str | None = None,
+         topic_model_type: str | None = None,
+     ) -> AsyncGenerator[dict | HFDataset, None]:
+         """Run the main generation loop yielding progress events."""
+         try:
+             yield {
+                 "event": "generation_start",
+                 "model_name": self.model_name,
+                 "num_steps": num_steps,
+                 "batch_size": batch_size,
+                 "total_samples": total_samples,
+                 "root_topic_prompt": root_topic_prompt,
+                 "topic_model_type": topic_model_type,
+             }
+
+             for step in range(num_steps):
+                 yield {
+                     "event": "step_start",
+                     "step": step + 1,
+                     "total_steps": num_steps,
+                 }
+
+                 start_idx = step * batch_size
+                 prompts, used_paths = self._generate_batch_prompts(
+                     batch_size,
+                     start_idx,
+                     topic_paths,
+                     data_creation_prompt,
+                     num_example_demonstrations,
+                 )
+
+                 failed_before = len(self.failed_samples)
+
+                 success, samples_generated = await self._process_batch_with_retries_async(
+                     prompts, include_sys_msg, start_idx, used_paths
+                 )
+
+                 failed_in_batch = len(self.failed_samples) - failed_before
+                 failure_reasons = []
+                 if failed_in_batch > 0 and self.failed_samples:
+                     recent_failures = self.failed_samples[-failed_in_batch:]
+                     failure_reasons = recent_failures[:3]
+
+                 yield {
+                     "event": "step_complete",
+                     "step": step + 1,
+                     "samples_generated": samples_generated,
+                     "success": success,
+                     "failed_in_step": failed_in_batch,
+                     "failure_reasons": failure_reasons,
+                 }
+
+                 if not success:
+                     yield {
+                         "event": "step_failed",
+                         "step": step + 1,
+                         "message": f"Failed to process batch {step + 1} after all retries",
+                     }
+
+             yield {
+                 "event": "generation_complete",
+                 "total_samples": len(self._samples),
+                 "failed_samples": len(self.failed_samples),
+             }
+
+         except KeyboardInterrupt:
+             yield {
+                 "event": "generation_interrupted",
+                 "message": "Generation interrupted by user.",
+             }
+             self.print_failure_summary()
+             self._save_samples_to_file(INTERRUPTED_DATASET_FILENAME)
+
+         except Exception as e:  # noqa: BLE001
+             yield {"event": "generation_error", "error": str(e)}
+             self.print_failure_summary()
+             self._save_samples_to_file(ERROR_DATASET_FILENAME)
+             raise DataSetGeneratorError("Dataset generation failed") from e
+
+         yield (HFDataset.from_list(self._samples) if self._samples else HFDataset.from_list([]))
+
+     async def _process_batch_with_retries_async(
+         self,
+         prompts: list[str],
+         include_sys_msg: bool,
+         start_sample_idx: int = 0,
+         paths_for_batch: list[list[str] | None] | None = None,
+     ) -> tuple[bool, int]:
+         """Process a batch with retry logic."""
+         for attempt in range(self.config.max_retries):
+             try:
+                 samples, failed_responses = await self._generate_structured_samples_async(
+                     prompts, include_sys_msg, start_sample_idx, paths_for_batch
+                 )
+
+                 # Update failed samples
+                 self.failed_samples.extend(failed_responses)
+
+                 if samples:
+                     # Convert Pydantic models to dicts and add to samples list
+                     sample_dicts = [s.model_dump(exclude_none=True) for s in samples]
+                     self._samples.extend(sample_dicts)
+                     return True, len(samples)  # Success - exit retry loop
+
+             except DataSetGeneratorError as e:
+                 # Authentication and API errors are now wrapped in DataSetGeneratorError
+                 error_str = str(e).lower()
+                 if any(
+                     keyword in error_str
+                     for keyword in [
+                         "api_key",
+                         "api key",
+                         "authentication",
+                         "unauthorized",
+                     ]
+                 ):
+                     error_msg = f"Authentication failed for provider '{self.provider}'. Please set the required API key environment variable."
+                     self.failure_analysis["authentication_error"].append(error_msg)
+                 else:
+                     error_msg = f"API error for provider '{self.provider}': {str(e)[:100]}..."
+                     self.failure_analysis["api_errors"].append(error_msg)
+
+                 self.failed_samples.append(error_msg)
+                 logger.exception("API error: %s", error_msg)
+                 return False, 0  # Don't retry authentication/API errors
+             except Exception as e:
+                 if attempt == self.config.max_retries - 1:
+                     self.failed_samples.append(str(e))
+                     failure_type = self.analyze_failure(str(e), error=e)
+                     self.failure_analysis[failure_type].append(str(e))
+                     return False, 0
+             else:
+                 # If no exception and no samples, return False, 0
+                 return False, 0
+
+         return False, 0
+
+     def print_failure_summary(self):
+         """Print a detailed summary of all failures."""
+         summary = self.summarize_failures()
+
+         print("\n=== Failure Analysis Summary ===")
+         print(f"Total Failed Samples: {summary['total_failures']}")
+         print("\nFailure Types Breakdown:")
+         for failure_type, count in summary["failure_types"].items():
+             if count > 0:
+                 print(f"\n{failure_type.replace('_', ' ').title()}: {count}")
+                 if failure_type in summary["failure_examples"]:
+                     print("Example failures:")
+                     for i, example in enumerate(
+                         summary["failure_examples"].get(failure_type, []), 1
+                     ):
+                         print(f"  {i}. {example}")
+         print("\n=============================")
+
+     def build_prompt(
+         self,
+         data_creation_prompt: str,
+         num_example_demonstrations: int,
+         subtopics_list: list[str] | None = None,
+     ) -> str:
+         prompt = data_creation_prompt.replace("{{{{system_prompt}}}}", self.generation_prompt)
+         prompt = prompt.replace("{{{{instructions}}}}", self.build_custom_instructions_text())
+         prompt = prompt.replace(
+             "{{{{examples}}}}", self.build_examples_text(num_example_demonstrations)
+         )
+         return prompt.replace("{{{{subtopics}}}}", self.build_subtopics_text(subtopics_list))
+
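
build_prompt fills its template by literal string replacement, so a custom data_creation_prompt must carry the quadruple-brace placeholders verbatim. An illustrative template (the text around the placeholders is made up):

```python
# Illustrative template showing the four placeholders build_prompt replaces.
# The quadruple braces are literal text, not Python str.format fields.
template = (
    "{{{{system_prompt}}}}\n\n"
    "{{{{instructions}}}}\n"
    "{{{{examples}}}}\n"
    "{{{{subtopics}}}}\n"
    "Write one training sample as JSON."
)
prompt = generator.build_prompt(template, num_example_demonstrations=3)
```
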
+     def build_system_prompt(self):
+         """Return the original system prompt for dataset inclusion."""
+         return self.dataset_system_prompt
+
+     def build_custom_instructions_text(self) -> str:
+         if self.config.instructions is None or self.config.instructions == "":
+             return ""
+         return f"\nHere are additional instructions:\n<instructions>\n{self.config.instructions}\n</instructions>\n"
+
+     def build_examples_text(self, num_example_demonstrations: int):
+         if self.config.example_data is None or num_example_demonstrations == 0:
+             return ""
+         # Bandit: not a security function
+         # HF Dataset supports len() and indexing, convert to list for sampling
+         example_list = list(self.config.example_data)
+         examples = random.sample(example_list, min(num_example_demonstrations, len(example_list)))  # nosec
+         examples_text = "\n".join(f"Example {i + 1}: \n\n{ex}\n" for i, ex in enumerate(examples))
+         return f"\nHere are output examples:\n<examples>\n{examples_text}\n</examples>\n"
+
+     def build_tools_text(self) -> str:
+         """Build formatted tools text for XLAM multi-turn prompts."""
+         if not self.tool_registry:
+             return "No tools available"
+
+         tools_text = []
+         for tool in self.tool_registry.tools:
+             params_text = []
+             for param in tool.parameters:
+                 req = " (required)" if param.required else " (optional)"
+                 params_text.append(f"  - {param.name} ({param.type}){req}: {param.description}")
+
+             tool_text = f"• {tool.name}: {tool.description}\n  Parameters:\n" + "\n".join(
+                 params_text
+             )
+             tools_text.append(tool_text)
+
+         return "\n\n".join(tools_text)
+
+     def build_subtopics_text(self, subtopic_list: list[str] | None):
+         if subtopic_list is None:
+             return ""
+         return f"\nLastly, the topic of the training data should be related to the following subtopics: {' -> '.join(subtopic_list)}"
+
+     def _get_cot_prompt_template(self) -> str:  # noqa: PLR0911
+         """Get the appropriate prompt template based on modular configuration."""
+         # Handle basic conversations
+         if self.config.conversation_type == "basic":
+             return CONVERSATION_GENERATION_PROMPT
+
+         # Handle chain of thought conversations
+         if self.config.conversation_type == "chain_of_thought":
+             # Agent mode with tools - use agent prompts
+             if self.config.agent_mode == "single_turn" and self.tool_registry:
+                 # Use agent prompt for single-turn tool calling
+                 return (
+                     AgentPromptBuilder.build_tool_context_prompt(
+                         self.tool_registry,
+                         max_tools_per_query=self.config.max_tools_per_query,
+                     )
+                     or AGENT_COT_TOOLS_PROMPT
+                 )
+
+             if self.config.agent_mode == "multi_turn" and self.tool_registry:
+                 # Standard multi-turn agent
+                 return (
+                     AgentPromptBuilder.build_multi_turn_context_prompt(
+                         self.tool_registry,
+                         max_tools_per_query=self.config.max_tools_per_query,
+                     )
+                     or AGENT_COT_MULTI_TURN_PROMPT
+                 )
+
+             # Non-agent CoT - select based on reasoning style
+             if self.config.reasoning_style == "freetext":
+                 return FREETEXT_COT_PROMPT
+             if self.config.reasoning_style == "agent":
+                 return STRUCTURED_COT_PROMPT
+
+         # Fallback to basic conversation prompt
+         return CONVERSATION_GENERATION_PROMPT
+
+     def _save_samples_to_file(self, save_path: str):
+         """Save the current samples to a JSONL file."""
+
+         with open(save_path, "w") as f:
+             for sample in self._samples:
+                 f.write(json.dumps(sample, separators=(",", ":")) + "\n")
+
+     def save_dataset(self, save_path: str):
+         """Save the dataset to a JSONL file."""
+         self._save_samples_to_file(save_path)
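
Since _save_samples_to_file writes one compact JSON object per line, a saved (or interrupted/error-dump) dataset can be reloaded with a few lines — a small sketch, assuming the file name used earlier:

```python
import json

# Sketch: reload a JSONL dataset written by save_dataset (or the
# interrupted/error dumps). One JSON object per line.
with open("dataset.jsonl") as f:
    samples = [json.loads(line) for line in f]
print(len(samples), "samples reloaded")
```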