PyPI - DeepFabric - Versions diffs - 4.9.0__py3-none-any.whl → 4.10.0__py3-none-any.whl - Mend

DeepFabric: deepfabric-4.9.0-py3-none-any.whl → deepfabric-4.10.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (22)

deepfabric/builders.py +7 -21
deepfabric/builders_agent.py +0 -542
deepfabric/cli.py +505 -74
deepfabric/config.py +57 -73
deepfabric/config_manager.py +8 -6
deepfabric/constants.py +6 -0
deepfabric/dataset_manager.py +107 -11
deepfabric/evaluation/parser.py +7 -7
deepfabric/generator.py +656 -103
deepfabric/graph.py +46 -1
deepfabric/prompts.py +0 -39
deepfabric/schemas.py +4 -3
deepfabric/topic_model.py +32 -0
deepfabric/tree.py +23 -1
deepfabric/tui.py +66 -21
deepfabric/utils.py +184 -0
deepfabric/validation.py +47 -77
{deepfabric-4.9.0.dist-info → deepfabric-4.10.0.dist-info}/METADATA +5 -6
{deepfabric-4.9.0.dist-info → deepfabric-4.10.0.dist-info}/RECORD +22 -22
{deepfabric-4.9.0.dist-info → deepfabric-4.10.0.dist-info}/WHEEL +0 -0
{deepfabric-4.9.0.dist-info → deepfabric-4.10.0.dist-info}/entry_points.txt +0 -0
{deepfabric-4.9.0.dist-info → deepfabric-4.10.0.dist-info}/licenses/LICENSE +0 -0

deepfabric/generator.py CHANGED

@@ -2,9 +2,12 @@ import asyncio
 import json
 import logging
 import math
+import os
 import random
 from collections.abc import AsyncGenerator
+from datetime import datetime, timezone
+from pathlib import Path
 from typing import TYPE_CHECKING, Any, Literal
 from datasets import Dataset as HFDataset
@@ -14,6 +17,10 @@ from .builders import ConversationBuilderFactory
 from .config import _normalize_reasoning_style
 from .constants import (
     API_ERROR_INDICATORS,
+    CHECKPOINT_FAILURES_SUFFIX,
+    CHECKPOINT_METADATA_SUFFIX,
+    CHECKPOINT_SAMPLES_SUFFIX,
+    CHECKPOINT_VERSION,
     DEFAULT_MAX_RETRIES,
     DEFAULT_REQUEST_TIMEOUT,
     DEFAULT_SAMPLE_RETRIES,
@@ -30,7 +37,6 @@ from .llm import LLMClient
 from .metrics import trace
 from .progress import ProgressReporter
 from .prompts import (
-    AGENT_COT_MULTI_TURN_PROMPT,
     AGENT_COT_TOOLS_PROMPT,
     CONVERSATION_GENERATION_PROMPT,
     FREETEXT_COT_PROMPT,
@@ -40,8 +46,8 @@ from .prompts import (
 from .schemas import Conversation, ToolRegistry, get_conversation_schema
 from .tools import BUILTIN_TOOL_REGISTRY
 from .tools.loader import load_tools_from_dict, load_tools_from_endpoint
-from .topic_model import TopicModel
-from .utils import ensure_not_running_loop, is_validation_error
+from .topic_model import TopicModel, TopicPath
+from .utils import ensure_not_running_loop, get_checkpoint_dir, is_validation_error
 # Handle circular import for type hints
 if TYPE_CHECKING:
@@ -143,12 +149,7 @@ class DataSetGeneratorConfig(BaseModel):
         """Normalize deprecated reasoning_style values."""
         return _normalize_reasoning_style(v)
-    agent_mode: Literal["single_turn", "multi_turn"] | None = Field(
-        default=None,
-        description="Agent mode: single_turn (one-shot tool use), multi_turn (extended agent conversations). Requires tools to be configured.",
-    )
-    # Tool configuration (used when agent_mode is enabled or for tool_calling)
+    # Tool configuration (used when tools are configured for agent mode)
     tool_components: dict[str, list[str]] = Field(
         default_factory=dict,
         description=(
@@ -194,28 +195,32 @@ class DataSetGeneratorConfig(BaseModel):
         description="Path for tool execution when using tools_endpoint (e.g., '/mock/execute'). Combined with spin_endpoint.",
     )
-    # Multi-turn configuration (used when agent_mode="multi_turn")
-    min_turns: int = Field(
-        default=2,
-        ge=1,
-        le=10,
-        description="Minimum number of conversation turns for multi-turn agent mode",
+    tool_inclusion_strategy: Literal["all", "used_only"] = Field(
+        default="used_only",
+        description="Which tools to include in each sample: 'all' includes full catalog, 'used_only' includes only tools actually called (recommended for training)",
     )
-    max_turns: int = Field(
-        default=4,
+    # Checkpoint configuration
+    checkpoint_interval: int | None = Field(
+        default=None,
         ge=1,
-        le=10,
-        description="Maximum number of conversation turns for multi-turn agent mode",
+        description="Save checkpoint every N samples. None disables checkpointing.",
     )
-    min_tool_calls: int = Field(
-        default=2,
-        ge=0,
-        le=20,
-        description="Minimum number of tool calls required before allowing early conversation conclusion",
+    checkpoint_path: str | None = Field(
+        default=None,
+        description="Directory to store checkpoint files. None uses fallback '.checkpoints'",
     )
-    tool_inclusion_strategy: Literal["all", "used_only"] = Field(
-        default="used_only",
-        description="Which tools to include in each sample: 'all' includes full catalog, 'used_only' includes only tools actually called (recommended for training)",
+    checkpoint_retry_failed: bool = Field(
+        default=False,
+        description="When resuming, retry previously failed samples",
+    )
+    output_save_as: str | None = Field(
+        default=None,
+        description="Output file path (used to derive checkpoint file names)",
+    )
+    topics_file: str | None = Field(
+        default=None,
+        description="Topics file path (stored in checkpoint metadata for auto-resume)",
     )
@@ -260,18 +265,27 @@ class DataSetGenerator:
         # Store generation prompt for content generation
         self.generation_prompt = self.config.generation_system_prompt
-        # Initialize tool registry when agent_mode is enabled or tools are configured
+        # Initialize tool registry when tools are configured (enables agent mode)
         self.tool_registry = None
-        if (
-            self.config.agent_mode is not None
-            or self.config.tool_components
-            or self.config.custom_tools
-        ):
+        if self.config.tool_components or self.config.custom_tools:
             self._initialize_tool_registry()
         # Progress reporter for streaming feedback (set by external callers)
         self.progress_reporter: ProgressReporter | None = None
+        # Checkpoint state
+        self._checkpoint_samples_since_save = 0
+        self._processed_ids: set[str] = set()  # Track processed topic IDs (UUIDs)
+        self._checkpoint_metadata_path: Path | None = None
+        self._checkpoint_samples_path: Path | None = None
+        self._checkpoint_failures_path: Path | None = None
+        # Memory optimization: track flushed counts for checkpoint mode
+        self._flushed_samples_count = 0
+        self._flushed_failures_count = 0
+        # Graceful stop flag - set by signal handler to stop at next checkpoint
+        self.stop_requested = False
     def _initialize_tool_registry(self):
         """Initialize tool registry from component configuration.
@@ -328,6 +342,400 @@ class DataSetGenerator:
         except Exception as e:  # noqa: BLE001
             raise DataSetGeneratorError(f"Failed to initialize tool registry: {str(e)}") from e
+    def _get_checkpoint_paths(self) -> tuple[Path, Path, Path]:
+        """Get checkpoint file paths based on output_save_as.
+        Returns:
+            Tuple of (metadata_path, samples_path, failures_path)
+        """
+        if not self.config.output_save_as:
+            raise DataSetGeneratorError(
+                "Cannot create checkpoint paths: output_save_as not configured"
+            )
+        # Create checkpoint directory if needed
+        # Use XDG-compliant fallback if checkpoint_path not resolved by CLI
+        checkpoint_dir = Path(self.config.checkpoint_path or get_checkpoint_dir(config_path=None))
+        checkpoint_dir.mkdir(parents=True, exist_ok=True)
+        # Derive checkpoint filenames from output filename
+        output_stem = Path(self.config.output_save_as).stem
+        metadata_path = checkpoint_dir / f"{output_stem}{CHECKPOINT_METADATA_SUFFIX}"
+        samples_path = checkpoint_dir / f"{output_stem}{CHECKPOINT_SAMPLES_SUFFIX}"
+        failures_path = checkpoint_dir / f"{output_stem}{CHECKPOINT_FAILURES_SUFFIX}"
+        return metadata_path, samples_path, failures_path
+    def _initialize_checkpoint_paths(self) -> None:
+        """Initialize checkpoint file paths if checkpointing is enabled."""
+        if self.config.checkpoint_interval is not None:
+            paths = self._get_checkpoint_paths()
+            self._checkpoint_metadata_path = paths[0]
+            self._checkpoint_samples_path = paths[1]
+            self._checkpoint_failures_path = paths[2]
+            logger.info(
+                "Checkpointing enabled: saving every %d samples to %s",
+                self.config.checkpoint_interval,
+                self._checkpoint_samples_path,
+            )
+    def _save_checkpoint(
+        self,
+        new_samples: list[dict],
+        new_failures: list[dict],
+        processed_topic_paths: list[TopicPath | None],
+        flush_memory: bool = True,
+    ) -> None:
+        """Save checkpoint data incrementally.
+        Args:
+            new_samples: New successful samples to append
+            new_failures: New failed samples to append
+            processed_topic_paths: TopicPath objects that were processed in this batch
+            flush_memory: If True, clear flushed samples from memory (memory optimization)
+        """
+        if self._checkpoint_samples_path is None:
+            return
+        # Append new samples to checkpoint file
+        if new_samples:
+            with open(self._checkpoint_samples_path, "a", encoding="utf-8") as f:
+                for sample in new_samples:
+                    f.write(json.dumps(sample, separators=(",", ":")) + "\n")
+        # Append new failures to failures file
+        if new_failures and self._checkpoint_failures_path:
+            with open(self._checkpoint_failures_path, "a", encoding="utf-8") as f:
+                for failure in new_failures:
+                    f.write(json.dumps(failure, separators=(",", ":")) + "\n")
+        # Track processed topic IDs
+        for topic_path in processed_topic_paths:
+            if topic_path is not None:
+                self._processed_ids.add(topic_path.topic_id)
+        # Memory optimization: track flushed counts and clear in-memory lists
+        # Must happen BEFORE saving metadata so counts are accurate
+        if flush_memory:
+            self._flushed_samples_count += len(new_samples)
+            self._flushed_failures_count += len(new_failures)
+            # Clear the in-memory lists since data is now on disk
+            self._samples.clear()
+            self.failed_samples.clear()
+        # Update metadata (after flush counts are updated)
+        self._save_checkpoint_metadata()
+        logger.debug(
+            "Checkpoint saved: %d samples, %d failures, %d total IDs processed (flushed=%s)",
+            len(new_samples),
+            len(new_failures),
+            len(self._processed_ids),
+            flush_memory,
+        )
+    def _save_checkpoint_metadata(self) -> None:
+        """Save checkpoint metadata file."""
+        if self._checkpoint_metadata_path is None:
+            return
+        # Total counts include both flushed (on disk) and in-memory samples
+        total_samples = self._flushed_samples_count + len(self._samples)
+        total_failures = self._flushed_failures_count + len(self.failed_samples)
+        metadata = {
+            "version": CHECKPOINT_VERSION,
+            "created_at": datetime.now(timezone.utc).isoformat(),
+            "provider": self.provider,
+            "model_name": self.model_name,
+            "conversation_type": self.config.conversation_type,
+            "reasoning_style": self.config.reasoning_style,
+            "total_samples": total_samples,
+            "total_failures": total_failures,
+            "processed_ids": list(self._processed_ids),
+            "checkpoint_interval": self.config.checkpoint_interval,
+            "topics_file": self.config.topics_file,
+        }
+        with open(self._checkpoint_metadata_path, "w", encoding="utf-8") as f:
+            json.dump(metadata, f, indent=2)
+    def _validate_checkpoint_compatibility(self, metadata: dict) -> None:
+        """Validate that current config is compatible with checkpoint.
+        Logs warnings for config mismatches but allows resumption.
+        Args:
+            metadata: Checkpoint metadata dictionary
+        """
+        mismatches: list[str] = []
+        # Check provider
+        checkpoint_provider = metadata.get("provider")
+        if checkpoint_provider and checkpoint_provider != self.provider:
+            mismatches.append(
+                f"provider: checkpoint={checkpoint_provider}, current={self.provider}"
+            )
+        # Check model
+        checkpoint_model = metadata.get("model_name")
+        if checkpoint_model and checkpoint_model != self.model_name:
+            mismatches.append(
+                f"model_name: checkpoint={checkpoint_model}, current={self.model_name}"
+            )
+        # Check conversation type
+        checkpoint_conv_type = metadata.get("conversation_type")
+        if checkpoint_conv_type and checkpoint_conv_type != self.config.conversation_type:
+            mismatches.append(
+                f"conversation_type: checkpoint={checkpoint_conv_type}, "
+                f"current={self.config.conversation_type}"
+            )
+        # Check reasoning style
+        checkpoint_reasoning = metadata.get("reasoning_style")
+        if checkpoint_reasoning and checkpoint_reasoning != self.config.reasoning_style:
+            mismatches.append(
+                f"reasoning_style: checkpoint={checkpoint_reasoning}, "
+                f"current={self.config.reasoning_style}"
+            )
+        if mismatches:
+            logger.warning(
+                "Config mismatch with checkpoint. Resuming may produce inconsistent results. "
+                "Differences: %s",
+                "; ".join(mismatches),
+            )
+    def _validate_checkpoint_integrity(self, metadata: dict) -> tuple[bool, str | None]:
+        """Validate checkpoint file integrity.
+        Checks that:
+        1. Metadata version is supported
+        2. Required metadata fields are present
+        3. Sample count in metadata matches actual file line count
+        4. Sample file contains valid JSON on each line
+        Args:
+            metadata: Checkpoint metadata dictionary
+        Returns:
+            Tuple of (is_valid, error_message). error_message is None if valid.
+        """
+        error_msg: str | None = None
+        # Check version
+        version = metadata.get("version")
+        if version is None:
+            error_msg = "Missing 'version' field in checkpoint metadata"
+        elif version != CHECKPOINT_VERSION:
+            error_msg = f"Unsupported checkpoint version: {version} (expected {CHECKPOINT_VERSION})"
+        # Check required fields
+        if error_msg is None:
+            required_fields = ["created_at", "total_samples", "processed_ids"]
+            for field in required_fields:
+                if field not in metadata:
+                    error_msg = f"Missing required field in checkpoint metadata: {field}"
+                    break
+        # Validate sample count matches file
+        if error_msg is None:
+            expected_samples = metadata.get("total_samples", 0)
+            if self._checkpoint_samples_path and self._checkpoint_samples_path.exists():
+                actual_count = 0
+                try:
+                    with open(self._checkpoint_samples_path, encoding="utf-8") as f:
+                        for line_num, raw_line in enumerate(f, 1):
+                            stripped = raw_line.strip()
+                            if stripped:
+                                try:
+                                    json.loads(stripped)
+                                    actual_count += 1
+                                except json.JSONDecodeError as e:
+                                    error_msg = f"Invalid JSON on line {line_num} of checkpoint samples: {e}"
+                                    break
+                except OSError as e:
+                    error_msg = f"Failed to read checkpoint samples file: {e}"
+                if error_msg is None and actual_count != expected_samples:
+                    error_msg = (
+                        f"Sample count mismatch: metadata says {expected_samples}, "
+                        f"file has {actual_count} samples"
+                    )
+            elif expected_samples > 0:
+                error_msg = f"Checkpoint metadata expects {expected_samples} samples but samples file missing"
+        return (error_msg is None, error_msg)
+    def has_checkpoint(self) -> bool:
+        """Check if a checkpoint exists without loading it.
+        Returns:
+            True if checkpoint metadata file exists, False otherwise
+        """
+        if self.config.checkpoint_interval is None:
+            return False
+        self._initialize_checkpoint_paths()
+        return (
+            self._checkpoint_metadata_path is not None
+            and self._checkpoint_metadata_path.exists()
+        )
+    def load_checkpoint(self, retry_failed: bool = False) -> bool:
+        """Load checkpoint data if it exists.
+        Args:
+            retry_failed: If True, remove failed IDs from processed set to retry them
+        Returns:
+            True if checkpoint was loaded, False if no checkpoint exists
+        """
+        if self.config.checkpoint_interval is None:
+            return False
+        self._initialize_checkpoint_paths()
+        if self._checkpoint_metadata_path is None or not self._checkpoint_metadata_path.exists():
+            return False
+        try:
+            # Load metadata
+            with open(self._checkpoint_metadata_path, encoding="utf-8") as f:
+                metadata = json.load(f)
+            # Validate checkpoint integrity
+            is_valid, error_msg = self._validate_checkpoint_integrity(metadata)
+            if not is_valid:
+                logger.error("Checkpoint integrity check failed: %s", error_msg)
+                return False
+            # Validate config compatibility
+            self._validate_checkpoint_compatibility(metadata)
+            # Restore processed IDs
+            self._processed_ids = set(metadata.get("processed_ids", []))
+            # Count existing samples (don't load into memory - they're already on disk)
+            # Memory optimization: track as flushed counts instead of loading into RAM
+            if self._checkpoint_samples_path and self._checkpoint_samples_path.exists():
+                sample_count = 0
+                with open(self._checkpoint_samples_path, encoding="utf-8") as f:
+                    for raw_line in f:
+                        if raw_line.strip():
+                            sample_count += 1
+                self._flushed_samples_count = sample_count
+            # Load failure IDs for retry logic (these are small)
+            failed_ids: set[str] = set()
+            if self._checkpoint_failures_path and self._checkpoint_failures_path.exists():
+                failure_count = 0
+                with open(self._checkpoint_failures_path, encoding="utf-8") as f:
+                    for raw_line in f:
+                        stripped = raw_line.strip()
+                        if stripped:
+                            failure = json.loads(stripped)
+                            failure_count += 1
+                            # Track the topic_id that failed for potential retry
+                            if "topic_id" in failure:
+                                failed_ids.add(failure["topic_id"])
+                self._flushed_failures_count = failure_count
+            # If retry_failed is True, remove failed IDs from processed set
+            # so they will be retried during generation
+            if retry_failed and failed_ids:
+                ids_to_retry = self._processed_ids & failed_ids
+                self._processed_ids -= ids_to_retry
+                # Clear failures file since we're retrying
+                if self._checkpoint_failures_path and self._checkpoint_failures_path.exists():
+                    os.remove(self._checkpoint_failures_path)
+                self._flushed_failures_count = 0
+                logger.info(
+                    "Retry mode: %d failed IDs will be retried",
+                    len(ids_to_retry),
+                )
+            logger.info(
+                "Loaded checkpoint: %d samples, %d failures, %d IDs processed",
+                self._flushed_samples_count,
+                self._flushed_failures_count,
+                len(self._processed_ids),
+            )
+        except Exception as e:  # noqa: BLE001
+            logger.warning("Failed to load checkpoint: %s", e)
+            return False
+        else:
+            return True
+    def clear_checkpoint(self) -> None:
+        """Remove checkpoint files."""
+        if self._checkpoint_metadata_path and self._checkpoint_metadata_path.exists():
+            os.remove(self._checkpoint_metadata_path)
+        if self._checkpoint_samples_path and self._checkpoint_samples_path.exists():
+            os.remove(self._checkpoint_samples_path)
+        if self._checkpoint_failures_path and self._checkpoint_failures_path.exists():
+            os.remove(self._checkpoint_failures_path)
+        self._processed_ids.clear()
+        self._flushed_samples_count = 0
+        self._flushed_failures_count = 0
+        logger.info("Checkpoint files cleared")
+    def _load_all_samples_from_checkpoint(self) -> list[dict]:
+        """Load all samples from checkpoint file.
+        Used at end of generation to build final dataset when memory
+        optimization has flushed samples to disk.
+        Returns:
+            List of all sample dictionaries from checkpoint file
+        """
+        all_samples: list[dict] = []
+        if self._checkpoint_samples_path and self._checkpoint_samples_path.exists():
+            with open(self._checkpoint_samples_path, encoding="utf-8") as f:
+                for raw_line in f:
+                    stripped = raw_line.strip()
+                    if stripped:
+                        all_samples.append(json.loads(stripped))
+        return all_samples
+    def get_all_failures(self) -> list[dict]:
+        """Get all failures including those flushed to checkpoint.
+        This combines in-memory failures with any that were flushed to the
+        checkpoint failures file during memory optimization.
+        Returns:
+            List of all failure dictionaries
+        """
+        all_failures: list[dict] = []
+        # First load from checkpoint file if it exists
+        if self._checkpoint_failures_path and self._checkpoint_failures_path.exists():
+            with open(self._checkpoint_failures_path, encoding="utf-8") as f:
+                for raw_line in f:
+                    stripped = raw_line.strip()
+                    if stripped:
+                        all_failures.append(json.loads(stripped))
+        # Then add any in-memory failures (if not yet flushed)
+        all_failures.extend(self.failed_samples)
+        return all_failures
+    def _is_topic_processed(self, topic_path: TopicPath | None) -> bool:
+        """Check if a topic has already been processed.
+        Args:
+            topic_path: TopicPath to check
+        Returns:
+            True if topic was already processed in a previous run
+        """
+        if topic_path is None:
+            return False
+        return topic_path.topic_id in self._processed_ids
     def _validate_create_data_params(
         self,
         num_steps: int,
@@ -351,34 +759,24 @@ class DataSetGenerator:
         num_steps: int,
         batch_size: int,
         topic_model: "TopicModel | None" = None,
-    ) -> tuple[list | None, int]:
+    ) -> tuple[list[TopicPath] | None, int]:
         """Prepare and validate topic paths for data generation."""
-        topic_paths = None
+        topic_paths: list[TopicPath] | None = None
         if topic_model is not None:
-            topic_paths = topic_model.get_all_paths()
+            topic_paths = topic_model.get_all_paths_with_ids()
             total_paths = len(topic_paths)
             required_samples = num_steps * batch_size
             if required_samples > total_paths:
-                # Provide detailed error with recommendations
-                max_steps_for_batch = total_paths // batch_size
-                max_batch_for_steps = total_paths // num_steps if num_steps > 0 else total_paths
-                error_msg = (
-                    f"Insufficient topic paths for dataset generation:\n"
-                    f"  • Available paths: {total_paths}\n"
-                    f"  • Requested samples: {required_samples} ({num_steps} steps × {batch_size} batch size)\n"
-                    f"  • Shortfall: {required_samples - total_paths} samples\n\n"
-                    f"Recommendations:\n"
-                    f"  • Reduce --num-steps to {max_steps_for_batch} (with current batch size {batch_size})\n"
-                    f"  • Reduce --batch-size to {max_batch_for_steps} (with current {num_steps} steps)\n"
-                    f"  • Increase topic tree/graph depth or degree to generate more paths"
-                )
-                raise DataSetGeneratorError(error_msg)
-            # Bandit: not a security function
-            topic_paths = random.sample(topic_paths, required_samples)  # nosec
-            num_steps = math.ceil(len(topic_paths) / batch_size)
+                # Cycle through topics to generate more samples than paths
+                # Each topic will be used multiple times for even coverage
+                multiplier = math.ceil(required_samples / total_paths)
+                topic_paths = (topic_paths * multiplier)[:required_samples]
+            elif required_samples < total_paths:
+                # Sample subset (percentage case or explicit count < total)
+                # Bandit: not a security function
+                topic_paths = random.sample(topic_paths, required_samples)  # nosec
+            # else: required_samples == total_paths - use all paths as-is
         return topic_paths, num_steps
@@ -386,23 +784,25 @@ class DataSetGenerator:
         self,
         batch_size: int,
         start_idx: int,
-        topic_paths: list,
+        topic_paths: list[TopicPath],
         data_creation_prompt: str,
         num_example_demonstrations: int,
-    ) -> tuple[list[str], list[list[str] | None]]:
-        """Generate prompts for a batch and return the associated paths used.
+    ) -> tuple[list[str], list[TopicPath | None]]:
+        """Generate prompts for a batch and return the associated TopicPaths used.
         Returns:
-            (prompts, used_paths) where used_paths aligns with prompts order.
+            (prompts, used_topic_paths) where used_topic_paths aligns with prompts order.
         """
         prompts: list[str] = []
-        used_paths: list[list[str] | None] = []
+        used_topic_paths: list[TopicPath | None] = []
         for i in range(batch_size):
-            path = None
+            topic_path: TopicPath | None = None
+            path: list[str] | None = None
             if topic_paths:
                 current_idx = start_idx + i
                 if current_idx < len(topic_paths):
-                    path = topic_paths[current_idx]
+                    topic_path = topic_paths[current_idx]
+                    path = topic_path.path
                 else:
                     break
@@ -412,8 +812,8 @@ class DataSetGenerator:
                 subtopics_list=path,
             )
             prompts.append(sample_prompt)
-            used_paths.append(path)
-        return prompts, used_paths
+            used_topic_paths.append(topic_path)
+        return prompts, used_topic_paths
     def _get_minimal_schema(self) -> type:
         """Get the conversation schema for the current config."""
@@ -447,7 +847,7 @@ class DataSetGenerator:
         prompts: list[str],
         include_sys_msg: bool,
         start_sample_idx: int = 0,
-        paths_for_batch: list[list[str] | None] | None = None,
+        topic_paths_for_batch: list[TopicPath | None] | None = None,
     ) -> tuple[list, list]:
         """Generate structured samples using builder pattern.
@@ -455,6 +855,7 @@ class DataSetGenerator:
             prompts: List of topic prompts to generate samples for
             include_sys_msg: Whether to include system message in output
             start_sample_idx: Starting sample index for progress reporting
+            topic_paths_for_batch: TopicPath objects for each sample (includes topic_id)
         Returns:
             Tuple of (successful samples, failed responses)
@@ -470,7 +871,7 @@ class DataSetGenerator:
             config = self.config.model_copy(update={"sys_msg": include_sys_msg})
         async def _generate_with_retry(
-            prompt: str, sample_idx: int, path_info: list[str] | None
+            prompt: str, sample_idx: int, topic_path_info: TopicPath | None
         ) -> tuple[bool, Exception | Conversation]:
             """Generate a single sample with per-sample retry for validation errors.
@@ -496,6 +897,9 @@ class DataSetGenerator:
                 self.config.sample_retries,
             )
+            # Extract path for progress reporting
+            path_info = topic_path_info.path if topic_path_info else None
             for attempt in range(max_attempts):
                 # Notify progress reporter about which sample we're working on
                 if self.progress_reporter:
@@ -533,8 +937,8 @@ class DataSetGenerator:
                     return False, last_error or Exception("Sample generation failed")
                 else:
-                    # Validate tool execution count for agent modes
-                    if self.config.agent_mode is not None:
+                    # Validate tool execution count for agent mode (when tools configured)
+                    if self.tool_registry is not None:
                         if (
                             not conversation.tool_context
                             or not conversation.tool_context.executions
@@ -566,6 +970,12 @@ class DataSetGenerator:
                                 ]
                             )
+                    # Add topic_id to conversation metadata for traceability
+                    if topic_path_info and hasattr(conversation, "metadata"):
+                        if conversation.metadata is None:
+                            conversation.metadata = {}
+                        conversation.metadata["topic_id"] = topic_path_info.topic_id
                     return True, conversation
             return False, last_error or Exception("Sample generation failed")
@@ -573,11 +983,13 @@ class DataSetGenerator:
         # Generate all samples concurrently with sample indices
         tasks = []
         for idx, prompt in enumerate(prompts):
-            path_info = None
-            if paths_for_batch and idx < len(paths_for_batch):
-                path_info = paths_for_batch[idx]
+            topic_path_info = None
+            if topic_paths_for_batch and idx < len(topic_paths_for_batch):
+                topic_path_info = topic_paths_for_batch[idx]
             tasks.append(
-                asyncio.create_task(_generate_with_retry(prompt, start_sample_idx + idx, path_info))
+                asyncio.create_task(
+                    _generate_with_retry(prompt, start_sample_idx + idx, topic_path_info)
+                )
             )
         results = await asyncio.gather(*tasks)
@@ -587,12 +999,18 @@ class DataSetGenerator:
             else:
                 error = payload
                 error_msg = f"Generation failed: {error}"
-                # Build failure record with raw content if available
-                failure_record = {"error": error_msg}
+                # Build failure record with raw content and topic_id if available
+                failure_record: dict[str, str | None] = {"error": error_msg}
                 if isinstance(error, Exception):
                     context = getattr(error, "context", None)
                     if isinstance(context, dict) and "raw_content" in context:
                         failure_record["raw_content"] = context["raw_content"]
+                # Include topic_id and path for checkpoint retry functionality
+                if topic_paths_for_batch and idx < len(topic_paths_for_batch):
+                    tp = topic_paths_for_batch[idx]
+                    if tp:
+                        failure_record["topic_id"] = tp.topic_id
+                        failure_record["path"] = " -> ".join(tp.path)
                 failed_responses.append(failure_record)
                 failure_type = self.analyze_failure(
                     str(error), error=error if isinstance(error, Exception) else None
@@ -817,12 +1235,12 @@ class DataSetGenerator:
         ):
             yield event
-    async def _run_generation_loop_async(  # noqa: PLR0912
+    async def _run_generation_loop_async(  # noqa: PLR0912, PLR0915
         self,
         num_steps: int,
         batch_size: int,
         total_samples: int,
-        topic_paths: list,
+        topic_paths: list[TopicPath],
         data_creation_prompt: str,
         num_example_demonstrations: int,
         include_sys_msg: bool,
@@ -830,6 +1248,16 @@ class DataSetGenerator:
         topic_model_type: str | None = None,
     ) -> AsyncGenerator[dict | HFDataset, None]:
         """Run the main generation loop yielding progress events."""
+        # Initialize checkpoint paths if checkpointing is enabled
+        if self.config.checkpoint_interval is not None:
+            self._initialize_checkpoint_paths()
+        # Track samples added in this run for checkpointing
+        samples_since_checkpoint = 0
+        samples_in_current_batch: list[dict] = []
+        failures_in_current_batch: list[dict] = []
+        topic_paths_in_current_batch: list[TopicPath | None] = []
         try:
             yield {
                 "event": "generation_start",
@@ -839,6 +1267,11 @@ class DataSetGenerator:
                 "total_samples": total_samples,
                 "root_topic_prompt": root_topic_prompt,
                 "topic_model_type": topic_model_type,
+                "resumed_from_checkpoint": len(self._processed_ids) > 0,
+                "previously_processed": len(self._processed_ids),
+                "resumed_samples": self._flushed_samples_count,
+                "resumed_failures": self._flushed_failures_count,
+                "checkpoint_enabled": self.config.checkpoint_interval is not None,
             }
             for step in range(num_steps):
@@ -849,7 +1282,7 @@ class DataSetGenerator:
                 }
                 start_idx = step * batch_size
-                prompts, used_paths = self._generate_batch_prompts(
+                prompts, used_topic_paths = self._generate_batch_prompts(
                     batch_size,
                     start_idx,
                     topic_paths,
@@ -857,17 +1290,85 @@ class DataSetGenerator:
                     num_example_demonstrations,
                 )
+                # Filter out already-processed topics when resuming
+                if self._processed_ids:
+                    filtered_prompts = []
+                    filtered_topic_paths: list[TopicPath | None] = []
+                    for prompt, tp in zip(prompts, used_topic_paths, strict=False):
+                        if not self._is_topic_processed(tp):
+                            filtered_prompts.append(prompt)
+                            filtered_topic_paths.append(tp)
+                    if not filtered_prompts:
+                        # All topics in this batch were already processed
+                        yield {
+                            "event": "step_complete",
+                            "step": step + 1,
+                            "samples_generated": 0,
+                            "success": True,
+                            "failed_in_step": 0,
+                            "failure_reasons": [],
+                            "skipped": len(prompts),
+                        }
+                        continue
+                    prompts = filtered_prompts
+                    used_topic_paths = filtered_topic_paths
                 failed_before = len(self.failed_samples)
+                samples_before = len(self._samples)
                 success, samples_generated = await self._process_batch_with_retries_async(
-                    prompts, include_sys_msg, start_idx, used_paths
+                    prompts, include_sys_msg, start_idx, used_topic_paths
                 )
+                # Track new samples and failures for checkpointing
+                new_samples = self._samples[samples_before:]
+                new_failures = self.failed_samples[failed_before:]
+                samples_in_current_batch.extend(new_samples)
+                failures_in_current_batch.extend(new_failures)
+                topic_paths_in_current_batch.extend(used_topic_paths)
+                samples_since_checkpoint += samples_generated
+                # Save checkpoint if we've reached the interval
+                if (
+                    self.config.checkpoint_interval is not None
+                    and samples_since_checkpoint >= self.config.checkpoint_interval
+                ):
+                    self._save_checkpoint(
+                        samples_in_current_batch,
+                        failures_in_current_batch,
+                        topic_paths_in_current_batch,
+                    )
+                    samples_in_current_batch = []
+                    failures_in_current_batch = []
+                    topic_paths_in_current_batch = []
+                    samples_since_checkpoint = 0
+                    yield {
+                        "event": "checkpoint_saved",
+                        "total_samples": self._flushed_samples_count,
+                        "total_failures": self._flushed_failures_count,
+                    }
+                    # Check for graceful stop request after checkpoint save
+                    if self.stop_requested:
+                        yield {
+                            "event": "generation_stopped",
+                            "message": "Stopped at checkpoint as requested",
+                            "total_samples": self._flushed_samples_count,
+                            "total_failures": self._flushed_failures_count,
+                        }
+                        return  # Exit generator cleanly
                 failed_in_batch = len(self.failed_samples) - failed_before
-                failure_reasons = []
+                failure_reasons: list[str] = []
                 if failed_in_batch > 0 and self.failed_samples:
                     recent_failures = self.failed_samples[-failed_in_batch:]
-                    failure_reasons = recent_failures[:3]
+                    for f in recent_failures[:3]:
+                        if isinstance(f, dict):
+                            failure_reasons.append(f.get("error", str(f)))
+                        else:
+                            failure_reasons.append(str(f))
                 yield {
                     "event": "step_complete",
@@ -885,13 +1386,42 @@ class DataSetGenerator:
                         "message": f"Failed to process batch {step + 1} after all retries",
                     }
+            # Save final checkpoint with any remaining samples
+            if self.config.checkpoint_interval is not None and (
+                samples_in_current_batch or failures_in_current_batch
+            ):
+                self._save_checkpoint(
+                    samples_in_current_batch,
+                    failures_in_current_batch,
+                    topic_paths_in_current_batch,
+                )
+                yield {
+                    "event": "checkpoint_saved",
+                    "total_samples": self._flushed_samples_count,
+                    "total_failures": self._flushed_failures_count,
+                    "final": True,
+                }
+            # Calculate total counts including flushed data
+            total_samples = self._flushed_samples_count + len(self._samples)
+            total_failures = self._flushed_failures_count + len(self.failed_samples)
             yield {
                 "event": "generation_complete",
-                "total_samples": len(self._samples),
-                "failed_samples": len(self.failed_samples),
+                "total_samples": total_samples,
+                "failed_samples": total_failures,
             }
         except KeyboardInterrupt:
+            # Save checkpoint on interrupt
+            if self.config.checkpoint_interval is not None and (
+                samples_in_current_batch or failures_in_current_batch
+            ):
+                self._save_checkpoint(
+                    samples_in_current_batch,
+                    failures_in_current_batch,
+                    topic_paths_in_current_batch,
+                )
             yield {
                 "event": "generation_interrupted",
                 "message": "Generation interrupted by user.",
@@ -900,25 +1430,39 @@ class DataSetGenerator:
             self._save_samples_to_file(INTERRUPTED_DATASET_FILENAME)
         except Exception as e:  # noqa: BLE001
+            # Save checkpoint on error
+            if self.config.checkpoint_interval is not None and (
+                samples_in_current_batch or failures_in_current_batch
+            ):
+                self._save_checkpoint(
+                    samples_in_current_batch,
+                    failures_in_current_batch,
+                    topic_paths_in_current_batch,
+                )
             yield {"event": "generation_error", "error": str(e)}
             self.print_failure_summary()
             self._save_samples_to_file(ERROR_DATASET_FILENAME)
             raise DataSetGeneratorError("failed") from e
-        yield (HFDataset.from_list(self._samples) if self._samples else HFDataset.from_list([]))
+        # Build final dataset: if samples were flushed to disk, load them from checkpoint
+        if self._flushed_samples_count > 0:
+            all_samples = self._load_all_samples_from_checkpoint()
+            yield HFDataset.from_list(all_samples) if all_samples else HFDataset.from_list([])
+        else:
+            yield (HFDataset.from_list(self._samples) if self._samples else HFDataset.from_list([]))
     async def _process_batch_with_retries_async(
         self,
         prompts: list[str],
         include_sys_msg: bool,
         start_sample_idx: int = 0,
-        paths_for_batch: list[list[str] | None] | None = None,
+        topic_paths_for_batch: list[TopicPath | None] | None = None,
     ) -> tuple[bool, int]:
         """Process a batch with retry logic."""
         for attempt in range(self.config.max_retries):
             try:
                 samples, failed_responses = await self._generate_structured_samples_async(
-                    prompts, include_sys_msg, start_sample_idx, paths_for_batch
+                    prompts, include_sys_msg, start_sample_idx, topic_paths_for_batch
                 )
                 # Update failed samples
@@ -948,14 +1492,33 @@ class DataSetGenerator:
                     error_msg = f"API error for provider '{self.provider}': {str(e)[:100]}..."
                     self.failure_analysis["api_errors"].append(error_msg)
-                self.failed_samples.append(error_msg)
+                # Build failure records for each topic path in the batch
+                if topic_paths_for_batch:
+                    for tp in topic_paths_for_batch:
+                        failure_record: dict[str, str | None] = {"error": error_msg}
+                        if tp:
+                            failure_record["topic_id"] = tp.topic_id
+                            failure_record["path"] = " -> ".join(tp.path)
+                        self.failed_samples.append(failure_record)
+                else:
+                    self.failed_samples.append({"error": error_msg})
                 logger.exception("API error: %s", error_msg)
                 return False, 0  # Don't retry authentication/API errors
             except Exception as e:
                 if attempt == self.config.max_retries - 1:
-                    self.failed_samples.append(str(e))
-                    failure_type = self.analyze_failure(str(e), error=e)
-                    self.failure_analysis[failure_type].append(str(e))
+                    error_msg = str(e)
+                    # Build failure records for each topic path in the batch
+                    if topic_paths_for_batch:
+                        for tp in topic_paths_for_batch:
+                            failure_record_exc: dict[str, str | None] = {"error": error_msg}
+                            if tp:
+                                failure_record_exc["topic_id"] = tp.topic_id
+                                failure_record_exc["path"] = " -> ".join(tp.path)
+                            self.failed_samples.append(failure_record_exc)
+                    else:
+                        self.failed_samples.append({"error": error_msg})
+                    failure_type = self.analyze_failure(error_msg, error=e)
+                    self.failure_analysis[failure_type].append(error_msg)
                     return False, 0
             else:
                 # If no exception and no samples, return False, 0
@@ -1015,7 +1578,7 @@ class DataSetGenerator:
         return f"\nHere are output examples:\n<examples>\n{examples_text}\n</examples>\n"
     def build_tools_text(self) -> str:
-        """Build formatted tools text for XLAM multi-turn prompts."""
+        """Build formatted tools text for XLAM prompts."""
         if not self.tool_registry:
             return "No tools available"
@@ -1046,8 +1609,8 @@ class DataSetGenerator:
         # Handle chain of thought conversations
         if self.config.conversation_type == "cot":
-            # Agent mode with tools - use agent prompts
-            if self.config.agent_mode == "single_turn" and self.tool_registry:
+            # Agent mode with tools - use agent prompts (implicit when tools configured)
+            if self.tool_registry:
                 # Use agent prompt for single-turn tool calling
                 return (
                     AgentPromptBuilder.build_tool_context_prompt(
@@ -1057,16 +1620,6 @@ class DataSetGenerator:
                     or AGENT_COT_TOOLS_PROMPT
                 )
-            if self.config.agent_mode == "multi_turn" and self.tool_registry:
-                # Standard multi-turn agent
-                return (
-                    AgentPromptBuilder.build_multi_turn_context_prompt(
-                        self.tool_registry,
-                        max_tools_per_query=self.config.max_tools_per_query,
-                    )
-                    or AGENT_COT_MULTI_TURN_PROMPT
-                )
             # Non-agent CoT - select based on reasoning style
             if self.config.reasoning_style == "freetext":
                 return FREETEXT_COT_PROMPT
@@ -1079,7 +1632,7 @@ class DataSetGenerator:
     def _save_samples_to_file(self, save_path: str):
         """Save the current samples to a JSONL file."""
-        with open(save_path, "w") as f:
+        with open(save_path, "w", encoding="utf-8") as f:
             for sample in self._samples:
                 f.write(json.dumps(sample, separators=(",", ":")) + "\n")

DeepFabric 4.9.0__py3-none-any.whl → 4.10.0__py3-none-any.whl

DeepFabric 4.9.0py3-none-any.whl → 4.10.0py3-none-any.whl