PyPI - DeepFabric - Versions diffs - 4.9.0__py3-none-any.whl → 4.10.1__py3-none-any.whl - Mend

DeepFabric 4.9.0__py3-none-any.whl → 4.10.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (22)

deepfabric/builders.py +7 -21
deepfabric/builders_agent.py +0 -542
deepfabric/cli.py +505 -74
deepfabric/config.py +57 -73
deepfabric/config_manager.py +8 -6
deepfabric/constants.py +6 -0
deepfabric/dataset_manager.py +107 -11
deepfabric/evaluation/parser.py +7 -7
deepfabric/generator.py +656 -103
deepfabric/graph.py +46 -1
deepfabric/prompts.py +0 -39
deepfabric/schemas.py +4 -3
deepfabric/topic_model.py +32 -0
deepfabric/tree.py +23 -1
deepfabric/tui.py +66 -21
deepfabric/utils.py +184 -0
deepfabric/validation.py +47 -77
{deepfabric-4.9.0.dist-info → deepfabric-4.10.1.dist-info}/METADATA +5 -6
{deepfabric-4.9.0.dist-info → deepfabric-4.10.1.dist-info}/RECORD +22 -22
{deepfabric-4.9.0.dist-info → deepfabric-4.10.1.dist-info}/WHEEL +0 -0
{deepfabric-4.9.0.dist-info → deepfabric-4.10.1.dist-info}/entry_points.txt +0 -0
{deepfabric-4.9.0.dist-info → deepfabric-4.10.1.dist-info}/licenses/LICENSE +0 -0

deepfabric/graph.py CHANGED Viewed

@@ -26,7 +26,7 @@ from .prompts import (
 )
 from .schemas import GraphSubtopics
 from .stream_simulator import simulate_stream
-from .topic_model import TopicModel
+from .topic_model import TopicModel, TopicPath
 if TYPE_CHECKING:  # only for type hints to avoid runtime cycles
     from .progress import ProgressReporter
@@ -231,6 +231,9 @@ class Graph(TopicModel):
     def save(self, save_path: str) -> None:
         """Save the topic graph to a file."""
+        from pathlib import Path  # noqa: PLC0415
+        Path(save_path).parent.mkdir(parents=True, exist_ok=True)
         with open(save_path, "w") as f:
             f.write(self.to_json())
@@ -570,6 +573,48 @@ class Graph(TopicModel):
         self._dfs_paths(self.root, [self.root.topic], paths, visited)
         return paths
+    def get_all_paths_with_ids(self) -> list[TopicPath]:
+        """Collect every root-to-leaf path together with an identifier for its leaf.
+        The traversal is delegated to ``_dfs_paths_with_ids``, starting at ``self.root``
+        with a path that already contains the root's topic.
+        Returns:
+            List of TopicPath namedtuples containing (path, topic_id), one per leaf reached.
+        """
+        collected: list[TopicPath] = []
+        # A fresh, empty "visited" set is handed to the helper for cycle detection
+        self._dfs_paths_with_ids(self.root, [self.root.topic], collected, set())
+        return collected
+    def _dfs_paths_with_ids(
+        self,
+        node: Node,
+        current_path: list[str],
+        result: list[TopicPath],
+        visited: set[int],
+    ) -> None:
+        """Depth-first walk that records a TopicPath for every leaf reachable from ``node``.
+        Args:
+            node: Node currently being expanded
+            current_path: Topics from the root down to ``node`` (inclusive)
+            result: Output list; a TopicPath is appended for each leaf found
+            visited: IDs of the nodes on the chain currently being explored, used to break cycles
+        """
+        node_id = node.id
+        if node_id in visited:
+            # Node is already on the chain being explored - do not descend into it again
+            return
+        visited.add(node_id)
+        children = node.children
+        if children:
+            for child in children:
+                self._dfs_paths_with_ids(child, [*current_path, child.topic], result, visited)
+        else:
+            # Leaf node - prefer the stored UUID, fall back to the stringified node id
+            leaf_id = node.metadata.get("uuid", str(node_id))
+            result.append(TopicPath(path=current_path, topic_id=leaf_id))
+        # Backtrack: only nodes on the chain currently being explored stay marked
+        visited.remove(node_id)
     def _dfs_paths(
         self, node: Node, current_path: list[str], paths: list[list[str]], visited: set[int]
     ) -> None:

deepfabric/prompts.py CHANGED Viewed

@@ -165,35 +165,6 @@ ARGUMENT REQUIREMENTS:
 Generate a complete agent reasoning example using structured output with tool_executions list."""
-    @staticmethod
-    def build_multi_turn_context_prompt(tool_registry, max_tools_per_query: int = 3) -> str:
-        """Build context for multi-turn conversations.
-        Returns a template with {{{{instructions}}}} and {{{{subtopics}}}} placeholders
-        that will be filled in by build_prompt() with actual topic paths from the tree.
-        """
-        tool_signatures = []
-        for tool in tool_registry.tools:
-            tool_signatures.append(f"- {tool.to_signature()}")
-        return f"""Generate a multi-turn agent conversation with evolving tool usage.
-Available tools:
-{chr(10).join(tool_signatures)}
-You may use 1 to {max_tools_per_query} tools per query. Show tool dependencies and reasoning across conversation turns.
-ARGUMENT REQUIREMENTS:
-- All argument values must be concrete and realistic (e.g., owner="acme-corp", repo="web-app", issue_number=42)
-- Never use template placeholders like {{{{owner}}}} or {{{{repo}}}}
-- Never use null values - omit optional parameters entirely if not needed
-- String fields must contain actual content, not empty strings
-{{{{{{{{instructions}}}}}}}}
-{{{{{{{{subtopics}}}}}}}}
-Generate a complete multi-turn conversation using structured output with tool_executions list."""
 # Simplified prompts that delegate to structured generation
 AGENT_COT_TOOLS_PROMPT = """Generate an agent tool-calling training example using the available tool definitions.
@@ -224,16 +195,6 @@ Focus on teaching both the reasoning process AND multi-tool usage patterns.
 {{{{examples}}}}
 {{{{subtopics}}}}"""
-AGENT_COT_MULTI_TURN_PROMPT = """Generate a multi-turn agent conversation with tool usage across turns.
-Show how reasoning evolves: tool dependencies, progressive refinement, and result synthesis.
-Create realistic tool chaining patterns and decision-making processes.
-{{{{instructions}}}}
-{{{{examples}}}}
-{{{{subtopics}}}}"""
 CONVERSATION_GENERATION_PROMPT = """Generate a training conversation for a language model with this system prompt:
 <system_prompt>

deepfabric/schemas.py CHANGED Viewed

@@ -842,10 +842,11 @@ class ToolContext(BaseModel):
 class AgentContext(BaseModel):
-    """Agent capability - present when agent_mode is enabled."""
+    """Agent capability - present when tools are configured for agent mode."""
-    mode: Literal["single_turn", "multi_turn"] = Field(
-        description="Agent interaction mode: single_turn for one-shot tool use, multi_turn for extended conversations"
+    mode: Literal["single_turn"] = Field(
+        default="single_turn",
+        description="Agent interaction mode (single_turn is the only supported mode)",
     )
     planning_trace: str | None = Field(
         default=None, description="Agent's planning and reasoning about tool usage strategy"

deepfabric/topic_model.py CHANGED Viewed

@@ -1,4 +1,12 @@
 from abc import ABC, abstractmethod
+from typing import NamedTuple
+class TopicPath(NamedTuple):
+    """A topic path with its associated unique identifier.
+    Being a NamedTuple, instances unpack positionally as ``(path, topic_id)``.
+    """
+    # Topic names that make up the path
+    path: list[str]
+    # Identifier associated with this path
+    topic_id: str
 class TopicModel(ABC):
@@ -18,3 +26,27 @@ class TopicModel(ABC):
     def get_all_paths(self) -> list[list[str]]:
         """Returns all the paths in the topic model."""
         raise NotImplementedError
+    @abstractmethod
+    def get_all_paths_with_ids(self) -> list[TopicPath]:
+        """Returns all paths with their unique identifiers.
+        Subclasses must override this; the base implementation only raises.
+        Returns:
+            List of TopicPath namedtuples containing (path, topic_id).
+            The topic_id is a stable identifier for the leaf node of each path.
+        Raises:
+            NotImplementedError: If the base implementation is invoked directly.
+        """
+        raise NotImplementedError
+    def get_path_by_id(self, topic_id: str) -> list[str] | None:
+        """Find the path whose identifier equals ``topic_id``.
+        Scans the result of ``get_all_paths_with_ids()`` in order and stops at the first match.
+        Args:
+            topic_id: The unique identifier for a topic path.
+        Returns:
+            The matching path list, or None when no entry carries that identifier.
+        """
+        matching_paths = (
+            entry.path for entry in self.get_all_paths_with_ids() if entry.topic_id == topic_id
+        )
+        return next(matching_paths, None)

deepfabric/tree.py CHANGED Viewed

@@ -21,7 +21,7 @@ from .metrics import trace
 from .prompts import TreePromptBuilder
 from .schemas import TopicList
 from .stream_simulator import simulate_stream
-from .topic_model import TopicModel
+from .topic_model import TopicModel, TopicPath
 warnings.filterwarnings("ignore", message=".*Pydantic serializer warnings:.*")
@@ -242,6 +242,25 @@ class Tree(TopicModel):
         """Returns all the paths in the topic model."""
         return self.tree_paths
+    def get_all_paths_with_ids(self) -> list[TopicPath]:
+        """Pair every stored tree path with a content-derived identifier.
+        The identifier is the first 16 hex characters of the SHA-256 digest of the path's
+        topics joined with "::", so an unchanged path maps to the same ID on every run.
+        Returns:
+            List of TopicPath namedtuples containing (path, topic_id).
+        """
+        import hashlib  # noqa: PLC0415
+        def _path_id(topics: list[str]) -> str:
+            # Stable ID: hash of the "::"-joined topics, truncated to 16 hex chars
+            joined = "::".join(topics)
+            return hashlib.sha256(joined.encode()).hexdigest()[:16]
+        return [TopicPath(path=topics, topic_id=_path_id(topics)) for topics in self.tree_paths]
     async def get_subtopics(
         self, system_prompt: str, node_path: list[str], num_subtopics: int
     ) -> list[str]:
@@ -385,6 +404,9 @@ class Tree(TopicModel):
     def save(self, save_path: str) -> None:
         """Save the topic tree to a file."""
+        from pathlib import Path  # noqa: PLC0415
+        Path(save_path).parent.mkdir(parents=True, exist_ok=True)
         with open(save_path, "w") as f:
             for path in self.tree_paths:
                 f.write(json.dumps({"path": path}) + "\n")

deepfabric/tui.py CHANGED Viewed

@@ -275,19 +275,19 @@ class DeepFabricTUI:
     def success(self, message: str) -> None:
         """Display a success message."""
-        self.console.print(f" {message}", style="green")
+        self.console.print(f"✓ {message}", style="green")
     def warning(self, message: str) -> None:
         """Display a warning message."""
-        self.console.print(f"⚠️  {message}", style="yellow")
+        self.console.print(f"⚠ {message}", style="yellow")
     def error(self, message: str) -> None:
         """Display an error message."""
-        self.console.print(f"❌ {message}", style="red")
+        self.console.print(f"✗ {message}", style="red")
     def info(self, message: str) -> None:
         """Display an info message."""
-        self.console.print(f" {message}", style="blue")
+        self.console.print(f"• {message}", style="blue")
 class TreeBuildingTUI(TopicBuildingMixin, StreamObserver):
@@ -846,6 +846,13 @@ class DatasetGenerationTUI(StreamObserver):
         self.status_samples_done = 0
         self.status_failed_total = 0
         self.status_step_started_at = 0.0
+        self.status_last_step_duration = 0.0
+        # Checkpoint tracking for status panel
+        self.checkpoint_enabled = False  # Set to True when checkpointing is configured
+        self.checkpoint_count = 0
+        self.last_checkpoint_samples = 0
+        self._resumed_from_checkpoint = False  # Set by set_checkpoint_resume_status()
+        self._stop_requested = False  # Set when graceful stop requested via Ctrl+C
         # Retry tracking for simple mode
         self.step_retries: list[dict] = []  # Retries in current step
@@ -919,18 +926,8 @@ class DatasetGenerationTUI(StreamObserver):
             type_map = {
                 "basic": "Basic Q&A",
                 "cot": "Chain of Thought",
-                "single_turn_agent": "Single-Turn Agent (Tool Calling)",
-                "multi_turn_agent": "Multi-Turn Agent (Tool Calling)",
             }
             self.current_sample_type = type_map.get(conv_type, conv_type)
-        elif "agent_mode" in metadata:
-            agent_mode = metadata["agent_mode"]
-            if agent_mode == "single_turn":
-                self.current_sample_type = "Single-Turn Agent (Tool Calling)"
-            elif agent_mode == "multi_turn":
-                self.current_sample_type = "Multi-Turn Agent (Tool Calling)"
-            else:
-                self.current_sample_type = f"Agent ({agent_mode})"
         # Update current topic path if provided
         topic_path = metadata.get("topic_path") if isinstance(metadata, dict) else None
@@ -1041,13 +1038,21 @@ class DatasetGenerationTUI(StreamObserver):
             return
     # --- Status Panel helpers ---
-    def init_status(self, total_steps: int, total_samples: int) -> None:
+    def init_status(
+        self, total_steps: int, total_samples: int, checkpoint_enabled: bool = False
+    ) -> None:
         self.status_total_steps = total_steps
         self.status_total_samples = total_samples
         self.status_current_step = 0
-        self.status_samples_done = 0
-        self.status_failed_total = 0
+        # Preserve samples_done and failed_total if resuming from checkpoint
+        if not getattr(self, "_resumed_from_checkpoint", False):
+            self.status_samples_done = 0
+            self.status_failed_total = 0
+            self.checkpoint_count = 0
+            self.last_checkpoint_samples = 0
         self.status_step_started_at = 0.0
+        self.status_last_step_duration = 0.0
+        self.checkpoint_enabled = checkpoint_enabled
     def status_step_start(self, step: int, total_steps: int | None = None) -> None:
         self.status_current_step = step
@@ -1057,22 +1062,62 @@ class DatasetGenerationTUI(StreamObserver):
         self.update_status_panel()
     def status_step_complete(self, samples_generated: int, failed_in_step: int = 0) -> None:
+        # Calculate step duration before updating counters
+        if self.status_step_started_at:
+            self.status_last_step_duration = max(0.0, monotonic() - self.status_step_started_at)
+            self.status_step_started_at = 0.0  # Reset for next step
         self.status_samples_done += max(0, int(samples_generated))
         self.status_failed_total += max(0, int(failed_in_step))
         self.update_status_panel()
+    def set_checkpoint_resume_status(
+        self, samples_done: int, failed_total: int, checkpoint_count: int = 0
+    ) -> None:
+        """Initialize status counters from checkpoint data when resuming.
+        Marks this TUI as resumed from a checkpoint, seeds the sample/failure counters
+        (coerced to int and clamped at zero) and refreshes the status panel.
+        Args:
+            samples_done: Number of samples already generated in checkpoint
+            failed_total: Number of failures already recorded in checkpoint
+            checkpoint_count: Number of checkpoints already saved (optional); the
+                checkpoint counters are only touched when this is greater than 0
+        """
+        self._resumed_from_checkpoint = True
+        restored_samples = max(0, int(samples_done))
+        self.status_samples_done = restored_samples
+        self.status_failed_total = max(0, int(failed_total))
+        if checkpoint_count > 0:
+            self.checkpoint_count = checkpoint_count
+            # Use the normalized value so the checkpoint row agrees with status_samples_done
+            self.last_checkpoint_samples = restored_samples
+        self.update_status_panel()
+    def status_checkpoint_saved(self, total_samples: int) -> None:
+        """Record a newly saved checkpoint and refresh the status panel.
+        Args:
+            total_samples: Sample count to show for the most recent checkpoint
+        """
+        saved_so_far = self.checkpoint_count
+        self.checkpoint_count = saved_so_far + 1
+        self.last_checkpoint_samples = total_samples
+        self.update_status_panel()
+    def status_stop_requested(self) -> None:
+        """Mark that a graceful stop has been requested.
+        Sets the internal stop flag and refreshes the status panel.
+        """
+        self._stop_requested = True
+        self.update_status_panel()
     def _status_panel(self) -> Panel:
-        elapsed = 0.0
-        if self.status_step_started_at:
-            elapsed = max(0.0, monotonic() - self.status_step_started_at)
         table = Table(show_header=False, box=None, padding=(0, 1))
         table.add_column(style="cyan", no_wrap=True)
         table.add_column(style="white")
         table.add_row("Step:", f"{self.status_current_step}/{self.status_total_steps}")
-        table.add_row("Step Elapsed:", f"{elapsed:0.1f}s")
+        if self.status_last_step_duration > 0:
+            table.add_row("Last Step:", f"{self.status_last_step_duration:0.1f}s")
         table.add_row("Generated:", f"{self.status_samples_done}/{self.status_total_samples}")
         if self.status_failed_total:
             table.add_row("Failed:", str(self.status_failed_total))
+        if self.checkpoint_enabled:
+            if self.checkpoint_count > 0:
+                table.add_row(
+                    "Checkpoints:", f"{self.checkpoint_count} ({self.last_checkpoint_samples} samples)"
+                )
+            else:
+                table.add_row("Checkpoints:", "0 (enabled)")
+        if self._stop_requested:
+            table.add_row("[yellow]Stopping:[/yellow]", "[yellow]at next checkpoint[/yellow]")
         return Panel(table, title="Status", border_style="dim", padding=(0, 1))
     def update_status_panel(self) -> None:

deepfabric/utils.py CHANGED Viewed

@@ -1,10 +1,13 @@
 import ast
 import asyncio
+import hashlib
 import importlib
 import json
 import os
 import re
+import sys
+from pathlib import Path
 from typing import Any
 VALIDATION_ERROR_INDICATORS = [
@@ -155,6 +158,51 @@ def read_topic_tree_from_jsonl(file_path: str) -> list[dict]:
     return topic_tree
+def parse_num_samples(value: int | str | None) -> int | str | None:
+    """Parse and validate num_samples: integer, 'auto', or percentage like '50%'.
+    This is a shared utility used by both CLI argument parsing and config validation.
+    Strings are stripped and lower-cased before being interpreted.
+    Args:
+        value: Raw value - can be int, string, or None
+    Returns:
+        Parsed value: int, "auto", percentage string like "50%", or None
+    Raises:
+        ValueError: If the value is invalid (count below 1, percentage that is not a
+            finite number greater than 0, unparseable string, or unsupported type)
+    """
+    import math  # noqa: PLC0415
+    if value is None:
+        return None
+    if isinstance(value, int):
+        if value < 1:
+            raise ValueError("num_samples must be at least 1")
+        return value
+    if isinstance(value, str):
+        normalized = value.strip().lower()
+        if normalized == "auto":
+            return "auto"
+        if normalized.endswith("%"):
+            try:
+                pct = float(normalized[:-1])
+            except ValueError as e:
+                raise ValueError(f"Invalid percentage format: {value}") from e
+            if pct <= 0:
+                raise ValueError("Percentage must be greater than 0")
+            # float() also accepts "nan" and "inf"; neither can be turned into a sample count
+            if not math.isfinite(pct):
+                raise ValueError(f"Invalid percentage format: {value}")
+            return normalized
+        # Try to parse as integer string
+        try:
+            parsed = int(normalized)
+        except ValueError as e:
+            raise ValueError(
+                f"Invalid num_samples value: {value}. Use integer, 'auto', or percentage like '50%'"
+            ) from e
+        if parsed < 1:
+            raise ValueError("num_samples must be at least 1")
+        return parsed
+    raise ValueError(f"num_samples must be int or string, got {type(value).__name__}")
 def get_bool_env(key: str, default: bool = False) -> bool:
     """Get a boolean environment variable.
@@ -195,3 +243,139 @@ def import_optional_dependency(
         else:
             msg = f"The '{module_name}' library is required but is not installed."
         raise ModuleNotFoundError(msg) from None
+def check_path_writable(path: str, path_description: str) -> tuple[bool, str | None]:
+    """Check if a path is writable.
+    Checks whether the specified file path can be written to by verifying:
+    1. If the path exists, that it is not a directory and is writable
+    2. If the file doesn't exist, whether the parent directory exists and is writable
+    3. If the parent doesn't exist either, whether the nearest existing ancestor is a
+       writable directory (so the missing directories could be created)
+    Args:
+        path: The file path to check
+        path_description: Human-readable description for error messages
+    Returns:
+        Tuple of (is_writable, error_message). error_message is None if writable.
+    """
+    file_path = Path(path)
+    parent_dir = file_path.parent
+    error_msg: str | None = None
+    if file_path.exists():
+        if file_path.is_dir():
+            # A directory cannot be opened for writing as a file
+            error_msg = f"{path_description} exists but is a directory: {path}"
+        elif not os.access(file_path, os.W_OK):
+            error_msg = f"{path_description} exists but is not writable: {path}"
+    elif not parent_dir.exists():
+        # File doesn't exist and parent doesn't exist
+        # Walk up to find the first existing ancestor
+        ancestor = parent_dir
+        while not ancestor.exists() and ancestor != ancestor.parent:
+            ancestor = ancestor.parent
+        if not ancestor.exists():
+            error_msg = (
+                f"{path_description} parent directory does not exist "
+                f"and cannot be created: {parent_dir}"
+            )
+        elif not ancestor.is_dir():
+            # e.g. "report.txt/sub/out.jsonl" where report.txt is a regular file
+            error_msg = (
+                f"{path_description} cannot create parent directory "
+                f"({ancestor} is not a directory): {parent_dir}"
+            )
+        elif not os.access(ancestor, os.W_OK):
+            error_msg = (
+                f"{path_description} cannot create parent directory "
+                f"(no write access to {ancestor}): {parent_dir}"
+            )
+    elif not parent_dir.is_dir():
+        # Parent path exists but is a regular file, so nothing can be created inside it
+        error_msg = f"{path_description} parent path is not a directory: {parent_dir}"
+    elif not os.access(parent_dir, os.W_OK):
+        # Parent exists but is not writable
+        error_msg = f"{path_description} parent directory is not writable: {parent_dir}"
+    return (error_msg is None, error_msg)
+def check_dir_writable(path: str, path_description: str) -> tuple[bool, str | None]:
+    """Check if a directory path is writable.
+    Checks whether files can be created in the specified directory by verifying:
+    1. If the directory exists, whether it's writable
+    2. If the directory doesn't exist, whether we can create it (the nearest existing
+       ancestor must be a writable directory)
+    Args:
+        path: The directory path to check
+        path_description: Human-readable description for error messages
+    Returns:
+        Tuple of (is_writable, error_message). error_message is None if writable.
+    """
+    dir_path = Path(path)
+    # If the directory exists, check if it's writable
+    if dir_path.exists():
+        if not dir_path.is_dir():
+            return False, f"{path_description} exists but is not a directory: {path}"
+        if not os.access(dir_path, os.W_OK):
+            return False, f"{path_description} directory is not writable: {path}"
+        return True, None
+    # Directory doesn't exist - check if we can create it
+    ancestor = dir_path
+    while not ancestor.exists() and ancestor != ancestor.parent:
+        ancestor = ancestor.parent
+    if not ancestor.exists():
+        return False, f"{path_description} cannot be created (root does not exist): {path}"
+    if not ancestor.is_dir():
+        # A regular file sits where a directory is needed, e.g. "notes.txt/sub"
+        return False, f"{path_description} cannot be created ({ancestor} is not a directory): {path}"
+    if not os.access(ancestor, os.W_OK):
+        return False, f"{path_description} cannot be created (no write access to {ancestor}): {path}"
+    return True, None
+# Checkpoint directory resolution
+APP_NAME = "deepfabric"
+def _get_deepfabric_data_dir() -> Path:
+    """Resolve the per-user data directory for DeepFabric.
+    ``platformdirs.user_data_dir`` is used when the package can be imported; otherwise
+    the location is derived by hand from the operating system's conventions.
+    Returns:
+        Path of the DeepFabric data directory.
+    """
+    try:
+        from platformdirs import user_data_dir  # noqa: PLC0415
+        return Path(user_data_dir(APP_NAME))
+    except ImportError:
+        pass  # platformdirs not available - fall through to the manual lookup
+    if os.name == "nt":
+        # Windows: APPDATA
+        data_root = os.environ.get("APPDATA") or os.path.expanduser(r"~\AppData\Roaming")
+    elif sys.platform == "darwin":
+        # macOS: ~/Library/Application Support
+        data_root = os.path.expanduser("~/Library/Application Support")
+    else:
+        # Linux and other Unix: XDG_DATA_HOME
+        data_root = os.environ.get("XDG_DATA_HOME") or os.path.expanduser("~/.local/share")
+    return Path(data_root) / APP_NAME
+def get_checkpoint_dir(config_path: str | None = None) -> str:
+    """
+    Resolve the checkpoint directory that belongs to a config file.
+    The directory is ``<DeepFabric data dir>/checkpoints/<name>``, where the data dir
+    comes from ``_get_deepfabric_data_dir()`` and ``<name>`` is the first 12 hex
+    characters of the SHA-256 digest of the config file's resolved absolute path.
+    This ensures:
+    - Consistent location regardless of current working directory
+    - No conflicts between different projects with same output filename
+    Args:
+        config_path: Path to the config file. If None, the "default" subdirectory is used.
+    Returns:
+        Path to the checkpoint directory (not created, just resolved)
+    """
+    if config_path is None:
+        # No config file - use a "default" subdirectory
+        subdir = "default"
+    else:
+        # Short hash of the absolute config path keeps projects apart
+        resolved_config = str(Path(config_path).resolve())
+        subdir = hashlib.sha256(resolved_config.encode()).hexdigest()[:12]
+    return str(_get_deepfabric_data_dir() / "checkpoints" / subdir)

DeepFabric 4.9.0__py3-none-any.whl → 4.10.1__py3-none-any.whl

DeepFabric 4.9.0__py3-none-any.whl → 4.10.1__py3-none-any.whl