DeepFabric 4.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepfabric/__init__.py +70 -0
- deepfabric/__main__.py +6 -0
- deepfabric/auth.py +382 -0
- deepfabric/builders.py +303 -0
- deepfabric/builders_agent.py +1304 -0
- deepfabric/cli.py +1288 -0
- deepfabric/config.py +899 -0
- deepfabric/config_manager.py +251 -0
- deepfabric/constants.py +94 -0
- deepfabric/dataset_manager.py +534 -0
- deepfabric/error_codes.py +581 -0
- deepfabric/evaluation/__init__.py +47 -0
- deepfabric/evaluation/backends/__init__.py +32 -0
- deepfabric/evaluation/backends/ollama_backend.py +137 -0
- deepfabric/evaluation/backends/tool_call_parsers.py +409 -0
- deepfabric/evaluation/backends/transformers_backend.py +326 -0
- deepfabric/evaluation/evaluator.py +845 -0
- deepfabric/evaluation/evaluators/__init__.py +13 -0
- deepfabric/evaluation/evaluators/base.py +104 -0
- deepfabric/evaluation/evaluators/builtin/__init__.py +5 -0
- deepfabric/evaluation/evaluators/builtin/tool_calling.py +93 -0
- deepfabric/evaluation/evaluators/registry.py +66 -0
- deepfabric/evaluation/inference.py +155 -0
- deepfabric/evaluation/metrics.py +397 -0
- deepfabric/evaluation/parser.py +304 -0
- deepfabric/evaluation/reporters/__init__.py +13 -0
- deepfabric/evaluation/reporters/base.py +56 -0
- deepfabric/evaluation/reporters/cloud_reporter.py +195 -0
- deepfabric/evaluation/reporters/file_reporter.py +61 -0
- deepfabric/evaluation/reporters/multi_reporter.py +56 -0
- deepfabric/exceptions.py +67 -0
- deepfabric/factory.py +26 -0
- deepfabric/generator.py +1084 -0
- deepfabric/graph.py +545 -0
- deepfabric/hf_hub.py +214 -0
- deepfabric/kaggle_hub.py +219 -0
- deepfabric/llm/__init__.py +41 -0
- deepfabric/llm/api_key_verifier.py +534 -0
- deepfabric/llm/client.py +1206 -0
- deepfabric/llm/errors.py +105 -0
- deepfabric/llm/rate_limit_config.py +262 -0
- deepfabric/llm/rate_limit_detector.py +278 -0
- deepfabric/llm/retry_handler.py +270 -0
- deepfabric/metrics.py +212 -0
- deepfabric/progress.py +262 -0
- deepfabric/prompts.py +290 -0
- deepfabric/schemas.py +1000 -0
- deepfabric/spin/__init__.py +6 -0
- deepfabric/spin/client.py +263 -0
- deepfabric/spin/models.py +26 -0
- deepfabric/stream_simulator.py +90 -0
- deepfabric/tools/__init__.py +5 -0
- deepfabric/tools/defaults.py +85 -0
- deepfabric/tools/loader.py +87 -0
- deepfabric/tools/mcp_client.py +677 -0
- deepfabric/topic_manager.py +303 -0
- deepfabric/topic_model.py +20 -0
- deepfabric/training/__init__.py +35 -0
- deepfabric/training/api_key_prompt.py +302 -0
- deepfabric/training/callback.py +363 -0
- deepfabric/training/metrics_sender.py +301 -0
- deepfabric/tree.py +438 -0
- deepfabric/tui.py +1267 -0
- deepfabric/update_checker.py +166 -0
- deepfabric/utils.py +150 -0
- deepfabric/validation.py +143 -0
- deepfabric-4.4.0.dist-info/METADATA +702 -0
- deepfabric-4.4.0.dist-info/RECORD +71 -0
- deepfabric-4.4.0.dist-info/WHEEL +4 -0
- deepfabric-4.4.0.dist-info/entry_points.txt +2 -0
- deepfabric-4.4.0.dist-info/licenses/LICENSE +201 -0
deepfabric/progress.py
ADDED
|
@@ -0,0 +1,262 @@
|
|
|
1
|
+
"""Progress reporting system for dataset generation.
|
|
2
|
+
|
|
3
|
+
This module provides a modular event-based progress reporting system that
|
|
4
|
+
allows components to emit progress events (streaming text, step markers, etc.)
|
|
5
|
+
without coupling to specific display implementations.
|
|
6
|
+
|
|
7
|
+
The system uses the Observer pattern to enable multiple observers (TUI, logging,
|
|
8
|
+
metrics, etc.) to react to progress events.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from typing import TYPE_CHECKING, Any, Protocol
|
|
12
|
+
|
|
13
|
+
if TYPE_CHECKING:
|
|
14
|
+
from .error_codes import ClassifiedError
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class StreamObserver(Protocol):
    """Structural interface for consumers of progress events.

    Any object providing these methods can be attached to a progress reporter;
    an implementation may treat any event as a no-op. The same protocol is
    shared by dataset generation and tree/graph building.
    """

    def on_stream_chunk(self, source: str, chunk: str, metadata: dict[str, Any]) -> None:
        """Receive one chunk of streamed LLM text.

        Args:
            source: Origin of the text, e.g. "user_question", "agent_reasoning",
                "tool_sim_weather" for datasets, or "topic_generation" /
                "subtopic_expansion" for tree/graph building.
            chunk: The raw text fragment produced by the LLM.
            metadata: Extra context such as sample_idx, node_path, or depth.
        """
        ...

    def on_step_start(self, step_name: str, metadata: dict[str, Any]) -> None:
        """Receive notice that a generation step has begun.

        Args:
            step_name: Display name, e.g. "Generating user question",
                "Simulating tool: get_weather", or "Expanding node: AI/ML".
            metadata: Extra context such as sample_idx, turn_idx, depth,
                or node_path.
        """
        ...

    def on_step_complete(self, step_name: str, metadata: dict[str, Any]) -> None:
        """Receive notice that a generation step has finished.

        Args:
            step_name: Display name of the step that completed.
            metadata: Result context such as tokens_used, duration, or success.
        """
        ...

    def on_error(self, error: "ClassifiedError", metadata: dict[str, Any]) -> None:
        """Receive a classified error raised during generation.

        Args:
            error: ClassifiedError carrying the error code and details.
            metadata: Extra context such as sample_idx or step.
        """
        ...

    def on_retry(
        self,
        sample_idx: int,
        attempt: int,
        max_attempts: int,
        error_summary: str,
        metadata: dict[str, Any],
    ) -> None:
        """Receive notice that a sample will be regenerated after a validation failure.

        Args:
            sample_idx: 1-based index of the affected sample.
            attempt: 1-based number of the attempt that failed.
            max_attempts: Total attempts permitted.
            error_summary: Short description of the validation failure.
            metadata: Extra context.
        """
        ...
|
|
84
|
+
|
|
85
|
+
class ProgressReporter:
    """Central progress reporter that notifies observers of generation events.

    Subject side of the Observer pattern: holds the attached observers and
    broadcasts each event to them in attachment order.

    Required protocol events (on_stream_chunk / on_step_start /
    on_step_complete) are dispatched unconditionally; optional hooks
    (on_error, on_retry, on_tool_execution, on_node_retry) are invoked only
    on observers that actually define them.

    Robustness note: every emit iterates over a snapshot of the observer
    list, so an observer that attaches or detaches (itself or others) from
    inside a callback cannot corrupt the notification pass in progress.

    Example:
        >>> reporter = ProgressReporter()
        >>> reporter.attach(my_tui_observer)
        >>> reporter.emit_step_start("Generating question", sample_idx=1)
        >>> reporter.emit_chunk("user_question", "What is the weather", sample_idx=1)
        >>> reporter.emit_step_complete("Generating question", sample_idx=1)
    """

    def __init__(self):
        """Initialize an empty progress reporter."""
        # Quoted forward reference: StreamObserver is a Protocol defined in
        # this module; quoting keeps the annotation lazy at runtime.
        self._observers: list["StreamObserver"] = []

    def attach(self, observer: "StreamObserver") -> None:
        """Attach an observer to receive progress events.

        Attaching the same observer twice is a no-op, so each observer is
        notified at most once per event.

        Args:
            observer: Observer implementing the StreamObserver protocol.
        """
        if observer not in self._observers:
            self._observers.append(observer)

    def detach(self, observer: "StreamObserver") -> None:
        """Detach an observer from receiving progress events.

        Detaching an observer that is not attached is a no-op.

        Args:
            observer: Observer to remove.
        """
        if observer in self._observers:
            self._observers.remove(observer)

    def emit_chunk(self, source: str, chunk: str, **metadata) -> None:
        """Emit a streaming text chunk to all observers.

        Args:
            source: Identifier for the generation source.
            chunk: Text chunk from the LLM.
            **metadata: Additional context as keyword arguments.
        """
        # Snapshot guards against observer-list mutation during callbacks.
        for observer in tuple(self._observers):
            observer.on_stream_chunk(source, chunk, metadata)

    def emit_step_start(self, step_name: str, **metadata) -> None:
        """Emit a step start event to all observers.

        Args:
            step_name: Human-readable step name.
            **metadata: Additional context as keyword arguments.
        """
        for observer in tuple(self._observers):
            observer.on_step_start(step_name, metadata)

    def emit_step_complete(self, step_name: str, **metadata) -> None:
        """Emit a step complete event to all observers.

        Args:
            step_name: Human-readable step name.
            **metadata: Additional context (e.g. results) as keyword arguments.
        """
        for observer in tuple(self._observers):
            observer.on_step_complete(step_name, metadata)

    def emit_error(self, error: "ClassifiedError", **metadata) -> None:
        """Emit an error event to observers that implement on_error.

        Args:
            error: ClassifiedError with error code and details.
            **metadata: Additional context as keyword arguments.
        """
        for observer in tuple(self._observers):
            # on_error is optional; observers without it are skipped silently.
            if hasattr(observer, "on_error"):
                observer.on_error(error, metadata)

    def emit_retry(
        self,
        sample_idx: int,
        attempt: int,
        max_attempts: int,
        error_summary: str,
        **metadata,
    ) -> None:
        """Emit a retry event to observers that implement on_retry.

        Tracks validation failures that will be retried, allowing the TUI to
        display them gracefully without cluttering output.

        Args:
            sample_idx: 1-based sample index.
            attempt: Current attempt number (1-based).
            max_attempts: Total number of attempts allowed.
            error_summary: Brief description of the error.
            **metadata: Additional context as keyword arguments.
        """
        for observer in tuple(self._observers):
            if hasattr(observer, "on_retry"):
                observer.on_retry(sample_idx, attempt, max_attempts, error_summary, metadata)

    def emit_tool_execution(
        self,
        tool_name: str,
        success: bool,
        **metadata,
    ) -> None:
        """Emit a tool execution event to observers that implement on_tool_execution.

        Used to track Spin tool executions in the TUI events panel.

        Args:
            tool_name: Name of the tool being executed.
            success: Whether the execution succeeded.
            **metadata: Additional context (e.g., error_type, result preview).
        """
        for observer in tuple(self._observers):
            # Not part of the StreamObserver protocol, hence the hasattr guard.
            if hasattr(observer, "on_tool_execution"):
                observer.on_tool_execution(tool_name, success, metadata)

    def emit_node_retry(
        self,
        node_topic: str,
        attempt: int,
        max_attempts: int,
        error_summary: str,
        **metadata,
    ) -> None:
        """Emit a node expansion retry event to observers that implement on_node_retry.

        Used to track graph/tree node expansion retries in the TUI events panel.

        Args:
            node_topic: Topic of the node being expanded.
            attempt: Current attempt number (1-based).
            max_attempts: Total number of attempts allowed.
            error_summary: Brief description of the error.
            **metadata: Additional context as keyword arguments.
        """
        for observer in tuple(self._observers):
            if hasattr(observer, "on_node_retry"):
                observer.on_node_retry(node_topic, attempt, max_attempts, error_summary, metadata)
|
+
|
|
230
|
+
# Convenience context manager for tracking steps
|
|
231
|
+
class ProgressStep:
    """Bracket a unit of work with automatic start/complete progress events.

    On entry the configured reporter receives ``emit_step_start``; on exit it
    receives ``emit_step_complete`` with the same name and metadata.
    Exceptions raised inside the block are never suppressed.

    Example:
        >>> with ProgressStep(reporter, "Generating question", sample_idx=1):
        ...     # Do work
        ...     reporter.emit_chunk("question", "What is...")
    """

    def __init__(self, reporter: ProgressReporter | None, step_name: str, **metadata: Any):
        """Record the reporter and event payload for later emission.

        Args:
            reporter: Progress reporter; pass None to make this a no-op.
            step_name: Human-readable step name.
            **metadata: Additional context forwarded with both events.
        """
        self.reporter = reporter
        self.step_name = step_name
        self.metadata = metadata

    def __enter__(self):
        """Announce the step start (when a reporter is configured)."""
        if self.reporter is not None:
            self.reporter.emit_step_start(self.step_name, **self.metadata)
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Announce the step completion; never swallow exceptions."""
        if self.reporter is not None:
            self.reporter.emit_step_complete(self.step_name, **self.metadata)
        # Returning False propagates any exception raised inside the block.
        return False
|
deepfabric/prompts.py
ADDED
|
@@ -0,0 +1,290 @@
|
|
|
1
|
+
class TreePromptBuilder:
    """Build dynamic prompts for topic tree expansion with domain-specific examples."""

    # Few-shot expansion examples, keyed by domain. Unknown domains fall back
    # to "general" (see build_expansion_prompt).
    EXAMPLES = {
        "general": [
            {
                "path": ["Technology", "Artificial Intelligence"],
                "subtopics": [
                    "machine learning",
                    "neural networks",
                    "computer vision",
                    "natural language processing",
                    "robotics",
                ],
            },
            {
                "path": ["Entertainment", "Movies", "Actors"],
                "subtopics": [
                    "Tom Hanks",
                    "Meryl Streep",
                    "Leonardo DiCaprio",
                    "Jennifer Lawrence",
                    "Denzel Washington",
                ],
            },
        ],
        "conversational": [
            {
                "path": ["Small Talk Topics"],
                "subtopics": [
                    "weather",
                    "weekend plans",
                    "hobbies",
                    "family",
                    "books",
                    "food",
                    "music",
                ],
            },
            {
                "path": ["Small Talk Topics", "Family"],
                "subtopics": [
                    "parents",
                    "grandparents",
                    "siblings",
                    "family traditions",
                    "family vacations",
                ],
            },
            {
                "path": ["Small Talk Topics", "Hobbies", "Cooking"],
                "subtopics": [
                    "recipes",
                    "asian food",
                    "favourite dishes",
                    "cookbooks",
                    "kitchen gadgets",
                    "vegan cooking",
                ],
            },
        ],
        "technical": [
            {
                "path": ["Programming"],
                "subtopics": [
                    "algorithms",
                    "data structures",
                    "debugging",
                    "testing",
                    "version control",
                ],
            },
            {
                "path": ["Programming", "Python"],
                "subtopics": ["pandas", "flask", "pytest", "asyncio", "django"],
            },
        ],
        "educational": [
            {
                "path": ["Mathematics"],
                "subtopics": ["algebra", "geometry", "calculus", "statistics", "probability"],
            },
            {
                "path": ["Mathematics", "Algebra"],
                "subtopics": [
                    "linear equations",
                    "quadratic functions",
                    "polynomials",
                    "matrices",
                    "systems",
                ],
            },
        ],
    }

    @classmethod
    def build_expansion_prompt(
        cls,
        topic_path: list[str],
        num_subtopics: int,
        system_prompt: str = "",
        domain: str = "general",
    ) -> str:
        """Compose the LLM prompt requesting subtopics beneath *topic_path*.

        Args:
            topic_path: Ancestor topics from root to the node being expanded.
            num_subtopics: How many subtopics to request.
            system_prompt: Optional extra context inserted verbatim.
            domain: Which example set to use; unknown values fall back to
                the "general" examples.

        Returns:
            The fully rendered expansion prompt.
        """
        joined_path = " -> ".join(f'"{step}"' for step in topic_path)
        domain_examples = cls.EXAMPLES.get(domain, cls.EXAMPLES["general"])
        example_block = cls._format_examples(domain_examples)

        return f"""Generate {num_subtopics} subtopics for training data organization.

Task: Create diverse but related subtopics that expand on the given topic path.

Examples:
{example_block}

Context: {system_prompt}

Topic path: {joined_path}
Generate {num_subtopics} subtopics as a Python list. Return only the list, nothing else."""

    @classmethod
    def _format_examples(cls, examples: list) -> str:
        """Render up to three expansion examples as prompt-ready text."""

        def render(entry: dict) -> str:
            # One example becomes a quoted breadcrumb plus its subtopic list.
            breadcrumb = " -> ".join(f'"{topic}"' for topic in entry["path"])
            return f"Path: {breadcrumb}\nSubtopics: {entry['subtopics']}"

        # Only the first three examples are used, to keep the prompt compact.
        return "\n\n".join(render(entry) for entry in examples[:3])
132
|
+
|
|
133
|
+
# Structured Agent Tool-Calling Prompt Builder
|
|
134
|
+
class AgentPromptBuilder:
    """Assemble structured-generation prompts for agent tool-calling training."""

    @staticmethod
    def build_tool_context_prompt(tool_registry, max_tools_per_query: int = 3) -> str:
        """Produce the single-example tool-calling context template.

        The result intentionally still contains literal quadruple-braced
        placeholders for instructions/subtopics; build_prompt() substitutes
        them later with actual topic paths from the tree.

        Args:
            tool_registry: Registry whose ``tools`` each expose ``to_signature()``.
            max_tools_per_query: Upper bound on tools the agent may invoke.
        """
        signature_block = "\n".join(f"- {tool.to_signature()}" for tool in tool_registry.tools)

        return f"""Generate a realistic agent training example with tool usage reasoning.

Available tools:
{signature_block}

You may use 1 to {max_tools_per_query} tools to complete the task.

Focus on WHY each tool is selected and HOW parameters are constructed.

ARGUMENT REQUIREMENTS:
- All argument values must be concrete and realistic (e.g., owner="acme-corp", repo="web-app", issue_number=42)
- Never use template placeholders like {{{{owner}}}} or {{{{repo}}}}
- Never use null values - omit optional parameters entirely if not needed
- String fields must contain actual content, not empty strings

{{{{{{{{instructions}}}}}}}}
{{{{{{{{subtopics}}}}}}}}

Generate a complete agent reasoning example using structured output with tool_executions list."""

    @staticmethod
    def build_multi_turn_context_prompt(tool_registry, max_tools_per_query: int = 3) -> str:
        """Produce the multi-turn conversation context template.

        Same placeholder contract as build_tool_context_prompt(): the
        quadruple-braced instructions/subtopics tokens are resolved later by
        build_prompt() with actual topic paths from the tree.

        Args:
            tool_registry: Registry whose ``tools`` each expose ``to_signature()``.
            max_tools_per_query: Upper bound on tools per query.
        """
        signature_block = "\n".join(f"- {tool.to_signature()}" for tool in tool_registry.tools)

        return f"""Generate a multi-turn agent conversation with evolving tool usage.

Available tools:
{signature_block}

You may use 1 to {max_tools_per_query} tools per query. Show tool dependencies and reasoning across conversation turns.

ARGUMENT REQUIREMENTS:
- All argument values must be concrete and realistic (e.g., owner="acme-corp", repo="web-app", issue_number=42)
- Never use template placeholders like {{{{owner}}}} or {{{{repo}}}}
- Never use null values - omit optional parameters entirely if not needed
- String fields must contain actual content, not empty strings

{{{{{{{{instructions}}}}}}}}
{{{{{{{{subtopics}}}}}}}}

Generate a complete multi-turn conversation using structured output with tool_executions list."""
|
+
|
|
198
|
+
# Simplified prompts that delegate to structured generation
# NOTE(review): the quadruple-braced tokens below (e.g. {{{{instructions}}}})
# are literal text in these plain strings — presumably reduced by a later
# template-substitution pass; verify against the generator that fills them.

# Single-shot agent tool-calling example focused on tool-selection reasoning.
AGENT_COT_TOOLS_PROMPT = """Generate an agent tool-calling training example using the available tool definitions.

You may use multiple tools (up to the specified limit) to complete the task.

Focus on the reasoning process: WHY each tool is selected, HOW parameters are constructed, and WHAT results are expected.

Create realistic scenarios that teach proper tool reasoning patterns and multi-tool orchestration.

{{{{instructions}}}}
{{{{examples}}}}
{{{{subtopics}}}}"""

# Tool-calling example that combines free-text CoT with structured reasoning
# steps and multiple tool executions.
AGENT_COT_HYBRID_PROMPT = """Generate agent tool-calling examples with rich CoT reasoning traces and tool execution.

You may use multiple tools (up to the specified limit) to complete the task.

Combine natural language reasoning with structured step-by-step traces that include:
- Chain of thought analysis
- Structured reasoning steps with thoughts and actions
- Clear tool selection and parameter reasoning
- Multiple tool executions with results

Focus on teaching both the reasoning process AND multi-tool usage patterns.

{{{{instructions}}}}
{{{{examples}}}}
{{{{subtopics}}}}"""

# Multi-turn agent conversation with tool usage evolving across turns.
AGENT_COT_MULTI_TURN_PROMPT = """Generate a multi-turn agent conversation with tool usage across turns.

Show how reasoning evolves: tool dependencies, progressive refinement, and result synthesis.

Create realistic tool chaining patterns and decision-making processes.

{{{{instructions}}}}
{{{{examples}}}}
{{{{subtopics}}}}"""

# Plain single-turn Q&A sample, parameterized by the target system prompt.
CONVERSATION_GENERATION_PROMPT = """Generate a training conversation for a language model with this system prompt:

<system_prompt>
{{{{system_prompt}}}}
</system_prompt>

Create a realistic single q&a that demonstrates the system's capabilities. The conversation should:
- Start with a user question/request
- Have the assistant respond helpfully according to the system prompt
- Be natural and educational

{{{{instructions}}}}
{{{{examples}}}}
{{{{subtopics}}}}

Generate one training sample as question and answer."""

# Subtopic expansion for the topic-graph builder, including cross-links to
# existing nodes.
# NOTE(review): this template uses double-braced placeholders, unlike the
# quadruple-braced templates above — confirm the pass that fills this one
# expects {{...}} rather than {{{{...}}}}.
GRAPH_EXPANSION_PROMPT = """
You are an expert in knowledge graph generation. Your task is to expand a topic into a set of subtopics. For each subtopic, you should also identify if it connects to any other existing topics in the graph.

Here is the current state of the graph:
{{current_graph_summary}}

You are expanding the topic: "{{current_topic}}"

Generate a list of {{num_subtopics}} subtopics. For each subtopic, provide:
1. A "topic" string - the name of the new subtopic
2. A "connections" list of IDs of existing topics it should connect to for creating cross-links (use empty list if no connections)
"""

# Chain of Thought prompts for reasoning-based dataset generation

# Reasoning problem stated for free-text (natural language) solutions.
FREETEXT_COT_PROMPT = """Generate a reasoning problem that requires analytical thinking to solve.

Create problems involving mathematics, logic, science, or analytical reasoning that can be solved through clear thinking steps.

{{{{instructions}}}}
{{{{examples}}}}
{{{{subtopics}}}}"""

# Conversation-style problem-solving with methodical reasoning.
STRUCTURED_COT_PROMPT = """Generate a training conversation that demonstrates systematic problem-solving.

Create realistic dialogues where complex problems are solved through methodical reasoning.

{{{{instructions}}}}
{{{{examples}}}}
{{{{subtopics}}}}"""

# Challenging reasoning problems mixing analytical and systematic thinking.
HYBRID_COT_PROMPT = """Generate problems that require analytical and systematic thinking.

Create challenging reasoning problems suitable for training systematic problem-solving skills.

{{{{instructions}}}}
{{{{examples}}}}
{{{{subtopics}}}}"""