DeepFabric 4.4.0 (deepfabric-4.4.0-py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71)
  1. deepfabric/__init__.py +70 -0
  2. deepfabric/__main__.py +6 -0
  3. deepfabric/auth.py +382 -0
  4. deepfabric/builders.py +303 -0
  5. deepfabric/builders_agent.py +1304 -0
  6. deepfabric/cli.py +1288 -0
  7. deepfabric/config.py +899 -0
  8. deepfabric/config_manager.py +251 -0
  9. deepfabric/constants.py +94 -0
  10. deepfabric/dataset_manager.py +534 -0
  11. deepfabric/error_codes.py +581 -0
  12. deepfabric/evaluation/__init__.py +47 -0
  13. deepfabric/evaluation/backends/__init__.py +32 -0
  14. deepfabric/evaluation/backends/ollama_backend.py +137 -0
  15. deepfabric/evaluation/backends/tool_call_parsers.py +409 -0
  16. deepfabric/evaluation/backends/transformers_backend.py +326 -0
  17. deepfabric/evaluation/evaluator.py +845 -0
  18. deepfabric/evaluation/evaluators/__init__.py +13 -0
  19. deepfabric/evaluation/evaluators/base.py +104 -0
  20. deepfabric/evaluation/evaluators/builtin/__init__.py +5 -0
  21. deepfabric/evaluation/evaluators/builtin/tool_calling.py +93 -0
  22. deepfabric/evaluation/evaluators/registry.py +66 -0
  23. deepfabric/evaluation/inference.py +155 -0
  24. deepfabric/evaluation/metrics.py +397 -0
  25. deepfabric/evaluation/parser.py +304 -0
  26. deepfabric/evaluation/reporters/__init__.py +13 -0
  27. deepfabric/evaluation/reporters/base.py +56 -0
  28. deepfabric/evaluation/reporters/cloud_reporter.py +195 -0
  29. deepfabric/evaluation/reporters/file_reporter.py +61 -0
  30. deepfabric/evaluation/reporters/multi_reporter.py +56 -0
  31. deepfabric/exceptions.py +67 -0
  32. deepfabric/factory.py +26 -0
  33. deepfabric/generator.py +1084 -0
  34. deepfabric/graph.py +545 -0
  35. deepfabric/hf_hub.py +214 -0
  36. deepfabric/kaggle_hub.py +219 -0
  37. deepfabric/llm/__init__.py +41 -0
  38. deepfabric/llm/api_key_verifier.py +534 -0
  39. deepfabric/llm/client.py +1206 -0
  40. deepfabric/llm/errors.py +105 -0
  41. deepfabric/llm/rate_limit_config.py +262 -0
  42. deepfabric/llm/rate_limit_detector.py +278 -0
  43. deepfabric/llm/retry_handler.py +270 -0
  44. deepfabric/metrics.py +212 -0
  45. deepfabric/progress.py +262 -0
  46. deepfabric/prompts.py +290 -0
  47. deepfabric/schemas.py +1000 -0
  48. deepfabric/spin/__init__.py +6 -0
  49. deepfabric/spin/client.py +263 -0
  50. deepfabric/spin/models.py +26 -0
  51. deepfabric/stream_simulator.py +90 -0
  52. deepfabric/tools/__init__.py +5 -0
  53. deepfabric/tools/defaults.py +85 -0
  54. deepfabric/tools/loader.py +87 -0
  55. deepfabric/tools/mcp_client.py +677 -0
  56. deepfabric/topic_manager.py +303 -0
  57. deepfabric/topic_model.py +20 -0
  58. deepfabric/training/__init__.py +35 -0
  59. deepfabric/training/api_key_prompt.py +302 -0
  60. deepfabric/training/callback.py +363 -0
  61. deepfabric/training/metrics_sender.py +301 -0
  62. deepfabric/tree.py +438 -0
  63. deepfabric/tui.py +1267 -0
  64. deepfabric/update_checker.py +166 -0
  65. deepfabric/utils.py +150 -0
  66. deepfabric/validation.py +143 -0
  67. deepfabric-4.4.0.dist-info/METADATA +702 -0
  68. deepfabric-4.4.0.dist-info/RECORD +71 -0
  69. deepfabric-4.4.0.dist-info/WHEEL +4 -0
  70. deepfabric-4.4.0.dist-info/entry_points.txt +2 -0
  71. deepfabric-4.4.0.dist-info/licenses/LICENSE +201 -0
deepfabric/evaluation/evaluators/__init__.py
@@ -0,0 +1,13 @@
+"""Evaluator system for assessing model outputs."""
+
+from .base import BaseEvaluator, EvaluationContext, EvaluatorResult
+from .builtin.tool_calling import ToolCallingEvaluator
+from .registry import EvaluatorRegistry
+
+__all__ = [
+    "BaseEvaluator",
+    "EvaluationContext",
+    "EvaluatorResult",
+    "EvaluatorRegistry",
+    "ToolCallingEvaluator",
+]
deepfabric/evaluation/evaluators/base.py
@@ -0,0 +1,104 @@
+"""Base classes for evaluation system."""
+
+from abc import ABC, abstractmethod
+from typing import Any
+
+from pydantic import BaseModel, Field
+
+from ...schemas import ToolDefinition
+from ..inference import ModelResponse
+from ..parser import GroundTruth
+
+
+class EvaluationContext(BaseModel):
+    """Context passed to evaluators."""
+
+    messages: list[dict[str, str]] = Field(description="Messages sent to model")
+    tools: list[ToolDefinition] | None = Field(
+        default=None,
+        description="Available tools for the evaluation",
+    )
+    sample_id: int = Field(description="Sample index in dataset")
+    metadata: dict[str, Any] = Field(
+        default_factory=dict,
+        description="Additional context metadata",
+    )
+
+
+class EvaluatorResult(BaseModel):
+    """Result from a single evaluator."""
+
+    evaluator_name: str = Field(description="Name of the evaluator")
+    metrics: dict[str, float] = Field(
+        description="Metrics produced by this evaluator",
+    )
+    details: dict[str, Any] = Field(
+        default_factory=dict,
+        description="Additional details about the evaluation",
+    )
+    error: str | None = Field(
+        default=None,
+        description="Error message if evaluation failed",
+    )
+
+
+class BaseEvaluator(ABC):
+    """Base class for all evaluators.
+
+    Evaluators assess specific aspects of model outputs (e.g., tool calling,
+    safety, answer quality). They are modular and can be enabled/disabled
+    via configuration.
+    """
+
+    def __init__(self, config: dict[str, Any] | None = None):
+        """Initialize evaluator with optional configuration.
+
+        Args:
+            config: Optional evaluator-specific configuration
+        """
+        self.config = config or {}
+
+    @abstractmethod
+    def get_name(self) -> str:
+        """Return unique identifier for this evaluator.
+
+        Returns:
+            Evaluator name (e.g., "tool_calling", "safety")
+        """
+
+    def get_metrics(self) -> list[str]:
+        """Return list of metric names this evaluator produces.
+
+        Returns:
+            List of metric names
+        """
+        return []
+
+    def applicable_to(self, ground_truth: GroundTruth) -> bool:  # noqa: ARG002
+        """Check if this evaluator should run for the given sample.
+
+        Args:
+            ground_truth: Ground truth for the sample
+
+        Returns:
+            True if evaluator should run, False to skip
+        """
+        return True
+
+    @abstractmethod
+    def evaluate(
+        self,
+        ground_truth: GroundTruth,
+        prediction: ModelResponse,
+        context: EvaluationContext,
+    ) -> EvaluatorResult | None:
+        """Evaluate a single sample.
+
+        Args:
+            ground_truth: Expected values from dataset
+            prediction: Model's generated response
+            context: Additional evaluation context
+
+        Returns:
+            EvaluatorResult with metrics and details, or None to skip
+        """
deepfabric/evaluation/evaluators/builtin/__init__.py
@@ -0,0 +1,5 @@
+"""Built-in evaluators."""
+
+from .tool_calling import ToolCallingEvaluator
+
+__all__ = ["ToolCallingEvaluator"]
deepfabric/evaluation/evaluators/builtin/tool_calling.py
@@ -0,0 +1,93 @@
+"""Tool calling evaluator for assessing function calling accuracy."""
+
+from ...inference import ModelResponse
+from ...metrics import compare_parameters
+from ...parser import GroundTruth
+from ..base import BaseEvaluator, EvaluationContext, EvaluatorResult
+
+
+class ToolCallingEvaluator(BaseEvaluator):
+    """Evaluates tool selection and parameter extraction accuracy.
+
+    This evaluator checks if the model:
+    1. Selects the correct tool
+    2. Extracts parameters correctly (with fuzzy matching)
+    3. Can execute the tool successfully (tool + params both correct)
+
+    Only applicable to samples with tool calls (skips samples without tools).
+    """
+
+    def get_name(self) -> str:
+        """Return evaluator identifier."""
+        return "tool_calling"
+
+    def get_metrics(self) -> list[str]:
+        """Return list of metrics this evaluator produces."""
+        return [
+            "tool_selection_accuracy",
+            "parameter_accuracy",
+            "execution_valid",
+        ]
+
+    def applicable_to(self, ground_truth: GroundTruth) -> bool:
+        """Only apply to samples with expected tool calls."""
+        return ground_truth.expected_tool is not None
+
+    def evaluate(
+        self,
+        ground_truth: GroundTruth,
+        prediction: ModelResponse,
+        context: EvaluationContext,
+    ) -> EvaluatorResult | None:
+        """Evaluate tool calling accuracy.
+
+        Args:
+            ground_truth: Expected tool and parameters
+            prediction: Model's generated response
+            context: Evaluation context with tool definitions
+
+        Returns:
+            EvaluatorResult with tool calling metrics
+        """
+        # Skip if not applicable
+        if not self.applicable_to(ground_truth):
+            return None
+
+        # Extract predicted tool and parameters
+        predicted_tool = None
+        predicted_params = {}
+        if prediction.tool_call:
+            predicted_tool = prediction.tool_call.get("name")
+            predicted_params = prediction.tool_call.get("arguments", {})
+
+        # Compute metrics
+        tool_correct = predicted_tool == ground_truth.expected_tool
+
+        # Validate parameters against the PREDICTED tool (not expected)
+        # This measures parameter extraction capability independently of tool selection
+        params_correct = compare_parameters(
+            ground_truth.expected_parameters,
+            predicted_params,
+            tool_name=predicted_tool,  # Use predicted tool for schema validation
+            tool_definitions=context.tools,
+        )
+
+        # Execution valid requires BOTH correct tool AND correct params
+        execution_valid = tool_correct and params_correct
+
+        return EvaluatorResult(
+            evaluator_name=self.get_name(),
+            metrics={
+                "tool_selection_accuracy": 1.0 if tool_correct else 0.0,
+                "parameter_accuracy": 1.0 if params_correct else 0.0,
+                "execution_valid": 1.0 if execution_valid else 0.0,
+            },
+            details={
+                "expected_tool": ground_truth.expected_tool,
+                "predicted_tool": predicted_tool,
+                "expected_parameters": ground_truth.expected_parameters,
+                "predicted_parameters": predicted_params,
+                "tool_match": tool_correct,
+                "params_match": params_correct,
+            },
+        )
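For orientation, a hedged usage sketch of the evaluator above follows; it is not part of the package diff. GroundTruth lives in deepfabric/evaluation/parser.py, which is not shown in this diff, so the keyword arguments used to construct it below are assumptions based only on the attributes referenced above (expected_tool, expected_parameters), and the exact scores depend on compare_parameters, whose fuzzy-matching rules are also not shown here.

from deepfabric.evaluation.evaluators import EvaluationContext, ToolCallingEvaluator
from deepfabric.evaluation.inference import ModelResponse
from deepfabric.evaluation.parser import GroundTruth  # constructor fields below are assumed

evaluator = ToolCallingEvaluator()
result = evaluator.evaluate(
    ground_truth=GroundTruth(  # assumed fields matching the attributes used above
        expected_tool="get_weather",
        expected_parameters={"city": "Paris"},
    ),
    prediction=ModelResponse(
        content="",
        raw_output='{"name": "get_weather", "arguments": {"city": "Paris"}}',
        tool_call={"name": "get_weather", "arguments": {"city": "Paris"}},
    ),
    context=EvaluationContext(
        messages=[{"role": "user", "content": "What's the weather in Paris?"}],
        sample_id=0,
    ),
)
# result.metrics holds the three names from get_metrics(); with an exact tool and
# parameter match as above, each metric should score 1.0.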
deepfabric/evaluation/evaluators/registry.py
@@ -0,0 +1,66 @@
+"""Registry for managing evaluators."""
+
+from .base import BaseEvaluator
+
+
+class EvaluatorRegistry:
+    """Registry for managing evaluators (similar to FormatterRegistry).
+
+    Provides a central place to register and retrieve evaluators.
+    Supports both built-in and custom evaluators.
+    """
+
+    def __init__(self):
+        """Initialize registry with built-in evaluators."""
+        self._evaluators: dict[str, type[BaseEvaluator]] = {}
+        self._register_builtin_evaluators()
+
+    def register(self, evaluator_class: type[BaseEvaluator]) -> None:
+        """Register an evaluator class.
+
+        Args:
+            evaluator_class: Evaluator class to register
+        """
+        # Create temporary instance to get name
+        temp_instance = evaluator_class()
+        name = temp_instance.get_name()
+        self._evaluators[name] = evaluator_class
+
+    def get(self, name: str, config: dict | None = None) -> BaseEvaluator:
+        """Get evaluator instance by name.
+
+        Args:
+            name: Evaluator name
+            config: Optional configuration for the evaluator
+
+        Returns:
+            Evaluator instance
+
+        Raises:
+            KeyError: If evaluator not found
+        """
+        if name not in self._evaluators:
+            available = ", ".join(self._evaluators.keys())
+            msg = f"Evaluator '{name}' not found. Available: {available}"
+            raise KeyError(msg)
+
+        evaluator_class = self._evaluators[name]
+        return evaluator_class(config=config)
+
+    def list_evaluators(self) -> list[str]:
+        """List all registered evaluator names.
+
+        Returns:
+            List of evaluator names
+        """
+        return list(self._evaluators.keys())
+
+    def _register_builtin_evaluators(self) -> None:
+        """Register built-in evaluators."""
+        from .builtin.tool_calling import ToolCallingEvaluator  # noqa: PLC0415
+
+        self.register(ToolCallingEvaluator)
+        # More built-in evaluators can be registered here in the future
+        # Future: self.register(AnswerQualityEvaluator)
+        # Future: self.register(SafetyEvaluator)
+        # Future: self.register(GuardrailsEvaluator)
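A short, hedged usage sketch of the registry follows (not part of the diff). One design consequence worth noting: register() instantiates the class once with no arguments just to read its name, so custom evaluators must be constructible without a config, which BaseEvaluator's optional config parameter already allows.

from deepfabric.evaluation.evaluators import EvaluatorRegistry, ToolCallingEvaluator

registry = EvaluatorRegistry()
print(registry.list_evaluators())  # expected to include "tool_calling"

# Built-in lookup; config is passed through to the evaluator's __init__.
tool_eval = registry.get("tool_calling", config={})
assert isinstance(tool_eval, ToolCallingEvaluator)

# Unknown names raise KeyError listing the available evaluators.
try:
    registry.get("safety")
except KeyError as exc:
    print(exc)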
deepfabric/evaluation/inference.py
@@ -0,0 +1,155 @@
+"""Model inference interfaces and implementations for evaluation."""
+
+from abc import ABC, abstractmethod
+from typing import Literal
+
+from pydantic import BaseModel, Field
+
+from ..schemas import ToolDefinition
+
+
+class InferenceConfig(BaseModel):
+    """Configuration for model inference."""
+
+    model_path: str = Field(
+        description="Path to model (local path or HuggingFace Hub ID)",
+    )
+    adapter_path: str | None = Field(
+        default=None,
+        description="Path to PEFT/LoRA adapter (if using adapter-based fine-tuning)",
+    )
+    backend: Literal["transformers", "ollama"] = Field(
+        default="transformers",
+        description="Inference backend to use",
+    )
+    use_unsloth: bool = Field(
+        default=False,
+        description="Use Unsloth for loading adapter (for adapters trained with Unsloth)",
+    )
+    max_seq_length: int = Field(
+        default=2048,
+        ge=1,
+        description="Maximum sequence length for Unsloth models",
+    )
+    load_in_4bit: bool = Field(
+        default=False,
+        description="Load model in 4-bit quantization (for Unsloth)",
+    )
+    temperature: float = Field(
+        default=0.7,
+        ge=0.0,
+        le=2.0,
+        description="Sampling temperature",
+    )
+    max_tokens: int = Field(
+        default=2048,
+        ge=1,
+        description="Maximum tokens to generate",
+    )
+    top_p: float = Field(
+        default=0.9,
+        ge=0.0,
+        le=1.0,
+        description="Nucleus sampling top-p",
+    )
+    device: str | None = Field(
+        default=None,
+        description="Device to use (cuda, cpu, etc.). None for auto-detection",
+    )
+    batch_size: int = Field(
+        default=1,
+        ge=1,
+        description="Batch size for inference",
+    )
+
+
+class ModelResponse(BaseModel):
+    """Model inference response."""
+
+    content: str = Field(description="Generated text content")
+    tool_call: dict | None = Field(
+        default=None,
+        description="Parsed tool call if present (first tool call for backwards compatibility)",
+    )
+    tool_calls: list[dict] | None = Field(
+        default=None,
+        description="All parsed tool calls if present (for multi-tool responses)",
+    )
+    raw_output: str = Field(description="Raw model output before parsing")
+    finish_reason: str | None = Field(
+        default=None,
+        description="Reason for completion (stop, length, etc.)",
+    )
+
+
+class InferenceBackend(ABC):
+    """Abstract base class for inference backends."""
+
+    def __init__(self, config: InferenceConfig):
+        """Initialize inference backend.
+
+        Args:
+            config: Inference configuration
+        """
+        self.config = config
+
+    @abstractmethod
+    def generate(
+        self,
+        messages: list[dict[str, str]],
+        tools: list[ToolDefinition] | None = None,
+    ) -> ModelResponse:
+        """Generate response from model.
+
+        Args:
+            messages: List of message dicts with 'role' and 'content'
+            tools: Optional list of available tools for function calling
+
+        Returns:
+            ModelResponse with generated content and parsed tool calls
+        """
+
+    @abstractmethod
+    def generate_batch(
+        self,
+        batch_messages: list[list[dict[str, str]]],
+        tools: list[ToolDefinition] | None = None,
+    ) -> list[ModelResponse]:
+        """Generate responses for a batch of message sequences.
+
+        Args:
+            batch_messages: List of message sequences
+            tools: Optional list of available tools for function calling
+
+        Returns:
+            List of ModelResponse objects
+        """
+
+    @abstractmethod
+    def cleanup(self) -> None:
+        """Clean up resources (GPU memory, etc.)."""
+
+
+def create_inference_backend(config: InferenceConfig) -> InferenceBackend:
+    """Factory function to create inference backend.
+
+    Args:
+        config: Inference configuration
+
+    Returns:
+        Initialized InferenceBackend instance
+
+    Raises:
+        ValueError: If backend type is not supported
+    """
+    if config.backend == "transformers":
+        from .backends.transformers_backend import TransformersBackend  # noqa: PLC0415
+
+        return TransformersBackend(config)
+    if config.backend == "ollama":
+        from .backends.ollama_backend import OllamaBackend  # noqa: PLC0415
+
+        return OllamaBackend(config)
+
+    msg = f"Unsupported backend: {config.backend}"
+    raise ValueError(msg)
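Finally, a hedged sketch of how the configuration, factory, and backend contract above could be wired together; it is not part of the diff. The model ID is a placeholder, and actually running it requires the chosen backend's dependencies (the transformers stack here) to be installed.

from deepfabric.evaluation.inference import InferenceConfig, create_inference_backend

config = InferenceConfig(
    model_path="Qwen/Qwen2.5-0.5B-Instruct",  # placeholder Hub ID for illustration
    backend="transformers",
    temperature=0.0,  # low temperature for more repeatable evaluation runs
    max_tokens=512,
)

backend = create_inference_backend(config)
try:
    response = backend.generate(
        messages=[{"role": "user", "content": "What's the weather in Paris?"}],
        tools=None,
    )
    print(response.content)
    print(response.tool_call)
finally:
    backend.cleanup()  # release resources per the InferenceBackend contract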