DeepFabric 4.4.0 (deepfabric-4.4.0-py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71)
  1. deepfabric/__init__.py +70 -0
  2. deepfabric/__main__.py +6 -0
  3. deepfabric/auth.py +382 -0
  4. deepfabric/builders.py +303 -0
  5. deepfabric/builders_agent.py +1304 -0
  6. deepfabric/cli.py +1288 -0
  7. deepfabric/config.py +899 -0
  8. deepfabric/config_manager.py +251 -0
  9. deepfabric/constants.py +94 -0
  10. deepfabric/dataset_manager.py +534 -0
  11. deepfabric/error_codes.py +581 -0
  12. deepfabric/evaluation/__init__.py +47 -0
  13. deepfabric/evaluation/backends/__init__.py +32 -0
  14. deepfabric/evaluation/backends/ollama_backend.py +137 -0
  15. deepfabric/evaluation/backends/tool_call_parsers.py +409 -0
  16. deepfabric/evaluation/backends/transformers_backend.py +326 -0
  17. deepfabric/evaluation/evaluator.py +845 -0
  18. deepfabric/evaluation/evaluators/__init__.py +13 -0
  19. deepfabric/evaluation/evaluators/base.py +104 -0
  20. deepfabric/evaluation/evaluators/builtin/__init__.py +5 -0
  21. deepfabric/evaluation/evaluators/builtin/tool_calling.py +93 -0
  22. deepfabric/evaluation/evaluators/registry.py +66 -0
  23. deepfabric/evaluation/inference.py +155 -0
  24. deepfabric/evaluation/metrics.py +397 -0
  25. deepfabric/evaluation/parser.py +304 -0
  26. deepfabric/evaluation/reporters/__init__.py +13 -0
  27. deepfabric/evaluation/reporters/base.py +56 -0
  28. deepfabric/evaluation/reporters/cloud_reporter.py +195 -0
  29. deepfabric/evaluation/reporters/file_reporter.py +61 -0
  30. deepfabric/evaluation/reporters/multi_reporter.py +56 -0
  31. deepfabric/exceptions.py +67 -0
  32. deepfabric/factory.py +26 -0
  33. deepfabric/generator.py +1084 -0
  34. deepfabric/graph.py +545 -0
  35. deepfabric/hf_hub.py +214 -0
  36. deepfabric/kaggle_hub.py +219 -0
  37. deepfabric/llm/__init__.py +41 -0
  38. deepfabric/llm/api_key_verifier.py +534 -0
  39. deepfabric/llm/client.py +1206 -0
  40. deepfabric/llm/errors.py +105 -0
  41. deepfabric/llm/rate_limit_config.py +262 -0
  42. deepfabric/llm/rate_limit_detector.py +278 -0
  43. deepfabric/llm/retry_handler.py +270 -0
  44. deepfabric/metrics.py +212 -0
  45. deepfabric/progress.py +262 -0
  46. deepfabric/prompts.py +290 -0
  47. deepfabric/schemas.py +1000 -0
  48. deepfabric/spin/__init__.py +6 -0
  49. deepfabric/spin/client.py +263 -0
  50. deepfabric/spin/models.py +26 -0
  51. deepfabric/stream_simulator.py +90 -0
  52. deepfabric/tools/__init__.py +5 -0
  53. deepfabric/tools/defaults.py +85 -0
  54. deepfabric/tools/loader.py +87 -0
  55. deepfabric/tools/mcp_client.py +677 -0
  56. deepfabric/topic_manager.py +303 -0
  57. deepfabric/topic_model.py +20 -0
  58. deepfabric/training/__init__.py +35 -0
  59. deepfabric/training/api_key_prompt.py +302 -0
  60. deepfabric/training/callback.py +363 -0
  61. deepfabric/training/metrics_sender.py +301 -0
  62. deepfabric/tree.py +438 -0
  63. deepfabric/tui.py +1267 -0
  64. deepfabric/update_checker.py +166 -0
  65. deepfabric/utils.py +150 -0
  66. deepfabric/validation.py +143 -0
  67. deepfabric-4.4.0.dist-info/METADATA +702 -0
  68. deepfabric-4.4.0.dist-info/RECORD +71 -0
  69. deepfabric-4.4.0.dist-info/WHEEL +4 -0
  70. deepfabric-4.4.0.dist-info/entry_points.txt +2 -0
  71. deepfabric-4.4.0.dist-info/licenses/LICENSE +201 -0
deepfabric/evaluation/evaluators/__init__.py
@@ -0,0 +1,13 @@
+"""Evaluator system for assessing model outputs."""
+
+from .base import BaseEvaluator, EvaluationContext, EvaluatorResult
+from .builtin.tool_calling import ToolCallingEvaluator
+from .registry import EvaluatorRegistry
+
+__all__ = [
+    "BaseEvaluator",
+    "EvaluationContext",
+    "EvaluatorResult",
+    "EvaluatorRegistry",
+    "ToolCallingEvaluator",
+]
deepfabric/evaluation/evaluators/base.py
@@ -0,0 +1,104 @@
+"""Base classes for evaluation system."""
+
+from abc import ABC, abstractmethod
+from typing import Any
+
+from pydantic import BaseModel, Field
+
+from ...schemas import ToolDefinition
+from ..inference import ModelResponse
+from ..parser import GroundTruth
+
+
+class EvaluationContext(BaseModel):
+    """Context passed to evaluators."""
+
+    messages: list[dict[str, str]] = Field(description="Messages sent to model")
+    tools: list[ToolDefinition] | None = Field(
+        default=None,
+        description="Available tools for the evaluation",
+    )
+    sample_id: int = Field(description="Sample index in dataset")
+    metadata: dict[str, Any] = Field(
+        default_factory=dict,
+        description="Additional context metadata",
+    )
+
+
+class EvaluatorResult(BaseModel):
+    """Result from a single evaluator."""
+
+    evaluator_name: str = Field(description="Name of the evaluator")
+    metrics: dict[str, float] = Field(
+        description="Metrics produced by this evaluator",
+    )
+    details: dict[str, Any] = Field(
+        default_factory=dict,
+        description="Additional details about the evaluation",
+    )
+    error: str | None = Field(
+        default=None,
+        description="Error message if evaluation failed",
+    )
+
+
+class BaseEvaluator(ABC):
+    """Base class for all evaluators.
+
+    Evaluators assess specific aspects of model outputs (e.g., tool calling,
+    safety, answer quality). They are modular and can be enabled/disabled
+    via configuration.
+    """
+
+    def __init__(self, config: dict[str, Any] | None = None):
+        """Initialize evaluator with optional configuration.
+
+        Args:
+            config: Optional evaluator-specific configuration
+        """
+        self.config = config or {}
+
+    @abstractmethod
+    def get_name(self) -> str:
+        """Return unique identifier for this evaluator.
+
+        Returns:
+            Evaluator name (e.g., "tool_calling", "safety")
+        """
+
+    def get_metrics(self) -> list[str]:
+        """Return list of metric names this evaluator produces.
+
+        Returns:
+            List of metric names
+        """
+        return []
+
+    def applicable_to(self, ground_truth: GroundTruth) -> bool:  # noqa: ARG002
+        """Check if this evaluator should run for the given sample.
+
+        Args:
+            ground_truth: Ground truth for the sample
+
+        Returns:
+            True if evaluator should run, False to skip
+        """
+        return True
+
+    @abstractmethod
+    def evaluate(
+        self,
+        ground_truth: GroundTruth,
+        prediction: ModelResponse,
+        context: EvaluationContext,
+    ) -> EvaluatorResult | None:
+        """Evaluate a single sample.
+
+        Args:
+            ground_truth: Expected values from dataset
+            prediction: Model's generated response
+            context: Additional evaluation context
+
+        Returns:
+            EvaluatorResult with metrics and details, or None to skip
+        """
deepfabric/evaluation/evaluators/builtin/__init__.py
@@ -0,0 +1,5 @@
+"""Built-in evaluators."""
+
+from .tool_calling import ToolCallingEvaluator
+
+__all__ = ["ToolCallingEvaluator"]
deepfabric/evaluation/evaluators/builtin/tool_calling.py
@@ -0,0 +1,93 @@
+"""Tool calling evaluator for assessing function calling accuracy."""
+
+from ...inference import ModelResponse
+from ...metrics import compare_parameters
+from ...parser import GroundTruth
+from ..base import BaseEvaluator, EvaluationContext, EvaluatorResult
+
+
+class ToolCallingEvaluator(BaseEvaluator):
+    """Evaluates tool selection and parameter extraction accuracy.
+
+    This evaluator checks if the model:
+    1. Selects the correct tool
+    2. Extracts parameters correctly (with fuzzy matching)
+    3. Can execute the tool successfully (tool + params both correct)
+
+    Only applicable to samples with tool calls (skips samples without tools).
+    """
+
+    def get_name(self) -> str:
+        """Return evaluator identifier."""
+        return "tool_calling"
+
+    def get_metrics(self) -> list[str]:
+        """Return list of metrics this evaluator produces."""
+        return [
+            "tool_selection_accuracy",
+            "parameter_accuracy",
+            "execution_valid",
+        ]
+
+    def applicable_to(self, ground_truth: GroundTruth) -> bool:
+        """Only apply to samples with expected tool calls."""
+        return ground_truth.expected_tool is not None
+
+    def evaluate(
+        self,
+        ground_truth: GroundTruth,
+        prediction: ModelResponse,
+        context: EvaluationContext,
+    ) -> EvaluatorResult | None:
+        """Evaluate tool calling accuracy.
+
+        Args:
+            ground_truth: Expected tool and parameters
+            prediction: Model's generated response
+            context: Evaluation context with tool definitions
+
+        Returns:
+            EvaluatorResult with tool calling metrics
+        """
+        # Skip if not applicable
+        if not self.applicable_to(ground_truth):
+            return None
+
+        # Extract predicted tool and parameters
+        predicted_tool = None
+        predicted_params = {}
+        if prediction.tool_call:
+            predicted_tool = prediction.tool_call.get("name")
+            predicted_params = prediction.tool_call.get("arguments", {})
+
+        # Compute metrics
+        tool_correct = predicted_tool == ground_truth.expected_tool
+
+        # Validate parameters against the PREDICTED tool (not expected)
+        # This measures parameter extraction capability independently of tool selection
+        params_correct = compare_parameters(
+            ground_truth.expected_parameters,
+            predicted_params,
+            tool_name=predicted_tool,  # Use predicted tool for schema validation
+            tool_definitions=context.tools,
+        )
+
+        # Execution valid requires BOTH correct tool AND correct params
+        execution_valid = tool_correct and params_correct
+
+        return EvaluatorResult(
+            evaluator_name=self.get_name(),
+            metrics={
+                "tool_selection_accuracy": 1.0 if tool_correct else 0.0,
+                "parameter_accuracy": 1.0 if params_correct else 0.0,
+                "execution_valid": 1.0 if execution_valid else 0.0,
+            },
+            details={
+                "expected_tool": ground_truth.expected_tool,
+                "predicted_tool": predicted_tool,
+                "expected_parameters": ground_truth.expected_parameters,
+                "predicted_parameters": predicted_params,
+                "tool_match": tool_correct,
+                "params_match": params_correct,
+            },
+        )
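For orientation, a hedged usage sketch of the evaluator above follows; it is not part of the package diff. GroundTruth lives in deepfabric/evaluation/parser.py, which is not shown in this diff, so the keyword arguments used to construct it below are assumptions based only on the attributes referenced above (expected_tool, expected_parameters), and the exact scores depend on compare_parameters, whose fuzzy-matching rules are also not shown here.

from deepfabric.evaluation.evaluators import EvaluationContext, ToolCallingEvaluator
from deepfabric.evaluation.inference import ModelResponse
from deepfabric.evaluation.parser import GroundTruth  # constructor fields below are assumed

evaluator = ToolCallingEvaluator()
result = evaluator.evaluate(
    ground_truth=GroundTruth(  # assumed fields matching the attributes used above
        expected_tool="get_weather",
        expected_parameters={"city": "Paris"},
    ),
    prediction=ModelResponse(
        content="",
        raw_output='{"name": "get_weather", "arguments": {"city": "Paris"}}',
        tool_call={"name": "get_weather", "arguments": {"city": "Paris"}},
    ),
    context=EvaluationContext(
        messages=[{"role": "user", "content": "What's the weather in Paris?"}],
        sample_id=0,
    ),
)
# result.metrics holds the three names from get_metrics(); with an exact tool and
# parameter match as above, each metric should score 1.0.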
deepfabric/evaluation/evaluators/registry.py
@@ -0,0 +1,66 @@
+"""Registry for managing evaluators."""
+
+from .base import BaseEvaluator
+
+
+class EvaluatorRegistry:
+    """Registry for managing evaluators (similar to FormatterRegistry).
+
+    Provides a central place to register and retrieve evaluators.
+    Supports both built-in and custom evaluators.
+    """
+
+    def __init__(self):
+        """Initialize registry with built-in evaluators."""
+        self._evaluators: dict[str, type[BaseEvaluator]] = {}
+        self._register_builtin_evaluators()
+
+    def register(self, evaluator_class: type[BaseEvaluator]) -> None:
+        """Register an evaluator class.
+
+        Args:
+            evaluator_class: Evaluator class to register
+        """
+        # Create temporary instance to get name
+        temp_instance = evaluator_class()
+        name = temp_instance.get_name()
+        self._evaluators[name] = evaluator_class
+
+    def get(self, name: str, config: dict | None = None) -> BaseEvaluator:
+        """Get evaluator instance by name.
+
+        Args:
+            name: Evaluator name
+            config: Optional configuration for the evaluator
+
+        Returns:
+            Evaluator instance
+
+        Raises:
+            KeyError: If evaluator not found
+        """
+        if name not in self._evaluators:
+            available = ", ".join(self._evaluators.keys())
+            msg = f"Evaluator '{name}' not found. Available: {available}"
+            raise KeyError(msg)
+
+        evaluator_class = self._evaluators[name]
+        return evaluator_class(config=config)
+
+    def list_evaluators(self) -> list[str]:
+        """List all registered evaluator names.
+
+        Returns:
+            List of evaluator names
+        """
+        return list(self._evaluators.keys())
+
+    def _register_builtin_evaluators(self) -> None:
+        """Register built-in evaluators."""
+        from .builtin.tool_calling import ToolCallingEvaluator  # noqa: PLC0415
+
+        self.register(ToolCallingEvaluator)
+        # More built-in evaluators can be registered here in the future
+        # Future: self.register(AnswerQualityEvaluator)
+        # Future: self.register(SafetyEvaluator)
+        # Future: self.register(GuardrailsEvaluator)
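A short, hedged usage sketch of the registry follows (not part of the diff). One design consequence worth noting: register() instantiates the class once with no arguments just to read its name, so custom evaluators must be constructible without a config, which BaseEvaluator's optional config parameter already allows.

from deepfabric.evaluation.evaluators import EvaluatorRegistry, ToolCallingEvaluator

registry = EvaluatorRegistry()
print(registry.list_evaluators())  # expected to include "tool_calling"

# Built-in lookup; config is passed through to the evaluator's __init__.
tool_eval = registry.get("tool_calling", config={})
assert isinstance(tool_eval, ToolCallingEvaluator)

# Unknown names raise KeyError listing the available evaluators.
try:
    registry.get("safety")
except KeyError as exc:
    print(exc)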
deepfabric/evaluation/inference.py
@@ -0,0 +1,155 @@
+"""Model inference interfaces and implementations for evaluation."""
+
+from abc import ABC, abstractmethod
+from typing import Literal
+
+from pydantic import BaseModel, Field
+
+from ..schemas import ToolDefinition
+
+
+class InferenceConfig(BaseModel):
+    """Configuration for model inference."""
+
+    model_path: str = Field(
+        description="Path to model (local path or HuggingFace Hub ID)",
+    )
+    adapter_path: str | None = Field(
+        default=None,
+        description="Path to PEFT/LoRA adapter (if using adapter-based fine-tuning)",
+    )
+    backend: Literal["transformers", "ollama"] = Field(
+        default="transformers",
+        description="Inference backend to use",
+    )
+    use_unsloth: bool = Field(
+        default=False,
+        description="Use Unsloth for loading adapter (for adapters trained with Unsloth)",
+    )
+    max_seq_length: int = Field(
+        default=2048,
+        ge=1,
+        description="Maximum sequence length for Unsloth models",
+    )
+    load_in_4bit: bool = Field(
+        default=False,
+        description="Load model in 4-bit quantization (for Unsloth)",
+    )
+    temperature: float = Field(
+        default=0.7,
+        ge=0.0,
+        le=2.0,
+        description="Sampling temperature",
+    )
+    max_tokens: int = Field(
+        default=2048,
+        ge=1,
+        description="Maximum tokens to generate",
+    )
+    top_p: float = Field(
+        default=0.9,
+        ge=0.0,
+        le=1.0,
+        description="Nucleus sampling top-p",
+    )
+    device: str | None = Field(
+        default=None,
+        description="Device to use (cuda, cpu, etc.). None for auto-detection",
+    )
+    batch_size: int = Field(
+        default=1,
+        ge=1,
+        description="Batch size for inference",
+    )
+
+
+class ModelResponse(BaseModel):
+    """Model inference response."""
+
+    content: str = Field(description="Generated text content")
+    tool_call: dict | None = Field(
+        default=None,
+        description="Parsed tool call if present (first tool call for backwards compatibility)",
+    )
+    tool_calls: list[dict] | None = Field(
+        default=None,
+        description="All parsed tool calls if present (for multi-tool responses)",
+    )
+    raw_output: str = Field(description="Raw model output before parsing")
+    finish_reason: str | None = Field(
+        default=None,
+        description="Reason for completion (stop, length, etc.)",
+    )
+
+
+class InferenceBackend(ABC):
+    """Abstract base class for inference backends."""
+
+    def __init__(self, config: InferenceConfig):
+        """Initialize inference backend.
+
+        Args:
+            config: Inference configuration
+        """
+        self.config = config
+
+    @abstractmethod
+    def generate(
+        self,
+        messages: list[dict[str, str]],
+        tools: list[ToolDefinition] | None = None,
+    ) -> ModelResponse:
+        """Generate response from model.
+
+        Args:
+            messages: List of message dicts with 'role' and 'content'
+            tools: Optional list of available tools for function calling
+
+        Returns:
+            ModelResponse with generated content and parsed tool calls
+        """
+
+    @abstractmethod
+    def generate_batch(
+        self,
+        batch_messages: list[list[dict[str, str]]],
+        tools: list[ToolDefinition] | None = None,
+    ) -> list[ModelResponse]:
+        """Generate responses for a batch of message sequences.
+
+        Args:
+            batch_messages: List of message sequences
+            tools: Optional list of available tools for function calling
+
+        Returns:
+            List of ModelResponse objects
+        """
+
+    @abstractmethod
+    def cleanup(self) -> None:
+        """Clean up resources (GPU memory, etc.)."""
+
+
+def create_inference_backend(config: InferenceConfig) -> InferenceBackend:
+    """Factory function to create inference backend.
+
+    Args:
+        config: Inference configuration
+
+    Returns:
+        Initialized InferenceBackend instance
+
+    Raises:
+        ValueError: If backend type is not supported
+    """
+    if config.backend == "transformers":
+        from .backends.transformers_backend import TransformersBackend  # noqa: PLC0415
+
+        return TransformersBackend(config)
+    if config.backend == "ollama":
+        from .backends.ollama_backend import OllamaBackend  # noqa: PLC0415
+
+        return OllamaBackend(config)
+
+    msg = f"Unsupported backend: {config.backend}"
+    raise ValueError(msg)
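Finally, a hedged sketch of how the configuration, factory, and backend contract above could be wired together; it is not part of the diff. The model ID is a placeholder, and actually running it requires the chosen backend's dependencies (the transformers stack here) to be installed.

from deepfabric.evaluation.inference import InferenceConfig, create_inference_backend

config = InferenceConfig(
    model_path="Qwen/Qwen2.5-0.5B-Instruct",  # placeholder Hub ID for illustration
    backend="transformers",
    temperature=0.0,  # low temperature for more repeatable evaluation runs
    max_tokens=512,
)

backend = create_inference_backend(config)
try:
    response = backend.generate(
        messages=[{"role": "user", "content": "What's the weather in Paris?"}],
        tools=None,
    )
    print(response.content)
    print(response.tool_call)
finally:
    backend.cleanup()  # release resources per the InferenceBackend contract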