DeepFabric 4.4.1-py3-none-any.whl → 4.6.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. deepfabric/__init__.py +8 -0
  2. deepfabric/auth.py +8 -2
  3. deepfabric/builders.py +2 -2
  4. deepfabric/builders_agent.py +18 -6
  5. deepfabric/cli.py +292 -13
  6. deepfabric/cloud_upload.py +884 -0
  7. deepfabric/config.py +47 -20
  8. deepfabric/config_manager.py +2 -2
  9. deepfabric/dataset.py +302 -0
  10. deepfabric/evaluation/backends/__init__.py +2 -0
  11. deepfabric/evaluation/backends/llm_eval_backend.py +527 -0
  12. deepfabric/evaluation/backends/ollama_backend.py +3 -3
  13. deepfabric/evaluation/backends/tool_call_parsers.py +7 -7
  14. deepfabric/evaluation/backends/transformers_backend.py +73 -16
  15. deepfabric/evaluation/evaluator.py +41 -7
  16. deepfabric/evaluation/evaluators/builtin/tool_calling.py +13 -8
  17. deepfabric/evaluation/inference.py +77 -5
  18. deepfabric/evaluation/metrics.py +4 -0
  19. deepfabric/evaluation/parser.py +8 -8
  20. deepfabric/evaluation/reporters/cloud_reporter.py +19 -6
  21. deepfabric/exceptions.py +14 -0
  22. deepfabric/generator.py +8 -4
  23. deepfabric/graph.py +38 -0
  24. deepfabric/hf_hub.py +1 -1
  25. deepfabric/loader.py +554 -0
  26. deepfabric/schemas.py +7 -7
  27. deepfabric/topic_manager.py +4 -0
  28. deepfabric/training/__init__.py +24 -5
  29. deepfabric/training/callback.py +43 -1
  30. deepfabric/training/dataset_utils.py +223 -0
  31. deepfabric/training/metrics_sender.py +50 -16
  32. deepfabric/tui.py +9 -1
  33. deepfabric/utils.py +14 -0
  34. deepfabric/validation.py +1 -1
  35. {deepfabric-4.4.1.dist-info → deepfabric-4.6.0.dist-info}/METADATA +84 -177
  36. {deepfabric-4.4.1.dist-info → deepfabric-4.6.0.dist-info}/RECORD +39 -34
  37. {deepfabric-4.4.1.dist-info → deepfabric-4.6.0.dist-info}/WHEEL +0 -0
  38. {deepfabric-4.4.1.dist-info → deepfabric-4.6.0.dist-info}/entry_points.txt +0 -0
  39. {deepfabric-4.4.1.dist-info → deepfabric-4.6.0.dist-info}/licenses/LICENSE +0 -0
deepfabric/evaluation/backends/transformers_backend.py CHANGED
@@ -1,5 +1,6 @@
 import json
 import logging
+import sys
 
 from typing import Any
 
@@ -36,9 +37,15 @@ class TransformersBackend(InferenceBackend):
         """
         super().__init__(config)
 
+        # Check if model is pre-loaded (not a string path)
+        is_preloaded = not isinstance(config.model, str)
+
         # Determine device
         if config.device:
             self.device = config.device
+        elif is_preloaded:
+            # Get device from pre-loaded model
+            self.device = str(next(config.model.parameters()).device)
        # Auto-detect best available device
        elif torch.cuda.is_available():
            self.device = "cuda"
@@ -48,7 +55,7 @@ class TransformersBackend(InferenceBackend):
            self.device = "cpu"
 
        # Determine dtype based on device
-        if self.device == "cuda":
+        if self.device == "cuda" or self.device.startswith("cuda:"):
            dtype = torch.float16
            device_map = "auto"
        elif self.device == "mps":
@@ -58,11 +65,36 @@ class TransformersBackend(InferenceBackend):
            dtype = torch.float32
            device_map = None
 
+        # Handle pre-loaded model case - skip all loading logic
+        if is_preloaded:
+            self.model = config.model
+            self.tokenizer = config.tokenizer
+            self.loaded_with_unsloth = False
+
+            # Detect architecture from pre-loaded model's config
+            self._architectures = []
+            if hasattr(self.model, "config"):
+                self._architectures = getattr(self.model.config, "architectures", []) or []
+
+            # Initialize tool call parser
+            self._tool_call_parser: ToolCallParser = get_parser(self._architectures)
+            logger.info(
+                "Using pre-loaded model with %s parser for architectures: %s",
+                type(self._tool_call_parser).__name__,
+                self._architectures or ["unknown"],
+            )
+
+            # Set padding token if not set
+            if self.tokenizer.pad_token is None:
+                self.tokenizer.pad_token = self.tokenizer.eos_token
+
+            return  # Skip remaining initialization
+
        # Detect model architecture for parser selection and tokenizer config
-        self._architectures: list[str] = []
+        self._architectures = []
        tokenizer_kwargs: dict[str, Any] = {}
        try:
-            model_config = AutoConfig.from_pretrained(config.model_path)  # nosec
+            model_config = AutoConfig.from_pretrained(config.model)  # nosec
            self._architectures = getattr(model_config, "architectures", []) or []
            if any(arch in MISTRAL_ARCHITECTURES for arch in self._architectures):
                tokenizer_kwargs["fix_mistral_regex"] = True
@@ -71,7 +103,7 @@ class TransformersBackend(InferenceBackend):
            logger.warning("Could not detect model architecture: %s", e)
 
        # Initialize tool call parser based on detected architecture
-        self._tool_call_parser: ToolCallParser = get_parser(self._architectures)
+        self._tool_call_parser = get_parser(self._architectures)
        logger.info(
            "Using %s for model architectures: %s",
            type(self._tool_call_parser).__name__,
@@ -79,19 +111,44 @@ class TransformersBackend(InferenceBackend):
        )
 
        self.loaded_with_unsloth = False
-        # Load with Unsloth if requested
-        if config.use_unsloth:
+
+        # Detect if Unsloth has already patched the environment
+        # This happens when user imports unsloth in the same runtime
+        unsloth_patched = "unsloth" in sys.modules
+
+        # Use Unsloth if explicitly requested OR if Unsloth has patched the environment
+        # (to avoid "apply_qkv" errors from patched attention classes)
+        use_unsloth_loading = config.use_unsloth or unsloth_patched
+
+        if use_unsloth_loading:
            try:
                from unsloth import FastLanguageModel  # type: ignore # noqa: PLC0415
 
-                # Load from adapter path if provided, otherwise from model_path
-                load_path = config.adapter_path if config.adapter_path else config.model_path
-                self.model, self.tokenizer = FastLanguageModel.from_pretrained(
-                    model_name=load_path,
-                    max_seq_length=config.max_seq_length,
-                    dtype=dtype,
-                    load_in_4bit=config.load_in_4bit,
-                )
+                if unsloth_patched and not config.use_unsloth:
+                    logger.info(
+                        "Unsloth detected in environment, using Unsloth loader for compatibility"
+                    )
+
+                if config.adapter_path:
+                    # Load base model first, then apply adapter
+                    self.model, self.tokenizer = FastLanguageModel.from_pretrained(
+                        model_name=config.model,
+                        max_seq_length=config.max_seq_length,
+                        dtype=dtype,
+                        load_in_4bit=config.load_in_4bit,
+                    )
+                    # Load LoRA adapter using PEFT
+                    from peft import PeftModel  # noqa: PLC0415
+
+                    self.model = PeftModel.from_pretrained(self.model, config.adapter_path)
+                else:
+                    # Load merged model or base model directly
+                    self.model, self.tokenizer = FastLanguageModel.from_pretrained(
+                        model_name=config.model,
+                        max_seq_length=config.max_seq_length,
+                        dtype=dtype,
+                        load_in_4bit=config.load_in_4bit,
+                    )
                FastLanguageModel.for_inference(self.model)
                self.loaded_with_unsloth = True
            except ImportError:
@@ -104,11 +161,11 @@ class TransformersBackend(InferenceBackend):
        # Standard transformers/PEFT loading
        if not self.loaded_with_unsloth:
            self.tokenizer = AutoTokenizer.from_pretrained(  # nosec
-                config.model_path, **tokenizer_kwargs
+                config.model, **tokenizer_kwargs
            )
 
            self.model = AutoModelForCausalLM.from_pretrained(  # nosec
-                config.model_path,
+                config.model,
                device_map=device_map,
                dtype=dtype,
            )
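
The pre-loaded path above can be exercised by handing an already-instantiated model and tokenizer to the inference config instead of a string path. The following is a minimal sketch, not taken from the package documentation: the checkpoint name is a placeholder, and it assumes transformers and torch are installed. InferenceConfig and create_inference_backend live in deepfabric/evaluation/inference.py, as shown later in this diff.

# Illustrative sketch only; checkpoint id is a placeholder.
from transformers import AutoModelForCausalLM, AutoTokenizer

from deepfabric.evaluation.inference import InferenceConfig, create_inference_backend

checkpoint = "Qwen/Qwen2.5-0.5B-Instruct"  # placeholder model id
model = AutoModelForCausalLM.from_pretrained(checkpoint)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

config = InferenceConfig(
    model=model,          # pre-loaded object instead of a path string
    tokenizer=tokenizer,  # required whenever model is not a string
    backend="transformers",
)
backend = create_inference_backend(config)  # reuses the in-memory weights, no reload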
deepfabric/evaluation/evaluator.py CHANGED
@@ -36,12 +36,12 @@ class EvaluatorConfig(BaseModel):
        default=None,
        description="Path to save evaluation results",
    )
-    model_path: str | None = Field(
+    model: str | None = Field(
        default=None,
-        description="Path to model to evaluate (overrides inference_config.model_path)",
+        description="Model to evaluate (overrides inference_config.model)",
    )
    inference_config: InferenceConfig = Field(
-        description="Inference backend configuration (includes model_path)",
+        description="Inference backend configuration (includes model)",
    )
    batch_size: int = Field(
        default=1,
@@ -119,7 +119,7 @@ class Evaluator:
            "evaluator_created",
            {
                "backend": self.config.inference_config.backend,
-                "model_path": self.config.inference_config.model_path,
+                "model": self.config.inference_config.model,
                "has_adapter": self.config.inference_config.adapter_path is not None,
                "evaluators": (
                    list(self.config.evaluators)
@@ -434,6 +434,7 @@ class Evaluator:
                ground_truth=ground_truth,
                response=response,
                evaluator_results=evaluator_results,
+                tools=tools,
            )
 
        except Exception as e:  # noqa: BLE001
@@ -442,8 +443,9 @@ class Evaluator:
            expected_tool = None
            expected_params: dict[str, Any] = {}
            expected_answer = None
+            available_tool_names: list[str] = []
 
-            # Try to extract ground truth if available
+            # Try to extract ground truth and tools if available
            try:
                gt = self.extract_ground_truth(sample)
                query = gt.query
@@ -453,9 +455,16 @@ class Evaluator:
            except (KeyError, AttributeError, ValidationError):
                pass
 
+            try:
+                tools = self.prepare_tools(sample)
+                available_tool_names = [t.name for t in tools]
+            except (KeyError, AttributeError, ValidationError):
+                pass
+
            return SampleEvaluation(
                sample_id=sample_id,
                query=query,
+                available_tools=available_tool_names,
                expected_tool=expected_tool,
                predicted_tool=None,
                expected_parameters=expected_params,
@@ -560,6 +569,7 @@ class Evaluator:
                ground_truth=ground_truth,
                predicted_tool_calls=all_predicted_tool_calls,
                final_content=final_content,
+                tools=tools,
            )
 
        except Exception as e:  # noqa: BLE001
@@ -568,6 +578,7 @@ class Evaluator:
            expected_tool = None
            expected_params: dict[str, Any] = {}
            expected_answer = None
+            available_tool_names: list[str] = []
 
            try:
                gt = self.extract_ground_truth(sample)
@@ -578,9 +589,16 @@ class Evaluator:
            except (KeyError, AttributeError, ValidationError):
                pass
 
+            try:
+                tools = self.prepare_tools(sample)
+                available_tool_names = [t.name for t in tools]
+            except (KeyError, AttributeError, ValidationError):
+                pass
+
            return SampleEvaluation(
                sample_id=sample_id,
                query=query,
+                available_tools=available_tool_names,
                expected_tool=expected_tool,
                predicted_tool=None,
                expected_parameters=expected_params,
@@ -600,6 +618,7 @@ class Evaluator:
        ground_truth: GroundTruth,
        predicted_tool_calls: list[dict],
        final_content: str,
+        tools: list[ToolDefinition] | None = None,
    ) -> SampleEvaluation:
        """Compute metrics for multi-turn evaluation.
 
@@ -610,6 +629,7 @@ class Evaluator:
            ground_truth: Expected values including all expected tools
            predicted_tool_calls: All tool calls made by model across turns
            final_content: Final model response content
+            tools: List of available tools for this sample
 
        Returns:
            SampleEvaluation with computed metrics
@@ -652,9 +672,13 @@ class Evaluator:
        # Execution valid if we got through the conversation
        execution_valid = len(predicted_tool_calls) > 0 or final_content != ""
 
+        # Extract tool names for available_tools field
+        available_tool_names = [t.name for t in tools] if tools else []
+
        return SampleEvaluation(
            sample_id=sample_id,
            query=ground_truth.query,
+            available_tools=available_tool_names,
            expected_tool=ground_truth.expected_tool,
            predicted_tool=first_predicted_tool,
            expected_parameters=ground_truth.expected_parameters,
@@ -714,6 +738,7 @@ class Evaluator:
        ground_truth: GroundTruth,
        response: ModelResponse,
        evaluator_results: list[EvaluatorResult],
+        tools: list[ToolDefinition] | None = None,
    ) -> SampleEvaluation:
        """Aggregate evaluator results into SampleEvaluation.
 
@@ -722,6 +747,7 @@ class Evaluator:
            ground_truth: Expected values
            response: Model response
            evaluator_results: Results from all evaluators
+            tools: List of available tools for this sample
 
        Returns:
            SampleEvaluation with aggregated metrics
@@ -746,10 +772,14 @@ class Evaluator:
        params_correct = metrics.get("parameter_accuracy", 0.0) == 1.0
        execution_valid = metrics.get("execution_valid", 0.0) == 1.0
 
+        # Extract tool names for available_tools field
+        available_tool_names = [t.name for t in tools] if tools else []
+
        # Return backwards-compatible SampleEvaluation
        return SampleEvaluation(
            sample_id=sample_id,
            query=ground_truth.query,
+            available_tools=available_tool_names,
            expected_tool=ground_truth.expected_tool,
            predicted_tool=predicted_tool,
            expected_parameters=ground_truth.expected_parameters,
@@ -780,13 +810,17 @@ class Evaluator:
        console.print("[bold blue]Running evaluation...[/bold blue]")
        evaluations = []
 
-        for idx, sample in tqdm(enumerate(samples), total=len(samples), desc="Evaluating"):
+        pbar = tqdm(enumerate(samples), total=len(samples), desc="Evaluating")
+        for idx, sample in pbar:
            eval_result = self.evaluate_sample(sample, idx)
            evaluations.append(eval_result)
 
            # Stream sample to reporters (for cloud real-time tracking)
            self.reporter.report_sample(eval_result)
 
+            # Force refresh for notebook compatibility
+            pbar.refresh()
+
        console.print("[bold green]Evaluation complete![/bold green]")
 
        # Compute metrics
@@ -804,7 +838,7 @@ class Evaluator:
            "evaluation_completed",
            {
                "backend": self.config.inference_config.backend,
-                "model_path": self.config.inference_config.model_path,
+                "model": self.config.inference_config.model,
                "has_adapter": self.config.inference_config.adapter_path is not None,
                "samples_evaluated": metrics.samples_evaluated,
                "samples_processed": metrics.samples_processed,
deepfabric/evaluation/evaluators/builtin/tool_calling.py CHANGED
@@ -63,14 +63,19 @@ class ToolCallingEvaluator(BaseEvaluator):
        # Compute metrics
        tool_correct = predicted_tool == ground_truth.expected_tool
 
-        # Validate parameters against the PREDICTED tool (not expected)
-        # This measures parameter extraction capability independently of tool selection
-        params_correct = compare_parameters(
-            ground_truth.expected_parameters,
-            predicted_params,
-            tool_name=predicted_tool,  # Use predicted tool for schema validation
-            tool_definitions=context.tools,
-        )
+        # Parameter accuracy requires a tool to have been called
+        # If no tool was predicted but one was expected, params cannot be correct
+        if predicted_tool is None and ground_truth.expected_tool is not None:
+            params_correct = False
+        else:
+            # Validate parameters against the PREDICTED tool (not expected)
+            # This measures parameter extraction capability independently of tool selection
+            params_correct = compare_parameters(
+                ground_truth.expected_parameters,
+                predicted_params,
+                tool_name=predicted_tool,  # Use predicted tool for schema validation
+                tool_definitions=context.tools,
+            )
 
        # Execution valid requires BOTH correct tool AND correct params
        execution_valid = tool_correct and params_correct
deepfabric/evaluation/inference.py CHANGED
@@ -1,9 +1,9 @@
 """Model inference interfaces and implementations for evaluation."""
 
 from abc import ABC, abstractmethod
-from typing import Literal
+from typing import Any, Literal
 
-from pydantic import BaseModel, Field
+from pydantic import BaseModel, ConfigDict, Field, field_serializer, model_validator
 
 from ..schemas import ToolDefinition
 
@@ -11,17 +11,40 @@ from ..schemas import ToolDefinition
 class InferenceConfig(BaseModel):
    """Configuration for model inference."""
 
-    model_path: str = Field(
-        description="Path to model (local path or HuggingFace Hub ID)",
+    model_config = ConfigDict(arbitrary_types_allowed=True)
+
+    model: str | Any = Field(
+        description="Model identifier (local path, HuggingFace Hub ID, or model name for cloud providers). "
+        "Can also be a pre-loaded model object to avoid reloading.",
+    )
+    tokenizer: Any | None = Field(
+        default=None,
+        description="Pre-loaded tokenizer object. Required when model is a pre-loaded model object.",
    )
    adapter_path: str | None = Field(
        default=None,
        description="Path to PEFT/LoRA adapter (if using adapter-based fine-tuning)",
    )
-    backend: Literal["transformers", "ollama"] = Field(
+    backend: Literal["transformers", "ollama", "llm"] = Field(
        default="transformers",
        description="Inference backend to use",
    )
+    provider: Literal["openai", "anthropic", "gemini", "openrouter"] | None = Field(
+        default=None,
+        description="Cloud LLM provider (required when backend='llm')",
+    )
+    api_key: str | None = Field(
+        default=None,
+        description="API key for the provider (falls back to environment variable if not set)",
+    )
+    base_url: str | None = Field(
+        default=None,
+        description="Custom base URL for the API (e.g., for OpenRouter or proxies)",
+    )
+    rate_limit_config: dict | None = Field(
+        default=None,
+        description="Rate limiting configuration overrides",
+    )
    use_unsloth: bool = Field(
        default=False,
        description="Use Unsloth for loading adapter (for adapters trained with Unsloth)",
@@ -62,6 +85,51 @@ class InferenceConfig(BaseModel):
        description="Batch size for inference",
    )
 
+    @field_serializer("model")
+    def serialize_model(self, value: str | Any) -> str:
+        """Serialize model field - convert objects to descriptive string."""
+        if isinstance(value, str):
+            return value
+        # For in-memory model objects, return a descriptive string
+        model_class = type(value).__name__
+        model_name = getattr(getattr(value, "config", None), "name_or_path", "unknown")
+        return f"<in-memory:{model_class}:{model_name}>"
+
+    @field_serializer("tokenizer")
+    def serialize_tokenizer(self, value: Any | None) -> str | None:
+        """Serialize tokenizer field - convert objects to descriptive string."""
+        if value is None:
+            return None
+        if isinstance(value, str):
+            return value
+        # For in-memory tokenizer objects, return a descriptive string
+        tokenizer_class = type(value).__name__
+        tokenizer_name = getattr(value, "name_or_path", "unknown")
+        return f"<in-memory:{tokenizer_class}:{tokenizer_name}>"
+
+    @model_validator(mode="after")
+    def validate_config(self) -> "InferenceConfig":
+        """Validate configuration consistency."""
+        # Ensure provider is set when using LLM backend
+        if self.backend == "llm" and self.provider is None:
+            msg = "provider must be specified when backend='llm'"
+            raise ValueError(msg)
+
+        # Check if model is a pre-loaded object (not a string path)
+        is_preloaded_model = not isinstance(self.model, str)
+
+        # If model is pre-loaded, tokenizer must also be provided
+        if is_preloaded_model and self.tokenizer is None:
+            msg = "tokenizer must be provided when using a pre-loaded model object"
+            raise ValueError(msg)
+
+        # Pre-loaded models only work with transformers backend
+        if is_preloaded_model and self.backend != "transformers":
+            msg = "pre-loaded model objects are only supported with backend='transformers'"
+            raise ValueError(msg)
+
+        return self
+
 
 class ModelResponse(BaseModel):
    """Model inference response."""
@@ -150,6 +218,10 @@ def create_inference_backend(config: InferenceConfig) -> InferenceBackend:
        from .backends.ollama_backend import OllamaBackend  # noqa: PLC0415
 
        return OllamaBackend(config)
+    if config.backend == "llm":
+        from .backends.llm_eval_backend import LLMEvalBackend  # noqa: PLC0415
+
+        return LLMEvalBackend(config)
 
    msg = f"Unsupported backend: {config.backend}"
    raise ValueError(msg)
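
As a rough usage sketch of the new fields (not taken from the package documentation): a cloud-hosted model is selected by setting backend="llm" together with a provider, and the validate_config validator above rejects the config if the provider is missing. The model name and environment variable below are placeholders.

import os

from deepfabric.evaluation.inference import InferenceConfig, create_inference_backend

config = InferenceConfig(
    model="gpt-4o-mini",                    # provider-side model name (placeholder)
    backend="llm",                          # new in 4.6.0 alongside transformers/ollama
    provider="openai",                      # required when backend="llm"
    api_key=os.getenv("OPENAI_API_KEY"),    # falls back to environment variables if omitted
)
backend = create_inference_backend(config)  # returns an LLMEvalBackend per the factory above

# Omitting provider with backend="llm" raises a validation error:
# InferenceConfig(model="gpt-4o-mini", backend="llm")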
deepfabric/evaluation/metrics.py CHANGED
@@ -107,6 +107,10 @@ class SampleEvaluation(BaseModel):
 
    sample_id: int = Field(description="Sample index")
    query: str = Field(description="Input query")
+    available_tools: list[str] = Field(
+        default_factory=list,
+        description="List of tool names available for this sample",
+    )
    expected_tool: str | None = Field(
        default=None,
        description="Expected tool name",
deepfabric/evaluation/parser.py CHANGED
@@ -49,12 +49,12 @@ class GroundTruth(BaseModel):
        default=None,
        description="Expected final answer if available",
    )
-    conversation_type: Literal["basic", "chain_of_thought"] = Field(
+    conversation_type: Literal["basic", "cot"] = Field(
        description="Type of conversation",
    )
    reasoning_style: Literal["freetext", "agent", "structured", "hybrid"] | None = Field(
        default=None,
-        description="Reasoning style if chain_of_thought",
+        description="Reasoning style if cot",
    )
    agent_mode: Literal["single_turn", "multi_turn"] | None = Field(
        default=None,
@@ -75,18 +75,18 @@ class GroundTruthParser:
 
    def __init__(
        self,
-        conversation_type: Literal["basic", "chain_of_thought"],
+        conversation_type: Literal["basic", "cot"],
        reasoning_style: Literal["freetext", "agent", "structured", "hybrid"] | None = None,
        agent_mode: Literal["single_turn", "multi_turn"] | None = None,
    ):
        """Initialize parser with conversation configuration.
 
        Args:
-            conversation_type: Type of conversation (basic, chain_of_thought)
-            reasoning_style: Reasoning style for chain_of_thought
+            conversation_type: Type of conversation (basic, cot)
+            reasoning_style: Reasoning style for cot
            agent_mode: Agent mode if tools are used
        """
-        self.conversation_type: Literal["basic", "chain_of_thought"] = conversation_type
+        self.conversation_type: Literal["basic", "cot"] = conversation_type
        self.reasoning_style: Literal["freetext", "agent", "structured", "hybrid"] | None = (
            reasoning_style
        )
@@ -270,7 +270,7 @@ class GroundTruthParser:
 
 def parse_batch(
    conversations: list[Conversation],
-    conversation_type: Literal["basic", "chain_of_thought"],
+    conversation_type: Literal["basic", "cot"],
    reasoning_style: Literal["freetext", "agent", "structured", "hybrid"] | None = None,
    agent_mode: Literal["single_turn", "multi_turn"] | None = None,
 ) -> list[GroundTruth]:
@@ -279,7 +279,7 @@ def parse_batch(
    Args:
        conversations: List of Conversation objects
        conversation_type: Type of conversation
-        reasoning_style: Reasoning style if chain_of_thought
+        reasoning_style: Reasoning style if cot
        agent_mode: Agent mode if tools are used
 
    Returns:
deepfabric/evaluation/reporters/cloud_reporter.py CHANGED
@@ -13,6 +13,7 @@ import httpx
 
 from rich.console import Console
 
+from ...utils import get_bool_env
 from .base import BaseReporter
 
 if TYPE_CHECKING:
@@ -45,7 +46,7 @@ class CloudReporter(BaseReporter):
 
        Args:
            config: Optional configuration with:
-                - api_url: DeepFabric API URL (default: https://api.deepfabric.dev")
+                - api_url: DeepFabric API URL (default: https://api.deepfabric.cloud")
                - project_id: Project ID to associate results with
                - auth_token: Authentication token (if not provided, will read from config file)
                - enabled: Whether to enable cloud reporting (default: True if authenticated)
@@ -53,7 +54,7 @@ class CloudReporter(BaseReporter):
        super().__init__(config)
 
        # Get API URL from config or environment
-        self.api_url = os.getenv("DEEPFABRIC_API_URL", "https://api.deepfabric.dev")
+        self.api_url = os.getenv("DEEPFABRIC_API_URL", "https://api.deepfabric.cloud")
        if config and "api_url" in config:
            self.api_url = config["api_url"]
 
@@ -67,8 +68,9 @@ class CloudReporter(BaseReporter):
        # Get project ID from config
        self.project_id = config.get("project_id") if config else None
 
-        # Enable cloud reporting if authenticated
-        self.enabled = (
+        # Enable cloud reporting if authenticated AND experimental flag is set
+        is_experimental = get_bool_env("EXPERIMENTAL_DF")
+        self.enabled = is_experimental and (
            config.get("enabled", bool(self.auth_token)) if config else bool(self.auth_token)
        )
 
@@ -99,11 +101,22 @@ class CloudReporter(BaseReporter):
        try:
            console.print("[cyan]Uploading evaluation results to cloud...[/cyan]")
 
+            # Get model name as string (handle in-memory model objects)
+            model_value = result.config.inference_config.model
+            if isinstance(model_value, str):
+                model_name = model_value
+            else:
+                # For in-memory model objects, extract name from config
+                model_config = getattr(model_value, "config", None)
+                model_name = (
+                    getattr(model_config, "name_or_path", None) or type(model_value).__name__
+                )
+
            # Create evaluation run
            run_data = {
-                "project_id": self.project_id,
+                "pipeline_id": self.project_id,
                "name": f"Evaluation - {datetime.now(UTC).strftime('%Y-%m-%d %H:%M')}",
-                "model_name": result.config.inference_config.model_path,
+                "model_name": model_name,
                "model_provider": result.config.inference_config.backend,
                "config": {
                    "evaluators": getattr(result.config, "evaluators", ["tool_calling"]),
deepfabric/exceptions.py CHANGED
@@ -65,3 +65,17 @@ class RetryExhaustedError(ModelError):
    """Raised when maximum retries are exceeded."""
 
    pass
+
+
+class LoaderError(DeepFabricError):
+    """Raised when dataset loading fails.
+
+    Common causes:
+    - File not found
+    - Invalid file format (malformed JSON/JSONL)
+    - Cloud authentication failure
+    - Network errors
+    - Empty dataset
+    """
+
+    pass
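
Because LoaderError subclasses DeepFabricError, existing broad handlers keep catching loading failures. A minimal sketch; the raise is a stand-in, since the real loading entry points live in the new deepfabric/loader.py, which is not shown in this diff.

from deepfabric.exceptions import DeepFabricError, LoaderError

try:
    raise LoaderError("dataset file not found: data.jsonl")  # stand-in for a failed load
except DeepFabricError as exc:
    print(f"dataset loading failed: {exc}")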
deepfabric/generator.py CHANGED
@@ -127,14 +127,14 @@ class DataSetGeneratorConfig(BaseModel):
    )
 
    # Modular conversation configuration
-    conversation_type: Literal["basic", "chain_of_thought"] = Field(
+    conversation_type: Literal["basic", "cot"] = Field(
        default="basic",
-        description="Base conversation type: basic (simple chat), chain_of_thought (with reasoning traces)",
+        description="Base conversation type: basic (simple chat), cot (with reasoning traces)",
    )
 
    reasoning_style: Literal["freetext", "agent", "structured", "hybrid"] | None = Field(
        default=None,
-        description="Reasoning style for chain_of_thought type: freetext (natural language) or agent (structured step-by-step for tool-calling). Note: 'structured' and 'hybrid' are deprecated.",
+        description="Reasoning style for cot type: freetext (natural language) or agent (structured step-by-step for tool-calling). Note: 'structured' and 'hybrid' are deprecated.",
    )
 
    @field_validator("reasoning_style", mode="before")
@@ -213,6 +213,10 @@ class DataSetGeneratorConfig(BaseModel):
        le=20,
        description="Minimum number of tool calls required before allowing early conversation conclusion",
    )
+    tool_inclusion_strategy: Literal["all", "used_only"] = Field(
+        default="used_only",
+        description="Which tools to include in each sample: 'all' includes full catalog, 'used_only' includes only tools actually called (recommended for training)",
+    )
 
 
 class DataSetGenerator:
@@ -1041,7 +1045,7 @@ class DataSetGenerator:
            return CONVERSATION_GENERATION_PROMPT
 
        # Handle chain of thought conversations
-        if self.config.conversation_type == "chain_of_thought":
+        if self.config.conversation_type == "cot":
            # Agent mode with tools - use agent prompts
            if self.config.agent_mode == "single_turn" and self.tool_registry:
                # Use agent prompt for single-turn tool calling
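
A rough sketch of the generator options touched by this release, written as a plain dict of config keys rather than a full DataSetGeneratorConfig, since the remaining required fields are outside this diff.

generator_options = {
    "conversation_type": "cot",              # renamed from "chain_of_thought"
    "reasoning_style": "agent",              # freetext or agent; structured/hybrid are deprecated
    "tool_inclusion_strategy": "used_only",  # new in 4.6.0: "all" or "used_only"
}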