DeepFabric 4.5.1-py3-none-any.whl → 4.6.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
deepfabric/config.py CHANGED
@@ -119,13 +119,13 @@ class TopicsConfig(BaseModel):
119
119
  class ConversationConfig(BaseModel):
120
120
  """Configuration for conversation structure in generation."""
121
121
 
122
- type: Literal["basic", "chain_of_thought"] = Field(
122
+ type: Literal["basic", "cot"] = Field(
123
123
  default="basic",
124
- description="Base conversation type: basic (simple chat), chain_of_thought (with reasoning)",
124
+ description="Base conversation type: basic (simple chat), cot (with reasoning)",
125
125
  )
126
126
  reasoning_style: Literal["freetext", "agent", "structured", "hybrid"] | None = Field(
127
127
  default=None,
128
- description="Reasoning style for chain_of_thought: freetext or agent. Note: 'structured' and 'hybrid' are deprecated.",
128
+ description="Reasoning style for cot: freetext or agent. Note: 'structured' and 'hybrid' are deprecated.",
129
129
  )
130
130
  agent_mode: Literal["single_turn", "multi_turn"] | None = Field(
131
131
  default=None,
@@ -159,15 +159,14 @@ class ConversationConfig(BaseModel):
159
159
  @model_validator(mode="after")
160
160
  def validate_configuration(self):
161
161
  """Validate that configuration combinations are consistent."""
162
- if self.reasoning_style is not None and self.type != "chain_of_thought":
162
+ if self.reasoning_style is not None and self.type != "cot":
163
163
  raise ValueError(
164
- f"reasoning_style can only be set when type='chain_of_thought', "
165
- f"got type='{self.type}'"
164
+ f"reasoning_style can only be set when type='cot', got type='{self.type}'"
166
165
  )
167
166
 
168
- if self.type == "chain_of_thought" and self.reasoning_style is None:
167
+ if self.type == "cot" and self.reasoning_style is None:
169
168
  raise ValueError(
170
- "reasoning_style must be specified when type='chain_of_thought'. "
169
+ "reasoning_style must be specified when type='cot'. "
171
170
  "Choose from: 'freetext' or 'agent'"
172
171
  )
173
172
 
@@ -346,16 +345,37 @@ class KaggleConfig(BaseModel):
346
345
  version_notes: str | None = Field(None, description="Version notes for dataset update")
347
346
 
348
347
 
348
+ class DeepFabricCloudConfig(BaseModel):
349
+ """Configuration for DeepFabric Cloud integration."""
350
+
351
+ graph: str | None = Field(
352
+ default=None,
353
+ description="DeepFabric Cloud graph handle (e.g., username/graph-name)",
354
+ )
355
+ dataset: str | None = Field(
356
+ default=None,
357
+ description="DeepFabric Cloud dataset handle (e.g., username/dataset-name)",
358
+ )
359
+ description: str | None = Field(
360
+ default=None,
361
+ description="Description for uploaded resources",
362
+ )
363
+ tags: list[str] = Field(
364
+ default_factory=list,
365
+ description="Tags for uploaded resources",
366
+ )
367
+
368
+
349
369
  class EvaluationConfig(BaseModel):
350
370
  """Configuration for model evaluation."""
351
371
 
352
- conversation_type: Literal["basic", "chain_of_thought"] = Field(
372
+ conversation_type: Literal["basic", "cot"] = Field(
353
373
  ...,
354
374
  description="Conversation type (must match dataset generation)",
355
375
  )
356
376
  reasoning_style: Literal["freetext", "agent", "structured", "hybrid"] | None = Field(
357
377
  default=None,
358
- description="Reasoning style for chain_of_thought type",
378
+ description="Reasoning style for cot type",
359
379
  )
360
380
 
361
381
  @field_validator("reasoning_style", mode="before")
@@ -419,15 +439,15 @@ class EvaluationConfig(BaseModel):
419
439
  @model_validator(mode="after")
420
440
  def validate_evaluation_config(self) -> "EvaluationConfig":
421
441
  """Validate evaluation configuration consistency."""
422
- if self.reasoning_style is not None and self.conversation_type != "chain_of_thought":
442
+ if self.reasoning_style is not None and self.conversation_type != "cot":
423
443
  raise ValueError(
424
- f"reasoning_style can only be set when conversation_type='chain_of_thought', "
444
+ f"reasoning_style can only be set when conversation_type='cot', "
425
445
  f"got conversation_type='{self.conversation_type}'"
426
446
  )
427
447
 
428
- if self.conversation_type == "chain_of_thought" and self.reasoning_style is None:
448
+ if self.conversation_type == "cot" and self.reasoning_style is None:
429
449
  raise ValueError(
430
- "reasoning_style must be specified when conversation_type='chain_of_thought'. "
450
+ "reasoning_style must be specified when conversation_type='cot'. "
431
451
  "Choose from: 'freetext' or 'agent'"
432
452
  )
433
453
 
@@ -457,6 +477,9 @@ class DeepFabricConfig(BaseModel):
457
477
  evaluation: EvaluationConfig | None = Field(None, description="Evaluation configuration")
458
478
  huggingface: HuggingFaceConfig | None = Field(None, description="Hugging Face configuration")
459
479
  kaggle: KaggleConfig | None = Field(None, description="Kaggle configuration")
480
+ deepfabric_cloud: DeepFabricCloudConfig | None = Field(
481
+ None, description="DeepFabric Cloud configuration"
482
+ )
460
483
 
461
484
  @classmethod
462
485
  def _detect_old_format(cls, config_dict: dict) -> bool:
@@ -663,6 +686,10 @@ See documentation for full examples.
663
686
  """Get Kaggle configuration."""
664
687
  return self.kaggle.model_dump() if self.kaggle else {}
665
688
 
689
+ def get_deepfabric_cloud_config(self) -> dict:
690
+ """Get DeepFabric Cloud configuration."""
691
+ return self.deepfabric_cloud.model_dump() if self.deepfabric_cloud else {}
692
+
666
693
  def get_configured_providers(self) -> set[str]:
667
694
  """Get the set of LLM providers configured in this config."""
668
695
  providers = set()
@@ -808,13 +835,13 @@ class DataEngineConfig(BaseModel):
808
835
  default=None,
809
836
  description="Rate limiting and retry configuration",
810
837
  )
811
- conversation_type: Literal["basic", "chain_of_thought"] = Field(
838
+ conversation_type: Literal["basic", "cot"] = Field(
812
839
  default="basic",
813
840
  description="Base conversation type",
814
841
  )
815
842
  reasoning_style: Literal["freetext", "agent", "structured", "hybrid"] | None = Field(
816
843
  default=None,
817
- description="Reasoning style for chain_of_thought type",
844
+ description="Reasoning style for cot type",
818
845
  )
819
846
 
820
847
  @field_validator("reasoning_style", mode="before")
@@ -839,15 +866,15 @@ class DataEngineConfig(BaseModel):
839
866
 
840
867
  @model_validator(mode="after")
841
868
  def validate_configuration(self):
842
- if self.reasoning_style is not None and self.conversation_type != "chain_of_thought":
869
+ if self.reasoning_style is not None and self.conversation_type != "cot":
843
870
  raise ValueError(
844
- f"reasoning_style can only be set when conversation_type='chain_of_thought', "
871
+ f"reasoning_style can only be set when conversation_type='cot', "
845
872
  f"got conversation_type='{self.conversation_type}'"
846
873
  )
847
874
 
848
- if self.conversation_type == "chain_of_thought" and self.reasoning_style is None:
875
+ if self.conversation_type == "cot" and self.reasoning_style is None:
849
876
  raise ValueError(
850
- "reasoning_style must be specified when conversation_type='chain_of_thought'. "
877
+ "reasoning_style must be specified when conversation_type='cot'. "
851
878
  "Choose from: 'freetext' or 'agent'"
852
879
  )
853
880
 
@@ -63,8 +63,8 @@ def load_config( # noqa: PLR0913
63
63
  output_save_as: Path to save dataset
64
64
  include_system_message: Include system message in dataset
65
65
  mode: Topic generation mode (tree or graph)
66
- conversation_type: Base conversation type (basic, chain_of_thought)
67
- reasoning_style: Reasoning style for chain_of_thought (freetext, agent)
66
+ conversation_type: Base conversation type (basic, cot)
67
+ reasoning_style: Reasoning style for cot (freetext, agent)
68
68
  agent_mode: Agent mode (single_turn, multi_turn)
69
69
 
70
70
  Returns:
deepfabric/dataset.py ADDED
@@ -0,0 +1,302 @@
1
+ """Native DeepFabric Dataset implementation.
2
+
3
+ This module provides a simple, maintainable Dataset class with no external
4
+ dependencies (beyond stdlib). It supports column-oriented access patterns
5
+ similar to HuggingFace datasets.
6
+ """
7
+
8
+ import json
9
+ import random
10
+
11
+ from collections.abc import Callable, Iterator
12
+ from typing import Any, overload
13
+
14
+
15
+ class Dataset:
16
+ """A simple, native dataset class that stores data as a list of dicts
17
+ with column-oriented access patterns.
18
+
19
+ Examples:
20
+ >>> ds = Dataset([{"text": "hello"}, {"text": "world"}])
21
+ >>> len(ds)
22
+ 2
23
+ >>> ds["text"]
24
+ ['hello', 'world']
25
+ >>> ds[0]
26
+ {'text': 'hello'}
27
+ >>> ds[0:1]
28
+ Dataset with 1 samples
29
+ """
30
+
31
+ def __init__(self, data: list[dict[str, Any]], metadata: dict | None = None):
32
+ """Initialize dataset from list of sample dicts.
33
+
34
+ Args:
35
+ data: List of sample dictionaries
36
+ metadata: Optional metadata (source, path, etc.)
37
+ """
38
+ self._data = data
39
+ self._metadata = metadata or {}
40
+ self._columns: list[str] | None = None
41
+
42
+ @property
43
+ def column_names(self) -> list[str]:
44
+ """Return list of column names."""
45
+ if self._columns is None:
46
+ if self._data:
47
+ # Collect all unique keys across samples
48
+ all_keys: set[str] = set()
49
+ for sample in self._data:
50
+ all_keys.update(sample.keys())
51
+ self._columns = sorted(all_keys)
52
+ else:
53
+ self._columns = []
54
+ return self._columns
55
+
56
+ @property
57
+ def num_rows(self) -> int:
58
+ """Return number of samples (alias for len)."""
59
+ return len(self._data)
60
+
61
+ def __len__(self) -> int:
62
+ """Return number of samples."""
63
+ return len(self._data)
64
+
65
+ @overload
66
+ def __getitem__(self, key: str) -> list[Any]: ...
67
+
68
+ @overload
69
+ def __getitem__(self, key: int) -> dict[str, Any]: ...
70
+
71
+ @overload
72
+ def __getitem__(self, key: slice) -> "Dataset": ...
73
+
74
+ def __getitem__(self, key: str | int | slice) -> Any:
75
+ """Access by column name, index, or slice.
76
+
77
+ Args:
78
+ key: Column name (str), row index (int), or slice
79
+
80
+ Returns:
81
+ - For str: list of values for that column
82
+ - For int: dict for that sample
83
+ - For slice: new Dataset with selected samples
84
+
85
+ Examples:
86
+ >>> ds["messages"] # Get column as list
87
+ >>> ds[0] # Get first sample as dict
88
+ >>> ds[0:10] # Get first 10 samples as new Dataset
89
+ """
90
+ if isinstance(key, str):
91
+ # Column access - return list of values
92
+ return [sample.get(key) for sample in self._data]
93
+ if isinstance(key, int):
94
+ # Single sample access
95
+ if key < 0:
96
+ key = len(self._data) + key
97
+ if key < 0 or key >= len(self._data):
98
+ raise IndexError(
99
+ f"Index {key} out of range for dataset with {len(self._data)} samples"
100
+ )
101
+ return self._data[key]
102
+ if isinstance(key, slice):
103
+ # Slice access - return new Dataset
104
+ return Dataset(self._data[key], self._metadata.copy())
105
+ raise TypeError(f"Invalid key type: {type(key)}. Expected str, int, or slice.")
106
+
107
+ def __iter__(self) -> Iterator[dict[str, Any]]:
108
+ """Iterate over samples."""
109
+ return iter(self._data)
110
+
111
+ def __repr__(self) -> str:
112
+ """Return string representation."""
113
+ cols = ", ".join(self.column_names[:5])
114
+ if len(self.column_names) > 5: # noqa: PLR2004
115
+ cols += ", ..."
116
+ return f"Dataset(num_rows={len(self)}, columns=[{cols}])"
117
+
118
+ def split(
119
+ self,
120
+ test_size: float = 0.1,
121
+ seed: int | None = None,
122
+ ) -> dict[str, "Dataset"]:
123
+ """Split dataset into train and test sets.
124
+
125
+ Args:
126
+ test_size: Fraction of data for test set (0.0 to 1.0)
127
+ seed: Random seed for reproducibility
128
+
129
+ Returns:
130
+ Dict with "train" and "test" Dataset instances
131
+
132
+ Examples:
133
+ >>> splits = ds.split(test_size=0.1, seed=42)
134
+ >>> train_ds = splits["train"]
135
+ >>> test_ds = splits["test"]
136
+ """
137
+ if not 0.0 < test_size < 1.0:
138
+ raise ValueError("test_size must be between 0.0 and 1.0 (exclusive)")
139
+
140
+ # Use a local Random instance to avoid affecting global state
141
+ rng = random.Random(seed) # noqa: S311 # nosec
142
+
143
+ # Create shuffled indices
144
+ indices = list(range(len(self._data)))
145
+ rng.shuffle(indices)
146
+
147
+ # Calculate split point
148
+ split_idx = int(len(indices) * (1 - test_size))
149
+
150
+ train_indices = indices[:split_idx]
151
+ test_indices = indices[split_idx:]
152
+
153
+ return {
154
+ "train": self.select(train_indices),
155
+ "test": self.select(test_indices),
156
+ }
157
+
158
+ def select(self, indices: list[int]) -> "Dataset":
159
+ """Select samples by indices.
160
+
161
+ Args:
162
+ indices: List of integer indices to select
163
+
164
+ Returns:
165
+ New Dataset with selected samples
166
+ """
167
+ return Dataset([self._data[i] for i in indices], self._metadata.copy())
168
+
169
+ def shuffle(self, seed: int | None = None) -> "Dataset":
170
+ """Return a shuffled copy of the dataset.
171
+
172
+ Args:
173
+ seed: Random seed for reproducibility
174
+
175
+ Returns:
176
+ New Dataset with shuffled samples
177
+ """
178
+ rng = random.Random(seed) # nosec # noqa: S311
179
+ indices = list(range(len(self._data)))
180
+ rng.shuffle(indices)
181
+ return self.select(indices)
182
+
183
+ def map(self, fn: Callable[[dict[str, Any]], dict[str, Any]]) -> "Dataset":
184
+ """Apply function to each sample.
185
+
186
+ Args:
187
+ fn: Function that takes a sample dict and returns a new sample dict
188
+
189
+ Returns:
190
+ New Dataset with transformed samples
191
+
192
+ Examples:
193
+ >>> ds.map(lambda x: {"text": x["text"].upper()})
194
+ """
195
+ return Dataset([fn(sample) for sample in self._data], self._metadata.copy())
196
+
197
+ def filter(self, fn: Callable[[dict[str, Any]], bool]) -> "Dataset":
198
+ """Filter samples by predicate function.
199
+
200
+ Args:
201
+ fn: Function that takes a sample dict and returns True to keep
202
+
203
+ Returns:
204
+ New Dataset with filtered samples
205
+
206
+ Examples:
207
+ >>> ds.filter(lambda x: len(x["text"]) > 10)
208
+ """
209
+ return Dataset([s for s in self._data if fn(s)], self._metadata.copy())
210
+
211
+ def to_list(self) -> list[dict[str, Any]]:
212
+ """Return data as list of dicts.
213
+
214
+ Returns:
215
+ Copy of internal data as list of dictionaries
216
+ """
217
+ return self._data.copy()
218
+
219
+ def to_hf(self) -> Any:
220
+ """Convert to HuggingFace Dataset for use with TRL/transformers.
221
+
222
+ Returns:
223
+ A HuggingFace datasets.Dataset instance
224
+
225
+ Raises:
226
+ ImportError: If the 'datasets' package is not installed
227
+
228
+ Examples:
229
+ >>> from deepfabric import load_dataset
230
+ >>> ds = load_dataset("data.jsonl")
231
+ >>> hf_ds = ds.to_hf()
232
+ >>> trainer = SFTTrainer(train_dataset=hf_ds, ...)
233
+ """
234
+ try:
235
+ from datasets import Dataset as HFDataset # noqa: PLC0415
236
+ except ImportError:
237
+ raise ImportError(
238
+ "The 'datasets' package is required for to_hf(). "
239
+ "Install it with: pip install datasets"
240
+ ) from None
241
+
242
+ return HFDataset.from_list(self._data)
243
+
244
+ def to_jsonl(self, path: str) -> None:
245
+ """Save dataset to JSONL file.
246
+
247
+ Args:
248
+ path: File path to save to
249
+ """
250
+ with open(path, "w", encoding="utf-8") as f:
251
+ for sample in self._data:
252
+ f.write(json.dumps(sample, ensure_ascii=False) + "\n")
253
+
254
+ @classmethod
255
+ def from_jsonl(cls, path: str) -> "Dataset":
256
+ """Load dataset from JSONL file.
257
+
258
+ Args:
259
+ path: File path to load from
260
+
261
+ Returns:
262
+ New Dataset loaded from file
263
+ """
264
+ data = []
265
+ with open(path, encoding="utf-8") as f:
266
+ for line in f:
267
+ if line.strip():
268
+ data.append(json.loads(line))
269
+ return cls(data, metadata={"source": "jsonl", "path": path})
270
+
271
+ @classmethod
272
+ def from_list(cls, data: list[dict[str, Any]]) -> "Dataset":
273
+ """Create dataset from list of dicts.
274
+
275
+ Args:
276
+ data: List of sample dictionaries
277
+
278
+ Returns:
279
+ New Dataset from the provided data
280
+ """
281
+ return cls(data)
282
+
283
+
284
+ class DatasetDict(dict):
285
+ """Dictionary of Dataset objects for train/test/validation splits.
286
+
287
+ A simple dict subclass that provides typed access to Dataset values.
288
+
289
+ Examples:
290
+ >>> dd = DatasetDict({"train": train_ds, "test": test_ds})
291
+ >>> dd["train"]
292
+ Dataset(num_rows=100, columns=[...])
293
+ """
294
+
295
+ def __getitem__(self, key: str) -> Dataset:
296
+ """Get Dataset by split name."""
297
+ return super().__getitem__(key)
298
+
299
+ def __repr__(self) -> str:
300
+ """Return string representation."""
301
+ splits = ", ".join(f"{k}: {len(v)} rows" for k, v in self.items())
302
+ return f"DatasetDict({{{splits}}})"
@@ -49,12 +49,12 @@ class GroundTruth(BaseModel):
49
49
  default=None,
50
50
  description="Expected final answer if available",
51
51
  )
52
- conversation_type: Literal["basic", "chain_of_thought"] = Field(
52
+ conversation_type: Literal["basic", "cot"] = Field(
53
53
  description="Type of conversation",
54
54
  )
55
55
  reasoning_style: Literal["freetext", "agent", "structured", "hybrid"] | None = Field(
56
56
  default=None,
57
- description="Reasoning style if chain_of_thought",
57
+ description="Reasoning style if cot",
58
58
  )
59
59
  agent_mode: Literal["single_turn", "multi_turn"] | None = Field(
60
60
  default=None,
@@ -75,18 +75,18 @@ class GroundTruthParser:
75
75
 
76
76
  def __init__(
77
77
  self,
78
- conversation_type: Literal["basic", "chain_of_thought"],
78
+ conversation_type: Literal["basic", "cot"],
79
79
  reasoning_style: Literal["freetext", "agent", "structured", "hybrid"] | None = None,
80
80
  agent_mode: Literal["single_turn", "multi_turn"] | None = None,
81
81
  ):
82
82
  """Initialize parser with conversation configuration.
83
83
 
84
84
  Args:
85
- conversation_type: Type of conversation (basic, chain_of_thought)
86
- reasoning_style: Reasoning style for chain_of_thought
85
+ conversation_type: Type of conversation (basic, cot)
86
+ reasoning_style: Reasoning style for cot
87
87
  agent_mode: Agent mode if tools are used
88
88
  """
89
- self.conversation_type: Literal["basic", "chain_of_thought"] = conversation_type
89
+ self.conversation_type: Literal["basic", "cot"] = conversation_type
90
90
  self.reasoning_style: Literal["freetext", "agent", "structured", "hybrid"] | None = (
91
91
  reasoning_style
92
92
  )
@@ -270,7 +270,7 @@ class GroundTruthParser:
270
270
 
271
271
  def parse_batch(
272
272
  conversations: list[Conversation],
273
- conversation_type: Literal["basic", "chain_of_thought"],
273
+ conversation_type: Literal["basic", "cot"],
274
274
  reasoning_style: Literal["freetext", "agent", "structured", "hybrid"] | None = None,
275
275
  agent_mode: Literal["single_turn", "multi_turn"] | None = None,
276
276
  ) -> list[GroundTruth]:
@@ -279,7 +279,7 @@ def parse_batch(
279
279
  Args:
280
280
  conversations: List of Conversation objects
281
281
  conversation_type: Type of conversation
282
- reasoning_style: Reasoning style if chain_of_thought
282
+ reasoning_style: Reasoning style if cot
283
283
  agent_mode: Agent mode if tools are used
284
284
 
285
285
  Returns:
@@ -13,6 +13,7 @@ import httpx
13
13
 
14
14
  from rich.console import Console
15
15
 
16
+ from ...utils import get_bool_env
16
17
  from .base import BaseReporter
17
18
 
18
19
  if TYPE_CHECKING:
@@ -45,7 +46,7 @@ class CloudReporter(BaseReporter):
45
46
 
46
47
  Args:
47
48
  config: Optional configuration with:
48
- - api_url: DeepFabric API URL (default: https://api.deepfabric.dev")
49
+ - api_url: DeepFabric API URL (default: https://api.deepfabric.cloud")
49
50
  - project_id: Project ID to associate results with
50
51
  - auth_token: Authentication token (if not provided, will read from config file)
51
52
  - enabled: Whether to enable cloud reporting (default: True if authenticated)
@@ -53,7 +54,7 @@ class CloudReporter(BaseReporter):
53
54
  super().__init__(config)
54
55
 
55
56
  # Get API URL from config or environment
56
- self.api_url = os.getenv("DEEPFABRIC_API_URL", "https://api.deepfabric.dev")
57
+ self.api_url = os.getenv("DEEPFABRIC_API_URL", "https://api.deepfabric.cloud")
57
58
  if config and "api_url" in config:
58
59
  self.api_url = config["api_url"]
59
60
 
@@ -67,8 +68,9 @@ class CloudReporter(BaseReporter):
67
68
  # Get project ID from config
68
69
  self.project_id = config.get("project_id") if config else None
69
70
 
70
- # Enable cloud reporting if authenticated
71
- self.enabled = (
71
+ # Enable cloud reporting if authenticated AND experimental flag is set
72
+ is_experimental = get_bool_env("EXPERIMENTAL_DF")
73
+ self.enabled = is_experimental and (
72
74
  config.get("enabled", bool(self.auth_token)) if config else bool(self.auth_token)
73
75
  )
74
76
 
@@ -99,11 +101,22 @@ class CloudReporter(BaseReporter):
99
101
  try:
100
102
  console.print("[cyan]Uploading evaluation results to cloud...[/cyan]")
101
103
 
104
+ # Get model name as string (handle in-memory model objects)
105
+ model_value = result.config.inference_config.model
106
+ if isinstance(model_value, str):
107
+ model_name = model_value
108
+ else:
109
+ # For in-memory model objects, extract name from config
110
+ model_config = getattr(model_value, "config", None)
111
+ model_name = (
112
+ getattr(model_config, "name_or_path", None) or type(model_value).__name__
113
+ )
114
+
102
115
  # Create evaluation run
103
116
  run_data = {
104
- "project_id": self.project_id,
117
+ "pipeline_id": self.project_id,
105
118
  "name": f"Evaluation - {datetime.now(UTC).strftime('%Y-%m-%d %H:%M')}",
106
- "model_name": result.config.inference_config.model,
119
+ "model_name": model_name,
107
120
  "model_provider": result.config.inference_config.backend,
108
121
  "config": {
109
122
  "evaluators": getattr(result.config, "evaluators", ["tool_calling"]),
deepfabric/exceptions.py CHANGED
@@ -65,3 +65,17 @@ class RetryExhaustedError(ModelError):
65
65
  """Raised when maximum retries are exceeded."""
66
66
 
67
67
  pass
68
+
69
+
70
+ class LoaderError(DeepFabricError):
71
+ """Raised when dataset loading fails.
72
+
73
+ Common causes:
74
+ - File not found
75
+ - Invalid file format (malformed JSON/JSONL)
76
+ - Cloud authentication failure
77
+ - Network errors
78
+ - Empty dataset
79
+ """
80
+
81
+ pass
deepfabric/generator.py CHANGED
@@ -127,14 +127,14 @@ class DataSetGeneratorConfig(BaseModel):
127
127
  )
128
128
 
129
129
  # Modular conversation configuration
130
- conversation_type: Literal["basic", "chain_of_thought"] = Field(
130
+ conversation_type: Literal["basic", "cot"] = Field(
131
131
  default="basic",
132
- description="Base conversation type: basic (simple chat), chain_of_thought (with reasoning traces)",
132
+ description="Base conversation type: basic (simple chat), cot (with reasoning traces)",
133
133
  )
134
134
 
135
135
  reasoning_style: Literal["freetext", "agent", "structured", "hybrid"] | None = Field(
136
136
  default=None,
137
- description="Reasoning style for chain_of_thought type: freetext (natural language) or agent (structured step-by-step for tool-calling). Note: 'structured' and 'hybrid' are deprecated.",
137
+ description="Reasoning style for cot type: freetext (natural language) or agent (structured step-by-step for tool-calling). Note: 'structured' and 'hybrid' are deprecated.",
138
138
  )
139
139
 
140
140
  @field_validator("reasoning_style", mode="before")
@@ -1045,7 +1045,7 @@ class DataSetGenerator:
1045
1045
  return CONVERSATION_GENERATION_PROMPT
1046
1046
 
1047
1047
  # Handle chain of thought conversations
1048
- if self.config.conversation_type == "chain_of_thought":
1048
+ if self.config.conversation_type == "cot":
1049
1049
  # Agent mode with tools - use agent prompts
1050
1050
  if self.config.agent_mode == "single_turn" and self.tool_registry:
1051
1051
  # Use agent prompt for single-turn tool calling