DeepFabric 4.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepfabric/__init__.py +70 -0
- deepfabric/__main__.py +6 -0
- deepfabric/auth.py +382 -0
- deepfabric/builders.py +303 -0
- deepfabric/builders_agent.py +1304 -0
- deepfabric/cli.py +1288 -0
- deepfabric/config.py +899 -0
- deepfabric/config_manager.py +251 -0
- deepfabric/constants.py +94 -0
- deepfabric/dataset_manager.py +534 -0
- deepfabric/error_codes.py +581 -0
- deepfabric/evaluation/__init__.py +47 -0
- deepfabric/evaluation/backends/__init__.py +32 -0
- deepfabric/evaluation/backends/ollama_backend.py +137 -0
- deepfabric/evaluation/backends/tool_call_parsers.py +409 -0
- deepfabric/evaluation/backends/transformers_backend.py +326 -0
- deepfabric/evaluation/evaluator.py +845 -0
- deepfabric/evaluation/evaluators/__init__.py +13 -0
- deepfabric/evaluation/evaluators/base.py +104 -0
- deepfabric/evaluation/evaluators/builtin/__init__.py +5 -0
- deepfabric/evaluation/evaluators/builtin/tool_calling.py +93 -0
- deepfabric/evaluation/evaluators/registry.py +66 -0
- deepfabric/evaluation/inference.py +155 -0
- deepfabric/evaluation/metrics.py +397 -0
- deepfabric/evaluation/parser.py +304 -0
- deepfabric/evaluation/reporters/__init__.py +13 -0
- deepfabric/evaluation/reporters/base.py +56 -0
- deepfabric/evaluation/reporters/cloud_reporter.py +195 -0
- deepfabric/evaluation/reporters/file_reporter.py +61 -0
- deepfabric/evaluation/reporters/multi_reporter.py +56 -0
- deepfabric/exceptions.py +67 -0
- deepfabric/factory.py +26 -0
- deepfabric/generator.py +1084 -0
- deepfabric/graph.py +545 -0
- deepfabric/hf_hub.py +214 -0
- deepfabric/kaggle_hub.py +219 -0
- deepfabric/llm/__init__.py +41 -0
- deepfabric/llm/api_key_verifier.py +534 -0
- deepfabric/llm/client.py +1206 -0
- deepfabric/llm/errors.py +105 -0
- deepfabric/llm/rate_limit_config.py +262 -0
- deepfabric/llm/rate_limit_detector.py +278 -0
- deepfabric/llm/retry_handler.py +270 -0
- deepfabric/metrics.py +212 -0
- deepfabric/progress.py +262 -0
- deepfabric/prompts.py +290 -0
- deepfabric/schemas.py +1000 -0
- deepfabric/spin/__init__.py +6 -0
- deepfabric/spin/client.py +263 -0
- deepfabric/spin/models.py +26 -0
- deepfabric/stream_simulator.py +90 -0
- deepfabric/tools/__init__.py +5 -0
- deepfabric/tools/defaults.py +85 -0
- deepfabric/tools/loader.py +87 -0
- deepfabric/tools/mcp_client.py +677 -0
- deepfabric/topic_manager.py +303 -0
- deepfabric/topic_model.py +20 -0
- deepfabric/training/__init__.py +35 -0
- deepfabric/training/api_key_prompt.py +302 -0
- deepfabric/training/callback.py +363 -0
- deepfabric/training/metrics_sender.py +301 -0
- deepfabric/tree.py +438 -0
- deepfabric/tui.py +1267 -0
- deepfabric/update_checker.py +166 -0
- deepfabric/utils.py +150 -0
- deepfabric/validation.py +143 -0
- deepfabric-4.4.0.dist-info/METADATA +702 -0
- deepfabric-4.4.0.dist-info/RECORD +71 -0
- deepfabric-4.4.0.dist-info/WHEEL +4 -0
- deepfabric-4.4.0.dist-info/entry_points.txt +2 -0
- deepfabric-4.4.0.dist-info/licenses/LICENSE +201 -0
|
@@ -0,0 +1,251 @@
|
|
|
1
|
+
import yaml
|
|
2
|
+
|
|
3
|
+
from pydantic import ValidationError
|
|
4
|
+
|
|
5
|
+
from .config import DeepFabricConfig
|
|
6
|
+
from .constants import (
|
|
7
|
+
DEFAULT_MAX_RETRIES,
|
|
8
|
+
DEFAULT_MODEL,
|
|
9
|
+
DEFAULT_PROVIDER,
|
|
10
|
+
ENGINE_DEFAULT_BATCH_SIZE,
|
|
11
|
+
ENGINE_DEFAULT_NUM_EXAMPLES,
|
|
12
|
+
ENGINE_DEFAULT_TEMPERATURE,
|
|
13
|
+
TOPIC_GRAPH_DEFAULT_DEGREE,
|
|
14
|
+
TOPIC_GRAPH_DEFAULT_DEPTH,
|
|
15
|
+
TOPIC_GRAPH_DEFAULT_TEMPERATURE,
|
|
16
|
+
TOPIC_TREE_DEFAULT_DEGREE,
|
|
17
|
+
TOPIC_TREE_DEFAULT_DEPTH,
|
|
18
|
+
TOPIC_TREE_DEFAULT_TEMPERATURE,
|
|
19
|
+
)
|
|
20
|
+
from .exceptions import ConfigurationError
|
|
21
|
+
from .tui import get_tui
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def load_config(  # noqa: PLR0913
    config_file: str | None,
    topic_prompt: str | None = None,
    topics_system_prompt: str | None = None,
    generation_system_prompt: str | None = None,
    output_system_prompt: str | None = None,
    provider: str | None = None,
    model: str | None = None,
    temperature: float | None = None,
    degree: int | None = None,
    depth: int | None = None,
    num_samples: int | None = None,
    batch_size: int | None = None,
    topics_save_as: str | None = None,
    output_save_as: str | None = None,
    include_system_message: bool | None = None,
    mode: str = "tree",
    # Modular conversation configuration
    conversation_type: str | None = None,
    reasoning_style: str | None = None,
    agent_mode: str | None = None,
) -> DeepFabricConfig:
    """
    Load configuration from YAML file or create minimal config from CLI arguments.

    Args:
        config_file: Path to YAML configuration file
        topic_prompt: Starting topic/seed for topic generation
        topics_system_prompt: System prompt for topic generation
        generation_system_prompt: System prompt for dataset content generation
        output_system_prompt: System prompt for final dataset output
        provider: LLM provider
        model: Model name
        temperature: Temperature setting (0.0 is a valid, honored value)
        degree: Branching factor
        depth: Depth of tree/graph
        num_samples: Number of samples to generate
        batch_size: Batch size for generation
        topics_save_as: Path to save topics
        output_save_as: Path to save dataset
        include_system_message: Include system message in dataset
        mode: Topic generation mode (tree or graph)
        conversation_type: Base conversation type (basic, chain_of_thought)
        reasoning_style: Reasoning style for chain_of_thought (freetext, agent)
        agent_mode: Agent mode (single_turn, multi_turn)

    Returns:
        DeepFabricConfig object

    Raises:
        ConfigurationError: If config file is invalid or required parameters are missing
    """
    if config_file:
        try:
            config = DeepFabricConfig.from_yaml(config_file)
        except FileNotFoundError as e:
            raise ConfigurationError(f"Config file not found: {config_file}") from e
        except yaml.YAMLError as e:
            raise ConfigurationError(f"Invalid YAML in config file: {str(e)}") from e
        except Exception as e:
            raise ConfigurationError(f"Error loading config file: {str(e)}") from e
        else:
            return config

    # No config file provided - create minimal configuration from CLI args
    if not topic_prompt:
        raise ConfigurationError("--topic-prompt is required when no config file is provided")

    tui = get_tui()
    tui.info("No config file provided - using CLI parameters")

    # Create minimal config dict with new structure
    default_prompt = generation_system_prompt or "You are a helpful AI assistant."

    # Build conversation config
    conversation_config: dict = {"type": conversation_type or "basic"}
    if reasoning_style:
        conversation_config["reasoning_style"] = reasoning_style
    if agent_mode:
        conversation_config["agent_mode"] = agent_mode

    is_graph = mode == "graph"

    # Resolve numeric parameters with explicit `is not None` checks rather than
    # truthiness. Using `x or default` silently replaced valid zero values with
    # the default — most importantly temperature=0.0, which is a legitimate
    # setting for deterministic generation.
    topics_temperature = (
        temperature
        if temperature is not None
        else (TOPIC_GRAPH_DEFAULT_TEMPERATURE if is_graph else TOPIC_TREE_DEFAULT_TEMPERATURE)
    )
    generation_temperature = (
        temperature if temperature is not None else ENGINE_DEFAULT_TEMPERATURE
    )
    final_depth = (
        depth
        if depth is not None
        else (TOPIC_GRAPH_DEFAULT_DEPTH if is_graph else TOPIC_TREE_DEFAULT_DEPTH)
    )
    final_degree = (
        degree
        if degree is not None
        else (TOPIC_GRAPH_DEFAULT_DEGREE if is_graph else TOPIC_TREE_DEFAULT_DEGREE)
    )

    minimal_config = {
        "topics": {
            "prompt": topic_prompt,
            "mode": mode,
            "system_prompt": topics_system_prompt or "",
            "depth": final_depth,
            "degree": final_degree,
            "save_as": topics_save_as
            or ("topic_graph.json" if is_graph else "topic_tree.jsonl"),
            "llm": {
                "provider": provider or DEFAULT_PROVIDER,
                "model": model or DEFAULT_MODEL,
                "temperature": topics_temperature,
            },
        },
        "generation": {
            "system_prompt": default_prompt,
            "instructions": "Generate diverse and educational examples",
            "conversation": conversation_config,
            "max_retries": DEFAULT_MAX_RETRIES,
            "llm": {
                "provider": provider or DEFAULT_PROVIDER,
                "model": model or DEFAULT_MODEL,
                "temperature": generation_temperature,
            },
        },
        "output": {
            "system_prompt": output_system_prompt,
            "include_system_message": include_system_message
            if include_system_message is not None
            else True,
            "num_samples": num_samples if num_samples is not None else ENGINE_DEFAULT_NUM_EXAMPLES,
            "batch_size": batch_size if batch_size is not None else ENGINE_DEFAULT_BATCH_SIZE,
            "save_as": output_save_as or "dataset.jsonl",
        },
    }

    try:
        return DeepFabricConfig.model_validate(minimal_config)
    except ValidationError as e:
        raise ConfigurationError(f"Invalid configuration: {str(e)}") from e
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
def apply_cli_overrides(
|
|
156
|
+
output_system_prompt: str | None = None,
|
|
157
|
+
topic_prompt: str | None = None,
|
|
158
|
+
topics_system_prompt: str | None = None,
|
|
159
|
+
generation_system_prompt: str | None = None,
|
|
160
|
+
provider: str | None = None,
|
|
161
|
+
model: str | None = None,
|
|
162
|
+
temperature: float | None = None,
|
|
163
|
+
degree: int | None = None,
|
|
164
|
+
depth: int | None = None,
|
|
165
|
+
base_url: str | None = None,
|
|
166
|
+
) -> tuple[dict, dict]:
|
|
167
|
+
"""
|
|
168
|
+
Build override dictionaries from CLI parameters.
|
|
169
|
+
|
|
170
|
+
Args:
|
|
171
|
+
output_system_prompt: Override for output system prompt
|
|
172
|
+
topic_prompt: Override for topic prompt
|
|
173
|
+
topics_system_prompt: Override for topics system prompt
|
|
174
|
+
generation_system_prompt: Override for generation system prompt
|
|
175
|
+
provider: Override for LLM provider
|
|
176
|
+
model: Override for model name
|
|
177
|
+
temperature: Override for temperature
|
|
178
|
+
degree: Override for branching factor
|
|
179
|
+
depth: Override for depth
|
|
180
|
+
base_url: Override for base URL
|
|
181
|
+
|
|
182
|
+
Returns:
|
|
183
|
+
Tuple of (topics_overrides, generation_overrides) dictionaries
|
|
184
|
+
"""
|
|
185
|
+
# Prepare topics overrides
|
|
186
|
+
topics_overrides = {}
|
|
187
|
+
if topic_prompt:
|
|
188
|
+
topics_overrides["topic_prompt"] = topic_prompt
|
|
189
|
+
if topics_system_prompt:
|
|
190
|
+
topics_overrides["topic_system_prompt"] = topics_system_prompt
|
|
191
|
+
if provider:
|
|
192
|
+
topics_overrides["provider"] = provider
|
|
193
|
+
if model:
|
|
194
|
+
topics_overrides["model"] = model
|
|
195
|
+
if temperature:
|
|
196
|
+
topics_overrides["temperature"] = temperature
|
|
197
|
+
if degree:
|
|
198
|
+
topics_overrides["degree"] = degree
|
|
199
|
+
if depth:
|
|
200
|
+
topics_overrides["depth"] = depth
|
|
201
|
+
if base_url:
|
|
202
|
+
topics_overrides["base_url"] = base_url
|
|
203
|
+
|
|
204
|
+
# Prepare generation overrides
|
|
205
|
+
generation_overrides = {}
|
|
206
|
+
if generation_system_prompt:
|
|
207
|
+
generation_overrides["generation_system_prompt"] = generation_system_prompt
|
|
208
|
+
if output_system_prompt:
|
|
209
|
+
generation_overrides["dataset_system_prompt"] = output_system_prompt
|
|
210
|
+
if provider:
|
|
211
|
+
generation_overrides["provider"] = provider
|
|
212
|
+
if model:
|
|
213
|
+
generation_overrides["model"] = model
|
|
214
|
+
if temperature:
|
|
215
|
+
generation_overrides["temperature"] = temperature
|
|
216
|
+
if base_url:
|
|
217
|
+
generation_overrides["base_url"] = base_url
|
|
218
|
+
|
|
219
|
+
return topics_overrides, generation_overrides
|
|
220
|
+
|
|
221
|
+
|
|
222
|
+
def get_final_parameters(
    config: DeepFabricConfig,
    num_samples: int | None = None,
    batch_size: int | None = None,
    depth: int | None = None,
    degree: int | None = None,
) -> tuple[int, int, int, int]:
    """
    Resolve the effective generation parameters.

    Each CLI value takes precedence when provided (truthy); otherwise the
    corresponding value from the loaded configuration is used.

    Args:
        config: DeepFabricConfig object supplying the fallback values
        num_samples: CLI override for num_samples
        batch_size: CLI override for batch_size
        depth: CLI override for depth
        degree: CLI override for degree

    Returns:
        Tuple of (num_samples, batch_size, depth, degree)
    """
    out_cfg = config.get_output_config()

    # num_samples/batch_size come from the output section; depth/degree
    # come from the topics section of the config.
    return (
        num_samples or out_cfg["num_samples"],
        batch_size or out_cfg["batch_size"],
        depth or config.topics.depth,
        degree or config.topics.degree,
    )
|
deepfabric/constants.py
ADDED
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
"""Shared constant values used across the DeepFabric package."""

# Default values
DEFAULT_PROVIDER = "openai"
DEFAULT_MODEL = "gpt-4o"
DEFAULT_TEMPERATURE = 0.7
DEFAULT_DEGREE = 3
DEFAULT_DEPTH = 2
DEFAULT_MAX_RETRIES = 3
DEFAULT_BATCH_SIZE = 5
DEFAULT_NUM_EXAMPLES = 3
DEFAULT_REQUEST_TIMEOUT = 30  # seconds
DEFAULT_MAX_TOKENS = 1000

# Engine defaults (dataset generation engine uses a lower temperature
# than the general default for more consistent structured output)
ENGINE_DEFAULT_TEMPERATURE = 0.2
ENGINE_DEFAULT_BATCH_SIZE = 5
ENGINE_DEFAULT_NUM_EXAMPLES = 3

# Topic tree defaults
TOPIC_TREE_DEFAULT_DEGREE = 10  # branching factor per node
TOPIC_TREE_DEFAULT_DEPTH = 3
TOPIC_TREE_DEFAULT_TEMPERATURE = 0.2
TOPIC_TREE_DEFAULT_MODEL = "gpt-4o"

# Topic graph defaults
TOPIC_GRAPH_DEFAULT_DEGREE = 10
TOPIC_GRAPH_DEFAULT_DEPTH = 3
TOPIC_GRAPH_SUMMARY = 20
TOPIC_GRAPH_DEFAULT_MODEL = "gpt-4o"
TOPIC_GRAPH_DEFAULT_TEMPERATURE = 0.7

# File extensions and patterns
JSONL_EXTENSION = ".jsonl"
YAML_EXTENSIONS = (".yaml", ".yml")

# Message roles (chat-style message schema)
ROLE_SYSTEM = "system"
ROLE_USER = "user"
ROLE_ASSISTANT = "assistant"
VALID_ROLES = [ROLE_SYSTEM, ROLE_USER, ROLE_ASSISTANT]

# Placeholders
# NOTE(review): quadruple braces render as literal "{{name}}" after one pass
# of str.format() — presumably these templates go through two formatting
# passes; confirm against the prompt-rendering code before relying on it.
SYSTEM_PROMPT_VAR = "{{{{system_prompt}}}}"
INSTRUCTIONS_VAR = "{{{{instructions}}}}"
EXAMPLES_VAR = "{{{{examples}}}}"
SUBTOPICS_VAR = "{{{{subtopics}}}}"

# Retry and backoff settings
MAX_RETRY_ATTEMPTS = 3
RETRY_BASE_DELAY = 2  # seconds
EXPONENTIAL_BACKOFF_MULTIPLIER = 2
DEFAULT_SAMPLE_RETRIES = 2  # per-sample retries for validation errors

# String length limits
MAX_ERROR_PREVIEW_LENGTH = 200  # max chars of an error shown in previews
TRUNCATION_SUFFIX = "..."

# Validation patterns (regexes for locating JSON in raw LLM responses)
JSON_BLOCK_PATTERN = r"(?s)\{.*\}"  # (?s) lets '.' match newlines
JSON_ARRAY_PATTERN = r"\[.*\]"
JSON_CODE_BLOCK_PATTERN = r"```json\s*|\s*```"  # strips markdown code fences

# Default tags
DEFAULT_HF_TAGS = ["deepfabric", "synthetic"]
DEFAULT_KAGGLE_TAGS = ["deepfabric", "synthetic"]

# Error categories: internal key -> human-readable label
ERROR_CATEGORIES = {
    "json_parsing_errors": "JSON Parsing Errors",
    "invalid_schema": "Invalid Schema",
    "api_errors": "API Errors",
    "authentication_error": "Authentication Errors",
    "empty_responses": "Empty Responses",
    "malformed_responses": "Malformed Responses",
    "other_errors": "Other Errors",
}

# API error indicators (substrings used to classify API failures)
API_ERROR_INDICATORS = ["timeout", "rate limit", "connection"]

# Special characters that need cleaning in JSON responses
JSON_SPECIAL_CHARS = "{}"

# Progress display settings
PROGRESS_BAR_DESC = "Progress"

# File save patterns (fallback filenames for interrupted/failed runs)
INTERRUPTED_DATASET_FILENAME = "interrupted_dataset.jsonl"
ERROR_DATASET_FILENAME = "error_dataset.jsonl"
PARTIAL_TREE_FILENAME = "partial_tree.jsonl"
FAILED_TREE_SUFFIX = "_failed.jsonl"

# Stream simulation defaults
STREAM_SIM_CHUNK_SIZE = 8  # characters per chunk
STREAM_SIM_CHUNK_DELAY_MS = 10.0  # milliseconds between chunks
|