DeepFabric 4.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71) hide show
  1. deepfabric/__init__.py +70 -0
  2. deepfabric/__main__.py +6 -0
  3. deepfabric/auth.py +382 -0
  4. deepfabric/builders.py +303 -0
  5. deepfabric/builders_agent.py +1304 -0
  6. deepfabric/cli.py +1288 -0
  7. deepfabric/config.py +899 -0
  8. deepfabric/config_manager.py +251 -0
  9. deepfabric/constants.py +94 -0
  10. deepfabric/dataset_manager.py +534 -0
  11. deepfabric/error_codes.py +581 -0
  12. deepfabric/evaluation/__init__.py +47 -0
  13. deepfabric/evaluation/backends/__init__.py +32 -0
  14. deepfabric/evaluation/backends/ollama_backend.py +137 -0
  15. deepfabric/evaluation/backends/tool_call_parsers.py +409 -0
  16. deepfabric/evaluation/backends/transformers_backend.py +326 -0
  17. deepfabric/evaluation/evaluator.py +845 -0
  18. deepfabric/evaluation/evaluators/__init__.py +13 -0
  19. deepfabric/evaluation/evaluators/base.py +104 -0
  20. deepfabric/evaluation/evaluators/builtin/__init__.py +5 -0
  21. deepfabric/evaluation/evaluators/builtin/tool_calling.py +93 -0
  22. deepfabric/evaluation/evaluators/registry.py +66 -0
  23. deepfabric/evaluation/inference.py +155 -0
  24. deepfabric/evaluation/metrics.py +397 -0
  25. deepfabric/evaluation/parser.py +304 -0
  26. deepfabric/evaluation/reporters/__init__.py +13 -0
  27. deepfabric/evaluation/reporters/base.py +56 -0
  28. deepfabric/evaluation/reporters/cloud_reporter.py +195 -0
  29. deepfabric/evaluation/reporters/file_reporter.py +61 -0
  30. deepfabric/evaluation/reporters/multi_reporter.py +56 -0
  31. deepfabric/exceptions.py +67 -0
  32. deepfabric/factory.py +26 -0
  33. deepfabric/generator.py +1084 -0
  34. deepfabric/graph.py +545 -0
  35. deepfabric/hf_hub.py +214 -0
  36. deepfabric/kaggle_hub.py +219 -0
  37. deepfabric/llm/__init__.py +41 -0
  38. deepfabric/llm/api_key_verifier.py +534 -0
  39. deepfabric/llm/client.py +1206 -0
  40. deepfabric/llm/errors.py +105 -0
  41. deepfabric/llm/rate_limit_config.py +262 -0
  42. deepfabric/llm/rate_limit_detector.py +278 -0
  43. deepfabric/llm/retry_handler.py +270 -0
  44. deepfabric/metrics.py +212 -0
  45. deepfabric/progress.py +262 -0
  46. deepfabric/prompts.py +290 -0
  47. deepfabric/schemas.py +1000 -0
  48. deepfabric/spin/__init__.py +6 -0
  49. deepfabric/spin/client.py +263 -0
  50. deepfabric/spin/models.py +26 -0
  51. deepfabric/stream_simulator.py +90 -0
  52. deepfabric/tools/__init__.py +5 -0
  53. deepfabric/tools/defaults.py +85 -0
  54. deepfabric/tools/loader.py +87 -0
  55. deepfabric/tools/mcp_client.py +677 -0
  56. deepfabric/topic_manager.py +303 -0
  57. deepfabric/topic_model.py +20 -0
  58. deepfabric/training/__init__.py +35 -0
  59. deepfabric/training/api_key_prompt.py +302 -0
  60. deepfabric/training/callback.py +363 -0
  61. deepfabric/training/metrics_sender.py +301 -0
  62. deepfabric/tree.py +438 -0
  63. deepfabric/tui.py +1267 -0
  64. deepfabric/update_checker.py +166 -0
  65. deepfabric/utils.py +150 -0
  66. deepfabric/validation.py +143 -0
  67. deepfabric-4.4.0.dist-info/METADATA +702 -0
  68. deepfabric-4.4.0.dist-info/RECORD +71 -0
  69. deepfabric-4.4.0.dist-info/WHEEL +4 -0
  70. deepfabric-4.4.0.dist-info/entry_points.txt +2 -0
  71. deepfabric-4.4.0.dist-info/licenses/LICENSE +201 -0
@@ -0,0 +1,251 @@
1
+ import yaml
2
+
3
+ from pydantic import ValidationError
4
+
5
+ from .config import DeepFabricConfig
6
+ from .constants import (
7
+ DEFAULT_MAX_RETRIES,
8
+ DEFAULT_MODEL,
9
+ DEFAULT_PROVIDER,
10
+ ENGINE_DEFAULT_BATCH_SIZE,
11
+ ENGINE_DEFAULT_NUM_EXAMPLES,
12
+ ENGINE_DEFAULT_TEMPERATURE,
13
+ TOPIC_GRAPH_DEFAULT_DEGREE,
14
+ TOPIC_GRAPH_DEFAULT_DEPTH,
15
+ TOPIC_GRAPH_DEFAULT_TEMPERATURE,
16
+ TOPIC_TREE_DEFAULT_DEGREE,
17
+ TOPIC_TREE_DEFAULT_DEPTH,
18
+ TOPIC_TREE_DEFAULT_TEMPERATURE,
19
+ )
20
+ from .exceptions import ConfigurationError
21
+ from .tui import get_tui
22
+
23
+
24
def load_config(  # noqa: PLR0913
    config_file: str | None,
    topic_prompt: str | None = None,
    topics_system_prompt: str | None = None,
    generation_system_prompt: str | None = None,
    output_system_prompt: str | None = None,
    provider: str | None = None,
    model: str | None = None,
    temperature: float | None = None,
    degree: int | None = None,
    depth: int | None = None,
    num_samples: int | None = None,
    batch_size: int | None = None,
    topics_save_as: str | None = None,
    output_save_as: str | None = None,
    include_system_message: bool | None = None,
    mode: str = "tree",
    # Modular conversation configuration
    conversation_type: str | None = None,
    reasoning_style: str | None = None,
    agent_mode: str | None = None,
) -> DeepFabricConfig:
    """
    Load configuration from YAML file or create minimal config from CLI arguments.

    Args:
        config_file: Path to YAML configuration file
        topic_prompt: Starting topic/seed for topic generation
        topics_system_prompt: System prompt for topic generation
        generation_system_prompt: System prompt for dataset content generation
        output_system_prompt: System prompt for final dataset output
        provider: LLM provider
        model: Model name
        temperature: Temperature setting (0.0 is a valid, honored value)
        degree: Branching factor
        depth: Depth of tree/graph
        num_samples: Number of samples to generate
        batch_size: Batch size for generation
        topics_save_as: Path to save topics
        output_save_as: Path to save dataset
        include_system_message: Include system message in dataset
        mode: Topic generation mode (tree or graph)
        conversation_type: Base conversation type (basic, chain_of_thought)
        reasoning_style: Reasoning style for chain_of_thought (freetext, agent)
        agent_mode: Agent mode (single_turn, multi_turn)

    Returns:
        DeepFabricConfig object

    Raises:
        ConfigurationError: If config file is invalid or required parameters are missing
    """
    if config_file:
        # Config file takes precedence; map low-level load failures to ConfigurationError
        # so the CLI surfaces a single, consistent exception type.
        try:
            config = DeepFabricConfig.from_yaml(config_file)
        except FileNotFoundError as e:
            raise ConfigurationError(f"Config file not found: {config_file}") from e
        except yaml.YAMLError as e:
            raise ConfigurationError(f"Invalid YAML in config file: {str(e)}") from e
        except Exception as e:
            raise ConfigurationError(f"Error loading config file: {str(e)}") from e
        else:
            return config

    # No config file provided - create minimal configuration from CLI args
    if not topic_prompt:
        raise ConfigurationError("--topic-prompt is required when no config file is provided")

    tui = get_tui()
    tui.info("No config file provided - using CLI parameters")

    # Create minimal config dict with new structure
    default_prompt = generation_system_prompt or "You are a helpful AI assistant."

    # Build conversation config
    conversation_config = {"type": conversation_type or "basic"}
    if reasoning_style:
        conversation_config["reasoning_style"] = reasoning_style
    if agent_mode:
        conversation_config["agent_mode"] = agent_mode

    # BUGFIX: use `is not None` rather than `or` for temperature so that an
    # explicit CLI value of 0.0 (fully deterministic sampling) is honored
    # instead of silently falling back to the defaults.
    if mode == "graph":
        default_topic_temperature = TOPIC_GRAPH_DEFAULT_TEMPERATURE
    else:
        default_topic_temperature = TOPIC_TREE_DEFAULT_TEMPERATURE
    topic_temperature = temperature if temperature is not None else default_topic_temperature
    engine_temperature = temperature if temperature is not None else ENGINE_DEFAULT_TEMPERATURE

    minimal_config = {
        "topics": {
            "prompt": topic_prompt,
            "mode": mode,
            "system_prompt": topics_system_prompt or "",
            "depth": depth
            or (TOPIC_GRAPH_DEFAULT_DEPTH if mode == "graph" else TOPIC_TREE_DEFAULT_DEPTH),
            "degree": degree
            or (TOPIC_GRAPH_DEFAULT_DEGREE if mode == "graph" else TOPIC_TREE_DEFAULT_DEGREE),
            "save_as": topics_save_as
            or ("topic_graph.json" if mode == "graph" else "topic_tree.jsonl"),
            "llm": {
                "provider": provider or DEFAULT_PROVIDER,
                "model": model or DEFAULT_MODEL,
                "temperature": topic_temperature,
            },
        },
        "generation": {
            "system_prompt": default_prompt,
            "instructions": "Generate diverse and educational examples",
            "conversation": conversation_config,
            "max_retries": DEFAULT_MAX_RETRIES,
            "llm": {
                "provider": provider or DEFAULT_PROVIDER,
                "model": model or DEFAULT_MODEL,
                "temperature": engine_temperature,
            },
        },
        "output": {
            "system_prompt": output_system_prompt,
            "include_system_message": include_system_message
            if include_system_message is not None
            else True,
            "num_samples": num_samples or ENGINE_DEFAULT_NUM_EXAMPLES,
            "batch_size": batch_size or ENGINE_DEFAULT_BATCH_SIZE,
            "save_as": output_save_as or "dataset.jsonl",
        },
    }

    try:
        return DeepFabricConfig.model_validate(minimal_config)
    except ValidationError as e:
        raise ConfigurationError(f"Invalid configuration: {str(e)}") from e
153
+
154
+
155
+ def apply_cli_overrides(
156
+ output_system_prompt: str | None = None,
157
+ topic_prompt: str | None = None,
158
+ topics_system_prompt: str | None = None,
159
+ generation_system_prompt: str | None = None,
160
+ provider: str | None = None,
161
+ model: str | None = None,
162
+ temperature: float | None = None,
163
+ degree: int | None = None,
164
+ depth: int | None = None,
165
+ base_url: str | None = None,
166
+ ) -> tuple[dict, dict]:
167
+ """
168
+ Build override dictionaries from CLI parameters.
169
+
170
+ Args:
171
+ output_system_prompt: Override for output system prompt
172
+ topic_prompt: Override for topic prompt
173
+ topics_system_prompt: Override for topics system prompt
174
+ generation_system_prompt: Override for generation system prompt
175
+ provider: Override for LLM provider
176
+ model: Override for model name
177
+ temperature: Override for temperature
178
+ degree: Override for branching factor
179
+ depth: Override for depth
180
+ base_url: Override for base URL
181
+
182
+ Returns:
183
+ Tuple of (topics_overrides, generation_overrides) dictionaries
184
+ """
185
+ # Prepare topics overrides
186
+ topics_overrides = {}
187
+ if topic_prompt:
188
+ topics_overrides["topic_prompt"] = topic_prompt
189
+ if topics_system_prompt:
190
+ topics_overrides["topic_system_prompt"] = topics_system_prompt
191
+ if provider:
192
+ topics_overrides["provider"] = provider
193
+ if model:
194
+ topics_overrides["model"] = model
195
+ if temperature:
196
+ topics_overrides["temperature"] = temperature
197
+ if degree:
198
+ topics_overrides["degree"] = degree
199
+ if depth:
200
+ topics_overrides["depth"] = depth
201
+ if base_url:
202
+ topics_overrides["base_url"] = base_url
203
+
204
+ # Prepare generation overrides
205
+ generation_overrides = {}
206
+ if generation_system_prompt:
207
+ generation_overrides["generation_system_prompt"] = generation_system_prompt
208
+ if output_system_prompt:
209
+ generation_overrides["dataset_system_prompt"] = output_system_prompt
210
+ if provider:
211
+ generation_overrides["provider"] = provider
212
+ if model:
213
+ generation_overrides["model"] = model
214
+ if temperature:
215
+ generation_overrides["temperature"] = temperature
216
+ if base_url:
217
+ generation_overrides["base_url"] = base_url
218
+
219
+ return topics_overrides, generation_overrides
220
+
221
+
222
def get_final_parameters(
    config: DeepFabricConfig,
    num_samples: int | None = None,
    batch_size: int | None = None,
    depth: int | None = None,
    degree: int | None = None,
) -> tuple[int, int, int, int]:
    """
    Resolve the effective generation parameters.

    Each CLI value, when supplied, wins over the corresponding value taken
    from the configuration object.

    Args:
        config: DeepFabricConfig object
        num_samples: CLI override for num_samples
        batch_size: CLI override for batch_size
        depth: CLI override for depth
        degree: CLI override for degree

    Returns:
        Tuple of (num_samples, batch_size, depth, degree)
    """
    # Config-file fallbacks: sampling settings live in the output section,
    # topology settings (depth/degree) in the topics section.
    output_settings = config.get_output_config()
    topics_settings = config.topics

    return (
        num_samples or output_settings["num_samples"],
        batch_size or output_settings["batch_size"],
        depth or topics_settings.depth,
        degree or topics_settings.degree,
    )
@@ -0,0 +1,94 @@
1
# Default values shared across the package (LLM settings, retries, batching).
DEFAULT_PROVIDER = "openai"
DEFAULT_MODEL = "gpt-4o"
DEFAULT_TEMPERATURE = 0.7
DEFAULT_DEGREE = 3
DEFAULT_DEPTH = 2
DEFAULT_MAX_RETRIES = 3
DEFAULT_BATCH_SIZE = 5
DEFAULT_NUM_EXAMPLES = 3
DEFAULT_REQUEST_TIMEOUT = 30  # seconds
DEFAULT_MAX_TOKENS = 1000

# Engine defaults (dataset generation engine; lower temperature than the
# package-wide default for more deterministic sample generation)
ENGINE_DEFAULT_TEMPERATURE = 0.2
ENGINE_DEFAULT_BATCH_SIZE = 5
ENGINE_DEFAULT_NUM_EXAMPLES = 3

# Topic tree defaults
TOPIC_TREE_DEFAULT_DEGREE = 10
TOPIC_TREE_DEFAULT_DEPTH = 3
TOPIC_TREE_DEFAULT_TEMPERATURE = 0.2
TOPIC_TREE_DEFAULT_MODEL = "gpt-4o"

# Topic graph defaults
TOPIC_GRAPH_DEFAULT_DEGREE = 10
TOPIC_GRAPH_DEFAULT_DEPTH = 3
TOPIC_GRAPH_SUMMARY = 20  # NOTE(review): units/meaning unclear from here — presumably a summary length limit; confirm against usage
TOPIC_GRAPH_DEFAULT_MODEL = "gpt-4o"
TOPIC_GRAPH_DEFAULT_TEMPERATURE = 0.7

# File extensions and patterns
JSONL_EXTENSION = ".jsonl"
YAML_EXTENSIONS = (".yaml", ".yml")

# Message roles (chat-format datasets)
ROLE_SYSTEM = "system"
ROLE_USER = "user"
ROLE_ASSISTANT = "assistant"
VALID_ROLES = [ROLE_SYSTEM, ROLE_USER, ROLE_ASSISTANT]

# Placeholders used in prompt templates.
# NOTE(review): the quadruple braces suggest these strings pass through a
# `str.format` step that halves them to "{{system_prompt}}" — confirm against
# the template-rendering code before relying on the literal value.
SYSTEM_PROMPT_VAR = "{{{{system_prompt}}}}"
INSTRUCTIONS_VAR = "{{{{instructions}}}}"
EXAMPLES_VAR = "{{{{examples}}}}"
SUBTOPICS_VAR = "{{{{subtopics}}}}"

# Retry and backoff settings
MAX_RETRY_ATTEMPTS = 3
RETRY_BASE_DELAY = 2  # seconds
EXPONENTIAL_BACKOFF_MULTIPLIER = 2
DEFAULT_SAMPLE_RETRIES = 2  # per-sample retries for validation errors

# String length limits
MAX_ERROR_PREVIEW_LENGTH = 200  # characters kept when previewing error text
TRUNCATION_SUFFIX = "..."

# Validation patterns for extracting JSON from LLM responses.
# (?s) makes "." match newlines so multi-line objects are captured.
JSON_BLOCK_PATTERN = r"(?s)\{.*\}"
JSON_ARRAY_PATTERN = r"\[.*\]"
JSON_CODE_BLOCK_PATTERN = r"```json\s*|\s*```"  # strips markdown code fences

# Default tags applied when publishing datasets to hubs
DEFAULT_HF_TAGS = ["deepfabric", "synthetic"]
DEFAULT_KAGGLE_TAGS = ["deepfabric", "synthetic"]

# Error categories: internal key -> human-readable label for reporting
ERROR_CATEGORIES = {
    "json_parsing_errors": "JSON Parsing Errors",
    "invalid_schema": "Invalid Schema",
    "api_errors": "API Errors",
    "authentication_error": "Authentication Errors",
    "empty_responses": "Empty Responses",
    "malformed_responses": "Malformed Responses",
    "other_errors": "Other Errors",
}

# Substrings used to classify an error message as an API-level failure
API_ERROR_INDICATORS = ["timeout", "rate limit", "connection"]

# Special characters that need cleaning in JSON responses
JSON_SPECIAL_CHARS = "{}"

# Progress display settings
PROGRESS_BAR_DESC = "Progress"

# File save patterns for partial/failed runs
INTERRUPTED_DATASET_FILENAME = "interrupted_dataset.jsonl"
ERROR_DATASET_FILENAME = "error_dataset.jsonl"
PARTIAL_TREE_FILENAME = "partial_tree.jsonl"
FAILED_TREE_SUFFIX = "_failed.jsonl"

# Stream simulation defaults
STREAM_SIM_CHUNK_SIZE = 8  # characters per chunk
STREAM_SIM_CHUNK_DELAY_MS = 10.0  # milliseconds between chunks