deepfabric-4.4.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71)
  1. deepfabric/__init__.py +70 -0
  2. deepfabric/__main__.py +6 -0
  3. deepfabric/auth.py +382 -0
  4. deepfabric/builders.py +303 -0
  5. deepfabric/builders_agent.py +1304 -0
  6. deepfabric/cli.py +1288 -0
  7. deepfabric/config.py +899 -0
  8. deepfabric/config_manager.py +251 -0
  9. deepfabric/constants.py +94 -0
  10. deepfabric/dataset_manager.py +534 -0
  11. deepfabric/error_codes.py +581 -0
  12. deepfabric/evaluation/__init__.py +47 -0
  13. deepfabric/evaluation/backends/__init__.py +32 -0
  14. deepfabric/evaluation/backends/ollama_backend.py +137 -0
  15. deepfabric/evaluation/backends/tool_call_parsers.py +409 -0
  16. deepfabric/evaluation/backends/transformers_backend.py +326 -0
  17. deepfabric/evaluation/evaluator.py +845 -0
  18. deepfabric/evaluation/evaluators/__init__.py +13 -0
  19. deepfabric/evaluation/evaluators/base.py +104 -0
  20. deepfabric/evaluation/evaluators/builtin/__init__.py +5 -0
  21. deepfabric/evaluation/evaluators/builtin/tool_calling.py +93 -0
  22. deepfabric/evaluation/evaluators/registry.py +66 -0
  23. deepfabric/evaluation/inference.py +155 -0
  24. deepfabric/evaluation/metrics.py +397 -0
  25. deepfabric/evaluation/parser.py +304 -0
  26. deepfabric/evaluation/reporters/__init__.py +13 -0
  27. deepfabric/evaluation/reporters/base.py +56 -0
  28. deepfabric/evaluation/reporters/cloud_reporter.py +195 -0
  29. deepfabric/evaluation/reporters/file_reporter.py +61 -0
  30. deepfabric/evaluation/reporters/multi_reporter.py +56 -0
  31. deepfabric/exceptions.py +67 -0
  32. deepfabric/factory.py +26 -0
  33. deepfabric/generator.py +1084 -0
  34. deepfabric/graph.py +545 -0
  35. deepfabric/hf_hub.py +214 -0
  36. deepfabric/kaggle_hub.py +219 -0
  37. deepfabric/llm/__init__.py +41 -0
  38. deepfabric/llm/api_key_verifier.py +534 -0
  39. deepfabric/llm/client.py +1206 -0
  40. deepfabric/llm/errors.py +105 -0
  41. deepfabric/llm/rate_limit_config.py +262 -0
  42. deepfabric/llm/rate_limit_detector.py +278 -0
  43. deepfabric/llm/retry_handler.py +270 -0
  44. deepfabric/metrics.py +212 -0
  45. deepfabric/progress.py +262 -0
  46. deepfabric/prompts.py +290 -0
  47. deepfabric/schemas.py +1000 -0
  48. deepfabric/spin/__init__.py +6 -0
  49. deepfabric/spin/client.py +263 -0
  50. deepfabric/spin/models.py +26 -0
  51. deepfabric/stream_simulator.py +90 -0
  52. deepfabric/tools/__init__.py +5 -0
  53. deepfabric/tools/defaults.py +85 -0
  54. deepfabric/tools/loader.py +87 -0
  55. deepfabric/tools/mcp_client.py +677 -0
  56. deepfabric/topic_manager.py +303 -0
  57. deepfabric/topic_model.py +20 -0
  58. deepfabric/training/__init__.py +35 -0
  59. deepfabric/training/api_key_prompt.py +302 -0
  60. deepfabric/training/callback.py +363 -0
  61. deepfabric/training/metrics_sender.py +301 -0
  62. deepfabric/tree.py +438 -0
  63. deepfabric/tui.py +1267 -0
  64. deepfabric/update_checker.py +166 -0
  65. deepfabric/utils.py +150 -0
  66. deepfabric/validation.py +143 -0
  67. deepfabric-4.4.0.dist-info/METADATA +702 -0
  68. deepfabric-4.4.0.dist-info/RECORD +71 -0
  69. deepfabric-4.4.0.dist-info/WHEEL +4 -0
  70. deepfabric-4.4.0.dist-info/entry_points.txt +2 -0
  71. deepfabric-4.4.0.dist-info/licenses/LICENSE +201 -0
deepfabric/hf_hub.py ADDED
@@ -0,0 +1,214 @@
+ import json
+ import tempfile
+
+ from pathlib import Path
+
+ from huggingface_hub import DatasetCard, HfApi, login
+ from huggingface_hub.errors import HfHubHTTPError, RepositoryNotFoundError
+
+ from .constants import DEFAULT_HF_TAGS
+
+
+ class HFUploader:
+     """
+     HFUploader is a class for uploading datasets to the Hugging Face Hub.
+
+     Methods
+     -------
+     __init__(hf_token)
+
+     push_to_hub(hf_dataset_repo, jsonl_file_path, tags=None)
+
+     Parameters
+     ----------
+     hf_dataset_repo : str
+         The repository name in the format 'username/dataset_name'.
+     jsonl_file_path : str
+         Path to the JSONL file.
+     tags : list[str], optional
+         List of tags to add to the dataset card.
+
+     Returns
+     -------
+     dict
+         A dictionary containing the status and a message.
+     """
+
+     def __init__(self, hf_token):
+         """
+         Initialize the uploader with the Hugging Face authentication token.
+
+         Parameters:
+             hf_token (str): Hugging Face Hub authentication token.
+         """
+         self.hf_token = hf_token
+
+     def _clean_dataset_for_upload(self, jsonl_file_path: str) -> str:
+         """
+         Clean dataset by removing empty question/final_answer fields.
+
+         This prevents empty columns from appearing in HuggingFace/Kaggle dataset viewers.
+
+         Parameters:
+             jsonl_file_path (str): Path to the original JSONL file.
+
+         Returns:
+             str: Path to cleaned file (temp file if cleaning was needed, original if not).
+         """
+         # Read the dataset and check if cleaning is needed
+         needs_cleaning = False
+         samples = []
+
+         with open(jsonl_file_path) as f:
+             for line in f:
+                 if not line.strip():
+                     continue
+                 sample = json.loads(line)
+                 samples.append(sample)
+
+                 # Check if any sample has empty question/final_answer
+                 if sample.get("question") == "" or sample.get("final_answer") == "":
+                     needs_cleaning = True
+
+         # If no cleaning needed, return original file
+         if not needs_cleaning:
+             return jsonl_file_path
+
+         # Create a temporary file with cleaned data
+         with tempfile.NamedTemporaryFile(mode="w", suffix=".jsonl", delete=False) as tmp_file:
+             for sample in samples:
+                 # Remove empty question/final_answer fields
+                 if sample.get("question") == "":
+                     sample.pop("question", None)
+                 if sample.get("final_answer") == "":
+                     sample.pop("final_answer", None)
+
+                 tmp_file.write(json.dumps(sample) + "\n")
+
+         return tmp_file.name
+
+     def update_dataset_card(self, repo_id: str, tags: list[str] | None = None):
+         """
+         Update the dataset card with tags.
+
+         Parameters:
+             repo_id (str): The repository ID in the format 'username/dataset_name'.
+             tags (list[str], optional): List of tags to add to the dataset card.
+         """
+         try:
+             # Try to load existing card, or create a new one if it doesn't exist
+             try:
+                 card = DatasetCard.load(repo_id)
+             except Exception:
+                 # No existing card - create a new one with basic content
+                 card_content = f"---\ntags: []\n---\n# {repo_id.split('/')[-1]}\n\nDataset generated with DeepFabric.\n"
+                 card = DatasetCard(card_content)
+
+             # Initialize tags if not present - use getattr for safe access
+             current_tags = getattr(card.data, "tags", None)
+             if not current_tags or not isinstance(current_tags, list):
+                 current_tags = []
+                 setattr(card.data, "tags", current_tags)  # noqa: B010
+
+             # Add default deepfabric tags
+             for tag in DEFAULT_HF_TAGS:
+                 if tag not in current_tags:
+                     current_tags.append(tag)
+
+             # Add custom tags if provided
+             if tags:
+                 for tag in tags:
+                     if tag not in current_tags:
+                         current_tags.append(tag)
+
+             # Use getattr to safely access push_to_hub method
+             push_method = getattr(card, "push_to_hub", None)
+             if push_method:
+                 push_method(repo_id, token=self.hf_token)
+             return True  # noqa: TRY300
+         except Exception as e:
+             print(f"Warning: Failed to update dataset card: {str(e)}")  # nosec
+             return False
+
+     def push_to_hub(
+         self, hf_dataset_repo: str, jsonl_file_path: str, tags: list[str] | None = None
+     ):
+         """
+         Push a JSONL dataset to Hugging Face Hub.
+
+         Parameters:
+             hf_dataset_repo (str): The repository name in the format 'username/dataset_name'.
+             jsonl_file_path (str): Path to the JSONL file.
+             tags (list[str], optional): List of tags to add to the dataset card.
+
+         Returns:
+             dict: A dictionary containing the status and a message.
+         """
+         try:
+             login(token=self.hf_token)
+
+             # Clean empty question/final_answer fields to avoid empty columns in dataset viewers
+             cleaned_file = self._clean_dataset_for_upload(jsonl_file_path)
+
+             # Upload JSONL file directly using HfApi to avoid schema inference issues
+             # The datasets library tries to unify schemas across rows which fails when
+             # tool arguments have different fields (e.g., different tools have different params)
+             api = HfApi()
+
+             # Create the repo if it doesn't exist (type="dataset" for dataset repos)
+             api.create_repo(
+                 repo_id=hf_dataset_repo,
+                 repo_type="dataset",
+                 exist_ok=True,
+                 token=self.hf_token,
+             )
+
+             # Upload the JSONL file to the data/ directory (standard HF dataset structure)
+             api.upload_file(
+                 path_or_fileobj=cleaned_file,
+                 path_in_repo="data/train.jsonl",
+                 repo_id=hf_dataset_repo,
+                 repo_type="dataset",
+                 token=self.hf_token,
+             )
+
+             # Update dataset card with tags
+             self.update_dataset_card(hf_dataset_repo, tags)
+
+             # Clean up temp file if we created one
+             if cleaned_file != jsonl_file_path:
+                 Path(cleaned_file).unlink(missing_ok=True)
+
+         except RepositoryNotFoundError:
+             return {
+                 "status": "error",
+                 "message": f"Repository '{hf_dataset_repo}' not found. Please check your repository name.",
+             }
+
+         except HfHubHTTPError as e:
+             return {
+                 "status": "error",
+                 "message": f"Hugging Face Hub HTTP Error: {str(e)}",
+             }
+
+         except FileNotFoundError:
+             return {
+                 "status": "error",
+                 "message": f"File '{jsonl_file_path}' not found. Please check your file path.",
+             }
+
+         except Exception as e:
+             # Include the full exception chain for better debugging
+             error_msg = str(e)
+             if hasattr(e, "__cause__") and e.__cause__:
+                 error_msg = f"{error_msg} (caused by: {e.__cause__})"
+             return {
+                 "status": "error",
+                 "message": f"An unexpected error occurred: {error_msg}",
+             }
+
+         else:
+             return {
+                 "status": "success",
+                 "message": f"Dataset pushed successfully to {hf_dataset_repo}.",
+             }
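
For orientation, a minimal usage sketch of HFUploader, based only on the signatures shown in this diff; the token, repository name, and file path are placeholders, not values from the package:

    # Hypothetical usage of HFUploader (placeholder values throughout).
    from deepfabric.hf_hub import HFUploader

    uploader = HFUploader(hf_token="hf_xxx")  # placeholder token
    result = uploader.push_to_hub(
        hf_dataset_repo="your-username/your-dataset",  # placeholder repo id
        jsonl_file_path="dataset.jsonl",               # placeholder path
        tags=["synthetic"],
    )
    print(result["status"], result["message"])

Note that push_to_hub reports most failures through the returned status dict rather than raising, so callers should check result["status"] before proceeding.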
deepfabric/kaggle_hub.py ADDED
@@ -0,0 +1,219 @@
+ import json
+ import os
+ import shutil
+ import tempfile
+
+ from contextlib import contextmanager
+ from pathlib import Path
+
+ import kagglehub
+
+ from .constants import DEFAULT_KAGGLE_TAGS
+
+ # Constants
+ EXPECTED_HANDLE_PARTS = 2
+
+
+ class KaggleUploader:
+     """
+     KaggleUploader is a class for uploading datasets to Kaggle.
+
+     Methods
+     -------
+     __init__(kaggle_username, kaggle_key)
+
+     push_to_hub(dataset_handle, jsonl_file_path, tags=None, version_notes=None)
+
+     Parameters
+     ----------
+     dataset_handle : str
+         The dataset handle in the format 'username/dataset-name'.
+     jsonl_file_path : str
+         Path to the JSONL file.
+     tags : list[str], optional
+         List of tags to add to the dataset.
+     version_notes : str, optional
+         Notes for the dataset version.
+
+     Returns
+     -------
+     dict
+         A dictionary containing the status and a message.
+     """
+
+     def __init__(self, kaggle_username: str | None = None, kaggle_key: str | None = None):
+         """
+         Initialize the uploader with Kaggle authentication credentials.
+
+         Parameters:
+             kaggle_username (str, optional): Kaggle username (can also be set via KAGGLE_USERNAME env var).
+             kaggle_key (str, optional): Kaggle API key (can also be set via KAGGLE_KEY env var).
+         """
+         self.kaggle_username = kaggle_username or os.getenv("KAGGLE_USERNAME")
+         self.kaggle_key = kaggle_key or os.getenv("KAGGLE_KEY")
+
+         if not self.kaggle_username or not self.kaggle_key:
+             raise ValueError(
+                 "Kaggle credentials not provided. "
+                 "Set via constructor params or KAGGLE_USERNAME/KAGGLE_KEY env vars."
+             )
+
+     @contextmanager
+     def _kaggle_credentials(self):
+         """Context manager to temporarily set Kaggle credentials in environment."""
+         # Store original values to restore later
+         original_username = os.environ.get("KAGGLE_USERNAME")
+         original_key = os.environ.get("KAGGLE_KEY")
+
+         try:
+             # Set credentials for kagglehub
+             os.environ["KAGGLE_USERNAME"] = self.kaggle_username  # type: ignore
+             os.environ["KAGGLE_KEY"] = self.kaggle_key  # type: ignore
+             yield
+         finally:
+             # Restore original environment state
+             if original_username is None:
+                 os.environ.pop("KAGGLE_USERNAME", None)
+             else:
+                 os.environ["KAGGLE_USERNAME"] = original_username
+
+             if original_key is None:
+                 os.environ.pop("KAGGLE_KEY", None)
+             else:
+                 os.environ["KAGGLE_KEY"] = original_key
+
+     def create_dataset_metadata(
+         self, dataset_handle: str, tags: list[str] | None = None, description: str | None = None
+     ) -> dict:
+         """
+         Create metadata for the Kaggle dataset.
+
+         Parameters:
+             dataset_handle (str): The dataset handle in the format 'username/dataset-name'.
+             tags (list[str], optional): List of tags for the dataset.
+             description (str, optional): Description for the dataset.
+
+         Returns:
+             dict: Metadata dictionary for the dataset.
+         """
+         # Parse the dataset handle
+         parts = dataset_handle.split("/")
+         if len(parts) != EXPECTED_HANDLE_PARTS:
+             raise ValueError(
+                 f"Invalid dataset handle format: {dataset_handle}. Expected 'username/dataset-name'"
+             )
+
+         username, dataset_name = parts
+
+         # Add default deepfabric tags
+         all_tags = set(DEFAULT_KAGGLE_TAGS)
+         if tags:
+             all_tags.update(tags)
+
+         metadata = {
+             "title": dataset_name.replace("-", " ").title(),
+             "id": f"{username}/{dataset_name}",
+             "licenses": [{"name": "CC0-1.0"}],
+             "tags": list(all_tags),
+         }
+
+         if description:
+             metadata["description"] = description
+         else:
+             metadata["description"] = "Synthetic dataset generated using DeepFabric"
+
+         return metadata
+
+     def _handle_upload_error(self, error: Exception, dataset_handle: str) -> dict | None:
+         """Handle specific upload errors and return appropriate error response."""
+         error_msg = str(error)
+         if "404" in error_msg or "not found" in error_msg.lower():
+             return {
+                 "status": "error",
+                 "message": (
+                     f"Dataset '{dataset_handle}' not found. "
+                     "You may need to create it first on Kaggle.com"
+                 ),
+             }
+         if "401" in error_msg or "unauthorized" in error_msg.lower():
+             return {
+                 "status": "error",
+                 "message": "Authentication failed. Please check your Kaggle credentials.",
+             }
+         if "403" in error_msg or "forbidden" in error_msg.lower():
+             return {
+                 "status": "error",
+                 "message": f"Permission denied. You may not have access to update {dataset_handle}.",
+             }
+         return None
+
+     def push_to_hub(
+         self,
+         dataset_handle: str,
+         jsonl_file_path: str,
+         tags: list[str] | None = None,
+         version_notes: str | None = None,
+         description: str | None = None,
+     ) -> dict[str, str]:
+         """
+         Push a JSONL dataset to Kaggle.
+
+         Parameters:
+             dataset_handle (str): The dataset handle in the format 'username/dataset-name'.
+             jsonl_file_path (str): Path to the JSONL file.
+             tags (list[str], optional): List of tags to add to the dataset.
+             version_notes (str, optional): Notes for the dataset version.
+             description (str, optional): Description for the dataset.
+
+         Returns:
+             dict: A dictionary containing the status and a message.
+         """
+         result = {"status": "error", "message": ""}
+
+         try:
+             # Create a temporary directory for the dataset
+             with tempfile.TemporaryDirectory() as tmpdir:
+                 tmpdir_path = Path(tmpdir)
+
+                 # Copy the JSONL file to the temp directory
+                 dest_file = tmpdir_path / Path(jsonl_file_path).name
+                 shutil.copy2(jsonl_file_path, dest_file)
+
+                 # Create dataset metadata
+                 metadata = self.create_dataset_metadata(dataset_handle, tags, description)
+                 metadata_path = tmpdir_path / "dataset-metadata.json"
+                 with open(metadata_path, "w") as f:
+                     json.dump(metadata, f, indent=2)
+
+                 # Upload the dataset using kagglehub
+                 version_notes = version_notes or "Dataset uploaded via DeepFabric"
+
+                 try:
+                     # Upload the dataset with temporary credentials
+                     with self._kaggle_credentials():
+                         kagglehub.dataset_upload(
+                             handle=dataset_handle,
+                             local_dataset_dir=str(tmpdir_path),
+                             version_notes=version_notes,
+                         )
+
+                 except Exception as upload_error:
+                     # Handle specific Kaggle errors
+                     error_result = self._handle_upload_error(upload_error, dataset_handle)
+                     if error_result:
+                         return error_result
+                     raise
+                 else:
+                     result["status"] = "success"
+                     result["message"] = f"Dataset pushed successfully to Kaggle: {dataset_handle}"
+
+         except FileNotFoundError:
+             result["message"] = f"File '{jsonl_file_path}' not found. Please check your file path."
+
+         except ValueError as e:
+             result["message"] = f"Invalid configuration: {str(e)}"
+
+         except Exception as e:
+             result["message"] = f"An unexpected error occurred: {str(e)}"
+
+         return result
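
Likewise, a hypothetical sketch for KaggleUploader, using only the constructor and push_to_hub signatures shown above; the credentials and dataset handle are placeholders:

    # Hypothetical usage of KaggleUploader (placeholder values throughout).
    from deepfabric.kaggle_hub import KaggleUploader

    uploader = KaggleUploader(
        kaggle_username="your-username",  # or set KAGGLE_USERNAME in the environment
        kaggle_key="xxxx",                # or set KAGGLE_KEY in the environment
    )
    result = uploader.push_to_hub(
        dataset_handle="your-username/your-dataset",
        jsonl_file_path="dataset.jsonl",
        tags=["synthetic"],
        version_notes="Initial upload",
    )
    print(result["status"], result["message"])

The constructor raises ValueError when no credentials can be resolved, while upload failures are reported through the returned status dict.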
deepfabric/llm/__init__.py ADDED
@@ -0,0 +1,41 @@
+ """LLM abstraction layer for DeepFabric."""
+
+ from .api_key_verifier import (
+     VerificationResult,
+     VerificationStatus,
+     verify_all_api_keys,
+     verify_all_api_keys_async,
+     verify_anthropic_api_key,
+     verify_gemini_api_key,
+     verify_ollama_connection,
+     verify_openai_api_key,
+     verify_openrouter_api_key,
+     verify_provider_api_key,
+     verify_provider_api_key_async,
+ )
+ from .client import (
+     PROVIDER_API_KEY_MAP,
+     LLMClient,
+     get_required_api_key_env_var,
+     make_outlines_model,
+     validate_provider_api_key,
+ )
+
+ __all__ = [
+     "LLMClient",
+     "PROVIDER_API_KEY_MAP",
+     "VerificationResult",
+     "VerificationStatus",
+     "get_required_api_key_env_var",
+     "make_outlines_model",
+     "validate_provider_api_key",
+     "verify_all_api_keys",
+     "verify_all_api_keys_async",
+     "verify_anthropic_api_key",
+     "verify_gemini_api_key",
+     "verify_ollama_connection",
+     "verify_openai_api_key",
+     "verify_openrouter_api_key",
+     "verify_provider_api_key",
+     "verify_provider_api_key_async",
+ ]
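
Since this module only re-exports names, consumers import them from deepfabric.llm directly. The call signatures are defined in modules not shown in this hunk, so only the import surface is illustrated here:

    # Import surface of deepfabric.llm, as declared in __all__ above;
    # the signatures of these names live in .client and .api_key_verifier.
    from deepfabric.llm import (
        LLMClient,
        PROVIDER_API_KEY_MAP,
        verify_all_api_keys,
    )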