deepfabric-4.4.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71)
  1. deepfabric/__init__.py +70 -0
  2. deepfabric/__main__.py +6 -0
  3. deepfabric/auth.py +382 -0
  4. deepfabric/builders.py +303 -0
  5. deepfabric/builders_agent.py +1304 -0
  6. deepfabric/cli.py +1288 -0
  7. deepfabric/config.py +899 -0
  8. deepfabric/config_manager.py +251 -0
  9. deepfabric/constants.py +94 -0
  10. deepfabric/dataset_manager.py +534 -0
  11. deepfabric/error_codes.py +581 -0
  12. deepfabric/evaluation/__init__.py +47 -0
  13. deepfabric/evaluation/backends/__init__.py +32 -0
  14. deepfabric/evaluation/backends/ollama_backend.py +137 -0
  15. deepfabric/evaluation/backends/tool_call_parsers.py +409 -0
  16. deepfabric/evaluation/backends/transformers_backend.py +326 -0
  17. deepfabric/evaluation/evaluator.py +845 -0
  18. deepfabric/evaluation/evaluators/__init__.py +13 -0
  19. deepfabric/evaluation/evaluators/base.py +104 -0
  20. deepfabric/evaluation/evaluators/builtin/__init__.py +5 -0
  21. deepfabric/evaluation/evaluators/builtin/tool_calling.py +93 -0
  22. deepfabric/evaluation/evaluators/registry.py +66 -0
  23. deepfabric/evaluation/inference.py +155 -0
  24. deepfabric/evaluation/metrics.py +397 -0
  25. deepfabric/evaluation/parser.py +304 -0
  26. deepfabric/evaluation/reporters/__init__.py +13 -0
  27. deepfabric/evaluation/reporters/base.py +56 -0
  28. deepfabric/evaluation/reporters/cloud_reporter.py +195 -0
  29. deepfabric/evaluation/reporters/file_reporter.py +61 -0
  30. deepfabric/evaluation/reporters/multi_reporter.py +56 -0
  31. deepfabric/exceptions.py +67 -0
  32. deepfabric/factory.py +26 -0
  33. deepfabric/generator.py +1084 -0
  34. deepfabric/graph.py +545 -0
  35. deepfabric/hf_hub.py +214 -0
  36. deepfabric/kaggle_hub.py +219 -0
  37. deepfabric/llm/__init__.py +41 -0
  38. deepfabric/llm/api_key_verifier.py +534 -0
  39. deepfabric/llm/client.py +1206 -0
  40. deepfabric/llm/errors.py +105 -0
  41. deepfabric/llm/rate_limit_config.py +262 -0
  42. deepfabric/llm/rate_limit_detector.py +278 -0
  43. deepfabric/llm/retry_handler.py +270 -0
  44. deepfabric/metrics.py +212 -0
  45. deepfabric/progress.py +262 -0
  46. deepfabric/prompts.py +290 -0
  47. deepfabric/schemas.py +1000 -0
  48. deepfabric/spin/__init__.py +6 -0
  49. deepfabric/spin/client.py +263 -0
  50. deepfabric/spin/models.py +26 -0
  51. deepfabric/stream_simulator.py +90 -0
  52. deepfabric/tools/__init__.py +5 -0
  53. deepfabric/tools/defaults.py +85 -0
  54. deepfabric/tools/loader.py +87 -0
  55. deepfabric/tools/mcp_client.py +677 -0
  56. deepfabric/topic_manager.py +303 -0
  57. deepfabric/topic_model.py +20 -0
  58. deepfabric/training/__init__.py +35 -0
  59. deepfabric/training/api_key_prompt.py +302 -0
  60. deepfabric/training/callback.py +363 -0
  61. deepfabric/training/metrics_sender.py +301 -0
  62. deepfabric/tree.py +438 -0
  63. deepfabric/tui.py +1267 -0
  64. deepfabric/update_checker.py +166 -0
  65. deepfabric/utils.py +150 -0
  66. deepfabric/validation.py +143 -0
  67. deepfabric-4.4.0.dist-info/METADATA +702 -0
  68. deepfabric-4.4.0.dist-info/RECORD +71 -0
  69. deepfabric-4.4.0.dist-info/WHEEL +4 -0
  70. deepfabric-4.4.0.dist-info/entry_points.txt +2 -0
  71. deepfabric-4.4.0.dist-info/licenses/LICENSE +201 -0
deepfabric/hf_hub.py ADDED
@@ -0,0 +1,214 @@
+ import json
+ import tempfile
+
+ from pathlib import Path
+
+ from huggingface_hub import DatasetCard, HfApi, login
+ from huggingface_hub.errors import HfHubHTTPError, RepositoryNotFoundError
+
+ from .constants import DEFAULT_HF_TAGS
+
+
+ class HFUploader:
+     """
+     HFUploader is a class for uploading datasets to the Hugging Face Hub.
+
+     Methods
+     -------
+     __init__(hf_token)
+
+     push_to_hub(hf_dataset_repo, jsonl_file_path, tags=None)
+
+     Parameters
+     ----------
+     hf_dataset_repo : str
+         The repository name in the format 'username/dataset_name'.
+     jsonl_file_path : str
+         Path to the JSONL file.
+     tags : list[str], optional
+         List of tags to add to the dataset card.
+
+     Returns
+     -------
+     dict
+         A dictionary containing the status and a message.
+     """
+
+     def __init__(self, hf_token):
+         """
+         Initialize the uploader with the Hugging Face authentication token.
+
+         Parameters:
+             hf_token (str): Hugging Face Hub authentication token.
+         """
+         self.hf_token = hf_token
+
+     def _clean_dataset_for_upload(self, jsonl_file_path: str) -> str:
+         """
+         Clean dataset by removing empty question/final_answer fields.
+
+         This prevents empty columns from appearing in HuggingFace/Kaggle dataset viewers.
+
+         Parameters:
+             jsonl_file_path (str): Path to the original JSONL file.
+
+         Returns:
+             str: Path to cleaned file (temp file if cleaning was needed, original if not).
+         """
+         # Read the dataset and check if cleaning is needed
+         needs_cleaning = False
+         samples = []
+
+         with open(jsonl_file_path) as f:
+             for line in f:
+                 if not line.strip():
+                     continue
+                 sample = json.loads(line)
+                 samples.append(sample)
+
+                 # Check if any sample has empty question/final_answer
+                 if sample.get("question") == "" or sample.get("final_answer") == "":
+                     needs_cleaning = True
+
+         # If no cleaning needed, return original file
+         if not needs_cleaning:
+             return jsonl_file_path
+
+         # Create a temporary file with cleaned data
+         with tempfile.NamedTemporaryFile(mode="w", suffix=".jsonl", delete=False) as tmp_file:
+             for sample in samples:
+                 # Remove empty question/final_answer fields
+                 if sample.get("question") == "":
+                     sample.pop("question", None)
+                 if sample.get("final_answer") == "":
+                     sample.pop("final_answer", None)
+
+                 tmp_file.write(json.dumps(sample) + "\n")
+
+         return tmp_file.name
+
+     def update_dataset_card(self, repo_id: str, tags: list[str] | None = None):
+         """
+         Update the dataset card with tags.
+
+         Parameters:
+             repo_id (str): The repository ID in the format 'username/dataset_name'.
+             tags (list[str], optional): List of tags to add to the dataset card.
+         """
+         try:
+             # Try to load existing card, or create a new one if it doesn't exist
+             try:
+                 card = DatasetCard.load(repo_id)
+             except Exception:
+                 # No existing card - create a new one with basic content
+                 card_content = f"---\ntags: []\n---\n# {repo_id.split('/')[-1]}\n\nDataset generated with DeepFabric.\n"
+                 card = DatasetCard(card_content)
+
+             # Initialize tags if not present - use getattr for safe access
+             current_tags = getattr(card.data, "tags", None)
+             if not current_tags or not isinstance(current_tags, list):
+                 current_tags = []
+                 setattr(card.data, "tags", current_tags)  # noqa: B010
+
+             # Add default deepfabric tags
+             for tag in DEFAULT_HF_TAGS:
+                 if tag not in current_tags:
+                     current_tags.append(tag)
+
+             # Add custom tags if provided
+             if tags:
+                 for tag in tags:
+                     if tag not in current_tags:
+                         current_tags.append(tag)
+
+             # Use getattr to safely access push_to_hub method
+             push_method = getattr(card, "push_to_hub", None)
+             if push_method:
+                 push_method(repo_id, token=self.hf_token)
+             return True  # noqa: TRY300
+         except Exception as e:
+             print(f"Warning: Failed to update dataset card: {str(e)}")  # nosec
+             return False
+
+     def push_to_hub(
+         self, hf_dataset_repo: str, jsonl_file_path: str, tags: list[str] | None = None
+     ):
+         """
+         Push a JSONL dataset to Hugging Face Hub.
+
+         Parameters:
+             hf_dataset_repo (str): The repository name in the format 'username/dataset_name'.
+             jsonl_file_path (str): Path to the JSONL file.
+             tags (list[str], optional): List of tags to add to the dataset card.
+
+         Returns:
+             dict: A dictionary containing the status and a message.
+         """
+         try:
+             login(token=self.hf_token)
+
+             # Clean empty question/final_answer fields to avoid empty columns in dataset viewers
+             cleaned_file = self._clean_dataset_for_upload(jsonl_file_path)
+
+             # Upload JSONL file directly using HfApi to avoid schema inference issues
+             # The datasets library tries to unify schemas across rows which fails when
+             # tool arguments have different fields (e.g., different tools have different params)
+             api = HfApi()
+
+             # Create the repo if it doesn't exist (type="dataset" for dataset repos)
+             api.create_repo(
+                 repo_id=hf_dataset_repo,
+                 repo_type="dataset",
+                 exist_ok=True,
+                 token=self.hf_token,
+             )
+
+             # Upload the JSONL file to the data/ directory (standard HF dataset structure)
+             api.upload_file(
+                 path_or_fileobj=cleaned_file,
+                 path_in_repo="data/train.jsonl",
+                 repo_id=hf_dataset_repo,
+                 repo_type="dataset",
+                 token=self.hf_token,
+             )
+
+             # Update dataset card with tags
+             self.update_dataset_card(hf_dataset_repo, tags)
+
+             # Clean up temp file if we created one
+             if cleaned_file != jsonl_file_path:
+                 Path(cleaned_file).unlink(missing_ok=True)
+
+         except RepositoryNotFoundError:
+             return {
+                 "status": "error",
+                 "message": f"Repository '{hf_dataset_repo}' not found. Please check your repository name.",
+             }
+
+         except HfHubHTTPError as e:
+             return {
+                 "status": "error",
+                 "message": f"Hugging Face Hub HTTP Error: {str(e)}",
+             }
+
+         except FileNotFoundError:
+             return {
+                 "status": "error",
+                 "message": f"File '{jsonl_file_path}' not found. Please check your file path.",
+             }
+
+         except Exception as e:
+             # Include the full exception chain for better debugging
+             error_msg = str(e)
+             if hasattr(e, "__cause__") and e.__cause__:
+                 error_msg = f"{error_msg} (caused by: {e.__cause__})"
+             return {
+                 "status": "error",
+                 "message": f"An unexpected error occurred: {error_msg}",
+             }
+
+         else:
+             return {
+                 "status": "success",
+                 "message": f"Dataset pushed successfully to {hf_dataset_repo}.",
+             }
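
For orientation, a minimal usage sketch of HFUploader, based only on the signatures shown in this diff; the token, repository name, and file path are placeholders, not values from the package:

    # Hypothetical usage of HFUploader (placeholder values throughout).
    from deepfabric.hf_hub import HFUploader

    uploader = HFUploader(hf_token="hf_xxx")  # placeholder token
    result = uploader.push_to_hub(
        hf_dataset_repo="your-username/your-dataset",  # placeholder repo id
        jsonl_file_path="dataset.jsonl",               # placeholder path
        tags=["synthetic"],
    )
    print(result["status"], result["message"])

Note that push_to_hub reports most failures through the returned status dict rather than raising, so callers should check result["status"] before proceeding.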
deepfabric/kaggle_hub.py ADDED
@@ -0,0 +1,219 @@
+ import json
+ import os
+ import shutil
+ import tempfile
+
+ from contextlib import contextmanager
+ from pathlib import Path
+
+ import kagglehub
+
+ from .constants import DEFAULT_KAGGLE_TAGS
+
+ # Constants
+ EXPECTED_HANDLE_PARTS = 2
+
+
+ class KaggleUploader:
+     """
+     KaggleUploader is a class for uploading datasets to Kaggle.
+
+     Methods
+     -------
+     __init__(kaggle_username, kaggle_key)
+
+     push_to_hub(dataset_handle, jsonl_file_path, tags=None, version_notes=None)
+
+     Parameters
+     ----------
+     dataset_handle : str
+         The dataset handle in the format 'username/dataset-name'.
+     jsonl_file_path : str
+         Path to the JSONL file.
+     tags : list[str], optional
+         List of tags to add to the dataset.
+     version_notes : str, optional
+         Notes for the dataset version.
+
+     Returns
+     -------
+     dict
+         A dictionary containing the status and a message.
+     """
+
+     def __init__(self, kaggle_username: str | None = None, kaggle_key: str | None = None):
+         """
+         Initialize the uploader with Kaggle authentication credentials.
+
+         Parameters:
+             kaggle_username (str, optional): Kaggle username (can also be set via KAGGLE_USERNAME env var).
+             kaggle_key (str, optional): Kaggle API key (can also be set via KAGGLE_KEY env var).
+         """
+         self.kaggle_username = kaggle_username or os.getenv("KAGGLE_USERNAME")
+         self.kaggle_key = kaggle_key or os.getenv("KAGGLE_KEY")
+
+         if not self.kaggle_username or not self.kaggle_key:
+             raise ValueError(
+                 "Kaggle credentials not provided. "
+                 "Set via constructor params or KAGGLE_USERNAME/KAGGLE_KEY env vars."
+             )
+
+     @contextmanager
+     def _kaggle_credentials(self):
+         """Context manager to temporarily set Kaggle credentials in environment."""
+         # Store original values to restore later
+         original_username = os.environ.get("KAGGLE_USERNAME")
+         original_key = os.environ.get("KAGGLE_KEY")
+
+         try:
+             # Set credentials for kagglehub
+             os.environ["KAGGLE_USERNAME"] = self.kaggle_username  # type: ignore
+             os.environ["KAGGLE_KEY"] = self.kaggle_key  # type: ignore
+             yield
+         finally:
+             # Restore original environment state
+             if original_username is None:
+                 os.environ.pop("KAGGLE_USERNAME", None)
+             else:
+                 os.environ["KAGGLE_USERNAME"] = original_username
+
+             if original_key is None:
+                 os.environ.pop("KAGGLE_KEY", None)
+             else:
+                 os.environ["KAGGLE_KEY"] = original_key
+
+     def create_dataset_metadata(
+         self, dataset_handle: str, tags: list[str] | None = None, description: str | None = None
+     ) -> dict:
+         """
+         Create metadata for the Kaggle dataset.
+
+         Parameters:
+             dataset_handle (str): The dataset handle in the format 'username/dataset-name'.
+             tags (list[str], optional): List of tags for the dataset.
+             description (str, optional): Description for the dataset.
+
+         Returns:
+             dict: Metadata dictionary for the dataset.
+         """
+         # Parse the dataset handle
+         parts = dataset_handle.split("/")
+         if len(parts) != EXPECTED_HANDLE_PARTS:
+             raise ValueError(
+                 f"Invalid dataset handle format: {dataset_handle}. Expected 'username/dataset-name'"
+             )
+
+         username, dataset_name = parts
+
+         # Add default deepfabric tags
+         all_tags = set(DEFAULT_KAGGLE_TAGS)
+         if tags:
+             all_tags.update(tags)
+
+         metadata = {
+             "title": dataset_name.replace("-", " ").title(),
+             "id": f"{username}/{dataset_name}",
+             "licenses": [{"name": "CC0-1.0"}],
+             "tags": list(all_tags),
+         }
+
+         if description:
+             metadata["description"] = description
+         else:
+             metadata["description"] = "Synthetic dataset generated using DeepFabric"
+
+         return metadata
+
+     def _handle_upload_error(self, error: Exception, dataset_handle: str) -> dict | None:
+         """Handle specific upload errors and return appropriate error response."""
+         error_msg = str(error)
+         if "404" in error_msg or "not found" in error_msg.lower():
+             return {
+                 "status": "error",
+                 "message": (
+                     f"Dataset '{dataset_handle}' not found. "
+                     "You may need to create it first on Kaggle.com"
+                 ),
+             }
+         if "401" in error_msg or "unauthorized" in error_msg.lower():
+             return {
+                 "status": "error",
+                 "message": "Authentication failed. Please check your Kaggle credentials.",
+             }
+         if "403" in error_msg or "forbidden" in error_msg.lower():
+             return {
+                 "status": "error",
+                 "message": f"Permission denied. You may not have access to update {dataset_handle}.",
+             }
+         return None
+
+     def push_to_hub(
+         self,
+         dataset_handle: str,
+         jsonl_file_path: str,
+         tags: list[str] | None = None,
+         version_notes: str | None = None,
+         description: str | None = None,
+     ) -> dict[str, str]:
+         """
+         Push a JSONL dataset to Kaggle.
+
+         Parameters:
+             dataset_handle (str): The dataset handle in the format 'username/dataset-name'.
+             jsonl_file_path (str): Path to the JSONL file.
+             tags (list[str], optional): List of tags to add to the dataset.
+             version_notes (str, optional): Notes for the dataset version.
+             description (str, optional): Description for the dataset.
+
+         Returns:
+             dict: A dictionary containing the status and a message.
+         """
+         result = {"status": "error", "message": ""}
+
+         try:
+             # Create a temporary directory for the dataset
+             with tempfile.TemporaryDirectory() as tmpdir:
+                 tmpdir_path = Path(tmpdir)
+
+                 # Copy the JSONL file to the temp directory
+                 dest_file = tmpdir_path / Path(jsonl_file_path).name
+                 shutil.copy2(jsonl_file_path, dest_file)
+
+                 # Create dataset metadata
+                 metadata = self.create_dataset_metadata(dataset_handle, tags, description)
+                 metadata_path = tmpdir_path / "dataset-metadata.json"
+                 with open(metadata_path, "w") as f:
+                     json.dump(metadata, f, indent=2)
+
+                 # Upload the dataset using kagglehub
+                 version_notes = version_notes or "Dataset uploaded via DeepFabric"
+
+                 try:
+                     # Upload the dataset with temporary credentials
+                     with self._kaggle_credentials():
+                         kagglehub.dataset_upload(
+                             handle=dataset_handle,
+                             local_dataset_dir=str(tmpdir_path),
+                             version_notes=version_notes,
+                         )
+
+                 except Exception as upload_error:
+                     # Handle specific Kaggle errors
+                     error_result = self._handle_upload_error(upload_error, dataset_handle)
+                     if error_result:
+                         return error_result
+                     raise
+                 else:
+                     result["status"] = "success"
+                     result["message"] = f"Dataset pushed successfully to Kaggle: {dataset_handle}"
+
+         except FileNotFoundError:
+             result["message"] = f"File '{jsonl_file_path}' not found. Please check your file path."
+
+         except ValueError as e:
+             result["message"] = f"Invalid configuration: {str(e)}"
+
+         except Exception as e:
+             result["message"] = f"An unexpected error occurred: {str(e)}"
+
+         return result
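
Likewise, a hypothetical sketch for KaggleUploader, using only the constructor and push_to_hub signatures shown above; the credentials and dataset handle are placeholders:

    # Hypothetical usage of KaggleUploader (placeholder values throughout).
    from deepfabric.kaggle_hub import KaggleUploader

    uploader = KaggleUploader(
        kaggle_username="your-username",  # or set KAGGLE_USERNAME in the environment
        kaggle_key="xxxx",                # or set KAGGLE_KEY in the environment
    )
    result = uploader.push_to_hub(
        dataset_handle="your-username/your-dataset",
        jsonl_file_path="dataset.jsonl",
        tags=["synthetic"],
        version_notes="Initial upload",
    )
    print(result["status"], result["message"])

The constructor raises ValueError when no credentials can be resolved, while upload failures are reported through the returned status dict.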
deepfabric/llm/__init__.py ADDED
@@ -0,0 +1,41 @@
+ """LLM abstraction layer for DeepFabric."""
+
+ from .api_key_verifier import (
+     VerificationResult,
+     VerificationStatus,
+     verify_all_api_keys,
+     verify_all_api_keys_async,
+     verify_anthropic_api_key,
+     verify_gemini_api_key,
+     verify_ollama_connection,
+     verify_openai_api_key,
+     verify_openrouter_api_key,
+     verify_provider_api_key,
+     verify_provider_api_key_async,
+ )
+ from .client import (
+     PROVIDER_API_KEY_MAP,
+     LLMClient,
+     get_required_api_key_env_var,
+     make_outlines_model,
+     validate_provider_api_key,
+ )
+
+ __all__ = [
+     "LLMClient",
+     "PROVIDER_API_KEY_MAP",
+     "VerificationResult",
+     "VerificationStatus",
+     "get_required_api_key_env_var",
+     "make_outlines_model",
+     "validate_provider_api_key",
+     "verify_all_api_keys",
+     "verify_all_api_keys_async",
+     "verify_anthropic_api_key",
+     "verify_gemini_api_key",
+     "verify_ollama_connection",
+     "verify_openai_api_key",
+     "verify_openrouter_api_key",
+     "verify_provider_api_key",
+     "verify_provider_api_key_async",
+ ]
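
Since this module only re-exports names, consumers import them from deepfabric.llm directly. The call signatures are defined in modules not shown in this hunk, so only the import surface is illustrated here:

    # Import surface of deepfabric.llm, as declared in __all__ above;
    # the signatures of these names live in .client and .api_key_verifier.
    from deepfabric.llm import (
        LLMClient,
        PROVIDER_API_KEY_MAP,
        verify_all_api_keys,
    )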