syntharc-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
syntharc/__init__.py ADDED
@@ -0,0 +1,38 @@
1
+ """syntharc — Unified synthetic data generation.
2
+
3
+ A lightweight Python package for synthetic data generation across
4
+ tabular, time-series, image, and text domains using sample-based
5
+ learning, augmentation, and lightweight generative techniques.
6
+
7
+ Quick Start
8
+ -----------
9
+ >>> from syntharc.core import BaseSynthesizer, set_seed, setup_logging
10
+
11
+ Tabular (requires ``pip install syntharc[tabular]``):
12
+
13
+ >>> from syntharc.tabular import CTGANSynthesizer # doctest: +SKIP
14
+ >>> from syntharc.tabular import GaussianCopulaSynthesizer # doctest: +SKIP
15
+
16
+ Time-series (requires ``pip install syntharc[timeseries]``):
17
+
18
+ >>> from syntharc.timeseries import TimeSeriesSynthesizer # doctest: +SKIP
19
+
20
+ Image (requires ``pip install syntharc[image]``):
21
+
22
+ >>> from syntharc.image import ImageAugmentor # doctest: +SKIP
23
+
24
+ Text (markov/template work out of the box, transformer needs
25
+ ``pip install syntharc[text]``):
26
+
27
+ >>> from syntharc.text import MarkovTextGenerator # doctest: +SKIP
28
+ >>> from syntharc.text import TemplateTextGenerator # doctest: +SKIP
29
+ >>> from syntharc.text import TransformerTextGenerator # doctest: +SKIP
30
+ """
31
+
32
+ from __future__ import annotations
33
+
34
+ __version__ = "0.1.0"
35
+ __author__ = "Fahad Abdullah"
36
+ __email__ = "fahadai.co@gmail.com"
37
+
38
+ __all__ = ["__version__"]
@@ -0,0 +1,14 @@
1
+ """syntharc.core — Core infrastructure for syntharc."""
2
+
3
+ from syntharc.core.base import BaseSynthesizer
4
+ from syntharc.core.config import load_config, validate_config
5
+ from syntharc.core.utils import get_device, set_seed, setup_logging
6
+
7
+ __all__ = [
8
+ "BaseSynthesizer",
9
+ "load_config",
10
+ "validate_config",
11
+ "get_device",
12
+ "set_seed",
13
+ "setup_logging",
14
+ ]
syntharc/core/base.py ADDED
@@ -0,0 +1,282 @@
1
+ """Base synthesizer abstraction for all syntharc modules.
2
+
3
+ Provides the ``BaseSynthesizer`` ABC that every generator inherits from.
4
+ The API is split into three lifecycle methods:
5
+
6
+ * ``prepare()`` — load / preprocess / cache resources (no learning).
7
+ * ``fit()`` — learn / train / estimate parameters from sample data.
8
+ * ``generate()`` — produce *N* synthetic samples (abstract, always required).
9
+
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import logging
15
+ import pickle
16
+ from abc import ABC, abstractmethod
17
+ from pathlib import Path
18
+ from typing import Any
19
+
20
+
21
class BaseSynthesizer(ABC):
    """Abstract base class for all syntharc synthesizers.

    Parameters
    ----------
    config : dict | None
        Optional configuration dictionary for the synthesizer.
        Keys and values are module-specific.
    """

    # Subclasses set this to "fit" or "prepare" to declare which
    # lifecycle method they use; process() reads it to route calls.
    # (A plain comment is used here instead of a bare-string
    # "attribute docstring", which is a no-op statement at runtime.)
    _lifecycle: str = ""

    def __init__(self, config: dict[str, Any] | None = None) -> None:
        self.config: dict[str, Any] = config or {}
        self.is_fitted: bool = False  # set True by fit() implementations
        self.is_prepared: bool = False  # set True by prepare() implementations
        self._logger: logging.Logger = logging.getLogger(
            f"syntharc.{self.__class__.__name__}"
        )

    # Lifecycle methods

    def process(self, data: Any, **kwargs: Any) -> BaseSynthesizer:
        """Primary entry point that routes data to the correct lifecycle method.

        Reads the ``_lifecycle`` class attribute to determine whether to
        call ``fit()`` or ``prepare()``. Each subclass declares its
        lifecycle type explicitly.

        Parameters
        ----------
        data : Any
            Sample data, file paths, corpus, or any input the module needs.
        **kwargs : Any
            Passed through to the resolved lifecycle method.

        Returns
        -------
        BaseSynthesizer
            ``self``, for method chaining.

        Raises
        ------
        NotImplementedError
            If ``_lifecycle`` is not set on the subclass.

        Examples
        --------
        >>> synth = CTGANSynthesizer()
        >>> synth.process(sample_df)  # routes to fit()
        >>> aug = ImageAugmentor()
        >>> aug.process("./images/")  # routes to prepare()
        """
        if self._lifecycle == "fit":
            return self.fit(data, **kwargs)
        if self._lifecycle == "prepare":
            return self.prepare(data, **kwargs)

        raise NotImplementedError(
            f"{self.__class__.__name__} must set _lifecycle to 'fit' or "
            f"'prepare' to use process()."
        )

    def prepare(self, data: Any, **kwargs: Any) -> BaseSynthesizer:
        """Load, preprocess, cache, or set up resources.

        Subclasses override this when no learning occurs — only loading,
        caching, or preprocessing (e.g. ImageAugmentor, TransformerTextGenerator).

        Parameters
        ----------
        data : Any
            Resource to prepare (paths, text, images, etc.).
        **kwargs : Any
            Module-specific preparation options.

        Returns
        -------
        BaseSynthesizer
            ``self``, for method chaining.

        Raises
        ------
        NotImplementedError
            If the subclass does not support ``prepare()``.
        """
        # Default deliberately raises: fit-style modules must not be
        # silently "prepared" without learning anything.
        raise NotImplementedError(
            f"{self.__class__.__name__} does not support prepare(). "
            f"This module uses fit() to learn from sample data."
        )

    def fit(self, data: Any, **kwargs: Any) -> BaseSynthesizer:
        """Learn, train, or estimate parameters from sample data.

        Subclasses override this when genuine learning occurs — training
        neural networks, building transition tables, estimating distributions
        (e.g. CTGANSynthesizer, MarkovTextGenerator, ImageAugmentor).

        Parameters
        ----------
        data : Any
            Sample data to learn from (DataFrame, text, image paths, etc.).
        **kwargs : Any
            Module-specific training options.

        Returns
        -------
        BaseSynthesizer
            ``self``, for method chaining.

        Raises
        ------
        NotImplementedError
            If the subclass does not support ``fit()``.
        """
        # Default deliberately raises: prepare-style modules have no
        # parameters to learn.
        raise NotImplementedError(
            f"{self.__class__.__name__} does not support fit(). "
            f"This module uses prepare() to load and cache resources."
        )

    @abstractmethod
    def generate(
        self,
        num_samples: int,
        instructions: str | None = None,
        **kwargs: Any,
    ) -> Any:
        """Generate *num_samples* synthetic samples.

        This is the only **required** method that every subclass must
        implement.

        Parameters
        ----------
        num_samples : int
            Number of synthetic samples to produce.
        instructions : str | None
            Optional natural-language instructions that guide generation
            (e.g. ``"ensure age > 18"``, ``"formal tone"``).
        **kwargs : Any
            Module-specific generation options.

        Returns
        -------
        Any
            Generated data in module-appropriate format
            (DataFrame, list[str], list[Image], etc.).
        """

    def evaluate(self, real_data: Any, synthetic_data: Any) -> dict[str, Any]:
        """Compare real vs. synthetic data quality.

        Override in subclasses to return domain-specific metrics.
        The default implementation returns an empty dict.

        Parameters
        ----------
        real_data : Any
            Original / reference data.
        synthetic_data : Any
            Data produced by ``generate()``.

        Returns
        -------
        dict[str, Any]
            Evaluation metrics.
        """
        return {}

    # Serialization

    def save(self, path: str | Path) -> None:
        """Serialize the synthesizer state to disk.

        The default implementation uses ``pickle``. Subclasses may
        override this for model-specific serialization (e.g.
        ``torch.save``).

        Parameters
        ----------
        path : str | Path
            Destination file path.

        Raises
        ------
        RuntimeError
            If the synthesizer has not been fitted or prepared.
        """
        self._check_ready()
        path = Path(path)
        # Create parent directories so callers can save to fresh paths.
        path.parent.mkdir(parents=True, exist_ok=True)
        with open(path, "wb") as fh:
            pickle.dump(self, fh)
        self._logger.info("Saved %s to %s", self.__class__.__name__, path)

    @classmethod
    def load(cls, path: str | Path) -> BaseSynthesizer:
        """Load a previously saved synthesizer from disk.

        .. warning::
            This uses ``pickle``, which executes arbitrary code during
            deserialization. Only load files you created yourself or
            otherwise trust — never untrusted input.

        Parameters
        ----------
        path : str | Path
            Path to the saved file.

        Returns
        -------
        BaseSynthesizer
            The restored synthesizer instance.

        Raises
        ------
        FileNotFoundError
            If the file does not exist.
        TypeError
            If the unpickled object is not an instance of *cls*.
        """
        path = Path(path)
        if not path.exists():
            raise FileNotFoundError(f"No saved synthesizer found at {path}")
        # SECURITY NOTE: pickle.load can execute arbitrary code; callers
        # must only load trusted files (see docstring warning).
        with open(path, "rb") as fh:
            instance = pickle.load(fh)
        if not isinstance(instance, cls):
            raise TypeError(
                f"Loaded object is {type(instance).__name__}, " f"expected {cls.__name__}"
            )
        return instance

    # State guards

    def _check_is_fitted(self) -> None:
        """Raise if the synthesizer has not been fitted."""
        if not self.is_fitted:
            raise RuntimeError(
                f"{self.__class__.__name__} must be fitted first. "
                f"Call fit(data) before generating."
            )

    def _check_is_prepared(self) -> None:
        """Raise if the synthesizer has not been prepared."""
        if not self.is_prepared:
            raise RuntimeError(
                f"{self.__class__.__name__} must be prepared first. "
                f"Call prepare(data) before generating."
            )

    def _check_ready(self) -> None:
        """Raise if the synthesizer is neither fitted nor prepared."""
        if not self.is_fitted and not self.is_prepared:
            raise RuntimeError(
                f"{self.__class__.__name__} is not ready. "
                f"Call fit(data) or prepare(data) first."
            )

    def __repr__(self) -> str:
        status = (
            "fitted" if self.is_fitted else "prepared" if self.is_prepared else "not initialized"
        )
        config_str = ", ".join(f"{k}={v!r}" for k, v in self.config.items()) if self.config else ""
        return (
            f"{self.__class__.__name__}("
            f"status={status}"
            f"{', ' + config_str if config_str else ''}"
            f")"
        )
@@ -0,0 +1,99 @@
1
+ """Configuration loading and validation utilities.
2
+
3
+ Provides helpers for reading YAML config files and validating that
4
+ required keys are present before a synthesizer runs.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from pathlib import Path
10
+ from typing import Any
11
+
12
+ import yaml
13
+
14
+
15
def load_config(source: str | Path | dict[str, Any]) -> dict[str, Any]:
    """Load configuration from a YAML file path or pass through a dict.

    Parameters
    ----------
    source : str | Path | dict
        Either a path to a ``.yaml`` / ``.yml`` file, or an existing
        configuration dictionary.

    Returns
    -------
    dict[str, Any]
        Parsed configuration.

    Raises
    ------
    FileNotFoundError
        If *source* is a path that does not exist.
    TypeError
        If *source* is neither a path-like nor a dict.
    ValueError
        If the YAML file does not parse to a dictionary.

    Examples
    --------
    >>> cfg = load_config({"epochs": 50, "batch_size": 64})
    >>> cfg["epochs"]
    50

    >>> cfg = load_config("config.yaml")  # doctest: +SKIP
    """
    if isinstance(source, dict):
        # A dict needs no parsing — hand it straight back.
        return source

    # Anything else is interpreted as a filesystem path.
    cfg_path = Path(source)
    if not cfg_path.exists():
        raise FileNotFoundError(f"Config file not found: {cfg_path}")
    if cfg_path.suffix.lower() not in {".yaml", ".yml"}:
        raise ValueError(f"Config file must be .yaml or .yml, got: {cfg_path.suffix!r}")

    with open(cfg_path, encoding="utf-8") as stream:
        parsed = yaml.safe_load(stream)

    if isinstance(parsed, dict):
        return parsed
    raise ValueError(
        f"Expected YAML file to contain a mapping (dict), " f"got {type(parsed).__name__}"
    )
65
+
66
+
67
def validate_config(
    config: dict[str, Any],
    required_keys: list[str],
    context: str = "",
) -> None:
    """Validate that all *required_keys* are present in *config*.

    Parameters
    ----------
    config : dict[str, Any]
        Configuration dictionary to validate.
    required_keys : list[str]
        Keys that **must** be present.
    context : str
        Optional label (e.g. class name) for clearer error messages.

    Raises
    ------
    ValueError
        If any required key is missing.

    Examples
    --------
    >>> validate_config({"a": 1, "b": 2}, ["a", "b"])
    >>> validate_config({"a": 1}, ["a", "b"], context="MyModule")
    Traceback (most recent call last):
        ...
    ValueError: MyModule config missing required keys: {'b'}
    """
    missing = {key for key in required_keys if key not in config}
    if not missing:
        return
    prefix = f"{context} config" if context else "Config"
    raise ValueError(f"{prefix} missing required keys: {missing}")
syntharc/core/utils.py ADDED
@@ -0,0 +1,95 @@
1
+ """Shared utility functions for the syntharc package.
2
+
3
+ Provides device detection, reproducibility helpers, and logging setup.
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ import logging
9
+ import random
10
+
11
+ import numpy as np
12
+ import torch
13
+
14
+
15
def get_device() -> torch.device:
    """Auto-detect the best available compute device.

    Priority: CUDA → MPS (Apple Silicon) → CPU.

    Returns
    -------
    torch.device
        The selected device.
    """
    if torch.cuda.is_available():
        return torch.device("cuda")
    # Older torch builds lack the mps backend entirely; probe defensively.
    mps_backend = getattr(torch.backends, "mps", None)
    if mps_backend is not None and mps_backend.is_available():
        return torch.device("mps")
    return torch.device("cpu")
30
+
31
+
32
def set_seed(seed: int) -> None:
    """Set random seeds for reproducibility across all relevant libraries.

    Sets seeds for Python's ``random``, NumPy, and PyTorch.

    Parameters
    ----------
    seed : int
        The seed value. Must be a non-negative integer.

    Raises
    ------
    ValueError
        If *seed* is negative.
    """
    if seed < 0:
        raise ValueError(f"Seed must be non-negative, got {seed}")

    # Seed every RNG the package may draw from.
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
55
+
56
+
57
+ def setup_logging(level: str = "INFO") -> None:
58
+ """Configure rich-formatted logging for syntharc.
59
+
60
+ Uses the ``rich`` library for coloured, structured log output.
61
+ Falls back to basic ``logging`` config if ``rich`` is unavailable
62
+ (shouldn't happen since it's a core dependency).
63
+
64
+ Parameters
65
+ ----------
66
+ level : str
67
+ Logging level name (``"DEBUG"``, ``"INFO"``, ``"WARNING"``, etc.).
68
+
69
+ Raises
70
+ ------
71
+ ValueError
72
+ If *level* is not a valid logging level name.
73
+ """
74
+ numeric_level = getattr(logging, level.upper(), None)
75
+ if not isinstance(numeric_level, int):
76
+ raise ValueError(f"Invalid log level: {level!r}")
77
+
78
+ try:
79
+ from rich.logging import RichHandler
80
+
81
+ logging.basicConfig(
82
+ level=numeric_level,
83
+ format="%(message)s",
84
+ datefmt="[%X]",
85
+ handlers=[RichHandler(rich_tracebacks=True, markup=True)],
86
+ force=True,
87
+ )
88
+ except ImportError:
89
+ logging.basicConfig(
90
+ level=numeric_level,
91
+ format="%(asctime)s | %(name)s | %(levelname)s | %(message)s",
92
+ force=True,
93
+ )
94
+
95
+ logging.getLogger("syntharc").setLevel(numeric_level)
@@ -0,0 +1,7 @@
1
+ """syntharc.image — Image synthetic data generation."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from syntharc.image.augmentor import ImageAugmentor
6
+
7
+ __all__ = ["ImageAugmentor"]