syntharc-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
syntharc/__init__.py ADDED
@@ -0,0 +1,38 @@
1
+ """syntharc — Unified synthetic data generation.
2
+
3
+ A lightweight Python package for synthetic data generation across
4
+ tabular, time-series, image, and text domains using sample-based
5
+ learning, augmentation, and lightweight generative techniques.
6
+
7
+ Quick Start
8
+ -----------
9
+ >>> from syntharc.core import BaseSynthesizer, set_seed, setup_logging
10
+
11
+ Tabular (requires ``pip install syntharc[tabular]``):
12
+
13
+ >>> from syntharc.tabular import CTGANSynthesizer # doctest: +SKIP
14
+ >>> from syntharc.tabular import GaussianCopulaSynthesizer # doctest: +SKIP
15
+
16
+ Time-series (requires ``pip install syntharc[timeseries]``):
17
+
18
+ >>> from syntharc.timeseries import TimeSeriesSynthesizer # doctest: +SKIP
19
+
20
+ Image (requires ``pip install syntharc[image]``):
21
+
22
+ >>> from syntharc.image import ImageAugmentor # doctest: +SKIP
23
+
24
+ Text (markov/template work out of the box, transformer needs
25
+ ``pip install syntharc[text]``):
26
+
27
+ >>> from syntharc.text import MarkovTextGenerator # doctest: +SKIP
28
+ >>> from syntharc.text import TemplateTextGenerator # doctest: +SKIP
29
+ >>> from syntharc.text import TransformerTextGenerator # doctest: +SKIP
30
+ """
31
+
32
+ from __future__ import annotations
33
+
34
+ __version__ = "0.1.0"
35
+ __author__ = "Fahad Abdullah"
36
+ __email__ = "fahadai.co@gmail.com"
37
+
38
+ __all__ = ["__version__"]
@@ -0,0 +1,14 @@
1
+ """syntharc.core — Core infrastructure for syntharc."""
2
+
3
+ from syntharc.core.base import BaseSynthesizer
4
+ from syntharc.core.config import load_config, validate_config
5
+ from syntharc.core.utils import get_device, set_seed, setup_logging
6
+
7
+ __all__ = [
8
+ "BaseSynthesizer",
9
+ "load_config",
10
+ "validate_config",
11
+ "get_device",
12
+ "set_seed",
13
+ "setup_logging",
14
+ ]
syntharc/core/base.py ADDED
@@ -0,0 +1,282 @@
1
+ """Base synthesizer abstraction for all syntharc modules.
2
+
3
+ Provides the ``BaseSynthesizer`` ABC that every generator inherits from.
4
+ The API is split into three lifecycle methods:
5
+
6
+ * ``prepare()`` — load / preprocess / cache resources (no learning).
7
+ * ``fit()`` — learn / train / estimate parameters from sample data.
8
+ * ``generate()`` — produce *N* synthetic samples (abstract, always required).
9
+
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import logging
15
+ import pickle
16
+ from abc import ABC, abstractmethod
17
+ from pathlib import Path
18
+ from typing import Any
19
+
20
+
21
class BaseSynthesizer(ABC):
    """Abstract base class for all syntharc synthesizers.

    Parameters
    ----------
    config : dict | None
        Optional configuration dictionary for the synthesizer.
        Keys and values are module-specific.
    """

    # Subclasses set this to "fit" or "prepare" to declare which
    # lifecycle method they use; process() reads it to route calls.
    # (A plain comment is used here instead of a bare-string
    # "attribute docstring", which is a no-op statement at runtime.)
    _lifecycle: str = ""

    def __init__(self, config: dict[str, Any] | None = None) -> None:
        self.config: dict[str, Any] = config or {}
        self.is_fitted: bool = False  # set True by fit() implementations
        self.is_prepared: bool = False  # set True by prepare() implementations
        self._logger: logging.Logger = logging.getLogger(
            f"syntharc.{self.__class__.__name__}"
        )

    # Lifecycle methods

    def process(self, data: Any, **kwargs: Any) -> BaseSynthesizer:
        """Primary entry point that routes data to the correct lifecycle method.

        Reads the ``_lifecycle`` class attribute to determine whether to
        call ``fit()`` or ``prepare()``. Each subclass declares its
        lifecycle type explicitly.

        Parameters
        ----------
        data : Any
            Sample data, file paths, corpus, or any input the module needs.
        **kwargs : Any
            Passed through to the resolved lifecycle method.

        Returns
        -------
        BaseSynthesizer
            ``self``, for method chaining.

        Raises
        ------
        NotImplementedError
            If ``_lifecycle`` is not set on the subclass.

        Examples
        --------
        >>> synth = CTGANSynthesizer()
        >>> synth.process(sample_df)  # routes to fit()
        >>> aug = ImageAugmentor()
        >>> aug.process("./images/")  # routes to prepare()
        """
        if self._lifecycle == "fit":
            return self.fit(data, **kwargs)
        if self._lifecycle == "prepare":
            return self.prepare(data, **kwargs)

        raise NotImplementedError(
            f"{self.__class__.__name__} must set _lifecycle to 'fit' or "
            f"'prepare' to use process()."
        )

    def prepare(self, data: Any, **kwargs: Any) -> BaseSynthesizer:
        """Load, preprocess, cache, or set up resources.

        Subclasses override this when no learning occurs — only loading,
        caching, or preprocessing (e.g. ImageAugmentor, TransformerTextGenerator).

        Parameters
        ----------
        data : Any
            Resource to prepare (paths, text, images, etc.).
        **kwargs : Any
            Module-specific preparation options.

        Returns
        -------
        BaseSynthesizer
            ``self``, for method chaining.

        Raises
        ------
        NotImplementedError
            If the subclass does not support ``prepare()``.
        """
        # Default deliberately raises: fit-style modules must not be
        # silently "prepared" without learning anything.
        raise NotImplementedError(
            f"{self.__class__.__name__} does not support prepare(). "
            f"This module uses fit() to learn from sample data."
        )

    def fit(self, data: Any, **kwargs: Any) -> BaseSynthesizer:
        """Learn, train, or estimate parameters from sample data.

        Subclasses override this when genuine learning occurs — training
        neural networks, building transition tables, estimating distributions
        (e.g. CTGANSynthesizer, MarkovTextGenerator, ImageAugmentor).

        Parameters
        ----------
        data : Any
            Sample data to learn from (DataFrame, text, image paths, etc.).
        **kwargs : Any
            Module-specific training options.

        Returns
        -------
        BaseSynthesizer
            ``self``, for method chaining.

        Raises
        ------
        NotImplementedError
            If the subclass does not support ``fit()``.
        """
        # Default deliberately raises: prepare-style modules have no
        # parameters to learn.
        raise NotImplementedError(
            f"{self.__class__.__name__} does not support fit(). "
            f"This module uses prepare() to load and cache resources."
        )

    @abstractmethod
    def generate(
        self,
        num_samples: int,
        instructions: str | None = None,
        **kwargs: Any,
    ) -> Any:
        """Generate *num_samples* synthetic samples.

        This is the only **required** method that every subclass must
        implement.

        Parameters
        ----------
        num_samples : int
            Number of synthetic samples to produce.
        instructions : str | None
            Optional natural-language instructions that guide generation
            (e.g. ``"ensure age > 18"``, ``"formal tone"``).
        **kwargs : Any
            Module-specific generation options.

        Returns
        -------
        Any
            Generated data in module-appropriate format
            (DataFrame, list[str], list[Image], etc.).
        """

    def evaluate(self, real_data: Any, synthetic_data: Any) -> dict[str, Any]:
        """Compare real vs. synthetic data quality.

        Override in subclasses to return domain-specific metrics.
        The default implementation returns an empty dict.

        Parameters
        ----------
        real_data : Any
            Original / reference data.
        synthetic_data : Any
            Data produced by ``generate()``.

        Returns
        -------
        dict[str, Any]
            Evaluation metrics.
        """
        return {}

    # Serialization

    def save(self, path: str | Path) -> None:
        """Serialize the synthesizer state to disk.

        The default implementation uses ``pickle``. Subclasses may
        override this for model-specific serialization (e.g.
        ``torch.save``).

        Parameters
        ----------
        path : str | Path
            Destination file path.

        Raises
        ------
        RuntimeError
            If the synthesizer has not been fitted or prepared.
        """
        self._check_ready()
        path = Path(path)
        # Create parent directories so callers can save to fresh paths.
        path.parent.mkdir(parents=True, exist_ok=True)
        with open(path, "wb") as fh:
            pickle.dump(self, fh)
        self._logger.info("Saved %s to %s", self.__class__.__name__, path)

    @classmethod
    def load(cls, path: str | Path) -> BaseSynthesizer:
        """Load a previously saved synthesizer from disk.

        .. warning::
            This uses ``pickle``, which executes arbitrary code during
            deserialization. Only load files you created yourself or
            otherwise trust — never untrusted input.

        Parameters
        ----------
        path : str | Path
            Path to the saved file.

        Returns
        -------
        BaseSynthesizer
            The restored synthesizer instance.

        Raises
        ------
        FileNotFoundError
            If the file does not exist.
        TypeError
            If the unpickled object is not an instance of *cls*.
        """
        path = Path(path)
        if not path.exists():
            raise FileNotFoundError(f"No saved synthesizer found at {path}")
        # SECURITY NOTE: pickle.load can execute arbitrary code; callers
        # must only load trusted files (see docstring warning).
        with open(path, "rb") as fh:
            instance = pickle.load(fh)
        if not isinstance(instance, cls):
            raise TypeError(
                f"Loaded object is {type(instance).__name__}, " f"expected {cls.__name__}"
            )
        return instance

    # State guards

    def _check_is_fitted(self) -> None:
        """Raise if the synthesizer has not been fitted."""
        if not self.is_fitted:
            raise RuntimeError(
                f"{self.__class__.__name__} must be fitted first. "
                f"Call fit(data) before generating."
            )

    def _check_is_prepared(self) -> None:
        """Raise if the synthesizer has not been prepared."""
        if not self.is_prepared:
            raise RuntimeError(
                f"{self.__class__.__name__} must be prepared first. "
                f"Call prepare(data) before generating."
            )

    def _check_ready(self) -> None:
        """Raise if the synthesizer is neither fitted nor prepared."""
        if not self.is_fitted and not self.is_prepared:
            raise RuntimeError(
                f"{self.__class__.__name__} is not ready. "
                f"Call fit(data) or prepare(data) first."
            )

    def __repr__(self) -> str:
        status = (
            "fitted" if self.is_fitted else "prepared" if self.is_prepared else "not initialized"
        )
        config_str = ", ".join(f"{k}={v!r}" for k, v in self.config.items()) if self.config else ""
        return (
            f"{self.__class__.__name__}("
            f"status={status}"
            f"{', ' + config_str if config_str else ''}"
            f")"
        )
@@ -0,0 +1,99 @@
1
+ """Configuration loading and validation utilities.
2
+
3
+ Provides helpers for reading YAML config files and validating that
4
+ required keys are present before a synthesizer runs.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from pathlib import Path
10
+ from typing import Any
11
+
12
+ import yaml
13
+
14
+
15
def load_config(source: str | Path | dict[str, Any]) -> dict[str, Any]:
    """Load configuration from a YAML file path or pass through a dict.

    Parameters
    ----------
    source : str | Path | dict
        Either a path to a ``.yaml`` / ``.yml`` file, or an existing
        configuration dictionary.

    Returns
    -------
    dict[str, Any]
        Parsed configuration.

    Raises
    ------
    FileNotFoundError
        If *source* is a path that does not exist.
    TypeError
        If *source* is neither a path-like nor a dict.
    ValueError
        If the YAML file does not parse to a dictionary.

    Examples
    --------
    >>> cfg = load_config({"epochs": 50, "batch_size": 64})
    >>> cfg["epochs"]
    50

    >>> cfg = load_config("config.yaml")  # doctest: +SKIP
    """
    if isinstance(source, dict):
        # A dict needs no parsing — hand it straight back.
        return source

    # Anything else is interpreted as a filesystem path.
    cfg_path = Path(source)
    if not cfg_path.exists():
        raise FileNotFoundError(f"Config file not found: {cfg_path}")
    if cfg_path.suffix.lower() not in {".yaml", ".yml"}:
        raise ValueError(f"Config file must be .yaml or .yml, got: {cfg_path.suffix!r}")

    with open(cfg_path, encoding="utf-8") as stream:
        parsed = yaml.safe_load(stream)

    if isinstance(parsed, dict):
        return parsed
    raise ValueError(
        f"Expected YAML file to contain a mapping (dict), " f"got {type(parsed).__name__}"
    )
65
+
66
+
67
def validate_config(
    config: dict[str, Any],
    required_keys: list[str],
    context: str = "",
) -> None:
    """Validate that all *required_keys* are present in *config*.

    Parameters
    ----------
    config : dict[str, Any]
        Configuration dictionary to validate.
    required_keys : list[str]
        Keys that **must** be present.
    context : str
        Optional label (e.g. class name) for clearer error messages.

    Raises
    ------
    ValueError
        If any required key is missing.

    Examples
    --------
    >>> validate_config({"a": 1, "b": 2}, ["a", "b"])
    >>> validate_config({"a": 1}, ["a", "b"], context="MyModule")
    Traceback (most recent call last):
        ...
    ValueError: MyModule config missing required keys: {'b'}
    """
    missing = {key for key in required_keys if key not in config}
    if not missing:
        return
    prefix = f"{context} config" if context else "Config"
    raise ValueError(f"{prefix} missing required keys: {missing}")
syntharc/core/utils.py ADDED
@@ -0,0 +1,95 @@
1
+ """Shared utility functions for the syntharc package.
2
+
3
+ Provides device detection, reproducibility helpers, and logging setup.
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ import logging
9
+ import random
10
+
11
+ import numpy as np
12
+ import torch
13
+
14
+
15
def get_device() -> torch.device:
    """Auto-detect the best available compute device.

    Priority: CUDA → MPS (Apple Silicon) → CPU.

    Returns
    -------
    torch.device
        The selected device.
    """
    if torch.cuda.is_available():
        return torch.device("cuda")
    # Older torch builds lack the mps backend entirely; probe defensively.
    mps_backend = getattr(torch.backends, "mps", None)
    if mps_backend is not None and mps_backend.is_available():
        return torch.device("mps")
    return torch.device("cpu")
30
+
31
+
32
def set_seed(seed: int) -> None:
    """Set random seeds for reproducibility across all relevant libraries.

    Sets seeds for Python's ``random``, NumPy, and PyTorch.

    Parameters
    ----------
    seed : int
        The seed value. Must be a non-negative integer.

    Raises
    ------
    ValueError
        If *seed* is negative.
    """
    if seed < 0:
        raise ValueError(f"Seed must be non-negative, got {seed}")

    # Seed every RNG the package may draw from.
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
55
+
56
+
57
+ def setup_logging(level: str = "INFO") -> None:
58
+ """Configure rich-formatted logging for syntharc.
59
+
60
+ Uses the ``rich`` library for coloured, structured log output.
61
+ Falls back to basic ``logging`` config if ``rich`` is unavailable
62
+ (shouldn't happen since it's a core dependency).
63
+
64
+ Parameters
65
+ ----------
66
+ level : str
67
+ Logging level name (``"DEBUG"``, ``"INFO"``, ``"WARNING"``, etc.).
68
+
69
+ Raises
70
+ ------
71
+ ValueError
72
+ If *level* is not a valid logging level name.
73
+ """
74
+ numeric_level = getattr(logging, level.upper(), None)
75
+ if not isinstance(numeric_level, int):
76
+ raise ValueError(f"Invalid log level: {level!r}")
77
+
78
+ try:
79
+ from rich.logging import RichHandler
80
+
81
+ logging.basicConfig(
82
+ level=numeric_level,
83
+ format="%(message)s",
84
+ datefmt="[%X]",
85
+ handlers=[RichHandler(rich_tracebacks=True, markup=True)],
86
+ force=True,
87
+ )
88
+ except ImportError:
89
+ logging.basicConfig(
90
+ level=numeric_level,
91
+ format="%(asctime)s | %(name)s | %(levelname)s | %(message)s",
92
+ force=True,
93
+ )
94
+
95
+ logging.getLogger("syntharc").setLevel(numeric_level)
@@ -0,0 +1,7 @@
1
+ """syntharc.image — Image synthetic data generation."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from syntharc.image.augmentor import ImageAugmentor
6
+
7
+ __all__ = ["ImageAugmentor"]