syntharc 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34) hide show
  1. syntharc-0.1.0/LICENSE +21 -0
  2. syntharc-0.1.0/PKG-INFO +243 -0
  3. syntharc-0.1.0/README.md +162 -0
  4. syntharc-0.1.0/pyproject.toml +122 -0
  5. syntharc-0.1.0/setup.cfg +4 -0
  6. syntharc-0.1.0/src/syntharc/__init__.py +38 -0
  7. syntharc-0.1.0/src/syntharc/core/__init__.py +14 -0
  8. syntharc-0.1.0/src/syntharc/core/base.py +282 -0
  9. syntharc-0.1.0/src/syntharc/core/config.py +99 -0
  10. syntharc-0.1.0/src/syntharc/core/utils.py +95 -0
  11. syntharc-0.1.0/src/syntharc/image/__init__.py +7 -0
  12. syntharc-0.1.0/src/syntharc/image/augmentor.py +315 -0
  13. syntharc-0.1.0/src/syntharc/image/evaluation.py +144 -0
  14. syntharc-0.1.0/src/syntharc/image/utils.py +15 -0
  15. syntharc-0.1.0/src/syntharc/tabular/__init__.py +19 -0
  16. syntharc-0.1.0/src/syntharc/tabular/ctgan.py +195 -0
  17. syntharc-0.1.0/src/syntharc/tabular/evaluation.py +187 -0
  18. syntharc-0.1.0/src/syntharc/tabular/gaussian_copula.py +182 -0
  19. syntharc-0.1.0/src/syntharc/tabular/utils.py +129 -0
  20. syntharc-0.1.0/src/syntharc/text/__init__.py +22 -0
  21. syntharc-0.1.0/src/syntharc/text/evaluation.py +100 -0
  22. syntharc-0.1.0/src/syntharc/text/markov.py +203 -0
  23. syntharc-0.1.0/src/syntharc/text/template.py +243 -0
  24. syntharc-0.1.0/src/syntharc/text/transformer.py +206 -0
  25. syntharc-0.1.0/src/syntharc/text/utils.py +14 -0
  26. syntharc-0.1.0/src/syntharc/timeseries/__init__.py +15 -0
  27. syntharc-0.1.0/src/syntharc/timeseries/evaluation.py +140 -0
  28. syntharc-0.1.0/src/syntharc/timeseries/par.py +224 -0
  29. syntharc-0.1.0/src/syntharc/timeseries/utils.py +97 -0
  30. syntharc-0.1.0/src/syntharc.egg-info/PKG-INFO +243 -0
  31. syntharc-0.1.0/src/syntharc.egg-info/SOURCES.txt +32 -0
  32. syntharc-0.1.0/src/syntharc.egg-info/dependency_links.txt +1 -0
  33. syntharc-0.1.0/src/syntharc.egg-info/requires.txt +39 -0
  34. syntharc-0.1.0/src/syntharc.egg-info/top_level.txt +1 -0
syntharc-0.1.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Fahad Abdullah
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,243 @@
1
+ Metadata-Version: 2.4
2
+ Name: syntharc
3
+ Version: 0.1.0
4
+ Summary: Unified synthetic data generation for tabular, time-series, image, and text data
5
+ Author-email: Fahad Abdullah <fahadai.co@gmail.com>
6
+ License: MIT License
7
+
8
+ Copyright (c) 2026 Fahad Abdullah
9
+
10
+ Permission is hereby granted, free of charge, to any person obtaining a copy
11
+ of this software and associated documentation files (the "Software"), to deal
12
+ in the Software without restriction, including without limitation the rights
13
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14
+ copies of the Software, and to permit persons to whom the Software is
15
+ furnished to do so, subject to the following conditions:
16
+
17
+ The above copyright notice and this permission notice shall be included in all
18
+ copies or substantial portions of the Software.
19
+
20
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
23
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
26
+ SOFTWARE.
27
+
28
+ Project-URL: Homepage, https://github.com/FAbdullah17/syntharc
29
+ Project-URL: Repository, https://github.com/FAbdullah17/syntharc
30
+ Project-URL: Issues, https://github.com/FAbdullah17/syntharc/issues
31
+ Project-URL: Documentation, https://github.com/FAbdullah17/syntharc/tree/main/docs
32
+ Keywords: synthetic-data,data-generation,augmentation,tabular,time-series,image,text
33
+ Classifier: Development Status :: 3 - Alpha
34
+ Classifier: Intended Audience :: Developers
35
+ Classifier: Intended Audience :: Science/Research
36
+ Classifier: License :: OSI Approved :: MIT License
37
+ Classifier: Operating System :: OS Independent
38
+ Classifier: Programming Language :: Python :: 3
39
+ Classifier: Programming Language :: Python :: 3.10
40
+ Classifier: Programming Language :: Python :: 3.11
41
+ Classifier: Programming Language :: Python :: 3.12
42
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
43
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
44
+ Requires-Python: >=3.10
45
+ Description-Content-Type: text/markdown
46
+ License-File: LICENSE
47
+ Requires-Dist: pandas>=1.5
48
+ Requires-Dist: numpy>=1.23
49
+ Requires-Dist: scikit-learn>=1.2
50
+ Requires-Dist: matplotlib>=3.6
51
+ Requires-Dist: Pillow>=9.0
52
+ Requires-Dist: pyyaml>=6.0
53
+ Requires-Dist: rich>=13.0
54
+ Requires-Dist: torch>=2.0
55
+ Provides-Extra: tabular
56
+ Requires-Dist: sdv>=1.10; extra == "tabular"
57
+ Provides-Extra: timeseries
58
+ Requires-Dist: sdv>=1.10; extra == "timeseries"
59
+ Provides-Extra: image
60
+ Requires-Dist: albumentations>=1.4; extra == "image"
61
+ Requires-Dist: opencv-python-headless>=4.9; extra == "image"
62
+ Provides-Extra: text
63
+ Requires-Dist: transformers>=4.40; extra == "text"
64
+ Requires-Dist: accelerate>=0.22; extra == "text"
65
+ Provides-Extra: dev
66
+ Requires-Dist: pytest>=8.0; extra == "dev"
67
+ Requires-Dist: pytest-cov>=5.0; extra == "dev"
68
+ Requires-Dist: ruff>=0.4; extra == "dev"
69
+ Requires-Dist: mypy>=1.10; extra == "dev"
70
+ Requires-Dist: pre-commit>=3.7; extra == "dev"
71
+ Requires-Dist: pandas-stubs; extra == "dev"
72
+ Requires-Dist: types-PyYAML; extra == "dev"
73
+ Requires-Dist: types-Pillow; extra == "dev"
74
+ Provides-Extra: all
75
+ Requires-Dist: syntharc[tabular]; extra == "all"
76
+ Requires-Dist: syntharc[timeseries]; extra == "all"
77
+ Requires-Dist: syntharc[image]; extra == "all"
78
+ Requires-Dist: syntharc[text]; extra == "all"
79
+ Requires-Dist: syntharc[dev]; extra == "all"
80
+ Dynamic: license-file
81
+
82
+ # syntharc: The Grand Unified Synthetic Data Generator
83
+
84
+ [![CI](https://github.com/FAbdullah17/syntharc/actions/workflows/ci.yml/badge.svg)](https://github.com/FAbdullah17/syntharc/actions/workflows/ci.yml)
85
+ [![PyPI version](https://badge.fury.io/py/syntharc.svg)](https://badge.fury.io/py/syntharc)
86
+ [![Python Versions](https://img.shields.io/pypi/pyversions/syntharc.svg)](https://pypi.org/project/syntharc/)
87
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
88
+
89
+ **`syntharc`** is a powerful, flexible, and unified Python library for generating high-quality synthetic data across multiple domains. In the modern machine learning ecosystem, acquiring high-quality, privacy-compliant, and diverse datasets is often the biggest bottleneck. `syntharc` solves this by abstracting the complexity of various generative algorithms into a single, intuitive framework.
90
+
91
+ Whether you need to anonymize sensitive tabular records, augment image datasets for computer vision, simulate realistic text sequences, or replicate complex financial time-series signals, `syntharc` provides a unified API to do it all using state-of-the-art backend engines like PyTorch, Hugging Face Transformers, Albumentations, and the Synthetic Data Vault (SDV).
92
+
93
+ ---
94
+
95
+ ## 🧠 How It Works: The Unified Lifecycle
96
+
97
+ One of the biggest hurdles in synthetic data generation is the fragmented tooling across different data types. `syntharc` forces all underlying models into a highly predictable **3-step lifecycle** inherited from our `BaseSynthesizer` architecture:
98
+
99
+ 1. **`prepare(**kwargs)`**: Configures the internal environment. This is where you define structural metadata, column types, sequence keys, or image transformation pipelines to guide the generation.
100
+ 2. **`fit(data)`**: Feeds your actual, real-world dataset into the generative model so it can learn patterns, probabilistic distributions, and internal representations.
101
+ 3. **`generate(**kwargs)`**: Samples from the trained model to yield your brand new synthetic dataset—retaining the statistical properties of the original without exposing real user information.
102
+
103
+ ---
104
+
105
+ ## 🚀 Key Features
106
+
107
+ * 📊 **Tabular Data:** Synthesize fully relational datasets using Deep Learning (`CTGANSynthesizer`) or statistical modeling (`GaussianCopulaSynthesizer`).
108
+ * 📝 **Text Generation:** Leverage the power of LLMs (`TransformerTextGenerator`), classic statistical chains (`MarkovTextGenerator`), or strict rule-based templates (`TemplateTextGenerator`).
109
+ * 🖼️ **Image Augmentation:** Use `ImageAugmentor` to rapidly iterate through spatial and pixel-level augmentations, dynamically expanding computer vision datasets.
110
+ * 📈 **Time-Series:** Employ the `PARSynthesizer` (Probabilistic AutoRegressive model) to safely synthesize robust sequence data over time.
111
+ * ✅ **Evaluations:** Built-in tools for evaluating the quality, fidelity, and privacy metrics of your generated data against your source data.
112
+
113
+ ---
114
+
115
+ ## 📦 Installation
116
+
117
+ Install `syntharc` via pip. The base package provides the core infrastructure. We highly recommend installing the specific domain dependencies you intend to use to keep your environment lean:
118
+
119
+ ```bash
120
+ # Install everything (Recommended for the full experience)
121
+ pip install "syntharc[all]"
122
+
123
+ # Or individually pick your domains:
124
+ pip install "syntharc[tabular]"
125
+ pip install "syntharc[text]"
126
+ pip install "syntharc[image]"
127
+ pip install "syntharc[timeseries]"
128
+
129
+ # For contributors and local development:
130
+ pip install "syntharc[dev]"
131
+ ```
132
+
133
+ ---
134
+
135
+ ## 💡 Comprehensive Quick Start
136
+
137
+ Below are examples of how our unified framework elegantly handles vastly different data constraints.
138
+
139
+ ### 1. Privacy-Preserving Tabular Data
140
+ Train a CTGAN model to learn the distribution of your customer data without saving real identities.
141
+ ```python
142
+ import pandas as pd
143
+ from syntharc.tabular.ctgan import CTGANSynthesizer
144
+
145
+ real_data = pd.read_csv("customer_data.csv")
146
+
147
+ # 1. Initialize & Prepare metadata
148
+ synth = CTGANSynthesizer(epochs=50)
149
+ synth.prepare(metadata_dict={"primary_key": "user_id"})
150
+
151
+ # 2. Fit to real data
152
+ synth.fit(real_data)
153
+
154
+ # 3. Generate 1,000 synthetic rows!
155
+ synthetic_data = synth.generate(num_rows=1000)
156
+ print(synthetic_data.head())
157
+ ```
158
+
159
+ ### 2. Deep Time-Series Generation
160
+ Synthesize sequential metrics, like stock prices or IoT sensor readings, using AutoRegressive modeling.
161
+ ```python
162
+ from syntharc.timeseries.par import PARSynthesizer
163
+
164
+ # 1. Initialize & Prepare
165
+ synth = PARSynthesizer(epochs=25)
166
+ # Define what makes a "sequence" (e.g., separate tracking per 'device_id')
167
+ synth.prepare(metadata_dict={
168
+ "sequence_key": "device_id",
169
+ "context_columns": ["region"]
170
+ })
171
+
172
+ # 2. Fit
173
+ synth.fit(sensor_dataframe)
174
+
175
+ # 3. Generate sequential data
176
+ synthetic_series = synth.generate(num_sequences=50)
177
+ ```
178
+
179
+ ### 3. Causal Text Generation (LLMs)
180
+ Easily utilize Hugging Face causal language models.
181
+ ```python
182
+ from syntharc.text.transformer import TransformerTextGenerator
183
+
184
+ # 1. Initialize (Downloads SmolLM2-360M-Instruct by default)
185
+ synth = TransformerTextGenerator()
186
+
187
+ # 2. Prepare with an optional style context
188
+ synth.prepare(corpus="The future of synthetic data is bright and highly scalable.")
189
+
190
+ # 3. Generate structured text from instructions
191
+ text_output = synth.generate(num_samples=1, instructions="Write a short summary about data.")
192
+ print(text_output[0])
193
+ ```
194
+
195
+ ### 4. High-Speed Image Augmentation
196
+ Prepare a pipeline of augmentations to expand your Machine Learning dataset effortlessly.
197
+ ```python
198
+ from syntharc.image.augmentor import ImageAugmentor
199
+ import cv2
200
+
201
+ # 1. Initialize rules
202
+ augmentor = ImageAugmentor()
203
+ # The config dictionary maps option names to settings for the Albumentations backend
204
+ augmentor.prepare(config={
205
+ 'resize': (256, 256),
206
+ 'horizontal_flip': 0.5,
207
+ 'brightness_contrast': 0.2
208
+ })
209
+
210
+ # 2. Load Real Image
211
+ image = cv2.imread('dataset/cat.jpg')
212
+
213
+ # 3. Generate augmentations
214
+ # The generator seamlessly wraps Albumentations under the hood
215
+ augmented_image = augmentor.generate(num_samples=1, data=[image])
216
+ print(f"Augmented Shape: {augmented_image[0].shape}")
217
+ ```
218
+
219
+ ---
220
+
221
+ ## 📖 Documentation
222
+
223
+ For comprehensive guides, parameter references, evaluations, and interactive Jupyter showcase notebooks, please check out the `docs/` folder:
224
+ - [Getting Started](docs/getting_started.md)
225
+ - [Tabular Generation Guide](docs/tabular.md)
226
+ - [Time-Series Generation Guide](docs/timeseries.md)
227
+ - [Text Generation Guide](docs/text.md)
228
+ - [Image Augmentation Guide](docs/image.md)
229
+ - [API Reference](docs/api_reference.md)
230
+
231
+ ---
232
+
233
+ ## 🤝 Contributing
234
+
235
+ We love our contributors! If you're interested in adding a new generative model, fixing a bug, or improving the documentation, please refer to our [CONTRIBUTING.md](CONTRIBUTING.md) for detailed instructions on setting up your local environment, managing dependencies, and passing our CI checks.
236
+
237
+ ## 📄 License
238
+
239
+ This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
240
+
241
+ ---
242
+
243
+ Made with ❤️ by the syntharc team
@@ -0,0 +1,162 @@
1
+ # syntharc: The Grand Unified Synthetic Data Generator
2
+
3
+ [![CI](https://github.com/FAbdullah17/syntharc/actions/workflows/ci.yml/badge.svg)](https://github.com/FAbdullah17/syntharc/actions/workflows/ci.yml)
4
+ [![PyPI version](https://badge.fury.io/py/syntharc.svg)](https://badge.fury.io/py/syntharc)
5
+ [![Python Versions](https://img.shields.io/pypi/pyversions/syntharc.svg)](https://pypi.org/project/syntharc/)
6
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
7
+
8
+ **`syntharc`** is a powerful, flexible, and unified Python library for generating high-quality synthetic data across multiple domains. In the modern machine learning ecosystem, acquiring high-quality, privacy-compliant, and diverse datasets is often the biggest bottleneck. `syntharc` solves this by abstracting the complexity of various generative algorithms into a single, intuitive framework.
9
+
10
+ Whether you need to anonymize sensitive tabular records, augment image datasets for computer vision, simulate realistic text sequences, or replicate complex financial time-series signals, `syntharc` provides a unified API to do it all using state-of-the-art backend engines like PyTorch, Hugging Face Transformers, Albumentations, and the Synthetic Data Vault (SDV).
11
+
12
+ ---
13
+
14
+ ## 🧠 How It Works: The Unified Lifecycle
15
+
16
+ One of the biggest hurdles in synthetic data generation is the fragmented tooling across different data types. `syntharc` forces all underlying models into a highly predictable **3-step lifecycle** inherited from our `BaseSynthesizer` architecture:
17
+
18
+ 1. **`prepare(**kwargs)`**: Configures the internal environment. This is where you define structural metadata, column types, sequence keys, or image transformation pipelines to guide the generation.
19
+ 2. **`fit(data)`**: Feeds your actual, real-world dataset into the generative model so it can learn patterns, probabilistic distributions, and internal representations.
20
+ 3. **`generate(**kwargs)`**: Samples from the trained model to yield your brand new synthetic dataset—retaining the statistical properties of the original without exposing real user information.
21
+
22
+ ---
23
+
24
+ ## 🚀 Key Features
25
+
26
+ * 📊 **Tabular Data:** Synthesize fully relational datasets using Deep Learning (`CTGANSynthesizer`) or statistical modeling (`GaussianCopulaSynthesizer`).
27
+ * 📝 **Text Generation:** Leverage the power of LLMs (`TransformerTextGenerator`), classic statistical chains (`MarkovTextGenerator`), or strict rule-based templates (`TemplateTextGenerator`).
28
+ * 🖼️ **Image Augmentation:** Use `ImageAugmentor` to rapidly iterate through spatial and pixel-level augmentations, dynamically expanding computer vision datasets.
29
+ * 📈 **Time-Series:** Employ the `PARSynthesizer` (Probabilistic AutoRegressive model) to safely synthesize robust sequence data over time.
30
+ * ✅ **Evaluations:** Built-in tools for evaluating the quality, fidelity, and privacy metrics of your generated data against your source data.
31
+
32
+ ---
33
+
34
+ ## 📦 Installation
35
+
36
+ Install `syntharc` via pip. The base package provides the core infrastructure. We highly recommend installing the specific domain dependencies you intend to use to keep your environment lean:
37
+
38
+ ```bash
39
+ # Install everything (Recommended for the full experience)
40
+ pip install "syntharc[all]"
41
+
42
+ # Or individually pick your domains:
43
+ pip install "syntharc[tabular]"
44
+ pip install "syntharc[text]"
45
+ pip install "syntharc[image]"
46
+ pip install "syntharc[timeseries]"
47
+
48
+ # For contributors and local development:
49
+ pip install "syntharc[dev]"
50
+ ```
51
+
52
+ ---
53
+
54
+ ## 💡 Comprehensive Quick Start
55
+
56
+ Below are examples of how our unified framework elegantly handles vastly different data constraints.
57
+
58
+ ### 1. Privacy-Preserving Tabular Data
59
+ Train a CTGAN model to learn the distribution of your customer data without saving real identities.
60
+ ```python
61
+ import pandas as pd
62
+ from syntharc.tabular.ctgan import CTGANSynthesizer
63
+
64
+ real_data = pd.read_csv("customer_data.csv")
65
+
66
+ # 1. Initialize & Prepare metadata
67
+ synth = CTGANSynthesizer(epochs=50)
68
+ synth.prepare(metadata_dict={"primary_key": "user_id"})
69
+
70
+ # 2. Fit to real data
71
+ synth.fit(real_data)
72
+
73
+ # 3. Generate 1,000 synthetic rows!
74
+ synthetic_data = synth.generate(num_rows=1000)
75
+ print(synthetic_data.head())
76
+ ```
77
+
78
+ ### 2. Deep Time-Series Generation
79
+ Synthesize sequential metrics, like stock prices or IoT sensor readings, using AutoRegressive modeling.
80
+ ```python
81
+ from syntharc.timeseries.par import PARSynthesizer
82
+
83
+ # 1. Initialize & Prepare
84
+ synth = PARSynthesizer(epochs=25)
85
+ # Define what makes a "sequence" (e.g., separate tracking per 'device_id')
86
+ synth.prepare(metadata_dict={
87
+ "sequence_key": "device_id",
88
+ "context_columns": ["region"]
89
+ })
90
+
91
+ # 2. Fit
92
+ synth.fit(sensor_dataframe)
93
+
94
+ # 3. Generate sequential data
95
+ synthetic_series = synth.generate(num_sequences=50)
96
+ ```
97
+
98
+ ### 3. Causal Text Generation (LLMs)
99
+ Easily utilize Hugging Face causal language models.
100
+ ```python
101
+ from syntharc.text.transformer import TransformerTextGenerator
102
+
103
+ # 1. Initialize (Downloads SmolLM2-360M-Instruct by default)
104
+ synth = TransformerTextGenerator()
105
+
106
+ # 2. Prepare with an optional style context
107
+ synth.prepare(corpus="The future of synthetic data is bright and highly scalable.")
108
+
109
+ # 3. Generate structured text from instructions
110
+ text_output = synth.generate(num_samples=1, instructions="Write a short summary about data.")
111
+ print(text_output[0])
112
+ ```
113
+
114
+ ### 4. High-Speed Image Augmentation
115
+ Prepare a pipeline of augmentations to expand your Machine Learning dataset effortlessly.
116
+ ```python
117
+ from syntharc.image.augmentor import ImageAugmentor
118
+ import cv2
119
+
120
+ # 1. Initialize rules
121
+ augmentor = ImageAugmentor()
122
+ # The config dictionary maps option names to settings for the Albumentations backend
123
+ augmentor.prepare(config={
124
+ 'resize': (256, 256),
125
+ 'horizontal_flip': 0.5,
126
+ 'brightness_contrast': 0.2
127
+ })
128
+
129
+ # 2. Load Real Image
130
+ image = cv2.imread('dataset/cat.jpg')
131
+
132
+ # 3. Generate augmentations
133
+ # The generator seamlessly wraps Albumentations under the hood
134
+ augmented_image = augmentor.generate(num_samples=1, data=[image])
135
+ print(f"Augmented Shape: {augmented_image[0].shape}")
136
+ ```
137
+
138
+ ---
139
+
140
+ ## 📖 Documentation
141
+
142
+ For comprehensive guides, parameter references, evaluations, and interactive Jupyter showcase notebooks, please check out the `docs/` folder:
143
+ - [Getting Started](docs/getting_started.md)
144
+ - [Tabular Generation Guide](docs/tabular.md)
145
+ - [Time-Series Generation Guide](docs/timeseries.md)
146
+ - [Text Generation Guide](docs/text.md)
147
+ - [Image Augmentation Guide](docs/image.md)
148
+ - [API Reference](docs/api_reference.md)
149
+
150
+ ---
151
+
152
+ ## 🤝 Contributing
153
+
154
+ We love our contributors! If you're interested in adding a new generative model, fixing a bug, or improving the documentation, please refer to our [CONTRIBUTING.md](CONTRIBUTING.md) for detailed instructions on setting up your local environment, managing dependencies, and passing our CI checks.
155
+
156
+ ## 📄 License
157
+
158
+ This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
159
+
160
+ ---
161
+
162
+ Made with ❤️ by the syntharc team
@@ -0,0 +1,122 @@
1
+ [build-system]
2
+ requires = ["setuptools>=69.0", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "syntharc"
7
+ version = "0.1.0"
8
+ description = "Unified synthetic data generation for tabular, time-series, image, and text data"
9
+ readme = "README.md"
10
+ license = {file = "LICENSE"}
11
+ requires-python = ">=3.10"
12
+ authors = [
13
+ {name = "Fahad Abdullah", email = "fahadai.co@gmail.com"},
14
+ ]
15
+ keywords = [
16
+ "synthetic-data",
17
+ "data-generation",
18
+ "augmentation",
19
+ "tabular",
20
+ "time-series",
21
+ "image",
22
+ "text",
23
+ ]
24
+ classifiers = [
25
+ "Development Status :: 3 - Alpha",
26
+ "Intended Audience :: Developers",
27
+ "Intended Audience :: Science/Research",
28
+ "License :: OSI Approved :: MIT License",
29
+ "Operating System :: OS Independent",
30
+ "Programming Language :: Python :: 3",
31
+ "Programming Language :: Python :: 3.10",
32
+ "Programming Language :: Python :: 3.11",
33
+ "Programming Language :: Python :: 3.12",
34
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
35
+ "Topic :: Software Development :: Libraries :: Python Modules",
36
+ ]
37
+
38
+ dependencies = [
39
+ "pandas>=1.5",
40
+ "numpy>=1.23",
41
+ "scikit-learn>=1.2",
42
+ "matplotlib>=3.6",
43
+ "Pillow>=9.0",
44
+ "pyyaml>=6.0",
45
+ "rich>=13.0",
46
+ "torch>=2.0",
47
+ ]
48
+
49
+ [project.optional-dependencies]
50
+ tabular = [
51
+ "sdv>=1.10",
52
+ ]
53
+ timeseries = [
54
+ "sdv>=1.10",
55
+ ]
56
+ image = [
57
+ "albumentations>=1.4",
58
+ "opencv-python-headless>=4.9",
59
+ ]
60
+ text = [
61
+ "transformers>=4.40",
62
+ "accelerate>=0.22",
63
+ ]
64
+ dev = [
65
+ "pytest>=8.0",
66
+ "pytest-cov>=5.0",
67
+ "ruff>=0.4",
68
+ "mypy>=1.10",
69
+ "pre-commit>=3.7",
70
+ "pandas-stubs",
71
+ "types-PyYAML",
72
+ "types-Pillow",
73
+ ]
74
+ all = [
75
+ "syntharc[tabular]",
76
+ "syntharc[timeseries]",
77
+ "syntharc[image]",
78
+ "syntharc[text]",
79
+ "syntharc[dev]",
80
+ ]
81
+
82
+ [project.urls]
83
+ Homepage = "https://github.com/FAbdullah17/syntharc"
84
+ Repository = "https://github.com/FAbdullah17/syntharc"
85
+ Issues = "https://github.com/FAbdullah17/syntharc/issues"
86
+ Documentation = "https://github.com/FAbdullah17/syntharc/tree/main/docs"
87
+
88
+ [tool.setuptools.packages.find]
89
+ where = ["src"]
90
+
91
+ [tool.ruff]
92
+ target-version = "py310"
93
+ line-length = 100
94
+ src = ["src", "tests"]
95
+
96
+ [tool.ruff.lint]
97
+ select = [
98
+ "E", # pycodestyle errors
99
+ "F", # pyflakes
100
+ "W", # pycodestyle warnings
101
+ "I", # isort
102
+ "N", # pep8-naming
103
+ "UP", # pyupgrade
104
+ "B", # flake8-bugbear
105
+ "SIM", # flake8-simplify
106
+ "RUF", # ruff-specific rules
107
+ ]
108
+
109
+ [tool.ruff.lint.isort]
110
+ known-first-party = ["syntharc"]
111
+
112
+ [tool.mypy]
113
+ python_version = "3.10"
114
+ warn_return_any = true
115
+ warn_unused_configs = true
116
+ disallow_untyped_defs = true
117
+ mypy_path = "src"
118
+ packages = ["syntharc"]
119
+
120
+ [tool.pytest.ini_options]
121
+ testpaths = ["tests"]
122
+ addopts = "-v --tb=short"
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,38 @@
1
+ """syntharc — Unified synthetic data generation.
2
+
3
+ A lightweight Python package for synthetic data generation across
4
+ tabular, time-series, image, and text domains using sample-based
5
+ learning, augmentation, and lightweight generative techniques.
6
+
7
+ Quick Start
8
+ -----------
9
+ >>> from syntharc.core import BaseSynthesizer, set_seed, setup_logging
10
+
11
+ Tabular (requires ``pip install syntharc[tabular]``):
12
+
13
+ >>> from syntharc.tabular import CTGANSynthesizer # doctest: +SKIP
14
+ >>> from syntharc.tabular import GaussianCopulaSynthesizer # doctest: +SKIP
15
+
16
+ Time-series (requires ``pip install syntharc[timeseries]``):
17
+
18
+ >>> from syntharc.timeseries import TimeSeriesSynthesizer # doctest: +SKIP
19
+
20
+ Image (requires ``pip install syntharc[image]``):
21
+
22
+ >>> from syntharc.image import ImageAugmentor # doctest: +SKIP
23
+
24
+ Text (markov/template work out of the box, transformer needs
25
+ ``pip install syntharc[text]``):
26
+
27
+ >>> from syntharc.text import MarkovTextGenerator # doctest: +SKIP
28
+ >>> from syntharc.text import TemplateTextGenerator # doctest: +SKIP
29
+ >>> from syntharc.text import TransformerTextGenerator # doctest: +SKIP
30
+ """
31
+
32
+ from __future__ import annotations
33
+
34
+ __version__ = "0.1.0"
35
+ __author__ = "Fahad Abdullah"
36
+ __email__ = "fahadai.co@gmail.com"
37
+
38
+ __all__ = ["__version__"]
@@ -0,0 +1,14 @@
1
+ """syntharc.core — Core infrastructure for syntharc."""
2
+
3
+ from syntharc.core.base import BaseSynthesizer
4
+ from syntharc.core.config import load_config, validate_config
5
+ from syntharc.core.utils import get_device, set_seed, setup_logging
6
+
7
+ __all__ = [
8
+ "BaseSynthesizer",
9
+ "load_config",
10
+ "validate_config",
11
+ "get_device",
12
+ "set_seed",
13
+ "setup_logging",
14
+ ]