syntharc 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- syntharc-0.1.0/LICENSE +21 -0
- syntharc-0.1.0/PKG-INFO +243 -0
- syntharc-0.1.0/README.md +162 -0
- syntharc-0.1.0/pyproject.toml +122 -0
- syntharc-0.1.0/setup.cfg +4 -0
- syntharc-0.1.0/src/syntharc/__init__.py +38 -0
- syntharc-0.1.0/src/syntharc/core/__init__.py +14 -0
- syntharc-0.1.0/src/syntharc/core/base.py +282 -0
- syntharc-0.1.0/src/syntharc/core/config.py +99 -0
- syntharc-0.1.0/src/syntharc/core/utils.py +95 -0
- syntharc-0.1.0/src/syntharc/image/__init__.py +7 -0
- syntharc-0.1.0/src/syntharc/image/augmentor.py +315 -0
- syntharc-0.1.0/src/syntharc/image/evaluation.py +144 -0
- syntharc-0.1.0/src/syntharc/image/utils.py +15 -0
- syntharc-0.1.0/src/syntharc/tabular/__init__.py +19 -0
- syntharc-0.1.0/src/syntharc/tabular/ctgan.py +195 -0
- syntharc-0.1.0/src/syntharc/tabular/evaluation.py +187 -0
- syntharc-0.1.0/src/syntharc/tabular/gaussian_copula.py +182 -0
- syntharc-0.1.0/src/syntharc/tabular/utils.py +129 -0
- syntharc-0.1.0/src/syntharc/text/__init__.py +22 -0
- syntharc-0.1.0/src/syntharc/text/evaluation.py +100 -0
- syntharc-0.1.0/src/syntharc/text/markov.py +203 -0
- syntharc-0.1.0/src/syntharc/text/template.py +243 -0
- syntharc-0.1.0/src/syntharc/text/transformer.py +206 -0
- syntharc-0.1.0/src/syntharc/text/utils.py +14 -0
- syntharc-0.1.0/src/syntharc/timeseries/__init__.py +15 -0
- syntharc-0.1.0/src/syntharc/timeseries/evaluation.py +140 -0
- syntharc-0.1.0/src/syntharc/timeseries/par.py +224 -0
- syntharc-0.1.0/src/syntharc/timeseries/utils.py +97 -0
- syntharc-0.1.0/src/syntharc.egg-info/PKG-INFO +243 -0
- syntharc-0.1.0/src/syntharc.egg-info/SOURCES.txt +32 -0
- syntharc-0.1.0/src/syntharc.egg-info/dependency_links.txt +1 -0
- syntharc-0.1.0/src/syntharc.egg-info/requires.txt +39 -0
- syntharc-0.1.0/src/syntharc.egg-info/top_level.txt +1 -0
syntharc-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Fahad Abdullah
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
syntharc-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,243 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: syntharc
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Unified synthetic data generation for tabular, time-series, image, and text data
|
|
5
|
+
Author-email: Fahad Abdullah <fahadai.co@gmail.com>
|
|
6
|
+
License: MIT License
|
|
7
|
+
|
|
8
|
+
Copyright (c) 2026 Fahad Abdullah
|
|
9
|
+
|
|
10
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
11
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
12
|
+
in the Software without restriction, including without limitation the rights
|
|
13
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
14
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
15
|
+
furnished to do so, subject to the following conditions:
|
|
16
|
+
|
|
17
|
+
The above copyright notice and this permission notice shall be included in all
|
|
18
|
+
copies or substantial portions of the Software.
|
|
19
|
+
|
|
20
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
21
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
22
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
23
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
24
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
25
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
26
|
+
SOFTWARE.
|
|
27
|
+
|
|
28
|
+
Project-URL: Homepage, https://github.com/FAbdullah17/syntharc
|
|
29
|
+
Project-URL: Repository, https://github.com/FAbdullah17/syntharc
|
|
30
|
+
Project-URL: Issues, https://github.com/FAbdullah17/syntharc/issues
|
|
31
|
+
Project-URL: Documentation, https://github.com/FAbdullah17/syntharc/tree/main/docs
|
|
32
|
+
Keywords: synthetic-data,data-generation,augmentation,tabular,time-series,image,text
|
|
33
|
+
Classifier: Development Status :: 3 - Alpha
|
|
34
|
+
Classifier: Intended Audience :: Developers
|
|
35
|
+
Classifier: Intended Audience :: Science/Research
|
|
36
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
37
|
+
Classifier: Operating System :: OS Independent
|
|
38
|
+
Classifier: Programming Language :: Python :: 3
|
|
39
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
40
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
41
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
42
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
43
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
44
|
+
Requires-Python: >=3.10
|
|
45
|
+
Description-Content-Type: text/markdown
|
|
46
|
+
License-File: LICENSE
|
|
47
|
+
Requires-Dist: pandas>=1.5
|
|
48
|
+
Requires-Dist: numpy>=1.23
|
|
49
|
+
Requires-Dist: scikit-learn>=1.2
|
|
50
|
+
Requires-Dist: matplotlib>=3.6
|
|
51
|
+
Requires-Dist: Pillow>=9.0
|
|
52
|
+
Requires-Dist: pyyaml>=6.0
|
|
53
|
+
Requires-Dist: rich>=13.0
|
|
54
|
+
Requires-Dist: torch>=2.0
|
|
55
|
+
Provides-Extra: tabular
|
|
56
|
+
Requires-Dist: sdv>=1.10; extra == "tabular"
|
|
57
|
+
Provides-Extra: timeseries
|
|
58
|
+
Requires-Dist: sdv>=1.10; extra == "timeseries"
|
|
59
|
+
Provides-Extra: image
|
|
60
|
+
Requires-Dist: albumentations>=1.4; extra == "image"
|
|
61
|
+
Requires-Dist: opencv-python-headless>=4.9; extra == "image"
|
|
62
|
+
Provides-Extra: text
|
|
63
|
+
Requires-Dist: transformers>=4.40; extra == "text"
|
|
64
|
+
Requires-Dist: accelerate>=0.22; extra == "text"
|
|
65
|
+
Provides-Extra: dev
|
|
66
|
+
Requires-Dist: pytest>=8.0; extra == "dev"
|
|
67
|
+
Requires-Dist: pytest-cov>=5.0; extra == "dev"
|
|
68
|
+
Requires-Dist: ruff>=0.4; extra == "dev"
|
|
69
|
+
Requires-Dist: mypy>=1.10; extra == "dev"
|
|
70
|
+
Requires-Dist: pre-commit>=3.7; extra == "dev"
|
|
71
|
+
Requires-Dist: pandas-stubs; extra == "dev"
|
|
72
|
+
Requires-Dist: types-PyYAML; extra == "dev"
|
|
73
|
+
Requires-Dist: types-Pillow; extra == "dev"
|
|
74
|
+
Provides-Extra: all
|
|
75
|
+
Requires-Dist: syntharc[tabular]; extra == "all"
|
|
76
|
+
Requires-Dist: syntharc[timeseries]; extra == "all"
|
|
77
|
+
Requires-Dist: syntharc[image]; extra == "all"
|
|
78
|
+
Requires-Dist: syntharc[text]; extra == "all"
|
|
79
|
+
Requires-Dist: syntharc[dev]; extra == "all"
|
|
80
|
+
Dynamic: license-file
|
|
81
|
+
|
|
82
|
+
# syntharc: The Grand Unified Synthetic Data Generator
|
|
83
|
+
|
|
84
|
+
[![CI](https://github.com/FAbdullah17/syntharc/actions/workflows/ci.yml/badge.svg)](https://github.com/FAbdullah17/syntharc/actions/workflows/ci.yml)
|
|
85
|
+
[![PyPI version](https://badge.fury.io/py/syntharc.svg)](https://badge.fury.io/py/syntharc)
|
|
86
|
+
[![Python versions](https://img.shields.io/pypi/pyversions/syntharc.svg)](https://pypi.org/project/syntharc/)
|
|
87
|
+
[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
|
|
88
|
+
|
|
89
|
+
**`syntharc`** is a powerful, flexible, and unified Python library for generating high-quality synthetic data across multiple domains. In the modern machine learning ecosystem, acquiring high-quality, privacy-compliant, and diverse datasets is often the biggest bottleneck. `syntharc` solves this by abstracting the complexity of various generative algorithms into a single, intuitive framework.
|
|
90
|
+
|
|
91
|
+
Whether you need to anonymize sensitive tabular records, augment image datasets for computer vision, simulate realistic text sequences, or replicate complex financial time-series signals, `syntharc` provides a unified API to do it all using state-of-the-art backend engines like PyTorch, Hugging Face Transformers, Albumentations, and the Synthetic Data Vault (SDV).
|
|
92
|
+
|
|
93
|
+
---
|
|
94
|
+
|
|
95
|
+
## 🧠 How It Works: The Unified Lifecycle
|
|
96
|
+
|
|
97
|
+
One of the biggest hurdles in synthetic data generation is the fragmented tooling across different data types. `syntharc` forces all underlying models into a highly predictable **3-step lifecycle** inherited from our `BaseSynthesizer` architecture:
|
|
98
|
+
|
|
99
|
+
1. **`prepare(**kwargs)`**: Configures the internal environment. This is where you define structural metadata, column types, sequence keys, or image transformation pipelines to guide the generation.
|
|
100
|
+
2. **`fit(data)`**: Feeds your actual, real-world dataset into the generative model so it can learn patterns, probabilistic distributions, and internal representations.
|
|
101
|
+
3. **`generate(**kwargs)`**: Samples from the trained model to yield your brand new synthetic dataset—retaining the statistical properties of the original without exposing real user information.
|
|
102
|
+
|
|
103
|
+
---
|
|
104
|
+
|
|
105
|
+
## 🚀 Key Features
|
|
106
|
+
|
|
107
|
+
* 📊 **Tabular Data:** Synthesize fully relational datasets using Deep Learning (`CTGANSynthesizer`) or statistical modeling (`GaussianCopulaSynthesizer`).
|
|
108
|
+
* 📝 **Text Generation:** Leverage the power of LLMs (`TransformerTextGenerator`), classic statistical chains (`MarkovTextGenerator`), or strict rule-based templates (`TemplateTextGenerator`).
|
|
109
|
+
* 🖼️ **Image Augmentation:** Use `ImageAugmentor` to rapidly iterate through spatial and pixel-level augmentations, dynamically expanding computer vision datasets.
|
|
110
|
+
* 📈 **Time-Series:** Employ the `PARSynthesizer` (Probabilistic AutoRegressive model) to safely synthesize robust sequence data over time.
|
|
111
|
+
* ✅ **Evaluations:** Built-in tools for evaluating the quality, fidelity, and privacy metrics of your generated data against your source data.
|
|
112
|
+
|
|
113
|
+
---
|
|
114
|
+
|
|
115
|
+
## 📦 Installation
|
|
116
|
+
|
|
117
|
+
Install `syntharc` via pip. The base package provides the core infrastructure. To keep your environment lean, we recommend installing only the domain extras you intend to use:
|
|
118
|
+
|
|
119
|
+
```bash
|
|
120
|
+
# Install everything (Recommended for the full experience)
|
|
121
|
+
pip install "syntharc[all]"
|
|
122
|
+
|
|
123
|
+
# Or individually pick your domains:
|
|
124
|
+
pip install "syntharc[tabular]"
|
|
125
|
+
pip install "syntharc[text]"
|
|
126
|
+
pip install "syntharc[image]"
|
|
127
|
+
pip install "syntharc[timeseries]"
|
|
128
|
+
|
|
129
|
+
# For contributors and local development:
|
|
130
|
+
pip install "syntharc[dev]"
|
|
131
|
+
```
|
|
132
|
+
|
|
133
|
+
---
|
|
134
|
+
|
|
135
|
+
## 💡 Comprehensive Quick Start
|
|
136
|
+
|
|
137
|
+
Below are examples of how our unified framework elegantly handles vastly different data constraints.
|
|
138
|
+
|
|
139
|
+
### 1. Privacy-Preserving Tabular Data
|
|
140
|
+
Train a CTGAN model to learn the distribution of your customer data without saving real identities.
|
|
141
|
+
```python
|
|
142
|
+
import pandas as pd
|
|
143
|
+
from syntharc.tabular.ctgan import CTGANSynthesizer
|
|
144
|
+
|
|
145
|
+
real_data = pd.read_csv("customer_data.csv")
|
|
146
|
+
|
|
147
|
+
# 1. Initialize & Prepare metadata
|
|
148
|
+
synth = CTGANSynthesizer(epochs=50)
|
|
149
|
+
synth.prepare(metadata_dict={"primary_key": "user_id"})
|
|
150
|
+
|
|
151
|
+
# 2. Fit to real data
|
|
152
|
+
synth.fit(real_data)
|
|
153
|
+
|
|
154
|
+
# 3. Generate 1,000 synthetic rows!
|
|
155
|
+
synthetic_data = synth.generate(num_rows=1000)
|
|
156
|
+
print(synthetic_data.head())
|
|
157
|
+
```
|
|
158
|
+
|
|
159
|
+
### 2. Deep Time-Series Generation
|
|
160
|
+
Synthesize sequential metrics, like stock prices or IoT sensor readings, using AutoRegressive modeling.
|
|
161
|
+
```python
|
|
162
|
+
from syntharc.timeseries.par import PARSynthesizer
|
|
163
|
+
|
|
164
|
+
# 1. Initialize & Prepare
|
|
165
|
+
synth = PARSynthesizer(epochs=25)
|
|
166
|
+
# Define what makes a "sequence" (e.g., separate tracking per 'device_id')
|
|
167
|
+
synth.prepare(metadata_dict={
|
|
168
|
+
"sequence_key": "device_id",
|
|
169
|
+
"context_columns": ["region"]
|
|
170
|
+
})
|
|
171
|
+
|
|
172
|
+
# 2. Fit
|
|
173
|
+
synth.fit(sensor_dataframe)
|
|
174
|
+
|
|
175
|
+
# 3. Generate sequential data
|
|
176
|
+
synthetic_series = synth.generate(num_sequences=50)
|
|
177
|
+
```
|
|
178
|
+
|
|
179
|
+
### 3. Causal Text Generation (LLMs)
|
|
180
|
+
Easily utilize Hugging Face causal language models.
|
|
181
|
+
```python
|
|
182
|
+
from syntharc.text.transformer import TransformerTextGenerator
|
|
183
|
+
|
|
184
|
+
# 1. Initialize (Downloads SmolLM2-360M-Instruct by default)
|
|
185
|
+
synth = TransformerTextGenerator()
|
|
186
|
+
|
|
187
|
+
# 2. Prepare with an optional style context
|
|
188
|
+
synth.prepare(corpus="The future of synthetic data is bright and highly scalable.")
|
|
189
|
+
|
|
190
|
+
# 3. Generate structured text from instructions
|
|
191
|
+
text_output = synth.generate(num_samples=1, instructions="Write a short summary about data.")
|
|
192
|
+
print(text_output[0])
|
|
193
|
+
```
|
|
194
|
+
|
|
195
|
+
### 4. High-Speed Image Augmentation
|
|
196
|
+
Prepare a pipeline of augmentations to expand your Machine Learning dataset effortlessly.
|
|
197
|
+
```python
|
|
198
|
+
from syntharc.image.augmentor import ImageAugmentor
|
|
199
|
+
import cv2
|
|
200
|
+
|
|
201
|
+
# 1. Initialize rules
|
|
202
|
+
augmentor = ImageAugmentor()
|
|
203
|
+
# Uses Albumentations backend dictionary standards mapping
|
|
204
|
+
augmentor.prepare(config={
|
|
205
|
+
'resize': (256, 256),
|
|
206
|
+
'horizontal_flip': 0.5,
|
|
207
|
+
'brightness_contrast': 0.2
|
|
208
|
+
})
|
|
209
|
+
|
|
210
|
+
# 2. Load Real Image
|
|
211
|
+
image = cv2.imread('dataset/cat.jpg')
|
|
212
|
+
|
|
213
|
+
# 3. Generate augmentations
|
|
214
|
+
# The generator seamlessly wraps Albumentations under the hood
|
|
215
|
+
augmented_image = augmentor.generate(num_samples=1, data=[image])
|
|
216
|
+
print(f"Augmented Shape: {augmented_image[0].shape}")
|
|
217
|
+
```
|
|
218
|
+
|
|
219
|
+
---
|
|
220
|
+
|
|
221
|
+
## 📖 Documentation
|
|
222
|
+
|
|
223
|
+
For comprehensive guides, parameter references, evaluations, and interactive Jupyter showcase notebooks, please check out the `docs/` folder:
|
|
224
|
+
- [Getting Started](docs/getting_started.md)
|
|
225
|
+
- [Tabular Generation Guide](docs/tabular.md)
|
|
226
|
+
- [Time-Series Generation Guide](docs/timeseries.md)
|
|
227
|
+
- [Text Generation Guide](docs/text.md)
|
|
228
|
+
- [Image Augmentation Guide](docs/image.md)
|
|
229
|
+
- [API Reference](docs/api_reference.md)
|
|
230
|
+
|
|
231
|
+
---
|
|
232
|
+
|
|
233
|
+
## 🤝 Contributing
|
|
234
|
+
|
|
235
|
+
We love our contributors! If you're interested in adding a new generative model, fixing a bug, or improving the documentation, please refer to our [CONTRIBUTING.md](CONTRIBUTING.md) for detailed instructions on setting up your local environment, managing dependencies, and passing our CI checks.
|
|
236
|
+
|
|
237
|
+
## 📄 License
|
|
238
|
+
|
|
239
|
+
This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
|
|
240
|
+
|
|
241
|
+
---
|
|
242
|
+
|
|
243
|
+
Made with ❤️ by the syntharc team
|
syntharc-0.1.0/README.md
ADDED
|
@@ -0,0 +1,162 @@
|
|
|
1
|
+
# syntharc: The Grand Unified Synthetic Data Generator
|
|
2
|
+
|
|
3
|
+
[![CI](https://github.com/FAbdullah17/syntharc/actions/workflows/ci.yml/badge.svg)](https://github.com/FAbdullah17/syntharc/actions/workflows/ci.yml)
|
|
4
|
+
[![PyPI version](https://badge.fury.io/py/syntharc.svg)](https://badge.fury.io/py/syntharc)
|
|
5
|
+
[![Python versions](https://img.shields.io/pypi/pyversions/syntharc.svg)](https://pypi.org/project/syntharc/)
|
|
6
|
+
[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
|
|
7
|
+
|
|
8
|
+
**`syntharc`** is a powerful, flexible, and unified Python library for generating high-quality synthetic data across multiple domains. In the modern machine learning ecosystem, acquiring high-quality, privacy-compliant, and diverse datasets is often the biggest bottleneck. `syntharc` solves this by abstracting the complexity of various generative algorithms into a single, intuitive framework.
|
|
9
|
+
|
|
10
|
+
Whether you need to anonymize sensitive tabular records, augment image datasets for computer vision, simulate realistic text sequences, or replicate complex financial time-series signals, `syntharc` provides a unified API to do it all using state-of-the-art backend engines like PyTorch, Hugging Face Transformers, Albumentations, and the Synthetic Data Vault (SDV).
|
|
11
|
+
|
|
12
|
+
---
|
|
13
|
+
|
|
14
|
+
## 🧠 How It Works: The Unified Lifecycle
|
|
15
|
+
|
|
16
|
+
One of the biggest hurdles in synthetic data generation is the fragmented tooling across different data types. `syntharc` forces all underlying models into a highly predictable **3-step lifecycle** inherited from our `BaseSynthesizer` architecture:
|
|
17
|
+
|
|
18
|
+
1. **`prepare(**kwargs)`**: Configures the internal environment. This is where you define structural metadata, column types, sequence keys, or image transformation pipelines to guide the generation.
|
|
19
|
+
2. **`fit(data)`**: Feeds your actual, real-world dataset into the generative model so it can learn patterns, probabilistic distributions, and internal representations.
|
|
20
|
+
3. **`generate(**kwargs)`**: Samples from the trained model to yield your brand new synthetic dataset—retaining the statistical properties of the original without exposing real user information.
|
|
21
|
+
|
|
22
|
+
---
|
|
23
|
+
|
|
24
|
+
## 🚀 Key Features
|
|
25
|
+
|
|
26
|
+
* 📊 **Tabular Data:** Synthesize fully relational datasets using Deep Learning (`CTGANSynthesizer`) or statistical modeling (`GaussianCopulaSynthesizer`).
|
|
27
|
+
* 📝 **Text Generation:** Leverage the power of LLMs (`TransformerTextGenerator`), classic statistical chains (`MarkovTextGenerator`), or strict rule-based templates (`TemplateTextGenerator`).
|
|
28
|
+
* 🖼️ **Image Augmentation:** Use `ImageAugmentor` to rapidly iterate through spatial and pixel-level augmentations, dynamically expanding computer vision datasets.
|
|
29
|
+
* 📈 **Time-Series:** Employ the `PARSynthesizer` (Probabilistic AutoRegressive model) to safely synthesize robust sequence data over time.
|
|
30
|
+
* ✅ **Evaluations:** Built-in tools for evaluating the quality, fidelity, and privacy metrics of your generated data against your source data.
|
|
31
|
+
|
|
32
|
+
---
|
|
33
|
+
|
|
34
|
+
## 📦 Installation
|
|
35
|
+
|
|
36
|
+
Install `syntharc` via pip. The base package provides the core infrastructure. To keep your environment lean, we recommend installing only the domain extras you intend to use:
|
|
37
|
+
|
|
38
|
+
```bash
|
|
39
|
+
# Install everything (Recommended for the full experience)
|
|
40
|
+
pip install "syntharc[all]"
|
|
41
|
+
|
|
42
|
+
# Or individually pick your domains:
|
|
43
|
+
pip install "syntharc[tabular]"
|
|
44
|
+
pip install "syntharc[text]"
|
|
45
|
+
pip install "syntharc[image]"
|
|
46
|
+
pip install "syntharc[timeseries]"
|
|
47
|
+
|
|
48
|
+
# For contributors and local development:
|
|
49
|
+
pip install "syntharc[dev]"
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
---
|
|
53
|
+
|
|
54
|
+
## 💡 Comprehensive Quick Start
|
|
55
|
+
|
|
56
|
+
Below are examples of how our unified framework elegantly handles vastly different data constraints.
|
|
57
|
+
|
|
58
|
+
### 1. Privacy-Preserving Tabular Data
|
|
59
|
+
Train a CTGAN model to learn the distribution of your customer data without saving real identities.
|
|
60
|
+
```python
|
|
61
|
+
import pandas as pd
|
|
62
|
+
from syntharc.tabular.ctgan import CTGANSynthesizer
|
|
63
|
+
|
|
64
|
+
real_data = pd.read_csv("customer_data.csv")
|
|
65
|
+
|
|
66
|
+
# 1. Initialize & Prepare metadata
|
|
67
|
+
synth = CTGANSynthesizer(epochs=50)
|
|
68
|
+
synth.prepare(metadata_dict={"primary_key": "user_id"})
|
|
69
|
+
|
|
70
|
+
# 2. Fit to real data
|
|
71
|
+
synth.fit(real_data)
|
|
72
|
+
|
|
73
|
+
# 3. Generate 1,000 synthetic rows!
|
|
74
|
+
synthetic_data = synth.generate(num_rows=1000)
|
|
75
|
+
print(synthetic_data.head())
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
### 2. Deep Time-Series Generation
|
|
79
|
+
Synthesize sequential metrics, like stock prices or IoT sensor readings, using AutoRegressive modeling.
|
|
80
|
+
```python
|
|
81
|
+
from syntharc.timeseries.par import PARSynthesizer
|
|
82
|
+
|
|
83
|
+
# 1. Initialize & Prepare
|
|
84
|
+
synth = PARSynthesizer(epochs=25)
|
|
85
|
+
# Define what makes a "sequence" (e.g., separate tracking per 'device_id')
|
|
86
|
+
synth.prepare(metadata_dict={
|
|
87
|
+
"sequence_key": "device_id",
|
|
88
|
+
"context_columns": ["region"]
|
|
89
|
+
})
|
|
90
|
+
|
|
91
|
+
# 2. Fit
|
|
92
|
+
synth.fit(sensor_dataframe)
|
|
93
|
+
|
|
94
|
+
# 3. Generate sequential data
|
|
95
|
+
synthetic_series = synth.generate(num_sequences=50)
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
### 3. Causal Text Generation (LLMs)
|
|
99
|
+
Easily utilize Hugging Face causal language models.
|
|
100
|
+
```python
|
|
101
|
+
from syntharc.text.transformer import TransformerTextGenerator
|
|
102
|
+
|
|
103
|
+
# 1. Initialize (Downloads SmolLM2-360M-Instruct by default)
|
|
104
|
+
synth = TransformerTextGenerator()
|
|
105
|
+
|
|
106
|
+
# 2. Prepare with an optional style context
|
|
107
|
+
synth.prepare(corpus="The future of synthetic data is bright and highly scalable.")
|
|
108
|
+
|
|
109
|
+
# 3. Generate structured text from instructions
|
|
110
|
+
text_output = synth.generate(num_samples=1, instructions="Write a short summary about data.")
|
|
111
|
+
print(text_output[0])
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
### 4. High-Speed Image Augmentation
|
|
115
|
+
Prepare a pipeline of augmentations to expand your Machine Learning dataset effortlessly.
|
|
116
|
+
```python
|
|
117
|
+
from syntharc.image.augmentor import ImageAugmentor
|
|
118
|
+
import cv2
|
|
119
|
+
|
|
120
|
+
# 1. Initialize rules
|
|
121
|
+
augmentor = ImageAugmentor()
|
|
122
|
+
# Uses Albumentations backend dictionary standards mapping
|
|
123
|
+
augmentor.prepare(config={
|
|
124
|
+
'resize': (256, 256),
|
|
125
|
+
'horizontal_flip': 0.5,
|
|
126
|
+
'brightness_contrast': 0.2
|
|
127
|
+
})
|
|
128
|
+
|
|
129
|
+
# 2. Load Real Image
|
|
130
|
+
image = cv2.imread('dataset/cat.jpg')
|
|
131
|
+
|
|
132
|
+
# 3. Generate augmentations
|
|
133
|
+
# The generator seamlessly wraps Albumentations under the hood
|
|
134
|
+
augmented_image = augmentor.generate(num_samples=1, data=[image])
|
|
135
|
+
print(f"Augmented Shape: {augmented_image[0].shape}")
|
|
136
|
+
```
|
|
137
|
+
|
|
138
|
+
---
|
|
139
|
+
|
|
140
|
+
## 📖 Documentation
|
|
141
|
+
|
|
142
|
+
For comprehensive guides, parameter references, evaluations, and interactive Jupyter showcase notebooks, please check out the `docs/` folder:
|
|
143
|
+
- [Getting Started](docs/getting_started.md)
|
|
144
|
+
- [Tabular Generation Guide](docs/tabular.md)
|
|
145
|
+
- [Time-Series Generation Guide](docs/timeseries.md)
|
|
146
|
+
- [Text Generation Guide](docs/text.md)
|
|
147
|
+
- [Image Augmentation Guide](docs/image.md)
|
|
148
|
+
- [API Reference](docs/api_reference.md)
|
|
149
|
+
|
|
150
|
+
---
|
|
151
|
+
|
|
152
|
+
## 🤝 Contributing
|
|
153
|
+
|
|
154
|
+
We love our contributors! If you're interested in adding a new generative model, fixing a bug, or improving the documentation, please refer to our [CONTRIBUTING.md](CONTRIBUTING.md) for detailed instructions on setting up your local environment, managing dependencies, and passing our CI checks.
|
|
155
|
+
|
|
156
|
+
## 📄 License
|
|
157
|
+
|
|
158
|
+
This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
|
|
159
|
+
|
|
160
|
+
---
|
|
161
|
+
|
|
162
|
+
Made with ❤️ by the syntharc team
|
|
@@ -0,0 +1,122 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=69.0", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "syntharc"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Unified synthetic data generation for tabular, time-series, image, and text data"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
license = {file = "LICENSE"}
|
|
11
|
+
requires-python = ">=3.10"
|
|
12
|
+
authors = [
|
|
13
|
+
{name = "Fahad Abdullah", email = "fahadai.co@gmail.com"},
|
|
14
|
+
]
|
|
15
|
+
keywords = [
|
|
16
|
+
"synthetic-data",
|
|
17
|
+
"data-generation",
|
|
18
|
+
"augmentation",
|
|
19
|
+
"tabular",
|
|
20
|
+
"time-series",
|
|
21
|
+
"image",
|
|
22
|
+
"text",
|
|
23
|
+
]
|
|
24
|
+
classifiers = [
|
|
25
|
+
"Development Status :: 3 - Alpha",
|
|
26
|
+
"Intended Audience :: Developers",
|
|
27
|
+
"Intended Audience :: Science/Research",
|
|
28
|
+
"License :: OSI Approved :: MIT License",
|
|
29
|
+
"Operating System :: OS Independent",
|
|
30
|
+
"Programming Language :: Python :: 3",
|
|
31
|
+
"Programming Language :: Python :: 3.10",
|
|
32
|
+
"Programming Language :: Python :: 3.11",
|
|
33
|
+
"Programming Language :: Python :: 3.12",
|
|
34
|
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
|
35
|
+
"Topic :: Software Development :: Libraries :: Python Modules",
|
|
36
|
+
]
|
|
37
|
+
|
|
38
|
+
dependencies = [
|
|
39
|
+
"pandas>=1.5",
|
|
40
|
+
"numpy>=1.23",
|
|
41
|
+
"scikit-learn>=1.2",
|
|
42
|
+
"matplotlib>=3.6",
|
|
43
|
+
"Pillow>=9.0",
|
|
44
|
+
"pyyaml>=6.0",
|
|
45
|
+
"rich>=13.0",
|
|
46
|
+
"torch>=2.0",
|
|
47
|
+
]
|
|
48
|
+
|
|
49
|
+
[project.optional-dependencies]
|
|
50
|
+
tabular = [
|
|
51
|
+
"sdv>=1.10",
|
|
52
|
+
]
|
|
53
|
+
timeseries = [
|
|
54
|
+
"sdv>=1.10",
|
|
55
|
+
]
|
|
56
|
+
image = [
|
|
57
|
+
"albumentations>=1.4",
|
|
58
|
+
"opencv-python-headless>=4.9",
|
|
59
|
+
]
|
|
60
|
+
text = [
|
|
61
|
+
"transformers>=4.40",
|
|
62
|
+
"accelerate>=0.22",
|
|
63
|
+
]
|
|
64
|
+
dev = [
|
|
65
|
+
"pytest>=8.0",
|
|
66
|
+
"pytest-cov>=5.0",
|
|
67
|
+
"ruff>=0.4",
|
|
68
|
+
"mypy>=1.10",
|
|
69
|
+
"pre-commit>=3.7",
|
|
70
|
+
"pandas-stubs",
|
|
71
|
+
"types-PyYAML",
|
|
72
|
+
"types-Pillow",
|
|
73
|
+
]
|
|
74
|
+
all = [
|
|
75
|
+
"syntharc[tabular]",
|
|
76
|
+
"syntharc[timeseries]",
|
|
77
|
+
"syntharc[image]",
|
|
78
|
+
"syntharc[text]",
|
|
79
|
+
"syntharc[dev]",
|
|
80
|
+
]
|
|
81
|
+
|
|
82
|
+
[project.urls]
|
|
83
|
+
Homepage = "https://github.com/FAbdullah17/syntharc"
|
|
84
|
+
Repository = "https://github.com/FAbdullah17/syntharc"
|
|
85
|
+
Issues = "https://github.com/FAbdullah17/syntharc/issues"
|
|
86
|
+
Documentation = "https://github.com/FAbdullah17/syntharc/tree/main/docs"
|
|
87
|
+
|
|
88
|
+
[tool.setuptools.packages.find]
|
|
89
|
+
where = ["src"]
|
|
90
|
+
|
|
91
|
+
[tool.ruff]
|
|
92
|
+
target-version = "py310"
|
|
93
|
+
line-length = 100
|
|
94
|
+
src = ["src", "tests"]
|
|
95
|
+
|
|
96
|
+
[tool.ruff.lint]
|
|
97
|
+
select = [
|
|
98
|
+
"E", # pycodestyle errors
|
|
99
|
+
"F", # pyflakes
|
|
100
|
+
"W", # pycodestyle warnings
|
|
101
|
+
"I", # isort
|
|
102
|
+
"N", # pep8-naming
|
|
103
|
+
"UP", # pyupgrade
|
|
104
|
+
"B", # flake8-bugbear
|
|
105
|
+
"SIM", # flake8-simplify
|
|
106
|
+
"RUF", # ruff-specific rules
|
|
107
|
+
]
|
|
108
|
+
|
|
109
|
+
[tool.ruff.lint.isort]
|
|
110
|
+
known-first-party = ["syntharc"]
|
|
111
|
+
|
|
112
|
+
[tool.mypy]
|
|
113
|
+
python_version = "3.10"
|
|
114
|
+
warn_return_any = true
|
|
115
|
+
warn_unused_configs = true
|
|
116
|
+
disallow_untyped_defs = true
|
|
117
|
+
mypy_path = "src"
|
|
118
|
+
packages = ["syntharc"]
|
|
119
|
+
|
|
120
|
+
[tool.pytest.ini_options]
|
|
121
|
+
testpaths = ["tests"]
|
|
122
|
+
addopts = "-v --tb=short"
|
syntharc-0.1.0/setup.cfg
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
"""syntharc — Unified synthetic data generation.
|
|
2
|
+
|
|
3
|
+
A lightweight Python package for synthetic data generation across
|
|
4
|
+
tabular, time-series, image, and text domains using sample-based
|
|
5
|
+
learning, augmentation, and lightweight generative techniques.
|
|
6
|
+
|
|
7
|
+
Quick Start
|
|
8
|
+
-----------
|
|
9
|
+
>>> from syntharc.core import BaseSynthesizer, set_seed, setup_logging
|
|
10
|
+
|
|
11
|
+
Tabular (requires ``pip install syntharc[tabular]``):
|
|
12
|
+
|
|
13
|
+
>>> from syntharc.tabular import CTGANSynthesizer # doctest: +SKIP
|
|
14
|
+
>>> from syntharc.tabular import GaussianCopulaSynthesizer # doctest: +SKIP
|
|
15
|
+
|
|
16
|
+
Time-series (requires ``pip install syntharc[timeseries]``):
|
|
17
|
+
|
|
18
|
+
>>> from syntharc.timeseries import TimeSeriesSynthesizer # doctest: +SKIP
|
|
19
|
+
|
|
20
|
+
Image (requires ``pip install syntharc[image]``):
|
|
21
|
+
|
|
22
|
+
>>> from syntharc.image import ImageAugmentor # doctest: +SKIP
|
|
23
|
+
|
|
24
|
+
Text (markov/template work out of the box, transformer needs
|
|
25
|
+
``pip install syntharc[text]``):
|
|
26
|
+
|
|
27
|
+
>>> from syntharc.text import MarkovTextGenerator # doctest: +SKIP
|
|
28
|
+
>>> from syntharc.text import TemplateTextGenerator # doctest: +SKIP
|
|
29
|
+
>>> from syntharc.text import TransformerTextGenerator # doctest: +SKIP
|
|
30
|
+
"""
|
|
31
|
+
|
|
32
|
+
from __future__ import annotations
|
|
33
|
+
|
|
34
|
+
__version__ = "0.1.0"
|
|
35
|
+
__author__ = "Fahad Abdullah"
|
|
36
|
+
__email__ = "fahadai.co@gmail.com"
|
|
37
|
+
|
|
38
|
+
__all__ = ["__version__"]
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
"""syntharc.core — Core infrastructure for syntharc."""
|
|
2
|
+
|
|
3
|
+
from syntharc.core.base import BaseSynthesizer
|
|
4
|
+
from syntharc.core.config import load_config, validate_config
|
|
5
|
+
from syntharc.core.utils import get_device, set_seed, setup_logging
|
|
6
|
+
|
|
7
|
+
__all__ = [
|
|
8
|
+
"BaseSynthesizer",
|
|
9
|
+
"load_config",
|
|
10
|
+
"validate_config",
|
|
11
|
+
"get_device",
|
|
12
|
+
"set_seed",
|
|
13
|
+
"setup_logging",
|
|
14
|
+
]
|