text-albumentations 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (29) hide show
  1. text_albumentations-0.1.0/LICENSE +21 -0
  2. text_albumentations-0.1.0/PKG-INFO +314 -0
  3. text_albumentations-0.1.0/README.md +299 -0
  4. text_albumentations-0.1.0/pyproject.toml +24 -0
  5. text_albumentations-0.1.0/setup.cfg +4 -0
  6. text_albumentations-0.1.0/text_albumentations/__init__.py +56 -0
  7. text_albumentations-0.1.0/text_albumentations/base.py +281 -0
  8. text_albumentations-0.1.0/text_albumentations/ingest.py +112 -0
  9. text_albumentations-0.1.0/text_albumentations/modeling.py +45 -0
  10. text_albumentations-0.1.0/text_albumentations/output_format_adapters/__init__.py +3 -0
  11. text_albumentations-0.1.0/text_albumentations/output_format_adapters/alpaca.py +15 -0
  12. text_albumentations-0.1.0/text_albumentations/response_formats/__init__.py +9 -0
  13. text_albumentations-0.1.0/text_albumentations/response_formats/base.py +40 -0
  14. text_albumentations-0.1.0/text_albumentations/runner.py +49 -0
  15. text_albumentations-0.1.0/text_albumentations/runtime.py +268 -0
  16. text_albumentations-0.1.0/text_albumentations/tasks/__init__.py +31 -0
  17. text_albumentations-0.1.0/text_albumentations/tasks/bullets.py +107 -0
  18. text_albumentations-0.1.0/text_albumentations/tasks/comparison.py +92 -0
  19. text_albumentations-0.1.0/text_albumentations/tasks/continuation.py +137 -0
  20. text_albumentations-0.1.0/text_albumentations/tasks/qa_pairs.py +228 -0
  21. text_albumentations-0.1.0/text_albumentations/tasks/rephrase.py +57 -0
  22. text_albumentations-0.1.0/text_albumentations/tasks/retrieval.py +386 -0
  23. text_albumentations-0.1.0/text_albumentations/tasks/triplets.py +122 -0
  24. text_albumentations-0.1.0/text_albumentations/utils.py +25 -0
  25. text_albumentations-0.1.0/text_albumentations.egg-info/PKG-INFO +314 -0
  26. text_albumentations-0.1.0/text_albumentations.egg-info/SOURCES.txt +27 -0
  27. text_albumentations-0.1.0/text_albumentations.egg-info/dependency_links.txt +1 -0
  28. text_albumentations-0.1.0/text_albumentations.egg-info/requires.txt +3 -0
  29. text_albumentations-0.1.0/text_albumentations.egg-info/top_level.txt +1 -0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Avishek Biswas
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,314 @@
1
+ Metadata-Version: 2.4
2
+ Name: text-albumentations
3
+ Version: 0.1.0
4
+ Summary: Structured synthetic text data generation for SFT and distillation.
5
+ Author: Avishek Biswas
6
+ License-Expression: MIT
7
+ Project-URL: Repository, https://github.com/avbiswas/text-albumentations
8
+ Requires-Python: >=3.10
9
+ Description-Content-Type: text/markdown
10
+ License-File: LICENSE
11
+ Requires-Dist: openai>=2.30.0
12
+ Requires-Dist: outlines>=1.2.12
13
+ Requires-Dist: pydantic>=2
14
+ Dynamic: license-file
15
+
16
+ # text-albumentations
17
+
18
+ `text-albumentations` is a synthetic data generation engine for text.
19
+
20
+ The goal is to help generate instruction-tuning and distillation datasets from existing text corpora by applying structured augmentations over passages.
21
+
22
+ This is built for the practical case where good supervised fine-tuning often requires more examples than you already have, and where synthetic data generation is one of the fastest ways to create task-shaped training data from raw documents.
23
+
24
+ ## Why This Exists
25
+
26
+ Modern LLM workflows often need:
27
+
28
+ - synthetic SFT data
29
+ - task-specific distillation data
30
+ - multiple renderings of the same semantic content
31
+ - structured supervision generated from long-form text
32
+
33
+ If you already have large amounts of text, you can usually derive many useful supervision targets from it:
34
+
35
+ - bullet-point summaries
36
+ - QA pairs
37
+ - rephrasings
38
+ - continuation tasks
39
+ - retrieval examples
40
+ - comparisons
41
+ - knowledge graph triplets
42
+
43
+ Instead of treating synthetic data generation as one giant prompt, this project breaks it into explicit, composable pieces.
44
+
45
+
46
+ ## Ideology
47
+
48
+ The core idea is:
49
+
50
+ `structured generation + simple priors -> dataset`
51
+
52
+ Structured generation gives you typed intermediate outputs using Pydantic schemas.
53
+
54
+ Simple priors give you the task shape:
55
+
56
+ - "extract bullets"
57
+ - "produce QA pairs"
58
+ - "find the answering passage"
59
+ - "serialize the response as markdown/json/etc"
60
+
61
+ That combination is easier to reason about than unstructured free-form prompting. It also makes the pipeline more extensible: you can swap prompts, schemas, response formats, runtimes, and adapters without rewriting the whole system.
62
+
63
+ ## Current Capabilities
64
+
65
+ The project currently supports:
66
+
67
+ - single-chunk augmentations
68
+ - multi-chunk augmentations
69
+ - typed structured outputs with Pydantic
70
+ - Alpaca-format dataset generation
71
+ - response-format control for the Alpaca `output` field
72
+ - sync and async generation runtimes
73
+ - Outlines-backed local models
74
+ - Outlines-backed OpenAI models
75
+ - long-text ingestion with fixed-size character chunking
76
+ - JSONL dataset writing
77
+
78
+ Built-in augmentation families include:
79
+
80
+ - bullets
81
+ - QA pairs
82
+ - rephrase
83
+ - continuation
84
+ - retrieval
85
+ - comparison
86
+ - triplets
87
+
88
+ ## Architecture
89
+
90
+ The main abstractions are:
91
+
92
+ - `BaseSingleChunkAugmentation` and `BaseMultiChunkAugmentation`
93
+ These define the task contract: schema, prompt, response formats, generation knobs, and dataset construction.
94
+
95
+ - `BaseResponseFormat`
96
+ This controls how the Alpaca `output` field should be represented and can also modify the system prompt with format-specific instructions.
97
+
98
+ - `BaseAlpacaAdapter`
99
+ This converts typed structured outputs into Alpaca rows.
100
+
101
+ - `ModelRuntime`
102
+ This is the model execution interface. Current implementations support local Outlines models and OpenAI-through-Outlines models.
103
+
104
+ - `AugmentationRunner`
105
+ This binds together:
106
+ 1. input data
107
+ 2. a runtime
108
+ 3. an augmentation
109
+
110
+ ## Usage
111
+
112
+ ### Minimal Local Example
113
+
114
+ ```python
115
+ import mlx_lm
116
+ import outlines
117
+
118
+ from text_albumentations import OutlinesModel, run_augmentation
119
+ from text_albumentations.tasks.bullets import bullet_augmentation
120
+
121
+ model = outlines.from_mlxlm(*mlx_lm.load("mlx-community/Qwen3.5-4B-OptiQ-4bit"))
122
+ runtime = OutlinesModel(model=model)
123
+
124
+ rows = run_augmentation(
125
+ "The Transformer replaces recurrence with attention and improves parallelization.",
126
+ bullet_augmentation,
127
+ runtime,
128
+ )
129
+
130
+ for row in rows:
131
+ print(row.model_dump_json())
132
+ ```
133
+
134
+ See [`examples/example_minimal.py`](examples/example_minimal.py).
135
+
136
+ ### OpenAI Sync
137
+
138
+ ```python
139
+ import openai
140
+ import outlines
141
+
142
+ from text_albumentations import OutlinesModel, run_augmentation
143
+ from text_albumentations.tasks.bullets import bullet_augmentation
144
+
145
+ model = outlines.from_openai(openai.OpenAI(), "gpt-5.4-nano")
146
+ runtime = OutlinesModel(model, max_tokens_parameter="max_completion_tokens")
147
+
148
+ rows = run_augmentation("some passage", bullet_augmentation, runtime)
149
+ ```
150
+
151
+ See [`examples/example_openai_sync.py`](examples/example_openai_sync.py).
152
+
153
+ ### OpenAI Async
154
+
155
+ ```python
156
+ import asyncio
157
+ import openai
158
+ import outlines
159
+
160
+ from text_albumentations import OutlinesModel, arun_augmentation
161
+ from text_albumentations.tasks.bullets import bullet_augmentation
162
+
163
+
164
+ async def main():
165
+ model = outlines.from_openai(openai.AsyncOpenAI(), "gpt-5.4-nano")
166
+ runtime = OutlinesModel(
167
+ model,
168
+ async_mode=True,
169
+ total_concurrent_calls=4,
170
+ max_tokens_parameter="max_completion_tokens",
171
+ )
172
+
173
+ rows = await arun_augmentation("some passage", bullet_augmentation, runtime)
174
+ print(len(rows))
175
+
176
+
177
+ asyncio.run(main())
178
+ ```
179
+
180
+ See [`examples/example_openai_async.py`](examples/example_openai_async.py).
181
+
182
+ ### Transformers Local Model
183
+
184
+ ```python
185
+ import outlines
186
+ from transformers import AutoModelForCausalLM, AutoTokenizer
187
+
188
+ from text_albumentations import OutlinesModel, run_augmentation
189
+ from text_albumentations.tasks.bullets import bullet_augmentation
190
+
191
+ hf_model = AutoModelForCausalLM.from_pretrained(
192
+ "google/gemma-3-1b-it",
193
+ torch_dtype="auto",
194
+ device_map="auto",
195
+ )
196
+ hf_tokenizer = AutoTokenizer.from_pretrained("google/gemma-3-1b-it")
197
+
198
+ model = outlines.from_transformers(hf_model, hf_tokenizer)
199
+ runtime = OutlinesModel(model, max_tokens_parameter="max_new_tokens")
200
+
201
+ rows = run_augmentation("some passage", bullet_augmentation, runtime)
202
+ ```
203
+
204
+ See [`examples/example_transformers_gemma.py`](examples/example_transformers_gemma.py).
205
+
206
+ ### Long Text To JSONL
207
+
208
+ ```python
209
+ import openai
210
+ import outlines
211
+
212
+ from text_albumentations import OutlinesModel, save_long_text_dataset
213
+ from text_albumentations.tasks.bullets import bullet_augmentation
214
+
215
+ model = outlines.from_openai(openai.OpenAI(), "gpt-5.4-nano")
216
+ runtime = OutlinesModel(model, max_tokens_parameter="max_completion_tokens")
217
+
218
+ save_long_text_dataset(
219
+ text=long_text,
220
+ output_jsonl="out.jsonl",
221
+ augmentation=bullet_augmentation,
222
+ runtime=runtime,
223
+ chunk_size_chars=300,
224
+ )
225
+ ```
226
+
227
+ See [`examples/example_long_text_to_jsonl.py`](examples/example_long_text_to_jsonl.py).
228
+
229
+ ### Multiple Augmentations Over The Same Passage
230
+
231
+ ```python
232
+ import openai
233
+ import outlines
234
+
235
+ from text_albumentations import OutlinesModel, run_augmentation
236
+ from text_albumentations.tasks.bullets import bullet_augmentation
237
+ from text_albumentations.tasks.rephrase import rephrase_augmentation
238
+
239
+ model = outlines.from_openai(openai.OpenAI(), "gpt-5.4-nano")
240
+ runtime = OutlinesModel(model, max_tokens_parameter="max_completion_tokens")
241
+
242
+ rows = []
243
+ rows.extend(run_augmentation("some passage", bullet_augmentation, runtime))
244
+ rows.extend(run_augmentation("some passage", rephrase_augmentation, runtime))
245
+ ```
246
+
247
+ See [`examples/example_multiple_augmentations.py`](examples/example_multiple_augmentations.py).
248
+
249
+ ### Custom Preprocessing Model
250
+
251
+ You can also make the augmentation input itself be a custom Pydantic model instead of a raw string.
252
+
253
+ See [`examples/example_custom_preprocessing.py`](examples/example_custom_preprocessing.py).
254
+
255
+ ## Extensibility
256
+
257
+ The project is designed so users can extend it in layers.
258
+
259
+ ### 1. Add A New Augmentation
260
+
261
+ Subclass one of:
262
+
263
+ - `BaseSingleChunkAugmentation`
264
+ - `BaseMultiChunkAugmentation`
265
+
266
+ Define:
267
+
268
+ - a Pydantic schema
269
+ - a system prompt
270
+ - `build_user_message(...)`
271
+ - one or more response formats
272
+
273
+ ### 2. Add A New Response Format
274
+
275
+ Subclass `BaseResponseFormat` if you want to control:
276
+
277
+ - how the format modifies the system prompt
278
+ - how the final Alpaca `output` field is rendered
279
+
280
+ For common Alpaca row generation, `AlpacaResponseFormat` is usually enough.
281
+
282
+ ### 3. Add A New Adapter
283
+
284
+ Subclass `BaseAlpacaAdapter` to convert a typed structured output into one or more Alpaca rows.
285
+
286
+ One structured output can expand into multiple rows.
287
+
288
+ ### 4. Add A New Runtime
289
+
290
+ Implement `ModelRuntime` if you want to support a new backend.
291
+
292
+ That keeps model execution separate from:
293
+
294
+ - augmentation semantics
295
+ - prompt construction
296
+ - dataset adapters
297
+ - response serialization
298
+
299
+ This separation is intentional. The project should let you swap the model layer without rewriting the dataset logic.
300
+
301
+ ## Philosophy On Synthetic Data
302
+
303
+ This project does not assume synthetic data is magic.
304
+
305
+ It assumes:
306
+
307
+ - synthetic data works best when the task shape is explicit
308
+ - typed intermediate representations are easier to control
309
+ - simple priors beat vague giant prompts
310
+ - extensibility matters because different teams want different schemas, formats, and runtimes
311
+
312
+ The aim is not "generate random data."
313
+
314
+ The aim is to turn raw text into useful supervision signals for SFT and distillation in a way that is structured, inspectable, and easy to extend.
@@ -0,0 +1,299 @@
1
+ # text-albumentations
2
+
3
+ `text-albumentations` is a synthetic data generation engine for text.
4
+
5
+ The goal is to help generate instruction-tuning and distillation datasets from existing text corpora by applying structured augmentations over passages.
6
+
7
+ This is built for the practical case where good supervised fine-tuning often requires more examples than you already have, and where synthetic data generation is one of the fastest ways to create task-shaped training data from raw documents.
8
+
9
+ ## Why This Exists
10
+
11
+ Modern LLM workflows often need:
12
+
13
+ - synthetic SFT data
14
+ - task-specific distillation data
15
+ - multiple renderings of the same semantic content
16
+ - structured supervision generated from long-form text
17
+
18
+ If you already have large amounts of text, you can usually derive many useful supervision targets from it:
19
+
20
+ - bullet-point summaries
21
+ - QA pairs
22
+ - rephrasings
23
+ - continuation tasks
24
+ - retrieval examples
25
+ - comparisons
26
+ - knowledge graph triplets
27
+
28
+ Instead of treating synthetic data generation as one giant prompt, this project breaks it into explicit, composable pieces.
29
+
30
+
31
+ ## Ideology
32
+
33
+ The core idea is:
34
+
35
+ `structured generation + simple priors -> dataset`
36
+
37
+ Structured generation gives you typed intermediate outputs using Pydantic schemas.
38
+
39
+ Simple priors give you the task shape:
40
+
41
+ - "extract bullets"
42
+ - "produce QA pairs"
43
+ - "find the answering passage"
44
+ - "serialize the response as markdown/json/etc"
45
+
46
+ That combination is easier to reason about than unstructured free-form prompting. It also makes the pipeline more extensible: you can swap prompts, schemas, response formats, runtimes, and adapters without rewriting the whole system.
47
+
48
+ ## Current Capabilities
49
+
50
+ The project currently supports:
51
+
52
+ - single-chunk augmentations
53
+ - multi-chunk augmentations
54
+ - typed structured outputs with Pydantic
55
+ - Alpaca-format dataset generation
56
+ - response-format control for the Alpaca `output` field
57
+ - sync and async generation runtimes
58
+ - Outlines-backed local models
59
+ - Outlines-backed OpenAI models
60
+ - long-text ingestion with fixed-size character chunking
61
+ - JSONL dataset writing
62
+
63
+ Built-in augmentation families include:
64
+
65
+ - bullets
66
+ - QA pairs
67
+ - rephrase
68
+ - continuation
69
+ - retrieval
70
+ - comparison
71
+ - triplets
72
+
73
+ ## Architecture
74
+
75
+ The main abstractions are:
76
+
77
+ - `BaseSingleChunkAugmentation` and `BaseMultiChunkAugmentation`
78
+ These define the task contract: schema, prompt, response formats, generation knobs, and dataset construction.
79
+
80
+ - `BaseResponseFormat`
81
+ This controls how the Alpaca `output` field should be represented and can also modify the system prompt with format-specific instructions.
82
+
83
+ - `BaseAlpacaAdapter`
84
+ This converts typed structured outputs into Alpaca rows.
85
+
86
+ - `ModelRuntime`
87
+ This is the model execution interface. Current implementations support local Outlines models and OpenAI-through-Outlines models.
88
+
89
+ - `AugmentationRunner`
90
+ This binds together:
91
+ 1. input data
92
+ 2. a runtime
93
+ 3. an augmentation
94
+
95
+ ## Usage
96
+
97
+ ### Minimal Local Example
98
+
99
+ ```python
100
+ import mlx_lm
101
+ import outlines
102
+
103
+ from text_albumentations import OutlinesModel, run_augmentation
104
+ from text_albumentations.tasks.bullets import bullet_augmentation
105
+
106
+ model = outlines.from_mlxlm(*mlx_lm.load("mlx-community/Qwen3.5-4B-OptiQ-4bit"))
107
+ runtime = OutlinesModel(model=model)
108
+
109
+ rows = run_augmentation(
110
+ "The Transformer replaces recurrence with attention and improves parallelization.",
111
+ bullet_augmentation,
112
+ runtime,
113
+ )
114
+
115
+ for row in rows:
116
+ print(row.model_dump_json())
117
+ ```
118
+
119
+ See [`examples/example_minimal.py`](examples/example_minimal.py).
120
+
121
+ ### OpenAI Sync
122
+
123
+ ```python
124
+ import openai
125
+ import outlines
126
+
127
+ from text_albumentations import OutlinesModel, run_augmentation
128
+ from text_albumentations.tasks.bullets import bullet_augmentation
129
+
130
+ model = outlines.from_openai(openai.OpenAI(), "gpt-5.4-nano")
131
+ runtime = OutlinesModel(model, max_tokens_parameter="max_completion_tokens")
132
+
133
+ rows = run_augmentation("some passage", bullet_augmentation, runtime)
134
+ ```
135
+
136
+ See [`examples/example_openai_sync.py`](examples/example_openai_sync.py).
137
+
138
+ ### OpenAI Async
139
+
140
+ ```python
141
+ import asyncio
142
+ import openai
143
+ import outlines
144
+
145
+ from text_albumentations import OutlinesModel, arun_augmentation
146
+ from text_albumentations.tasks.bullets import bullet_augmentation
147
+
148
+
149
+ async def main():
150
+ model = outlines.from_openai(openai.AsyncOpenAI(), "gpt-5.4-nano")
151
+ runtime = OutlinesModel(
152
+ model,
153
+ async_mode=True,
154
+ total_concurrent_calls=4,
155
+ max_tokens_parameter="max_completion_tokens",
156
+ )
157
+
158
+ rows = await arun_augmentation("some passage", bullet_augmentation, runtime)
159
+ print(len(rows))
160
+
161
+
162
+ asyncio.run(main())
163
+ ```
164
+
165
+ See [`examples/example_openai_async.py`](examples/example_openai_async.py).
166
+
167
+ ### Transformers Local Model
168
+
169
+ ```python
170
+ import outlines
171
+ from transformers import AutoModelForCausalLM, AutoTokenizer
172
+
173
+ from text_albumentations import OutlinesModel, run_augmentation
174
+ from text_albumentations.tasks.bullets import bullet_augmentation
175
+
176
+ hf_model = AutoModelForCausalLM.from_pretrained(
177
+ "google/gemma-3-1b-it",
178
+ torch_dtype="auto",
179
+ device_map="auto",
180
+ )
181
+ hf_tokenizer = AutoTokenizer.from_pretrained("google/gemma-3-1b-it")
182
+
183
+ model = outlines.from_transformers(hf_model, hf_tokenizer)
184
+ runtime = OutlinesModel(model, max_tokens_parameter="max_new_tokens")
185
+
186
+ rows = run_augmentation("some passage", bullet_augmentation, runtime)
187
+ ```
188
+
189
+ See [`examples/example_transformers_gemma.py`](examples/example_transformers_gemma.py).
190
+
191
+ ### Long Text To JSONL
192
+
193
+ ```python
194
+ import openai
195
+ import outlines
196
+
197
+ from text_albumentations import OutlinesModel, save_long_text_dataset
198
+ from text_albumentations.tasks.bullets import bullet_augmentation
199
+
200
+ model = outlines.from_openai(openai.OpenAI(), "gpt-5.4-nano")
201
+ runtime = OutlinesModel(model, max_tokens_parameter="max_completion_tokens")
202
+
203
+ save_long_text_dataset(
204
+ text=long_text,
205
+ output_jsonl="out.jsonl",
206
+ augmentation=bullet_augmentation,
207
+ runtime=runtime,
208
+ chunk_size_chars=300,
209
+ )
210
+ ```
211
+
212
+ See [`examples/example_long_text_to_jsonl.py`](examples/example_long_text_to_jsonl.py).
213
+
214
+ ### Multiple Augmentations Over The Same Passage
215
+
216
+ ```python
217
+ import openai
218
+ import outlines
219
+
220
+ from text_albumentations import OutlinesModel, run_augmentation
221
+ from text_albumentations.tasks.bullets import bullet_augmentation
222
+ from text_albumentations.tasks.rephrase import rephrase_augmentation
223
+
224
+ model = outlines.from_openai(openai.OpenAI(), "gpt-5.4-nano")
225
+ runtime = OutlinesModel(model, max_tokens_parameter="max_completion_tokens")
226
+
227
+ rows = []
228
+ rows.extend(run_augmentation("some passage", bullet_augmentation, runtime))
229
+ rows.extend(run_augmentation("some passage", rephrase_augmentation, runtime))
230
+ ```
231
+
232
+ See [`examples/example_multiple_augmentations.py`](examples/example_multiple_augmentations.py).
233
+
234
+ ### Custom Preprocessing Model
235
+
236
+ You can also make the augmentation input itself be a custom Pydantic model instead of a raw string.
237
+
238
+ See [`examples/example_custom_preprocessing.py`](examples/example_custom_preprocessing.py).
239
+
240
+ ## Extensibility
241
+
242
+ The project is designed so users can extend it in layers.
243
+
244
+ ### 1. Add A New Augmentation
245
+
246
+ Subclass one of:
247
+
248
+ - `BaseSingleChunkAugmentation`
249
+ - `BaseMultiChunkAugmentation`
250
+
251
+ Define:
252
+
253
+ - a Pydantic schema
254
+ - a system prompt
255
+ - `build_user_message(...)`
256
+ - one or more response formats
257
+
258
+ ### 2. Add A New Response Format
259
+
260
+ Subclass `BaseResponseFormat` if you want to control:
261
+
262
+ - how the format modifies the system prompt
263
+ - how the final Alpaca `output` field is rendered
264
+
265
+ For common Alpaca row generation, `AlpacaResponseFormat` is usually enough.
266
+
267
+ ### 3. Add A New Adapter
268
+
269
+ Subclass `BaseAlpacaAdapter` to convert a typed structured output into one or more Alpaca rows.
270
+
271
+ One structured output can expand into multiple rows.
272
+
273
+ ### 4. Add A New Runtime
274
+
275
+ Implement `ModelRuntime` if you want to support a new backend.
276
+
277
+ That keeps model execution separate from:
278
+
279
+ - augmentation semantics
280
+ - prompt construction
281
+ - dataset adapters
282
+ - response serialization
283
+
284
+ This separation is intentional. The project should let you swap the model layer without rewriting the dataset logic.
285
+
286
+ ## Philosophy On Synthetic Data
287
+
288
+ This project does not assume synthetic data is magic.
289
+
290
+ It assumes:
291
+
292
+ - synthetic data works best when the task shape is explicit
293
+ - typed intermediate representations are easier to control
294
+ - simple priors beat vague giant prompts
295
+ - extensibility matters because different teams want different schemas, formats, and runtimes
296
+
297
+ The aim is not "generate random data."
298
+
299
+ The aim is to turn raw text into useful supervision signals for SFT and distillation in a way that is structured, inspectable, and easy to extend.
@@ -0,0 +1,24 @@
1
+ [build-system]
2
+ requires = ["setuptools>=77.0.3"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "text-albumentations"
7
+ version = "0.1.0"
8
+ description = "Structured synthetic text data generation for SFT and distillation."
9
+ readme = "README.md"
10
+ requires-python = ">=3.10"
11
+ license = "MIT"
12
+ authors = [
13
+ { name = "Avishek Biswas" },
14
+ ]
15
+ urls = { Repository = "https://github.com/avbiswas/text-albumentations" }
16
+ dependencies = [
17
+ "openai>=2.30.0",
18
+ "outlines>=1.2.12",
19
+ "pydantic>=2",
20
+ ]
21
+
22
+ [tool.setuptools.packages.find]
23
+ where = ["."]
24
+ include = ["text_albumentations*"]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+