text-albumentations 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- text_albumentations-0.1.0/LICENSE +21 -0
- text_albumentations-0.1.0/PKG-INFO +314 -0
- text_albumentations-0.1.0/README.md +299 -0
- text_albumentations-0.1.0/pyproject.toml +24 -0
- text_albumentations-0.1.0/setup.cfg +4 -0
- text_albumentations-0.1.0/text_albumentations/__init__.py +56 -0
- text_albumentations-0.1.0/text_albumentations/base.py +281 -0
- text_albumentations-0.1.0/text_albumentations/ingest.py +112 -0
- text_albumentations-0.1.0/text_albumentations/modeling.py +45 -0
- text_albumentations-0.1.0/text_albumentations/output_format_adapters/__init__.py +3 -0
- text_albumentations-0.1.0/text_albumentations/output_format_adapters/alpaca.py +15 -0
- text_albumentations-0.1.0/text_albumentations/response_formats/__init__.py +9 -0
- text_albumentations-0.1.0/text_albumentations/response_formats/base.py +40 -0
- text_albumentations-0.1.0/text_albumentations/runner.py +49 -0
- text_albumentations-0.1.0/text_albumentations/runtime.py +268 -0
- text_albumentations-0.1.0/text_albumentations/tasks/__init__.py +31 -0
- text_albumentations-0.1.0/text_albumentations/tasks/bullets.py +107 -0
- text_albumentations-0.1.0/text_albumentations/tasks/comparison.py +92 -0
- text_albumentations-0.1.0/text_albumentations/tasks/continuation.py +137 -0
- text_albumentations-0.1.0/text_albumentations/tasks/qa_pairs.py +228 -0
- text_albumentations-0.1.0/text_albumentations/tasks/rephrase.py +57 -0
- text_albumentations-0.1.0/text_albumentations/tasks/retrieval.py +386 -0
- text_albumentations-0.1.0/text_albumentations/tasks/triplets.py +122 -0
- text_albumentations-0.1.0/text_albumentations/utils.py +25 -0
- text_albumentations-0.1.0/text_albumentations.egg-info/PKG-INFO +314 -0
- text_albumentations-0.1.0/text_albumentations.egg-info/SOURCES.txt +27 -0
- text_albumentations-0.1.0/text_albumentations.egg-info/dependency_links.txt +1 -0
- text_albumentations-0.1.0/text_albumentations.egg-info/requires.txt +3 -0
- text_albumentations-0.1.0/text_albumentations.egg-info/top_level.txt +1 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Avishek Biswas
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,314 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: text-albumentations
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Structured synthetic text data generation for SFT and distillation.
|
|
5
|
+
Author: Avishek Biswas
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Repository, https://github.com/avbiswas/text-albumentations
|
|
8
|
+
Requires-Python: >=3.10
|
|
9
|
+
Description-Content-Type: text/markdown
|
|
10
|
+
License-File: LICENSE
|
|
11
|
+
Requires-Dist: openai>=2.30.0
|
|
12
|
+
Requires-Dist: outlines>=1.2.12
|
|
13
|
+
Requires-Dist: pydantic>=2
|
|
14
|
+
Dynamic: license-file
|
|
15
|
+
|
|
16
|
+
# text-albumentations
|
|
17
|
+
|
|
18
|
+
`text-albumentations` is a synthetic data generation engine for text.
|
|
19
|
+
|
|
20
|
+
The goal is to help generate instruction-tuning and distillation datasets from existing text corpora by applying structured augmentations over passages.
|
|
21
|
+
|
|
22
|
+
This is built for the practical case where good supervised fine-tuning often requires more examples than you already have, and where synthetic data generation is one of the fastest ways to create task-shaped training data from raw documents.
|
|
23
|
+
|
|
24
|
+
## Why This Exists
|
|
25
|
+
|
|
26
|
+
Modern LLM workflows often need:
|
|
27
|
+
|
|
28
|
+
- synthetic SFT data
|
|
29
|
+
- task-specific distillation data
|
|
30
|
+
- multiple renderings of the same semantic content
|
|
31
|
+
- structured supervision generated from long-form text
|
|
32
|
+
|
|
33
|
+
If you already have large amounts of text, you can usually derive many useful supervision targets from it:
|
|
34
|
+
|
|
35
|
+
- bullet-point summaries
|
|
36
|
+
- QA pairs
|
|
37
|
+
- rephrasings
|
|
38
|
+
- continuation tasks
|
|
39
|
+
- retrieval examples
|
|
40
|
+
- comparisons
|
|
41
|
+
- knowledge graph triplets
|
|
42
|
+
|
|
43
|
+
Instead of treating synthetic data generation as one giant prompt, this project breaks it into explicit, composable pieces.
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
## Ideology
|
|
47
|
+
|
|
48
|
+
The core idea is:
|
|
49
|
+
|
|
50
|
+
`structured generation + simple priors -> dataset`
|
|
51
|
+
|
|
52
|
+
Structured generation gives you typed intermediate outputs using Pydantic schemas.
|
|
53
|
+
|
|
54
|
+
Simple priors give you the task shape:
|
|
55
|
+
|
|
56
|
+
- "extract bullets"
|
|
57
|
+
- "produce QA pairs"
|
|
58
|
+
- "find the answering passage"
|
|
59
|
+
- "serialize the response as markdown/json/etc"
|
|
60
|
+
|
|
61
|
+
That combination is easier to reason about than unstructured free-form prompting. It also makes the pipeline more extensible: you can swap prompts, schemas, response formats, runtimes, and adapters without rewriting the whole system.
|
|
62
|
+
|
|
63
|
+
## Current Capabilities
|
|
64
|
+
|
|
65
|
+
The project currently supports:
|
|
66
|
+
|
|
67
|
+
- single-chunk augmentations
|
|
68
|
+
- multi-chunk augmentations
|
|
69
|
+
- typed structured outputs with Pydantic
|
|
70
|
+
- Alpaca-format dataset generation
|
|
71
|
+
- response-format control for the Alpaca `output` field
|
|
72
|
+
- sync and async generation runtimes
|
|
73
|
+
- Outlines-backed local models
|
|
74
|
+
- Outlines-backed OpenAI models
|
|
75
|
+
- long-text ingestion with fixed-size character chunking
|
|
76
|
+
- JSONL dataset writing
|
|
77
|
+
|
|
78
|
+
Built-in augmentation families include:
|
|
79
|
+
|
|
80
|
+
- bullets
|
|
81
|
+
- QA pairs
|
|
82
|
+
- rephrase
|
|
83
|
+
- continuation
|
|
84
|
+
- retrieval
|
|
85
|
+
- comparison
|
|
86
|
+
- triplets
|
|
87
|
+
|
|
88
|
+
## Architecture
|
|
89
|
+
|
|
90
|
+
The main abstractions are:
|
|
91
|
+
|
|
92
|
+
- `BaseSingleChunkAugmentation` and `BaseMultiChunkAugmentation`
|
|
93
|
+
These define the task contract: schema, prompt, response formats, generation knobs, and dataset construction.
|
|
94
|
+
|
|
95
|
+
- `BaseResponseFormat`
|
|
96
|
+
This controls how the Alpaca `output` field should be represented and can also modify the system prompt with format-specific instructions.
|
|
97
|
+
|
|
98
|
+
- `BaseAlpacaAdapter`
|
|
99
|
+
This converts typed structured outputs into Alpaca rows.
|
|
100
|
+
|
|
101
|
+
- `ModelRuntime`
|
|
102
|
+
This is the model execution interface. Current implementations support local Outlines models and OpenAI-through-Outlines models.
|
|
103
|
+
|
|
104
|
+
- `AugmentationRunner`
|
|
105
|
+
This binds together:
|
|
106
|
+
1. input data
|
|
107
|
+
2. a runtime
|
|
108
|
+
3. an augmentation
|
|
109
|
+
|
|
110
|
+
## Usage
|
|
111
|
+
|
|
112
|
+
### Minimal Local Example
|
|
113
|
+
|
|
114
|
+
```python
|
|
115
|
+
import mlx_lm
|
|
116
|
+
import outlines
|
|
117
|
+
|
|
118
|
+
from text_albumentations import OutlinesModel, run_augmentation
|
|
119
|
+
from text_albumentations.tasks.bullets import bullet_augmentation
|
|
120
|
+
|
|
121
|
+
model = outlines.from_mlxlm(*mlx_lm.load("mlx-community/Qwen3.5-4B-OptiQ-4bit"))
|
|
122
|
+
runtime = OutlinesModel(model=model)
|
|
123
|
+
|
|
124
|
+
rows = run_augmentation(
|
|
125
|
+
"The Transformer replaces recurrence with attention and improves parallelization.",
|
|
126
|
+
bullet_augmentation,
|
|
127
|
+
runtime,
|
|
128
|
+
)
|
|
129
|
+
|
|
130
|
+
for row in rows:
|
|
131
|
+
print(row.model_dump_json())
|
|
132
|
+
```
|
|
133
|
+
|
|
134
|
+
See [`examples/example_minimal.py`](https://github.com/avbiswas/text-albumentations/blob/HEAD/examples/example_minimal.py).
|
|
135
|
+
|
|
136
|
+
### OpenAI Sync
|
|
137
|
+
|
|
138
|
+
```python
|
|
139
|
+
import openai
|
|
140
|
+
import outlines
|
|
141
|
+
|
|
142
|
+
from text_albumentations import OutlinesModel, run_augmentation
|
|
143
|
+
from text_albumentations.tasks.bullets import bullet_augmentation
|
|
144
|
+
|
|
145
|
+
model = outlines.from_openai(openai.OpenAI(), "gpt-5.4-nano")
|
|
146
|
+
runtime = OutlinesModel(model, max_tokens_parameter="max_completion_tokens")
|
|
147
|
+
|
|
148
|
+
rows = run_augmentation("some passage", bullet_augmentation, runtime)
|
|
149
|
+
```
|
|
150
|
+
|
|
151
|
+
See [`examples/example_openai_sync.py`](https://github.com/avbiswas/text-albumentations/blob/HEAD/examples/example_openai_sync.py).
|
|
152
|
+
|
|
153
|
+
### OpenAI Async
|
|
154
|
+
|
|
155
|
+
```python
|
|
156
|
+
import asyncio
|
|
157
|
+
import openai
|
|
158
|
+
import outlines
|
|
159
|
+
|
|
160
|
+
from text_albumentations import OutlinesModel, arun_augmentation
|
|
161
|
+
from text_albumentations.tasks.bullets import bullet_augmentation
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
async def main():
|
|
165
|
+
model = outlines.from_openai(openai.AsyncOpenAI(), "gpt-5.4-nano")
|
|
166
|
+
runtime = OutlinesModel(
|
|
167
|
+
model,
|
|
168
|
+
async_mode=True,
|
|
169
|
+
total_concurrent_calls=4,
|
|
170
|
+
max_tokens_parameter="max_completion_tokens",
|
|
171
|
+
)
|
|
172
|
+
|
|
173
|
+
rows = await arun_augmentation("some passage", bullet_augmentation, runtime)
|
|
174
|
+
print(len(rows))
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
asyncio.run(main())
|
|
178
|
+
```
|
|
179
|
+
|
|
180
|
+
See [`examples/example_openai_async.py`](https://github.com/avbiswas/text-albumentations/blob/HEAD/examples/example_openai_async.py).
|
|
181
|
+
|
|
182
|
+
### Transformers Local Model
|
|
183
|
+
|
|
184
|
+
```python
|
|
185
|
+
import outlines
|
|
186
|
+
from transformers import AutoModelForCausalLM, AutoTokenizer
|
|
187
|
+
|
|
188
|
+
from text_albumentations import OutlinesModel, run_augmentation
|
|
189
|
+
from text_albumentations.tasks.bullets import bullet_augmentation
|
|
190
|
+
|
|
191
|
+
hf_model = AutoModelForCausalLM.from_pretrained(
|
|
192
|
+
"google/gemma-3-1b-it",
|
|
193
|
+
torch_dtype="auto",
|
|
194
|
+
device_map="auto",
|
|
195
|
+
)
|
|
196
|
+
hf_tokenizer = AutoTokenizer.from_pretrained("google/gemma-3-1b-it")
|
|
197
|
+
|
|
198
|
+
model = outlines.from_transformers(hf_model, hf_tokenizer)
|
|
199
|
+
runtime = OutlinesModel(model, max_tokens_parameter="max_new_tokens")
|
|
200
|
+
|
|
201
|
+
rows = run_augmentation("some passage", bullet_augmentation, runtime)
|
|
202
|
+
```
|
|
203
|
+
|
|
204
|
+
See [`examples/example_transformers_gemma.py`](https://github.com/avbiswas/text-albumentations/blob/HEAD/examples/example_transformers_gemma.py).
|
|
205
|
+
|
|
206
|
+
### Long Text To JSONL
|
|
207
|
+
|
|
208
|
+
```python
|
|
209
|
+
import openai
|
|
210
|
+
import outlines
|
|
211
|
+
|
|
212
|
+
from text_albumentations import OutlinesModel, save_long_text_dataset
|
|
213
|
+
from text_albumentations.tasks.bullets import bullet_augmentation
|
|
214
|
+
|
|
215
|
+
model = outlines.from_openai(openai.OpenAI(), "gpt-5.4-nano")
|
|
216
|
+
runtime = OutlinesModel(model, max_tokens_parameter="max_completion_tokens")
|
|
217
|
+
|
|
218
|
+
save_long_text_dataset(
|
|
219
|
+
text=long_text,
|
|
220
|
+
output_jsonl="out.jsonl",
|
|
221
|
+
augmentation=bullet_augmentation,
|
|
222
|
+
runtime=runtime,
|
|
223
|
+
chunk_size_chars=300,
|
|
224
|
+
)
|
|
225
|
+
```
|
|
226
|
+
|
|
227
|
+
See [`examples/example_long_text_to_jsonl.py`](https://github.com/avbiswas/text-albumentations/blob/HEAD/examples/example_long_text_to_jsonl.py).
|
|
228
|
+
|
|
229
|
+
### Multiple Augmentations Over The Same Passage
|
|
230
|
+
|
|
231
|
+
```python
|
|
232
|
+
import openai
|
|
233
|
+
import outlines
|
|
234
|
+
|
|
235
|
+
from text_albumentations import OutlinesModel, run_augmentation
|
|
236
|
+
from text_albumentations.tasks.bullets import bullet_augmentation
|
|
237
|
+
from text_albumentations.tasks.rephrase import rephrase_augmentation
|
|
238
|
+
|
|
239
|
+
model = outlines.from_openai(openai.OpenAI(), "gpt-5.4-nano")
|
|
240
|
+
runtime = OutlinesModel(model, max_tokens_parameter="max_completion_tokens")
|
|
241
|
+
|
|
242
|
+
rows = []
|
|
243
|
+
rows.extend(run_augmentation("some passage", bullet_augmentation, runtime))
|
|
244
|
+
rows.extend(run_augmentation("some passage", rephrase_augmentation, runtime))
|
|
245
|
+
```
|
|
246
|
+
|
|
247
|
+
See [`examples/example_multiple_augmentations.py`](https://github.com/avbiswas/text-albumentations/blob/HEAD/examples/example_multiple_augmentations.py).
|
|
248
|
+
|
|
249
|
+
### Custom Preprocessing Model
|
|
250
|
+
|
|
251
|
+
You can also make the augmentation input itself be a custom Pydantic model instead of a raw string.
|
|
252
|
+
|
|
253
|
+
See [`examples/example_custom_preprocessing.py`](https://github.com/avbiswas/text-albumentations/blob/HEAD/examples/example_custom_preprocessing.py).
|
|
254
|
+
|
|
255
|
+
## Extensibility
|
|
256
|
+
|
|
257
|
+
The project is designed so users can extend it in layers.
|
|
258
|
+
|
|
259
|
+
### 1. Add A New Augmentation
|
|
260
|
+
|
|
261
|
+
Subclass one of:
|
|
262
|
+
|
|
263
|
+
- `BaseSingleChunkAugmentation`
|
|
264
|
+
- `BaseMultiChunkAugmentation`
|
|
265
|
+
|
|
266
|
+
Define:
|
|
267
|
+
|
|
268
|
+
- a Pydantic schema
|
|
269
|
+
- a system prompt
|
|
270
|
+
- `build_user_message(...)`
|
|
271
|
+
- one or more response formats
|
|
272
|
+
|
|
273
|
+
### 2. Add A New Response Format
|
|
274
|
+
|
|
275
|
+
Subclass `BaseResponseFormat` if you want to control:
|
|
276
|
+
|
|
277
|
+
- how the format modifies the system prompt
|
|
278
|
+
- how the final Alpaca `output` field is rendered
|
|
279
|
+
|
|
280
|
+
For common Alpaca row generation, `AlpacaResponseFormat` is usually enough.
|
|
281
|
+
|
|
282
|
+
### 3. Add A New Adapter
|
|
283
|
+
|
|
284
|
+
Subclass `BaseAlpacaAdapter` to convert a typed structured output into one or more Alpaca rows.
|
|
285
|
+
|
|
286
|
+
One structured output can expand into multiple rows.
|
|
287
|
+
|
|
288
|
+
### 4. Add A New Runtime
|
|
289
|
+
|
|
290
|
+
Implement `ModelRuntime` if you want to support a new backend.
|
|
291
|
+
|
|
292
|
+
That keeps model execution separate from:
|
|
293
|
+
|
|
294
|
+
- augmentation semantics
|
|
295
|
+
- prompt construction
|
|
296
|
+
- dataset adapters
|
|
297
|
+
- response serialization
|
|
298
|
+
|
|
299
|
+
This separation is intentional. The project should let you swap the model layer without rewriting the dataset logic.
|
|
300
|
+
|
|
301
|
+
## Philosophy On Synthetic Data
|
|
302
|
+
|
|
303
|
+
This project does not assume synthetic data is magic.
|
|
304
|
+
|
|
305
|
+
It assumes:
|
|
306
|
+
|
|
307
|
+
- synthetic data works best when the task shape is explicit
|
|
308
|
+
- typed intermediate representations are easier to control
|
|
309
|
+
- simple priors beat vague giant prompts
|
|
310
|
+
- extensibility matters because different teams want different schemas, formats, and runtimes
|
|
311
|
+
|
|
312
|
+
The aim is not "generate random data."
|
|
313
|
+
|
|
314
|
+
The aim is to turn raw text into useful supervision signals for SFT and distillation in a way that is structured, inspectable, and easy to extend.
|
|
@@ -0,0 +1,299 @@
|
|
|
1
|
+
# text-albumentations
|
|
2
|
+
|
|
3
|
+
`text-albumentations` is a synthetic data generation engine for text.
|
|
4
|
+
|
|
5
|
+
The goal is to help generate instruction-tuning and distillation datasets from existing text corpora by applying structured augmentations over passages.
|
|
6
|
+
|
|
7
|
+
This is built for the practical case where good supervised fine-tuning often requires more examples than you already have, and where synthetic data generation is one of the fastest ways to create task-shaped training data from raw documents.
|
|
8
|
+
|
|
9
|
+
## Why This Exists
|
|
10
|
+
|
|
11
|
+
Modern LLM workflows often need:
|
|
12
|
+
|
|
13
|
+
- synthetic SFT data
|
|
14
|
+
- task-specific distillation data
|
|
15
|
+
- multiple renderings of the same semantic content
|
|
16
|
+
- structured supervision generated from long-form text
|
|
17
|
+
|
|
18
|
+
If you already have large amounts of text, you can usually derive many useful supervision targets from it:
|
|
19
|
+
|
|
20
|
+
- bullet-point summaries
|
|
21
|
+
- QA pairs
|
|
22
|
+
- rephrasings
|
|
23
|
+
- continuation tasks
|
|
24
|
+
- retrieval examples
|
|
25
|
+
- comparisons
|
|
26
|
+
- knowledge graph triplets
|
|
27
|
+
|
|
28
|
+
Instead of treating synthetic data generation as one giant prompt, this project breaks it into explicit, composable pieces.
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
## Ideology
|
|
32
|
+
|
|
33
|
+
The core idea is:
|
|
34
|
+
|
|
35
|
+
`structured generation + simple priors -> dataset`
|
|
36
|
+
|
|
37
|
+
Structured generation gives you typed intermediate outputs using Pydantic schemas.
|
|
38
|
+
|
|
39
|
+
Simple priors give you the task shape:
|
|
40
|
+
|
|
41
|
+
- "extract bullets"
|
|
42
|
+
- "produce QA pairs"
|
|
43
|
+
- "find the answering passage"
|
|
44
|
+
- "serialize the response as markdown/json/etc"
|
|
45
|
+
|
|
46
|
+
That combination is easier to reason about than unstructured free-form prompting. It also makes the pipeline more extensible: you can swap prompts, schemas, response formats, runtimes, and adapters without rewriting the whole system.
|
|
47
|
+
|
|
48
|
+
## Current Capabilities
|
|
49
|
+
|
|
50
|
+
The project currently supports:
|
|
51
|
+
|
|
52
|
+
- single-chunk augmentations
|
|
53
|
+
- multi-chunk augmentations
|
|
54
|
+
- typed structured outputs with Pydantic
|
|
55
|
+
- Alpaca-format dataset generation
|
|
56
|
+
- response-format control for the Alpaca `output` field
|
|
57
|
+
- sync and async generation runtimes
|
|
58
|
+
- Outlines-backed local models
|
|
59
|
+
- Outlines-backed OpenAI models
|
|
60
|
+
- long-text ingestion with fixed-size character chunking
|
|
61
|
+
- JSONL dataset writing
|
|
62
|
+
|
|
63
|
+
Built-in augmentation families include:
|
|
64
|
+
|
|
65
|
+
- bullets
|
|
66
|
+
- QA pairs
|
|
67
|
+
- rephrase
|
|
68
|
+
- continuation
|
|
69
|
+
- retrieval
|
|
70
|
+
- comparison
|
|
71
|
+
- triplets
|
|
72
|
+
|
|
73
|
+
## Architecture
|
|
74
|
+
|
|
75
|
+
The main abstractions are:
|
|
76
|
+
|
|
77
|
+
- `BaseSingleChunkAugmentation` and `BaseMultiChunkAugmentation`
|
|
78
|
+
These define the task contract: schema, prompt, response formats, generation knobs, and dataset construction.
|
|
79
|
+
|
|
80
|
+
- `BaseResponseFormat`
|
|
81
|
+
This controls how the Alpaca `output` field should be represented and can also modify the system prompt with format-specific instructions.
|
|
82
|
+
|
|
83
|
+
- `BaseAlpacaAdapter`
|
|
84
|
+
This converts typed structured outputs into Alpaca rows.
|
|
85
|
+
|
|
86
|
+
- `ModelRuntime`
|
|
87
|
+
This is the model execution interface. Current implementations support local Outlines models and OpenAI-through-Outlines models.
|
|
88
|
+
|
|
89
|
+
- `AugmentationRunner`
|
|
90
|
+
This binds together:
|
|
91
|
+
1. input data
|
|
92
|
+
2. a runtime
|
|
93
|
+
3. an augmentation
|
|
94
|
+
|
|
95
|
+
## Usage
|
|
96
|
+
|
|
97
|
+
### Minimal Local Example
|
|
98
|
+
|
|
99
|
+
```python
|
|
100
|
+
import mlx_lm
|
|
101
|
+
import outlines
|
|
102
|
+
|
|
103
|
+
from text_albumentations import OutlinesModel, run_augmentation
|
|
104
|
+
from text_albumentations.tasks.bullets import bullet_augmentation
|
|
105
|
+
|
|
106
|
+
model = outlines.from_mlxlm(*mlx_lm.load("mlx-community/Qwen3.5-4B-OptiQ-4bit"))
|
|
107
|
+
runtime = OutlinesModel(model=model)
|
|
108
|
+
|
|
109
|
+
rows = run_augmentation(
|
|
110
|
+
"The Transformer replaces recurrence with attention and improves parallelization.",
|
|
111
|
+
bullet_augmentation,
|
|
112
|
+
runtime,
|
|
113
|
+
)
|
|
114
|
+
|
|
115
|
+
for row in rows:
|
|
116
|
+
print(row.model_dump_json())
|
|
117
|
+
```
|
|
118
|
+
|
|
119
|
+
See [`examples/example_minimal.py`](https://github.com/avbiswas/text-albumentations/blob/HEAD/examples/example_minimal.py).
|
|
120
|
+
|
|
121
|
+
### OpenAI Sync
|
|
122
|
+
|
|
123
|
+
```python
|
|
124
|
+
import openai
|
|
125
|
+
import outlines
|
|
126
|
+
|
|
127
|
+
from text_albumentations import OutlinesModel, run_augmentation
|
|
128
|
+
from text_albumentations.tasks.bullets import bullet_augmentation
|
|
129
|
+
|
|
130
|
+
model = outlines.from_openai(openai.OpenAI(), "gpt-5.4-nano")
|
|
131
|
+
runtime = OutlinesModel(model, max_tokens_parameter="max_completion_tokens")
|
|
132
|
+
|
|
133
|
+
rows = run_augmentation("some passage", bullet_augmentation, runtime)
|
|
134
|
+
```
|
|
135
|
+
|
|
136
|
+
See [`examples/example_openai_sync.py`](https://github.com/avbiswas/text-albumentations/blob/HEAD/examples/example_openai_sync.py).
|
|
137
|
+
|
|
138
|
+
### OpenAI Async
|
|
139
|
+
|
|
140
|
+
```python
|
|
141
|
+
import asyncio
|
|
142
|
+
import openai
|
|
143
|
+
import outlines
|
|
144
|
+
|
|
145
|
+
from text_albumentations import OutlinesModel, arun_augmentation
|
|
146
|
+
from text_albumentations.tasks.bullets import bullet_augmentation
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
async def main():
|
|
150
|
+
model = outlines.from_openai(openai.AsyncOpenAI(), "gpt-5.4-nano")
|
|
151
|
+
runtime = OutlinesModel(
|
|
152
|
+
model,
|
|
153
|
+
async_mode=True,
|
|
154
|
+
total_concurrent_calls=4,
|
|
155
|
+
max_tokens_parameter="max_completion_tokens",
|
|
156
|
+
)
|
|
157
|
+
|
|
158
|
+
rows = await arun_augmentation("some passage", bullet_augmentation, runtime)
|
|
159
|
+
print(len(rows))
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
asyncio.run(main())
|
|
163
|
+
```
|
|
164
|
+
|
|
165
|
+
See [`examples/example_openai_async.py`](https://github.com/avbiswas/text-albumentations/blob/HEAD/examples/example_openai_async.py).
|
|
166
|
+
|
|
167
|
+
### Transformers Local Model
|
|
168
|
+
|
|
169
|
+
```python
|
|
170
|
+
import outlines
|
|
171
|
+
from transformers import AutoModelForCausalLM, AutoTokenizer
|
|
172
|
+
|
|
173
|
+
from text_albumentations import OutlinesModel, run_augmentation
|
|
174
|
+
from text_albumentations.tasks.bullets import bullet_augmentation
|
|
175
|
+
|
|
176
|
+
hf_model = AutoModelForCausalLM.from_pretrained(
|
|
177
|
+
"google/gemma-3-1b-it",
|
|
178
|
+
torch_dtype="auto",
|
|
179
|
+
device_map="auto",
|
|
180
|
+
)
|
|
181
|
+
hf_tokenizer = AutoTokenizer.from_pretrained("google/gemma-3-1b-it")
|
|
182
|
+
|
|
183
|
+
model = outlines.from_transformers(hf_model, hf_tokenizer)
|
|
184
|
+
runtime = OutlinesModel(model, max_tokens_parameter="max_new_tokens")
|
|
185
|
+
|
|
186
|
+
rows = run_augmentation("some passage", bullet_augmentation, runtime)
|
|
187
|
+
```
|
|
188
|
+
|
|
189
|
+
See [`examples/example_transformers_gemma.py`](https://github.com/avbiswas/text-albumentations/blob/HEAD/examples/example_transformers_gemma.py).
|
|
190
|
+
|
|
191
|
+
### Long Text To JSONL
|
|
192
|
+
|
|
193
|
+
```python
|
|
194
|
+
import openai
|
|
195
|
+
import outlines
|
|
196
|
+
|
|
197
|
+
from text_albumentations import OutlinesModel, save_long_text_dataset
|
|
198
|
+
from text_albumentations.tasks.bullets import bullet_augmentation
|
|
199
|
+
|
|
200
|
+
model = outlines.from_openai(openai.OpenAI(), "gpt-5.4-nano")
|
|
201
|
+
runtime = OutlinesModel(model, max_tokens_parameter="max_completion_tokens")
|
|
202
|
+
|
|
203
|
+
save_long_text_dataset(
|
|
204
|
+
text=long_text,
|
|
205
|
+
output_jsonl="out.jsonl",
|
|
206
|
+
augmentation=bullet_augmentation,
|
|
207
|
+
runtime=runtime,
|
|
208
|
+
chunk_size_chars=300,
|
|
209
|
+
)
|
|
210
|
+
```
|
|
211
|
+
|
|
212
|
+
See [`examples/example_long_text_to_jsonl.py`](https://github.com/avbiswas/text-albumentations/blob/HEAD/examples/example_long_text_to_jsonl.py).
|
|
213
|
+
|
|
214
|
+
### Multiple Augmentations Over The Same Passage
|
|
215
|
+
|
|
216
|
+
```python
|
|
217
|
+
import openai
|
|
218
|
+
import outlines
|
|
219
|
+
|
|
220
|
+
from text_albumentations import OutlinesModel, run_augmentation
|
|
221
|
+
from text_albumentations.tasks.bullets import bullet_augmentation
|
|
222
|
+
from text_albumentations.tasks.rephrase import rephrase_augmentation
|
|
223
|
+
|
|
224
|
+
model = outlines.from_openai(openai.OpenAI(), "gpt-5.4-nano")
|
|
225
|
+
runtime = OutlinesModel(model, max_tokens_parameter="max_completion_tokens")
|
|
226
|
+
|
|
227
|
+
rows = []
|
|
228
|
+
rows.extend(run_augmentation("some passage", bullet_augmentation, runtime))
|
|
229
|
+
rows.extend(run_augmentation("some passage", rephrase_augmentation, runtime))
|
|
230
|
+
```
|
|
231
|
+
|
|
232
|
+
See [`examples/example_multiple_augmentations.py`](https://github.com/avbiswas/text-albumentations/blob/HEAD/examples/example_multiple_augmentations.py).
|
|
233
|
+
|
|
234
|
+
### Custom Preprocessing Model
|
|
235
|
+
|
|
236
|
+
You can also make the augmentation input itself be a custom Pydantic model instead of a raw string.
|
|
237
|
+
|
|
238
|
+
See [`examples/example_custom_preprocessing.py`](https://github.com/avbiswas/text-albumentations/blob/HEAD/examples/example_custom_preprocessing.py).
|
|
239
|
+
|
|
240
|
+
## Extensibility
|
|
241
|
+
|
|
242
|
+
The project is designed so users can extend it in layers.
|
|
243
|
+
|
|
244
|
+
### 1. Add A New Augmentation
|
|
245
|
+
|
|
246
|
+
Subclass one of:
|
|
247
|
+
|
|
248
|
+
- `BaseSingleChunkAugmentation`
|
|
249
|
+
- `BaseMultiChunkAugmentation`
|
|
250
|
+
|
|
251
|
+
Define:
|
|
252
|
+
|
|
253
|
+
- a Pydantic schema
|
|
254
|
+
- a system prompt
|
|
255
|
+
- `build_user_message(...)`
|
|
256
|
+
- one or more response formats
|
|
257
|
+
|
|
258
|
+
### 2. Add A New Response Format
|
|
259
|
+
|
|
260
|
+
Subclass `BaseResponseFormat` if you want to control:
|
|
261
|
+
|
|
262
|
+
- how the format modifies the system prompt
|
|
263
|
+
- how the final Alpaca `output` field is rendered
|
|
264
|
+
|
|
265
|
+
For common Alpaca row generation, `AlpacaResponseFormat` is usually enough.
|
|
266
|
+
|
|
267
|
+
### 3. Add A New Adapter
|
|
268
|
+
|
|
269
|
+
Subclass `BaseAlpacaAdapter` to convert a typed structured output into one or more Alpaca rows.
|
|
270
|
+
|
|
271
|
+
One structured output can expand into multiple rows.
|
|
272
|
+
|
|
273
|
+
### 4. Add A New Runtime
|
|
274
|
+
|
|
275
|
+
Implement `ModelRuntime` if you want to support a new backend.
|
|
276
|
+
|
|
277
|
+
That keeps model execution separate from:
|
|
278
|
+
|
|
279
|
+
- augmentation semantics
|
|
280
|
+
- prompt construction
|
|
281
|
+
- dataset adapters
|
|
282
|
+
- response serialization
|
|
283
|
+
|
|
284
|
+
This separation is intentional. The project should let you swap the model layer without rewriting the dataset logic.
|
|
285
|
+
|
|
286
|
+
## Philosophy On Synthetic Data
|
|
287
|
+
|
|
288
|
+
This project does not assume synthetic data is magic.
|
|
289
|
+
|
|
290
|
+
It assumes:
|
|
291
|
+
|
|
292
|
+
- synthetic data works best when the task shape is explicit
|
|
293
|
+
- typed intermediate representations are easier to control
|
|
294
|
+
- simple priors beat vague giant prompts
|
|
295
|
+
- extensibility matters because different teams want different schemas, formats, and runtimes
|
|
296
|
+
|
|
297
|
+
The aim is not "generate random data."
|
|
298
|
+
|
|
299
|
+
The aim is to turn raw text into useful supervision signals for SFT and distillation in a way that is structured, inspectable, and easy to extend.
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=68"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "text-albumentations"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Structured synthetic text data generation for SFT and distillation."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.10"
|
|
11
|
+
license = "MIT"
|
|
12
|
+
authors = [
|
|
13
|
+
{ name = "Avishek Biswas" },
|
|
14
|
+
]
|
|
15
|
+
urls = { Repository = "https://github.com/avbiswas/text-albumentations" }
|
|
16
|
+
dependencies = [
|
|
17
|
+
"openai>=2.30.0",
|
|
18
|
+
"outlines>=1.2.12",
|
|
19
|
+
"pydantic>=2",
|
|
20
|
+
]
|
|
21
|
+
|
|
22
|
+
[tool.setuptools.packages.find]
|
|
23
|
+
where = ["."]
|
|
24
|
+
include = ["text_albumentations*"]
|