cudag-0.3.10-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cudag/__init__.py +334 -0
- cudag/annotation/__init__.py +77 -0
- cudag/annotation/codegen.py +648 -0
- cudag/annotation/config.py +545 -0
- cudag/annotation/loader.py +342 -0
- cudag/annotation/scaffold.py +121 -0
- cudag/annotation/transcription.py +296 -0
- cudag/cli/__init__.py +5 -0
- cudag/cli/main.py +315 -0
- cudag/cli/new.py +873 -0
- cudag/core/__init__.py +364 -0
- cudag/core/button.py +137 -0
- cudag/core/canvas.py +222 -0
- cudag/core/config.py +70 -0
- cudag/core/coords.py +233 -0
- cudag/core/data_grid.py +804 -0
- cudag/core/dataset.py +678 -0
- cudag/core/distribution.py +136 -0
- cudag/core/drawing.py +75 -0
- cudag/core/fonts.py +156 -0
- cudag/core/generator.py +163 -0
- cudag/core/grid.py +367 -0
- cudag/core/grounding_task.py +247 -0
- cudag/core/icon.py +207 -0
- cudag/core/iconlist_task.py +301 -0
- cudag/core/models.py +1251 -0
- cudag/core/random.py +130 -0
- cudag/core/renderer.py +190 -0
- cudag/core/screen.py +402 -0
- cudag/core/scroll_task.py +254 -0
- cudag/core/scrollable_grid.py +447 -0
- cudag/core/state.py +110 -0
- cudag/core/task.py +293 -0
- cudag/core/taskbar.py +350 -0
- cudag/core/text.py +212 -0
- cudag/core/utils.py +82 -0
- cudag/data/surnames.txt +5000 -0
- cudag/modal_apps/__init__.py +4 -0
- cudag/modal_apps/archive.py +103 -0
- cudag/modal_apps/extract.py +138 -0
- cudag/modal_apps/preprocess.py +529 -0
- cudag/modal_apps/upload.py +317 -0
- cudag/prompts/SYSTEM_PROMPT.txt +104 -0
- cudag/prompts/__init__.py +33 -0
- cudag/prompts/system.py +43 -0
- cudag/prompts/tools.py +382 -0
- cudag/py.typed +0 -0
- cudag/schemas/filesystem.json +90 -0
- cudag/schemas/test_record.schema.json +113 -0
- cudag/schemas/train_record.schema.json +90 -0
- cudag/server/__init__.py +21 -0
- cudag/server/app.py +232 -0
- cudag/server/services/__init__.py +9 -0
- cudag/server/services/generator.py +128 -0
- cudag/templates/scripts/archive.sh +35 -0
- cudag/templates/scripts/build.sh +13 -0
- cudag/templates/scripts/extract.sh +54 -0
- cudag/templates/scripts/generate.sh +116 -0
- cudag/templates/scripts/pre-commit.sh +44 -0
- cudag/templates/scripts/preprocess.sh +46 -0
- cudag/templates/scripts/upload.sh +63 -0
- cudag/templates/scripts/verify.py +428 -0
- cudag/validation/__init__.py +35 -0
- cudag/validation/validate.py +508 -0
- cudag-0.3.10.dist-info/METADATA +570 -0
- cudag-0.3.10.dist-info/RECORD +69 -0
- cudag-0.3.10.dist-info/WHEEL +4 -0
- cudag-0.3.10.dist-info/entry_points.txt +2 -0
- cudag-0.3.10.dist-info/licenses/LICENSE +66 -0
cudag/modal_apps/preprocess.py
@@ -0,0 +1,529 @@
#!/usr/bin/env python3
# Copyright (c) 2025 Tylt LLC. All rights reserved.
# CONFIDENTIAL AND PROPRIETARY. Unauthorized use, copying, or distribution
# is strictly prohibited. For licensing inquiries: hello@claimhawk.app

"""
CUDAG Dataset Preprocessing on Modal

Preprocess the raw JSONL + images dataset on Modal's CPU instances.
Saves preprocessed tensors to a Modal volume for reuse across training runs.

Usage:
    modal run preprocess.py --dataset-name my-dataset
"""

import json
import multiprocessing
import sys
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path

import modal

# System prompt injected during preprocessing (not stored in training data)
# fmt: off
# ruff: noqa: E501
SYSTEM_PROMPT = """# Tools

You may call one or more functions to assist with the user query.

You are provided with function signatures within <tools></tools> XML tags:
<tools>
{
\t"type": "function",
\t"function": {
\t\t"name_for_human": "computer_use",
\t\t"name": "computer_use",
\t\t"description": "Perform computer actions",
\t\t"parameters": {
\t\t\t"properties": {
\t\t\t\t"action": {
\t\t\t\t\t"description": "* `key`: Press keys in order, release in reverse.\\n* `type`: Type a string of text.\\n* `mouse_move`: Move the cursor to (x, y).\\n* `left_click`: Left click at (x, y).\\n* `left_click_drag`: Click and drag from current to (x, y).\\n* `right_click`: Right click at (x, y).\\n* `middle_click`: Middle click at (x, y).\\n* `double_click`: Double-click at (x, y).\\n* `triple_click`: Triple-click at (x, y) (simulated as double-click).\\n* `scroll`: Scroll the mouse wheel.\\n* `hscroll`: Horizontal scroll.\\n* `wait`: Wait N seconds.\\n* `terminate`: End the task with a status.\\n* `answer`: Answer a question.",
\t\t\t\t\t"enum": ["key", "type", "mouse_move", "left_click", "left_click_drag", "right_click", "middle_click", "double_click", "scroll", "wait", "terminate"],
\t\t\t\t\t"type": "string"
\t\t\t\t},
\t\t\t\t"keys": {"description": "Required only by `action=key`.", "type": "array"},
\t\t\t\t"text": {"description": "Required only by `action=type`.", "type": "string"},
\t\t\t\t"coordinate": {"description": "Mouse coordinates (1000x1000 normalized).", "type": "array"},
\t\t\t\t"pixels": {"description": "The amount of scrolling.", "type": "number"},
\t\t\t\t"time": {"description": "The seconds to wait.", "type": "number"},
\t\t\t\t"status": {"description": "The status of the task.", "type": "string", "enum": ["success", "failure"]}
\t\t\t},
\t\t\t"required": ["action"],
\t\t\t"type": "object"
\t\t},
\t\t"args_format": "Format the arguments as a JSON object."
\t}
}
</tools>

For each function call, return a json object with function name and arguments within
<tool_call></tool_call> XML tags:
<tool_call>
{"name": <function-name>, "arguments": <args-json-object>}
</tool_call>

# Response format

Response format for every step:
1) Action: a short imperative describing what to do in the UI.
2) One or more <tool_call>...</tool_call> blocks, one per line, each containing only the JSON:
\t{"name": <function-name>, "arguments": <args-json-object>}.

Rules:
- Output exactly in the order: Action, <tool_call>(s).
- Be brief: one sentence for Action.
- Multiple tool calls can be output, one per line.
- Do not output anything else outside those parts.
- If finishing, use action=terminate in the tool call."""
# fmt: on

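# NOTE: illustrative example only, not part of the packaged file. An assistant
# turn that satisfies the prompt above would look like (button name invented):
#
#   Action: Click the "Save" button.
#   <tool_call>
#   {"name": "computer_use", "arguments": {"action": "left_click", "coordinate": [512, 340]}}
#   </tool_call>
#
# i.e. one short Action sentence, then one tool_call JSON object per line,
# with coordinates in the 1000x1000 normalized space described above.
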
# =============================================================================
# CENTRALIZED CONFIGURATION
# =============================================================================
# Volume names and model info are loaded from config/adapters.yaml via the SDK.
# Users can customize these by editing the YAML file.

try:
    from sdk.modal_compat import get_base_vlm, get_volume_name

    DEFAULT_VOLUME = get_volume_name("lora_training")
    BASE_MODEL = get_base_vlm()
except ImportError:
    # Fallback when SDK not available
    DEFAULT_VOLUME = "claimhawk-lora-training"
    BASE_MODEL = "Qwen/Qwen3-VL-8B-Instruct"


def _get_generator_name() -> str:
    """Extract generator name from --dataset-name arg for dynamic app naming."""
    for i, arg in enumerate(sys.argv):
        if arg == "--dataset-name" and i + 1 < len(sys.argv):
            ds_name = sys.argv[i + 1]
            # Generator name is first part before dash (e.g., "desktop" from "desktop-mike-...")
            return ds_name.split("-")[0] if ds_name else "cudag"
    return "cudag"


# Modal App Setup - dynamically named based on generator
app = modal.App(f"{_get_generator_name()}-preprocess")

# Volume - matches modal-volumes.md structure
VOLUME = modal.Volume.from_name(DEFAULT_VOLUME, create_if_missing=True)

# Docker Image with Dependencies (CPU-only, no GPU needed)
image = (
    modal.Image.debian_slim(python_version="3.12")
    .pip_install(
        "torch==2.4.0",
        "torchvision==0.19.0",
    )
    .pip_install(
        "transformers>=4.57.0",
        "qwen-vl-utils",
        "Pillow>=10.0.0",
        "numpy>=1.24.0",
        "tqdm>=4.65.0",
    )
)


@app.function(
    image=image,
    cpu=16,
    memory=32768,  # 32GB RAM
    timeout=7200,  # 2 hours max
    volumes={
        "/data": VOLUME,
    },
)
def preprocess_dataset_impl(dataset_name: str):
    """
    Preprocess the dataset on Modal CPU instance.

    Reads from: /data/datasets/{dataset_name}/
    Writes to: /data/preprocessed/{dataset_name}/
    """
    import torch
    from PIL import Image
    from qwen_vl_utils import process_vision_info
    from tqdm import tqdm
    from transformers import AutoProcessor

    # Reload the mounted volume to see latest committed data
    VOLUME.reload()

    # Paths according to modal-volumes.md structure
    data_root = Path("/data")
    dataset_path = data_root / "datasets" / dataset_name
    preprocessed_path = data_root / "preprocessed" / dataset_name

    print(f"\n{'='*80}")
    print(f"Starting CUDAG Preprocessing: {dataset_name}")
    print(f"{'='*80}\n")
    print(f"Dataset: {dataset_name}")
    print(f"Dataset path: {dataset_path}")
    print(f"Output path: {preprocessed_path}")

    # Debug: List what's in the directory
    print(f"\nListing contents of {dataset_path}:")
    if dataset_path.exists():
        all_files = list(dataset_path.iterdir())
        print(f"Found {len(all_files)} items:")
        for item in all_files[:20]:
            print(f"  - {item.name} ({'dir' if item.is_dir() else 'file'})")
    else:
        print("Directory does not exist!")
        print("Available in datasets/:")
        datasets_dir = data_root / "datasets"
        if datasets_dir.exists():
            for item in datasets_dir.iterdir():
                print(f"  - {item.name}")
        raise FileNotFoundError(f"Dataset not found: {dataset_path}")

    # Find train and val files
    train_files = list(dataset_path.glob("train*.jsonl"))
    val_files = list(dataset_path.glob("val*.jsonl"))
    test_files = list(dataset_path.glob("test*.jsonl"))
    data_files = list(dataset_path.glob("data.jsonl"))

    def load_jsonl(path):
        data = []
        with open(path) as f:
            for line in f:
                data.append(json.loads(line))
        return data

    # Priority: Use existing train/val or train/test splits if available
    if train_files and (val_files or test_files):
        train_path = train_files[0]
        val_path = val_files[0] if val_files else test_files[0]
        print("\nUsing existing dataset split:")
        print(f"  Train: {train_path.name}")
        print(f"  Val: {val_path.name}")
        train_data = load_jsonl(train_path)
        val_data = load_jsonl(val_path)
    elif data_files:
        print("\nFound single data.jsonl, auto-splitting 90/10...")
        all_data = load_jsonl(data_files[0])
        split_idx = int(len(all_data) * 0.9)
        train_data = all_data[:split_idx]
        val_data = all_data[split_idx:]
    else:
        raise FileNotFoundError(
            f"Could not find train*.jsonl/val*.jsonl or data.jsonl in {dataset_path}"
        )

    print("\nDataset size:")
    print(f"  Train samples: {len(train_data)}")
    print(f"  Val samples: {len(val_data)}")

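    # NOTE: illustrative record shape, not part of the packaged file. Based on how
    # samples are consumed below, each JSONL line looks roughly like:
    #
    #   {
    #     "image": "images/sample_0001.png",
    #     "conversations": [
    #       {"from": "human", "value": "<image>\nClick the OK button."},
    #       {"from": "gpt", "value": "Action: ...\n<tool_call>...</tool_call>"}
    #     ]
    #   }
    #
    # "from" values other than "system"/"human" are treated as assistant turns,
    # and a "<image>" placeholder in a message maps to the sample's cached image.
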
    # Load Processor
    print(f"\n{'='*80}")
    print("Loading Processor")
    print(f"{'='*80}\n")

    model_name = BASE_MODEL
    print(f"Loading processor: {model_name}")
    processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
    print("Processor loaded")

    # Cache Image Embeddings
    print(f"\n{'='*80}")
    print("Caching Image Embeddings")
    print(f"{'='*80}\n")

    unique_images = set()
    for sample in train_data + val_data:
        unique_images.add(sample["image"])

    total_samples = len(train_data) + len(val_data)
    print(f"Found {len(unique_images)} unique images (from {total_samples} total)")
    print(
        f"Reuse ratio: {total_samples / max(len(unique_images), 1):.1f}x"
    )

    image_cache = {}

    # Helper function for parallel image processing
    def process_single_image(img_path: str) -> tuple[str, dict | None]:
        """Process a single image and return (path, cached_data) or (path, None) on error."""
        img_path_str = str(img_path)

        # Handle nested paths - strip dataset name prefix if present
        base_name = dataset_name.split("/")[0] if "/" in dataset_name else dataset_name
        if img_path_str.startswith(f"{base_name}/"):
            img_path_str = img_path_str[len(base_name) + 1 :]

        full_path = dataset_path / img_path_str
        if not full_path.exists():
            return (img_path, None)

        try:
            image = Image.open(full_path)
            image_inputs, _ = process_vision_info(
                [
                    {
                        "role": "user",
                        "content": [{"type": "image", "image": f"file://{full_path}"}],
                    }
                ],
                image_patch_size=16,
            )

            return (
                img_path,
                {
                    "pixel_values": image_inputs[0] if image_inputs else None,
                    "image": image,
                },
            )
        except Exception as e:
            print(f"Warning: Failed to process {full_path}: {e}")
            return (img_path, None)

    # Process images in parallel using ThreadPoolExecutor
    print("\nProcessing unique images in parallel...")
    num_workers = min(8, multiprocessing.cpu_count())
    print(f"Using {num_workers} workers")

    sorted_images = sorted(unique_images)
    with ThreadPoolExecutor(max_workers=num_workers) as executor:
        future_to_path = {
            executor.submit(process_single_image, p): p for p in sorted_images
        }
        for future in tqdm(
            as_completed(future_to_path),
            total=len(sorted_images),
            desc="Caching images",
        ):
            img_path, cached_data = future.result()
            if cached_data is not None:
                image_cache[img_path] = cached_data

    print(f"Cached {len(image_cache)} images")

    # Preprocess Data
    print(f"\n{'='*80}")
    print("Preprocessing Data")
    print(f"{'='*80}\n")

    def prepare_sample(sample, image_cache):
        """Prepare a single sample for training."""
        img_path = sample["image"]
        if img_path not in image_cache:
            raise FileNotFoundError(f"Image not in cache: {img_path}")

        cached_image = image_cache[img_path]
        old_conversations = sample["conversations"]

        # Inject system prompt (not stored in training data)
        messages = [
            {"role": "system", "content": [{"type": "text", "text": SYSTEM_PROMPT}]}
        ]

        # Convert to Qwen-VL format, skipping any system prompts from dataset
        for msg in old_conversations:
            if msg["from"] == "system":
                # Skip - we inject our own system prompt above
                continue
            elif msg["from"] == "human":
                role = "user"
            else:
                role = "assistant"

            content_list = []
            value = msg["value"]

            if "<image>" in value:
                content_list.append({"type": "image"})
                text = value.replace("<image>", "").strip()
                if text:
                    content_list.append({"type": "text", "text": text})
            else:
                content_list.append({"type": "text", "text": value})

            messages.append({"role": role, "content": content_list})

        text = processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=False
        )

        image_inputs = (
            [cached_image["pixel_values"]]
            if cached_image["pixel_values"] is not None
            else None
        )

        model_inputs = processor(
            text=[text],
            images=image_inputs,
            videos=None,
            return_tensors="pt",
            padding=False,
            do_resize=False,
        )

        input_ids = (
            model_inputs["input_ids"][0]
            if isinstance(model_inputs["input_ids"][0], torch.Tensor)
            else torch.tensor(model_inputs["input_ids"][0])
        )
        attention_mask = (
            model_inputs["attention_mask"][0]
            if isinstance(model_inputs["attention_mask"][0], torch.Tensor)
            else torch.tensor(model_inputs["attention_mask"][0])
        )

        # Create labels: Only train on assistant responses
        ignore_index = -100
        labels = torch.full_like(input_ids, ignore_index)

        input_ids_list = input_ids.tolist()
        seq_len = len(input_ids_list)
        pos = 0

        while pos < seq_len:
            # Look for <|im_start|>assistant (token ID 77091)
            if input_ids_list[pos] == 77091:
                ans_start = pos + 2
                ans_end = ans_start

                # Find <|im_end|> (token ID 151645)
                while ans_end < seq_len and input_ids_list[ans_end] != 151645:
                    ans_end += 1

                if ans_end < seq_len:
                    labels[ans_start : ans_end + 2] = input_ids[ans_start : ans_end + 2]
                pos = ans_end
            pos += 1

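        # Annotation (not part of the packaged file): in the Qwen chat template each
        # assistant turn is rendered as "<|im_start|>assistant\n ... <|im_end|>\n".
        # The loop above finds the "assistant" role token (77091), starts the label
        # span two tokens later (skipping the newline after the role tag), and copies
        # tokens through <|im_end|> (151645) plus the token that follows it, so the
        # model also learns to emit the end-of-turn marker; all other positions stay
        # at -100 and are ignored by the loss.
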
        result = {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels,
        }

        if "pixel_values" in model_inputs:
            result["pixel_values"] = model_inputs["pixel_values"]
            result["image_grid_thw"] = model_inputs["image_grid_thw"]

        return result

    # Helper to process and save a single sample
    def process_and_save_sample(
        args: tuple[int, dict, Path],
    ) -> tuple[int, str | None, str | None]:
        """Process a single sample and save to disk. Returns (idx, path, error)."""
        idx, sample, output_dir = args
        try:
            processed = prepare_sample(sample, image_cache)

            processed_cpu = {
                "input_ids": processed["input_ids"].cpu(),
                "attention_mask": processed["attention_mask"].cpu(),
                "labels": processed["labels"].cpu(),
            }
            if "pixel_values" in processed:
                processed_cpu["pixel_values"] = processed["pixel_values"].cpu()
                processed_cpu["image_grid_thw"] = processed["image_grid_thw"].cpu()

            sample_path = output_dir / f"sample_{idx:06d}.pt"
            torch.save(processed_cpu, sample_path)
            return (idx, str(sample_path), None)
        except Exception as e:
            return (idx, None, str(e))

    # Process training data in parallel
    print("Processing training data in parallel...")
    train_output_dir = preprocessed_path / "train"
    train_output_dir.mkdir(parents=True, exist_ok=True)

    train_args = [(i, sample, train_output_dir) for i, sample in enumerate(train_data)]
    train_processed = []

    with ThreadPoolExecutor(max_workers=num_workers) as executor:
        futures = {executor.submit(process_and_save_sample, arg): arg[0] for arg in train_args}
        for future in tqdm(as_completed(futures), total=len(futures), desc="Train"):
            idx, path, error = future.result()
            if error:
                print(f"\nError processing train sample {idx}: {error}")
                raise RuntimeError(f"Failed to process sample {idx}: {error}")
            if path:
                train_processed.append(path)

    # Process validation data in parallel
    print("\nProcessing validation data in parallel...")
    val_output_dir = preprocessed_path / "val"
    val_output_dir.mkdir(parents=True, exist_ok=True)

    val_args = [(i, sample, val_output_dir) for i, sample in enumerate(val_data)]
    val_processed = []

    with ThreadPoolExecutor(max_workers=num_workers) as executor:
        futures = {executor.submit(process_and_save_sample, arg): arg[0] for arg in val_args}
        for future in tqdm(as_completed(futures), total=len(futures), desc="Val"):
            idx, path, error = future.result()
            if error:
                print(f"\nError processing val sample {idx}: {error}")
                raise RuntimeError(f"Failed to process sample {idx}: {error}")
            if path:
                val_processed.append(path)

    # Save metadata
    metadata = {
        "train_samples": len(train_processed),
        "val_samples": len(val_processed),
        "model_name": model_name,
        "dataset_name": dataset_name,
    }

    metadata_path = preprocessed_path / "metadata.json"
    with open(metadata_path, "w") as f:
        json.dump(metadata, f, indent=2)

    print("\nPreprocessing complete!")
    print(f"  Train samples: {len(train_processed)}")
    print(f"  Val samples: {len(val_processed)}")

    total_size = (
        sum(f.stat().st_size for f in preprocessed_path.rglob("*.pt")) / (1024**3)
    )
    print(f"  Total preprocessed size: {total_size:.2f} GB")

    # Commit volume changes
    VOLUME.commit()

    print(f"\nPreprocessed data saved to Modal volume: preprocessed/{dataset_name}")

    print(f"\n{'='*80}")
    print("PREPROCESSING COMPLETE!")
    print(f"{'='*80}\n")

    return {
        "train_samples": len(train_processed),
        "val_samples": len(val_processed),
        "total_size_gb": total_size,
    }


@app.local_entrypoint()
def main(dataset_name: str):
    """
    Local entrypoint for running preprocessing.

    Usage:
        modal run preprocess.py --dataset-name my-dataset
    """
    print(f"\n{'='*80}")
    print("Submitting preprocessing job to Modal...")
    print(f"Dataset: {dataset_name}")
    print(f"{'='*80}\n")

    result = preprocess_dataset_impl.remote(dataset_name)

    print(f"\n{'='*80}")
    print("Preprocessing job completed!")
    print(f"{'='*80}\n")
    print(f"Results: {result}")