cudag-0.3.10-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69)
  1. cudag/__init__.py +334 -0
  2. cudag/annotation/__init__.py +77 -0
  3. cudag/annotation/codegen.py +648 -0
  4. cudag/annotation/config.py +545 -0
  5. cudag/annotation/loader.py +342 -0
  6. cudag/annotation/scaffold.py +121 -0
  7. cudag/annotation/transcription.py +296 -0
  8. cudag/cli/__init__.py +5 -0
  9. cudag/cli/main.py +315 -0
  10. cudag/cli/new.py +873 -0
  11. cudag/core/__init__.py +364 -0
  12. cudag/core/button.py +137 -0
  13. cudag/core/canvas.py +222 -0
  14. cudag/core/config.py +70 -0
  15. cudag/core/coords.py +233 -0
  16. cudag/core/data_grid.py +804 -0
  17. cudag/core/dataset.py +678 -0
  18. cudag/core/distribution.py +136 -0
  19. cudag/core/drawing.py +75 -0
  20. cudag/core/fonts.py +156 -0
  21. cudag/core/generator.py +163 -0
  22. cudag/core/grid.py +367 -0
  23. cudag/core/grounding_task.py +247 -0
  24. cudag/core/icon.py +207 -0
  25. cudag/core/iconlist_task.py +301 -0
  26. cudag/core/models.py +1251 -0
  27. cudag/core/random.py +130 -0
  28. cudag/core/renderer.py +190 -0
  29. cudag/core/screen.py +402 -0
  30. cudag/core/scroll_task.py +254 -0
  31. cudag/core/scrollable_grid.py +447 -0
  32. cudag/core/state.py +110 -0
  33. cudag/core/task.py +293 -0
  34. cudag/core/taskbar.py +350 -0
  35. cudag/core/text.py +212 -0
  36. cudag/core/utils.py +82 -0
  37. cudag/data/surnames.txt +5000 -0
  38. cudag/modal_apps/__init__.py +4 -0
  39. cudag/modal_apps/archive.py +103 -0
  40. cudag/modal_apps/extract.py +138 -0
  41. cudag/modal_apps/preprocess.py +529 -0
  42. cudag/modal_apps/upload.py +317 -0
  43. cudag/prompts/SYSTEM_PROMPT.txt +104 -0
  44. cudag/prompts/__init__.py +33 -0
  45. cudag/prompts/system.py +43 -0
  46. cudag/prompts/tools.py +382 -0
  47. cudag/py.typed +0 -0
  48. cudag/schemas/filesystem.json +90 -0
  49. cudag/schemas/test_record.schema.json +113 -0
  50. cudag/schemas/train_record.schema.json +90 -0
  51. cudag/server/__init__.py +21 -0
  52. cudag/server/app.py +232 -0
  53. cudag/server/services/__init__.py +9 -0
  54. cudag/server/services/generator.py +128 -0
  55. cudag/templates/scripts/archive.sh +35 -0
  56. cudag/templates/scripts/build.sh +13 -0
  57. cudag/templates/scripts/extract.sh +54 -0
  58. cudag/templates/scripts/generate.sh +116 -0
  59. cudag/templates/scripts/pre-commit.sh +44 -0
  60. cudag/templates/scripts/preprocess.sh +46 -0
  61. cudag/templates/scripts/upload.sh +63 -0
  62. cudag/templates/scripts/verify.py +428 -0
  63. cudag/validation/__init__.py +35 -0
  64. cudag/validation/validate.py +508 -0
  65. cudag-0.3.10.dist-info/METADATA +570 -0
  66. cudag-0.3.10.dist-info/RECORD +69 -0
  67. cudag-0.3.10.dist-info/WHEEL +4 -0
  68. cudag-0.3.10.dist-info/entry_points.txt +2 -0
  69. cudag-0.3.10.dist-info/licenses/LICENSE +66 -0
cudag/modal_apps/preprocess.py
@@ -0,0 +1,529 @@
+ #!/usr/bin/env python3
+ # Copyright (c) 2025 Tylt LLC. All rights reserved.
+ # CONFIDENTIAL AND PROPRIETARY. Unauthorized use, copying, or distribution
+ # is strictly prohibited. For licensing inquiries: hello@claimhawk.app
+
+ """
+ CUDAG Dataset Preprocessing on Modal
+
+ Preprocess the raw JSONL + images dataset on Modal's CPU instances.
+ Saves preprocessed tensors to a Modal volume for reuse across training runs.
+
+ Usage:
+     modal run preprocess.py --dataset-name my-dataset
+ """
+
+ import json
+ import multiprocessing
+ import sys
+ from concurrent.futures import ThreadPoolExecutor, as_completed
+ from pathlib import Path
+
+ import modal
+
+ # System prompt injected during preprocessing (not stored in training data)
+ # fmt: off
+ # ruff: noqa: E501
+ SYSTEM_PROMPT = """# Tools
+
+ You may call one or more functions to assist with the user query.
+
+ You are provided with function signatures within <tools></tools> XML tags:
+ <tools>
+ {
+ \t"type": "function",
+ \t"function": {
+ \t\t"name_for_human": "computer_use",
+ \t\t"name": "computer_use",
+ \t\t"description": "Perform computer actions",
+ \t\t"parameters": {
+ \t\t\t"properties": {
+ \t\t\t\t"action": {
+ \t\t\t\t\t"description": "* `key`: Press keys in order, release in reverse.\\n* `type`: Type a string of text.\\n* `mouse_move`: Move the cursor to (x, y).\\n* `left_click`: Left click at (x, y).\\n* `left_click_drag`: Click and drag from current to (x, y).\\n* `right_click`: Right click at (x, y).\\n* `middle_click`: Middle click at (x, y).\\n* `double_click`: Double-click at (x, y).\\n* `triple_click`: Triple-click at (x, y) (simulated as double-click).\\n* `scroll`: Scroll the mouse wheel.\\n* `hscroll`: Horizontal scroll.\\n* `wait`: Wait N seconds.\\n* `terminate`: End the task with a status.\\n* `answer`: Answer a question.",
+ \t\t\t\t\t"enum": ["key", "type", "mouse_move", "left_click", "left_click_drag", "right_click", "middle_click", "double_click", "scroll", "wait", "terminate"],
+ \t\t\t\t\t"type": "string"
+ \t\t\t\t},
+ \t\t\t\t"keys": {"description": "Required only by `action=key`.", "type": "array"},
+ \t\t\t\t"text": {"description": "Required only by `action=type`.", "type": "string"},
+ \t\t\t\t"coordinate": {"description": "Mouse coordinates (1000x1000 normalized).", "type": "array"},
+ \t\t\t\t"pixels": {"description": "The amount of scrolling.", "type": "number"},
+ \t\t\t\t"time": {"description": "The seconds to wait.", "type": "number"},
+ \t\t\t\t"status": {"description": "The status of the task.", "type": "string", "enum": ["success", "failure"]}
+ \t\t\t},
+ \t\t\t"required": ["action"],
+ \t\t\t"type": "object"
+ \t\t},
+ \t\t"args_format": "Format the arguments as a JSON object."
+ \t}
+ }
+ </tools>
+
+ For each function call, return a json object with function name and arguments within
+ <tool_call></tool_call> XML tags:
+ <tool_call>
+ {"name": <function-name>, "arguments": <args-json-object>}
+ </tool_call>
+
+ # Response format
+
+ Response format for every step:
+ 1) Action: a short imperative describing what to do in the UI.
+ 2) One or more <tool_call>...</tool_call> blocks, one per line, each containing only the JSON:
+ \t{"name": <function-name>, "arguments": <args-json-object>}.
+
+ Rules:
+ - Output exactly in the order: Action, <tool_call>(s).
+ - Be brief: one sentence for Action.
+ - Multiple tool calls can be output, one per line.
+ - Do not output anything else outside those parts.
+ - If finishing, use action=terminate in the tool call."""
+ # fmt: on
+
+ # =============================================================================
+ # CENTRALIZED CONFIGURATION
+ # =============================================================================
+ # Volume names and model info are loaded from config/adapters.yaml via the SDK.
+ # Users can customize these by editing the YAML file.
+
+ try:
+     from sdk.modal_compat import get_base_vlm, get_volume_name
+
+     DEFAULT_VOLUME = get_volume_name("lora_training")
+     BASE_MODEL = get_base_vlm()
+ except ImportError:
+     # Fallback when SDK not available
+     DEFAULT_VOLUME = "claimhawk-lora-training"
+     BASE_MODEL = "Qwen/Qwen3-VL-8B-Instruct"
+
+
+ def _get_generator_name() -> str:
+     """Extract generator name from --dataset-name arg for dynamic app naming."""
+     for i, arg in enumerate(sys.argv):
+         if arg == "--dataset-name" and i + 1 < len(sys.argv):
+             ds_name = sys.argv[i + 1]
+             # Generator name is first part before dash (e.g., "desktop" from "desktop-mike-...")
+             return ds_name.split("-")[0] if ds_name else "cudag"
+     return "cudag"
+
+
+ # Modal App Setup - dynamically named based on generator
+ app = modal.App(f"{_get_generator_name()}-preprocess")
+
+ # Volume - matches modal-volumes.md structure
+ VOLUME = modal.Volume.from_name(DEFAULT_VOLUME, create_if_missing=True)
+
+ # Docker Image with Dependencies (CPU-only, no GPU needed)
+ image = (
+     modal.Image.debian_slim(python_version="3.12")
+     .pip_install(
+         "torch==2.4.0",
+         "torchvision==0.19.0",
+     )
+     .pip_install(
+         "transformers>=4.57.0",
+         "qwen-vl-utils",
+         "Pillow>=10.0.0",
+         "numpy>=1.24.0",
+         "tqdm>=4.65.0",
+     )
+ )
+
+
+ @app.function(
+     image=image,
+     cpu=16,
+     memory=32768,  # 32GB RAM
+     timeout=7200,  # 2 hours max
+     volumes={
+         "/data": VOLUME,
+     },
+ )
+ def preprocess_dataset_impl(dataset_name: str):
+     """
+     Preprocess the dataset on Modal CPU instance.
+
+     Reads from: /data/datasets/{dataset_name}/
+     Writes to: /data/preprocessed/{dataset_name}/
+     """
+     import torch
+     from PIL import Image
+     from qwen_vl_utils import process_vision_info
+     from tqdm import tqdm
+     from transformers import AutoProcessor
+
+     # Reload the mounted volume to see latest committed data
+     VOLUME.reload()
+
+     # Paths according to modal-volumes.md structure
+     data_root = Path("/data")
+     dataset_path = data_root / "datasets" / dataset_name
+     preprocessed_path = data_root / "preprocessed" / dataset_name
+
+     print(f"\n{'='*80}")
+     print(f"Starting CUDAG Preprocessing: {dataset_name}")
+     print(f"{'='*80}\n")
+     print(f"Dataset: {dataset_name}")
+     print(f"Dataset path: {dataset_path}")
+     print(f"Output path: {preprocessed_path}")
+
+     # Debug: List what's in the directory
+     print(f"\nListing contents of {dataset_path}:")
+     if dataset_path.exists():
+         all_files = list(dataset_path.iterdir())
+         print(f"Found {len(all_files)} items:")
+         for item in all_files[:20]:
+             print(f" - {item.name} ({'dir' if item.is_dir() else 'file'})")
+     else:
+         print("Directory does not exist!")
+         print("Available in datasets/:")
+         datasets_dir = data_root / "datasets"
+         if datasets_dir.exists():
+             for item in datasets_dir.iterdir():
+                 print(f" - {item.name}")
+         raise FileNotFoundError(f"Dataset not found: {dataset_path}")
+
+     # Find train and val files
+     train_files = list(dataset_path.glob("train*.jsonl"))
+     val_files = list(dataset_path.glob("val*.jsonl"))
+     test_files = list(dataset_path.glob("test*.jsonl"))
+     data_files = list(dataset_path.glob("data.jsonl"))
+
+     def load_jsonl(path):
+         data = []
+         with open(path) as f:
+             for line in f:
+                 data.append(json.loads(line))
+         return data
+
+     # Priority: Use existing train/val or train/test splits if available
+     if train_files and (val_files or test_files):
+         train_path = train_files[0]
+         val_path = val_files[0] if val_files else test_files[0]
+         print("\nUsing existing dataset split:")
+         print(f" Train: {train_path.name}")
+         print(f" Val: {val_path.name}")
+         train_data = load_jsonl(train_path)
+         val_data = load_jsonl(val_path)
+     elif data_files:
+         print("\nFound single data.jsonl, auto-splitting 90/10...")
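+         # Note: the 90/10 split is positional (no shuffling); record order in data.jsonl decides train vs. val membership.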
+         all_data = load_jsonl(data_files[0])
+         split_idx = int(len(all_data) * 0.9)
+         train_data = all_data[:split_idx]
+         val_data = all_data[split_idx:]
+     else:
+         raise FileNotFoundError(
+             f"Could not find train*.jsonl/val*.jsonl or data.jsonl in {dataset_path}"
+         )
+
+     print("\nDataset size:")
+     print(f" Train samples: {len(train_data)}")
+     print(f" Val samples: {len(val_data)}")
+
+     # Load Processor
+     print(f"\n{'='*80}")
+     print("Loading Processor")
+     print(f"{'='*80}\n")
+
+     model_name = BASE_MODEL
+     print(f"Loading processor: {model_name}")
+     processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
+     print("Processor loaded")
+
+     # Cache Image Embeddings
+     print(f"\n{'='*80}")
+     print("Caching Image Embeddings")
+     print(f"{'='*80}\n")
+
+     unique_images = set()
+     for sample in train_data + val_data:
+         unique_images.add(sample["image"])
+
+     total_samples = len(train_data) + len(val_data)
+     print(f"Found {len(unique_images)} unique images (from {total_samples} total)")
+     print(
+         f"Reuse ratio: {total_samples / max(len(unique_images), 1):.1f}x"
+     )
+
+     image_cache = {}
+
+     # Helper function for parallel image processing
+     def process_single_image(img_path: str) -> tuple[str, dict | None]:
+         """Process a single image and return (path, cached_data) or (path, None) on error."""
+         img_path_str = str(img_path)
+
+         # Handle nested paths - strip dataset name prefix if present
+         base_name = dataset_name.split("/")[0] if "/" in dataset_name else dataset_name
+         if img_path_str.startswith(f"{base_name}/"):
+             img_path_str = img_path_str[len(base_name) + 1 :]
+
+         full_path = dataset_path / img_path_str
+         if not full_path.exists():
+             return (img_path, None)
+
+         try:
+             image = Image.open(full_path)
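+             # process_vision_info applies Qwen-VL smart resizing, so the cached copy already has model-compatible dimensions.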
+             image_inputs, _ = process_vision_info(
+                 [
+                     {
+                         "role": "user",
+                         "content": [{"type": "image", "image": f"file://{full_path}"}],
+                     }
+                 ],
+                 image_patch_size=16,
+             )
+
+             return (
+                 img_path,
+                 {
+                     "pixel_values": image_inputs[0] if image_inputs else None,
+                     "image": image,
+                 },
+             )
+         except Exception as e:
+             print(f"Warning: Failed to process {full_path}: {e}")
+             return (img_path, None)
+
+     # Process images in parallel using ThreadPoolExecutor
+     print("\nProcessing unique images in parallel...")
+     num_workers = min(8, multiprocessing.cpu_count())
+     print(f"Using {num_workers} workers")
+
+     sorted_images = sorted(unique_images)
+     with ThreadPoolExecutor(max_workers=num_workers) as executor:
+         future_to_path = {
+             executor.submit(process_single_image, p): p for p in sorted_images
+         }
+         for future in tqdm(
+             as_completed(future_to_path),
+             total=len(sorted_images),
+             desc="Caching images",
+         ):
+             img_path, cached_data = future.result()
+             if cached_data is not None:
+                 image_cache[img_path] = cached_data
+
+     print(f"Cached {len(image_cache)} images")
+
+     # Preprocess Data
+     print(f"\n{'='*80}")
+     print("Preprocessing Data")
+     print(f"{'='*80}\n")
+
+     def prepare_sample(sample, image_cache):
+         """Prepare a single sample for training."""
+         img_path = sample["image"]
+         if img_path not in image_cache:
+             raise FileNotFoundError(f"Image not in cache: {img_path}")
+
+         cached_image = image_cache[img_path]
+         old_conversations = sample["conversations"]
+
+         # Inject system prompt (not stored in training data)
+         messages = [
+             {"role": "system", "content": [{"type": "text", "text": SYSTEM_PROMPT}]}
+         ]
+
+         # Convert to Qwen-VL format, skipping any system prompts from dataset
+         for msg in old_conversations:
+             if msg["from"] == "system":
+                 # Skip - we inject our own system prompt above
+                 continue
+             elif msg["from"] == "human":
+                 role = "user"
+             else:
+                 role = "assistant"
+
+             content_list = []
+             value = msg["value"]
+
+             if "<image>" in value:
+                 content_list.append({"type": "image"})
+                 text = value.replace("<image>", "").strip()
+                 if text:
+                     content_list.append({"type": "text", "text": text})
+             else:
+                 content_list.append({"type": "text", "text": value})
+
+             messages.append({"role": role, "content": content_list})
+
+         text = processor.apply_chat_template(
+             messages, tokenize=False, add_generation_prompt=False
+         )
+
+         image_inputs = (
+             [cached_image["pixel_values"]]
+             if cached_image["pixel_values"] is not None
+             else None
+         )
+
+         model_inputs = processor(
+             text=[text],
+             images=image_inputs,
+             videos=None,
+             return_tensors="pt",
+             padding=False,
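+             # do_resize=False: images were already resized during caching, so the processor must not resize them again.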
+             do_resize=False,
+         )
+
+         input_ids = (
+             model_inputs["input_ids"][0]
+             if isinstance(model_inputs["input_ids"][0], torch.Tensor)
+             else torch.tensor(model_inputs["input_ids"][0])
+         )
+         attention_mask = (
+             model_inputs["attention_mask"][0]
+             if isinstance(model_inputs["attention_mask"][0], torch.Tensor)
+             else torch.tensor(model_inputs["attention_mask"][0])
+         )
+
+         # Create labels: Only train on assistant responses
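+         # -100 is the default ignore_index for PyTorch cross-entropy, so masked positions contribute nothing to the loss.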
+         ignore_index = -100
+         labels = torch.full_like(input_ids, ignore_index)
+
+         input_ids_list = input_ids.tolist()
+         seq_len = len(input_ids_list)
+         pos = 0
+
+         while pos < seq_len:
+             # Look for <|im_start|>assistant (token ID 77091)
+             if input_ids_list[pos] == 77091:
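+                 # pos + 2 skips the "assistant" role token and the newline after it, so labels start at the first response token.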
+                 ans_start = pos + 2
+                 ans_end = ans_start
+
+                 # Find <|im_end|> (token ID 151645)
+                 while ans_end < seq_len and input_ids_list[ans_end] != 151645:
+                     ans_end += 1
+
+                 if ans_end < seq_len:
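+                     # The +2 slice end keeps <|im_end|> and the token that follows it in the training targets.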
+                     labels[ans_start : ans_end + 2] = input_ids[ans_start : ans_end + 2]
+                 pos = ans_end
+             pos += 1
+
+         result = {
+             "input_ids": input_ids,
+             "attention_mask": attention_mask,
+             "labels": labels,
+         }
+
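+         # pixel_values is only present when an image was passed; image_grid_thw records the patch-grid shape Qwen-VL needs alongside it.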
+         if "pixel_values" in model_inputs:
+             result["pixel_values"] = model_inputs["pixel_values"]
+             result["image_grid_thw"] = model_inputs["image_grid_thw"]
+
+         return result
+
+     # Helper to process and save a single sample
+     def process_and_save_sample(
+         args: tuple[int, dict, Path],
+     ) -> tuple[int, str | None, str | None]:
+         """Process a single sample and save to disk. Returns (idx, path, error)."""
+         idx, sample, output_dir = args
+         try:
+             processed = prepare_sample(sample, image_cache)
+
+             processed_cpu = {
+                 "input_ids": processed["input_ids"].cpu(),
+                 "attention_mask": processed["attention_mask"].cpu(),
+                 "labels": processed["labels"].cpu(),
+             }
+             if "pixel_values" in processed:
+                 processed_cpu["pixel_values"] = processed["pixel_values"].cpu()
+                 processed_cpu["image_grid_thw"] = processed["image_grid_thw"].cpu()
+
+             sample_path = output_dir / f"sample_{idx:06d}.pt"
+             torch.save(processed_cpu, sample_path)
+             return (idx, str(sample_path), None)
+         except Exception as e:
+             return (idx, None, str(e))
+
+     # Process training data in parallel
+     print("Processing training data in parallel...")
+     train_output_dir = preprocessed_path / "train"
+     train_output_dir.mkdir(parents=True, exist_ok=True)
+
+     train_args = [(i, sample, train_output_dir) for i, sample in enumerate(train_data)]
+     train_processed = []
+
+     with ThreadPoolExecutor(max_workers=num_workers) as executor:
+         futures = {executor.submit(process_and_save_sample, arg): arg[0] for arg in train_args}
+         for future in tqdm(as_completed(futures), total=len(futures), desc="Train"):
+             idx, path, error = future.result()
+             if error:
+                 print(f"\nError processing train sample {idx}: {error}")
+                 raise RuntimeError(f"Failed to process sample {idx}: {error}")
+             if path:
+                 train_processed.append(path)
+
+     # Process validation data in parallel
+     print("\nProcessing validation data in parallel...")
+     val_output_dir = preprocessed_path / "val"
+     val_output_dir.mkdir(parents=True, exist_ok=True)
+
+     val_args = [(i, sample, val_output_dir) for i, sample in enumerate(val_data)]
+     val_processed = []
+
+     with ThreadPoolExecutor(max_workers=num_workers) as executor:
+         futures = {executor.submit(process_and_save_sample, arg): arg[0] for arg in val_args}
+         for future in tqdm(as_completed(futures), total=len(futures), desc="Val"):
+             idx, path, error = future.result()
+             if error:
+                 print(f"\nError processing val sample {idx}: {error}")
+                 raise RuntimeError(f"Failed to process sample {idx}: {error}")
+             if path:
+                 val_processed.append(path)
+
+     # Save metadata
+     metadata = {
+         "train_samples": len(train_processed),
+         "val_samples": len(val_processed),
+         "model_name": model_name,
+         "dataset_name": dataset_name,
+     }
+
+     metadata_path = preprocessed_path / "metadata.json"
+     with open(metadata_path, "w") as f:
+         json.dump(metadata, f, indent=2)
+
+     print("\nPreprocessing complete!")
+     print(f" Train samples: {len(train_processed)}")
+     print(f" Val samples: {len(val_processed)}")
+
+     total_size = (
+         sum(f.stat().st_size for f in preprocessed_path.rglob("*.pt")) / (1024**3)
+     )
+     print(f" Total preprocessed size: {total_size:.2f} GB")
+
+     # Commit volume changes
+     VOLUME.commit()
+
+     print(f"\nPreprocessed data saved to Modal volume: preprocessed/{dataset_name}")
+
+     print(f"\n{'='*80}")
+     print("PREPROCESSING COMPLETE!")
+     print(f"{'='*80}\n")
+
+     return {
+         "train_samples": len(train_processed),
+         "val_samples": len(val_processed),
+         "total_size_gb": total_size,
+     }
+
+
+ @app.local_entrypoint()
+ def main(dataset_name: str):
+     """
+     Local entrypoint for running preprocessing.
+
+     Usage:
+         modal run preprocess.py --dataset-name my-dataset
+     """
+     print(f"\n{'='*80}")
+     print("Submitting preprocessing job to Modal...")
+     print(f"Dataset: {dataset_name}")
+     print(f"{'='*80}\n")
+
+     result = preprocess_dataset_impl.remote(dataset_name)
+
+     print(f"\n{'='*80}")
+     print("Preprocessing job completed!")
+     print(f"{'='*80}\n")
+     print(f"Results: {result}")