cudag 0.3.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69) hide show
  1. cudag/__init__.py +334 -0
  2. cudag/annotation/__init__.py +77 -0
  3. cudag/annotation/codegen.py +648 -0
  4. cudag/annotation/config.py +545 -0
  5. cudag/annotation/loader.py +342 -0
  6. cudag/annotation/scaffold.py +121 -0
  7. cudag/annotation/transcription.py +296 -0
  8. cudag/cli/__init__.py +5 -0
  9. cudag/cli/main.py +315 -0
  10. cudag/cli/new.py +873 -0
  11. cudag/core/__init__.py +364 -0
  12. cudag/core/button.py +137 -0
  13. cudag/core/canvas.py +222 -0
  14. cudag/core/config.py +70 -0
  15. cudag/core/coords.py +233 -0
  16. cudag/core/data_grid.py +804 -0
  17. cudag/core/dataset.py +678 -0
  18. cudag/core/distribution.py +136 -0
  19. cudag/core/drawing.py +75 -0
  20. cudag/core/fonts.py +156 -0
  21. cudag/core/generator.py +163 -0
  22. cudag/core/grid.py +367 -0
  23. cudag/core/grounding_task.py +247 -0
  24. cudag/core/icon.py +207 -0
  25. cudag/core/iconlist_task.py +301 -0
  26. cudag/core/models.py +1251 -0
  27. cudag/core/random.py +130 -0
  28. cudag/core/renderer.py +190 -0
  29. cudag/core/screen.py +402 -0
  30. cudag/core/scroll_task.py +254 -0
  31. cudag/core/scrollable_grid.py +447 -0
  32. cudag/core/state.py +110 -0
  33. cudag/core/task.py +293 -0
  34. cudag/core/taskbar.py +350 -0
  35. cudag/core/text.py +212 -0
  36. cudag/core/utils.py +82 -0
  37. cudag/data/surnames.txt +5000 -0
  38. cudag/modal_apps/__init__.py +4 -0
  39. cudag/modal_apps/archive.py +103 -0
  40. cudag/modal_apps/extract.py +138 -0
  41. cudag/modal_apps/preprocess.py +529 -0
  42. cudag/modal_apps/upload.py +317 -0
  43. cudag/prompts/SYSTEM_PROMPT.txt +104 -0
  44. cudag/prompts/__init__.py +33 -0
  45. cudag/prompts/system.py +43 -0
  46. cudag/prompts/tools.py +382 -0
  47. cudag/py.typed +0 -0
  48. cudag/schemas/filesystem.json +90 -0
  49. cudag/schemas/test_record.schema.json +113 -0
  50. cudag/schemas/train_record.schema.json +90 -0
  51. cudag/server/__init__.py +21 -0
  52. cudag/server/app.py +232 -0
  53. cudag/server/services/__init__.py +9 -0
  54. cudag/server/services/generator.py +128 -0
  55. cudag/templates/scripts/archive.sh +35 -0
  56. cudag/templates/scripts/build.sh +13 -0
  57. cudag/templates/scripts/extract.sh +54 -0
  58. cudag/templates/scripts/generate.sh +116 -0
  59. cudag/templates/scripts/pre-commit.sh +44 -0
  60. cudag/templates/scripts/preprocess.sh +46 -0
  61. cudag/templates/scripts/upload.sh +63 -0
  62. cudag/templates/scripts/verify.py +428 -0
  63. cudag/validation/__init__.py +35 -0
  64. cudag/validation/validate.py +508 -0
  65. cudag-0.3.10.dist-info/METADATA +570 -0
  66. cudag-0.3.10.dist-info/RECORD +69 -0
  67. cudag-0.3.10.dist-info/WHEEL +4 -0
  68. cudag-0.3.10.dist-info/entry_points.txt +2 -0
  69. cudag-0.3.10.dist-info/licenses/LICENSE +66 -0
@@ -0,0 +1,508 @@
1
+ # Copyright (c) 2025 Tylt LLC. All rights reserved.
2
+ # CONFIDENTIAL AND PROPRIETARY. Unauthorized use, copying, or distribution
3
+ # is strictly prohibited. For licensing inquiries: hello@claimhawk.app
4
+
5
+ """Dataset validation functions.
6
+
7
+ Validates CUDAG datasets against the expected filesystem structure and schemas.
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import json
13
+ from dataclasses import dataclass
14
+ from pathlib import Path
15
+ from typing import Any
16
+
17
+
18
@dataclass
class ValidationError:
    """A single validation problem with its location and description."""

    # File or directory where the error occurred.
    file: str
    # Line number (for JSONL files); None for filesystem-level errors.
    line: int | None
    # Human-readable error message.
    message: str

    def __str__(self) -> str:
        # Render "file:line: message" when a line is known, "file: message" otherwise.
        location = self.file if self.line is None else f"{self.file}:{self.line}"
        return f"{location}: {self.message}"
35
+
36
+
37
def validate_filesystem(dataset_path: Path) -> list[ValidationError]:
    """Validate dataset filesystem structure.

    Checks that required root files (config.json, data.jsonl, train.jsonl,
    val.jsonl), the images/ directory, and the test/ directory (containing
    test.json and images/) all exist.

    Args:
        dataset_path: Path to dataset root directory.

    Returns:
        List of validation errors (empty if valid).
    """
    errors: list[ValidationError] = []

    # Required files at root
    required_files = ["config.json", "data.jsonl", "train.jsonl", "val.jsonl"]
    for filename in required_files:
        if not (dataset_path / filename).exists():
            errors.append(
                ValidationError(
                    file=str(dataset_path),
                    line=None,
                    # BUG FIX: the f-string previously had no placeholder and
                    # reported "(unknown)" instead of the missing file's name.
                    message=f"Missing required file: {filename}",
                )
            )

    # Required directories at root
    if not (dataset_path / "images").is_dir():
        errors.append(
            ValidationError(
                file=str(dataset_path),
                line=None,
                message="Missing required directory: images/",
            )
        )

    # Test directory structure
    test_dir = dataset_path / "test"
    if not test_dir.is_dir():
        errors.append(
            ValidationError(
                file=str(dataset_path),
                line=None,
                message="Missing required directory: test/",
            )
        )
    else:
        # Required files in test/
        if not (test_dir / "test.json").exists():
            errors.append(
                ValidationError(
                    file=str(test_dir),
                    line=None,
                    message="Missing required file: test.json",
                )
            )

        # Required directories in test/
        if not (test_dir / "images").is_dir():
            errors.append(
                ValidationError(
                    file=str(test_dir),
                    line=None,
                    message="Missing required directory: images/",
                )
            )

    return errors
104
+
105
+
106
+ def _validate_train_record(
107
+ record: dict[str, Any], line_num: int, file_path: str
108
+ ) -> list[ValidationError]:
109
+ """Validate a single training record."""
110
+ errors: list[ValidationError] = []
111
+
112
+ # Required fields
113
+ required = ["id", "image", "conversations", "metadata"]
114
+ for field in required:
115
+ if field not in record:
116
+ errors.append(
117
+ ValidationError(
118
+ file=file_path,
119
+ line=line_num,
120
+ message=f"Missing required field: {field}",
121
+ )
122
+ )
123
+
124
+ # Validate image path format
125
+ if "image" in record:
126
+ image = record["image"]
127
+ if not isinstance(image, str) or not image.startswith("images/"):
128
+ errors.append(
129
+ ValidationError(
130
+ file=file_path,
131
+ line=line_num,
132
+ message=f"Invalid image path: {image} (must start with 'images/')",
133
+ )
134
+ )
135
+
136
+ # Validate conversations structure
137
+ if "conversations" in record:
138
+ convs = record["conversations"]
139
+ if not isinstance(convs, list) or len(convs) != 2:
140
+ errors.append(
141
+ ValidationError(
142
+ file=file_path,
143
+ line=line_num,
144
+ message="conversations must be array of 2 items",
145
+ )
146
+ )
147
+ else:
148
+ # Validate human turn
149
+ if convs[0].get("from") != "human":
150
+ errors.append(
151
+ ValidationError(
152
+ file=file_path,
153
+ line=line_num,
154
+ message="First conversation turn must be from 'human'",
155
+ )
156
+ )
157
+ human_value = convs[0].get("value", "")
158
+ if not human_value.startswith("<image>\n"):
159
+ errors.append(
160
+ ValidationError(
161
+ file=file_path,
162
+ line=line_num,
163
+ message="Human value must start with '<image>\\n'",
164
+ )
165
+ )
166
+
167
+ # Validate gpt turn
168
+ if convs[1].get("from") != "gpt":
169
+ errors.append(
170
+ ValidationError(
171
+ file=file_path,
172
+ line=line_num,
173
+ message="Second conversation turn must be from 'gpt'",
174
+ )
175
+ )
176
+ gpt_value = convs[1].get("value", "")
177
+ if not gpt_value.startswith("<tool_call>"):
178
+ errors.append(
179
+ ValidationError(
180
+ file=file_path,
181
+ line=line_num,
182
+ message="GPT value must start with '<tool_call>'",
183
+ )
184
+ )
185
+
186
+ # Validate metadata
187
+ if "metadata" in record:
188
+ metadata = record["metadata"]
189
+ if not isinstance(metadata, dict):
190
+ errors.append(
191
+ ValidationError(
192
+ file=file_path,
193
+ line=line_num,
194
+ message="metadata must be an object",
195
+ )
196
+ )
197
+ else:
198
+ if "task_type" not in metadata:
199
+ errors.append(
200
+ ValidationError(
201
+ file=file_path,
202
+ line=line_num,
203
+ message="metadata missing required field: task_type",
204
+ )
205
+ )
206
+ if "real_coords" not in metadata:
207
+ errors.append(
208
+ ValidationError(
209
+ file=file_path,
210
+ line=line_num,
211
+ message="metadata missing required field: real_coords",
212
+ )
213
+ )
214
+ elif not isinstance(metadata["real_coords"], list) or len(metadata["real_coords"]) != 2:
215
+ errors.append(
216
+ ValidationError(
217
+ file=file_path,
218
+ line=line_num,
219
+ message="metadata.real_coords must be [x, y] array",
220
+ )
221
+ )
222
+
223
+ return errors
224
+
225
+
226
def validate_training_records(jsonl_path: Path) -> list[ValidationError]:
    """Validate training records in a JSONL file.

    Args:
        jsonl_path: Path to train.jsonl, val.jsonl, or data.jsonl.

    Returns:
        List of validation errors (empty if valid).
    """
    file_str = str(jsonl_path)

    if not jsonl_path.exists():
        return [ValidationError(file=file_str, line=None, message="File not found")]

    errors: list[ValidationError] = []
    with open(jsonl_path, encoding="utf-8") as handle:
        for line_num, raw in enumerate(handle, start=1):
            text = raw.strip()
            if not text:
                continue  # blank lines between records are allowed

            try:
                record = json.loads(text)
            except json.JSONDecodeError as exc:
                errors.append(
                    ValidationError(
                        file=file_str,
                        line=line_num,
                        message=f"Invalid JSON: {exc}",
                    )
                )
            else:
                errors.extend(_validate_train_record(record, line_num, file_str))

    return errors
265
+
266
+
267
+ def _validate_test_record(
268
+ record: dict[str, Any], index: int, file_path: str
269
+ ) -> list[ValidationError]:
270
+ """Validate a single test record."""
271
+ errors: list[ValidationError] = []
272
+
273
+ # Required fields
274
+ required = ["test_id", "screenshot", "prompt", "expected_action", "tolerance", "metadata"]
275
+ for field in required:
276
+ if field not in record:
277
+ errors.append(
278
+ ValidationError(
279
+ file=file_path,
280
+ line=index,
281
+ message=f"Missing required field: {field}",
282
+ )
283
+ )
284
+
285
+ # Validate screenshot path format
286
+ if "screenshot" in record:
287
+ screenshot = record["screenshot"]
288
+ if not isinstance(screenshot, str) or not screenshot.startswith("images/"):
289
+ errors.append(
290
+ ValidationError(
291
+ file=file_path,
292
+ line=index,
293
+ message=f"Invalid screenshot path: {screenshot} (must start with 'images/')",
294
+ )
295
+ )
296
+
297
+ # Validate expected_action
298
+ if "expected_action" in record:
299
+ action = record["expected_action"]
300
+ if not isinstance(action, dict):
301
+ errors.append(
302
+ ValidationError(
303
+ file=file_path,
304
+ line=index,
305
+ message="expected_action must be an object",
306
+ )
307
+ )
308
+ else:
309
+ if action.get("name") != "computer_use":
310
+ errors.append(
311
+ ValidationError(
312
+ file=file_path,
313
+ line=index,
314
+ message="expected_action.name must be 'computer_use'",
315
+ )
316
+ )
317
+ if "arguments" not in action:
318
+ errors.append(
319
+ ValidationError(
320
+ file=file_path,
321
+ line=index,
322
+ message="expected_action missing 'arguments'",
323
+ )
324
+ )
325
+ elif "action" not in action["arguments"]:
326
+ errors.append(
327
+ ValidationError(
328
+ file=file_path,
329
+ line=index,
330
+ message="expected_action.arguments missing 'action'",
331
+ )
332
+ )
333
+
334
+ # Validate tolerance
335
+ if "tolerance" in record:
336
+ tolerance = record["tolerance"]
337
+ if not isinstance(tolerance, list) or len(tolerance) != 2:
338
+ errors.append(
339
+ ValidationError(
340
+ file=file_path,
341
+ line=index,
342
+ message="tolerance must be [tol_x, tol_y] array",
343
+ )
344
+ )
345
+
346
+ # Validate metadata
347
+ if "metadata" in record:
348
+ metadata = record["metadata"]
349
+ if not isinstance(metadata, dict):
350
+ errors.append(
351
+ ValidationError(
352
+ file=file_path,
353
+ line=index,
354
+ message="metadata must be an object",
355
+ )
356
+ )
357
+ elif "task_type" not in metadata:
358
+ errors.append(
359
+ ValidationError(
360
+ file=file_path,
361
+ line=index,
362
+ message="metadata missing required field: task_type",
363
+ )
364
+ )
365
+
366
+ return errors
367
+
368
+
369
def validate_test_records(json_path: Path) -> list[ValidationError]:
    """Validate test records in test.json.

    Args:
        json_path: Path to test/test.json.

    Returns:
        List of validation errors (empty if valid).
    """
    file_str = str(json_path)

    if not json_path.exists():
        return [ValidationError(file=file_str, line=None, message="File not found")]

    try:
        with open(json_path, encoding="utf-8") as handle:
            records = json.load(handle)
    except json.JSONDecodeError as exc:
        return [ValidationError(file=file_str, line=None, message=f"Invalid JSON: {exc}")]

    if not isinstance(records, list):
        return [
            ValidationError(
                file=file_str,
                line=None,
                message="test.json must be a JSON array",
            )
        ]

    errors: list[ValidationError] = []
    for index, record in enumerate(records):
        errors.extend(_validate_test_record(record, index, file_str))
    return errors
410
+
411
+
412
def validate_image_paths(dataset_path: Path) -> list[ValidationError]:
    """Validate that all image paths in JSONL/JSON files exist.

    Args:
        dataset_path: Path to dataset root directory.

    Returns:
        List of validation errors (empty if valid).
    """
    errors: list[ValidationError] = []

    # Check training images
    for jsonl_name in ("train.jsonl", "val.jsonl"):
        jsonl_path = dataset_path / jsonl_name
        if not jsonl_path.exists():
            continue

        with open(jsonl_path, encoding="utf-8") as handle:
            for line_num, raw in enumerate(handle, start=1):
                text = raw.strip()
                if not text:
                    continue
                try:
                    record = json.loads(text)
                except json.JSONDecodeError:
                    # Malformed lines are reported by validate_training_records.
                    continue

                image_path = record.get("image", "")
                if not (dataset_path / image_path).exists():
                    errors.append(
                        ValidationError(
                            file=str(jsonl_path),
                            line=line_num,
                            message=f"Image not found: {image_path}",
                        )
                    )

    # Check test images
    test_json_path = dataset_path / "test" / "test.json"
    if test_json_path.exists():
        try:
            with open(test_json_path, encoding="utf-8") as handle:
                records = json.load(handle)
        except json.JSONDecodeError:
            # Malformed JSON is reported by validate_test_records.
            records = None

        if isinstance(records, list):
            test_dir = dataset_path / "test"
            for index, record in enumerate(records):
                screenshot = record.get("screenshot", "")
                if not (test_dir / screenshot).exists():
                    errors.append(
                        ValidationError(
                            file=str(test_json_path),
                            line=index,
                            message=f"Screenshot not found: {screenshot}",
                        )
                    )

    return errors
475
+
476
+
477
def validate_dataset(dataset_path: Path) -> list[ValidationError]:
    """Run all validations on a dataset.

    This is the main entry point for dataset validation.

    Args:
        dataset_path: Path to dataset root directory.

    Returns:
        List of all validation errors (empty if valid).
    """
    # 1. Validate filesystem structure
    errors: list[ValidationError] = list(validate_filesystem(dataset_path))

    # 2. Validate training records (only if files exist)
    for jsonl_name in ("train.jsonl", "val.jsonl"):
        jsonl_path = dataset_path / jsonl_name
        if jsonl_path.exists():
            errors.extend(validate_training_records(jsonl_path))

    # 3. Validate test records
    test_json_path = dataset_path / "test" / "test.json"
    if test_json_path.exists():
        errors.extend(validate_test_records(test_json_path))

    # 4. Validate image paths (only if basic structure is valid)
    structure_broken = any(e.message.startswith("Missing required") for e in errors)
    if not structure_broken:
        errors.extend(validate_image_paths(dataset_path))

    return errors