cudag 0.3.10__py3-none-any.whl
- cudag/__init__.py +334 -0
- cudag/annotation/__init__.py +77 -0
- cudag/annotation/codegen.py +648 -0
- cudag/annotation/config.py +545 -0
- cudag/annotation/loader.py +342 -0
- cudag/annotation/scaffold.py +121 -0
- cudag/annotation/transcription.py +296 -0
- cudag/cli/__init__.py +5 -0
- cudag/cli/main.py +315 -0
- cudag/cli/new.py +873 -0
- cudag/core/__init__.py +364 -0
- cudag/core/button.py +137 -0
- cudag/core/canvas.py +222 -0
- cudag/core/config.py +70 -0
- cudag/core/coords.py +233 -0
- cudag/core/data_grid.py +804 -0
- cudag/core/dataset.py +678 -0
- cudag/core/distribution.py +136 -0
- cudag/core/drawing.py +75 -0
- cudag/core/fonts.py +156 -0
- cudag/core/generator.py +163 -0
- cudag/core/grid.py +367 -0
- cudag/core/grounding_task.py +247 -0
- cudag/core/icon.py +207 -0
- cudag/core/iconlist_task.py +301 -0
- cudag/core/models.py +1251 -0
- cudag/core/random.py +130 -0
- cudag/core/renderer.py +190 -0
- cudag/core/screen.py +402 -0
- cudag/core/scroll_task.py +254 -0
- cudag/core/scrollable_grid.py +447 -0
- cudag/core/state.py +110 -0
- cudag/core/task.py +293 -0
- cudag/core/taskbar.py +350 -0
- cudag/core/text.py +212 -0
- cudag/core/utils.py +82 -0
- cudag/data/surnames.txt +5000 -0
- cudag/modal_apps/__init__.py +4 -0
- cudag/modal_apps/archive.py +103 -0
- cudag/modal_apps/extract.py +138 -0
- cudag/modal_apps/preprocess.py +529 -0
- cudag/modal_apps/upload.py +317 -0
- cudag/prompts/SYSTEM_PROMPT.txt +104 -0
- cudag/prompts/__init__.py +33 -0
- cudag/prompts/system.py +43 -0
- cudag/prompts/tools.py +382 -0
- cudag/py.typed +0 -0
- cudag/schemas/filesystem.json +90 -0
- cudag/schemas/test_record.schema.json +113 -0
- cudag/schemas/train_record.schema.json +90 -0
- cudag/server/__init__.py +21 -0
- cudag/server/app.py +232 -0
- cudag/server/services/__init__.py +9 -0
- cudag/server/services/generator.py +128 -0
- cudag/templates/scripts/archive.sh +35 -0
- cudag/templates/scripts/build.sh +13 -0
- cudag/templates/scripts/extract.sh +54 -0
- cudag/templates/scripts/generate.sh +116 -0
- cudag/templates/scripts/pre-commit.sh +44 -0
- cudag/templates/scripts/preprocess.sh +46 -0
- cudag/templates/scripts/upload.sh +63 -0
- cudag/templates/scripts/verify.py +428 -0
- cudag/validation/__init__.py +35 -0
- cudag/validation/validate.py +508 -0
- cudag-0.3.10.dist-info/METADATA +570 -0
- cudag-0.3.10.dist-info/RECORD +69 -0
- cudag-0.3.10.dist-info/WHEEL +4 -0
- cudag-0.3.10.dist-info/entry_points.txt +2 -0
- cudag-0.3.10.dist-info/licenses/LICENSE +66 -0
cudag/validation/validate.py

@@ -0,0 +1,508 @@
+# Copyright (c) 2025 Tylt LLC. All rights reserved.
+# CONFIDENTIAL AND PROPRIETARY. Unauthorized use, copying, or distribution
+# is strictly prohibited. For licensing inquiries: hello@claimhawk.app
+
+"""Dataset validation functions.
+
+Validates CUDAG datasets against the expected filesystem structure and schemas.
+"""
+
+from __future__ import annotations
+
+import json
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any
+
+
+@dataclass
+class ValidationError:
+    """A validation error with location and message."""
+
+    file: str
+    """File or directory where error occurred."""
+
+    line: int | None
+    """Line number (for JSONL files), None for filesystem errors."""
+
+    message: str
+    """Human-readable error message."""
+
+    def __str__(self) -> str:
+        if self.line is not None:
+            return f"{self.file}:{self.line}: {self.message}"
+        return f"{self.file}: {self.message}"
+
+
+def validate_filesystem(dataset_path: Path) -> list[ValidationError]:
+    """Validate dataset filesystem structure.
+
+    Checks that required files and directories exist.
+
+    Args:
+        dataset_path: Path to dataset root directory.
+
+    Returns:
+        List of validation errors (empty if valid).
+    """
+    errors: list[ValidationError] = []
+
+    # Required files at root
+    required_files = ["config.json", "data.jsonl", "train.jsonl", "val.jsonl"]
+    for filename in required_files:
+        if not (dataset_path / filename).exists():
+            errors.append(
+                ValidationError(
+                    file=str(dataset_path),
+                    line=None,
+                    message=f"Missing required file: {filename}",
+                )
+            )
+
+    # Required directories at root
+    if not (dataset_path / "images").is_dir():
+        errors.append(
+            ValidationError(
+                file=str(dataset_path),
+                line=None,
+                message="Missing required directory: images/",
+            )
+        )
+
+    # Test directory structure
+    test_dir = dataset_path / "test"
+    if not test_dir.is_dir():
+        errors.append(
+            ValidationError(
+                file=str(dataset_path),
+                line=None,
+                message="Missing required directory: test/",
+            )
+        )
+    else:
+        # Required files in test/
+        if not (test_dir / "test.json").exists():
+            errors.append(
+                ValidationError(
+                    file=str(test_dir),
+                    line=None,
+                    message="Missing required file: test.json",
+                )
+            )
+
+        # Required directories in test/
+        if not (test_dir / "images").is_dir():
+            errors.append(
+                ValidationError(
+                    file=str(test_dir),
+                    line=None,
+                    message="Missing required directory: images/",
+                )
+            )
+
+    return errors
+
+
+def _validate_train_record(
+    record: dict[str, Any], line_num: int, file_path: str
+) -> list[ValidationError]:
+    """Validate a single training record."""
+    errors: list[ValidationError] = []
+
+    # Required fields
+    required = ["id", "image", "conversations", "metadata"]
+    for field in required:
+        if field not in record:
+            errors.append(
+                ValidationError(
+                    file=file_path,
+                    line=line_num,
+                    message=f"Missing required field: {field}",
+                )
+            )
+
+    # Validate image path format
+    if "image" in record:
+        image = record["image"]
+        if not isinstance(image, str) or not image.startswith("images/"):
+            errors.append(
+                ValidationError(
+                    file=file_path,
+                    line=line_num,
+                    message=f"Invalid image path: {image} (must start with 'images/')",
+                )
+            )
+
+    # Validate conversations structure
+    if "conversations" in record:
+        convs = record["conversations"]
+        if not isinstance(convs, list) or len(convs) != 2:
+            errors.append(
+                ValidationError(
+                    file=file_path,
+                    line=line_num,
+                    message="conversations must be array of 2 items",
+                )
+            )
+        else:
+            # Validate human turn
+            if convs[0].get("from") != "human":
+                errors.append(
+                    ValidationError(
+                        file=file_path,
+                        line=line_num,
+                        message="First conversation turn must be from 'human'",
+                    )
+                )
+            human_value = convs[0].get("value", "")
+            if not human_value.startswith("<image>\n"):
+                errors.append(
+                    ValidationError(
+                        file=file_path,
+                        line=line_num,
+                        message="Human value must start with '<image>\\n'",
+                    )
+                )
+
+            # Validate gpt turn
+            if convs[1].get("from") != "gpt":
+                errors.append(
+                    ValidationError(
+                        file=file_path,
+                        line=line_num,
+                        message="Second conversation turn must be from 'gpt'",
+                    )
+                )
+            gpt_value = convs[1].get("value", "")
+            if not gpt_value.startswith("<tool_call>"):
+                errors.append(
+                    ValidationError(
+                        file=file_path,
+                        line=line_num,
+                        message="GPT value must start with '<tool_call>'",
+                    )
+                )
+
+    # Validate metadata
+    if "metadata" in record:
+        metadata = record["metadata"]
+        if not isinstance(metadata, dict):
+            errors.append(
+                ValidationError(
+                    file=file_path,
+                    line=line_num,
+                    message="metadata must be an object",
+                )
+            )
+        else:
+            if "task_type" not in metadata:
+                errors.append(
+                    ValidationError(
+                        file=file_path,
+                        line=line_num,
+                        message="metadata missing required field: task_type",
+                    )
+                )
+            if "real_coords" not in metadata:
+                errors.append(
+                    ValidationError(
+                        file=file_path,
+                        line=line_num,
+                        message="metadata missing required field: real_coords",
+                    )
+                )
+            elif not isinstance(metadata["real_coords"], list) or len(metadata["real_coords"]) != 2:
+                errors.append(
+                    ValidationError(
+                        file=file_path,
+                        line=line_num,
+                        message="metadata.real_coords must be [x, y] array",
+                    )
+                )
+
+    return errors
+
+
+def validate_training_records(jsonl_path: Path) -> list[ValidationError]:
+    """Validate training records in a JSONL file.
+
+    Args:
+        jsonl_path: Path to train.jsonl, val.jsonl, or data.jsonl.
+
+    Returns:
+        List of validation errors (empty if valid).
+    """
+    errors: list[ValidationError] = []
+    file_str = str(jsonl_path)
+
+    if not jsonl_path.exists():
+        errors.append(
+            ValidationError(file=file_str, line=None, message="File not found")
+        )
+        return errors
+
+    with open(jsonl_path, encoding="utf-8") as f:
+        for line_num, line in enumerate(f, start=1):
+            line = line.strip()
+            if not line:
+                continue
+
+            try:
+                record = json.loads(line)
+            except json.JSONDecodeError as e:
+                errors.append(
+                    ValidationError(
+                        file=file_str,
+                        line=line_num,
+                        message=f"Invalid JSON: {e}",
+                    )
+                )
+                continue
+
+            errors.extend(_validate_train_record(record, line_num, file_str))
+
+    return errors
+
+
+def _validate_test_record(
+    record: dict[str, Any], index: int, file_path: str
+) -> list[ValidationError]:
+    """Validate a single test record."""
+    errors: list[ValidationError] = []
+
+    # Required fields
+    required = ["test_id", "screenshot", "prompt", "expected_action", "tolerance", "metadata"]
+    for field in required:
+        if field not in record:
+            errors.append(
+                ValidationError(
+                    file=file_path,
+                    line=index,
+                    message=f"Missing required field: {field}",
+                )
+            )
+
+    # Validate screenshot path format
+    if "screenshot" in record:
+        screenshot = record["screenshot"]
+        if not isinstance(screenshot, str) or not screenshot.startswith("images/"):
+            errors.append(
+                ValidationError(
+                    file=file_path,
+                    line=index,
+                    message=f"Invalid screenshot path: {screenshot} (must start with 'images/')",
+                )
+            )
+
+    # Validate expected_action
+    if "expected_action" in record:
+        action = record["expected_action"]
+        if not isinstance(action, dict):
+            errors.append(
+                ValidationError(
+                    file=file_path,
+                    line=index,
+                    message="expected_action must be an object",
+                )
+            )
+        else:
+            if action.get("name") != "computer_use":
+                errors.append(
+                    ValidationError(
+                        file=file_path,
+                        line=index,
+                        message="expected_action.name must be 'computer_use'",
+                    )
+                )
+            if "arguments" not in action:
+                errors.append(
+                    ValidationError(
+                        file=file_path,
+                        line=index,
+                        message="expected_action missing 'arguments'",
+                    )
+                )
+            elif "action" not in action["arguments"]:
+                errors.append(
+                    ValidationError(
+                        file=file_path,
+                        line=index,
+                        message="expected_action.arguments missing 'action'",
+                    )
+                )
+
+    # Validate tolerance
+    if "tolerance" in record:
+        tolerance = record["tolerance"]
+        if not isinstance(tolerance, list) or len(tolerance) != 2:
+            errors.append(
+                ValidationError(
+                    file=file_path,
+                    line=index,
+                    message="tolerance must be [tol_x, tol_y] array",
+                )
+            )
+
+    # Validate metadata
+    if "metadata" in record:
+        metadata = record["metadata"]
+        if not isinstance(metadata, dict):
+            errors.append(
+                ValidationError(
+                    file=file_path,
+                    line=index,
+                    message="metadata must be an object",
+                )
+            )
+        elif "task_type" not in metadata:
+            errors.append(
+                ValidationError(
+                    file=file_path,
+                    line=index,
+                    message="metadata missing required field: task_type",
+                )
+            )
+
+    return errors
+
+
+def validate_test_records(json_path: Path) -> list[ValidationError]:
+    """Validate test records in test.json.
+
+    Args:
+        json_path: Path to test/test.json.
+
+    Returns:
+        List of validation errors (empty if valid).
+    """
+    errors: list[ValidationError] = []
+    file_str = str(json_path)
+
+    if not json_path.exists():
+        errors.append(
+            ValidationError(file=file_str, line=None, message="File not found")
+        )
+        return errors
+
+    try:
+        with open(json_path, encoding="utf-8") as f:
+            records = json.load(f)
+    except json.JSONDecodeError as e:
+        errors.append(
+            ValidationError(file=file_str, line=None, message=f"Invalid JSON: {e}")
+        )
+        return errors
+
+    if not isinstance(records, list):
+        errors.append(
+            ValidationError(
+                file=file_str,
+                line=None,
+                message="test.json must be a JSON array",
+            )
+        )
+        return errors
+
+    for index, record in enumerate(records):
+        errors.extend(_validate_test_record(record, index, file_str))
+
+    return errors
+
+
+def validate_image_paths(dataset_path: Path) -> list[ValidationError]:
+    """Validate that all image paths in JSONL/JSON files exist.
+
+    Args:
+        dataset_path: Path to dataset root directory.
+
+    Returns:
+        List of validation errors (empty if valid).
+    """
+    errors: list[ValidationError] = []
+
+    # Check training images
+    for jsonl_name in ["train.jsonl", "val.jsonl"]:
+        jsonl_path = dataset_path / jsonl_name
+        if not jsonl_path.exists():
+            continue
+
+        with open(jsonl_path, encoding="utf-8") as f:
+            for line_num, line in enumerate(f, start=1):
+                line = line.strip()
+                if not line:
+                    continue
+
+                try:
+                    record = json.loads(line)
+                except json.JSONDecodeError:
+                    continue  # Already caught by validate_training_records
+
+                image_path = record.get("image", "")
+                full_path = dataset_path / image_path
+                if not full_path.exists():
+                    errors.append(
+                        ValidationError(
+                            file=str(jsonl_path),
+                            line=line_num,
+                            message=f"Image not found: {image_path}",
+                        )
+                    )
+
+    # Check test images
+    test_json_path = dataset_path / "test" / "test.json"
+    if test_json_path.exists():
+        try:
+            with open(test_json_path, encoding="utf-8") as f:
+                records = json.load(f)
+
+            if isinstance(records, list):
+                test_dir = dataset_path / "test"
+                for index, record in enumerate(records):
+                    screenshot = record.get("screenshot", "")
+                    full_path = test_dir / screenshot
+                    if not full_path.exists():
+                        errors.append(
+                            ValidationError(
+                                file=str(test_json_path),
+                                line=index,
+                                message=f"Screenshot not found: {screenshot}",
+                            )
+                        )
+        except json.JSONDecodeError:
+            pass  # Already caught by validate_test_records
+
+    return errors
+
+
+def validate_dataset(dataset_path: Path) -> list[ValidationError]:
+    """Run all validations on a dataset.
+
+    This is the main entry point for dataset validation.
+
+    Args:
+        dataset_path: Path to dataset root directory.
+
+    Returns:
+        List of all validation errors (empty if valid).
+    """
+    errors: list[ValidationError] = []
+
+    # 1. Validate filesystem structure
+    errors.extend(validate_filesystem(dataset_path))
+
+    # 2. Validate training records (only if files exist)
+    for jsonl_name in ["train.jsonl", "val.jsonl"]:
+        jsonl_path = dataset_path / jsonl_name
+        if jsonl_path.exists():
+            errors.extend(validate_training_records(jsonl_path))
+
+    # 3. Validate test records
+    test_json_path = dataset_path / "test" / "test.json"
+    if test_json_path.exists():
+        errors.extend(validate_test_records(test_json_path))
+
+    # 4. Validate image paths (only if basic structure is valid)
+    if not any(e.message.startswith("Missing required") for e in errors):
+        errors.extend(validate_image_paths(dataset_path))
+
+    return errors
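
For context, a minimal usage sketch (not part of the wheel): it runs the top-level validator and constructs a train record that passes the per-record checks. The dataset path, prompt text, task_type value, and coordinates below are hypothetical; the functions and record shape come straight from validate.py above.

import json
from pathlib import Path

from cudag.validation.validate import validate_dataset, validate_training_records

# Full-dataset check: an empty list means layout, records, and image paths are valid.
for err in validate_dataset(Path("datasets/example")):  # hypothetical path
    print(err)  # ValidationError.__str__ renders "file:line: message"

# A record shaped to satisfy _validate_train_record: image under images/,
# exactly two turns (human then gpt, with the required prefixes), and
# metadata carrying task_type plus an [x, y] real_coords pair.
record = {
    "id": "sample-0001",
    "image": "images/sample-0001.png",
    "conversations": [
        {"from": "human", "value": "<image>\nClick the Save button."},  # hypothetical prompt
        {"from": "gpt", "value": "<tool_call>...</tool_call>"},
    ],
    "metadata": {"task_type": "grounding", "real_coords": [412, 387]},  # hypothetical values
}
tmp = Path("sample.jsonl")
tmp.write_text(json.dumps(record) + "\n", encoding="utf-8")
assert validate_training_records(tmp) == []  # no errors for this record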