cudag-0.3.10-py3-none-any.whl
This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects changes between package versions as they appear in their public registries.
- cudag/__init__.py +334 -0
- cudag/annotation/__init__.py +77 -0
- cudag/annotation/codegen.py +648 -0
- cudag/annotation/config.py +545 -0
- cudag/annotation/loader.py +342 -0
- cudag/annotation/scaffold.py +121 -0
- cudag/annotation/transcription.py +296 -0
- cudag/cli/__init__.py +5 -0
- cudag/cli/main.py +315 -0
- cudag/cli/new.py +873 -0
- cudag/core/__init__.py +364 -0
- cudag/core/button.py +137 -0
- cudag/core/canvas.py +222 -0
- cudag/core/config.py +70 -0
- cudag/core/coords.py +233 -0
- cudag/core/data_grid.py +804 -0
- cudag/core/dataset.py +678 -0
- cudag/core/distribution.py +136 -0
- cudag/core/drawing.py +75 -0
- cudag/core/fonts.py +156 -0
- cudag/core/generator.py +163 -0
- cudag/core/grid.py +367 -0
- cudag/core/grounding_task.py +247 -0
- cudag/core/icon.py +207 -0
- cudag/core/iconlist_task.py +301 -0
- cudag/core/models.py +1251 -0
- cudag/core/random.py +130 -0
- cudag/core/renderer.py +190 -0
- cudag/core/screen.py +402 -0
- cudag/core/scroll_task.py +254 -0
- cudag/core/scrollable_grid.py +447 -0
- cudag/core/state.py +110 -0
- cudag/core/task.py +293 -0
- cudag/core/taskbar.py +350 -0
- cudag/core/text.py +212 -0
- cudag/core/utils.py +82 -0
- cudag/data/surnames.txt +5000 -0
- cudag/modal_apps/__init__.py +4 -0
- cudag/modal_apps/archive.py +103 -0
- cudag/modal_apps/extract.py +138 -0
- cudag/modal_apps/preprocess.py +529 -0
- cudag/modal_apps/upload.py +317 -0
- cudag/prompts/SYSTEM_PROMPT.txt +104 -0
- cudag/prompts/__init__.py +33 -0
- cudag/prompts/system.py +43 -0
- cudag/prompts/tools.py +382 -0
- cudag/py.typed +0 -0
- cudag/schemas/filesystem.json +90 -0
- cudag/schemas/test_record.schema.json +113 -0
- cudag/schemas/train_record.schema.json +90 -0
- cudag/server/__init__.py +21 -0
- cudag/server/app.py +232 -0
- cudag/server/services/__init__.py +9 -0
- cudag/server/services/generator.py +128 -0
- cudag/templates/scripts/archive.sh +35 -0
- cudag/templates/scripts/build.sh +13 -0
- cudag/templates/scripts/extract.sh +54 -0
- cudag/templates/scripts/generate.sh +116 -0
- cudag/templates/scripts/pre-commit.sh +44 -0
- cudag/templates/scripts/preprocess.sh +46 -0
- cudag/templates/scripts/upload.sh +63 -0
- cudag/templates/scripts/verify.py +428 -0
- cudag/validation/__init__.py +35 -0
- cudag/validation/validate.py +508 -0
- cudag-0.3.10.dist-info/METADATA +570 -0
- cudag-0.3.10.dist-info/RECORD +69 -0
- cudag-0.3.10.dist-info/WHEEL +4 -0
- cudag-0.3.10.dist-info/entry_points.txt +2 -0
- cudag-0.3.10.dist-info/licenses/LICENSE +66 -0
cudag/templates/scripts/preprocess.sh
@@ -0,0 +1,46 @@
+#!/usr/bin/env bash
+# Copyright (c) 2025 Tylt LLC. All rights reserved.
+# Derivative works may be released by researchers,
+# but original files may not be redistributed or used beyond research purposes.
+#
+# DO NOT EDIT THIS FILE - Generated by cudag framework
+
+# Pipeline: generate.sh -> upload.sh -> extract.sh -> preprocess.sh
+#
+# Usage:
+#   ./scripts/preprocess.sh --dataset-name <NAME>
+
+set -euo pipefail
+
+DATASET_NAME=""
+
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+    --dataset-name)
+      DATASET_NAME="${2:-}"
+      shift 2
+      ;;
+    *)
+      shift
+      ;;
+  esac
+done
+
+if [[ -z "$DATASET_NAME" ]]; then
+  echo "Error: --dataset-name <NAME> is required"
+  exit 1
+fi
+
+echo "========================================"
+echo "STAGE 4: Preprocess Dataset"
+echo "========================================"
+echo ""
+echo "Dataset: $DATASET_NAME"
+echo ""
+
+# Find cudag's preprocess.py location and run via Modal
+CUDAG_PATH=$(uvx --with cudag python -c "import cudag.modal_apps.preprocess as p; print(p.__file__)")
+uvx modal run --detach "$CUDAG_PATH" --dataset-name "$DATASET_NAME"
+
+echo ""
+echo "Preprocessing job started for: $DATASET_NAME"
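Note on the CUDAG_PATH lookup above: importing a module and printing its `__file__` resolves to the installed copy's on-disk path, which is what `modal run` needs. A minimal standard-library sketch of the same resolution, assuming cudag is installed in the current environment:

from importlib.util import find_spec

# find_spec resolves "cudag.modal_apps.preprocess" the same way `import`
# would and exposes the source path as spec.origin -- the path that
# preprocess.sh hands to `modal run`.
spec = find_spec("cudag.modal_apps.preprocess")
if spec is None or spec.origin is None:
    raise SystemExit("cudag is not installed in this environment")
print(spec.origin)  # e.g. .../site-packages/cudag/modal_apps/preprocess.py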
cudag/templates/scripts/upload.sh
@@ -0,0 +1,63 @@
+#!/usr/bin/env bash
+# Copyright (c) 2025 Tylt LLC. All rights reserved.
+# Derivative works may be released by researchers,
+# but original files may not be redistributed or used beyond research purposes.
+#
+# DO NOT EDIT THIS FILE - Generated by cudag framework
+
+# Usage:
+#   ./scripts/upload.sh [dataset_dir]        # Upload to Modal volume
+#   ./scripts/upload.sh --dry [dataset_dir]  # Dry run, show what would be uploaded
+
+set -euo pipefail
+
+DRY_RUN=false
+DATASET_DIR=""
+
+# Parse args
+for arg in "$@"; do
+  if [[ "$arg" == "--dry" ]]; then
+    DRY_RUN=true
+  elif [[ -z "$DATASET_DIR" && ! "$arg" =~ ^-- ]]; then
+    DATASET_DIR="$arg"
+  fi
+done
+
+echo "========================================"
+echo "STAGE 2: Upload Dataset"
+echo "========================================"
+echo ""
+
+if [[ -z "$DATASET_DIR" ]]; then
+  # Find most recent dataset
+  DATASET_DIR=$(ls -td datasets/*/ 2>/dev/null | head -1)
+  if [[ -z "$DATASET_DIR" ]]; then
+    echo "No dataset directory found. Specify path or run generate.sh first."
+    exit 1
+  fi
+fi
+
+DATASET_NAME=$(basename "$DATASET_DIR")
+echo "Dataset: $DATASET_NAME"
+echo "Path: $DATASET_DIR"
+echo ""
+
+if [[ "$DRY_RUN" == "true" ]]; then
+  echo "[DRY RUN] Would upload: $DATASET_DIR"
+  exit 0
+fi
+
+# Upload via Modal
+echo "Uploading to Modal volume..."
+uvx --with cudag python -m cudag.modal_apps.upload "$DATASET_DIR"
+
+echo ""
+echo "Upload complete: $DATASET_NAME"
+
+echo ""
+echo "========================================"
+echo "Auto-starting extraction..."
+echo "========================================"
+echo ""
+
+exec ./scripts/extract.sh --dataset-name "$DATASET_NAME"
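When no directory argument is given, upload.sh falls back to the newest entry under datasets/ via `ls -td datasets/*/ | head -1`. A rough pathlib equivalent of that fallback, for illustration only (verify.py below ships its own find_latest_dataset):

from pathlib import Path

def latest_dataset(root: Path = Path("datasets")) -> Path | None:
    # Newest dataset directory by modification time, like `ls -td ... | head -1`.
    dirs = [p for p in root.iterdir() if p.is_dir()] if root.exists() else []
    return max(dirs, key=lambda p: p.stat().st_mtime, default=None)

print(latest_dataset())  # None when datasets/ is missing or empty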
cudag/templates/scripts/verify.py
@@ -0,0 +1,428 @@
+#!/usr/bin/env python3
+# Copyright (c) 2025 Tylt LLC. All rights reserved.
+# CONFIDENTIAL AND PROPRIETARY. Unauthorized use, copying, or distribution
+# is strictly prohibited. For licensing inquiries: hello@claimhawk.app
+
+"""Interactive dataset verification and configuration tool.
+
+Usage:
+    python scripts/verify.py --config config/dataset.yaml
+    python scripts/verify.py --existing datasets/my-dataset
+
+Allows interactive modification of:
+- Task counts (training samples per task type)
+- Test distribution (tests per task type)
+- Train/val split ratio
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import shutil
+import subprocess
+import sys
+from collections import Counter
+from pathlib import Path
+
+import yaml
+
+
+def load_config(config_path: Path) -> dict:
+    """Load dataset configuration from YAML."""
+    with open(config_path) as f:
+        return yaml.safe_load(f)
+
+
+def save_config(config_path: Path, config: dict) -> None:
+    """Save dataset configuration to YAML."""
+    with open(config_path, "w") as f:
+        yaml.dump(config, f, default_flow_style=False, sort_keys=False)
+
+
+def analyze_dataset(dataset_dir: Path) -> dict:
+    """Analyze a generated dataset and return stats."""
+    stats = {
+        "dataset_dir": str(dataset_dir),
+        "training": {},
+        "tests": {},
+        "images": 0,
+    }
+
+    # Count training samples by task type
+    data_path = dataset_dir / "data.jsonl"
+    if data_path.exists():
+        task_counts: Counter[str] = Counter()
+        with open(data_path) as f:
+            for line in f:
+                record = json.loads(line)
+                task_type = record.get("metadata", {}).get("task_type", "unknown")
+                task_counts[task_type] += 1
+        stats["training"] = dict(task_counts)
+
+    # Count training images
+    images_dir = dataset_dir / "images"
+    if images_dir.exists():
+        stats["images"] = len(list(images_dir.glob("*")))
+
+    # Count tests by task type
+    test_json = dataset_dir / "test" / "test.json"
+    if test_json.exists():
+        with open(test_json) as f:
+            tests = json.load(f)
+        test_counts: Counter[str] = Counter()
+        for test in tests:
+            task_type = test.get("metadata", {}).get("task_type", "unknown")
+            test_counts[task_type] += 1
+        stats["tests"] = dict(test_counts)
+
+    return stats
+
+
+def print_stats(stats: dict, config: dict | None = None) -> None:
+    """Print dataset statistics in a readable format."""
+    print("\n" + "=" * 60)
+    print("DATASET STATISTICS")
+    print("=" * 60)
+
+    if config:
+        print(f"\nConfig: {config.get('name_prefix', 'unknown')}")
+        print(f"Seed: {config.get('seed', 'N/A')}")
+
+    print(f"\nDataset: {stats['dataset_dir']}")
+    print(f"Total images: {stats['images']}")
+
+    # Training samples
+    print("\n--- TRAINING SAMPLES ---")
+    training = stats.get("training", {})
+    total_training = sum(training.values())
+    print(f"Total: {total_training}")
+
+    if training:
+        max_name_len = max(len(name) for name in training.keys())
+        for task_type, count in sorted(training.items()):
+            pct = (count / total_training * 100) if total_training > 0 else 0
+            bar = "#" * int(pct / 2)
+            print(f"  {task_type:<{max_name_len}} : {count:>5} ({pct:5.1f}%) {bar}")
+
+    # Test samples
+    print("\n--- TEST SAMPLES ---")
+    tests = stats.get("tests", {})
+    total_tests = sum(tests.values())
+    print(f"Total: {total_tests}")
+
+    if tests:
+        max_name_len = max(len(name) for name in tests.keys())
+        for task_type, count in sorted(tests.items()):
+            pct = (count / total_tests * 100) if total_tests > 0 else 0
+            print(f"  {task_type:<{max_name_len}} : {count:>3} ({pct:5.1f}%)")
+
+    print("=" * 60)
+
+
+def prompt_yes_no(question: str, default: bool = True) -> bool:
+    """Prompt user for yes/no answer."""
+    suffix = " [Y/n]: " if default else " [y/N]: "
+    while True:
+        response = input(question + suffix).strip().lower()
+        if not response:
+            return default
+        if response in ("y", "yes"):
+            return True
+        if response in ("n", "no"):
+            return False
+        print("Please answer 'y' or 'n'")
+
+
+def prompt_int(question: str, default: int | None = None) -> int:
+    """Prompt user for integer input."""
+    suffix = f" [{default}]: " if default is not None else ": "
+    while True:
+        response = input(question + suffix).strip()
+        if not response and default is not None:
+            return default
+        try:
+            return int(response)
+        except ValueError:
+            print("Please enter a valid integer")
+
+
+def edit_task_counts(config: dict) -> tuple[dict, bool]:
+    """Interactive editor for task counts. Returns (config, changed)."""
+    tasks = config.get("tasks", {})
+
+    print("\n--- EDIT TASK COUNTS ---")
+    print("Enter new count for each task type (press Enter to keep current):\n")
+
+    new_tasks = {}
+    for task_type, current_count in tasks.items():
+        new_count = prompt_int(f"  {task_type}", default=current_count)
+        new_tasks[task_type] = new_count
+
+    # Check for new tasks
+    if prompt_yes_no("\nAdd a new task type?", default=False):
+        task_type = input("  Task type name: ").strip()
+        count = prompt_int(f"  {task_type} count", default=100)
+        new_tasks[task_type] = count
+
+    changed = new_tasks != tasks
+    config["tasks"] = new_tasks
+    return config, changed
+
+
+def calc_auto_test_distribution(task_types: list[str], total: int) -> dict[str, int]:
+    """Calculate auto-distribution: 3 each for scroll/click, rest for select."""
+    dist: dict[str, int] = {}
+    simple_tasks = [t for t in task_types if t.startswith("scroll-") or t.startswith("click-")]
+    select_tasks = [t for t in task_types if t.startswith("select-")]
+
+    simple_per_task = 3
+    simple_total = len(simple_tasks) * simple_per_task
+    remaining = max(0, total - simple_total)
+
+    for task_type in simple_tasks:
+        dist[task_type] = simple_per_task
+
+    if select_tasks and remaining > 0:
+        per_select = remaining // len(select_tasks)
+        remainder = remaining % len(select_tasks)
+        for i, task_type in enumerate(select_tasks):
+            dist[task_type] = per_select + (1 if i < remainder else 0)
+    else:
+        for task_type in select_tasks:
+            dist[task_type] = 0
+
+    return dist
+
+
+def edit_test_distribution(config: dict) -> tuple[dict, bool]:
+    """Interactive editor for test distribution. Returns (config, changed)."""
+    test_config = config.get("test", {})
+    total_tests = test_config.get("count", 100)
+    current_dist = test_config.get("distribution", {})
+    task_types = list(config.get("tasks", {}).keys())
+
+    print("\n--- EDIT TEST DISTRIBUTION ---")
+    new_total = prompt_int("Total test count", default=total_tests)
+
+    # Calculate auto values for any tasks not explicitly set
+    auto_dist = calc_auto_test_distribution(task_types, new_total)
+
+    print("\nPer-task test counts (press Enter to keep current):")
+    new_dist: dict[str, int] = {}
+    for task_type in task_types:
+        # Use explicit value if set, otherwise show auto-calculated
+        current = current_dist.get(task_type, auto_dist.get(task_type, 0))
+        new_count = prompt_int(f"  {task_type}", default=current)
+        new_dist[task_type] = new_count
+
+    # Check if anything changed
+    changed = (new_total != total_tests) or (new_dist != current_dist)
+
+    test_config["count"] = new_total
+    test_config["distribution"] = new_dist
+
+    config["test"] = test_config
+    return config, changed
+
+
+def run_generator(config_path: Path, verbose: bool = False) -> Path | None:
+    """Run the generator and return the generated dataset path."""
+    import os
+    import threading
+    import time
+
+    print("\n" + "-" * 40)
+    print("Running generator (this may take a few minutes)...")
+    print("-" * 40 + "\n", flush=True)
+
+    env = os.environ.copy()
+    env["CUDAG_FROM_SCRIPT"] = "1"
+    cmd = ["uv", "run", "python", "generator.py", "--config", str(config_path)]
+
+    if verbose:
+        # Stream output directly
+        result = subprocess.run(cmd, env=env)
+    else:
+        # Spinner for progress indication
+        stop_spinner = threading.Event()
+
+        def spinner() -> None:
+            chars = "⠋⠙⠹⠸⠼⠴⠦⠧⠇⠏"
+            i = 0
+            while not stop_spinner.is_set():
+                print(f"\r{chars[i % len(chars)]} Generating...", end="", flush=True)
+                time.sleep(0.1)
+                i += 1
+            print("\r" + " " * 20 + "\r", end="", flush=True)
+
+        spinner_thread = threading.Thread(target=spinner)
+        spinner_thread.start()
+
+        result = subprocess.run(cmd, capture_output=True, text=True, env=env)
+
+        stop_spinner.set()
+        spinner_thread.join()
+
+        if result.returncode == 0:
+            print(result.stdout)
+        else:
+            print(result.stderr)
+
+    if result.returncode != 0:
+        print("ERROR: Generator failed!")
+        return None
+
+    # Find the generated dataset
+    datasets_dir = Path("datasets")
+    if datasets_dir.exists():
+        datasets = sorted(datasets_dir.iterdir(), key=lambda p: p.stat().st_mtime, reverse=True)
+        if datasets:
+            return datasets[0]
+
+    return None
+
+
+def find_latest_dataset() -> Path | None:
+    """Find the most recently generated dataset."""
+    datasets_dir = Path("datasets")
+    if not datasets_dir.exists():
+        return None
+    datasets = sorted(datasets_dir.iterdir(), key=lambda p: p.stat().st_mtime, reverse=True)
+    return datasets[0] if datasets else None
+
+
+def interactive_loop(
+    config_path: Path, existing_dataset: Path | None = None, verbose: bool = False
+) -> None:
+    """Main interactive loop for dataset verification."""
+    config = load_config(config_path)
+
+    # Use provided dataset or find latest
+    if existing_dataset and existing_dataset.exists():
+        dataset_dir = existing_dataset
+        needs_generation = False
+        print(f"Using existing dataset: {dataset_dir}")
+    else:
+        dataset_dir = find_latest_dataset()
+        needs_generation = dataset_dir is None
+
+    while True:
+        if needs_generation:
+            # Run generator
+            dataset_dir = run_generator(config_path, verbose=verbose)
+            if dataset_dir is None:
+                print("Failed to generate dataset. Please fix errors and try again.")
+                if not prompt_yes_no("Retry?"):
+                    break
+                continue
+            needs_generation = False
+
+        # Analyze and show stats
+        stats = analyze_dataset(dataset_dir)
+        print_stats(stats, config)
+
+        # Ask for approval
+        print("\nOptions:")
+        print("  [a] Approve - dataset looks good")
+        print("  [t] Modify task counts")
+        print("  [d] Modify test distribution")
+        print("  [r] Regenerate with same config")
+        print("  [q] Quit without approving")
+
+        choice = input("\nChoice [a/t/d/r/q]: ").strip().lower()
+
+        if choice == "a":
+            print(f"\nDataset approved: {dataset_dir}")
+            print("Ready for upload with: ./scripts/upload.sh " + str(dataset_dir))
+            break
+
+        elif choice == "t":
+            config, changed = edit_task_counts(config)
+            if changed:
+                save_config(config_path, config)
+                print(f"\nUpdated config saved to {config_path}")
+                # Delete old dataset before regenerating
+                if dataset_dir.exists():
+                    shutil.rmtree(dataset_dir)
+                needs_generation = True
+            else:
+                print("\nNo changes made.")
+
+        elif choice == "d":
+            config, changed = edit_test_distribution(config)
+            if changed:
+                save_config(config_path, config)
+                print(f"\nUpdated config saved to {config_path}")
+                if dataset_dir.exists():
+                    shutil.rmtree(dataset_dir)
+                needs_generation = True
+            else:
+                print("\nNo changes made.")
+
+        elif choice == "r":
+            if dataset_dir.exists():
+                shutil.rmtree(dataset_dir)
+            needs_generation = True
+
+        elif choice == "q":
+            print("\nExiting without approval.")
+            # Clean up generated dataset
+            if dataset_dir.exists() and prompt_yes_no("Delete generated dataset?", default=False):
+                shutil.rmtree(dataset_dir)
+            break
+
+        else:
+            print("Invalid choice, please try again.")
+
+
+def main() -> None:
+    """Main entry point."""
+    parser = argparse.ArgumentParser(
+        description="Interactive dataset verification and configuration tool"
+    )
+    parser.add_argument(
+        "--config",
+        "-c",
+        type=Path,
+        help="Path to dataset config YAML",
+    )
+    parser.add_argument(
+        "--existing",
+        "-e",
+        type=Path,
+        help="Path to existing dataset to verify (skips generation)",
+    )
+    parser.add_argument(
+        "--verbose",
+        "-v",
+        action="store_true",
+        help="Stream generator output instead of showing spinner",
+    )
+    args = parser.parse_args()
+
+    # Determine config path
+    config_path = args.config
+    if not config_path:
+        config_path = Path("config/dataset.prod.yaml")
+        if not config_path.exists():
+            config_path = Path("config/dataset.yaml")
+
+    if not config_path.exists():
+        print(f"ERROR: Config not found: {config_path}")
+        parser.print_help()
+        sys.exit(1)
+
+    if args.existing:
+        # Verify existing dataset
+        if not args.existing.exists():
+            print(f"ERROR: Dataset not found: {args.existing}")
+            sys.exit(1)
+        interactive_loop(config_path, existing_dataset=args.existing, verbose=args.verbose)
+    else:
+        # Default: generate and verify
+        interactive_loop(config_path, verbose=args.verbose)
+
+
+if __name__ == "__main__":
+    main()
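To make calc_auto_test_distribution concrete: scroll-* and click-* tasks are pinned at 3 tests each, and whatever remains of the total is split evenly across select-* tasks, with any leftover handed out one per task from the front of the list. A worked example with hypothetical task names, assuming the function above is in scope:

task_types = ["click-button", "scroll-grid", "select-row", "select-cell"]

# total=20: 2 simple tasks * 3 = 6, remaining 14 -> 7 per select task.
print(calc_auto_test_distribution(task_types, 20))
# {'click-button': 3, 'scroll-grid': 3, 'select-row': 7, 'select-cell': 7}

# total=21: remaining 15 -> 7 each, and the extra 1 goes to the first select task.
print(calc_auto_test_distribution(task_types, 21))
# {'click-button': 3, 'scroll-grid': 3, 'select-row': 8, 'select-cell': 7}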
cudag/validation/__init__.py
@@ -0,0 +1,35 @@
+# Copyright (c) 2025 Tylt LLC. All rights reserved.
+# CONFIDENTIAL AND PROPRIETARY. Unauthorized use, copying, or distribution
+# is strictly prohibited. For licensing inquiries: hello@claimhawk.app
+
+"""Dataset validation module for CUDAG.
+
+This module provides validation for CUDAG datasets to ensure they conform
+to the expected filesystem structure and data schemas.
+
+Example:
+    from cudag.validation import validate_dataset
+
+    errors = validate_dataset(Path("datasets/my-dataset"))
+    if errors:
+        for error in errors:
+            print(f"ERROR: {error}")
+"""
+
+from cudag.validation.validate import (
+    ValidationError,
+    validate_dataset,
+    validate_filesystem,
+    validate_image_paths,
+    validate_test_records,
+    validate_training_records,
+)
+
+__all__ = [
+    "ValidationError",
+    "validate_dataset",
+    "validate_filesystem",
+    "validate_image_paths",
+    "validate_test_records",
+    "validate_training_records",
+]
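A small usage sketch building on the module docstring above: gate a CI or build step on validation, assuming validate_dataset returns a list of error messages for a dataset directory (an empty list meaning the dataset is valid):

#!/usr/bin/env python3
import sys
from pathlib import Path

from cudag.validation import validate_dataset

# Hypothetical gate script: exit nonzero when validation reports errors.
dataset = Path(sys.argv[1]) if len(sys.argv) > 1 else Path("datasets/my-dataset")
errors = validate_dataset(dataset)
for error in errors:
    print(f"ERROR: {error}")
sys.exit(1 if errors else 0)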