cudag-0.3.10-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cudag/__init__.py +334 -0
- cudag/annotation/__init__.py +77 -0
- cudag/annotation/codegen.py +648 -0
- cudag/annotation/config.py +545 -0
- cudag/annotation/loader.py +342 -0
- cudag/annotation/scaffold.py +121 -0
- cudag/annotation/transcription.py +296 -0
- cudag/cli/__init__.py +5 -0
- cudag/cli/main.py +315 -0
- cudag/cli/new.py +873 -0
- cudag/core/__init__.py +364 -0
- cudag/core/button.py +137 -0
- cudag/core/canvas.py +222 -0
- cudag/core/config.py +70 -0
- cudag/core/coords.py +233 -0
- cudag/core/data_grid.py +804 -0
- cudag/core/dataset.py +678 -0
- cudag/core/distribution.py +136 -0
- cudag/core/drawing.py +75 -0
- cudag/core/fonts.py +156 -0
- cudag/core/generator.py +163 -0
- cudag/core/grid.py +367 -0
- cudag/core/grounding_task.py +247 -0
- cudag/core/icon.py +207 -0
- cudag/core/iconlist_task.py +301 -0
- cudag/core/models.py +1251 -0
- cudag/core/random.py +130 -0
- cudag/core/renderer.py +190 -0
- cudag/core/screen.py +402 -0
- cudag/core/scroll_task.py +254 -0
- cudag/core/scrollable_grid.py +447 -0
- cudag/core/state.py +110 -0
- cudag/core/task.py +293 -0
- cudag/core/taskbar.py +350 -0
- cudag/core/text.py +212 -0
- cudag/core/utils.py +82 -0
- cudag/data/surnames.txt +5000 -0
- cudag/modal_apps/__init__.py +4 -0
- cudag/modal_apps/archive.py +103 -0
- cudag/modal_apps/extract.py +138 -0
- cudag/modal_apps/preprocess.py +529 -0
- cudag/modal_apps/upload.py +317 -0
- cudag/prompts/SYSTEM_PROMPT.txt +104 -0
- cudag/prompts/__init__.py +33 -0
- cudag/prompts/system.py +43 -0
- cudag/prompts/tools.py +382 -0
- cudag/py.typed +0 -0
- cudag/schemas/filesystem.json +90 -0
- cudag/schemas/test_record.schema.json +113 -0
- cudag/schemas/train_record.schema.json +90 -0
- cudag/server/__init__.py +21 -0
- cudag/server/app.py +232 -0
- cudag/server/services/__init__.py +9 -0
- cudag/server/services/generator.py +128 -0
- cudag/templates/scripts/archive.sh +35 -0
- cudag/templates/scripts/build.sh +13 -0
- cudag/templates/scripts/extract.sh +54 -0
- cudag/templates/scripts/generate.sh +116 -0
- cudag/templates/scripts/pre-commit.sh +44 -0
- cudag/templates/scripts/preprocess.sh +46 -0
- cudag/templates/scripts/upload.sh +63 -0
- cudag/templates/scripts/verify.py +428 -0
- cudag/validation/__init__.py +35 -0
- cudag/validation/validate.py +508 -0
- cudag-0.3.10.dist-info/METADATA +570 -0
- cudag-0.3.10.dist-info/RECORD +69 -0
- cudag-0.3.10.dist-info/WHEEL +4 -0
- cudag-0.3.10.dist-info/entry_points.txt +2 -0
- cudag-0.3.10.dist-info/licenses/LICENSE +66 -0
cudag/modal_apps/archive.py
@@ -0,0 +1,103 @@
+#!/usr/bin/env python3
+# Copyright (c) 2025 Tylt LLC. All rights reserved.
+# CONFIDENTIAL AND PROPRIETARY. Unauthorized use, copying, or distribution
+# is strictly prohibited. For licensing inquiries: hello@claimhawk.app
+
+"""Archive datasets from Modal volume to archive volume.
+
+Compresses the raw dataset into a .tgz archive.
+
+Volume structure:
+    claimhawk-archives/
+        datasets/[ds-name].tgz
+        loras/[ds-name]/[run-name].tgz
+"""
+from __future__ import annotations
+
+import sys
+
+import modal
+
+# =============================================================================
+# CENTRALIZED CONFIGURATION
+# =============================================================================
+# Volume names are loaded from config/adapters.yaml via the SDK.
+# Users can customize these by editing the YAML file.
+
+try:
+    from sdk.modal_compat import get_volume_name
+    TRAINING_VOLUME = get_volume_name("lora_training")
+    ARCHIVE_VOLUME = get_volume_name("archives")
+except ImportError:
+    # Fallback when SDK not available
+    TRAINING_VOLUME = "claimhawk-lora-training"
+    ARCHIVE_VOLUME = "claimhawk-archives"
+
+
+def _get_generator_name() -> str:
+    """Extract generator name from --ds-name arg for dynamic app naming."""
+    for i, arg in enumerate(sys.argv):
+        if arg == "--ds-name" and i + 1 < len(sys.argv):
+            ds_name = sys.argv[i + 1]
+            return ds_name.split("-")[0] if ds_name else "cudag"
+    return "cudag"
+
+
+app = modal.App(f"{_get_generator_name()}-archive")
+training_vol = modal.Volume.from_name(TRAINING_VOLUME, create_if_missing=True)
+archive_vol = modal.Volume.from_name(ARCHIVE_VOLUME, create_if_missing=True)
+
+
+@app.function(
+    volumes={
+        "/training": training_vol,
+        "/archive": archive_vol,
+    },
+    timeout=1800,  # 30 min for large datasets
+)
+def archive_dataset(ds_name: str) -> str:
+    """Archive a dataset to the archive volume.
+
+    Reads from:
+        /training/datasets/[ds_name]/
+
+    Writes to:
+        /archive/datasets/[ds_name].tgz
+    """
+    import tarfile
+    from pathlib import Path
+
+    dataset_path = Path(f"/training/datasets/{ds_name}")
+    archive_dir = Path("/archive/datasets")
+    archive_path = archive_dir / f"{ds_name}.tgz"
+
+    # Verify source path exists
+    if not dataset_path.exists():
+        raise FileNotFoundError(f"Dataset not found: {dataset_path}")
+
+    # Create archive directory
+    archive_dir.mkdir(parents=True, exist_ok=True)
+
+    print(f"Creating archive: {ds_name}.tgz")
+    print(f"  Dataset: {dataset_path}")
+
+    # Create tar.gz archive
+    with tarfile.open(archive_path, "w:gz") as tar:
+        tar.add(dataset_path, arcname=ds_name)
+
+    # Get archive size
+    size_mb = archive_path.stat().st_size / (1024 * 1024)
+    print(f"Archive size: {size_mb:.1f} MB")
+
+    # Commit to volume
+    archive_vol.commit()
+
+    print(f"Archived to: /archive/datasets/{ds_name}.tgz")
+    return str(archive_path)
+
+
+@app.local_entrypoint()
+def main(ds_name: str) -> None:
+    """Archive a dataset."""
+    result = archive_dataset.remote(ds_name)
+    print(f"Archive complete: {result}")
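A note on the dynamic app naming above: `_get_generator_name()` runs at import time, before Modal parses the entrypoint arguments, so it scans `sys.argv` directly for `--ds-name` (the flag `modal run` derives from the `main(ds_name)` parameter) and takes the prefix before the first hyphen. A minimal sketch of that behavior, using hypothetical dataset names not taken from this package:

    import sys

    def _get_generator_name() -> str:  # copied from archive.py above
        for i, arg in enumerate(sys.argv):
            if arg == "--ds-name" and i + 1 < len(sys.argv):
                ds_name = sys.argv[i + 1]
                return ds_name.split("-")[0] if ds_name else "cudag"
        return "cudag"

    # Hypothetical invocations, for illustration only.
    sys.argv = ["archive.py", "--ds-name", "calendar-20250101"]
    assert _get_generator_name() == "calendar"  # app name: "calendar-archive"
    sys.argv = ["archive.py"]
    assert _get_generator_name() == "cudag"     # fallback: "cudag-archive"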
cudag/modal_apps/extract.py
@@ -0,0 +1,138 @@
+#!/usr/bin/env python3
+# Copyright (c) 2025 Tylt LLC. All rights reserved.
+# CONFIDENTIAL AND PROPRIETARY. Unauthorized use, copying, or distribution
+# is strictly prohibited. For licensing inquiries: hello@claimhawk.app
+
+"""Modal function to extract uploaded dataset archives (single or chunked) on a volume.
+
+Pipeline: upload_dataset -> modal_extract -> preprocess
+"""
+from __future__ import annotations
+
+import sys
+
+import modal
+
+# =============================================================================
+# CENTRALIZED CONFIGURATION
+# =============================================================================
+# Volume names are loaded from config/adapters.yaml via the SDK.
+# Users can customize these by editing the YAML file.
+
+try:
+    from sdk.modal_compat import get_volume_name
+    DEFAULT_VOLUME = get_volume_name("lora_training")
+except ImportError:
+    # Fallback when SDK not available
+    DEFAULT_VOLUME = "claimhawk-lora-training"
+
+
+def _get_generator_name() -> str:
+    """Extract generator name from --dataset-name arg for dynamic app naming."""
+    for i, arg in enumerate(sys.argv):
+        if arg == "--dataset-name" and i + 1 < len(sys.argv):
+            ds_name = sys.argv[i + 1]
+            return ds_name.split("-")[0] if ds_name else "cudag"
+    return "cudag"
+
+
+app = modal.App(f"{_get_generator_name()}-extract")
+VOLUME = modal.Volume.from_name(DEFAULT_VOLUME, create_if_missing=True)
+
+
+@app.function(volumes={"/data": VOLUME}, timeout=600)
+def extract(ds_name: str) -> str:
+    """Extract a tarball (single or chunked) on the Modal volume."""
+    import json
+    import shutil
+    import tarfile
+    from pathlib import Path
+
+    datasets_dir = Path("/data/datasets")
+    datasets_dir.mkdir(parents=True, exist_ok=True)
+
+    chunks_dir = datasets_dir / f"{ds_name}_chunks"
+    legacy_archive = datasets_dir / f"{ds_name}.tar.gz"
+    extract_dir = datasets_dir
+
+    # Check for chunked upload first
+    if chunks_dir.exists():
+        manifest_path = chunks_dir / f"{ds_name}.manifest.json"
+
+        if not manifest_path.exists():
+            raise FileNotFoundError(f"Manifest not found: {manifest_path}")
+
+        with open(manifest_path) as f:
+            manifest = json.load(f)
+
+        num_chunks = manifest["num_chunks"]
+        print(f"Reassembling {num_chunks} chunks...")
+
+        # Reassemble the archive from chunks
+        reassembled_path = datasets_dir / f"{ds_name}.tar.gz"
+
+        # Handle single-chunk case (archive wasn't split)
+        if num_chunks == 1:
+            # Find the single chunk (could be .tar.gz or other naming)
+            chunk_names = list(manifest["chunks"].keys())
+            if chunk_names:
+                chunk_path = chunks_dir / chunk_names[0]
+                if chunk_path.exists():
+                    print(f"  Moving single chunk {chunk_path.name}")
+                    shutil.copy2(chunk_path, reassembled_path)
+                else:
+                    raise FileNotFoundError(f"Chunk not found: {chunk_path}")
+            else:
+                raise FileNotFoundError("No chunks found in manifest")
+        else:
+            with open(reassembled_path, "wb") as outfile:
+                for i in range(num_chunks):
+                    # Try to find the chunk with different naming patterns
+                    chunk_path = None
+                    for name in manifest["chunks"]:
+                        if f"part{i:03d}" in name:
+                            chunk_path = chunks_dir / name
+                            break
+
+                    if chunk_path is None or not chunk_path.exists():
+                        raise FileNotFoundError(f"Chunk {i} not found")
+
+                    print(f"  Adding {chunk_path.name}")
+                    with open(chunk_path, "rb") as chunk:
+                        outfile.write(chunk.read())
+
+        # Extract the reassembled archive
+        print("Extracting archive...")
+        with tarfile.open(reassembled_path, "r:gz") as tar:
+            tar.extractall(path=extract_dir, filter="data")
+
+        # Cleanup: remove chunks directory and reassembled archive
+        shutil.rmtree(chunks_dir)
+        reassembled_path.unlink()
+        VOLUME.commit()
+        print(f"Extracted {ds_name} to /data/datasets/{ds_name}")
+
+    # Fall back to legacy single-file archive
+    elif legacy_archive.exists():
+        print("Extracting single archive...")
+        with tarfile.open(legacy_archive, "r:gz") as tar:
+            tar.extractall(path=extract_dir, filter="data")
+
+        legacy_archive.unlink()
+        VOLUME.commit()
+        print(f"Extracted {ds_name} to /data/datasets/{ds_name}")
+
+    else:
+        raise FileNotFoundError(
+            f"No archive found for {ds_name}. "
+            f"Checked: {chunks_dir} and {legacy_archive}"
+        )
+
+    return ds_name
+
+
+@app.local_entrypoint()
+def main(dataset_name: str) -> None:
+    """Entry point for modal run command."""
+    result = extract.remote(dataset_name)
+    print(f"Extraction complete: {result}")
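The manifest format `extract` expects is implied by the reader above: a top-level `num_chunks` count plus a `chunks` mapping keyed by chunk filename, where multi-chunk uploads embed a `part{i:03d}` marker in each name (the writer presumably lives in `cudag/modal_apps/upload.py`, per the pipeline note in the docstring). A minimal sketch of a manifest this reader would accept; the filenames and sizes are hypothetical, and only `num_chunks` and the `chunks` keys are actually read:

    import json

    # Hypothetical manifest for a dataset split into three chunks.
    # Per-chunk values (e.g. size_bytes) pass through extract() unused.
    manifest = {
        "num_chunks": 3,
        "chunks": {
            "mydata.tar.gz.part000": {"size_bytes": 104857600},
            "mydata.tar.gz.part001": {"size_bytes": 104857600},
            "mydata.tar.gz.part002": {"size_bytes": 52428800},
        },
    }

    # extract() reads num_chunks, locates each chunk key by its
    # part{i:03d} substring, and concatenates the chunks in order.
    with open("mydata.manifest.json", "w") as f:
        json.dump(manifest, f, indent=2)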