cudag 0.3.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cudag/__init__.py +334 -0
- cudag/annotation/__init__.py +77 -0
- cudag/annotation/codegen.py +648 -0
- cudag/annotation/config.py +545 -0
- cudag/annotation/loader.py +342 -0
- cudag/annotation/scaffold.py +121 -0
- cudag/annotation/transcription.py +296 -0
- cudag/cli/__init__.py +5 -0
- cudag/cli/main.py +315 -0
- cudag/cli/new.py +873 -0
- cudag/core/__init__.py +364 -0
- cudag/core/button.py +137 -0
- cudag/core/canvas.py +222 -0
- cudag/core/config.py +70 -0
- cudag/core/coords.py +233 -0
- cudag/core/data_grid.py +804 -0
- cudag/core/dataset.py +678 -0
- cudag/core/distribution.py +136 -0
- cudag/core/drawing.py +75 -0
- cudag/core/fonts.py +156 -0
- cudag/core/generator.py +163 -0
- cudag/core/grid.py +367 -0
- cudag/core/grounding_task.py +247 -0
- cudag/core/icon.py +207 -0
- cudag/core/iconlist_task.py +301 -0
- cudag/core/models.py +1251 -0
- cudag/core/random.py +130 -0
- cudag/core/renderer.py +190 -0
- cudag/core/screen.py +402 -0
- cudag/core/scroll_task.py +254 -0
- cudag/core/scrollable_grid.py +447 -0
- cudag/core/state.py +110 -0
- cudag/core/task.py +293 -0
- cudag/core/taskbar.py +350 -0
- cudag/core/text.py +212 -0
- cudag/core/utils.py +82 -0
- cudag/data/surnames.txt +5000 -0
- cudag/modal_apps/__init__.py +4 -0
- cudag/modal_apps/archive.py +103 -0
- cudag/modal_apps/extract.py +138 -0
- cudag/modal_apps/preprocess.py +529 -0
- cudag/modal_apps/upload.py +317 -0
- cudag/prompts/SYSTEM_PROMPT.txt +104 -0
- cudag/prompts/__init__.py +33 -0
- cudag/prompts/system.py +43 -0
- cudag/prompts/tools.py +382 -0
- cudag/py.typed +0 -0
- cudag/schemas/filesystem.json +90 -0
- cudag/schemas/test_record.schema.json +113 -0
- cudag/schemas/train_record.schema.json +90 -0
- cudag/server/__init__.py +21 -0
- cudag/server/app.py +232 -0
- cudag/server/services/__init__.py +9 -0
- cudag/server/services/generator.py +128 -0
- cudag/templates/scripts/archive.sh +35 -0
- cudag/templates/scripts/build.sh +13 -0
- cudag/templates/scripts/extract.sh +54 -0
- cudag/templates/scripts/generate.sh +116 -0
- cudag/templates/scripts/pre-commit.sh +44 -0
- cudag/templates/scripts/preprocess.sh +46 -0
- cudag/templates/scripts/upload.sh +63 -0
- cudag/templates/scripts/verify.py +428 -0
- cudag/validation/__init__.py +35 -0
- cudag/validation/validate.py +508 -0
- cudag-0.3.10.dist-info/METADATA +570 -0
- cudag-0.3.10.dist-info/RECORD +69 -0
- cudag-0.3.10.dist-info/WHEEL +4 -0
- cudag-0.3.10.dist-info/entry_points.txt +2 -0
- cudag-0.3.10.dist-info/licenses/LICENSE +66 -0
cudag/prompts/tools.py
ADDED
|
@@ -0,0 +1,382 @@
|
|
|
1
|
+
# Copyright (c) 2025 Tylt LLC. All rights reserved.
|
|
2
|
+
# CONFIDENTIAL AND PROPRIETARY. Unauthorized use, copying, or distribution
|
|
3
|
+
# is strictly prohibited. For licensing inquiries: hello@claimhawk.app
|
|
4
|
+
|
|
5
|
+
"""Computer use tool definition and tool_call formatting.
|
|
6
|
+
|
|
7
|
+
This module codifies the canonical tool_call format used in VLM training datasets.
|
|
8
|
+
All tool calls must use this format for consistency across generators.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
import json
|
|
14
|
+
import re
|
|
15
|
+
from dataclasses import dataclass, field
|
|
16
|
+
from typing import Any, Literal
|
|
17
|
+
|
|
18
|
+
# Closed set of action names understood by the computer_use tool.
TOOL_ACTIONS = Literal[
    "key",
    "type",
    "mouse_move",
    "left_click",
    "left_click_drag",
    "right_click",
    "middle_click",
    "double_click",
    "triple_click",
    "scroll",
    "hscroll",
    "wait",
    "terminate",
    "answer",
]

# Short help text per action. Insertion order matters: it fixes the order of
# the enum and of the bullet list embedded in COMPUTER_USE_TOOL below.
ACTION_DESCRIPTIONS = {
    "key": "Press keys in order, release in reverse.",
    "type": "Type a string of text.",
    "mouse_move": "Move the cursor to (x, y).",
    "left_click": "Left click at (x, y).",
    "left_click_drag": "Click and drag from current to (x, y).",
    "right_click": "Right click at (x, y).",
    "middle_click": "Middle click at (x, y).",
    "double_click": "Double-click at (x, y).",
    "triple_click": "Triple-click at (x, y) (simulated as double-click).",
    "scroll": "Scroll the mouse wheel.",
    "hscroll": "Horizontal scroll.",
    "wait": "Wait N seconds.",
    "terminate": "End the task with a status.",
    "answer": "Answer a question.",
}

# Actions that must carry a `coordinate` argument.
COORDINATE_ACTIONS = {
    "mouse_move",
    "left_click",
    "left_click_drag",
    "right_click",
    "middle_click",
    "double_click",
    "triple_click",
    "scroll",
    "hscroll",
}

# Arguments that must be present for a given action (checked by name).
ACTION_REQUIRED_PARAMS: dict[str, list[str]] = {
    "key": ["keys"],
    "type": ["text"],
    "scroll": ["coordinate", "pixels"],
    "hscroll": ["coordinate", "pixels"],
    "wait": ["time"],
    "terminate": ["status"],
}

# JSON-schema property definitions for the computer_use arguments. The
# `action` entry is generated from ACTION_DESCRIPTIONS so the two cannot drift.
_COMPUTER_USE_PROPERTIES: dict[str, Any] = {
    "action": {
        "description": "\n".join(
            [f"* `{name}`: {summary}" for name, summary in ACTION_DESCRIPTIONS.items()]
        ),
        "enum": list(ACTION_DESCRIPTIONS),
        "type": "string",
    },
    "keys": {
        "description": "Required only by `action=key`.",
        "type": "array",
    },
    "text": {
        "description": "Required only by `action=type`.",
        "type": "string",
    },
    "coordinate": {
        "description": "Mouse coordinates (1000x1000 normalized).",
        "type": "array",
    },
    "pixels": {
        "description": "The amount of scrolling.",
        "type": "number",
    },
    "time": {
        "description": "The seconds to wait.",
        "type": "number",
    },
    "status": {
        "description": "The status of the task.",
        "type": "string",
        "enum": ["success", "failure"],
    },
}

# Canonical computer_use tool definition (function-calling JSON schema).
COMPUTER_USE_TOOL: dict[str, Any] = {
    "type": "function",
    "function": {
        "name_for_human": "computer_use",
        "name": "computer_use",
        "description": "Perform computer actions",
        "parameters": {
            "properties": _COMPUTER_USE_PROPERTIES,
            "required": ["action"],
            "type": "object",
        },
        "args_format": "Format the arguments as a JSON object.",
    },
}
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
@dataclass
class BboxCall:
    """A ``get_bbox`` tool call used for element grounding.

    Carries the bounding box of a located element, plus an optional label,
    and converts to and from the ``{"name": ..., "arguments": ...}`` form.
    """

    # Box corners (x1, y1, x2, y2), expressed in RU (0-1000).
    bbox_2d: tuple[int, int, int, int]
    # Optional human-readable label of the element being located.
    label: str | None = None

    def to_dict(self) -> dict[str, Any]:
        """Return the JSON-serializable ``get_bbox`` payload.

        The label is emitted only when it is truthy, so both ``None`` and
        the empty string are left out of the arguments.
        """
        arguments: dict[str, Any] = {"bbox_2d": [*self.bbox_2d]}
        if self.label:
            arguments["label"] = self.label
        return dict(name="get_bbox", arguments=arguments)

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> BboxCall:
        """Build a BboxCall from its dictionary form.

        Args:
            data: Mapping with ``name`` and, optionally, ``arguments``.

        Returns:
            A BboxCall; a missing ``bbox_2d`` falls back to ``(0, 0, 0, 0)``.

        Raises:
            ValueError: If ``data["name"]`` is not ``"get_bbox"``.
        """
        tool_name = data.get("name")
        if tool_name != "get_bbox":
            raise ValueError(f"Expected get_bbox tool, got: {tool_name}")

        arguments = data.get("arguments", {})
        corners = arguments.get("bbox_2d", [0, 0, 0, 0])
        # The length of `corners` is not checked here; it is stored as given.
        return cls(
            bbox_2d=tuple(corners),  # type: ignore[arg-type]
            label=arguments.get("label"),
        )

    @classmethod
    def create(
        cls, bbox_2d: tuple[int, int, int, int], label: str | None = None
    ) -> BboxCall:
        """Convenience constructor for a ``get_bbox`` tool call.

        Args:
            bbox_2d: Bounding box [x1, y1, x2, y2] in RU units (0-1000)
            label: Optional human-readable label of the element (e.g., "Appts")

        Returns:
            BboxCall instance
        """
        return cls(bbox_2d, label)
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
@dataclass
class ToolCall:
    """A single ``computer_use`` tool call.

    Holds the action name together with whichever optional arguments that
    action uses, and converts to and from the ``{"name", "arguments"}``
    dictionary form. Arguments left as ``None`` are omitted on serialization.
    """

    action: str
    coordinate: tuple[int, int] | None = None
    pixels: int | None = None
    keys: list[str] | None = None
    text: str | None = None
    time: float | None = None
    status: str | None = None
    # Arguments outside the known set; merged last when serializing.
    extra: dict[str, Any] = field(default_factory=dict)

    def to_dict(self) -> dict[str, Any]:
        """Return the JSON-serializable ``computer_use`` payload.

        Known arguments are written in a fixed order and only when they are
        not ``None``. ``extra`` is applied afterwards, so an extra key that
        matches a known argument overrides it.
        """
        arguments: dict[str, Any] = {"action": self.action}
        for name in ("coordinate", "pixels", "keys", "text", "time", "status"):
            value = getattr(self, name)
            if value is None:
                continue
            # Coordinates are stored as a tuple but serialized as a list.
            arguments[name] = list(value) if name == "coordinate" else value
        arguments.update(self.extra)
        return {"name": "computer_use", "arguments": arguments}

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> ToolCall:
        """Build a ToolCall from its dictionary form.

        Args:
            data: Mapping with ``name`` and ``arguments``.

        Returns:
            A ToolCall; unrecognized arguments are collected in ``extra``.

        Raises:
            ValueError: If ``data["name"]`` is not ``"computer_use"``.
            KeyError: If the arguments have no ``action`` entry.
        """
        tool_name = data.get("name")
        if tool_name != "computer_use":
            raise ValueError(f"Expected computer_use tool, got: {tool_name}")

        arguments = data.get("arguments", {})
        point = arguments.get("coordinate")

        # Whatever remains after removing the known fields is kept as extra.
        leftovers = dict(arguments)
        for known in ("action", "coordinate", "pixels", "keys", "text", "time", "status"):
            leftovers.pop(known, None)

        return cls(
            action=arguments["action"],
            # A falsy coordinate (missing, None, or empty) is treated as absent.
            coordinate=tuple(point) if point else None,
            pixels=arguments.get("pixels"),
            keys=arguments.get("keys"),
            text=arguments.get("text"),
            time=arguments.get("time"),
            status=arguments.get("status"),
            extra=leftovers,
        )

    @classmethod
    def left_click(cls, coordinate: tuple[int, int]) -> ToolCall:
        """Return a ``left_click`` call at ``coordinate``."""
        return cls("left_click", coordinate)

    @classmethod
    def double_click(cls, coordinate: tuple[int, int]) -> ToolCall:
        """Return a ``double_click`` call at ``coordinate``."""
        return cls("double_click", coordinate)

    @classmethod
    def right_click(cls, coordinate: tuple[int, int]) -> ToolCall:
        """Return a ``right_click`` call at ``coordinate``."""
        return cls("right_click", coordinate)

    @classmethod
    def scroll(cls, coordinate: tuple[int, int], pixels: int) -> ToolCall:
        """Return a ``scroll`` call. Negative pixels = scroll up."""
        return cls("scroll", coordinate, pixels)

    @classmethod
    def key_press(cls, keys: list[str]) -> ToolCall:
        """Return a ``key`` call for the given key sequence."""
        return cls("key", keys=keys)

    @classmethod
    def type_text(cls, text: str) -> ToolCall:
        """Return a ``type`` call for the given text."""
        return cls("type", text=text)

    @classmethod
    def wait(cls, seconds: float) -> ToolCall:
        """Return a ``wait`` call lasting ``seconds``."""
        return cls("wait", time=seconds)

    @classmethod
    def terminate(cls, status: str = "success") -> ToolCall:
        """Return a ``terminate`` call with the given status."""
        return cls("terminate", status=status)
|
|
282
|
+
|
|
283
|
+
|
|
284
|
+
def format_tool_call(tool_call: ToolCall | BboxCall | dict[str, Any]) -> str:
    """Render a tool call as JSON wrapped in ``<tool_call>`` tags.

    Args:
        tool_call: A ToolCall or BboxCall (serialized through ``to_dict()``),
            or an already-built dict with {name, arguments} that is dumped
            as given.

    Returns:
        Three lines: the opening tag, the single-line JSON payload, and the
        closing tag, e.g.:
            <tool_call>
            {"name": "computer_use", "arguments": {...}}
            </tool_call>

        or for bounding box:
            <tool_call>
            {"name": "get_bbox", "arguments": {"bbox_2d": [...], "label": "..."}}
            </tool_call>
    """
    payload = (
        tool_call.to_dict()
        if isinstance(tool_call, (ToolCall, BboxCall))
        else tool_call
    )
    return "\n".join(("<tool_call>", json.dumps(payload), "</tool_call>"))
|
|
310
|
+
|
|
311
|
+
|
|
312
|
+
# Matches one <tool_call>...</tool_call> block and captures its JSON object in
# the named group "json". DOTALL lets the object span several lines;
# IGNORECASE also accepts differently-cased tags.
TOOL_CALL_PATTERN = re.compile(
    r"<tool_call>\s*(?P<json>\{.*?\})\s*</tool_call>",
    flags=re.IGNORECASE | re.DOTALL,
)
|
|
317
|
+
|
|
318
|
+
|
|
319
|
+
def parse_tool_call(text: str) -> ToolCall | None:
    """Parse the first computer_use tool call found in model output.

    Args:
        text: Model output containing <tool_call>...</tool_call>

    Returns:
        The parsed ToolCall, or None when no tool-call block is present, the
        captured payload is not valid JSON, or the payload is not a
        well-formed computer_use call.
    """
    match = TOOL_CALL_PATTERN.search(text)
    if match is None:
        return None

    try:
        return ToolCall.from_dict(json.loads(match.group("json")))
    except (json.JSONDecodeError, ValueError, KeyError, TypeError, AttributeError):
        # Model output is untrusted, so every malformed shape maps to None:
        #   ValueError     - invalid JSON, or a tool name other than computer_use
        #   KeyError       - arguments without an "action"
        #   AttributeError - "arguments" is not a JSON object (e.g. a string)
        #   TypeError      - "coordinate" is not iterable (e.g. a bare number)
        return None
|
|
337
|
+
|
|
338
|
+
|
|
339
|
+
def validate_tool_call(tool_call: ToolCall) -> list[str]:
    """Validate a tool call and return the list of problems found.

    Checks, in order: the action name, presence and range of the coordinate,
    the per-action required parameters, the scroll amount type, and the
    terminate status. A malformed coordinate (wrong number of elements or
    non-numeric values) is reported as an error rather than raised.

    Args:
        tool_call: ToolCall to validate

    Returns:
        List of error messages (empty if valid)
    """
    action = tool_call.action

    # Nothing else can be checked meaningfully for an unknown action.
    if action not in ACTION_DESCRIPTIONS:
        return [f"Invalid action: {action}"]

    errors: list[str] = []

    coordinate = tool_call.coordinate
    if coordinate is None:
        if action in COORDINATE_ACTIONS:
            errors.append(f"Action '{action}' requires coordinate")
    else:
        try:
            # Unpacking fails with ValueError on wrong arity and TypeError on
            # a non-iterable; comparing non-numbers with ints raises TypeError.
            x, y = coordinate
            in_range = 0 <= x <= 1000 and 0 <= y <= 1000
        except (TypeError, ValueError):
            errors.append(f"Invalid coordinate (expected two numbers): {coordinate!r}")
        else:
            if not in_range:
                errors.append(f"Coordinate out of range [0, 1000]: ({x}, {y})")

    # NOTE(review): scroll/hscroll list "coordinate" here and are also in
    # COORDINATE_ACTIONS, so a missing coordinate is reported twice. Kept
    # as-is so existing callers see the same messages.
    errors.extend(
        f"Action '{action}' requires '{param}'"
        for param in ACTION_REQUIRED_PARAMS.get(action, [])
        if getattr(tool_call, param, None) is None
    )

    # The scroll amount must be numeric when given.
    pixels = tool_call.pixels
    if action in ("scroll", "hscroll") and pixels is not None:
        if not isinstance(pixels, (int, float)):
            errors.append(f"Invalid pixels value: {pixels}")

    # The terminate status must be one of the two schema values when given.
    status = tool_call.status
    if action == "terminate" and status is not None:
        if status not in ("success", "failure"):
            errors.append(f"Invalid terminate status: {status}")

    return errors
|
cudag/py.typed
ADDED
|
File without changes
|
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
{
|
|
2
|
+
"$schema": "http://json-schema.org/draft-07/schema#",
|
|
3
|
+
"$id": "https://claimhawk.com/schemas/cudag/filesystem.json",
|
|
4
|
+
"title": "CUDAG Dataset Filesystem Structure",
|
|
5
|
+
"description": "Defines the required filesystem structure for CUDAG datasets",
|
|
6
|
+
"filesystem": {
|
|
7
|
+
"root": {
|
|
8
|
+
"description": "Dataset root directory (e.g., datasets/calendar--mike--20251203_123456)",
|
|
9
|
+
"required_files": [
|
|
10
|
+
{
|
|
11
|
+
"name": "config.json",
|
|
12
|
+
"description": "Generation configuration and metadata"
|
|
13
|
+
},
|
|
14
|
+
{
|
|
15
|
+
"name": "data.jsonl",
|
|
16
|
+
"description": "All training samples (JSONL format)"
|
|
17
|
+
},
|
|
18
|
+
{
|
|
19
|
+
"name": "train.jsonl",
|
|
20
|
+
"description": "Training split samples (JSONL format)"
|
|
21
|
+
},
|
|
22
|
+
{
|
|
23
|
+
"name": "val.jsonl",
|
|
24
|
+
"description": "Validation split samples (JSONL format)"
|
|
25
|
+
}
|
|
26
|
+
],
|
|
27
|
+
"optional_files": [
|
|
28
|
+
{
|
|
29
|
+
"name": "held_out.jsonl",
|
|
30
|
+
"description": "Held-out samples for evaluation (JSONL format)"
|
|
31
|
+
}
|
|
32
|
+
],
|
|
33
|
+
"required_directories": [
|
|
34
|
+
{
|
|
35
|
+
"name": "images",
|
|
36
|
+
"description": "Training images directory",
|
|
37
|
+
"contents": "*.jpg or *.png files referenced by train.jsonl/val.jsonl"
|
|
38
|
+
},
|
|
39
|
+
{
|
|
40
|
+
"name": "test",
|
|
41
|
+
"description": "Test cases directory"
|
|
42
|
+
}
|
|
43
|
+
]
|
|
44
|
+
},
|
|
45
|
+
"test": {
|
|
46
|
+
"description": "Test directory structure (dataset_root/test/)",
|
|
47
|
+
"required_files": [
|
|
48
|
+
{
|
|
49
|
+
"name": "test.json",
|
|
50
|
+
"description": "Test case definitions (JSON array)"
|
|
51
|
+
}
|
|
52
|
+
],
|
|
53
|
+
"required_directories": [
|
|
54
|
+
{
|
|
55
|
+
"name": "images",
|
|
56
|
+
"description": "Test screenshot images",
|
|
57
|
+
"contents": "*.png files referenced by test.json"
|
|
58
|
+
}
|
|
59
|
+
],
|
|
60
|
+
"optional_directories": [
|
|
61
|
+
{
|
|
62
|
+
"name": "annotated",
|
|
63
|
+
"description": "Annotated test images with crosshairs and tool call output",
|
|
64
|
+
"contents": "*_annotated.png files"
|
|
65
|
+
}
|
|
66
|
+
]
|
|
67
|
+
}
|
|
68
|
+
},
|
|
69
|
+
"naming_conventions": {
|
|
70
|
+
"dataset_directory": {
|
|
71
|
+
"pattern": "{prefix}--{user}--{timestamp}",
|
|
72
|
+
"example": "calendar--mike--20251203_123456"
|
|
73
|
+
},
|
|
74
|
+
"training_images": {
|
|
75
|
+
"pattern": "{dataset_name}_{index:05d}.{ext}",
|
|
76
|
+
"example": "calendar_00001.jpg"
|
|
77
|
+
},
|
|
78
|
+
"test_images": {
|
|
79
|
+
"pattern": "{prefix}_{index:05d}.png",
|
|
80
|
+
"example": "test_00001.png"
|
|
81
|
+
}
|
|
82
|
+
},
|
|
83
|
+
"constraints": [
|
|
84
|
+
"All image paths in JSONL/JSON files must be relative to their parent directory",
|
|
85
|
+
"Training images are relative to dataset root (e.g., 'images/sample.jpg')",
|
|
86
|
+
"Test screenshots are relative to test directory (e.g., 'images/test.png')",
|
|
87
|
+
"Coordinates in training data use RU (Resolution Units) normalized to [0, 1000]",
|
|
88
|
+
"Test tolerance values are in RU units"
|
|
89
|
+
]
|
|
90
|
+
}
|
|
@@ -0,0 +1,113 @@
|
|
|
1
|
+
{
|
|
2
|
+
"$schema": "http://json-schema.org/draft-07/schema#",
|
|
3
|
+
"$id": "https://claimhawk.com/schemas/cudag/test_record.schema.json",
|
|
4
|
+
"title": "CUDAG Test Record",
|
|
5
|
+
"description": "Schema for test case records in test/test.json",
|
|
6
|
+
"type": "object",
|
|
7
|
+
"required": ["test_id", "screenshot", "prompt", "expected_action", "tolerance", "metadata"],
|
|
8
|
+
"properties": {
|
|
9
|
+
"test_id": {
|
|
10
|
+
"type": "string",
|
|
11
|
+
"description": "Unique identifier for this test case",
|
|
12
|
+
"pattern": "^[a-zA-Z0-9_-]+$"
|
|
13
|
+
},
|
|
14
|
+
"screenshot": {
|
|
15
|
+
"type": "string",
|
|
16
|
+
"description": "Relative path to screenshot from test directory (e.g., 'images/test_00001.png')",
|
|
17
|
+
"pattern": "^images/[^/]+\\.(jpg|jpeg|png)$"
|
|
18
|
+
},
|
|
19
|
+
"prompt": {
|
|
20
|
+
"type": "string",
|
|
21
|
+
"description": "Human instruction prompt",
|
|
22
|
+
"minLength": 1
|
|
23
|
+
},
|
|
24
|
+
"expected_action": {
|
|
25
|
+
"type": "object",
|
|
26
|
+
"required": ["name", "arguments"],
|
|
27
|
+
"properties": {
|
|
28
|
+
"name": {
|
|
29
|
+
"type": "string",
|
|
30
|
+
"description": "Tool name",
|
|
31
|
+
"const": "computer_use"
|
|
32
|
+
},
|
|
33
|
+
"arguments": {
|
|
34
|
+
"type": "object",
|
|
35
|
+
"required": ["action"],
|
|
36
|
+
"properties": {
|
|
37
|
+
"action": {
|
|
38
|
+
"type": "string",
|
|
39
|
+
"description": "Action type",
|
|
40
|
+
"enum": ["left_click", "right_click", "double_click", "scroll", "type", "key"]
|
|
41
|
+
},
|
|
42
|
+
"coordinate": {
|
|
43
|
+
"type": "array",
|
|
44
|
+
"description": "Click coordinates [x, y] in RU units (0-1000)",
|
|
45
|
+
"items": {
|
|
46
|
+
"type": "integer",
|
|
47
|
+
"minimum": 0,
|
|
48
|
+
"maximum": 1000
|
|
49
|
+
},
|
|
50
|
+
"minItems": 2,
|
|
51
|
+
"maxItems": 2
|
|
52
|
+
},
|
|
53
|
+
"text": {
|
|
54
|
+
"type": "string",
|
|
55
|
+
"description": "Text to type (for type action)"
|
|
56
|
+
},
|
|
57
|
+
"direction": {
|
|
58
|
+
"type": "string",
|
|
59
|
+
"description": "Scroll direction",
|
|
60
|
+
"enum": ["up", "down", "left", "right"]
|
|
61
|
+
},
|
|
62
|
+
"amount": {
|
|
63
|
+
"type": "integer",
|
|
64
|
+
"description": "Scroll amount in pixels"
|
|
65
|
+
}
|
|
66
|
+
},
|
|
67
|
+
"additionalProperties": false
|
|
68
|
+
}
|
|
69
|
+
},
|
|
70
|
+
"additionalProperties": false
|
|
71
|
+
},
|
|
72
|
+
"tolerance": {
|
|
73
|
+
"type": "array",
|
|
74
|
+
"description": "Allowed coordinate tolerance [tol_x, tol_y] in RU units",
|
|
75
|
+
"items": {
|
|
76
|
+
"type": "integer",
|
|
77
|
+
"minimum": 0
|
|
78
|
+
},
|
|
79
|
+
"minItems": 2,
|
|
80
|
+
"maxItems": 2
|
|
81
|
+
},
|
|
82
|
+
"metadata": {
|
|
83
|
+
"type": "object",
|
|
84
|
+
"required": ["task_type"],
|
|
85
|
+
"properties": {
|
|
86
|
+
"task_type": {
|
|
87
|
+
"type": "string",
|
|
88
|
+
"description": "Task type identifier"
|
|
89
|
+
},
|
|
90
|
+
"real_coords": {
|
|
91
|
+
"type": ["array", "null"],
|
|
92
|
+
"description": "Original pixel coordinates [x, y]",
|
|
93
|
+
"items": {
|
|
94
|
+
"type": "integer"
|
|
95
|
+
},
|
|
96
|
+
"minItems": 2,
|
|
97
|
+
"maxItems": 2
|
|
98
|
+
},
|
|
99
|
+
"image_size": {
|
|
100
|
+
"type": "array",
|
|
101
|
+
"description": "Image dimensions [width, height]",
|
|
102
|
+
"items": {
|
|
103
|
+
"type": "integer"
|
|
104
|
+
},
|
|
105
|
+
"minItems": 2,
|
|
106
|
+
"maxItems": 2
|
|
107
|
+
}
|
|
108
|
+
},
|
|
109
|
+
"additionalProperties": true
|
|
110
|
+
}
|
|
111
|
+
},
|
|
112
|
+
"additionalProperties": false
|
|
113
|
+
}
|
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
{
|
|
2
|
+
"$schema": "http://json-schema.org/draft-07/schema#",
|
|
3
|
+
"$id": "https://claimhawk.com/schemas/cudag/train_record.schema.json",
|
|
4
|
+
"title": "CUDAG Training Record",
|
|
5
|
+
"description": "Schema for training/validation records in data.jsonl, train.jsonl, val.jsonl",
|
|
6
|
+
"type": "object",
|
|
7
|
+
"required": ["id", "image", "conversations", "metadata"],
|
|
8
|
+
"properties": {
|
|
9
|
+
"id": {
|
|
10
|
+
"type": "string",
|
|
11
|
+
"description": "Unique identifier for this sample",
|
|
12
|
+
"pattern": "^[a-zA-Z0-9_-]+$"
|
|
13
|
+
},
|
|
14
|
+
"image": {
|
|
15
|
+
"type": "string",
|
|
16
|
+
"description": "Relative path to image file from dataset root (e.g., 'images/sample_00001.jpg')",
|
|
17
|
+
"pattern": "^images/[^/]+\\.(jpg|jpeg|png)$"
|
|
18
|
+
},
|
|
19
|
+
"conversations": {
|
|
20
|
+
"type": "array",
|
|
21
|
+
"description": "Human-GPT conversation turns",
|
|
22
|
+
"minItems": 2,
|
|
23
|
+
"maxItems": 2,
|
|
24
|
+
"items": [
|
|
25
|
+
{
|
|
26
|
+
"type": "object",
|
|
27
|
+
"required": ["from", "value"],
|
|
28
|
+
"properties": {
|
|
29
|
+
"from": {
|
|
30
|
+
"type": "string",
|
|
31
|
+
"const": "human"
|
|
32
|
+
},
|
|
33
|
+
"value": {
|
|
34
|
+
"type": "string",
|
|
35
|
+
"description": "Human prompt, must start with <image>\\n",
|
|
36
|
+
"pattern": "^<image>\\n.+"
|
|
37
|
+
}
|
|
38
|
+
},
|
|
39
|
+
"additionalProperties": false
|
|
40
|
+
},
|
|
41
|
+
{
|
|
42
|
+
"type": "object",
|
|
43
|
+
"required": ["from", "value"],
|
|
44
|
+
"properties": {
|
|
45
|
+
"from": {
|
|
46
|
+
"type": "string",
|
|
47
|
+
"const": "gpt"
|
|
48
|
+
},
|
|
49
|
+
"value": {
|
|
50
|
+
"type": "string",
|
|
51
|
+
"description": "GPT response in <tool_call> format",
|
|
52
|
+
"pattern": "^<tool_call>"
|
|
53
|
+
}
|
|
54
|
+
},
|
|
55
|
+
"additionalProperties": false
|
|
56
|
+
}
|
|
57
|
+
]
|
|
58
|
+
},
|
|
59
|
+
"metadata": {
|
|
60
|
+
"type": "object",
|
|
61
|
+
"required": ["task_type", "real_coords"],
|
|
62
|
+
"properties": {
|
|
63
|
+
"task_type": {
|
|
64
|
+
"type": "string",
|
|
65
|
+
"description": "Task type identifier (e.g., 'click-day', 'click-appointment')"
|
|
66
|
+
},
|
|
67
|
+
"real_coords": {
|
|
68
|
+
"type": "array",
|
|
69
|
+
"description": "Original pixel coordinates [x, y]",
|
|
70
|
+
"items": {
|
|
71
|
+
"type": "integer"
|
|
72
|
+
},
|
|
73
|
+
"minItems": 2,
|
|
74
|
+
"maxItems": 2
|
|
75
|
+
},
|
|
76
|
+
"tolerance": {
|
|
77
|
+
"type": "array",
|
|
78
|
+
"description": "Coordinate tolerance [tol_x, tol_y] in RU units",
|
|
79
|
+
"items": {
|
|
80
|
+
"type": "integer"
|
|
81
|
+
},
|
|
82
|
+
"minItems": 2,
|
|
83
|
+
"maxItems": 2
|
|
84
|
+
}
|
|
85
|
+
},
|
|
86
|
+
"additionalProperties": true
|
|
87
|
+
}
|
|
88
|
+
},
|
|
89
|
+
"additionalProperties": false
|
|
90
|
+
}
|