opencode-skills-antigravity 1.0.39 → 1.0.41

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (91)
  1. package/bundled-skills/.antigravity-install-manifest.json +10 -1
  2. package/bundled-skills/docs/integrations/jetski-cortex.md +3 -3
  3. package/bundled-skills/docs/integrations/jetski-gemini-loader/README.md +1 -1
  4. package/bundled-skills/docs/maintainers/repo-growth-seo.md +3 -3
  5. package/bundled-skills/docs/maintainers/security-findings-triage-2026-03-29-refresh.csv +34 -0
  6. package/bundled-skills/docs/maintainers/security-findings-triage-2026-03-29-refresh.md +2 -0
  7. package/bundled-skills/docs/maintainers/skills-update-guide.md +1 -1
  8. package/bundled-skills/docs/sources/sources.md +2 -2
  9. package/bundled-skills/docs/users/bundles.md +1 -1
  10. package/bundled-skills/docs/users/claude-code-skills.md +1 -1
  11. package/bundled-skills/docs/users/gemini-cli-skills.md +1 -1
  12. package/bundled-skills/docs/users/getting-started.md +1 -1
  13. package/bundled-skills/docs/users/kiro-integration.md +1 -1
  14. package/bundled-skills/docs/users/usage.md +4 -4
  15. package/bundled-skills/docs/users/visual-guide.md +4 -4
  16. package/bundled-skills/hugging-face-cli/SKILL.md +192 -195
  17. package/bundled-skills/hugging-face-community-evals/SKILL.md +213 -0
  18. package/bundled-skills/hugging-face-community-evals/examples/.env.example +3 -0
  19. package/bundled-skills/hugging-face-community-evals/examples/USAGE_EXAMPLES.md +101 -0
  20. package/bundled-skills/hugging-face-community-evals/scripts/inspect_eval_uv.py +104 -0
  21. package/bundled-skills/hugging-face-community-evals/scripts/inspect_vllm_uv.py +306 -0
  22. package/bundled-skills/hugging-face-community-evals/scripts/lighteval_vllm_uv.py +297 -0
  23. package/bundled-skills/hugging-face-dataset-viewer/SKILL.md +120 -120
  24. package/bundled-skills/hugging-face-gradio/SKILL.md +304 -0
  25. package/bundled-skills/hugging-face-gradio/examples.md +613 -0
  26. package/bundled-skills/hugging-face-jobs/SKILL.md +25 -18
  27. package/bundled-skills/hugging-face-jobs/index.html +216 -0
  28. package/bundled-skills/hugging-face-jobs/references/hardware_guide.md +336 -0
  29. package/bundled-skills/hugging-face-jobs/references/hub_saving.md +352 -0
  30. package/bundled-skills/hugging-face-jobs/references/token_usage.md +570 -0
  31. package/bundled-skills/hugging-face-jobs/references/troubleshooting.md +475 -0
  32. package/bundled-skills/hugging-face-jobs/scripts/cot-self-instruct.py +718 -0
  33. package/bundled-skills/hugging-face-jobs/scripts/finepdfs-stats.py +546 -0
  34. package/bundled-skills/hugging-face-jobs/scripts/generate-responses.py +587 -0
  35. package/bundled-skills/hugging-face-model-trainer/SKILL.md +11 -12
  36. package/bundled-skills/hugging-face-model-trainer/references/gguf_conversion.md +296 -0
  37. package/bundled-skills/hugging-face-model-trainer/references/hardware_guide.md +283 -0
  38. package/bundled-skills/hugging-face-model-trainer/references/hub_saving.md +364 -0
  39. package/bundled-skills/hugging-face-model-trainer/references/local_training_macos.md +231 -0
  40. package/bundled-skills/hugging-face-model-trainer/references/reliability_principles.md +371 -0
  41. package/bundled-skills/hugging-face-model-trainer/references/trackio_guide.md +189 -0
  42. package/bundled-skills/hugging-face-model-trainer/references/training_methods.md +150 -0
  43. package/bundled-skills/hugging-face-model-trainer/references/training_patterns.md +203 -0
  44. package/bundled-skills/hugging-face-model-trainer/references/troubleshooting.md +282 -0
  45. package/bundled-skills/hugging-face-model-trainer/references/unsloth.md +313 -0
  46. package/bundled-skills/hugging-face-model-trainer/scripts/convert_to_gguf.py +424 -0
  47. package/bundled-skills/hugging-face-model-trainer/scripts/dataset_inspector.py +417 -0
  48. package/bundled-skills/hugging-face-model-trainer/scripts/estimate_cost.py +150 -0
  49. package/bundled-skills/hugging-face-model-trainer/scripts/train_dpo_example.py +106 -0
  50. package/bundled-skills/hugging-face-model-trainer/scripts/train_grpo_example.py +89 -0
  51. package/bundled-skills/hugging-face-model-trainer/scripts/train_sft_example.py +122 -0
  52. package/bundled-skills/hugging-face-model-trainer/scripts/unsloth_sft_example.py +512 -0
  53. package/bundled-skills/hugging-face-paper-publisher/SKILL.md +11 -4
  54. package/bundled-skills/hugging-face-paper-publisher/examples/example_usage.md +326 -0
  55. package/bundled-skills/hugging-face-paper-publisher/references/quick_reference.md +216 -0
  56. package/bundled-skills/hugging-face-paper-publisher/scripts/paper_manager.py +606 -0
  57. package/bundled-skills/hugging-face-paper-publisher/templates/arxiv.md +299 -0
  58. package/bundled-skills/hugging-face-paper-publisher/templates/ml-report.md +358 -0
  59. package/bundled-skills/hugging-face-paper-publisher/templates/modern.md +319 -0
  60. package/bundled-skills/hugging-face-paper-publisher/templates/standard.md +201 -0
  61. package/bundled-skills/hugging-face-papers/SKILL.md +241 -0
  62. package/bundled-skills/hugging-face-trackio/.claude-plugin/plugin.json +19 -0
  63. package/bundled-skills/hugging-face-trackio/SKILL.md +117 -0
  64. package/bundled-skills/hugging-face-trackio/references/alerts.md +196 -0
  65. package/bundled-skills/hugging-face-trackio/references/logging_metrics.md +206 -0
  66. package/bundled-skills/hugging-face-trackio/references/retrieving_metrics.md +251 -0
  67. package/bundled-skills/hugging-face-vision-trainer/SKILL.md +595 -0
  68. package/bundled-skills/hugging-face-vision-trainer/references/finetune_sam2_trainer.md +254 -0
  69. package/bundled-skills/hugging-face-vision-trainer/references/hub_saving.md +618 -0
  70. package/bundled-skills/hugging-face-vision-trainer/references/image_classification_training_notebook.md +279 -0
  71. package/bundled-skills/hugging-face-vision-trainer/references/object_detection_training_notebook.md +700 -0
  72. package/bundled-skills/hugging-face-vision-trainer/references/reliability_principles.md +310 -0
  73. package/bundled-skills/hugging-face-vision-trainer/references/timm_trainer.md +91 -0
  74. package/bundled-skills/hugging-face-vision-trainer/scripts/dataset_inspector.py +814 -0
  75. package/bundled-skills/hugging-face-vision-trainer/scripts/estimate_cost.py +217 -0
  76. package/bundled-skills/hugging-face-vision-trainer/scripts/image_classification_training.py +383 -0
  77. package/bundled-skills/hugging-face-vision-trainer/scripts/object_detection_training.py +710 -0
  78. package/bundled-skills/hugging-face-vision-trainer/scripts/sam_segmentation_training.py +382 -0
  79. package/bundled-skills/jq/SKILL.md +273 -0
  80. package/bundled-skills/odoo-edi-connector/SKILL.md +32 -10
  81. package/bundled-skills/odoo-woocommerce-bridge/SKILL.md +9 -5
  82. package/bundled-skills/tmux/SKILL.md +370 -0
  83. package/bundled-skills/transformers-js/SKILL.md +639 -0
  84. package/bundled-skills/transformers-js/references/CACHE.md +339 -0
  85. package/bundled-skills/transformers-js/references/CONFIGURATION.md +390 -0
  86. package/bundled-skills/transformers-js/references/EXAMPLES.md +605 -0
  87. package/bundled-skills/transformers-js/references/MODEL_ARCHITECTURES.md +167 -0
  88. package/bundled-skills/transformers-js/references/PIPELINE_OPTIONS.md +545 -0
  89. package/bundled-skills/transformers-js/references/TEXT_GENERATION.md +315 -0
  90. package/bundled-skills/viboscope/SKILL.md +64 -0
  91. package/package.json +1 -1
@@ -0,0 +1,814 @@
#!/usr/bin/env python3
# /// script
# requires-python = ">=3.10"
# dependencies = []
# ///
"""
Dataset Format Inspector for Vision Model Training

Inspects Hugging Face datasets to determine compatibility with object detection
and image classification training.

Uses the HF Datasets Server API, so inspection completes in under ~2 seconds
and no dataset download is needed.

Usage with HF Jobs:
    hf_jobs("uv", {
        "script": "path/to/dataset_inspector.py",
        "script_args": ["--dataset", "your/dataset", "--split", "train"]
    })
"""

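# The script is also runnable locally. A minimal sketch, assuming `uv` is
# installed (it reads the inline script metadata above); the dataset name is
# hypothetical:
#
#   uv run dataset_inspector.py --dataset username/my-dataset --split train
#   uv run dataset_inspector.py --dataset username/my-dataset --json-output
#
# --json-output emits the same compatibility report as machine-readable JSON.
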
import argparse
import math
import sys
import json
import urllib.error
import urllib.request
import urllib.parse
from collections import Counter
from typing import List, Dict, Any, Tuple


def parse_args():
    parser = argparse.ArgumentParser(description="Inspect dataset format for vision model training")
    parser.add_argument("--dataset", type=str, required=True, help="Dataset name")
    parser.add_argument("--split", type=str, default="train", help="Dataset split (default: train)")
    parser.add_argument("--config", type=str, default="default", help="Dataset config name (default: default)")
    parser.add_argument("--preview", type=int, default=150, help="Max chars per field preview")
    parser.add_argument("--samples", type=int, default=5, help="Number of samples to fetch (default: 5)")
    parser.add_argument("--json-output", action="store_true", help="Output as JSON")
    return parser.parse_args()


def api_request(url: str) -> Dict | None:
    """Make API request to Datasets Server"""
    try:
        with urllib.request.urlopen(url, timeout=10) as response:
            return json.loads(response.read().decode())
    except urllib.error.HTTPError as e:
        if e.code == 404:
            return None
        raise Exception(f"API request failed: {e.code} {e.reason}")
    except Exception as e:
        raise Exception(f"API request failed: {str(e)}")


def get_splits(dataset: str) -> Dict | None:
    """Get available splits for dataset"""
    url = f"https://datasets-server.huggingface.co/splits?dataset={urllib.parse.quote(dataset)}"
    return api_request(url)


def get_rows(dataset: str, config: str, split: str, offset: int = 0, length: int = 5) -> Dict | None:
    """Get rows from dataset"""
    url = f"https://datasets-server.huggingface.co/rows?dataset={urllib.parse.quote(dataset)}&config={config}&split={split}&offset={offset}&length={length}"
    return api_request(url)


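# The inspection below leans on the shape of the /rows response. A sketch of
# just the fields this script consumes (values hypothetical):
#
#   {
#       "features": [{"name": "image", "type": {...}}, ...],
#       "rows": [{"row": {"image": {...}, "label": 0, ...}}, ...]
#   }
#
# Each sample is read as rows[i]["row"]; "features" feeds ClassLabel detection.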
def find_columns(columns: List[str], patterns: List[str]) -> List[str]:
    """Find columns matching patterns"""
    return [c for c in columns if any(p in c.lower() for p in patterns)]


def detect_bbox_format(bbox: List[float], image_size: Tuple[int, int] | None = None) -> str:
    """
    Detect bounding box format based on values and optionally image dimensions.

    Common formats:
    - [x_min, y_min, x_max, y_max] - XYXY (Pascal VOC)
    - [x_min, y_min, width, height] - XYWH (COCO)
    - [x_center, y_center, width, height] - CXCYWH (YOLO normalized)
    """
    if len(bbox) != 4:
        return "unknown (not 4 values)"

    a, b, c, d = bbox

    is_normalized = all(0 <= v <= 1 for v in bbox)

    if c < a or d < b:
        if is_normalized:
            return "xywh_normalized"
        return "xywh (COCO style)"

    # c > a and d > b — ambiguous between xyxy and xywh.
    # Use image dimensions to disambiguate when available.
    if image_size is not None:
        img_w, img_h = image_size
        # If interpreting as xywh, right edge = a + c; if that overshoots the
        # image while c alone fits, the format is more likely xyxy.
        xywh_exceeds = (a + c > img_w * 1.05) or (b + d > img_h * 1.05)
        xyxy_exceeds = (c > img_w * 1.05) or (d > img_h * 1.05)
        if xywh_exceeds and not xyxy_exceeds:
            return "xyxy (Pascal VOC style)"
        if xyxy_exceeds and not xywh_exceeds:
            return "xywh (COCO style)"

    if is_normalized:
        return "xyxy_normalized"
    return "xyxy (Pascal VOC style)"


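# Worked example for detect_bbox_format() above (hypothetical values): with
# image_size=(450, 400), the box [100, 150, 400, 300] read as xywh puts its
# right edge at 100 + 400 = 500 > 450 * 1.05, while read as xyxy both 400 and
# 300 fit inside the image, so the function returns "xyxy (Pascal VOC style)".
# Without image_size the same box is ambiguous and falls through to the
# default xyxy guess.

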
def _extract_image_size(row: Dict) -> Tuple[int, int] | None:
    """Try to extract (width, height) from the image column returned by Datasets Server."""
    for col in ("image", "img", "picture", "photo"):
        img = row.get(col)
        if isinstance(img, dict):
            w = img.get("width")
            h = img.get("height")
            if isinstance(w, (int, float)) and isinstance(h, (int, float)):
                return (int(w), int(h))
    return None


def analyze_annotations(sample_rows: List[Dict], annotation_cols: List[str]) -> Dict[str, Any]:
    """Analyze annotation structure from sample rows"""
    if not annotation_cols:
        return {"found": False}

    annotation_col = annotation_cols[0]
    annotations_info = {
        "found": True,
        "column": annotation_col,
        "sample_structures": [],
        "bbox_formats": [],
        "categories_found": [],
        "avg_objects_per_image": 0,
        "max_objects": 0,
        "min_objects": float('inf'),
    }

    total_objects = 0
    valid_samples = 0

    for row in sample_rows:
        ann = row["row"].get(annotation_col)
        if not ann:
            continue

        valid_samples += 1
        image_size = _extract_image_size(row["row"])

        # Check if it's a list of annotations or a dict
        if isinstance(ann, dict):
            # COCO-style or structured annotation
            sample_structure = {
                "type": "dict",
                "keys": list(ann.keys())
            }

            # Check for bounding boxes
            if "bbox" in ann or "bboxes" in ann:
                bbox_key = "bbox" if "bbox" in ann else "bboxes"
                bboxes = ann[bbox_key]
                if isinstance(bboxes, list) and len(bboxes) > 0:
                    if isinstance(bboxes[0], list):
                        # Multiple bboxes
                        num_objects = len(bboxes)
                        total_objects += num_objects
                        annotations_info["max_objects"] = max(annotations_info["max_objects"], num_objects)
                        annotations_info["min_objects"] = min(annotations_info["min_objects"], num_objects)

                        # Analyze first bbox format
                        bbox_format = detect_bbox_format(bboxes[0], image_size)
                        annotations_info["bbox_formats"].append(bbox_format)
                    else:
                        # Single bbox
                        total_objects += 1
                        annotations_info["max_objects"] = max(annotations_info["max_objects"], 1)
                        annotations_info["min_objects"] = min(annotations_info["min_objects"], 1)
                        bbox_format = detect_bbox_format(bboxes, image_size)
                        annotations_info["bbox_formats"].append(bbox_format)

            # Check for categories/classes
            for key in ["category", "categories", "label", "labels", "class", "classes", "category_id"]:
                if key in ann:
                    cats = ann[key]
                    if isinstance(cats, list):
                        annotations_info["categories_found"].extend([str(c) for c in cats])
                    else:
                        annotations_info["categories_found"].append(str(cats))

            annotations_info["sample_structures"].append(sample_structure)

        elif isinstance(ann, list):
            # List of annotation dicts
            sample_structure = {
                "type": "list",
                "length": len(ann),
                "item_type": type(ann[0]).__name__ if ann else None
            }

            if ann and isinstance(ann[0], dict):
                sample_structure["item_keys"] = list(ann[0].keys())

                # Count objects
                num_objects = len(ann)
                total_objects += num_objects
                annotations_info["max_objects"] = max(annotations_info["max_objects"], num_objects)
                annotations_info["min_objects"] = min(annotations_info["min_objects"], num_objects)

                # Check first annotation
                first_ann = ann[0]
                if "bbox" in first_ann:
                    bbox_format = detect_bbox_format(first_ann["bbox"], image_size)
                    annotations_info["bbox_formats"].append(bbox_format)

                # Check for categories
                for key in ["category", "label", "class", "category_id"]:
                    if key in first_ann:
                        for item in ann:
                            if key in item:
                                annotations_info["categories_found"].append(str(item[key]))

            annotations_info["sample_structures"].append(sample_structure)

    if valid_samples > 0:
        annotations_info["avg_objects_per_image"] = round(total_objects / valid_samples, 2)

    if annotations_info["min_objects"] == float('inf'):
        annotations_info["min_objects"] = 0

    # Get unique categories
    annotations_info["categories_found"] = list(set(annotations_info["categories_found"]))
    annotations_info["num_classes"] = len(annotations_info["categories_found"])

    # Get most common bbox format
    if annotations_info["bbox_formats"]:
        format_counts = Counter(annotations_info["bbox_formats"])
        annotations_info["primary_bbox_format"] = format_counts.most_common(1)[0][0]

    return annotations_info


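# The two branches above cover the common annotation encodings. Sketches of
# each (values hypothetical):
#
#   {"bbox": [[x, y, w, h], ...], "category": [2, 0, ...]}      # dict branch
#   [{"bbox": [x, y, w, h], "category_id": 2}, ...]             # list branch

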
def check_image_classification_compatibility(columns: List[str], sample_rows: List[Dict], features: List[Dict]) -> Dict[str, Any]:
    """Check image classification dataset compatibility"""

    image_cols = find_columns(columns, ["image", "img", "picture", "photo"])
    has_image = len(image_cols) > 0

    label_cols = find_columns(columns, ["label", "labels", "class", "fine_label", "coarse_label"])
    has_label = len(label_cols) > 0

    label_info: Dict[str, Any] = {"found": has_label}

    if has_label:
        label_col = label_cols[0]
        label_info["column"] = label_col

        # Detect whether label is ClassLabel (int with names) or plain int/string
        for f in features:
            if f.get("name") == label_col:
                ftype = f.get("type", "")
                if isinstance(ftype, dict) and ftype.get("_type") == "ClassLabel":
                    label_info["type"] = "ClassLabel"
                    names = ftype.get("names", [])
                    label_info["num_classes"] = len(names)
                    label_info["class_names"] = names[:20]
                    if len(names) > 20:
                        label_info["class_names_truncated"] = True
                elif isinstance(ftype, dict) and ftype.get("dtype") in ("int64", "int32", "int8"):
                    label_info["type"] = "int"
                elif isinstance(ftype, dict) and ftype.get("dtype") == "string":
                    label_info["type"] = "string"
                break

        # Discover unique labels from samples if ClassLabel info wasn't in features
        if "num_classes" not in label_info:
            unique = set()
            for row in sample_rows:
                val = row["row"].get(label_col)
                if val is not None:
                    unique.add(val)
            label_info["sample_unique_labels"] = sorted(unique, key=str)[:20]
            label_info["sample_unique_count"] = len(unique)

    ready = has_image and has_label
    return {
        "ready": ready,
        "has_image": has_image,
        "image_columns": image_cols,
        "has_label": has_label,
        "label_columns": label_cols,
        "label_info": label_info,
    }


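# ClassLabel detection above matches the feature encoding returned by the
# Datasets Server, roughly (names hypothetical):
#
#   {"name": "label", "type": {"names": ["cat", "dog"], "_type": "ClassLabel"}}
#
# For plain int or string labels, unique values are instead sampled from the
# fetched rows.

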
def check_object_detection_compatibility(columns: List[str], sample_rows: List[Dict]) -> Dict[str, Any]:
    """Check object detection dataset compatibility"""

    # Find image column
    image_cols = find_columns(columns, ["image", "img", "picture", "photo"])
    has_image = len(image_cols) > 0

    # Find annotation columns
    annotation_cols = find_columns(columns, ["objects", "annotations", "ann", "bbox", "bboxes", "detection"])
    has_annotations = len(annotation_cols) > 0

    # Analyze annotations
    annotations_info = analyze_annotations(sample_rows, annotation_cols) if has_annotations else {"found": False}

    # Check for separate bbox and category columns
    bbox_cols = find_columns(columns, ["bbox", "bboxes", "boxes"])
    category_cols = find_columns(columns, ["category", "label", "class", "categories", "labels", "classes"])

    # Determine readiness
    ready = has_image and (has_annotations or (len(bbox_cols) > 0 and len(category_cols) > 0))

    return {
        "ready": ready,
        "has_image": has_image,
        "image_columns": image_cols,
        "has_annotations": has_annotations,
        "annotation_columns": annotation_cols,
        "separate_bbox_columns": bbox_cols,
        "separate_category_columns": category_cols,
        "annotations_info": annotations_info,
    }


def check_sam_segmentation_compatibility(columns: List[str], sample_rows: List[Dict], features: List[Dict]) -> Dict[str, Any]:
    """Check SAM/SAM2 segmentation dataset compatibility.

    A valid SAM segmentation dataset needs:
    - An image column
    - A mask column (binary ground-truth segmentation mask)
    - A prompt: either a bbox prompt or point prompt (in a JSON prompt column, or dedicated columns)
    """

    image_cols = find_columns(columns, ["image", "img", "picture", "photo"])
    has_image = len(image_cols) > 0

    mask_cols = find_columns(columns, ["mask", "segmentation", "alpha", "matte"])
    has_mask = len(mask_cols) > 0

    prompt_cols = find_columns(columns, ["prompt"])
    bbox_cols = [c for c in columns if c in ("bbox", "bboxes", "box", "boxes")]
    point_cols = [c for c in columns if c in ("point", "points", "input_point", "input_points")]

    prompt_info: Dict[str, Any] = {
        "has_prompt": False,
        "prompt_type": None,
        "source": None,
        "bbox_valid": None,
    }

    # Try JSON prompt column first
    if prompt_cols:
        for row in sample_rows:
            raw = row["row"].get(prompt_cols[0])
            if raw is None:
                continue
            parsed = raw if isinstance(raw, dict) else _try_json(raw)
            if parsed is None:
                continue

            if isinstance(parsed, dict):
                if "bbox" in parsed or "box" in parsed:
                    prompt_info["has_prompt"] = True
                    prompt_info["prompt_type"] = "bbox"
                    prompt_info["source"] = f"JSON column '{prompt_cols[0]}'"
                    bbox = parsed.get("bbox") or parsed.get("box")
                    prompt_info["bbox_valid"] = _validate_bbox(bbox, _extract_image_size(row["row"]))
                    break
                elif "point" in parsed or "points" in parsed:
                    prompt_info["has_prompt"] = True
                    prompt_info["prompt_type"] = "point"
                    prompt_info["source"] = f"JSON column '{prompt_cols[0]}'"
                    break

    if not prompt_info["has_prompt"] and bbox_cols:
        prompt_info["has_prompt"] = True
        prompt_info["prompt_type"] = "bbox"
        prompt_info["source"] = f"column '{bbox_cols[0]}'"
        for row in sample_rows:
            bbox = row["row"].get(bbox_cols[0])
            if bbox is not None:
                prompt_info["bbox_valid"] = _validate_bbox(bbox, _extract_image_size(row["row"]))
                break

    if not prompt_info["has_prompt"] and point_cols:
        prompt_info["has_prompt"] = True
        prompt_info["prompt_type"] = "point"
        prompt_info["source"] = f"column '{point_cols[0]}'"

    ready = has_image and has_mask and prompt_info["has_prompt"]

    return {
        "ready": ready,
        "has_image": has_image,
        "image_columns": image_cols,
        "has_mask": has_mask,
        "mask_columns": mask_cols,
        "prompt_columns": prompt_cols,
        "bbox_columns": bbox_cols,
        "point_columns": point_cols,
        "prompt_info": prompt_info,
    }


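# Example prompt cells accepted above (values hypothetical):
#
#   {"bbox": [40, 60, 220, 180]}    # dict or JSON string -> bbox prompt
#   {"points": [[128, 96]]}         # -> point prompt
#
# Dedicated 'bbox'/'point' columns act as a fallback when no 'prompt' column
# exists.

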
def _try_json(value) -> Any:
    if not isinstance(value, str):
        return None
    try:
        return json.loads(value)
    except (json.JSONDecodeError, TypeError):
        return None


def _validate_bbox(bbox, image_size=None) -> Dict[str, Any]:
    """Validate a single bounding box and return diagnostics."""
    result: Dict[str, Any] = {"valid": False}
    if not isinstance(bbox, (list, tuple)):
        result["error"] = "bbox is not a list"
        return result
    if len(bbox) != 4:
        result["error"] = f"expected 4 values, got {len(bbox)}"
        return result
    try:
        vals = [float(v) for v in bbox]
    except (TypeError, ValueError):
        result["error"] = "non-numeric values"
        return result

    if not all(math.isfinite(v) for v in vals):
        result["error"] = "contains non-finite values"
        return result

    x0, y0, x1, y1 = vals
    if x1 <= x0 or y1 <= y0:
        if vals[2] > 0 and vals[3] > 0:
            result["format_hint"] = "likely xywh"
        else:
            result["error"] = "degenerate bbox (zero or negative area)"
            return result
    else:
        result["format_hint"] = "likely xyxy"

    if image_size is not None:
        img_w, img_h = image_size
        if any(v > max(img_w, img_h) * 1.5 for v in vals):
            result["warning"] = "coordinates exceed image bounds"

    result["valid"] = True
    result["values"] = vals
    return result


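# Worked example for _validate_bbox() above (hypothetical values):
# [40, 60, 25, 30] has x1 <= x0 with positive width/height slots, so it is
# flagged {"valid": True, "format_hint": "likely xywh", ...}, while
# [40, 60, 220, 180] is flagged "likely xyxy".

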
def generate_mapping_code(info: Dict[str, Any]) -> str | None:
    """Generate mapping code if needed"""
    if info["ready"]:
        ann_info = info["annotations_info"]
        if not ann_info.get("found"):
            return None

        # Check if format conversion is needed
        ann_col = ann_info.get("column")
        bbox_format = ann_info.get("primary_bbox_format", "unknown")

        if "coco" in bbox_format.lower() or "xywh" in bbox_format.lower():
            # Already COCO format
            return f"""# Dataset appears to be in COCO format (xywh)
# Image column: {info['image_columns'][0] if info['image_columns'] else 'image'}
# Annotation column: {ann_col}
# Use directly with transformers object detection models"""
        elif "xyxy" in bbox_format.lower():
            # Need to convert from XYXY to XYWH
            return f"""# Convert from XYXY (Pascal VOC) to XYWH (COCO) format
def convert_to_coco_format(example):
    annotations = example['{ann_col}']
    if isinstance(annotations, list):
        for ann in annotations:
            if 'bbox' in ann:
                x_min, y_min, x_max, y_max = ann['bbox']
                ann['bbox'] = [x_min, y_min, x_max - x_min, y_max - y_min]
    elif isinstance(annotations, dict) and 'bbox' in annotations:
        bbox = annotations['bbox']
        if isinstance(bbox, list) and len(bbox) > 0 and isinstance(bbox[0], list):
            for i, box in enumerate(bbox):
                x_min, y_min, x_max, y_max = box
                bbox[i] = [x_min, y_min, x_max - x_min, y_max - y_min]
    return example

dataset = dataset.map(convert_to_coco_format)"""

    elif not info["ready"]:
        # Need to create annotations structure
        if info["separate_bbox_columns"] and info["separate_category_columns"]:
            bbox_col = info["separate_bbox_columns"][0]
            cat_col = info["separate_category_columns"][0]

            return f"""# Combine separate bbox and category columns
def create_annotations(example):
    bboxes = example['{bbox_col}']
    categories = example['{cat_col}']

    if not isinstance(bboxes, list):
        bboxes = [bboxes]
    if not isinstance(categories, list):
        categories = [categories]

    annotations = []
    for bbox, cat in zip(bboxes, categories):
        annotations.append({{'bbox': bbox, 'category': cat}})

    example['objects'] = annotations
    return example

dataset = dataset.map(create_annotations)"""

    return None


def format_value_preview(value: Any, max_chars: int) -> str:
    """Format value for preview"""
    if value is None:
        return "None"
    elif isinstance(value, str):
        return value[:max_chars] + ("..." if len(value) > max_chars else "")
    elif isinstance(value, dict):
        keys = list(value.keys())
        return f"{{dict with {len(keys)} keys: {', '.join(keys[:5])}}}"
    elif isinstance(value, list):
        if len(value) == 0:
            return "[]"
        elif isinstance(value[0], dict):
            return f"[{len(value)} items] First item keys: {list(value[0].keys())}"
        elif isinstance(value[0], list):
            return f"[{len(value)} items] First item: {value[0]}"
        else:
            preview = str(value)
            return preview[:max_chars] + ("..." if len(preview) > max_chars else "")
    else:
        preview = str(value)
        return preview[:max_chars] + ("..." if len(preview) > max_chars else "")


def main():
    args = parse_args()

    print(f"Fetching dataset info via Datasets Server API...")

    try:
        # Get splits info
        splits_data = get_splits(args.dataset)
        if not splits_data or "splits" not in splits_data:
            print(f"ERROR: Could not fetch splits for dataset '{args.dataset}'")
            print(f" Dataset may not exist or is not accessible via Datasets Server API")
            sys.exit(1)

        # Find the right config
        available_configs = set()
        split_found = False
        config_to_use = args.config

        for split_info in splits_data["splits"]:
            available_configs.add(split_info["config"])
            if split_info["config"] == args.config and split_info["split"] == args.split:
                split_found = True

        # If default config not found, try first available
        if not split_found and available_configs:
            config_to_use = list(available_configs)[0]
            print(f"Config '{args.config}' not found, trying '{config_to_use}'...")

        # Get rows
        rows_data = get_rows(args.dataset, config_to_use, args.split, offset=0, length=args.samples)

        if not rows_data or "rows" not in rows_data:
            print(f"ERROR: Could not fetch rows for dataset '{args.dataset}'")
            print(f" Split '{args.split}' may not exist")
            print(f" Available configs: {', '.join(sorted(available_configs))}")
            sys.exit(1)

        rows = rows_data["rows"]
        if not rows:
            print(f"ERROR: No rows found in split '{args.split}'")
            sys.exit(1)

        # Extract column info from first row
        first_row = rows[0]["row"]
        columns = list(first_row.keys())
        features = rows_data.get("features", [])

        # Get total count if available
        total_examples = "Unknown"
        for split_info in splits_data["splits"]:
            if split_info["config"] == config_to_use and split_info["split"] == args.split:
                total_examples = f"{split_info.get('num_examples', 'Unknown'):,}" if isinstance(split_info.get('num_examples'), int) else "Unknown"
                break

    except Exception as e:
        print(f"ERROR: {str(e)}")
        sys.exit(1)

    # Run compatibility checks
    od_info = check_object_detection_compatibility(columns, rows)
    ic_info = check_image_classification_compatibility(columns, rows, features)
    sam_info = check_sam_segmentation_compatibility(columns, rows, features)

    # JSON output mode
    if args.json_output:
        result = {
            "dataset": args.dataset,
            "config": config_to_use,
            "split": args.split,
            "total_examples": total_examples,
            "columns": columns,
            "features": [{"name": f["name"], "type": f["type"]} for f in features] if features else [],
            "object_detection_compatibility": od_info,
            "image_classification_compatibility": ic_info,
            "sam_segmentation_compatibility": sam_info,
        }
        print(json.dumps(result, indent=2))
        sys.exit(0)

    # Human-readable output optimized for LLM parsing
    print("=" * 80)
    print(f"VISION DATASET INSPECTION")
    print("=" * 80)

    print(f"\nDataset: {args.dataset}")
    print(f"Config: {config_to_use}")
    print(f"Split: {args.split}")
    print(f"Total examples: {total_examples}")
    print(f"Samples fetched: {len(rows)}")

    print(f"\n{'COLUMNS':-<80}")
    if features:
        for feature in features:
            print(f" {feature['name']}: {feature['type']}")
    else:
        for col in columns:
            print(f" {col}: (type info not available)")

    print(f"\n{'EXAMPLE DATA':-<80}")
    example = first_row
    for col in columns:
        value = example.get(col)
        display = format_value_preview(value, args.preview)
        print(f"\n{col}:")
        print(f" {display}")

    # --- Image Classification ---
    print(f"\n{'IMAGE CLASSIFICATION COMPATIBILITY':-<80}")
    print(f"\n[STATUS] {'✓ READY' if ic_info['ready'] else '✗ NOT COMPATIBLE'}")

    print(f"\nImage Column:")
    if ic_info["has_image"]:
        print(f" ✓ Found: {', '.join(ic_info['image_columns'])}")
    else:
        print(f" ✗ No image column detected")

    print(f"\nLabel Column:")
    if ic_info["has_label"]:
        print(f" ✓ Found: {', '.join(ic_info['label_columns'])}")
        li = ic_info["label_info"]
        if li.get("type"):
            print(f" • Type: {li['type']}")
        if li.get("num_classes"):
            print(f" • Number of Classes: {li['num_classes']}")
        if li.get("class_names"):
            names = li["class_names"]
            display = ", ".join(str(n) for n in names[:10])
            if len(names) > 10:
                display += f" ... ({li['num_classes']} total)"
            print(f" • Classes: {display}")
        elif li.get("sample_unique_labels"):
            labels = li["sample_unique_labels"]
            display = ", ".join(str(l) for l in labels[:10])
            if li.get("sample_unique_count", 0) > 10:
                display += f" ... ({li['sample_unique_count']}+ from sample)"
            print(f" • Sample labels: {display}")
    else:
        print(f" ✗ No label column detected")
        print(f" Expected column names: 'label', 'labels', 'class', 'fine_label'")

    if ic_info["ready"]:
        lc = ic_info["label_info"].get("column", "label")
        print(f"\n Use with: scripts/image_classification_training.py")
        print(f" --image_column_name {ic_info['image_columns'][0]} --label_column_name {lc}")

    # --- Object Detection ---
    print(f"\n{'OBJECT DETECTION COMPATIBILITY':-<80}")
    print(f"\n[STATUS] {'✓ READY' if od_info['ready'] else '✗ NOT COMPATIBLE'}")

    print(f"\nImage Column:")
    if od_info["has_image"]:
        print(f" ✓ Found: {', '.join(od_info['image_columns'])}")
    else:
        print(f" ✗ No image column detected")
        print(f" Expected column names: 'image', 'img', 'picture', 'photo'")

    print(f"\nAnnotations:")
    if od_info["has_annotations"]:
        print(f" ✓ Found: {', '.join(od_info['annotation_columns'])}")
        ann_info = od_info["annotations_info"]
        if ann_info.get("found"):
            print(f"\n Annotation Details:")
            print(f" • Column: {ann_info['column']}")
            if ann_info.get("primary_bbox_format"):
                print(f" • BBox Format: {ann_info['primary_bbox_format']}")
            if ann_info.get("num_classes", 0) > 0:
                print(f" • Number of Classes: {ann_info['num_classes']}")
                print(f" • Classes: {', '.join(ann_info['categories_found'][:10])}")
                if len(ann_info['categories_found']) > 10:
                    print(f" (showing first 10 of {len(ann_info['categories_found'])})")
            print(f" • Avg Objects/Image: {ann_info['avg_objects_per_image']}")
            print(f" • Min Objects: {ann_info['min_objects']}")
            print(f" • Max Objects: {ann_info['max_objects']}")
    elif od_info["separate_bbox_columns"] and od_info["separate_category_columns"]:
        print(f" ⚠ Separate bbox and category columns found:")
        print(f" BBox columns: {', '.join(od_info['separate_bbox_columns'])}")
        print(f" Category columns: {', '.join(od_info['separate_category_columns'])}")
        print(f" Action: These need to be combined (see mapping code below)")
    else:
        print(f" ✗ No annotation columns detected")
        print(f" Expected: 'objects', 'annotations', 'bbox'/'bboxes' + 'category'/'label'")

    # --- SAM Segmentation ---
    print(f"\n{'SAM SEGMENTATION COMPATIBILITY':-<80}")
    print(f"\n[STATUS] {'✓ READY' if sam_info['ready'] else '✗ NOT COMPATIBLE'}")

    print(f"\nImage Column:")
    if sam_info["has_image"]:
        print(f" ✓ Found: {', '.join(sam_info['image_columns'])}")
    else:
        print(f" ✗ No image column detected")

    print(f"\nMask Column:")
    if sam_info["has_mask"]:
        print(f" ✓ Found: {', '.join(sam_info['mask_columns'])}")
    else:
        print(f" ✗ No mask column detected")
        print(f" Expected column names: 'mask', 'segmentation', 'alpha', 'matte'")

    print(f"\nPrompt:")
    pi = sam_info["prompt_info"]
    if pi["has_prompt"]:
        print(f" ✓ Type: {pi['prompt_type']} (from {pi['source']})")
        if pi.get("bbox_valid"):
            bv = pi["bbox_valid"]
            if bv["valid"]:
                print(f" • BBox values: {bv.get('values')}")
                if bv.get("format_hint"):
                    print(f" • Format: {bv['format_hint']}")
                if bv.get("warning"):
                    print(f" ⚠ {bv['warning']}")
            else:
                print(f" ✗ Invalid bbox: {bv.get('error', 'unknown error')}")
    else:
        print(f" ✗ No prompt detected")
        print(f" Expected: 'prompt' column (JSON with bbox/point), or 'bbox'/'point' column")

    if sam_info["ready"]:
        pc = sam_info["prompt_columns"][0] if sam_info["prompt_columns"] else None
        args_hint = f"--prompt_type {pi['prompt_type']}"
        if pc:
            args_hint += f" --prompt_column_name {pc}"
        print(f"\n Use with: scripts/sam_segmentation_training.py")
        print(f" {args_hint}")

    # Mapping code (OD only)
    mapping_code = generate_mapping_code(od_info)

    if mapping_code:
        print(f"\n{'OD PREPROCESSING CODE':-<80}")
        print(mapping_code)
    elif od_info["ready"]:
        print(f"\n ✓ No OD preprocessing needed.")

    # --- Summary ---
    print(f"\n{'SUMMARY':-<80}")
    if ic_info["ready"]:
        num_cls = ic_info["label_info"].get("num_classes") or ic_info["label_info"].get("sample_unique_count", "?")
        print(f"✓ Image Classification: READY ({num_cls} classes)")
    else:
        print(f"✗ Image Classification: not compatible")

    if od_info["ready"]:
        ann_info = od_info["annotations_info"]
        fmt = ann_info.get("primary_bbox_format", "")
        cls = ann_info.get("num_classes", "?")
        print(f"✓ Object Detection: READY ({cls} classes, {fmt})")
    else:
        print(f"✗ Object Detection: not compatible")

    if sam_info["ready"]:
        print(f"✓ SAM Segmentation: READY (prompt: {pi['prompt_type']})")
    else:
        print(f"✗ SAM Segmentation: not compatible")

    print(f"\nNote: Used Datasets Server API (instant, no download required)")

    print("\n" + "=" * 80)
    sys.exit(0)


if __name__ == "__main__":
    try:
        main()
    except KeyboardInterrupt:
        sys.exit(0)
    except Exception as e:
        print(f"ERROR: {e}", file=sys.stderr)
        sys.exit(1)