@synsci/cli-darwin-x64 1.1.49

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (373) hide show
  1. package/bin/skills/accelerate/SKILL.md +332 -0
  2. package/bin/skills/accelerate/references/custom-plugins.md +453 -0
  3. package/bin/skills/accelerate/references/megatron-integration.md +489 -0
  4. package/bin/skills/accelerate/references/performance.md +525 -0
  5. package/bin/skills/audiocraft/SKILL.md +564 -0
  6. package/bin/skills/audiocraft/references/advanced-usage.md +666 -0
  7. package/bin/skills/audiocraft/references/troubleshooting.md +504 -0
  8. package/bin/skills/autogpt/SKILL.md +403 -0
  9. package/bin/skills/autogpt/references/advanced-usage.md +535 -0
  10. package/bin/skills/autogpt/references/troubleshooting.md +420 -0
  11. package/bin/skills/awq/SKILL.md +310 -0
  12. package/bin/skills/awq/references/advanced-usage.md +324 -0
  13. package/bin/skills/awq/references/troubleshooting.md +344 -0
  14. package/bin/skills/axolotl/SKILL.md +158 -0
  15. package/bin/skills/axolotl/references/api.md +5548 -0
  16. package/bin/skills/axolotl/references/dataset-formats.md +1029 -0
  17. package/bin/skills/axolotl/references/index.md +15 -0
  18. package/bin/skills/axolotl/references/other.md +3563 -0
  19. package/bin/skills/bigcode-evaluation-harness/SKILL.md +405 -0
  20. package/bin/skills/bigcode-evaluation-harness/references/benchmarks.md +393 -0
  21. package/bin/skills/bigcode-evaluation-harness/references/custom-tasks.md +424 -0
  22. package/bin/skills/bigcode-evaluation-harness/references/issues.md +394 -0
  23. package/bin/skills/bitsandbytes/SKILL.md +411 -0
  24. package/bin/skills/bitsandbytes/references/memory-optimization.md +521 -0
  25. package/bin/skills/bitsandbytes/references/qlora-training.md +521 -0
  26. package/bin/skills/bitsandbytes/references/quantization-formats.md +447 -0
  27. package/bin/skills/blip-2/SKILL.md +564 -0
  28. package/bin/skills/blip-2/references/advanced-usage.md +680 -0
  29. package/bin/skills/blip-2/references/troubleshooting.md +526 -0
  30. package/bin/skills/chroma/SKILL.md +406 -0
  31. package/bin/skills/chroma/references/integration.md +38 -0
  32. package/bin/skills/clip/SKILL.md +253 -0
  33. package/bin/skills/clip/references/applications.md +207 -0
  34. package/bin/skills/constitutional-ai/SKILL.md +290 -0
  35. package/bin/skills/crewai/SKILL.md +498 -0
  36. package/bin/skills/crewai/references/flows.md +438 -0
  37. package/bin/skills/crewai/references/tools.md +429 -0
  38. package/bin/skills/crewai/references/troubleshooting.md +480 -0
  39. package/bin/skills/deepspeed/SKILL.md +141 -0
  40. package/bin/skills/deepspeed/references/08.md +17 -0
  41. package/bin/skills/deepspeed/references/09.md +173 -0
  42. package/bin/skills/deepspeed/references/2020.md +378 -0
  43. package/bin/skills/deepspeed/references/2023.md +279 -0
  44. package/bin/skills/deepspeed/references/assets.md +179 -0
  45. package/bin/skills/deepspeed/references/index.md +35 -0
  46. package/bin/skills/deepspeed/references/mii.md +118 -0
  47. package/bin/skills/deepspeed/references/other.md +1191 -0
  48. package/bin/skills/deepspeed/references/tutorials.md +6554 -0
  49. package/bin/skills/dspy/SKILL.md +590 -0
  50. package/bin/skills/dspy/references/examples.md +663 -0
  51. package/bin/skills/dspy/references/modules.md +475 -0
  52. package/bin/skills/dspy/references/optimizers.md +566 -0
  53. package/bin/skills/faiss/SKILL.md +221 -0
  54. package/bin/skills/faiss/references/index_types.md +280 -0
  55. package/bin/skills/flash-attention/SKILL.md +367 -0
  56. package/bin/skills/flash-attention/references/benchmarks.md +215 -0
  57. package/bin/skills/flash-attention/references/transformers-integration.md +293 -0
  58. package/bin/skills/gguf/SKILL.md +427 -0
  59. package/bin/skills/gguf/references/advanced-usage.md +504 -0
  60. package/bin/skills/gguf/references/troubleshooting.md +442 -0
  61. package/bin/skills/gptq/SKILL.md +450 -0
  62. package/bin/skills/gptq/references/calibration.md +337 -0
  63. package/bin/skills/gptq/references/integration.md +129 -0
  64. package/bin/skills/gptq/references/troubleshooting.md +95 -0
  65. package/bin/skills/grpo-rl-training/README.md +97 -0
  66. package/bin/skills/grpo-rl-training/SKILL.md +572 -0
  67. package/bin/skills/grpo-rl-training/examples/reward_functions_library.py +393 -0
  68. package/bin/skills/grpo-rl-training/templates/basic_grpo_training.py +228 -0
  69. package/bin/skills/guidance/SKILL.md +572 -0
  70. package/bin/skills/guidance/references/backends.md +554 -0
  71. package/bin/skills/guidance/references/constraints.md +674 -0
  72. package/bin/skills/guidance/references/examples.md +767 -0
  73. package/bin/skills/hqq/SKILL.md +445 -0
  74. package/bin/skills/hqq/references/advanced-usage.md +528 -0
  75. package/bin/skills/hqq/references/troubleshooting.md +503 -0
  76. package/bin/skills/hugging-face-cli/SKILL.md +191 -0
  77. package/bin/skills/hugging-face-cli/references/commands.md +954 -0
  78. package/bin/skills/hugging-face-cli/references/examples.md +374 -0
  79. package/bin/skills/hugging-face-datasets/SKILL.md +547 -0
  80. package/bin/skills/hugging-face-datasets/examples/diverse_training_examples.json +239 -0
  81. package/bin/skills/hugging-face-datasets/examples/system_prompt_template.txt +196 -0
  82. package/bin/skills/hugging-face-datasets/examples/training_examples.json +176 -0
  83. package/bin/skills/hugging-face-datasets/scripts/dataset_manager.py +522 -0
  84. package/bin/skills/hugging-face-datasets/scripts/sql_manager.py +844 -0
  85. package/bin/skills/hugging-face-datasets/templates/chat.json +55 -0
  86. package/bin/skills/hugging-face-datasets/templates/classification.json +62 -0
  87. package/bin/skills/hugging-face-datasets/templates/completion.json +51 -0
  88. package/bin/skills/hugging-face-datasets/templates/custom.json +75 -0
  89. package/bin/skills/hugging-face-datasets/templates/qa.json +54 -0
  90. package/bin/skills/hugging-face-datasets/templates/tabular.json +81 -0
  91. package/bin/skills/hugging-face-evaluation/SKILL.md +656 -0
  92. package/bin/skills/hugging-face-evaluation/examples/USAGE_EXAMPLES.md +382 -0
  93. package/bin/skills/hugging-face-evaluation/examples/artificial_analysis_to_hub.py +141 -0
  94. package/bin/skills/hugging-face-evaluation/examples/example_readme_tables.md +135 -0
  95. package/bin/skills/hugging-face-evaluation/examples/metric_mapping.json +50 -0
  96. package/bin/skills/hugging-face-evaluation/requirements.txt +20 -0
  97. package/bin/skills/hugging-face-evaluation/scripts/evaluation_manager.py +1374 -0
  98. package/bin/skills/hugging-face-evaluation/scripts/inspect_eval_uv.py +104 -0
  99. package/bin/skills/hugging-face-evaluation/scripts/inspect_vllm_uv.py +317 -0
  100. package/bin/skills/hugging-face-evaluation/scripts/lighteval_vllm_uv.py +303 -0
  101. package/bin/skills/hugging-face-evaluation/scripts/run_eval_job.py +98 -0
  102. package/bin/skills/hugging-face-evaluation/scripts/run_vllm_eval_job.py +331 -0
  103. package/bin/skills/hugging-face-evaluation/scripts/test_extraction.py +206 -0
  104. package/bin/skills/hugging-face-jobs/SKILL.md +1041 -0
  105. package/bin/skills/hugging-face-jobs/index.html +216 -0
  106. package/bin/skills/hugging-face-jobs/references/hardware_guide.md +336 -0
  107. package/bin/skills/hugging-face-jobs/references/hub_saving.md +352 -0
  108. package/bin/skills/hugging-face-jobs/references/token_usage.md +546 -0
  109. package/bin/skills/hugging-face-jobs/references/troubleshooting.md +475 -0
  110. package/bin/skills/hugging-face-jobs/scripts/cot-self-instruct.py +718 -0
  111. package/bin/skills/hugging-face-jobs/scripts/finepdfs-stats.py +546 -0
  112. package/bin/skills/hugging-face-jobs/scripts/generate-responses.py +587 -0
  113. package/bin/skills/hugging-face-model-trainer/SKILL.md +711 -0
  114. package/bin/skills/hugging-face-model-trainer/references/gguf_conversion.md +296 -0
  115. package/bin/skills/hugging-face-model-trainer/references/hardware_guide.md +283 -0
  116. package/bin/skills/hugging-face-model-trainer/references/hub_saving.md +364 -0
  117. package/bin/skills/hugging-face-model-trainer/references/reliability_principles.md +371 -0
  118. package/bin/skills/hugging-face-model-trainer/references/trackio_guide.md +189 -0
  119. package/bin/skills/hugging-face-model-trainer/references/training_methods.md +150 -0
  120. package/bin/skills/hugging-face-model-trainer/references/training_patterns.md +203 -0
  121. package/bin/skills/hugging-face-model-trainer/references/troubleshooting.md +282 -0
  122. package/bin/skills/hugging-face-model-trainer/scripts/convert_to_gguf.py +424 -0
  123. package/bin/skills/hugging-face-model-trainer/scripts/dataset_inspector.py +417 -0
  124. package/bin/skills/hugging-face-model-trainer/scripts/estimate_cost.py +150 -0
  125. package/bin/skills/hugging-face-model-trainer/scripts/train_dpo_example.py +106 -0
  126. package/bin/skills/hugging-face-model-trainer/scripts/train_grpo_example.py +89 -0
  127. package/bin/skills/hugging-face-model-trainer/scripts/train_sft_example.py +122 -0
  128. package/bin/skills/hugging-face-paper-publisher/SKILL.md +627 -0
  129. package/bin/skills/hugging-face-paper-publisher/examples/example_usage.md +327 -0
  130. package/bin/skills/hugging-face-paper-publisher/references/quick_reference.md +216 -0
  131. package/bin/skills/hugging-face-paper-publisher/scripts/paper_manager.py +508 -0
  132. package/bin/skills/hugging-face-paper-publisher/templates/arxiv.md +299 -0
  133. package/bin/skills/hugging-face-paper-publisher/templates/ml-report.md +358 -0
  134. package/bin/skills/hugging-face-paper-publisher/templates/modern.md +319 -0
  135. package/bin/skills/hugging-face-paper-publisher/templates/standard.md +201 -0
  136. package/bin/skills/hugging-face-tool-builder/SKILL.md +115 -0
  137. package/bin/skills/hugging-face-tool-builder/references/baseline_hf_api.py +57 -0
  138. package/bin/skills/hugging-face-tool-builder/references/baseline_hf_api.sh +40 -0
  139. package/bin/skills/hugging-face-tool-builder/references/baseline_hf_api.tsx +57 -0
  140. package/bin/skills/hugging-face-tool-builder/references/find_models_by_paper.sh +230 -0
  141. package/bin/skills/hugging-face-tool-builder/references/hf_enrich_models.sh +96 -0
  142. package/bin/skills/hugging-face-tool-builder/references/hf_model_card_frontmatter.sh +188 -0
  143. package/bin/skills/hugging-face-tool-builder/references/hf_model_papers_auth.sh +171 -0
  144. package/bin/skills/hugging-face-trackio/SKILL.md +65 -0
  145. package/bin/skills/hugging-face-trackio/references/logging_metrics.md +206 -0
  146. package/bin/skills/hugging-face-trackio/references/retrieving_metrics.md +223 -0
  147. package/bin/skills/huggingface-tokenizers/SKILL.md +516 -0
  148. package/bin/skills/huggingface-tokenizers/references/algorithms.md +653 -0
  149. package/bin/skills/huggingface-tokenizers/references/integration.md +637 -0
  150. package/bin/skills/huggingface-tokenizers/references/pipeline.md +723 -0
  151. package/bin/skills/huggingface-tokenizers/references/training.md +565 -0
  152. package/bin/skills/instructor/SKILL.md +740 -0
  153. package/bin/skills/instructor/references/examples.md +107 -0
  154. package/bin/skills/instructor/references/providers.md +70 -0
  155. package/bin/skills/instructor/references/validation.md +606 -0
  156. package/bin/skills/knowledge-distillation/SKILL.md +458 -0
  157. package/bin/skills/knowledge-distillation/references/minillm.md +334 -0
  158. package/bin/skills/lambda-labs/SKILL.md +545 -0
  159. package/bin/skills/lambda-labs/references/advanced-usage.md +611 -0
  160. package/bin/skills/lambda-labs/references/troubleshooting.md +530 -0
  161. package/bin/skills/langchain/SKILL.md +480 -0
  162. package/bin/skills/langchain/references/agents.md +499 -0
  163. package/bin/skills/langchain/references/integration.md +562 -0
  164. package/bin/skills/langchain/references/rag.md +600 -0
  165. package/bin/skills/langsmith/SKILL.md +422 -0
  166. package/bin/skills/langsmith/references/advanced-usage.md +548 -0
  167. package/bin/skills/langsmith/references/troubleshooting.md +537 -0
  168. package/bin/skills/litgpt/SKILL.md +469 -0
  169. package/bin/skills/litgpt/references/custom-models.md +568 -0
  170. package/bin/skills/litgpt/references/distributed-training.md +451 -0
  171. package/bin/skills/litgpt/references/supported-models.md +336 -0
  172. package/bin/skills/litgpt/references/training-recipes.md +619 -0
  173. package/bin/skills/llama-cpp/SKILL.md +258 -0
  174. package/bin/skills/llama-cpp/references/optimization.md +89 -0
  175. package/bin/skills/llama-cpp/references/quantization.md +213 -0
  176. package/bin/skills/llama-cpp/references/server.md +125 -0
  177. package/bin/skills/llama-factory/SKILL.md +80 -0
  178. package/bin/skills/llama-factory/references/_images.md +23 -0
  179. package/bin/skills/llama-factory/references/advanced.md +1055 -0
  180. package/bin/skills/llama-factory/references/getting_started.md +349 -0
  181. package/bin/skills/llama-factory/references/index.md +19 -0
  182. package/bin/skills/llama-factory/references/other.md +31 -0
  183. package/bin/skills/llamaguard/SKILL.md +337 -0
  184. package/bin/skills/llamaindex/SKILL.md +569 -0
  185. package/bin/skills/llamaindex/references/agents.md +83 -0
  186. package/bin/skills/llamaindex/references/data_connectors.md +108 -0
  187. package/bin/skills/llamaindex/references/query_engines.md +406 -0
  188. package/bin/skills/llava/SKILL.md +304 -0
  189. package/bin/skills/llava/references/training.md +197 -0
  190. package/bin/skills/lm-evaluation-harness/SKILL.md +490 -0
  191. package/bin/skills/lm-evaluation-harness/references/api-evaluation.md +490 -0
  192. package/bin/skills/lm-evaluation-harness/references/benchmark-guide.md +488 -0
  193. package/bin/skills/lm-evaluation-harness/references/custom-tasks.md +602 -0
  194. package/bin/skills/lm-evaluation-harness/references/distributed-eval.md +519 -0
  195. package/bin/skills/long-context/SKILL.md +536 -0
  196. package/bin/skills/long-context/references/extension_methods.md +468 -0
  197. package/bin/skills/long-context/references/fine_tuning.md +611 -0
  198. package/bin/skills/long-context/references/rope.md +402 -0
  199. package/bin/skills/mamba/SKILL.md +260 -0
  200. package/bin/skills/mamba/references/architecture-details.md +206 -0
  201. package/bin/skills/mamba/references/benchmarks.md +255 -0
  202. package/bin/skills/mamba/references/training-guide.md +388 -0
  203. package/bin/skills/megatron-core/SKILL.md +366 -0
  204. package/bin/skills/megatron-core/references/benchmarks.md +249 -0
  205. package/bin/skills/megatron-core/references/parallelism-guide.md +404 -0
  206. package/bin/skills/megatron-core/references/production-examples.md +473 -0
  207. package/bin/skills/megatron-core/references/training-recipes.md +547 -0
  208. package/bin/skills/miles/SKILL.md +315 -0
  209. package/bin/skills/miles/references/api-reference.md +141 -0
  210. package/bin/skills/miles/references/troubleshooting.md +352 -0
  211. package/bin/skills/mlflow/SKILL.md +704 -0
  212. package/bin/skills/mlflow/references/deployment.md +744 -0
  213. package/bin/skills/mlflow/references/model-registry.md +770 -0
  214. package/bin/skills/mlflow/references/tracking.md +680 -0
  215. package/bin/skills/modal/SKILL.md +341 -0
  216. package/bin/skills/modal/references/advanced-usage.md +503 -0
  217. package/bin/skills/modal/references/troubleshooting.md +494 -0
  218. package/bin/skills/model-merging/SKILL.md +539 -0
  219. package/bin/skills/model-merging/references/evaluation.md +462 -0
  220. package/bin/skills/model-merging/references/examples.md +428 -0
  221. package/bin/skills/model-merging/references/methods.md +352 -0
  222. package/bin/skills/model-pruning/SKILL.md +495 -0
  223. package/bin/skills/model-pruning/references/wanda.md +347 -0
  224. package/bin/skills/moe-training/SKILL.md +526 -0
  225. package/bin/skills/moe-training/references/architectures.md +432 -0
  226. package/bin/skills/moe-training/references/inference.md +348 -0
  227. package/bin/skills/moe-training/references/training.md +425 -0
  228. package/bin/skills/nanogpt/SKILL.md +290 -0
  229. package/bin/skills/nanogpt/references/architecture.md +382 -0
  230. package/bin/skills/nanogpt/references/data.md +476 -0
  231. package/bin/skills/nanogpt/references/training.md +564 -0
  232. package/bin/skills/nemo-curator/SKILL.md +383 -0
  233. package/bin/skills/nemo-curator/references/deduplication.md +87 -0
  234. package/bin/skills/nemo-curator/references/filtering.md +102 -0
  235. package/bin/skills/nemo-evaluator/SKILL.md +494 -0
  236. package/bin/skills/nemo-evaluator/references/adapter-system.md +340 -0
  237. package/bin/skills/nemo-evaluator/references/configuration.md +447 -0
  238. package/bin/skills/nemo-evaluator/references/custom-benchmarks.md +315 -0
  239. package/bin/skills/nemo-evaluator/references/execution-backends.md +361 -0
  240. package/bin/skills/nemo-guardrails/SKILL.md +297 -0
  241. package/bin/skills/nnsight/SKILL.md +436 -0
  242. package/bin/skills/nnsight/references/README.md +78 -0
  243. package/bin/skills/nnsight/references/api.md +344 -0
  244. package/bin/skills/nnsight/references/tutorials.md +300 -0
  245. package/bin/skills/openrlhf/SKILL.md +249 -0
  246. package/bin/skills/openrlhf/references/algorithm-comparison.md +404 -0
  247. package/bin/skills/openrlhf/references/custom-rewards.md +530 -0
  248. package/bin/skills/openrlhf/references/hybrid-engine.md +287 -0
  249. package/bin/skills/openrlhf/references/multi-node-training.md +454 -0
  250. package/bin/skills/outlines/SKILL.md +652 -0
  251. package/bin/skills/outlines/references/backends.md +615 -0
  252. package/bin/skills/outlines/references/examples.md +773 -0
  253. package/bin/skills/outlines/references/json_generation.md +652 -0
  254. package/bin/skills/peft/SKILL.md +431 -0
  255. package/bin/skills/peft/references/advanced-usage.md +514 -0
  256. package/bin/skills/peft/references/troubleshooting.md +480 -0
  257. package/bin/skills/phoenix/SKILL.md +475 -0
  258. package/bin/skills/phoenix/references/advanced-usage.md +619 -0
  259. package/bin/skills/phoenix/references/troubleshooting.md +538 -0
  260. package/bin/skills/pinecone/SKILL.md +358 -0
  261. package/bin/skills/pinecone/references/deployment.md +181 -0
  262. package/bin/skills/pytorch-fsdp/SKILL.md +126 -0
  263. package/bin/skills/pytorch-fsdp/references/index.md +7 -0
  264. package/bin/skills/pytorch-fsdp/references/other.md +4249 -0
  265. package/bin/skills/pytorch-lightning/SKILL.md +346 -0
  266. package/bin/skills/pytorch-lightning/references/callbacks.md +436 -0
  267. package/bin/skills/pytorch-lightning/references/distributed.md +490 -0
  268. package/bin/skills/pytorch-lightning/references/hyperparameter-tuning.md +556 -0
  269. package/bin/skills/pyvene/SKILL.md +473 -0
  270. package/bin/skills/pyvene/references/README.md +73 -0
  271. package/bin/skills/pyvene/references/api.md +383 -0
  272. package/bin/skills/pyvene/references/tutorials.md +376 -0
  273. package/bin/skills/qdrant/SKILL.md +493 -0
  274. package/bin/skills/qdrant/references/advanced-usage.md +648 -0
  275. package/bin/skills/qdrant/references/troubleshooting.md +631 -0
  276. package/bin/skills/ray-data/SKILL.md +326 -0
  277. package/bin/skills/ray-data/references/integration.md +82 -0
  278. package/bin/skills/ray-data/references/transformations.md +83 -0
  279. package/bin/skills/ray-train/SKILL.md +406 -0
  280. package/bin/skills/ray-train/references/multi-node.md +628 -0
  281. package/bin/skills/rwkv/SKILL.md +260 -0
  282. package/bin/skills/rwkv/references/architecture-details.md +344 -0
  283. package/bin/skills/rwkv/references/rwkv7.md +386 -0
  284. package/bin/skills/rwkv/references/state-management.md +369 -0
  285. package/bin/skills/saelens/SKILL.md +386 -0
  286. package/bin/skills/saelens/references/README.md +70 -0
  287. package/bin/skills/saelens/references/api.md +333 -0
  288. package/bin/skills/saelens/references/tutorials.md +318 -0
  289. package/bin/skills/segment-anything/SKILL.md +500 -0
  290. package/bin/skills/segment-anything/references/advanced-usage.md +589 -0
  291. package/bin/skills/segment-anything/references/troubleshooting.md +484 -0
  292. package/bin/skills/sentence-transformers/SKILL.md +255 -0
  293. package/bin/skills/sentence-transformers/references/models.md +123 -0
  294. package/bin/skills/sentencepiece/SKILL.md +235 -0
  295. package/bin/skills/sentencepiece/references/algorithms.md +200 -0
  296. package/bin/skills/sentencepiece/references/training.md +304 -0
  297. package/bin/skills/sglang/SKILL.md +442 -0
  298. package/bin/skills/sglang/references/deployment.md +490 -0
  299. package/bin/skills/sglang/references/radix-attention.md +413 -0
  300. package/bin/skills/sglang/references/structured-generation.md +541 -0
  301. package/bin/skills/simpo/SKILL.md +219 -0
  302. package/bin/skills/simpo/references/datasets.md +478 -0
  303. package/bin/skills/simpo/references/hyperparameters.md +452 -0
  304. package/bin/skills/simpo/references/loss-functions.md +350 -0
  305. package/bin/skills/skypilot/SKILL.md +509 -0
  306. package/bin/skills/skypilot/references/advanced-usage.md +491 -0
  307. package/bin/skills/skypilot/references/troubleshooting.md +570 -0
  308. package/bin/skills/slime/SKILL.md +464 -0
  309. package/bin/skills/slime/references/api-reference.md +392 -0
  310. package/bin/skills/slime/references/troubleshooting.md +386 -0
  311. package/bin/skills/speculative-decoding/SKILL.md +467 -0
  312. package/bin/skills/speculative-decoding/references/lookahead.md +309 -0
  313. package/bin/skills/speculative-decoding/references/medusa.md +350 -0
  314. package/bin/skills/stable-diffusion/SKILL.md +519 -0
  315. package/bin/skills/stable-diffusion/references/advanced-usage.md +716 -0
  316. package/bin/skills/stable-diffusion/references/troubleshooting.md +555 -0
  317. package/bin/skills/tensorboard/SKILL.md +629 -0
  318. package/bin/skills/tensorboard/references/integrations.md +638 -0
  319. package/bin/skills/tensorboard/references/profiling.md +545 -0
  320. package/bin/skills/tensorboard/references/visualization.md +620 -0
  321. package/bin/skills/tensorrt-llm/SKILL.md +187 -0
  322. package/bin/skills/tensorrt-llm/references/multi-gpu.md +298 -0
  323. package/bin/skills/tensorrt-llm/references/optimization.md +242 -0
  324. package/bin/skills/tensorrt-llm/references/serving.md +470 -0
  325. package/bin/skills/tinker/SKILL.md +362 -0
  326. package/bin/skills/tinker/references/api-reference.md +168 -0
  327. package/bin/skills/tinker/references/getting-started.md +157 -0
  328. package/bin/skills/tinker/references/loss-functions.md +163 -0
  329. package/bin/skills/tinker/references/models-and-lora.md +139 -0
  330. package/bin/skills/tinker/references/recipes.md +280 -0
  331. package/bin/skills/tinker/references/reinforcement-learning.md +212 -0
  332. package/bin/skills/tinker/references/rendering.md +243 -0
  333. package/bin/skills/tinker/references/supervised-learning.md +232 -0
  334. package/bin/skills/tinker-training-cost/SKILL.md +187 -0
  335. package/bin/skills/tinker-training-cost/scripts/calculate_cost.py +123 -0
  336. package/bin/skills/torchforge/SKILL.md +433 -0
  337. package/bin/skills/torchforge/references/api-reference.md +327 -0
  338. package/bin/skills/torchforge/references/troubleshooting.md +409 -0
  339. package/bin/skills/torchtitan/SKILL.md +358 -0
  340. package/bin/skills/torchtitan/references/checkpoint.md +181 -0
  341. package/bin/skills/torchtitan/references/custom-models.md +258 -0
  342. package/bin/skills/torchtitan/references/float8.md +133 -0
  343. package/bin/skills/torchtitan/references/fsdp.md +126 -0
  344. package/bin/skills/transformer-lens/SKILL.md +346 -0
  345. package/bin/skills/transformer-lens/references/README.md +54 -0
  346. package/bin/skills/transformer-lens/references/api.md +362 -0
  347. package/bin/skills/transformer-lens/references/tutorials.md +339 -0
  348. package/bin/skills/trl-fine-tuning/SKILL.md +455 -0
  349. package/bin/skills/trl-fine-tuning/references/dpo-variants.md +227 -0
  350. package/bin/skills/trl-fine-tuning/references/online-rl.md +82 -0
  351. package/bin/skills/trl-fine-tuning/references/reward-modeling.md +122 -0
  352. package/bin/skills/trl-fine-tuning/references/sft-training.md +168 -0
  353. package/bin/skills/unsloth/SKILL.md +80 -0
  354. package/bin/skills/unsloth/references/index.md +7 -0
  355. package/bin/skills/unsloth/references/llms-full.md +16799 -0
  356. package/bin/skills/unsloth/references/llms-txt.md +12044 -0
  357. package/bin/skills/unsloth/references/llms.md +82 -0
  358. package/bin/skills/verl/SKILL.md +391 -0
  359. package/bin/skills/verl/references/api-reference.md +301 -0
  360. package/bin/skills/verl/references/troubleshooting.md +391 -0
  361. package/bin/skills/vllm/SKILL.md +364 -0
  362. package/bin/skills/vllm/references/optimization.md +226 -0
  363. package/bin/skills/vllm/references/quantization.md +284 -0
  364. package/bin/skills/vllm/references/server-deployment.md +255 -0
  365. package/bin/skills/vllm/references/troubleshooting.md +447 -0
  366. package/bin/skills/weights-and-biases/SKILL.md +590 -0
  367. package/bin/skills/weights-and-biases/references/artifacts.md +584 -0
  368. package/bin/skills/weights-and-biases/references/integrations.md +700 -0
  369. package/bin/skills/weights-and-biases/references/sweeps.md +847 -0
  370. package/bin/skills/whisper/SKILL.md +317 -0
  371. package/bin/skills/whisper/references/languages.md +189 -0
  372. package/bin/synsc +0 -0
  373. package/package.json +10 -0
@@ -0,0 +1,547 @@
1
+ ---
2
+ name: hugging-face-datasets
3
+ description: Create and manage datasets on Hugging Face Hub. Supports initializing repos, defining configs/system prompts, streaming row updates, and SQL-based dataset querying/transformation. Designed to work alongside HF MCP server for comprehensive dataset workflows.
4
+ version: 1.0.0
5
+ author: Synthetic Sciences
6
+ license: MIT
7
+ tags: [Hugging Face, Datasets, Data Loading, Data Processing]
8
+ dependencies: [huggingface-hub, transformers]
9
+ ---
10
+
11
+ # Overview
12
+ This skill provides tools to manage datasets on the Hugging Face Hub with a focus on creation, configuration, content management, and SQL-based data manipulation. It is designed to complement the existing Hugging Face MCP server by providing dataset editing and querying capabilities.
13
+
14
+ ## Integration with HF MCP Server
15
+ - **Use HF MCP Server for**: Dataset discovery, search, and metadata retrieval
16
+ - **Use This Skill for**: Dataset creation, content editing, SQL queries, data transformation, and structured data formatting
17
+
18
+ # Version
19
+ 2.1.0
20
+
21
+ # Dependencies
22
+ This skill uses PEP 723 scripts with inline dependency management.
23
+ Scripts auto-install their requirements when run with: `uv run scripts/script_name.py`
24
+
25
+ - uv (Python package manager)
26
+ - Getting Started: See "Usage Instructions" below for PEP 723 usage
27
+
28
+ # Core Capabilities
29
+
30
+ ## 1. Dataset Lifecycle Management
31
+ - **Initialize**: Create new dataset repositories with proper structure
32
+ - **Configure**: Store detailed configuration including system prompts and metadata
33
+ - **Stream Updates**: Add rows efficiently without downloading entire datasets
34
+
35
+ ## 2. SQL-Based Dataset Querying (NEW)
36
+ Query any Hugging Face dataset using DuckDB SQL via `scripts/sql_manager.py`:
37
+ - **Direct Queries**: Run SQL on datasets using the `hf://` protocol
38
+ - **Schema Discovery**: Describe dataset structure and column types
39
+ - **Data Sampling**: Get random samples for exploration
40
+ - **Aggregations**: Count, histogram, unique values analysis
41
+ - **Transformations**: Filter, join, reshape data with SQL
42
+ - **Export & Push**: Save results locally or push to new Hub repos
43
+
44
+ ## 3. Multi-Format Dataset Support
45
+ Supports diverse dataset types through a template system:
46
+ - **Chat/Conversational**: Chat templating, multi-turn dialogues, tool usage examples
47
+ - **Text Classification**: Sentiment analysis, intent detection, topic classification
48
+ - **Question-Answering**: Reading comprehension, factual QA, knowledge bases
49
+ - **Text Completion**: Language modeling, code completion, creative writing
50
+ - **Tabular Data**: Structured data for regression/classification tasks
51
+ - **Custom Formats**: Flexible schema definition for specialized needs
52
+
53
+ ## 4. Quality Assurance Features
54
+ - **JSON Validation**: Ensures data integrity during uploads
55
+ - **Batch Processing**: Efficient handling of large datasets
56
+ - **Error Recovery**: Graceful handling of upload failures and conflicts
57
+
58
+ # Usage Instructions
59
+
60
+ The skill includes two Python scripts that use PEP 723 inline dependency management:
61
+
62
+ > **All paths are relative to the directory containing this SKILL.md
63
+ > file.**
64
+ > Scripts are run with: `uv run scripts/script_name.py [arguments]`
65
+
66
+ - `scripts/dataset_manager.py` - Dataset creation and management
67
+ - `scripts/sql_manager.py` - SQL-based dataset querying and transformation
68
+
69
+ ### Prerequisites
70
+ - `uv` package manager installed
71
+ - The `HF_TOKEN` environment variable must be set to a write-access token
72
+
73
+ ---
74
+
75
+ # SQL Dataset Querying (sql_manager.py)
76
+
77
+ Query, transform, and push Hugging Face datasets using DuckDB SQL. The `hf://` protocol provides direct access to any public dataset (or to a private dataset when a token is provided).
78
+
79
+ ## Quick Start
80
+
81
+ ```bash
82
+ # Query a dataset
83
+ uv run scripts/sql_manager.py query \
84
+ --dataset "cais/mmlu" \
85
+ --sql "SELECT * FROM data WHERE subject='nutrition' LIMIT 10"
86
+
87
+ # Get dataset schema
88
+ uv run scripts/sql_manager.py describe --dataset "cais/mmlu"
89
+
90
+ # Sample random rows
91
+ uv run scripts/sql_manager.py sample --dataset "cais/mmlu" --n 5
92
+
93
+ # Count rows with filter
94
+ uv run scripts/sql_manager.py count --dataset "cais/mmlu" --where "subject='nutrition'"
95
+ ```
96
+
97
+ ## SQL Query Syntax
98
+
99
+ Use `data` as the table name in your SQL; it is replaced with the actual `hf://` path:
100
+
101
+ ```sql
102
+ -- Basic select
103
+ SELECT * FROM data LIMIT 10
104
+
105
+ -- Filtering
106
+ SELECT * FROM data WHERE subject='nutrition'
107
+
108
+ -- Aggregations
109
+ SELECT subject, COUNT(*) as cnt FROM data GROUP BY subject ORDER BY cnt DESC
110
+
111
+ -- Column selection and transformation
112
+ SELECT question, choices[answer] AS correct_answer FROM data
113
+
114
+ -- Regex matching
115
+ SELECT * FROM data WHERE regexp_matches(question, 'nutrition|diet')
116
+
117
+ -- String functions
118
+ SELECT regexp_replace(question, '\n', '') AS cleaned FROM data
119
+ ```
120
+
121
+ ## Common Operations
122
+
123
+ ### 1. Explore Dataset Structure
124
+ ```bash
125
+ # Get schema
126
+ uv run scripts/sql_manager.py describe --dataset "cais/mmlu"
127
+
128
+ # Get unique values in column
129
+ uv run scripts/sql_manager.py unique --dataset "cais/mmlu" --column "subject"
130
+
131
+ # Get value distribution
132
+ uv run scripts/sql_manager.py histogram --dataset "cais/mmlu" --column "subject" --bins 20
133
+ ```
134
+
135
+ ### 2. Filter and Transform
136
+ ```bash
137
+ # Complex filtering with SQL
138
+ uv run scripts/sql_manager.py query \
139
+ --dataset "cais/mmlu" \
140
+ --sql "SELECT subject, COUNT(*) as cnt FROM data GROUP BY subject HAVING cnt > 100"
141
+
142
+ # Using transform command
143
+ uv run scripts/sql_manager.py transform \
144
+ --dataset "cais/mmlu" \
145
+ --select "subject, COUNT(*) as cnt" \
146
+ --group-by "subject" \
147
+ --order-by "cnt DESC" \
148
+ --limit 10
149
+ ```
150
+
151
+ ### 3. Create Subsets and Push to Hub
152
+ ```bash
153
+ # Query and push to new dataset
154
+ uv run scripts/sql_manager.py query \
155
+ --dataset "cais/mmlu" \
156
+ --sql "SELECT * FROM data WHERE subject='nutrition'" \
157
+ --push-to "username/mmlu-nutrition-subset" \
158
+ --private
159
+
160
+ # Transform and push
161
+ uv run scripts/sql_manager.py transform \
162
+ --dataset "ibm/duorc" \
163
+ --config "ParaphraseRC" \
164
+ --select "question, answers" \
165
+ --where "LENGTH(question) > 50" \
166
+ --push-to "username/duorc-long-questions"
167
+ ```
168
+
169
+ ### 4. Export to Local Files
170
+ ```bash
171
+ # Export to Parquet
172
+ uv run scripts/sql_manager.py export \
173
+ --dataset "cais/mmlu" \
174
+ --sql "SELECT * FROM data WHERE subject='nutrition'" \
175
+ --output "nutrition.parquet" \
176
+ --format parquet
177
+
178
+ # Export to JSONL
179
+ uv run scripts/sql_manager.py export \
180
+ --dataset "cais/mmlu" \
181
+ --sql "SELECT * FROM data LIMIT 100" \
182
+ --output "sample.jsonl" \
183
+ --format jsonl
184
+ ```
185
+
186
+ ### 5. Working with Dataset Configs/Splits
187
+ ```bash
188
+ # Specify config (subset)
189
+ uv run scripts/sql_manager.py query \
190
+ --dataset "ibm/duorc" \
191
+ --config "ParaphraseRC" \
192
+ --sql "SELECT * FROM data LIMIT 5"
193
+
194
+ # Specify split
195
+ uv run scripts/sql_manager.py query \
196
+ --dataset "cais/mmlu" \
197
+ --split "test" \
198
+ --sql "SELECT COUNT(*) FROM data"
199
+
200
+ # Query all splits
201
+ uv run scripts/sql_manager.py query \
202
+ --dataset "cais/mmlu" \
203
+ --split "*" \
204
+ --sql "SELECT * FROM data LIMIT 10"
205
+ ```
206
+
207
+ ### 6. Raw SQL with Full Paths
208
+ For complex queries or joining datasets:
209
+ ```bash
210
+ uv run scripts/sql_manager.py raw --sql "
211
+ SELECT a.*, b.*
212
+ FROM 'hf://datasets/dataset1@~parquet/default/train/*.parquet' a
213
+ JOIN 'hf://datasets/dataset2@~parquet/default/train/*.parquet' b
214
+ ON a.id = b.id
215
+ LIMIT 100
216
+ "
217
+ ```
218
+
219
+ ## Python API Usage
220
+
221
+ ```python
222
+ from sql_manager import HFDatasetSQL
223
+
224
+ sql = HFDatasetSQL()
225
+
226
+ # Query
227
+ results = sql.query("cais/mmlu", "SELECT * FROM data WHERE subject='nutrition' LIMIT 10")
228
+
229
+ # Get schema
230
+ schema = sql.describe("cais/mmlu")
231
+
232
+ # Sample
233
+ samples = sql.sample("cais/mmlu", n=5, seed=42)
234
+
235
+ # Count
236
+ count = sql.count("cais/mmlu", where="subject='nutrition'")
237
+
238
+ # Histogram
239
+ dist = sql.histogram("cais/mmlu", "subject")
240
+
241
+ # Filter and transform
242
+ results = sql.filter_and_transform(
243
+ "cais/mmlu",
244
+ select="subject, COUNT(*) as cnt",
245
+ group_by="subject",
246
+ order_by="cnt DESC",
247
+ limit=10
248
+ )
249
+
250
+ # Push to Hub
251
+ url = sql.push_to_hub(
252
+ "cais/mmlu",
253
+ "username/nutrition-subset",
254
+ sql="SELECT * FROM data WHERE subject='nutrition'",
255
+ private=True
256
+ )
257
+
258
+ # Export locally
259
+ sql.export_to_parquet("cais/mmlu", "output.parquet", sql="SELECT * FROM data LIMIT 100")
260
+
261
+ sql.close()
262
+ ```
263
+
264
+ ## HF Path Format
265
+
266
+ DuckDB uses the `hf://` protocol to access datasets:
267
+ ```
268
+ hf://datasets/{dataset_id}@{revision}/{config}/{split}/*.parquet
269
+ ```
270
+
271
+ Examples:
272
+ - `hf://datasets/cais/mmlu@~parquet/default/train/*.parquet`
273
+ - `hf://datasets/ibm/duorc@~parquet/ParaphraseRC/test/*.parquet`
274
+
275
+ The `@~parquet` revision provides auto-converted Parquet files for any dataset format.
276
+
277
+ ## Useful DuckDB SQL Functions
278
+
279
+ ```sql
280
+ -- String functions
281
+ LENGTH(column) -- String length
282
+ regexp_replace(col, '\n', '') -- Regex replace
283
+ regexp_matches(col, 'pattern') -- Regex match
284
+ LOWER(col), UPPER(col) -- Case conversion
285
+
286
+ -- Array functions
287
+ choices[1] -- Array indexing (1-based)
288
+ array_length(choices) -- Array length
289
+ unnest(choices) -- Expand array to rows
290
+
291
+ -- Aggregations
292
+ COUNT(*), SUM(col), AVG(col)
293
+ GROUP BY col HAVING condition
294
+
295
+ -- Sampling
296
+ USING SAMPLE 10 -- Random sample
297
+ USING SAMPLE 10 (RESERVOIR, 42) -- Reproducible sample
298
+
299
+ -- Window functions
300
+ ROW_NUMBER() OVER (PARTITION BY col ORDER BY col2)
301
+ ```
302
+
303
+ ---
304
+
305
+ # Dataset Creation (dataset_manager.py)
306
+
307
+ ### Recommended Workflow
308
+
309
+ **1. Discovery (Use HF MCP Server):**
310
+ ```python
311
+ # Use HF MCP tools to find existing datasets
312
+ search_datasets("conversational AI training")
313
+ get_dataset_details("username/dataset-name")
314
+ ```
315
+
316
+ **2. Creation (Use This Skill):**
317
+ ```bash
318
+ # Initialize new dataset
319
+ uv run scripts/dataset_manager.py init --repo_id "your-username/dataset-name" [--private]
320
+
321
+ # Configure with detailed system prompt
322
+ uv run scripts/dataset_manager.py config --repo_id "your-username/dataset-name" --system_prompt "$(cat system_prompt.txt)"
323
+ ```
324
+
325
+ **3. Content Management (Use This Skill):**
326
+ ```bash
327
+ # Quick setup with any template
328
+ uv run scripts/dataset_manager.py quick_setup \
329
+ --repo_id "your-username/dataset-name" \
330
+ --template classification
331
+
332
+ # Add data with template validation
333
+ uv run scripts/dataset_manager.py add_rows \
334
+ --repo_id "your-username/dataset-name" \
335
+ --template qa \
336
+ --rows_json "$(cat your_qa_data.json)"
337
+ ```
338
+
339
+ ### Template-Based Data Structures
340
+
341
+ **1. Chat Template (`--template chat`)**
342
+ ```json
343
+ {
344
+ "messages": [
345
+ {"role": "user", "content": "Natural user request"},
346
+ {"role": "assistant", "content": "Response with tool usage"},
347
+ {"role": "tool", "content": "Tool response", "tool_call_id": "call_123"}
348
+ ],
349
+ "scenario": "Description of use case",
350
+ "complexity": "simple|intermediate|advanced"
351
+ }
352
+ ```
353
+
354
+ **2. Classification Template (`--template classification`)**
355
+ ```json
356
+ {
357
+ "text": "Input text to be classified",
358
+ "label": "classification_label",
359
+ "confidence": 0.95,
360
+ "metadata": {"domain": "technology", "language": "en"}
361
+ }
362
+ ```
363
+
364
+ **3. QA Template (`--template qa`)**
365
+ ```json
366
+ {
367
+ "question": "What is the question being asked?",
368
+ "answer": "The complete answer",
369
+ "context": "Additional context if needed",
370
+ "answer_type": "factual|explanatory|opinion",
371
+ "difficulty": "easy|medium|hard"
372
+ }
373
+ ```
374
+
375
+ **4. Completion Template (`--template completion`)**
376
+ ```json
377
+ {
378
+ "prompt": "The beginning text or context",
379
+ "completion": "The expected continuation",
380
+ "domain": "code|creative|technical|conversational",
381
+ "style": "description of writing style"
382
+ }
383
+ ```
384
+
385
+ **5. Tabular Template (`--template tabular`)**
386
+ ```json
387
+ {
388
+ "columns": [
389
+ {"name": "feature1", "type": "numeric", "description": "First feature"},
390
+ {"name": "target", "type": "categorical", "description": "Target variable"}
391
+ ],
392
+ "data": [
393
+ {"feature1": 123, "target": "class_a"},
394
+ {"feature1": 456, "target": "class_b"}
395
+ ]
396
+ }
397
+ ```
398
+
399
+ ### Advanced System Prompt Template
400
+
401
+ For high-quality training data generation:
402
+ ```text
403
+ You are an AI assistant expert at using MCP tools effectively.
404
+
405
+ ## MCP SERVER DEFINITIONS
406
+ [Define available servers and tools]
407
+
408
+ ## TRAINING EXAMPLE STRUCTURE
409
+ [Specify exact JSON schema for chat templating]
410
+
411
+ ## QUALITY GUIDELINES
412
+ [Detail requirements for realistic scenarios, progressive complexity, proper tool usage]
413
+
414
+ ## EXAMPLE CATEGORIES
415
+ [List development workflows, debugging scenarios, data management tasks]
416
+ ```
417
+
418
+ ### Example Categories & Templates
419
+
420
+ The skill includes diverse training examples beyond just MCP usage:
421
+
422
+ **Available Example Sets:**
423
+ - `training_examples.json` - MCP tool usage examples (debugging, project setup, database analysis)
424
+ - `diverse_training_examples.json` - Broader scenarios including:
425
+ - **Educational Chat** - Explaining programming concepts, tutorials
426
+ - **Git Workflows** - Feature branches, version control guidance
427
+ - **Code Analysis** - Performance optimization, architecture review
428
+ - **Content Generation** - Professional writing, creative brainstorming
429
+ - **Codebase Navigation** - Legacy code exploration, systematic analysis
430
+ - **Conversational Support** - Problem-solving, technical discussions
431
+
432
+ **Using Different Example Sets:**
433
+ ```bash
434
+ # Add MCP-focused examples
435
+ uv run scripts/dataset_manager.py add_rows --repo_id "your-username/dataset-name" \
436
+ --rows_json "$(cat examples/training_examples.json)"
437
+
438
+ # Add diverse conversational examples
439
+ uv run scripts/dataset_manager.py add_rows --repo_id "your-username/dataset-name" \
440
+ --rows_json "$(cat examples/diverse_training_examples.json)"
441
+
442
+ # Mix both for comprehensive training data
443
+ uv run scripts/dataset_manager.py add_rows --repo_id "your-username/dataset-name" \
444
+ --rows_json "$(jq -s '.[0] + .[1]' examples/training_examples.json examples/diverse_training_examples.json)"
445
+ ```
446
+
447
+ ### Commands Reference
448
+
449
+ **List Available Templates:**
450
+ ```bash
451
+ uv run scripts/dataset_manager.py list_templates
452
+ ```
453
+
454
+ **Quick Setup (Recommended):**
455
+ ```bash
456
+ uv run scripts/dataset_manager.py quick_setup --repo_id "your-username/dataset-name" --template classification
457
+ ```
458
+
459
+ **Manual Setup:**
460
+ ```bash
461
+ # Initialize repository
462
+ uv run scripts/dataset_manager.py init --repo_id "your-username/dataset-name" [--private]
463
+
464
+ # Configure with system prompt
465
+ uv run scripts/dataset_manager.py config --repo_id "your-username/dataset-name" --system_prompt "Your prompt here"
466
+
467
+ # Add data with validation
468
+ uv run scripts/dataset_manager.py add_rows \
469
+ --repo_id "your-username/dataset-name" \
470
+ --template qa \
471
+ --rows_json '[{"question": "What is AI?", "answer": "Artificial Intelligence..."}]'
472
+ ```
473
+
474
+ **View Dataset Statistics:**
475
+ ```bash
476
+ uv run scripts/dataset_manager.py stats --repo_id "your-username/dataset-name"
477
+ ```
478
+
479
+ ### Error Handling
480
+ - **Repository exists**: Script will notify and continue with configuration
481
+ - **Invalid JSON**: Clear error message with parsing details
482
+ - **Network issues**: Automatic retry for transient failures
483
+ - **Token permissions**: Validation before operations begin
484
+
485
+ ---
486
+
487
+ # Combined Workflow Examples
488
+
489
+ ## Example 1: Create Training Subset from Existing Dataset
490
+ ```bash
491
+ # 1. Explore the source dataset
492
+ uv run scripts/sql_manager.py describe --dataset "cais/mmlu"
493
+ uv run scripts/sql_manager.py histogram --dataset "cais/mmlu" --column "subject"
494
+
495
+ # 2. Query and create subset
496
+ uv run scripts/sql_manager.py query \
497
+ --dataset "cais/mmlu" \
498
+ --sql "SELECT * FROM data WHERE subject IN ('nutrition', 'anatomy', 'clinical_knowledge')" \
499
+ --push-to "username/mmlu-medical-subset" \
500
+ --private
501
+ ```
502
+
503
+ ## Example 2: Transform and Reshape Data
504
+ ```bash
505
+ # Transform MMLU to QA format with correct answers extracted
506
+ uv run scripts/sql_manager.py query \
507
+ --dataset "cais/mmlu" \
508
+ --sql "SELECT question, choices[answer + 1] as correct_answer, subject FROM data" \
509
+ --push-to "username/mmlu-qa-format"
510
+ ```
511
+
512
+ ## Example 3: Merge Multiple Dataset Splits
513
+ ```bash
514
+ # Export all splits (--split "*") combined into a single file
515
+ uv run scripts/sql_manager.py export \
516
+ --dataset "cais/mmlu" \
517
+ --split "*" \
518
+ --output "mmlu_all.parquet"
519
+ ```
520
+
521
+ ## Example 4: Quality Filtering
522
+ ```bash
523
+ # Filter for high-quality examples
524
+ uv run scripts/sql_manager.py query \
525
+ --dataset "squad" \
526
+ --sql "SELECT * FROM data WHERE LENGTH(context) > 500 AND LENGTH(question) > 20" \
527
+ --push-to "username/squad-filtered"
528
+ ```
529
+
530
+ ## Example 5: Create Custom Training Dataset
531
+ ```bash
532
+ # 1. Query source data
533
+ uv run scripts/sql_manager.py export \
534
+ --dataset "cais/mmlu" \
535
+ --sql "SELECT question, subject FROM data WHERE subject='nutrition'" \
536
+ --output "nutrition_source.jsonl" \
537
+ --format jsonl
538
+
539
+ # 2. Process with your pipeline (add answers, format, etc.)
540
+
541
+ # 3. Push processed data
542
+ uv run scripts/dataset_manager.py init --repo_id "username/nutrition-training"
543
+ uv run scripts/dataset_manager.py add_rows \
544
+ --repo_id "username/nutrition-training" \
545
+ --template qa \
546
+ --rows_json "$(cat processed_data.json)"
547
+ ```