@synsci/cli-darwin-x64 1.1.49

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (373)
  1. package/bin/skills/accelerate/SKILL.md +332 -0
  2. package/bin/skills/accelerate/references/custom-plugins.md +453 -0
  3. package/bin/skills/accelerate/references/megatron-integration.md +489 -0
  4. package/bin/skills/accelerate/references/performance.md +525 -0
  5. package/bin/skills/audiocraft/SKILL.md +564 -0
  6. package/bin/skills/audiocraft/references/advanced-usage.md +666 -0
  7. package/bin/skills/audiocraft/references/troubleshooting.md +504 -0
  8. package/bin/skills/autogpt/SKILL.md +403 -0
  9. package/bin/skills/autogpt/references/advanced-usage.md +535 -0
  10. package/bin/skills/autogpt/references/troubleshooting.md +420 -0
  11. package/bin/skills/awq/SKILL.md +310 -0
  12. package/bin/skills/awq/references/advanced-usage.md +324 -0
  13. package/bin/skills/awq/references/troubleshooting.md +344 -0
  14. package/bin/skills/axolotl/SKILL.md +158 -0
  15. package/bin/skills/axolotl/references/api.md +5548 -0
  16. package/bin/skills/axolotl/references/dataset-formats.md +1029 -0
  17. package/bin/skills/axolotl/references/index.md +15 -0
  18. package/bin/skills/axolotl/references/other.md +3563 -0
  19. package/bin/skills/bigcode-evaluation-harness/SKILL.md +405 -0
  20. package/bin/skills/bigcode-evaluation-harness/references/benchmarks.md +393 -0
  21. package/bin/skills/bigcode-evaluation-harness/references/custom-tasks.md +424 -0
  22. package/bin/skills/bigcode-evaluation-harness/references/issues.md +394 -0
  23. package/bin/skills/bitsandbytes/SKILL.md +411 -0
  24. package/bin/skills/bitsandbytes/references/memory-optimization.md +521 -0
  25. package/bin/skills/bitsandbytes/references/qlora-training.md +521 -0
  26. package/bin/skills/bitsandbytes/references/quantization-formats.md +447 -0
  27. package/bin/skills/blip-2/SKILL.md +564 -0
  28. package/bin/skills/blip-2/references/advanced-usage.md +680 -0
  29. package/bin/skills/blip-2/references/troubleshooting.md +526 -0
  30. package/bin/skills/chroma/SKILL.md +406 -0
  31. package/bin/skills/chroma/references/integration.md +38 -0
  32. package/bin/skills/clip/SKILL.md +253 -0
  33. package/bin/skills/clip/references/applications.md +207 -0
  34. package/bin/skills/constitutional-ai/SKILL.md +290 -0
  35. package/bin/skills/crewai/SKILL.md +498 -0
  36. package/bin/skills/crewai/references/flows.md +438 -0
  37. package/bin/skills/crewai/references/tools.md +429 -0
  38. package/bin/skills/crewai/references/troubleshooting.md +480 -0
  39. package/bin/skills/deepspeed/SKILL.md +141 -0
  40. package/bin/skills/deepspeed/references/08.md +17 -0
  41. package/bin/skills/deepspeed/references/09.md +173 -0
  42. package/bin/skills/deepspeed/references/2020.md +378 -0
  43. package/bin/skills/deepspeed/references/2023.md +279 -0
  44. package/bin/skills/deepspeed/references/assets.md +179 -0
  45. package/bin/skills/deepspeed/references/index.md +35 -0
  46. package/bin/skills/deepspeed/references/mii.md +118 -0
  47. package/bin/skills/deepspeed/references/other.md +1191 -0
  48. package/bin/skills/deepspeed/references/tutorials.md +6554 -0
  49. package/bin/skills/dspy/SKILL.md +590 -0
  50. package/bin/skills/dspy/references/examples.md +663 -0
  51. package/bin/skills/dspy/references/modules.md +475 -0
  52. package/bin/skills/dspy/references/optimizers.md +566 -0
  53. package/bin/skills/faiss/SKILL.md +221 -0
  54. package/bin/skills/faiss/references/index_types.md +280 -0
  55. package/bin/skills/flash-attention/SKILL.md +367 -0
  56. package/bin/skills/flash-attention/references/benchmarks.md +215 -0
  57. package/bin/skills/flash-attention/references/transformers-integration.md +293 -0
  58. package/bin/skills/gguf/SKILL.md +427 -0
  59. package/bin/skills/gguf/references/advanced-usage.md +504 -0
  60. package/bin/skills/gguf/references/troubleshooting.md +442 -0
  61. package/bin/skills/gptq/SKILL.md +450 -0
  62. package/bin/skills/gptq/references/calibration.md +337 -0
  63. package/bin/skills/gptq/references/integration.md +129 -0
  64. package/bin/skills/gptq/references/troubleshooting.md +95 -0
  65. package/bin/skills/grpo-rl-training/README.md +97 -0
  66. package/bin/skills/grpo-rl-training/SKILL.md +572 -0
  67. package/bin/skills/grpo-rl-training/examples/reward_functions_library.py +393 -0
  68. package/bin/skills/grpo-rl-training/templates/basic_grpo_training.py +228 -0
  69. package/bin/skills/guidance/SKILL.md +572 -0
  70. package/bin/skills/guidance/references/backends.md +554 -0
  71. package/bin/skills/guidance/references/constraints.md +674 -0
  72. package/bin/skills/guidance/references/examples.md +767 -0
  73. package/bin/skills/hqq/SKILL.md +445 -0
  74. package/bin/skills/hqq/references/advanced-usage.md +528 -0
  75. package/bin/skills/hqq/references/troubleshooting.md +503 -0
  76. package/bin/skills/hugging-face-cli/SKILL.md +191 -0
  77. package/bin/skills/hugging-face-cli/references/commands.md +954 -0
  78. package/bin/skills/hugging-face-cli/references/examples.md +374 -0
  79. package/bin/skills/hugging-face-datasets/SKILL.md +547 -0
  80. package/bin/skills/hugging-face-datasets/examples/diverse_training_examples.json +239 -0
  81. package/bin/skills/hugging-face-datasets/examples/system_prompt_template.txt +196 -0
  82. package/bin/skills/hugging-face-datasets/examples/training_examples.json +176 -0
  83. package/bin/skills/hugging-face-datasets/scripts/dataset_manager.py +522 -0
  84. package/bin/skills/hugging-face-datasets/scripts/sql_manager.py +844 -0
  85. package/bin/skills/hugging-face-datasets/templates/chat.json +55 -0
  86. package/bin/skills/hugging-face-datasets/templates/classification.json +62 -0
  87. package/bin/skills/hugging-face-datasets/templates/completion.json +51 -0
  88. package/bin/skills/hugging-face-datasets/templates/custom.json +75 -0
  89. package/bin/skills/hugging-face-datasets/templates/qa.json +54 -0
  90. package/bin/skills/hugging-face-datasets/templates/tabular.json +81 -0
  91. package/bin/skills/hugging-face-evaluation/SKILL.md +656 -0
  92. package/bin/skills/hugging-face-evaluation/examples/USAGE_EXAMPLES.md +382 -0
  93. package/bin/skills/hugging-face-evaluation/examples/artificial_analysis_to_hub.py +141 -0
  94. package/bin/skills/hugging-face-evaluation/examples/example_readme_tables.md +135 -0
  95. package/bin/skills/hugging-face-evaluation/examples/metric_mapping.json +50 -0
  96. package/bin/skills/hugging-face-evaluation/requirements.txt +20 -0
  97. package/bin/skills/hugging-face-evaluation/scripts/evaluation_manager.py +1374 -0
  98. package/bin/skills/hugging-face-evaluation/scripts/inspect_eval_uv.py +104 -0
  99. package/bin/skills/hugging-face-evaluation/scripts/inspect_vllm_uv.py +317 -0
  100. package/bin/skills/hugging-face-evaluation/scripts/lighteval_vllm_uv.py +303 -0
  101. package/bin/skills/hugging-face-evaluation/scripts/run_eval_job.py +98 -0
  102. package/bin/skills/hugging-face-evaluation/scripts/run_vllm_eval_job.py +331 -0
  103. package/bin/skills/hugging-face-evaluation/scripts/test_extraction.py +206 -0
  104. package/bin/skills/hugging-face-jobs/SKILL.md +1041 -0
  105. package/bin/skills/hugging-face-jobs/index.html +216 -0
  106. package/bin/skills/hugging-face-jobs/references/hardware_guide.md +336 -0
  107. package/bin/skills/hugging-face-jobs/references/hub_saving.md +352 -0
  108. package/bin/skills/hugging-face-jobs/references/token_usage.md +546 -0
  109. package/bin/skills/hugging-face-jobs/references/troubleshooting.md +475 -0
  110. package/bin/skills/hugging-face-jobs/scripts/cot-self-instruct.py +718 -0
  111. package/bin/skills/hugging-face-jobs/scripts/finepdfs-stats.py +546 -0
  112. package/bin/skills/hugging-face-jobs/scripts/generate-responses.py +587 -0
  113. package/bin/skills/hugging-face-model-trainer/SKILL.md +711 -0
  114. package/bin/skills/hugging-face-model-trainer/references/gguf_conversion.md +296 -0
  115. package/bin/skills/hugging-face-model-trainer/references/hardware_guide.md +283 -0
  116. package/bin/skills/hugging-face-model-trainer/references/hub_saving.md +364 -0
  117. package/bin/skills/hugging-face-model-trainer/references/reliability_principles.md +371 -0
  118. package/bin/skills/hugging-face-model-trainer/references/trackio_guide.md +189 -0
  119. package/bin/skills/hugging-face-model-trainer/references/training_methods.md +150 -0
  120. package/bin/skills/hugging-face-model-trainer/references/training_patterns.md +203 -0
  121. package/bin/skills/hugging-face-model-trainer/references/troubleshooting.md +282 -0
  122. package/bin/skills/hugging-face-model-trainer/scripts/convert_to_gguf.py +424 -0
  123. package/bin/skills/hugging-face-model-trainer/scripts/dataset_inspector.py +417 -0
  124. package/bin/skills/hugging-face-model-trainer/scripts/estimate_cost.py +150 -0
  125. package/bin/skills/hugging-face-model-trainer/scripts/train_dpo_example.py +106 -0
  126. package/bin/skills/hugging-face-model-trainer/scripts/train_grpo_example.py +89 -0
  127. package/bin/skills/hugging-face-model-trainer/scripts/train_sft_example.py +122 -0
  128. package/bin/skills/hugging-face-paper-publisher/SKILL.md +627 -0
  129. package/bin/skills/hugging-face-paper-publisher/examples/example_usage.md +327 -0
  130. package/bin/skills/hugging-face-paper-publisher/references/quick_reference.md +216 -0
  131. package/bin/skills/hugging-face-paper-publisher/scripts/paper_manager.py +508 -0
  132. package/bin/skills/hugging-face-paper-publisher/templates/arxiv.md +299 -0
  133. package/bin/skills/hugging-face-paper-publisher/templates/ml-report.md +358 -0
  134. package/bin/skills/hugging-face-paper-publisher/templates/modern.md +319 -0
  135. package/bin/skills/hugging-face-paper-publisher/templates/standard.md +201 -0
  136. package/bin/skills/hugging-face-tool-builder/SKILL.md +115 -0
  137. package/bin/skills/hugging-face-tool-builder/references/baseline_hf_api.py +57 -0
  138. package/bin/skills/hugging-face-tool-builder/references/baseline_hf_api.sh +40 -0
  139. package/bin/skills/hugging-face-tool-builder/references/baseline_hf_api.tsx +57 -0
  140. package/bin/skills/hugging-face-tool-builder/references/find_models_by_paper.sh +230 -0
  141. package/bin/skills/hugging-face-tool-builder/references/hf_enrich_models.sh +96 -0
  142. package/bin/skills/hugging-face-tool-builder/references/hf_model_card_frontmatter.sh +188 -0
  143. package/bin/skills/hugging-face-tool-builder/references/hf_model_papers_auth.sh +171 -0
  144. package/bin/skills/hugging-face-trackio/SKILL.md +65 -0
  145. package/bin/skills/hugging-face-trackio/references/logging_metrics.md +206 -0
  146. package/bin/skills/hugging-face-trackio/references/retrieving_metrics.md +223 -0
  147. package/bin/skills/huggingface-tokenizers/SKILL.md +516 -0
  148. package/bin/skills/huggingface-tokenizers/references/algorithms.md +653 -0
  149. package/bin/skills/huggingface-tokenizers/references/integration.md +637 -0
  150. package/bin/skills/huggingface-tokenizers/references/pipeline.md +723 -0
  151. package/bin/skills/huggingface-tokenizers/references/training.md +565 -0
  152. package/bin/skills/instructor/SKILL.md +740 -0
  153. package/bin/skills/instructor/references/examples.md +107 -0
  154. package/bin/skills/instructor/references/providers.md +70 -0
  155. package/bin/skills/instructor/references/validation.md +606 -0
  156. package/bin/skills/knowledge-distillation/SKILL.md +458 -0
  157. package/bin/skills/knowledge-distillation/references/minillm.md +334 -0
  158. package/bin/skills/lambda-labs/SKILL.md +545 -0
  159. package/bin/skills/lambda-labs/references/advanced-usage.md +611 -0
  160. package/bin/skills/lambda-labs/references/troubleshooting.md +530 -0
  161. package/bin/skills/langchain/SKILL.md +480 -0
  162. package/bin/skills/langchain/references/agents.md +499 -0
  163. package/bin/skills/langchain/references/integration.md +562 -0
  164. package/bin/skills/langchain/references/rag.md +600 -0
  165. package/bin/skills/langsmith/SKILL.md +422 -0
  166. package/bin/skills/langsmith/references/advanced-usage.md +548 -0
  167. package/bin/skills/langsmith/references/troubleshooting.md +537 -0
  168. package/bin/skills/litgpt/SKILL.md +469 -0
  169. package/bin/skills/litgpt/references/custom-models.md +568 -0
  170. package/bin/skills/litgpt/references/distributed-training.md +451 -0
  171. package/bin/skills/litgpt/references/supported-models.md +336 -0
  172. package/bin/skills/litgpt/references/training-recipes.md +619 -0
  173. package/bin/skills/llama-cpp/SKILL.md +258 -0
  174. package/bin/skills/llama-cpp/references/optimization.md +89 -0
  175. package/bin/skills/llama-cpp/references/quantization.md +213 -0
  176. package/bin/skills/llama-cpp/references/server.md +125 -0
  177. package/bin/skills/llama-factory/SKILL.md +80 -0
  178. package/bin/skills/llama-factory/references/_images.md +23 -0
  179. package/bin/skills/llama-factory/references/advanced.md +1055 -0
  180. package/bin/skills/llama-factory/references/getting_started.md +349 -0
  181. package/bin/skills/llama-factory/references/index.md +19 -0
  182. package/bin/skills/llama-factory/references/other.md +31 -0
  183. package/bin/skills/llamaguard/SKILL.md +337 -0
  184. package/bin/skills/llamaindex/SKILL.md +569 -0
  185. package/bin/skills/llamaindex/references/agents.md +83 -0
  186. package/bin/skills/llamaindex/references/data_connectors.md +108 -0
  187. package/bin/skills/llamaindex/references/query_engines.md +406 -0
  188. package/bin/skills/llava/SKILL.md +304 -0
  189. package/bin/skills/llava/references/training.md +197 -0
  190. package/bin/skills/lm-evaluation-harness/SKILL.md +490 -0
  191. package/bin/skills/lm-evaluation-harness/references/api-evaluation.md +490 -0
  192. package/bin/skills/lm-evaluation-harness/references/benchmark-guide.md +488 -0
  193. package/bin/skills/lm-evaluation-harness/references/custom-tasks.md +602 -0
  194. package/bin/skills/lm-evaluation-harness/references/distributed-eval.md +519 -0
  195. package/bin/skills/long-context/SKILL.md +536 -0
  196. package/bin/skills/long-context/references/extension_methods.md +468 -0
  197. package/bin/skills/long-context/references/fine_tuning.md +611 -0
  198. package/bin/skills/long-context/references/rope.md +402 -0
  199. package/bin/skills/mamba/SKILL.md +260 -0
  200. package/bin/skills/mamba/references/architecture-details.md +206 -0
  201. package/bin/skills/mamba/references/benchmarks.md +255 -0
  202. package/bin/skills/mamba/references/training-guide.md +388 -0
  203. package/bin/skills/megatron-core/SKILL.md +366 -0
  204. package/bin/skills/megatron-core/references/benchmarks.md +249 -0
  205. package/bin/skills/megatron-core/references/parallelism-guide.md +404 -0
  206. package/bin/skills/megatron-core/references/production-examples.md +473 -0
  207. package/bin/skills/megatron-core/references/training-recipes.md +547 -0
  208. package/bin/skills/miles/SKILL.md +315 -0
  209. package/bin/skills/miles/references/api-reference.md +141 -0
  210. package/bin/skills/miles/references/troubleshooting.md +352 -0
  211. package/bin/skills/mlflow/SKILL.md +704 -0
  212. package/bin/skills/mlflow/references/deployment.md +744 -0
  213. package/bin/skills/mlflow/references/model-registry.md +770 -0
  214. package/bin/skills/mlflow/references/tracking.md +680 -0
  215. package/bin/skills/modal/SKILL.md +341 -0
  216. package/bin/skills/modal/references/advanced-usage.md +503 -0
  217. package/bin/skills/modal/references/troubleshooting.md +494 -0
  218. package/bin/skills/model-merging/SKILL.md +539 -0
  219. package/bin/skills/model-merging/references/evaluation.md +462 -0
  220. package/bin/skills/model-merging/references/examples.md +428 -0
  221. package/bin/skills/model-merging/references/methods.md +352 -0
  222. package/bin/skills/model-pruning/SKILL.md +495 -0
  223. package/bin/skills/model-pruning/references/wanda.md +347 -0
  224. package/bin/skills/moe-training/SKILL.md +526 -0
  225. package/bin/skills/moe-training/references/architectures.md +432 -0
  226. package/bin/skills/moe-training/references/inference.md +348 -0
  227. package/bin/skills/moe-training/references/training.md +425 -0
  228. package/bin/skills/nanogpt/SKILL.md +290 -0
  229. package/bin/skills/nanogpt/references/architecture.md +382 -0
  230. package/bin/skills/nanogpt/references/data.md +476 -0
  231. package/bin/skills/nanogpt/references/training.md +564 -0
  232. package/bin/skills/nemo-curator/SKILL.md +383 -0
  233. package/bin/skills/nemo-curator/references/deduplication.md +87 -0
  234. package/bin/skills/nemo-curator/references/filtering.md +102 -0
  235. package/bin/skills/nemo-evaluator/SKILL.md +494 -0
  236. package/bin/skills/nemo-evaluator/references/adapter-system.md +340 -0
  237. package/bin/skills/nemo-evaluator/references/configuration.md +447 -0
  238. package/bin/skills/nemo-evaluator/references/custom-benchmarks.md +315 -0
  239. package/bin/skills/nemo-evaluator/references/execution-backends.md +361 -0
  240. package/bin/skills/nemo-guardrails/SKILL.md +297 -0
  241. package/bin/skills/nnsight/SKILL.md +436 -0
  242. package/bin/skills/nnsight/references/README.md +78 -0
  243. package/bin/skills/nnsight/references/api.md +344 -0
  244. package/bin/skills/nnsight/references/tutorials.md +300 -0
  245. package/bin/skills/openrlhf/SKILL.md +249 -0
  246. package/bin/skills/openrlhf/references/algorithm-comparison.md +404 -0
  247. package/bin/skills/openrlhf/references/custom-rewards.md +530 -0
  248. package/bin/skills/openrlhf/references/hybrid-engine.md +287 -0
  249. package/bin/skills/openrlhf/references/multi-node-training.md +454 -0
  250. package/bin/skills/outlines/SKILL.md +652 -0
  251. package/bin/skills/outlines/references/backends.md +615 -0
  252. package/bin/skills/outlines/references/examples.md +773 -0
  253. package/bin/skills/outlines/references/json_generation.md +652 -0
  254. package/bin/skills/peft/SKILL.md +431 -0
  255. package/bin/skills/peft/references/advanced-usage.md +514 -0
  256. package/bin/skills/peft/references/troubleshooting.md +480 -0
  257. package/bin/skills/phoenix/SKILL.md +475 -0
  258. package/bin/skills/phoenix/references/advanced-usage.md +619 -0
  259. package/bin/skills/phoenix/references/troubleshooting.md +538 -0
  260. package/bin/skills/pinecone/SKILL.md +358 -0
  261. package/bin/skills/pinecone/references/deployment.md +181 -0
  262. package/bin/skills/pytorch-fsdp/SKILL.md +126 -0
  263. package/bin/skills/pytorch-fsdp/references/index.md +7 -0
  264. package/bin/skills/pytorch-fsdp/references/other.md +4249 -0
  265. package/bin/skills/pytorch-lightning/SKILL.md +346 -0
  266. package/bin/skills/pytorch-lightning/references/callbacks.md +436 -0
  267. package/bin/skills/pytorch-lightning/references/distributed.md +490 -0
  268. package/bin/skills/pytorch-lightning/references/hyperparameter-tuning.md +556 -0
  269. package/bin/skills/pyvene/SKILL.md +473 -0
  270. package/bin/skills/pyvene/references/README.md +73 -0
  271. package/bin/skills/pyvene/references/api.md +383 -0
  272. package/bin/skills/pyvene/references/tutorials.md +376 -0
  273. package/bin/skills/qdrant/SKILL.md +493 -0
  274. package/bin/skills/qdrant/references/advanced-usage.md +648 -0
  275. package/bin/skills/qdrant/references/troubleshooting.md +631 -0
  276. package/bin/skills/ray-data/SKILL.md +326 -0
  277. package/bin/skills/ray-data/references/integration.md +82 -0
  278. package/bin/skills/ray-data/references/transformations.md +83 -0
  279. package/bin/skills/ray-train/SKILL.md +406 -0
  280. package/bin/skills/ray-train/references/multi-node.md +628 -0
  281. package/bin/skills/rwkv/SKILL.md +260 -0
  282. package/bin/skills/rwkv/references/architecture-details.md +344 -0
  283. package/bin/skills/rwkv/references/rwkv7.md +386 -0
  284. package/bin/skills/rwkv/references/state-management.md +369 -0
  285. package/bin/skills/saelens/SKILL.md +386 -0
  286. package/bin/skills/saelens/references/README.md +70 -0
  287. package/bin/skills/saelens/references/api.md +333 -0
  288. package/bin/skills/saelens/references/tutorials.md +318 -0
  289. package/bin/skills/segment-anything/SKILL.md +500 -0
  290. package/bin/skills/segment-anything/references/advanced-usage.md +589 -0
  291. package/bin/skills/segment-anything/references/troubleshooting.md +484 -0
  292. package/bin/skills/sentence-transformers/SKILL.md +255 -0
  293. package/bin/skills/sentence-transformers/references/models.md +123 -0
  294. package/bin/skills/sentencepiece/SKILL.md +235 -0
  295. package/bin/skills/sentencepiece/references/algorithms.md +200 -0
  296. package/bin/skills/sentencepiece/references/training.md +304 -0
  297. package/bin/skills/sglang/SKILL.md +442 -0
  298. package/bin/skills/sglang/references/deployment.md +490 -0
  299. package/bin/skills/sglang/references/radix-attention.md +413 -0
  300. package/bin/skills/sglang/references/structured-generation.md +541 -0
  301. package/bin/skills/simpo/SKILL.md +219 -0
  302. package/bin/skills/simpo/references/datasets.md +478 -0
  303. package/bin/skills/simpo/references/hyperparameters.md +452 -0
  304. package/bin/skills/simpo/references/loss-functions.md +350 -0
  305. package/bin/skills/skypilot/SKILL.md +509 -0
  306. package/bin/skills/skypilot/references/advanced-usage.md +491 -0
  307. package/bin/skills/skypilot/references/troubleshooting.md +570 -0
  308. package/bin/skills/slime/SKILL.md +464 -0
  309. package/bin/skills/slime/references/api-reference.md +392 -0
  310. package/bin/skills/slime/references/troubleshooting.md +386 -0
  311. package/bin/skills/speculative-decoding/SKILL.md +467 -0
  312. package/bin/skills/speculative-decoding/references/lookahead.md +309 -0
  313. package/bin/skills/speculative-decoding/references/medusa.md +350 -0
  314. package/bin/skills/stable-diffusion/SKILL.md +519 -0
  315. package/bin/skills/stable-diffusion/references/advanced-usage.md +716 -0
  316. package/bin/skills/stable-diffusion/references/troubleshooting.md +555 -0
  317. package/bin/skills/tensorboard/SKILL.md +629 -0
  318. package/bin/skills/tensorboard/references/integrations.md +638 -0
  319. package/bin/skills/tensorboard/references/profiling.md +545 -0
  320. package/bin/skills/tensorboard/references/visualization.md +620 -0
  321. package/bin/skills/tensorrt-llm/SKILL.md +187 -0
  322. package/bin/skills/tensorrt-llm/references/multi-gpu.md +298 -0
  323. package/bin/skills/tensorrt-llm/references/optimization.md +242 -0
  324. package/bin/skills/tensorrt-llm/references/serving.md +470 -0
  325. package/bin/skills/tinker/SKILL.md +362 -0
  326. package/bin/skills/tinker/references/api-reference.md +168 -0
  327. package/bin/skills/tinker/references/getting-started.md +157 -0
  328. package/bin/skills/tinker/references/loss-functions.md +163 -0
  329. package/bin/skills/tinker/references/models-and-lora.md +139 -0
  330. package/bin/skills/tinker/references/recipes.md +280 -0
  331. package/bin/skills/tinker/references/reinforcement-learning.md +212 -0
  332. package/bin/skills/tinker/references/rendering.md +243 -0
  333. package/bin/skills/tinker/references/supervised-learning.md +232 -0
  334. package/bin/skills/tinker-training-cost/SKILL.md +187 -0
  335. package/bin/skills/tinker-training-cost/scripts/calculate_cost.py +123 -0
  336. package/bin/skills/torchforge/SKILL.md +433 -0
  337. package/bin/skills/torchforge/references/api-reference.md +327 -0
  338. package/bin/skills/torchforge/references/troubleshooting.md +409 -0
  339. package/bin/skills/torchtitan/SKILL.md +358 -0
  340. package/bin/skills/torchtitan/references/checkpoint.md +181 -0
  341. package/bin/skills/torchtitan/references/custom-models.md +258 -0
  342. package/bin/skills/torchtitan/references/float8.md +133 -0
  343. package/bin/skills/torchtitan/references/fsdp.md +126 -0
  344. package/bin/skills/transformer-lens/SKILL.md +346 -0
  345. package/bin/skills/transformer-lens/references/README.md +54 -0
  346. package/bin/skills/transformer-lens/references/api.md +362 -0
  347. package/bin/skills/transformer-lens/references/tutorials.md +339 -0
  348. package/bin/skills/trl-fine-tuning/SKILL.md +455 -0
  349. package/bin/skills/trl-fine-tuning/references/dpo-variants.md +227 -0
  350. package/bin/skills/trl-fine-tuning/references/online-rl.md +82 -0
  351. package/bin/skills/trl-fine-tuning/references/reward-modeling.md +122 -0
  352. package/bin/skills/trl-fine-tuning/references/sft-training.md +168 -0
  353. package/bin/skills/unsloth/SKILL.md +80 -0
  354. package/bin/skills/unsloth/references/index.md +7 -0
  355. package/bin/skills/unsloth/references/llms-full.md +16799 -0
  356. package/bin/skills/unsloth/references/llms-txt.md +12044 -0
  357. package/bin/skills/unsloth/references/llms.md +82 -0
  358. package/bin/skills/verl/SKILL.md +391 -0
  359. package/bin/skills/verl/references/api-reference.md +301 -0
  360. package/bin/skills/verl/references/troubleshooting.md +391 -0
  361. package/bin/skills/vllm/SKILL.md +364 -0
  362. package/bin/skills/vllm/references/optimization.md +226 -0
  363. package/bin/skills/vllm/references/quantization.md +284 -0
  364. package/bin/skills/vllm/references/server-deployment.md +255 -0
  365. package/bin/skills/vllm/references/troubleshooting.md +447 -0
  366. package/bin/skills/weights-and-biases/SKILL.md +590 -0
  367. package/bin/skills/weights-and-biases/references/artifacts.md +584 -0
  368. package/bin/skills/weights-and-biases/references/integrations.md +700 -0
  369. package/bin/skills/weights-and-biases/references/sweeps.md +847 -0
  370. package/bin/skills/whisper/SKILL.md +317 -0
  371. package/bin/skills/whisper/references/languages.md +189 -0
  372. package/bin/synsc +0 -0
  373. package/package.json +10 -0
@@ -0,0 +1,844 @@
1
+ #!/usr/bin/env -S uv run
2
+ # /// script
3
+ # requires-python = ">=3.10"
4
+ # dependencies = [
5
+ # "duckdb>=1.0.0",
6
+ # "huggingface_hub>=0.20.0",
7
+ # "datasets>=2.14.0",
8
+ # "pandas>=2.0.0",
9
+ # ]
10
+ # ///
11
+ """
12
+ Hugging Face Dataset SQL Manager
13
+
14
+ Query, transform, and push Hugging Face datasets using DuckDB's SQL interface.
15
+ Supports the hf:// protocol for direct dataset access, data wrangling, and
16
+ pushing results back to the Hub.
17
+
18
+ Version: 1.0.0
19
+
20
+ Usage:
21
+ # Query a dataset
22
+ uv run sql_manager.py query --dataset "cais/mmlu" --sql "SELECT * FROM data LIMIT 10"
23
+
24
+ # Query and push to new dataset
25
+ uv run sql_manager.py query --dataset "cais/mmlu" --sql "SELECT * FROM data WHERE subject='nutrition'" \
26
+ --push-to "username/nutrition-subset"
27
+
28
+ # Describe dataset schema
29
+ uv run sql_manager.py describe --dataset "cais/mmlu"
30
+
31
+ # List available splits/configs
32
+ uv run sql_manager.py info --dataset "cais/mmlu"
33
+
34
+ # Get random sample
35
+ uv run sql_manager.py sample --dataset "cais/mmlu" --n 5
36
+
37
+ # Export to parquet
38
+ uv run sql_manager.py export --dataset "cais/mmlu" --output "data.parquet"
39
+ """
40
+
41
+ import os
42
+ import json
43
+ import argparse
44
+ from typing import Optional, List, Dict, Any, Union
45
+
46
+ import duckdb
47
+ from huggingface_hub import HfApi
48
+
49
+
50
+ # Configuration
51
+ HF_TOKEN = os.environ.get("HF_TOKEN")
52
+
53
+
54
+ class HFDatasetSQL:
55
+ """
56
+ Query Hugging Face datasets using DuckDB SQL.
57
+
58
+ Examples:
59
+ >>> sql = HFDatasetSQL()
60
+ >>> results = sql.query("cais/mmlu", "SELECT * FROM data LIMIT 5")
61
+ >>> schema = sql.describe("cais/mmlu")
62
+ >>> sql.query_and_push("cais/mmlu", "SELECT * FROM data WHERE subject='nutrition'", "user/nutrition-qa")
63
+ """
64
+
65
def __init__(self, token: Optional[str] = None):
    """Create a SQL manager backed by an in-memory DuckDB connection.

    Args:
        token: Hugging Face API token. When omitted (or falsy), falls
            back to the ``HF_TOKEN`` environment variable read at import
            time.
    """
    # Preserve "or" semantics: an empty-string token also falls back.
    self.token = HF_TOKEN if not token else token
    self.conn = duckdb.connect()
    self._setup_connection()
70
+
71
def _setup_connection(self):
    """Configure the DuckDB connection for hf:// dataset access.

    If a token is available, register it as a DuckDB HUGGINGFACE secret
    so private datasets can be read.

    Single quotes in the token are doubled (standard SQL escaping)
    before interpolation; previously a token containing ``'`` would
    terminate the string literal early and break — or inject into —
    the CREATE SECRET statement.
    """
    if self.token:
        escaped = self.token.replace("'", "''")
        self.conn.execute(
            f"CREATE SECRET hf_token (TYPE HUGGINGFACE, TOKEN '{escaped}');"
        )
76
+
77
+ def _build_hf_path(
78
+ self, dataset_id: str, split: str = "*", config: Optional[str] = None, revision: str = "~parquet"
79
+ ) -> str:
80
+ """
81
+ Build the hf:// path for a dataset.
82
+
83
+ Args:
84
+ dataset_id: Dataset ID (e.g., "cais/mmlu")
85
+ split: Split name or "*" for all splits
86
+ config: Optional config/subset name
87
+ revision: Revision, defaults to ~parquet for auto-converted parquet
88
+
89
+ Returns:
90
+ hf:// path string
91
+ """
92
+ if config:
93
+ return f"hf://datasets/{dataset_id}@{revision}/{config}/{split}/*.parquet"
94
+ else:
95
+ return f"hf://datasets/{dataset_id}@{revision}/default/{split}/*.parquet"
96
+
97
+ def _build_hf_path_flexible(
98
+ self,
99
+ dataset_id: str,
100
+ split: Optional[str] = None,
101
+ config: Optional[str] = None,
102
+ ) -> str:
103
+ """
104
+ Build flexible hf:// path with wildcards for discovery.
105
+
106
+ Args:
107
+ dataset_id: Dataset ID
108
+ split: Optional specific split
109
+ config: Optional config name
110
+
111
+ Returns:
112
+ hf:// path with appropriate wildcards
113
+ """
114
+ base = f"hf://datasets/{dataset_id}@~parquet"
115
+
116
+ if config and split:
117
+ return f"{base}/{config}/{split}/*.parquet"
118
+ elif config:
119
+ return f"{base}/{config}/*/*.parquet"
120
+ elif split:
121
+ return f"{base}/*/{split}/*.parquet"
122
+ else:
123
+ return f"{base}/*/*/*.parquet"
124
+
125
def query(
    self,
    dataset_id: str,
    sql: str,
    split: str = "train",
    config: Optional[str] = None,
    limit: Optional[int] = None,
    output_format: str = "dict",
) -> Union[List[Dict], Any]:
    """
    Execute SQL query on a Hugging Face dataset.

    Args:
        dataset_id: Dataset ID (e.g., "cais/mmlu", "ibm/duorc")
        sql: SQL query. Use 'data' as table name (will be replaced with actual path)
        split: Dataset split (train, test, validation, or * for all)
        config: Optional dataset config/subset
        limit: Optional limit override (appended only if the query has no LIMIT)
        output_format: Output format - "dict", "df" (pandas), "arrow", "raw"

    Returns:
        Query results in specified format

    Raises:
        Exception: re-raises any DuckDB error after logging the failing SQL.

    Examples:
        >>> sql.query("cais/mmlu", "SELECT * FROM data WHERE subject='nutrition' LIMIT 10")
        >>> sql.query("cais/mmlu", "SELECT subject, COUNT(*) as cnt FROM data GROUP BY subject")
    """
    import re

    hf_path = self._build_hf_path(dataset_id, split=split, config=config)

    if "hf://" in sql:
        # Caller already embedded a full hf:// path: run the SQL untouched.
        processed_sql = sql
    else:
        # Substitute the 'data' placeholder table with the real hf:// path.
        # A case-insensitive, word-boundary regex fixes two defects of the
        # old str.replace approach: it no longer corrupts identifiers that
        # merely start with "data" (e.g. FROM data_table), and it handles
        # any keyword casing (From, fRoM, ...). The keyword is uppercased
        # in the output, matching the previous behavior.
        processed_sql = re.sub(
            r"(?i)\b(FROM|JOIN)\s+data\b",
            lambda m: f"{m.group(1).upper()} '{hf_path}'",
            sql,
        )

    # Apply limit if specified and not already present anywhere in the query
    if limit and "LIMIT" not in processed_sql.upper():
        processed_sql += f" LIMIT {limit}"

    try:
        result = self.conn.execute(processed_sql)

        if output_format == "df":
            return result.fetchdf()
        elif output_format == "arrow":
            return result.fetch_arrow_table()
        elif output_format == "raw":
            return result.fetchall()
        else:  # dict
            columns = [desc[0] for desc in result.description]
            rows = result.fetchall()
            return [dict(zip(columns, row)) for row in rows]

    except Exception as e:
        print(f"❌ Query error: {e}")
        print(f" SQL: {processed_sql[:200]}...")
        raise
188
+
189
+ def query_raw(self, sql: str, output_format: str = "dict") -> Union[List[Dict], Any]:
190
+ """
191
+ Execute raw SQL query without path substitution.
192
+
193
+ Useful for queries that already contain full hf:// paths or for
194
+ multi-dataset queries.
195
+
196
+ Args:
197
+ sql: Complete SQL query
198
+ output_format: Output format
199
+
200
+ Returns:
201
+ Query results
202
+ """
203
+ result = self.conn.execute(sql)
204
+
205
+ if output_format == "df":
206
+ return result.fetchdf()
207
+ elif output_format == "arrow":
208
+ return result.fetch_arrow_table()
209
+ elif output_format == "raw":
210
+ return result.fetchall()
211
+ else:
212
+ columns = [desc[0] for desc in result.description]
213
+ rows = result.fetchall()
214
+ return [dict(zip(columns, row)) for row in rows]
215
+
216
+ def describe(self, dataset_id: str, split: str = "train", config: Optional[str] = None) -> List[Dict[str, str]]:
217
+ """
218
+ Get schema/structure of a dataset.
219
+
220
+ Args:
221
+ dataset_id: Dataset ID
222
+ split: Dataset split
223
+ config: Optional config
224
+
225
+ Returns:
226
+ List of column definitions with name, type, nullable info
227
+ """
228
+ hf_path = self._build_hf_path(dataset_id, split=split, config=config)
229
+
230
+ sql = f"DESCRIBE SELECT * FROM '{hf_path}' LIMIT 1"
231
+ result = self.conn.execute(sql)
232
+
233
+ columns = [desc[0] for desc in result.description]
234
+ rows = result.fetchall()
235
+
236
+ return [dict(zip(columns, row)) for row in rows]
237
+
238
+ def sample(
239
+ self,
240
+ dataset_id: str,
241
+ n: int = 10,
242
+ split: str = "train",
243
+ config: Optional[str] = None,
244
+ seed: Optional[int] = None,
245
+ ) -> List[Dict]:
246
+ """
247
+ Get a random sample from a dataset.
248
+
249
+ Args:
250
+ dataset_id: Dataset ID
251
+ n: Number of samples
252
+ split: Dataset split
253
+ config: Optional config
254
+ seed: Random seed for reproducibility
255
+
256
+ Returns:
257
+ List of sampled rows
258
+ """
259
+ hf_path = self._build_hf_path(dataset_id, split=split, config=config)
260
+
261
+ if seed is not None:
262
+ sql = f"SELECT * FROM '{hf_path}' USING SAMPLE {n} (RESERVOIR, {seed})"
263
+ else:
264
+ sql = f"SELECT * FROM '{hf_path}' USING SAMPLE {n}"
265
+
266
+ return self.query_raw(sql)
267
+
268
+ def count(
269
+ self, dataset_id: str, split: str = "train", config: Optional[str] = None, where: Optional[str] = None
270
+ ) -> int:
271
+ """
272
+ Count rows in a dataset, optionally with filter.
273
+
274
+ Args:
275
+ dataset_id: Dataset ID
276
+ split: Dataset split
277
+ config: Optional config
278
+ where: Optional WHERE clause (without WHERE keyword)
279
+
280
+ Returns:
281
+ Row count
282
+ """
283
+ hf_path = self._build_hf_path(dataset_id, split=split, config=config)
284
+
285
+ sql = f"SELECT COUNT(*) FROM '{hf_path}'"
286
+ if where:
287
+ sql += f" WHERE {where}"
288
+
289
+ result = self.conn.execute(sql).fetchone()
290
+ return result[0] if result else 0
291
+
292
+ def unique_values(
293
+ self, dataset_id: str, column: str, split: str = "train", config: Optional[str] = None, limit: int = 100
294
+ ) -> List[Any]:
295
+ """
296
+ Get unique values in a column.
297
+
298
+ Args:
299
+ dataset_id: Dataset ID
300
+ column: Column name
301
+ split: Dataset split
302
+ config: Optional config
303
+ limit: Max unique values to return
304
+
305
+ Returns:
306
+ List of unique values
307
+ """
308
+ hf_path = self._build_hf_path(dataset_id, split=split, config=config)
309
+
310
+ sql = f"SELECT DISTINCT {column} FROM '{hf_path}' LIMIT {limit}"
311
+ result = self.conn.execute(sql).fetchall()
312
+
313
+ return [row[0] for row in result]
314
+
315
+ def histogram(
316
+ self, dataset_id: str, column: str, split: str = "train", config: Optional[str] = None, bins: int = 10
317
+ ) -> List[Dict]:
318
+ """
319
+ Get value distribution/histogram for a column.
320
+
321
+ Args:
322
+ dataset_id: Dataset ID
323
+ column: Column name
324
+ split: Dataset split
325
+ config: Optional config
326
+ bins: Number of bins for numeric columns
327
+
328
+ Returns:
329
+ Distribution data
330
+ """
331
+ hf_path = self._build_hf_path(dataset_id, split=split, config=config)
332
+
333
+ sql = f"""
334
+ SELECT
335
+ {column},
336
+ COUNT(*) as count
337
+ FROM '{hf_path}'
338
+ GROUP BY {column}
339
+ ORDER BY count DESC
340
+ LIMIT {bins}
341
+ """
342
+
343
+ return self.query_raw(sql)
344
+
345
+ def filter_and_transform(
346
+ self,
347
+ dataset_id: str,
348
+ select: str = "*",
349
+ where: Optional[str] = None,
350
+ group_by: Optional[str] = None,
351
+ order_by: Optional[str] = None,
352
+ split: str = "train",
353
+ config: Optional[str] = None,
354
+ limit: Optional[int] = None,
355
+ ) -> List[Dict]:
356
+ """
357
+ Filter and transform dataset with SQL clauses.
358
+
359
+ Args:
360
+ dataset_id: Dataset ID
361
+ select: SELECT clause (columns, expressions, aggregations)
362
+ where: WHERE clause (filter conditions)
363
+ group_by: GROUP BY clause
364
+ order_by: ORDER BY clause
365
+ split: Dataset split
366
+ config: Optional config
367
+ limit: Row limit
368
+
369
+ Returns:
370
+ Transformed data
371
+
372
+ Examples:
373
+ >>> sql.filter_and_transform(
374
+ ... "cais/mmlu",
375
+ ... select="subject, COUNT(*) as cnt",
376
+ ... group_by="subject",
377
+ ... order_by="cnt DESC",
378
+ ... limit=10
379
+ ... )
380
+ """
381
+ hf_path = self._build_hf_path(dataset_id, split=split, config=config)
382
+
383
+ sql_parts = [f"SELECT {select}", f"FROM '{hf_path}'"]
384
+
385
+ if where:
386
+ sql_parts.append(f"WHERE {where}")
387
+ if group_by:
388
+ sql_parts.append(f"GROUP BY {group_by}")
389
+ if order_by:
390
+ sql_parts.append(f"ORDER BY {order_by}")
391
+ if limit:
392
+ sql_parts.append(f"LIMIT {limit}")
393
+
394
+ sql = " ".join(sql_parts)
395
+ return self.query_raw(sql)
396
+
397
+ def join_datasets(
398
+ self,
399
+ left_dataset: str,
400
+ right_dataset: str,
401
+ on: str,
402
+ select: str = "*",
403
+ join_type: str = "INNER",
404
+ left_split: str = "train",
405
+ right_split: str = "train",
406
+ left_config: Optional[str] = None,
407
+ right_config: Optional[str] = None,
408
+ limit: Optional[int] = None,
409
+ ) -> List[Dict]:
410
+ """
411
+ Join two datasets.
412
+
413
+ Args:
414
+ left_dataset: Left dataset ID
415
+ right_dataset: Right dataset ID
416
+ on: JOIN condition (e.g., "left.id = right.id")
417
+ select: SELECT clause
418
+ join_type: Type of join (INNER, LEFT, RIGHT, FULL)
419
+ left_split: Split for left dataset
420
+ right_split: Split for right dataset
421
+ left_config: Config for left dataset
422
+ right_config: Config for right dataset
423
+ limit: Row limit
424
+
425
+ Returns:
426
+ Joined data
427
+ """
428
+ left_path = self._build_hf_path(left_dataset, split=left_split, config=left_config)
429
+ right_path = self._build_hf_path(right_dataset, split=right_split, config=right_config)
430
+
431
+ sql = f"""
432
+ SELECT {select}
433
+ FROM '{left_path}' AS left_table
434
+ {join_type} JOIN '{right_path}' AS right_table
435
+ ON {on}
436
+ """
437
+
438
+ if limit:
439
+ sql += f" LIMIT {limit}"
440
+
441
+ return self.query_raw(sql)
442
+
443
+ def export_to_parquet(
444
+ self,
445
+ dataset_id: str,
446
+ output_path: str,
447
+ sql: Optional[str] = None,
448
+ split: str = "train",
449
+ config: Optional[str] = None,
450
+ ) -> str:
451
+ """
452
+ Export query results to a local Parquet file.
453
+
454
+ Args:
455
+ dataset_id: Source dataset ID
456
+ output_path: Local path for output Parquet file
457
+ sql: Optional SQL query (uses SELECT * if not provided)
458
+ split: Dataset split
459
+ config: Optional config
460
+
461
+ Returns:
462
+ Path to created file
463
+ """
464
+ hf_path = self._build_hf_path(dataset_id, split=split, config=config)
465
+
466
+ if sql:
467
+ # Process the query
468
+ processed_sql = sql.replace("FROM data", f"FROM '{hf_path}'")
469
+ processed_sql = processed_sql.replace("from data", f"FROM '{hf_path}'")
470
+ else:
471
+ processed_sql = f"SELECT * FROM '{hf_path}'"
472
+
473
+ export_sql = f"COPY ({processed_sql}) TO '{output_path}' (FORMAT PARQUET)"
474
+ self.conn.execute(export_sql)
475
+
476
+ print(f"✅ Exported to {output_path}")
477
+ return output_path
478
+
479
+ def export_to_jsonl(
480
+ self,
481
+ dataset_id: str,
482
+ output_path: str,
483
+ sql: Optional[str] = None,
484
+ split: str = "train",
485
+ config: Optional[str] = None,
486
+ ) -> str:
487
+ """
488
+ Export query results to JSONL format.
489
+
490
+ Args:
491
+ dataset_id: Source dataset ID
492
+ output_path: Local path for output JSONL file
493
+ sql: Optional SQL query
494
+ split: Dataset split
495
+ config: Optional config
496
+
497
+ Returns:
498
+ Path to created file
499
+ """
500
+ results = self.query(dataset_id, sql or "SELECT * FROM data", split=split, config=config)
501
+
502
+ with open(output_path, "w") as f:
503
+ for row in results:
504
+ f.write(json.dumps(row) + "\n")
505
+
506
+ print(f"✅ Exported {len(results)} rows to {output_path}")
507
+ return output_path
508
+
509
+ def push_to_hub(
510
+ self,
511
+ dataset_id: str,
512
+ target_repo: str,
513
+ sql: Optional[str] = None,
514
+ split: str = "train",
515
+ config: Optional[str] = None,
516
+ target_split: str = "train",
517
+ private: bool = True,
518
+ commit_message: Optional[str] = None,
519
+ ) -> str:
520
+ """
521
+ Query a dataset and push results to a new Hub repository.
522
+
523
+ Args:
524
+ dataset_id: Source dataset ID
525
+ target_repo: Target repository ID (e.g., "username/new-dataset")
526
+ sql: SQL query to transform data (optional, defaults to SELECT *)
527
+ split: Source split
528
+ config: Source config
529
+ target_split: Target split name
530
+ private: Whether to create private repo
531
+ commit_message: Commit message
532
+
533
+ Returns:
534
+ URL of created dataset
535
+ """
536
+ try:
537
+ from datasets import Dataset
538
+ except ImportError:
539
+ raise ImportError("datasets library required for push_to_hub. Install with: pip install datasets")
540
+
541
+ # Execute query
542
+ results = self.query(dataset_id, sql or "SELECT * FROM data", split=split, config=config)
543
+
544
+ if not results:
545
+ print("❌ No results to push")
546
+ return ""
547
+
548
+ # Convert to HF Dataset
549
+ ds = Dataset.from_list(results)
550
+
551
+ # Push to Hub
552
+ ds.push_to_hub(
553
+ target_repo,
554
+ split=target_split,
555
+ private=private,
556
+ commit_message=commit_message or f"Created from {dataset_id} via SQL query",
557
+ token=self.token,
558
+ )
559
+
560
+ url = f"https://huggingface.co/datasets/{target_repo}"
561
+ print(f"✅ Pushed {len(results)} rows to {url}")
562
+ return url
563
+
564
+ def create_view(self, name: str, dataset_id: str, split: str = "train", config: Optional[str] = None):
565
+ """
566
+ Create a DuckDB view for easier querying.
567
+
568
+ Args:
569
+ name: View name
570
+ dataset_id: Dataset ID
571
+ split: Dataset split
572
+ config: Optional config
573
+ """
574
+ hf_path = self._build_hf_path(dataset_id, split=split, config=config)
575
+ self.conn.execute(f"CREATE OR REPLACE VIEW {name} AS SELECT * FROM '{hf_path}'")
576
+ print(f"✅ Created view '{name}' for {dataset_id}")
577
+
578
+ def info(self, dataset_id: str) -> Dict[str, Any]:
579
+ """
580
+ Get information about a dataset including available configs and splits.
581
+
582
+ Args:
583
+ dataset_id: Dataset ID
584
+
585
+ Returns:
586
+ Dataset information
587
+ """
588
+ api = HfApi(token=self.token)
589
+
590
+ try:
591
+ info = api.dataset_info(dataset_id)
592
+
593
+ result = {
594
+ "id": info.id,
595
+ "author": info.author,
596
+ "private": info.private,
597
+ "downloads": info.downloads,
598
+ "likes": info.likes,
599
+ "tags": info.tags,
600
+ "created_at": str(info.created_at) if info.created_at else None,
601
+ "last_modified": str(info.last_modified) if info.last_modified else None,
602
+ }
603
+
604
+ # Try to get config/split info from card data
605
+ if info.card_data:
606
+ result["configs"] = getattr(info.card_data, "configs", None)
607
+
608
+ return result
609
+
610
+ except Exception as e:
611
+ print(f"❌ Failed to get info: {e}")
612
+ return {}
613
+
614
+ def close(self):
615
+ """Close the database connection."""
616
+ self.conn.close()
617
+
618
+
619
def main():
    """CLI entry point: parse arguments and dispatch to HFDatasetSQL.

    Each subcommand maps to one HFDatasetSQL method; the connection is
    always closed on exit, even when a command raises.
    """
    parser = argparse.ArgumentParser(
        description="Query Hugging Face datasets with SQL",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Query dataset with SQL
  python sql_manager.py query --dataset "cais/mmlu" --sql "SELECT * FROM data WHERE subject='nutrition' LIMIT 10"

  # Get random sample
  python sql_manager.py sample --dataset "cais/mmlu" --n 5

  # Describe schema
  python sql_manager.py describe --dataset "cais/mmlu"

  # Get value counts
  python sql_manager.py histogram --dataset "cais/mmlu" --column "subject"

  # Filter and transform
  python sql_manager.py transform --dataset "cais/mmlu" \\
      --select "subject, COUNT(*) as cnt" \\
      --group-by "subject" \\
      --order-by "cnt DESC"

  # Query and push to Hub
  python sql_manager.py query --dataset "cais/mmlu" \\
      --sql "SELECT * FROM data WHERE subject='nutrition'" \\
      --push-to "username/nutrition-subset"

  # Export to Parquet
  python sql_manager.py export --dataset "cais/mmlu" \\
      --sql "SELECT * FROM data WHERE subject='nutrition'" \\
      --output "nutrition.parquet"
        """,
    )

    subparsers = parser.add_subparsers(dest="command", required=True)

    # Arguments shared by every dataset-addressed subcommand.
    def add_common_args(p):
        p.add_argument("--dataset", "-d", required=True, help="Dataset ID (e.g., cais/mmlu)")
        p.add_argument("--split", "-s", default="train", help="Dataset split (default: train)")
        p.add_argument("--config", "-c", help="Dataset config/subset")

    # Query command
    query_parser = subparsers.add_parser("query", help="Execute SQL query on dataset")
    add_common_args(query_parser)
    query_parser.add_argument("--sql", required=True, help="SQL query (use 'data' as table name)")
    query_parser.add_argument("--limit", "-l", type=int, help="Limit results")
    query_parser.add_argument("--format", choices=["json", "table", "csv"], default="json", help="Output format")
    query_parser.add_argument("--push-to", help="Push results to this Hub repo")
    query_parser.add_argument("--private", action="store_true", help="Make pushed repo private")

    # Sample command
    sample_parser = subparsers.add_parser("sample", help="Get random sample from dataset")
    add_common_args(sample_parser)
    sample_parser.add_argument("--n", type=int, default=10, help="Number of samples")
    sample_parser.add_argument("--seed", type=int, help="Random seed")

    # Describe command
    describe_parser = subparsers.add_parser("describe", help="Get dataset schema")
    add_common_args(describe_parser)

    # Count command
    count_parser = subparsers.add_parser("count", help="Count rows in dataset")
    add_common_args(count_parser)
    count_parser.add_argument("--where", "-w", help="WHERE clause for filtering")

    # Histogram command
    histogram_parser = subparsers.add_parser("histogram", help="Get value distribution")
    add_common_args(histogram_parser)
    histogram_parser.add_argument("--column", required=True, help="Column name")
    histogram_parser.add_argument("--bins", type=int, default=20, help="Number of bins")

    # Unique command
    unique_parser = subparsers.add_parser("unique", help="Get unique values in column")
    add_common_args(unique_parser)
    unique_parser.add_argument("--column", required=True, help="Column name")
    unique_parser.add_argument("--limit", "-l", type=int, default=100, help="Max values")

    # Transform command
    transform_parser = subparsers.add_parser("transform", help="Filter and transform dataset")
    add_common_args(transform_parser)
    transform_parser.add_argument("--select", default="*", help="SELECT clause")
    transform_parser.add_argument("--where", "-w", help="WHERE clause")
    transform_parser.add_argument("--group-by", help="GROUP BY clause")
    transform_parser.add_argument("--order-by", help="ORDER BY clause")
    transform_parser.add_argument("--limit", "-l", type=int, help="LIMIT")
    transform_parser.add_argument("--push-to", help="Push results to Hub repo")

    # Export command
    export_parser = subparsers.add_parser("export", help="Export query results to file")
    add_common_args(export_parser)
    export_parser.add_argument("--sql", help="SQL query (defaults to SELECT *)")
    export_parser.add_argument("--output", "-o", required=True, help="Output file path")
    export_parser.add_argument("--format", choices=["parquet", "jsonl"], default="parquet", help="Output format")

    # Info command
    info_parser = subparsers.add_parser("info", help="Get dataset information")
    info_parser.add_argument("--dataset", "-d", required=True, help="Dataset ID")

    # Raw SQL command
    raw_parser = subparsers.add_parser("raw", help="Execute raw SQL with full hf:// paths")
    raw_parser.add_argument("--sql", required=True, help="Complete SQL query")
    raw_parser.add_argument("--format", choices=["json", "table", "csv"], default="json", help="Output format")

    args = parser.parse_args()

    # Initialize SQL manager
    sql = HFDatasetSQL()

    try:
        if args.command == "query":
            if getattr(args, "push_to", None):
                # push_to_hub re-runs the query itself, so executing it
                # here first would do the (possibly expensive) work twice.
                sql.push_to_hub(
                    args.dataset, args.push_to, sql=args.sql, split=args.split, config=args.config, private=args.private
                )
            else:
                results = sql.query(args.dataset, args.sql, split=args.split, config=args.config, limit=args.limit)
                _print_results(results, args.format)

        elif args.command == "sample":
            results = sql.sample(args.dataset, n=args.n, split=args.split, config=args.config, seed=args.seed)
            _print_results(results, "json")

        elif args.command == "describe":
            schema = sql.describe(args.dataset, split=args.split, config=args.config)
            _print_results(schema, "table")

        elif args.command == "count":
            count = sql.count(args.dataset, split=args.split, config=args.config, where=args.where)
            print(f"Count: {count:,}")

        elif args.command == "histogram":
            results = sql.histogram(args.dataset, args.column, split=args.split, config=args.config, bins=args.bins)
            _print_results(results, "table")

        elif args.command == "unique":
            values = sql.unique_values(
                args.dataset, args.column, split=args.split, config=args.config, limit=args.limit
            )
            for v in values:
                print(v)

        elif args.command == "transform":
            if getattr(args, "push_to", None):
                # Rebuild the equivalent SELECT so push_to_hub can run it;
                # as with 'query', avoid also executing it locally.
                query_sql = f"SELECT {args.select} FROM data"
                if args.where:
                    query_sql += f" WHERE {args.where}"
                if args.group_by:
                    query_sql += f" GROUP BY {args.group_by}"
                if args.order_by:
                    query_sql += f" ORDER BY {args.order_by}"
                if args.limit:
                    query_sql += f" LIMIT {args.limit}"

                sql.push_to_hub(args.dataset, args.push_to, sql=query_sql, split=args.split, config=args.config)
            else:
                results = sql.filter_and_transform(
                    args.dataset,
                    select=args.select,
                    where=args.where,
                    group_by=args.group_by,
                    order_by=args.order_by,
                    split=args.split,
                    config=args.config,
                    limit=args.limit,
                )
                _print_results(results, "json")

        elif args.command == "export":
            if args.format == "parquet":
                sql.export_to_parquet(args.dataset, args.output, sql=args.sql, split=args.split, config=args.config)
            else:
                sql.export_to_jsonl(args.dataset, args.output, sql=args.sql, split=args.split, config=args.config)

        elif args.command == "info":
            info = sql.info(args.dataset)
            _print_results([info], "json")

        elif args.command == "raw":
            results = sql.query_raw(args.sql)
            _print_results(results, args.format)

    finally:
        # Always release the DuckDB connection, even on error.
        sql.close()
809
+
810
+
811
+ def _print_results(results: List[Dict], format: str):
812
+ """Print results in specified format."""
813
+ if not results:
814
+ print("No results")
815
+ return
816
+
817
+ if format == "json":
818
+ print(json.dumps(results, indent=2, default=str))
819
+
820
+ elif format == "csv":
821
+ if results:
822
+ keys = results[0].keys()
823
+ print(",".join(str(k) for k in keys))
824
+ for row in results:
825
+ print(",".join(str(row.get(k, "")) for k in keys))
826
+
827
+ elif format == "table":
828
+ if results:
829
+ keys = list(results[0].keys())
830
+ # Calculate column widths
831
+ widths = {k: max(len(str(k)), max(len(str(r.get(k, ""))) for r in results)) for k in keys}
832
+
833
+ # Header
834
+ header = " | ".join(str(k).ljust(widths[k]) for k in keys)
835
+ print(header)
836
+ print("-" * len(header))
837
+
838
+ # Rows
839
+ for row in results:
840
+ print(" | ".join(str(row.get(k, "")).ljust(widths[k]) for k in keys))
841
+
842
+
843
+ if __name__ == "__main__":
844
+ main()