@synsci/cli-darwin-x64 1.1.49
This diff shows the content of publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only and reflects the changes between package versions.
- package/bin/skills/accelerate/SKILL.md +332 -0
- package/bin/skills/accelerate/references/custom-plugins.md +453 -0
- package/bin/skills/accelerate/references/megatron-integration.md +489 -0
- package/bin/skills/accelerate/references/performance.md +525 -0
- package/bin/skills/audiocraft/SKILL.md +564 -0
- package/bin/skills/audiocraft/references/advanced-usage.md +666 -0
- package/bin/skills/audiocraft/references/troubleshooting.md +504 -0
- package/bin/skills/autogpt/SKILL.md +403 -0
- package/bin/skills/autogpt/references/advanced-usage.md +535 -0
- package/bin/skills/autogpt/references/troubleshooting.md +420 -0
- package/bin/skills/awq/SKILL.md +310 -0
- package/bin/skills/awq/references/advanced-usage.md +324 -0
- package/bin/skills/awq/references/troubleshooting.md +344 -0
- package/bin/skills/axolotl/SKILL.md +158 -0
- package/bin/skills/axolotl/references/api.md +5548 -0
- package/bin/skills/axolotl/references/dataset-formats.md +1029 -0
- package/bin/skills/axolotl/references/index.md +15 -0
- package/bin/skills/axolotl/references/other.md +3563 -0
- package/bin/skills/bigcode-evaluation-harness/SKILL.md +405 -0
- package/bin/skills/bigcode-evaluation-harness/references/benchmarks.md +393 -0
- package/bin/skills/bigcode-evaluation-harness/references/custom-tasks.md +424 -0
- package/bin/skills/bigcode-evaluation-harness/references/issues.md +394 -0
- package/bin/skills/bitsandbytes/SKILL.md +411 -0
- package/bin/skills/bitsandbytes/references/memory-optimization.md +521 -0
- package/bin/skills/bitsandbytes/references/qlora-training.md +521 -0
- package/bin/skills/bitsandbytes/references/quantization-formats.md +447 -0
- package/bin/skills/blip-2/SKILL.md +564 -0
- package/bin/skills/blip-2/references/advanced-usage.md +680 -0
- package/bin/skills/blip-2/references/troubleshooting.md +526 -0
- package/bin/skills/chroma/SKILL.md +406 -0
- package/bin/skills/chroma/references/integration.md +38 -0
- package/bin/skills/clip/SKILL.md +253 -0
- package/bin/skills/clip/references/applications.md +207 -0
- package/bin/skills/constitutional-ai/SKILL.md +290 -0
- package/bin/skills/crewai/SKILL.md +498 -0
- package/bin/skills/crewai/references/flows.md +438 -0
- package/bin/skills/crewai/references/tools.md +429 -0
- package/bin/skills/crewai/references/troubleshooting.md +480 -0
- package/bin/skills/deepspeed/SKILL.md +141 -0
- package/bin/skills/deepspeed/references/08.md +17 -0
- package/bin/skills/deepspeed/references/09.md +173 -0
- package/bin/skills/deepspeed/references/2020.md +378 -0
- package/bin/skills/deepspeed/references/2023.md +279 -0
- package/bin/skills/deepspeed/references/assets.md +179 -0
- package/bin/skills/deepspeed/references/index.md +35 -0
- package/bin/skills/deepspeed/references/mii.md +118 -0
- package/bin/skills/deepspeed/references/other.md +1191 -0
- package/bin/skills/deepspeed/references/tutorials.md +6554 -0
- package/bin/skills/dspy/SKILL.md +590 -0
- package/bin/skills/dspy/references/examples.md +663 -0
- package/bin/skills/dspy/references/modules.md +475 -0
- package/bin/skills/dspy/references/optimizers.md +566 -0
- package/bin/skills/faiss/SKILL.md +221 -0
- package/bin/skills/faiss/references/index_types.md +280 -0
- package/bin/skills/flash-attention/SKILL.md +367 -0
- package/bin/skills/flash-attention/references/benchmarks.md +215 -0
- package/bin/skills/flash-attention/references/transformers-integration.md +293 -0
- package/bin/skills/gguf/SKILL.md +427 -0
- package/bin/skills/gguf/references/advanced-usage.md +504 -0
- package/bin/skills/gguf/references/troubleshooting.md +442 -0
- package/bin/skills/gptq/SKILL.md +450 -0
- package/bin/skills/gptq/references/calibration.md +337 -0
- package/bin/skills/gptq/references/integration.md +129 -0
- package/bin/skills/gptq/references/troubleshooting.md +95 -0
- package/bin/skills/grpo-rl-training/README.md +97 -0
- package/bin/skills/grpo-rl-training/SKILL.md +572 -0
- package/bin/skills/grpo-rl-training/examples/reward_functions_library.py +393 -0
- package/bin/skills/grpo-rl-training/templates/basic_grpo_training.py +228 -0
- package/bin/skills/guidance/SKILL.md +572 -0
- package/bin/skills/guidance/references/backends.md +554 -0
- package/bin/skills/guidance/references/constraints.md +674 -0
- package/bin/skills/guidance/references/examples.md +767 -0
- package/bin/skills/hqq/SKILL.md +445 -0
- package/bin/skills/hqq/references/advanced-usage.md +528 -0
- package/bin/skills/hqq/references/troubleshooting.md +503 -0
- package/bin/skills/hugging-face-cli/SKILL.md +191 -0
- package/bin/skills/hugging-face-cli/references/commands.md +954 -0
- package/bin/skills/hugging-face-cli/references/examples.md +374 -0
- package/bin/skills/hugging-face-datasets/SKILL.md +547 -0
- package/bin/skills/hugging-face-datasets/examples/diverse_training_examples.json +239 -0
- package/bin/skills/hugging-face-datasets/examples/system_prompt_template.txt +196 -0
- package/bin/skills/hugging-face-datasets/examples/training_examples.json +176 -0
- package/bin/skills/hugging-face-datasets/scripts/dataset_manager.py +522 -0
- package/bin/skills/hugging-face-datasets/scripts/sql_manager.py +844 -0
- package/bin/skills/hugging-face-datasets/templates/chat.json +55 -0
- package/bin/skills/hugging-face-datasets/templates/classification.json +62 -0
- package/bin/skills/hugging-face-datasets/templates/completion.json +51 -0
- package/bin/skills/hugging-face-datasets/templates/custom.json +75 -0
- package/bin/skills/hugging-face-datasets/templates/qa.json +54 -0
- package/bin/skills/hugging-face-datasets/templates/tabular.json +81 -0
- package/bin/skills/hugging-face-evaluation/SKILL.md +656 -0
- package/bin/skills/hugging-face-evaluation/examples/USAGE_EXAMPLES.md +382 -0
- package/bin/skills/hugging-face-evaluation/examples/artificial_analysis_to_hub.py +141 -0
- package/bin/skills/hugging-face-evaluation/examples/example_readme_tables.md +135 -0
- package/bin/skills/hugging-face-evaluation/examples/metric_mapping.json +50 -0
- package/bin/skills/hugging-face-evaluation/requirements.txt +20 -0
- package/bin/skills/hugging-face-evaluation/scripts/evaluation_manager.py +1374 -0
- package/bin/skills/hugging-face-evaluation/scripts/inspect_eval_uv.py +104 -0
- package/bin/skills/hugging-face-evaluation/scripts/inspect_vllm_uv.py +317 -0
- package/bin/skills/hugging-face-evaluation/scripts/lighteval_vllm_uv.py +303 -0
- package/bin/skills/hugging-face-evaluation/scripts/run_eval_job.py +98 -0
- package/bin/skills/hugging-face-evaluation/scripts/run_vllm_eval_job.py +331 -0
- package/bin/skills/hugging-face-evaluation/scripts/test_extraction.py +206 -0
- package/bin/skills/hugging-face-jobs/SKILL.md +1041 -0
- package/bin/skills/hugging-face-jobs/index.html +216 -0
- package/bin/skills/hugging-face-jobs/references/hardware_guide.md +336 -0
- package/bin/skills/hugging-face-jobs/references/hub_saving.md +352 -0
- package/bin/skills/hugging-face-jobs/references/token_usage.md +546 -0
- package/bin/skills/hugging-face-jobs/references/troubleshooting.md +475 -0
- package/bin/skills/hugging-face-jobs/scripts/cot-self-instruct.py +718 -0
- package/bin/skills/hugging-face-jobs/scripts/finepdfs-stats.py +546 -0
- package/bin/skills/hugging-face-jobs/scripts/generate-responses.py +587 -0
- package/bin/skills/hugging-face-model-trainer/SKILL.md +711 -0
- package/bin/skills/hugging-face-model-trainer/references/gguf_conversion.md +296 -0
- package/bin/skills/hugging-face-model-trainer/references/hardware_guide.md +283 -0
- package/bin/skills/hugging-face-model-trainer/references/hub_saving.md +364 -0
- package/bin/skills/hugging-face-model-trainer/references/reliability_principles.md +371 -0
- package/bin/skills/hugging-face-model-trainer/references/trackio_guide.md +189 -0
- package/bin/skills/hugging-face-model-trainer/references/training_methods.md +150 -0
- package/bin/skills/hugging-face-model-trainer/references/training_patterns.md +203 -0
- package/bin/skills/hugging-face-model-trainer/references/troubleshooting.md +282 -0
- package/bin/skills/hugging-face-model-trainer/scripts/convert_to_gguf.py +424 -0
- package/bin/skills/hugging-face-model-trainer/scripts/dataset_inspector.py +417 -0
- package/bin/skills/hugging-face-model-trainer/scripts/estimate_cost.py +150 -0
- package/bin/skills/hugging-face-model-trainer/scripts/train_dpo_example.py +106 -0
- package/bin/skills/hugging-face-model-trainer/scripts/train_grpo_example.py +89 -0
- package/bin/skills/hugging-face-model-trainer/scripts/train_sft_example.py +122 -0
- package/bin/skills/hugging-face-paper-publisher/SKILL.md +627 -0
- package/bin/skills/hugging-face-paper-publisher/examples/example_usage.md +327 -0
- package/bin/skills/hugging-face-paper-publisher/references/quick_reference.md +216 -0
- package/bin/skills/hugging-face-paper-publisher/scripts/paper_manager.py +508 -0
- package/bin/skills/hugging-face-paper-publisher/templates/arxiv.md +299 -0
- package/bin/skills/hugging-face-paper-publisher/templates/ml-report.md +358 -0
- package/bin/skills/hugging-face-paper-publisher/templates/modern.md +319 -0
- package/bin/skills/hugging-face-paper-publisher/templates/standard.md +201 -0
- package/bin/skills/hugging-face-tool-builder/SKILL.md +115 -0
- package/bin/skills/hugging-face-tool-builder/references/baseline_hf_api.py +57 -0
- package/bin/skills/hugging-face-tool-builder/references/baseline_hf_api.sh +40 -0
- package/bin/skills/hugging-face-tool-builder/references/baseline_hf_api.tsx +57 -0
- package/bin/skills/hugging-face-tool-builder/references/find_models_by_paper.sh +230 -0
- package/bin/skills/hugging-face-tool-builder/references/hf_enrich_models.sh +96 -0
- package/bin/skills/hugging-face-tool-builder/references/hf_model_card_frontmatter.sh +188 -0
- package/bin/skills/hugging-face-tool-builder/references/hf_model_papers_auth.sh +171 -0
- package/bin/skills/hugging-face-trackio/SKILL.md +65 -0
- package/bin/skills/hugging-face-trackio/references/logging_metrics.md +206 -0
- package/bin/skills/hugging-face-trackio/references/retrieving_metrics.md +223 -0
- package/bin/skills/huggingface-tokenizers/SKILL.md +516 -0
- package/bin/skills/huggingface-tokenizers/references/algorithms.md +653 -0
- package/bin/skills/huggingface-tokenizers/references/integration.md +637 -0
- package/bin/skills/huggingface-tokenizers/references/pipeline.md +723 -0
- package/bin/skills/huggingface-tokenizers/references/training.md +565 -0
- package/bin/skills/instructor/SKILL.md +740 -0
- package/bin/skills/instructor/references/examples.md +107 -0
- package/bin/skills/instructor/references/providers.md +70 -0
- package/bin/skills/instructor/references/validation.md +606 -0
- package/bin/skills/knowledge-distillation/SKILL.md +458 -0
- package/bin/skills/knowledge-distillation/references/minillm.md +334 -0
- package/bin/skills/lambda-labs/SKILL.md +545 -0
- package/bin/skills/lambda-labs/references/advanced-usage.md +611 -0
- package/bin/skills/lambda-labs/references/troubleshooting.md +530 -0
- package/bin/skills/langchain/SKILL.md +480 -0
- package/bin/skills/langchain/references/agents.md +499 -0
- package/bin/skills/langchain/references/integration.md +562 -0
- package/bin/skills/langchain/references/rag.md +600 -0
- package/bin/skills/langsmith/SKILL.md +422 -0
- package/bin/skills/langsmith/references/advanced-usage.md +548 -0
- package/bin/skills/langsmith/references/troubleshooting.md +537 -0
- package/bin/skills/litgpt/SKILL.md +469 -0
- package/bin/skills/litgpt/references/custom-models.md +568 -0
- package/bin/skills/litgpt/references/distributed-training.md +451 -0
- package/bin/skills/litgpt/references/supported-models.md +336 -0
- package/bin/skills/litgpt/references/training-recipes.md +619 -0
- package/bin/skills/llama-cpp/SKILL.md +258 -0
- package/bin/skills/llama-cpp/references/optimization.md +89 -0
- package/bin/skills/llama-cpp/references/quantization.md +213 -0
- package/bin/skills/llama-cpp/references/server.md +125 -0
- package/bin/skills/llama-factory/SKILL.md +80 -0
- package/bin/skills/llama-factory/references/_images.md +23 -0
- package/bin/skills/llama-factory/references/advanced.md +1055 -0
- package/bin/skills/llama-factory/references/getting_started.md +349 -0
- package/bin/skills/llama-factory/references/index.md +19 -0
- package/bin/skills/llama-factory/references/other.md +31 -0
- package/bin/skills/llamaguard/SKILL.md +337 -0
- package/bin/skills/llamaindex/SKILL.md +569 -0
- package/bin/skills/llamaindex/references/agents.md +83 -0
- package/bin/skills/llamaindex/references/data_connectors.md +108 -0
- package/bin/skills/llamaindex/references/query_engines.md +406 -0
- package/bin/skills/llava/SKILL.md +304 -0
- package/bin/skills/llava/references/training.md +197 -0
- package/bin/skills/lm-evaluation-harness/SKILL.md +490 -0
- package/bin/skills/lm-evaluation-harness/references/api-evaluation.md +490 -0
- package/bin/skills/lm-evaluation-harness/references/benchmark-guide.md +488 -0
- package/bin/skills/lm-evaluation-harness/references/custom-tasks.md +602 -0
- package/bin/skills/lm-evaluation-harness/references/distributed-eval.md +519 -0
- package/bin/skills/long-context/SKILL.md +536 -0
- package/bin/skills/long-context/references/extension_methods.md +468 -0
- package/bin/skills/long-context/references/fine_tuning.md +611 -0
- package/bin/skills/long-context/references/rope.md +402 -0
- package/bin/skills/mamba/SKILL.md +260 -0
- package/bin/skills/mamba/references/architecture-details.md +206 -0
- package/bin/skills/mamba/references/benchmarks.md +255 -0
- package/bin/skills/mamba/references/training-guide.md +388 -0
- package/bin/skills/megatron-core/SKILL.md +366 -0
- package/bin/skills/megatron-core/references/benchmarks.md +249 -0
- package/bin/skills/megatron-core/references/parallelism-guide.md +404 -0
- package/bin/skills/megatron-core/references/production-examples.md +473 -0
- package/bin/skills/megatron-core/references/training-recipes.md +547 -0
- package/bin/skills/miles/SKILL.md +315 -0
- package/bin/skills/miles/references/api-reference.md +141 -0
- package/bin/skills/miles/references/troubleshooting.md +352 -0
- package/bin/skills/mlflow/SKILL.md +704 -0
- package/bin/skills/mlflow/references/deployment.md +744 -0
- package/bin/skills/mlflow/references/model-registry.md +770 -0
- package/bin/skills/mlflow/references/tracking.md +680 -0
- package/bin/skills/modal/SKILL.md +341 -0
- package/bin/skills/modal/references/advanced-usage.md +503 -0
- package/bin/skills/modal/references/troubleshooting.md +494 -0
- package/bin/skills/model-merging/SKILL.md +539 -0
- package/bin/skills/model-merging/references/evaluation.md +462 -0
- package/bin/skills/model-merging/references/examples.md +428 -0
- package/bin/skills/model-merging/references/methods.md +352 -0
- package/bin/skills/model-pruning/SKILL.md +495 -0
- package/bin/skills/model-pruning/references/wanda.md +347 -0
- package/bin/skills/moe-training/SKILL.md +526 -0
- package/bin/skills/moe-training/references/architectures.md +432 -0
- package/bin/skills/moe-training/references/inference.md +348 -0
- package/bin/skills/moe-training/references/training.md +425 -0
- package/bin/skills/nanogpt/SKILL.md +290 -0
- package/bin/skills/nanogpt/references/architecture.md +382 -0
- package/bin/skills/nanogpt/references/data.md +476 -0
- package/bin/skills/nanogpt/references/training.md +564 -0
- package/bin/skills/nemo-curator/SKILL.md +383 -0
- package/bin/skills/nemo-curator/references/deduplication.md +87 -0
- package/bin/skills/nemo-curator/references/filtering.md +102 -0
- package/bin/skills/nemo-evaluator/SKILL.md +494 -0
- package/bin/skills/nemo-evaluator/references/adapter-system.md +340 -0
- package/bin/skills/nemo-evaluator/references/configuration.md +447 -0
- package/bin/skills/nemo-evaluator/references/custom-benchmarks.md +315 -0
- package/bin/skills/nemo-evaluator/references/execution-backends.md +361 -0
- package/bin/skills/nemo-guardrails/SKILL.md +297 -0
- package/bin/skills/nnsight/SKILL.md +436 -0
- package/bin/skills/nnsight/references/README.md +78 -0
- package/bin/skills/nnsight/references/api.md +344 -0
- package/bin/skills/nnsight/references/tutorials.md +300 -0
- package/bin/skills/openrlhf/SKILL.md +249 -0
- package/bin/skills/openrlhf/references/algorithm-comparison.md +404 -0
- package/bin/skills/openrlhf/references/custom-rewards.md +530 -0
- package/bin/skills/openrlhf/references/hybrid-engine.md +287 -0
- package/bin/skills/openrlhf/references/multi-node-training.md +454 -0
- package/bin/skills/outlines/SKILL.md +652 -0
- package/bin/skills/outlines/references/backends.md +615 -0
- package/bin/skills/outlines/references/examples.md +773 -0
- package/bin/skills/outlines/references/json_generation.md +652 -0
- package/bin/skills/peft/SKILL.md +431 -0
- package/bin/skills/peft/references/advanced-usage.md +514 -0
- package/bin/skills/peft/references/troubleshooting.md +480 -0
- package/bin/skills/phoenix/SKILL.md +475 -0
- package/bin/skills/phoenix/references/advanced-usage.md +619 -0
- package/bin/skills/phoenix/references/troubleshooting.md +538 -0
- package/bin/skills/pinecone/SKILL.md +358 -0
- package/bin/skills/pinecone/references/deployment.md +181 -0
- package/bin/skills/pytorch-fsdp/SKILL.md +126 -0
- package/bin/skills/pytorch-fsdp/references/index.md +7 -0
- package/bin/skills/pytorch-fsdp/references/other.md +4249 -0
- package/bin/skills/pytorch-lightning/SKILL.md +346 -0
- package/bin/skills/pytorch-lightning/references/callbacks.md +436 -0
- package/bin/skills/pytorch-lightning/references/distributed.md +490 -0
- package/bin/skills/pytorch-lightning/references/hyperparameter-tuning.md +556 -0
- package/bin/skills/pyvene/SKILL.md +473 -0
- package/bin/skills/pyvene/references/README.md +73 -0
- package/bin/skills/pyvene/references/api.md +383 -0
- package/bin/skills/pyvene/references/tutorials.md +376 -0
- package/bin/skills/qdrant/SKILL.md +493 -0
- package/bin/skills/qdrant/references/advanced-usage.md +648 -0
- package/bin/skills/qdrant/references/troubleshooting.md +631 -0
- package/bin/skills/ray-data/SKILL.md +326 -0
- package/bin/skills/ray-data/references/integration.md +82 -0
- package/bin/skills/ray-data/references/transformations.md +83 -0
- package/bin/skills/ray-train/SKILL.md +406 -0
- package/bin/skills/ray-train/references/multi-node.md +628 -0
- package/bin/skills/rwkv/SKILL.md +260 -0
- package/bin/skills/rwkv/references/architecture-details.md +344 -0
- package/bin/skills/rwkv/references/rwkv7.md +386 -0
- package/bin/skills/rwkv/references/state-management.md +369 -0
- package/bin/skills/saelens/SKILL.md +386 -0
- package/bin/skills/saelens/references/README.md +70 -0
- package/bin/skills/saelens/references/api.md +333 -0
- package/bin/skills/saelens/references/tutorials.md +318 -0
- package/bin/skills/segment-anything/SKILL.md +500 -0
- package/bin/skills/segment-anything/references/advanced-usage.md +589 -0
- package/bin/skills/segment-anything/references/troubleshooting.md +484 -0
- package/bin/skills/sentence-transformers/SKILL.md +255 -0
- package/bin/skills/sentence-transformers/references/models.md +123 -0
- package/bin/skills/sentencepiece/SKILL.md +235 -0
- package/bin/skills/sentencepiece/references/algorithms.md +200 -0
- package/bin/skills/sentencepiece/references/training.md +304 -0
- package/bin/skills/sglang/SKILL.md +442 -0
- package/bin/skills/sglang/references/deployment.md +490 -0
- package/bin/skills/sglang/references/radix-attention.md +413 -0
- package/bin/skills/sglang/references/structured-generation.md +541 -0
- package/bin/skills/simpo/SKILL.md +219 -0
- package/bin/skills/simpo/references/datasets.md +478 -0
- package/bin/skills/simpo/references/hyperparameters.md +452 -0
- package/bin/skills/simpo/references/loss-functions.md +350 -0
- package/bin/skills/skypilot/SKILL.md +509 -0
- package/bin/skills/skypilot/references/advanced-usage.md +491 -0
- package/bin/skills/skypilot/references/troubleshooting.md +570 -0
- package/bin/skills/slime/SKILL.md +464 -0
- package/bin/skills/slime/references/api-reference.md +392 -0
- package/bin/skills/slime/references/troubleshooting.md +386 -0
- package/bin/skills/speculative-decoding/SKILL.md +467 -0
- package/bin/skills/speculative-decoding/references/lookahead.md +309 -0
- package/bin/skills/speculative-decoding/references/medusa.md +350 -0
- package/bin/skills/stable-diffusion/SKILL.md +519 -0
- package/bin/skills/stable-diffusion/references/advanced-usage.md +716 -0
- package/bin/skills/stable-diffusion/references/troubleshooting.md +555 -0
- package/bin/skills/tensorboard/SKILL.md +629 -0
- package/bin/skills/tensorboard/references/integrations.md +638 -0
- package/bin/skills/tensorboard/references/profiling.md +545 -0
- package/bin/skills/tensorboard/references/visualization.md +620 -0
- package/bin/skills/tensorrt-llm/SKILL.md +187 -0
- package/bin/skills/tensorrt-llm/references/multi-gpu.md +298 -0
- package/bin/skills/tensorrt-llm/references/optimization.md +242 -0
- package/bin/skills/tensorrt-llm/references/serving.md +470 -0
- package/bin/skills/tinker/SKILL.md +362 -0
- package/bin/skills/tinker/references/api-reference.md +168 -0
- package/bin/skills/tinker/references/getting-started.md +157 -0
- package/bin/skills/tinker/references/loss-functions.md +163 -0
- package/bin/skills/tinker/references/models-and-lora.md +139 -0
- package/bin/skills/tinker/references/recipes.md +280 -0
- package/bin/skills/tinker/references/reinforcement-learning.md +212 -0
- package/bin/skills/tinker/references/rendering.md +243 -0
- package/bin/skills/tinker/references/supervised-learning.md +232 -0
- package/bin/skills/tinker-training-cost/SKILL.md +187 -0
- package/bin/skills/tinker-training-cost/scripts/calculate_cost.py +123 -0
- package/bin/skills/torchforge/SKILL.md +433 -0
- package/bin/skills/torchforge/references/api-reference.md +327 -0
- package/bin/skills/torchforge/references/troubleshooting.md +409 -0
- package/bin/skills/torchtitan/SKILL.md +358 -0
- package/bin/skills/torchtitan/references/checkpoint.md +181 -0
- package/bin/skills/torchtitan/references/custom-models.md +258 -0
- package/bin/skills/torchtitan/references/float8.md +133 -0
- package/bin/skills/torchtitan/references/fsdp.md +126 -0
- package/bin/skills/transformer-lens/SKILL.md +346 -0
- package/bin/skills/transformer-lens/references/README.md +54 -0
- package/bin/skills/transformer-lens/references/api.md +362 -0
- package/bin/skills/transformer-lens/references/tutorials.md +339 -0
- package/bin/skills/trl-fine-tuning/SKILL.md +455 -0
- package/bin/skills/trl-fine-tuning/references/dpo-variants.md +227 -0
- package/bin/skills/trl-fine-tuning/references/online-rl.md +82 -0
- package/bin/skills/trl-fine-tuning/references/reward-modeling.md +122 -0
- package/bin/skills/trl-fine-tuning/references/sft-training.md +168 -0
- package/bin/skills/unsloth/SKILL.md +80 -0
- package/bin/skills/unsloth/references/index.md +7 -0
- package/bin/skills/unsloth/references/llms-full.md +16799 -0
- package/bin/skills/unsloth/references/llms-txt.md +12044 -0
- package/bin/skills/unsloth/references/llms.md +82 -0
- package/bin/skills/verl/SKILL.md +391 -0
- package/bin/skills/verl/references/api-reference.md +301 -0
- package/bin/skills/verl/references/troubleshooting.md +391 -0
- package/bin/skills/vllm/SKILL.md +364 -0
- package/bin/skills/vllm/references/optimization.md +226 -0
- package/bin/skills/vllm/references/quantization.md +284 -0
- package/bin/skills/vllm/references/server-deployment.md +255 -0
- package/bin/skills/vllm/references/troubleshooting.md +447 -0
- package/bin/skills/weights-and-biases/SKILL.md +590 -0
- package/bin/skills/weights-and-biases/references/artifacts.md +584 -0
- package/bin/skills/weights-and-biases/references/integrations.md +700 -0
- package/bin/skills/weights-and-biases/references/sweeps.md +847 -0
- package/bin/skills/whisper/SKILL.md +317 -0
- package/bin/skills/whisper/references/languages.md +189 -0
- package/bin/synsc +0 -0
- package/package.json +10 -0
--- /dev/null
+++ package/bin/skills/hugging-face-datasets/scripts/sql_manager.py
@@ -0,0 +1,844 @@
+#!/usr/bin/env -S uv run
+# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+#     "duckdb>=1.0.0",
+#     "huggingface_hub>=0.20.0",
+#     "datasets>=2.14.0",
+#     "pandas>=2.0.0",
+# ]
+# ///
+"""
+Hugging Face Dataset SQL Manager
+
+Query, transform, and push Hugging Face datasets using DuckDB's SQL interface.
+Supports the hf:// protocol for direct dataset access, data wrangling, and
+pushing results back to the Hub.
+
+Version: 1.0.0
+
+Usage:
+    # Query a dataset
+    uv run sql_manager.py query --dataset "cais/mmlu" --sql "SELECT * FROM data LIMIT 10"
+
+    # Query and push to new dataset
+    uv run sql_manager.py query --dataset "cais/mmlu" --sql "SELECT * FROM data WHERE subject='nutrition'" \
+        --push-to "username/nutrition-subset"
+
+    # Describe dataset schema
+    uv run sql_manager.py describe --dataset "cais/mmlu"
+
+    # List available splits/configs
+    uv run sql_manager.py info --dataset "cais/mmlu"
+
+    # Get random sample
+    uv run sql_manager.py sample --dataset "cais/mmlu" --n 5
+
+    # Export to parquet
+    uv run sql_manager.py export --dataset "cais/mmlu" --output "data.parquet"
+"""
+
+import os
+import json
+import argparse
+from typing import Optional, List, Dict, Any, Union
+
+import duckdb
+from huggingface_hub import HfApi
+
+
+# Configuration
+HF_TOKEN = os.environ.get("HF_TOKEN")
+
+
+class HFDatasetSQL:
+    """
+    Query Hugging Face datasets using DuckDB SQL.
+
+    Examples:
+        >>> sql = HFDatasetSQL()
+        >>> results = sql.query("cais/mmlu", "SELECT * FROM data LIMIT 5")
+        >>> schema = sql.describe("cais/mmlu")
+        >>> sql.query_and_push("cais/mmlu", "SELECT * FROM data WHERE subject='nutrition'", "user/nutrition-qa")
+    """
+
+    def __init__(self, token: Optional[str] = None):
+        """Initialize the SQL manager with optional HF token."""
+        self.token = token or HF_TOKEN
+        self.conn = duckdb.connect()
+        self._setup_connection()
+
+    def _setup_connection(self):
+        """Configure DuckDB connection for HF access."""
+        # Set HF token if available (for private datasets)
+        if self.token:
+            self.conn.execute(f"CREATE SECRET hf_token (TYPE HUGGINGFACE, TOKEN '{self.token}');")
+
+    def _build_hf_path(
+        self, dataset_id: str, split: str = "*", config: Optional[str] = None, revision: str = "~parquet"
+    ) -> str:
+        """
+        Build the hf:// path for a dataset.
+
+        Args:
+            dataset_id: Dataset ID (e.g., "cais/mmlu")
+            split: Split name or "*" for all splits
+            config: Optional config/subset name
+            revision: Revision, defaults to ~parquet for auto-converted parquet
+
+        Returns:
+            hf:// path string
+        """
+        if config:
+            return f"hf://datasets/{dataset_id}@{revision}/{config}/{split}/*.parquet"
+        else:
+            return f"hf://datasets/{dataset_id}@{revision}/default/{split}/*.parquet"
+
+    def _build_hf_path_flexible(
+        self,
+        dataset_id: str,
+        split: Optional[str] = None,
+        config: Optional[str] = None,
+    ) -> str:
+        """
+        Build flexible hf:// path with wildcards for discovery.
+
+        Args:
+            dataset_id: Dataset ID
+            split: Optional specific split
+            config: Optional config name
+
+        Returns:
+            hf:// path with appropriate wildcards
+        """
+        base = f"hf://datasets/{dataset_id}@~parquet"
+
+        if config and split:
+            return f"{base}/{config}/{split}/*.parquet"
+        elif config:
+            return f"{base}/{config}/*/*.parquet"
+        elif split:
+            return f"{base}/*/{split}/*.parquet"
+        else:
+            return f"{base}/*/*/*.parquet"
+
+    def query(
+        self,
+        dataset_id: str,
+        sql: str,
+        split: str = "train",
+        config: Optional[str] = None,
+        limit: Optional[int] = None,
+        output_format: str = "dict",
+    ) -> Union[List[Dict], Any]:
+        """
+        Execute SQL query on a Hugging Face dataset.
+
+        Args:
+            dataset_id: Dataset ID (e.g., "cais/mmlu", "ibm/duorc")
+            sql: SQL query. Use 'data' as table name (will be replaced with actual path)
+            split: Dataset split (train, test, validation, or * for all)
+            config: Optional dataset config/subset
+            limit: Optional limit override
+            output_format: Output format - "dict", "df" (pandas), "arrow", "raw"
+
+        Returns:
+            Query results in specified format
+
+        Examples:
+            >>> sql.query("cais/mmlu", "SELECT * FROM data WHERE subject='nutrition' LIMIT 10")
+            >>> sql.query("cais/mmlu", "SELECT subject, COUNT(*) as cnt FROM data GROUP BY subject")
+        """
+        # Build the HF path
+        hf_path = self._build_hf_path(dataset_id, split=split, config=config)
+
+        # Replace 'data' placeholder with actual path
+        # Handle various SQL patterns
+        processed_sql = sql.replace("FROM data", f"FROM '{hf_path}'")
+        processed_sql = processed_sql.replace("from data", f"FROM '{hf_path}'")
+        processed_sql = processed_sql.replace("JOIN data", f"JOIN '{hf_path}'")
+        processed_sql = processed_sql.replace("join data", f"JOIN '{hf_path}'")
+
+        # If user provides raw path, use as-is
+        if "hf://" in sql:
+            processed_sql = sql
+
+        # Apply limit if specified and not already in query
+        if limit and "LIMIT" not in processed_sql.upper():
+            processed_sql += f" LIMIT {limit}"
+
+        try:
+            result = self.conn.execute(processed_sql)
+
+            if output_format == "df":
+                return result.fetchdf()
+            elif output_format == "arrow":
+                return result.fetch_arrow_table()
+            elif output_format == "raw":
+                return result.fetchall()
+            else:  # dict
+                columns = [desc[0] for desc in result.description]
+                rows = result.fetchall()
+                return [dict(zip(columns, row)) for row in rows]
+
+        except Exception as e:
+            print(f"❌ Query error: {e}")
+            print(f"   SQL: {processed_sql[:200]}...")
+            raise
+
+    def query_raw(self, sql: str, output_format: str = "dict") -> Union[List[Dict], Any]:
+        """
+        Execute raw SQL query without path substitution.
+
+        Useful for queries that already contain full hf:// paths or for
+        multi-dataset queries.
+
+        Args:
+            sql: Complete SQL query
+            output_format: Output format
+
+        Returns:
+            Query results
+        """
+        result = self.conn.execute(sql)
+
+        if output_format == "df":
+            return result.fetchdf()
+        elif output_format == "arrow":
+            return result.fetch_arrow_table()
+        elif output_format == "raw":
+            return result.fetchall()
+        else:
+            columns = [desc[0] for desc in result.description]
+            rows = result.fetchall()
+            return [dict(zip(columns, row)) for row in rows]
+
+    def describe(self, dataset_id: str, split: str = "train", config: Optional[str] = None) -> List[Dict[str, str]]:
+        """
+        Get schema/structure of a dataset.
+
+        Args:
+            dataset_id: Dataset ID
+            split: Dataset split
+            config: Optional config
+
+        Returns:
+            List of column definitions with name, type, nullable info
+        """
+        hf_path = self._build_hf_path(dataset_id, split=split, config=config)
+
+        sql = f"DESCRIBE SELECT * FROM '{hf_path}' LIMIT 1"
+        result = self.conn.execute(sql)
+
+        columns = [desc[0] for desc in result.description]
+        rows = result.fetchall()
+
+        return [dict(zip(columns, row)) for row in rows]
+
+    def sample(
+        self,
+        dataset_id: str,
+        n: int = 10,
+        split: str = "train",
+        config: Optional[str] = None,
+        seed: Optional[int] = None,
+    ) -> List[Dict]:
+        """
+        Get a random sample from a dataset.
+
+        Args:
+            dataset_id: Dataset ID
+            n: Number of samples
+            split: Dataset split
+            config: Optional config
+            seed: Random seed for reproducibility
+
+        Returns:
+            List of sampled rows
+        """
+        hf_path = self._build_hf_path(dataset_id, split=split, config=config)
+
+        if seed is not None:
+            sql = f"SELECT * FROM '{hf_path}' USING SAMPLE {n} (RESERVOIR, {seed})"
+        else:
+            sql = f"SELECT * FROM '{hf_path}' USING SAMPLE {n}"
+
+        return self.query_raw(sql)
+
+    def count(
+        self, dataset_id: str, split: str = "train", config: Optional[str] = None, where: Optional[str] = None
+    ) -> int:
+        """
+        Count rows in a dataset, optionally with filter.
+
+        Args:
+            dataset_id: Dataset ID
+            split: Dataset split
+            config: Optional config
+            where: Optional WHERE clause (without WHERE keyword)
+
+        Returns:
+            Row count
+        """
+        hf_path = self._build_hf_path(dataset_id, split=split, config=config)
+
+        sql = f"SELECT COUNT(*) FROM '{hf_path}'"
+        if where:
+            sql += f" WHERE {where}"
+
+        result = self.conn.execute(sql).fetchone()
+        return result[0] if result else 0
+
+    def unique_values(
+        self, dataset_id: str, column: str, split: str = "train", config: Optional[str] = None, limit: int = 100
+    ) -> List[Any]:
+        """
+        Get unique values in a column.
+
+        Args:
+            dataset_id: Dataset ID
+            column: Column name
+            split: Dataset split
+            config: Optional config
+            limit: Max unique values to return
+
+        Returns:
+            List of unique values
+        """
+        hf_path = self._build_hf_path(dataset_id, split=split, config=config)
+
+        sql = f"SELECT DISTINCT {column} FROM '{hf_path}' LIMIT {limit}"
+        result = self.conn.execute(sql).fetchall()
+
+        return [row[0] for row in result]
+
+    def histogram(
+        self, dataset_id: str, column: str, split: str = "train", config: Optional[str] = None, bins: int = 10
+    ) -> List[Dict]:
+        """
+        Get value distribution/histogram for a column.
+
+        Args:
+            dataset_id: Dataset ID
+            column: Column name
+            split: Dataset split
+            config: Optional config
+            bins: Number of bins for numeric columns
+
+        Returns:
+            Distribution data
+        """
+        hf_path = self._build_hf_path(dataset_id, split=split, config=config)
+
+        sql = f"""
+            SELECT
+                {column},
+                COUNT(*) as count
+            FROM '{hf_path}'
+            GROUP BY {column}
+            ORDER BY count DESC
+            LIMIT {bins}
+        """
+
+        return self.query_raw(sql)
+
+    def filter_and_transform(
+        self,
+        dataset_id: str,
+        select: str = "*",
+        where: Optional[str] = None,
+        group_by: Optional[str] = None,
+        order_by: Optional[str] = None,
+        split: str = "train",
+        config: Optional[str] = None,
+        limit: Optional[int] = None,
+    ) -> List[Dict]:
+        """
+        Filter and transform dataset with SQL clauses.
+
+        Args:
+            dataset_id: Dataset ID
+            select: SELECT clause (columns, expressions, aggregations)
+            where: WHERE clause (filter conditions)
+            group_by: GROUP BY clause
+            order_by: ORDER BY clause
+            split: Dataset split
+            config: Optional config
+            limit: Row limit
+
+        Returns:
+            Transformed data
+
+        Examples:
+            >>> sql.filter_and_transform(
+            ...     "cais/mmlu",
+            ...     select="subject, COUNT(*) as cnt",
+            ...     group_by="subject",
+            ...     order_by="cnt DESC",
+            ...     limit=10
+            ... )
+        """
+        hf_path = self._build_hf_path(dataset_id, split=split, config=config)
+
+        sql_parts = [f"SELECT {select}", f"FROM '{hf_path}'"]
+
+        if where:
+            sql_parts.append(f"WHERE {where}")
+        if group_by:
+            sql_parts.append(f"GROUP BY {group_by}")
+        if order_by:
+            sql_parts.append(f"ORDER BY {order_by}")
+        if limit:
+            sql_parts.append(f"LIMIT {limit}")
+
+        sql = " ".join(sql_parts)
+        return self.query_raw(sql)
+
+    def join_datasets(
+        self,
+        left_dataset: str,
+        right_dataset: str,
+        on: str,
+        select: str = "*",
+        join_type: str = "INNER",
+        left_split: str = "train",
+        right_split: str = "train",
+        left_config: Optional[str] = None,
+        right_config: Optional[str] = None,
+        limit: Optional[int] = None,
+    ) -> List[Dict]:
+        """
+        Join two datasets.
+
+        Args:
+            left_dataset: Left dataset ID
+            right_dataset: Right dataset ID
+            on: JOIN condition (e.g., "left.id = right.id")
+            select: SELECT clause
+            join_type: Type of join (INNER, LEFT, RIGHT, FULL)
+            left_split: Split for left dataset
+            right_split: Split for right dataset
+            left_config: Config for left dataset
+            right_config: Config for right dataset
+            limit: Row limit
+
+        Returns:
+            Joined data
+        """
+        left_path = self._build_hf_path(left_dataset, split=left_split, config=left_config)
+        right_path = self._build_hf_path(right_dataset, split=right_split, config=right_config)
+
+        sql = f"""
+            SELECT {select}
+            FROM '{left_path}' AS left_table
+            {join_type} JOIN '{right_path}' AS right_table
+            ON {on}
+        """
+
+        if limit:
+            sql += f" LIMIT {limit}"
+
+        return self.query_raw(sql)
+
+    def export_to_parquet(
+        self,
+        dataset_id: str,
+        output_path: str,
+        sql: Optional[str] = None,
+        split: str = "train",
+        config: Optional[str] = None,
+    ) -> str:
+        """
+        Export query results to a local Parquet file.
+
+        Args:
+            dataset_id: Source dataset ID
+            output_path: Local path for output Parquet file
+            sql: Optional SQL query (uses SELECT * if not provided)
+            split: Dataset split
+            config: Optional config
+
+        Returns:
+            Path to created file
+        """
+        hf_path = self._build_hf_path(dataset_id, split=split, config=config)
+
+        if sql:
+            # Process the query
+            processed_sql = sql.replace("FROM data", f"FROM '{hf_path}'")
+            processed_sql = processed_sql.replace("from data", f"FROM '{hf_path}'")
+        else:
+            processed_sql = f"SELECT * FROM '{hf_path}'"
+
+        export_sql = f"COPY ({processed_sql}) TO '{output_path}' (FORMAT PARQUET)"
+        self.conn.execute(export_sql)
+
+        print(f"✅ Exported to {output_path}")
+        return output_path
+
+    def export_to_jsonl(
+        self,
+        dataset_id: str,
+        output_path: str,
+        sql: Optional[str] = None,
+        split: str = "train",
+        config: Optional[str] = None,
+    ) -> str:
+        """
+        Export query results to JSONL format.
+
+        Args:
+            dataset_id: Source dataset ID
+            output_path: Local path for output JSONL file
+            sql: Optional SQL query
+            split: Dataset split
+            config: Optional config
+
+        Returns:
+            Path to created file
+        """
+        results = self.query(dataset_id, sql or "SELECT * FROM data", split=split, config=config)
+
+        with open(output_path, "w") as f:
+            for row in results:
+                f.write(json.dumps(row) + "\n")
+
+        print(f"✅ Exported {len(results)} rows to {output_path}")
+        return output_path
+
+    def push_to_hub(
+        self,
+        dataset_id: str,
+        target_repo: str,
+        sql: Optional[str] = None,
+        split: str = "train",
+        config: Optional[str] = None,
+        target_split: str = "train",
+        private: bool = True,
+        commit_message: Optional[str] = None,
+    ) -> str:
+        """
+        Query a dataset and push results to a new Hub repository.
+
+        Args:
+            dataset_id: Source dataset ID
+            target_repo: Target repository ID (e.g., "username/new-dataset")
+            sql: SQL query to transform data (optional, defaults to SELECT *)
+            split: Source split
+            config: Source config
+            target_split: Target split name
+            private: Whether to create private repo
+            commit_message: Commit message
+
+        Returns:
+            URL of created dataset
+        """
+        try:
+            from datasets import Dataset
+        except ImportError:
+            raise ImportError("datasets library required for push_to_hub. Install with: pip install datasets")
+
+        # Execute query
+        results = self.query(dataset_id, sql or "SELECT * FROM data", split=split, config=config)
+
+        if not results:
+            print("❌ No results to push")
+            return ""
+
+        # Convert to HF Dataset
+        ds = Dataset.from_list(results)
+
+        # Push to Hub
+        ds.push_to_hub(
+            target_repo,
+            split=target_split,
+            private=private,
+            commit_message=commit_message or f"Created from {dataset_id} via SQL query",
+            token=self.token,
+        )
+
+        url = f"https://huggingface.co/datasets/{target_repo}"
+        print(f"✅ Pushed {len(results)} rows to {url}")
+        return url
+
+    def create_view(self, name: str, dataset_id: str, split: str = "train", config: Optional[str] = None):
+        """
+        Create a DuckDB view for easier querying.
+
+        Args:
+            name: View name
+            dataset_id: Dataset ID
+            split: Dataset split
+            config: Optional config
+        """
+        hf_path = self._build_hf_path(dataset_id, split=split, config=config)
+        self.conn.execute(f"CREATE OR REPLACE VIEW {name} AS SELECT * FROM '{hf_path}'")
+        print(f"✅ Created view '{name}' for {dataset_id}")
+
+    def info(self, dataset_id: str) -> Dict[str, Any]:
+        """
+        Get information about a dataset including available configs and splits.
+
+        Args:
+            dataset_id: Dataset ID
+
+        Returns:
+            Dataset information
+        """
+        api = HfApi(token=self.token)
+
+        try:
+            info = api.dataset_info(dataset_id)
+
+            result = {
+                "id": info.id,
+                "author": info.author,
+                "private": info.private,
+                "downloads": info.downloads,
+                "likes": info.likes,
+                "tags": info.tags,
+                "created_at": str(info.created_at) if info.created_at else None,
+                "last_modified": str(info.last_modified) if info.last_modified else None,
+            }
+
+            # Try to get config/split info from card data
+            if info.card_data:
+                result["configs"] = getattr(info.card_data, "configs", None)
+
+            return result
+
+        except Exception as e:
+            print(f"❌ Failed to get info: {e}")
+            return {}
+
+    def close(self):
+        """Close the database connection."""
+        self.conn.close()
+
+
+def main():
+    """CLI entry point."""
+    parser = argparse.ArgumentParser(
+        description="Query Hugging Face datasets with SQL",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog="""
+Examples:
+  # Query dataset with SQL
+  python sql_manager.py query --dataset "cais/mmlu" --sql "SELECT * FROM data WHERE subject='nutrition' LIMIT 10"
+
+  # Get random sample
+  python sql_manager.py sample --dataset "cais/mmlu" --n 5
+
+  # Describe schema
+  python sql_manager.py describe --dataset "cais/mmlu"
+
+  # Get value counts
+  python sql_manager.py histogram --dataset "cais/mmlu" --column "subject"
+
+  # Filter and transform
+  python sql_manager.py transform --dataset "cais/mmlu" \\
+      --select "subject, COUNT(*) as cnt" \\
+      --group-by "subject" \\
+      --order-by "cnt DESC"
+
+  # Query and push to Hub
+  python sql_manager.py query --dataset "cais/mmlu" \\
+      --sql "SELECT * FROM data WHERE subject='nutrition'" \\
+      --push-to "username/nutrition-subset"
+
+  # Export to Parquet
+  python sql_manager.py export --dataset "cais/mmlu" \\
+      --sql "SELECT * FROM data WHERE subject='nutrition'" \\
+      --output "nutrition.parquet"
+        """,
+    )
+
+    subparsers = parser.add_subparsers(dest="command", required=True)
+
+    # Common arguments
+    def add_common_args(p):
+        p.add_argument("--dataset", "-d", required=True, help="Dataset ID (e.g., cais/mmlu)")
+        p.add_argument("--split", "-s", default="train", help="Dataset split (default: train)")
+        p.add_argument("--config", "-c", help="Dataset config/subset")
+
+    # Query command
+    query_parser = subparsers.add_parser("query", help="Execute SQL query on dataset")
+    add_common_args(query_parser)
+    query_parser.add_argument("--sql", required=True, help="SQL query (use 'data' as table name)")
+    query_parser.add_argument("--limit", "-l", type=int, help="Limit results")
+    query_parser.add_argument("--format", choices=["json", "table", "csv"], default="json", help="Output format")
+    query_parser.add_argument("--push-to", help="Push results to this Hub repo")
+    query_parser.add_argument("--private", action="store_true", help="Make pushed repo private")
+
+    # Sample command
+    sample_parser = subparsers.add_parser("sample", help="Get random sample from dataset")
+    add_common_args(sample_parser)
+    sample_parser.add_argument("--n", type=int, default=10, help="Number of samples")
+    sample_parser.add_argument("--seed", type=int, help="Random seed")
+
+    # Describe command
+    describe_parser = subparsers.add_parser("describe", help="Get dataset schema")
+    add_common_args(describe_parser)
+
+    # Count command
+    count_parser = subparsers.add_parser("count", help="Count rows in dataset")
+    add_common_args(count_parser)
+    count_parser.add_argument("--where", "-w", help="WHERE clause for filtering")
+
+    # Histogram command
+    histogram_parser = subparsers.add_parser("histogram", help="Get value distribution")
+    add_common_args(histogram_parser)
+    histogram_parser.add_argument("--column", required=True, help="Column name")
+    histogram_parser.add_argument("--bins", type=int, default=20, help="Number of bins")
+
+    # Unique command
+    unique_parser = subparsers.add_parser("unique", help="Get unique values in column")
+    add_common_args(unique_parser)
+    unique_parser.add_argument("--column", required=True, help="Column name")
+    unique_parser.add_argument("--limit", "-l", type=int, default=100, help="Max values")
+
+    # Transform command
+    transform_parser = subparsers.add_parser("transform", help="Filter and transform dataset")
+    add_common_args(transform_parser)
+    transform_parser.add_argument("--select", default="*", help="SELECT clause")
+    transform_parser.add_argument("--where", "-w", help="WHERE clause")
+    transform_parser.add_argument("--group-by", help="GROUP BY clause")
+    transform_parser.add_argument("--order-by", help="ORDER BY clause")
+    transform_parser.add_argument("--limit", "-l", type=int, help="LIMIT")
+    transform_parser.add_argument("--push-to", help="Push results to Hub repo")
+
+    # Export command
+    export_parser = subparsers.add_parser("export", help="Export query results to file")
+    add_common_args(export_parser)
+    export_parser.add_argument("--sql", help="SQL query (defaults to SELECT *)")
+    export_parser.add_argument("--output", "-o", required=True, help="Output file path")
+    export_parser.add_argument("--format", choices=["parquet", "jsonl"], default="parquet", help="Output format")
+
+    # Info command
+    info_parser = subparsers.add_parser("info", help="Get dataset information")
+    info_parser.add_argument("--dataset", "-d", required=True, help="Dataset ID")
+
+    # Raw SQL command
+    raw_parser = subparsers.add_parser("raw", help="Execute raw SQL with full hf:// paths")
+    raw_parser.add_argument("--sql", required=True, help="Complete SQL query")
+    raw_parser.add_argument("--format", choices=["json", "table", "csv"], default="json", help="Output format")
+
+    args = parser.parse_args()
+
+    # Initialize SQL manager
+    sql = HFDatasetSQL()
+
+    try:
+        if args.command == "query":
+            results = sql.query(args.dataset, args.sql, split=args.split, config=args.config, limit=args.limit)
+
+            if getattr(args, "push_to", None):
+                sql.push_to_hub(
+                    args.dataset, args.push_to, sql=args.sql, split=args.split, config=args.config, private=args.private
+                )
+            else:
+                _print_results(results, args.format)
+
+        elif args.command == "sample":
+            results = sql.sample(args.dataset, n=args.n, split=args.split, config=args.config, seed=args.seed)
+            _print_results(results, "json")
+
+        elif args.command == "describe":
+            schema = sql.describe(args.dataset, split=args.split, config=args.config)
+            _print_results(schema, "table")
+
+        elif args.command == "count":
+            count = sql.count(args.dataset, split=args.split, config=args.config, where=args.where)
+            print(f"Count: {count:,}")
+
+        elif args.command == "histogram":
+            results = sql.histogram(args.dataset, args.column, split=args.split, config=args.config, bins=args.bins)
+            _print_results(results, "table")
+
+        elif args.command == "unique":
+            values = sql.unique_values(
+                args.dataset, args.column, split=args.split, config=args.config, limit=args.limit
+            )
+            for v in values:
+                print(v)
+
+        elif args.command == "transform":
+            results = sql.filter_and_transform(
+                args.dataset,
+                select=args.select,
+                where=args.where,
+                group_by=args.group_by,
+                order_by=args.order_by,
+                split=args.split,
+                config=args.config,
+                limit=args.limit,
+            )
+
+            if getattr(args, "push_to", None):
+                # Build SQL for push
+                query_sql = f"SELECT {args.select} FROM data"
+                if args.where:
+                    query_sql += f" WHERE {args.where}"
+                if args.group_by:
+                    query_sql += f" GROUP BY {args.group_by}"
+                if args.order_by:
+                    query_sql += f" ORDER BY {args.order_by}"
+                if args.limit:
+                    query_sql += f" LIMIT {args.limit}"
+
+                sql.push_to_hub(args.dataset, args.push_to, sql=query_sql, split=args.split, config=args.config)
+            else:
+                _print_results(results, "json")
+
+        elif args.command == "export":
+            if args.format == "parquet":
+                sql.export_to_parquet(args.dataset, args.output, sql=args.sql, split=args.split, config=args.config)
+            else:
+                sql.export_to_jsonl(args.dataset, args.output, sql=args.sql, split=args.split, config=args.config)
+
+        elif args.command == "info":
+            info = sql.info(args.dataset)
+            _print_results([info], "json")
+
+        elif args.command == "raw":
+            results = sql.query_raw(args.sql)
+            _print_results(results, args.format)
+
+    finally:
+        sql.close()
+
+
+def _print_results(results: List[Dict], format: str):
+    """Print results in specified format."""
+    if not results:
+        print("No results")
+        return
+
+    if format == "json":
+        print(json.dumps(results, indent=2, default=str))
+
+    elif format == "csv":
+        if results:
+            keys = results[0].keys()
+            print(",".join(str(k) for k in keys))
+            for row in results:
+                print(",".join(str(row.get(k, "")) for k in keys))
+
+    elif format == "table":
+        if results:
+            keys = list(results[0].keys())
+            # Calculate column widths
+            widths = {k: max(len(str(k)), max(len(str(r.get(k, ""))) for r in results)) for k in keys}
+
+            # Header
+            header = " | ".join(str(k).ljust(widths[k]) for k in keys)
+            print(header)
+            print("-" * len(header))
+
+            # Rows
+            for row in results:
+                print(" | ".join(str(row.get(k, "")).ljust(widths[k]) for k in keys))
+
+
+if __name__ == "__main__":
+    main()