@synsci/cli-darwin-x64 1.1.49
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/skills/accelerate/SKILL.md +332 -0
- package/bin/skills/accelerate/references/custom-plugins.md +453 -0
- package/bin/skills/accelerate/references/megatron-integration.md +489 -0
- package/bin/skills/accelerate/references/performance.md +525 -0
- package/bin/skills/audiocraft/SKILL.md +564 -0
- package/bin/skills/audiocraft/references/advanced-usage.md +666 -0
- package/bin/skills/audiocraft/references/troubleshooting.md +504 -0
- package/bin/skills/autogpt/SKILL.md +403 -0
- package/bin/skills/autogpt/references/advanced-usage.md +535 -0
- package/bin/skills/autogpt/references/troubleshooting.md +420 -0
- package/bin/skills/awq/SKILL.md +310 -0
- package/bin/skills/awq/references/advanced-usage.md +324 -0
- package/bin/skills/awq/references/troubleshooting.md +344 -0
- package/bin/skills/axolotl/SKILL.md +158 -0
- package/bin/skills/axolotl/references/api.md +5548 -0
- package/bin/skills/axolotl/references/dataset-formats.md +1029 -0
- package/bin/skills/axolotl/references/index.md +15 -0
- package/bin/skills/axolotl/references/other.md +3563 -0
- package/bin/skills/bigcode-evaluation-harness/SKILL.md +405 -0
- package/bin/skills/bigcode-evaluation-harness/references/benchmarks.md +393 -0
- package/bin/skills/bigcode-evaluation-harness/references/custom-tasks.md +424 -0
- package/bin/skills/bigcode-evaluation-harness/references/issues.md +394 -0
- package/bin/skills/bitsandbytes/SKILL.md +411 -0
- package/bin/skills/bitsandbytes/references/memory-optimization.md +521 -0
- package/bin/skills/bitsandbytes/references/qlora-training.md +521 -0
- package/bin/skills/bitsandbytes/references/quantization-formats.md +447 -0
- package/bin/skills/blip-2/SKILL.md +564 -0
- package/bin/skills/blip-2/references/advanced-usage.md +680 -0
- package/bin/skills/blip-2/references/troubleshooting.md +526 -0
- package/bin/skills/chroma/SKILL.md +406 -0
- package/bin/skills/chroma/references/integration.md +38 -0
- package/bin/skills/clip/SKILL.md +253 -0
- package/bin/skills/clip/references/applications.md +207 -0
- package/bin/skills/constitutional-ai/SKILL.md +290 -0
- package/bin/skills/crewai/SKILL.md +498 -0
- package/bin/skills/crewai/references/flows.md +438 -0
- package/bin/skills/crewai/references/tools.md +429 -0
- package/bin/skills/crewai/references/troubleshooting.md +480 -0
- package/bin/skills/deepspeed/SKILL.md +141 -0
- package/bin/skills/deepspeed/references/08.md +17 -0
- package/bin/skills/deepspeed/references/09.md +173 -0
- package/bin/skills/deepspeed/references/2020.md +378 -0
- package/bin/skills/deepspeed/references/2023.md +279 -0
- package/bin/skills/deepspeed/references/assets.md +179 -0
- package/bin/skills/deepspeed/references/index.md +35 -0
- package/bin/skills/deepspeed/references/mii.md +118 -0
- package/bin/skills/deepspeed/references/other.md +1191 -0
- package/bin/skills/deepspeed/references/tutorials.md +6554 -0
- package/bin/skills/dspy/SKILL.md +590 -0
- package/bin/skills/dspy/references/examples.md +663 -0
- package/bin/skills/dspy/references/modules.md +475 -0
- package/bin/skills/dspy/references/optimizers.md +566 -0
- package/bin/skills/faiss/SKILL.md +221 -0
- package/bin/skills/faiss/references/index_types.md +280 -0
- package/bin/skills/flash-attention/SKILL.md +367 -0
- package/bin/skills/flash-attention/references/benchmarks.md +215 -0
- package/bin/skills/flash-attention/references/transformers-integration.md +293 -0
- package/bin/skills/gguf/SKILL.md +427 -0
- package/bin/skills/gguf/references/advanced-usage.md +504 -0
- package/bin/skills/gguf/references/troubleshooting.md +442 -0
- package/bin/skills/gptq/SKILL.md +450 -0
- package/bin/skills/gptq/references/calibration.md +337 -0
- package/bin/skills/gptq/references/integration.md +129 -0
- package/bin/skills/gptq/references/troubleshooting.md +95 -0
- package/bin/skills/grpo-rl-training/README.md +97 -0
- package/bin/skills/grpo-rl-training/SKILL.md +572 -0
- package/bin/skills/grpo-rl-training/examples/reward_functions_library.py +393 -0
- package/bin/skills/grpo-rl-training/templates/basic_grpo_training.py +228 -0
- package/bin/skills/guidance/SKILL.md +572 -0
- package/bin/skills/guidance/references/backends.md +554 -0
- package/bin/skills/guidance/references/constraints.md +674 -0
- package/bin/skills/guidance/references/examples.md +767 -0
- package/bin/skills/hqq/SKILL.md +445 -0
- package/bin/skills/hqq/references/advanced-usage.md +528 -0
- package/bin/skills/hqq/references/troubleshooting.md +503 -0
- package/bin/skills/hugging-face-cli/SKILL.md +191 -0
- package/bin/skills/hugging-face-cli/references/commands.md +954 -0
- package/bin/skills/hugging-face-cli/references/examples.md +374 -0
- package/bin/skills/hugging-face-datasets/SKILL.md +547 -0
- package/bin/skills/hugging-face-datasets/examples/diverse_training_examples.json +239 -0
- package/bin/skills/hugging-face-datasets/examples/system_prompt_template.txt +196 -0
- package/bin/skills/hugging-face-datasets/examples/training_examples.json +176 -0
- package/bin/skills/hugging-face-datasets/scripts/dataset_manager.py +522 -0
- package/bin/skills/hugging-face-datasets/scripts/sql_manager.py +844 -0
- package/bin/skills/hugging-face-datasets/templates/chat.json +55 -0
- package/bin/skills/hugging-face-datasets/templates/classification.json +62 -0
- package/bin/skills/hugging-face-datasets/templates/completion.json +51 -0
- package/bin/skills/hugging-face-datasets/templates/custom.json +75 -0
- package/bin/skills/hugging-face-datasets/templates/qa.json +54 -0
- package/bin/skills/hugging-face-datasets/templates/tabular.json +81 -0
- package/bin/skills/hugging-face-evaluation/SKILL.md +656 -0
- package/bin/skills/hugging-face-evaluation/examples/USAGE_EXAMPLES.md +382 -0
- package/bin/skills/hugging-face-evaluation/examples/artificial_analysis_to_hub.py +141 -0
- package/bin/skills/hugging-face-evaluation/examples/example_readme_tables.md +135 -0
- package/bin/skills/hugging-face-evaluation/examples/metric_mapping.json +50 -0
- package/bin/skills/hugging-face-evaluation/requirements.txt +20 -0
- package/bin/skills/hugging-face-evaluation/scripts/evaluation_manager.py +1374 -0
- package/bin/skills/hugging-face-evaluation/scripts/inspect_eval_uv.py +104 -0
- package/bin/skills/hugging-face-evaluation/scripts/inspect_vllm_uv.py +317 -0
- package/bin/skills/hugging-face-evaluation/scripts/lighteval_vllm_uv.py +303 -0
- package/bin/skills/hugging-face-evaluation/scripts/run_eval_job.py +98 -0
- package/bin/skills/hugging-face-evaluation/scripts/run_vllm_eval_job.py +331 -0
- package/bin/skills/hugging-face-evaluation/scripts/test_extraction.py +206 -0
- package/bin/skills/hugging-face-jobs/SKILL.md +1041 -0
- package/bin/skills/hugging-face-jobs/index.html +216 -0
- package/bin/skills/hugging-face-jobs/references/hardware_guide.md +336 -0
- package/bin/skills/hugging-face-jobs/references/hub_saving.md +352 -0
- package/bin/skills/hugging-face-jobs/references/token_usage.md +546 -0
- package/bin/skills/hugging-face-jobs/references/troubleshooting.md +475 -0
- package/bin/skills/hugging-face-jobs/scripts/cot-self-instruct.py +718 -0
- package/bin/skills/hugging-face-jobs/scripts/finepdfs-stats.py +546 -0
- package/bin/skills/hugging-face-jobs/scripts/generate-responses.py +587 -0
- package/bin/skills/hugging-face-model-trainer/SKILL.md +711 -0
- package/bin/skills/hugging-face-model-trainer/references/gguf_conversion.md +296 -0
- package/bin/skills/hugging-face-model-trainer/references/hardware_guide.md +283 -0
- package/bin/skills/hugging-face-model-trainer/references/hub_saving.md +364 -0
- package/bin/skills/hugging-face-model-trainer/references/reliability_principles.md +371 -0
- package/bin/skills/hugging-face-model-trainer/references/trackio_guide.md +189 -0
- package/bin/skills/hugging-face-model-trainer/references/training_methods.md +150 -0
- package/bin/skills/hugging-face-model-trainer/references/training_patterns.md +203 -0
- package/bin/skills/hugging-face-model-trainer/references/troubleshooting.md +282 -0
- package/bin/skills/hugging-face-model-trainer/scripts/convert_to_gguf.py +424 -0
- package/bin/skills/hugging-face-model-trainer/scripts/dataset_inspector.py +417 -0
- package/bin/skills/hugging-face-model-trainer/scripts/estimate_cost.py +150 -0
- package/bin/skills/hugging-face-model-trainer/scripts/train_dpo_example.py +106 -0
- package/bin/skills/hugging-face-model-trainer/scripts/train_grpo_example.py +89 -0
- package/bin/skills/hugging-face-model-trainer/scripts/train_sft_example.py +122 -0
- package/bin/skills/hugging-face-paper-publisher/SKILL.md +627 -0
- package/bin/skills/hugging-face-paper-publisher/examples/example_usage.md +327 -0
- package/bin/skills/hugging-face-paper-publisher/references/quick_reference.md +216 -0
- package/bin/skills/hugging-face-paper-publisher/scripts/paper_manager.py +508 -0
- package/bin/skills/hugging-face-paper-publisher/templates/arxiv.md +299 -0
- package/bin/skills/hugging-face-paper-publisher/templates/ml-report.md +358 -0
- package/bin/skills/hugging-face-paper-publisher/templates/modern.md +319 -0
- package/bin/skills/hugging-face-paper-publisher/templates/standard.md +201 -0
- package/bin/skills/hugging-face-tool-builder/SKILL.md +115 -0
- package/bin/skills/hugging-face-tool-builder/references/baseline_hf_api.py +57 -0
- package/bin/skills/hugging-face-tool-builder/references/baseline_hf_api.sh +40 -0
- package/bin/skills/hugging-face-tool-builder/references/baseline_hf_api.tsx +57 -0
- package/bin/skills/hugging-face-tool-builder/references/find_models_by_paper.sh +230 -0
- package/bin/skills/hugging-face-tool-builder/references/hf_enrich_models.sh +96 -0
- package/bin/skills/hugging-face-tool-builder/references/hf_model_card_frontmatter.sh +188 -0
- package/bin/skills/hugging-face-tool-builder/references/hf_model_papers_auth.sh +171 -0
- package/bin/skills/hugging-face-trackio/SKILL.md +65 -0
- package/bin/skills/hugging-face-trackio/references/logging_metrics.md +206 -0
- package/bin/skills/hugging-face-trackio/references/retrieving_metrics.md +223 -0
- package/bin/skills/huggingface-tokenizers/SKILL.md +516 -0
- package/bin/skills/huggingface-tokenizers/references/algorithms.md +653 -0
- package/bin/skills/huggingface-tokenizers/references/integration.md +637 -0
- package/bin/skills/huggingface-tokenizers/references/pipeline.md +723 -0
- package/bin/skills/huggingface-tokenizers/references/training.md +565 -0
- package/bin/skills/instructor/SKILL.md +740 -0
- package/bin/skills/instructor/references/examples.md +107 -0
- package/bin/skills/instructor/references/providers.md +70 -0
- package/bin/skills/instructor/references/validation.md +606 -0
- package/bin/skills/knowledge-distillation/SKILL.md +458 -0
- package/bin/skills/knowledge-distillation/references/minillm.md +334 -0
- package/bin/skills/lambda-labs/SKILL.md +545 -0
- package/bin/skills/lambda-labs/references/advanced-usage.md +611 -0
- package/bin/skills/lambda-labs/references/troubleshooting.md +530 -0
- package/bin/skills/langchain/SKILL.md +480 -0
- package/bin/skills/langchain/references/agents.md +499 -0
- package/bin/skills/langchain/references/integration.md +562 -0
- package/bin/skills/langchain/references/rag.md +600 -0
- package/bin/skills/langsmith/SKILL.md +422 -0
- package/bin/skills/langsmith/references/advanced-usage.md +548 -0
- package/bin/skills/langsmith/references/troubleshooting.md +537 -0
- package/bin/skills/litgpt/SKILL.md +469 -0
- package/bin/skills/litgpt/references/custom-models.md +568 -0
- package/bin/skills/litgpt/references/distributed-training.md +451 -0
- package/bin/skills/litgpt/references/supported-models.md +336 -0
- package/bin/skills/litgpt/references/training-recipes.md +619 -0
- package/bin/skills/llama-cpp/SKILL.md +258 -0
- package/bin/skills/llama-cpp/references/optimization.md +89 -0
- package/bin/skills/llama-cpp/references/quantization.md +213 -0
- package/bin/skills/llama-cpp/references/server.md +125 -0
- package/bin/skills/llama-factory/SKILL.md +80 -0
- package/bin/skills/llama-factory/references/_images.md +23 -0
- package/bin/skills/llama-factory/references/advanced.md +1055 -0
- package/bin/skills/llama-factory/references/getting_started.md +349 -0
- package/bin/skills/llama-factory/references/index.md +19 -0
- package/bin/skills/llama-factory/references/other.md +31 -0
- package/bin/skills/llamaguard/SKILL.md +337 -0
- package/bin/skills/llamaindex/SKILL.md +569 -0
- package/bin/skills/llamaindex/references/agents.md +83 -0
- package/bin/skills/llamaindex/references/data_connectors.md +108 -0
- package/bin/skills/llamaindex/references/query_engines.md +406 -0
- package/bin/skills/llava/SKILL.md +304 -0
- package/bin/skills/llava/references/training.md +197 -0
- package/bin/skills/lm-evaluation-harness/SKILL.md +490 -0
- package/bin/skills/lm-evaluation-harness/references/api-evaluation.md +490 -0
- package/bin/skills/lm-evaluation-harness/references/benchmark-guide.md +488 -0
- package/bin/skills/lm-evaluation-harness/references/custom-tasks.md +602 -0
- package/bin/skills/lm-evaluation-harness/references/distributed-eval.md +519 -0
- package/bin/skills/long-context/SKILL.md +536 -0
- package/bin/skills/long-context/references/extension_methods.md +468 -0
- package/bin/skills/long-context/references/fine_tuning.md +611 -0
- package/bin/skills/long-context/references/rope.md +402 -0
- package/bin/skills/mamba/SKILL.md +260 -0
- package/bin/skills/mamba/references/architecture-details.md +206 -0
- package/bin/skills/mamba/references/benchmarks.md +255 -0
- package/bin/skills/mamba/references/training-guide.md +388 -0
- package/bin/skills/megatron-core/SKILL.md +366 -0
- package/bin/skills/megatron-core/references/benchmarks.md +249 -0
- package/bin/skills/megatron-core/references/parallelism-guide.md +404 -0
- package/bin/skills/megatron-core/references/production-examples.md +473 -0
- package/bin/skills/megatron-core/references/training-recipes.md +547 -0
- package/bin/skills/miles/SKILL.md +315 -0
- package/bin/skills/miles/references/api-reference.md +141 -0
- package/bin/skills/miles/references/troubleshooting.md +352 -0
- package/bin/skills/mlflow/SKILL.md +704 -0
- package/bin/skills/mlflow/references/deployment.md +744 -0
- package/bin/skills/mlflow/references/model-registry.md +770 -0
- package/bin/skills/mlflow/references/tracking.md +680 -0
- package/bin/skills/modal/SKILL.md +341 -0
- package/bin/skills/modal/references/advanced-usage.md +503 -0
- package/bin/skills/modal/references/troubleshooting.md +494 -0
- package/bin/skills/model-merging/SKILL.md +539 -0
- package/bin/skills/model-merging/references/evaluation.md +462 -0
- package/bin/skills/model-merging/references/examples.md +428 -0
- package/bin/skills/model-merging/references/methods.md +352 -0
- package/bin/skills/model-pruning/SKILL.md +495 -0
- package/bin/skills/model-pruning/references/wanda.md +347 -0
- package/bin/skills/moe-training/SKILL.md +526 -0
- package/bin/skills/moe-training/references/architectures.md +432 -0
- package/bin/skills/moe-training/references/inference.md +348 -0
- package/bin/skills/moe-training/references/training.md +425 -0
- package/bin/skills/nanogpt/SKILL.md +290 -0
- package/bin/skills/nanogpt/references/architecture.md +382 -0
- package/bin/skills/nanogpt/references/data.md +476 -0
- package/bin/skills/nanogpt/references/training.md +564 -0
- package/bin/skills/nemo-curator/SKILL.md +383 -0
- package/bin/skills/nemo-curator/references/deduplication.md +87 -0
- package/bin/skills/nemo-curator/references/filtering.md +102 -0
- package/bin/skills/nemo-evaluator/SKILL.md +494 -0
- package/bin/skills/nemo-evaluator/references/adapter-system.md +340 -0
- package/bin/skills/nemo-evaluator/references/configuration.md +447 -0
- package/bin/skills/nemo-evaluator/references/custom-benchmarks.md +315 -0
- package/bin/skills/nemo-evaluator/references/execution-backends.md +361 -0
- package/bin/skills/nemo-guardrails/SKILL.md +297 -0
- package/bin/skills/nnsight/SKILL.md +436 -0
- package/bin/skills/nnsight/references/README.md +78 -0
- package/bin/skills/nnsight/references/api.md +344 -0
- package/bin/skills/nnsight/references/tutorials.md +300 -0
- package/bin/skills/openrlhf/SKILL.md +249 -0
- package/bin/skills/openrlhf/references/algorithm-comparison.md +404 -0
- package/bin/skills/openrlhf/references/custom-rewards.md +530 -0
- package/bin/skills/openrlhf/references/hybrid-engine.md +287 -0
- package/bin/skills/openrlhf/references/multi-node-training.md +454 -0
- package/bin/skills/outlines/SKILL.md +652 -0
- package/bin/skills/outlines/references/backends.md +615 -0
- package/bin/skills/outlines/references/examples.md +773 -0
- package/bin/skills/outlines/references/json_generation.md +652 -0
- package/bin/skills/peft/SKILL.md +431 -0
- package/bin/skills/peft/references/advanced-usage.md +514 -0
- package/bin/skills/peft/references/troubleshooting.md +480 -0
- package/bin/skills/phoenix/SKILL.md +475 -0
- package/bin/skills/phoenix/references/advanced-usage.md +619 -0
- package/bin/skills/phoenix/references/troubleshooting.md +538 -0
- package/bin/skills/pinecone/SKILL.md +358 -0
- package/bin/skills/pinecone/references/deployment.md +181 -0
- package/bin/skills/pytorch-fsdp/SKILL.md +126 -0
- package/bin/skills/pytorch-fsdp/references/index.md +7 -0
- package/bin/skills/pytorch-fsdp/references/other.md +4249 -0
- package/bin/skills/pytorch-lightning/SKILL.md +346 -0
- package/bin/skills/pytorch-lightning/references/callbacks.md +436 -0
- package/bin/skills/pytorch-lightning/references/distributed.md +490 -0
- package/bin/skills/pytorch-lightning/references/hyperparameter-tuning.md +556 -0
- package/bin/skills/pyvene/SKILL.md +473 -0
- package/bin/skills/pyvene/references/README.md +73 -0
- package/bin/skills/pyvene/references/api.md +383 -0
- package/bin/skills/pyvene/references/tutorials.md +376 -0
- package/bin/skills/qdrant/SKILL.md +493 -0
- package/bin/skills/qdrant/references/advanced-usage.md +648 -0
- package/bin/skills/qdrant/references/troubleshooting.md +631 -0
- package/bin/skills/ray-data/SKILL.md +326 -0
- package/bin/skills/ray-data/references/integration.md +82 -0
- package/bin/skills/ray-data/references/transformations.md +83 -0
- package/bin/skills/ray-train/SKILL.md +406 -0
- package/bin/skills/ray-train/references/multi-node.md +628 -0
- package/bin/skills/rwkv/SKILL.md +260 -0
- package/bin/skills/rwkv/references/architecture-details.md +344 -0
- package/bin/skills/rwkv/references/rwkv7.md +386 -0
- package/bin/skills/rwkv/references/state-management.md +369 -0
- package/bin/skills/saelens/SKILL.md +386 -0
- package/bin/skills/saelens/references/README.md +70 -0
- package/bin/skills/saelens/references/api.md +333 -0
- package/bin/skills/saelens/references/tutorials.md +318 -0
- package/bin/skills/segment-anything/SKILL.md +500 -0
- package/bin/skills/segment-anything/references/advanced-usage.md +589 -0
- package/bin/skills/segment-anything/references/troubleshooting.md +484 -0
- package/bin/skills/sentence-transformers/SKILL.md +255 -0
- package/bin/skills/sentence-transformers/references/models.md +123 -0
- package/bin/skills/sentencepiece/SKILL.md +235 -0
- package/bin/skills/sentencepiece/references/algorithms.md +200 -0
- package/bin/skills/sentencepiece/references/training.md +304 -0
- package/bin/skills/sglang/SKILL.md +442 -0
- package/bin/skills/sglang/references/deployment.md +490 -0
- package/bin/skills/sglang/references/radix-attention.md +413 -0
- package/bin/skills/sglang/references/structured-generation.md +541 -0
- package/bin/skills/simpo/SKILL.md +219 -0
- package/bin/skills/simpo/references/datasets.md +478 -0
- package/bin/skills/simpo/references/hyperparameters.md +452 -0
- package/bin/skills/simpo/references/loss-functions.md +350 -0
- package/bin/skills/skypilot/SKILL.md +509 -0
- package/bin/skills/skypilot/references/advanced-usage.md +491 -0
- package/bin/skills/skypilot/references/troubleshooting.md +570 -0
- package/bin/skills/slime/SKILL.md +464 -0
- package/bin/skills/slime/references/api-reference.md +392 -0
- package/bin/skills/slime/references/troubleshooting.md +386 -0
- package/bin/skills/speculative-decoding/SKILL.md +467 -0
- package/bin/skills/speculative-decoding/references/lookahead.md +309 -0
- package/bin/skills/speculative-decoding/references/medusa.md +350 -0
- package/bin/skills/stable-diffusion/SKILL.md +519 -0
- package/bin/skills/stable-diffusion/references/advanced-usage.md +716 -0
- package/bin/skills/stable-diffusion/references/troubleshooting.md +555 -0
- package/bin/skills/tensorboard/SKILL.md +629 -0
- package/bin/skills/tensorboard/references/integrations.md +638 -0
- package/bin/skills/tensorboard/references/profiling.md +545 -0
- package/bin/skills/tensorboard/references/visualization.md +620 -0
- package/bin/skills/tensorrt-llm/SKILL.md +187 -0
- package/bin/skills/tensorrt-llm/references/multi-gpu.md +298 -0
- package/bin/skills/tensorrt-llm/references/optimization.md +242 -0
- package/bin/skills/tensorrt-llm/references/serving.md +470 -0
- package/bin/skills/tinker/SKILL.md +362 -0
- package/bin/skills/tinker/references/api-reference.md +168 -0
- package/bin/skills/tinker/references/getting-started.md +157 -0
- package/bin/skills/tinker/references/loss-functions.md +163 -0
- package/bin/skills/tinker/references/models-and-lora.md +139 -0
- package/bin/skills/tinker/references/recipes.md +280 -0
- package/bin/skills/tinker/references/reinforcement-learning.md +212 -0
- package/bin/skills/tinker/references/rendering.md +243 -0
- package/bin/skills/tinker/references/supervised-learning.md +232 -0
- package/bin/skills/tinker-training-cost/SKILL.md +187 -0
- package/bin/skills/tinker-training-cost/scripts/calculate_cost.py +123 -0
- package/bin/skills/torchforge/SKILL.md +433 -0
- package/bin/skills/torchforge/references/api-reference.md +327 -0
- package/bin/skills/torchforge/references/troubleshooting.md +409 -0
- package/bin/skills/torchtitan/SKILL.md +358 -0
- package/bin/skills/torchtitan/references/checkpoint.md +181 -0
- package/bin/skills/torchtitan/references/custom-models.md +258 -0
- package/bin/skills/torchtitan/references/float8.md +133 -0
- package/bin/skills/torchtitan/references/fsdp.md +126 -0
- package/bin/skills/transformer-lens/SKILL.md +346 -0
- package/bin/skills/transformer-lens/references/README.md +54 -0
- package/bin/skills/transformer-lens/references/api.md +362 -0
- package/bin/skills/transformer-lens/references/tutorials.md +339 -0
- package/bin/skills/trl-fine-tuning/SKILL.md +455 -0
- package/bin/skills/trl-fine-tuning/references/dpo-variants.md +227 -0
- package/bin/skills/trl-fine-tuning/references/online-rl.md +82 -0
- package/bin/skills/trl-fine-tuning/references/reward-modeling.md +122 -0
- package/bin/skills/trl-fine-tuning/references/sft-training.md +168 -0
- package/bin/skills/unsloth/SKILL.md +80 -0
- package/bin/skills/unsloth/references/index.md +7 -0
- package/bin/skills/unsloth/references/llms-full.md +16799 -0
- package/bin/skills/unsloth/references/llms-txt.md +12044 -0
- package/bin/skills/unsloth/references/llms.md +82 -0
- package/bin/skills/verl/SKILL.md +391 -0
- package/bin/skills/verl/references/api-reference.md +301 -0
- package/bin/skills/verl/references/troubleshooting.md +391 -0
- package/bin/skills/vllm/SKILL.md +364 -0
- package/bin/skills/vllm/references/optimization.md +226 -0
- package/bin/skills/vllm/references/quantization.md +284 -0
- package/bin/skills/vllm/references/server-deployment.md +255 -0
- package/bin/skills/vllm/references/troubleshooting.md +447 -0
- package/bin/skills/weights-and-biases/SKILL.md +590 -0
- package/bin/skills/weights-and-biases/references/artifacts.md +584 -0
- package/bin/skills/weights-and-biases/references/integrations.md +700 -0
- package/bin/skills/weights-and-biases/references/sweeps.md +847 -0
- package/bin/skills/whisper/SKILL.md +317 -0
- package/bin/skills/whisper/references/languages.md +189 -0
- package/bin/synsc +0 -0
- package/package.json +10 -0
|
@@ -0,0 +1,547 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: hugging-face-datasets
|
|
3
|
+
description: Create and manage datasets on Hugging Face Hub. Supports initializing repos, defining configs/system prompts, streaming row updates, and SQL-based dataset querying/transformation. Designed to work alongside HF MCP server for comprehensive dataset workflows.
|
|
4
|
+
version: 1.0.0
|
|
5
|
+
author: Synthetic Sciences
|
|
6
|
+
license: MIT
|
|
7
|
+
tags: [Hugging Face, Datasets, Data Loading, Data Processing]
|
|
8
|
+
dependencies: [huggingface-hub, transformers]
|
|
9
|
+
---
|
|
10
|
+
|
|
11
|
+
# Overview
|
|
12
|
+
This skill provides tools to manage datasets on the Hugging Face Hub with a focus on creation, configuration, content management, and SQL-based data manipulation. It is designed to complement the existing Hugging Face MCP server by providing dataset editing and querying capabilities.
|
|
13
|
+
|
|
14
|
+
## Integration with HF MCP Server
|
|
15
|
+
- **Use HF MCP Server for**: Dataset discovery, search, and metadata retrieval
|
|
16
|
+
- **Use This Skill for**: Dataset creation, content editing, SQL queries, data transformation, and structured data formatting
|
|
17
|
+
|
|
18
|
+
# Version
|
|
19
|
+
2.1.0
|
|
20
|
+
|
|
21
|
+
# Dependencies
|
|
22
|
+
This skill uses PEP 723 scripts with inline dependency management.
|
|
23
|
+
Scripts auto-install their requirements when run with: `uv run scripts/script_name.py`
|
|
24
|
+
|
|
25
|
+
- uv (Python package manager)
|
|
26
|
+
- Getting Started: See "Usage Instructions" below for PEP 723 usage
|
|
27
|
+
|
|
28
|
+
# Core Capabilities
|
|
29
|
+
|
|
30
|
+
## 1. Dataset Lifecycle Management
|
|
31
|
+
- **Initialize**: Create new dataset repositories with proper structure
|
|
32
|
+
- **Configure**: Store detailed configuration including system prompts and metadata
|
|
33
|
+
- **Stream Updates**: Add rows efficiently without downloading entire datasets
|
|
34
|
+
|
|
35
|
+
## 2. SQL-Based Dataset Querying (NEW)
|
|
36
|
+
Query any Hugging Face dataset using DuckDB SQL via `scripts/sql_manager.py`:
|
|
37
|
+
- **Direct Queries**: Run SQL on datasets using the `hf://` protocol
|
|
38
|
+
- **Schema Discovery**: Describe dataset structure and column types
|
|
39
|
+
- **Data Sampling**: Get random samples for exploration
|
|
40
|
+
- **Aggregations**: Count, histogram, unique values analysis
|
|
41
|
+
- **Transformations**: Filter, join, reshape data with SQL
|
|
42
|
+
- **Export & Push**: Save results locally or push to new Hub repos
|
|
43
|
+
|
|
44
|
+
## 3. Multi-Format Dataset Support
|
|
45
|
+
Supports diverse dataset types through a template system:
|
|
46
|
+
- **Chat/Conversational**: Chat templating, multi-turn dialogues, tool usage examples
|
|
47
|
+
- **Text Classification**: Sentiment analysis, intent detection, topic classification
|
|
48
|
+
- **Question-Answering**: Reading comprehension, factual QA, knowledge bases
|
|
49
|
+
- **Text Completion**: Language modeling, code completion, creative writing
|
|
50
|
+
- **Tabular Data**: Structured data for regression/classification tasks
|
|
51
|
+
- **Custom Formats**: Flexible schema definition for specialized needs
|
|
52
|
+
|
|
53
|
+
## 4. Quality Assurance Features
|
|
54
|
+
- **JSON Validation**: Ensures data integrity during uploads
|
|
55
|
+
- **Batch Processing**: Efficient handling of large datasets
|
|
56
|
+
- **Error Recovery**: Graceful handling of upload failures and conflicts
|
|
57
|
+
|
|
58
|
+
# Usage Instructions
|
|
59
|
+
|
|
60
|
+
The skill includes two Python scripts that use PEP 723 inline dependency management:
|
|
61
|
+
|
|
62
|
+
> **All paths are relative to the directory containing this SKILL.md
|
|
63
|
+
file.**
|
|
64
|
+
> Scripts are run with: `uv run scripts/script_name.py [arguments]`
|
|
65
|
+
|
|
66
|
+
- `scripts/dataset_manager.py` - Dataset creation and management
|
|
67
|
+
- `scripts/sql_manager.py` - SQL-based dataset querying and transformation
|
|
68
|
+
|
|
69
|
+
### Prerequisites
|
|
70
|
+
- `uv` package manager installed
|
|
71
|
+
- `HF_TOKEN` environment variable must be set with a Write-access token
|
|
72
|
+
|
|
73
|
+
---
|
|
74
|
+
|
|
75
|
+
# SQL Dataset Querying (sql_manager.py)
|
|
76
|
+
|
|
77
|
+
Query, transform, and push Hugging Face datasets using DuckDB SQL. The `hf://` protocol provides direct access to any public dataset, and to private datasets when a token is supplied.
|
|
78
|
+
|
|
79
|
+
## Quick Start
|
|
80
|
+
|
|
81
|
+
```bash
|
|
82
|
+
# Query a dataset
|
|
83
|
+
uv run scripts/sql_manager.py query \
|
|
84
|
+
--dataset "cais/mmlu" \
|
|
85
|
+
--sql "SELECT * FROM data WHERE subject='nutrition' LIMIT 10"
|
|
86
|
+
|
|
87
|
+
# Get dataset schema
|
|
88
|
+
uv run scripts/sql_manager.py describe --dataset "cais/mmlu"
|
|
89
|
+
|
|
90
|
+
# Sample random rows
|
|
91
|
+
uv run scripts/sql_manager.py sample --dataset "cais/mmlu" --n 5
|
|
92
|
+
|
|
93
|
+
# Count rows with filter
|
|
94
|
+
uv run scripts/sql_manager.py count --dataset "cais/mmlu" --where "subject='nutrition'"
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
## SQL Query Syntax
|
|
98
|
+
|
|
99
|
+
Use `data` as the table name in your SQL - it gets replaced with the actual `hf://` path:
|
|
100
|
+
|
|
101
|
+
```sql
|
|
102
|
+
-- Basic select
|
|
103
|
+
SELECT * FROM data LIMIT 10
|
|
104
|
+
|
|
105
|
+
-- Filtering
|
|
106
|
+
SELECT * FROM data WHERE subject='nutrition'
|
|
107
|
+
|
|
108
|
+
-- Aggregations
|
|
109
|
+
SELECT subject, COUNT(*) as cnt FROM data GROUP BY subject ORDER BY cnt DESC
|
|
110
|
+
|
|
111
|
+
-- Column selection and transformation
|
|
112
|
+
SELECT question, choices[answer + 1] AS correct_answer FROM data -- DuckDB lists are 1-indexed; `answer` is a 0-based index
|
|
113
|
+
|
|
114
|
+
-- Regex matching
|
|
115
|
+
SELECT * FROM data WHERE regexp_matches(question, 'nutrition|diet')
|
|
116
|
+
|
|
117
|
+
-- String functions
|
|
118
|
+
SELECT regexp_replace(question, '\n', '', 'g') AS cleaned FROM data
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
## Common Operations
|
|
122
|
+
|
|
123
|
+
### 1. Explore Dataset Structure
|
|
124
|
+
```bash
|
|
125
|
+
# Get schema
|
|
126
|
+
uv run scripts/sql_manager.py describe --dataset "cais/mmlu"
|
|
127
|
+
|
|
128
|
+
# Get unique values in column
|
|
129
|
+
uv run scripts/sql_manager.py unique --dataset "cais/mmlu" --column "subject"
|
|
130
|
+
|
|
131
|
+
# Get value distribution
|
|
132
|
+
uv run scripts/sql_manager.py histogram --dataset "cais/mmlu" --column "subject" --bins 20
|
|
133
|
+
```
|
|
134
|
+
|
|
135
|
+
### 2. Filter and Transform
|
|
136
|
+
```bash
|
|
137
|
+
# Complex filtering with SQL
|
|
138
|
+
uv run scripts/sql_manager.py query \
|
|
139
|
+
--dataset "cais/mmlu" \
|
|
140
|
+
--sql "SELECT subject, COUNT(*) as cnt FROM data GROUP BY subject HAVING cnt > 100"
|
|
141
|
+
|
|
142
|
+
# Using transform command
|
|
143
|
+
uv run scripts/sql_manager.py transform \
|
|
144
|
+
--dataset "cais/mmlu" \
|
|
145
|
+
--select "subject, COUNT(*) as cnt" \
|
|
146
|
+
--group-by "subject" \
|
|
147
|
+
--order-by "cnt DESC" \
|
|
148
|
+
--limit 10
|
|
149
|
+
```
|
|
150
|
+
|
|
151
|
+
### 3. Create Subsets and Push to Hub
|
|
152
|
+
```bash
|
|
153
|
+
# Query and push to new dataset
|
|
154
|
+
uv run scripts/sql_manager.py query \
|
|
155
|
+
--dataset "cais/mmlu" \
|
|
156
|
+
--sql "SELECT * FROM data WHERE subject='nutrition'" \
|
|
157
|
+
--push-to "username/mmlu-nutrition-subset" \
|
|
158
|
+
--private
|
|
159
|
+
|
|
160
|
+
# Transform and push
|
|
161
|
+
uv run scripts/sql_manager.py transform \
|
|
162
|
+
--dataset "ibm/duorc" \
|
|
163
|
+
--config "ParaphraseRC" \
|
|
164
|
+
--select "question, answers" \
|
|
165
|
+
--where "LENGTH(question) > 50" \
|
|
166
|
+
--push-to "username/duorc-long-questions"
|
|
167
|
+
```
|
|
168
|
+
|
|
169
|
+
### 4. Export to Local Files
|
|
170
|
+
```bash
|
|
171
|
+
# Export to Parquet
|
|
172
|
+
uv run scripts/sql_manager.py export \
|
|
173
|
+
--dataset "cais/mmlu" \
|
|
174
|
+
--sql "SELECT * FROM data WHERE subject='nutrition'" \
|
|
175
|
+
--output "nutrition.parquet" \
|
|
176
|
+
--format parquet
|
|
177
|
+
|
|
178
|
+
# Export to JSONL
|
|
179
|
+
uv run scripts/sql_manager.py export \
|
|
180
|
+
--dataset "cais/mmlu" \
|
|
181
|
+
--sql "SELECT * FROM data LIMIT 100" \
|
|
182
|
+
--output "sample.jsonl" \
|
|
183
|
+
--format jsonl
|
|
184
|
+
```
|
|
185
|
+
|
|
186
|
+
### 5. Working with Dataset Configs/Splits
|
|
187
|
+
```bash
|
|
188
|
+
# Specify config (subset)
|
|
189
|
+
uv run scripts/sql_manager.py query \
|
|
190
|
+
--dataset "ibm/duorc" \
|
|
191
|
+
--config "ParaphraseRC" \
|
|
192
|
+
--sql "SELECT * FROM data LIMIT 5"
|
|
193
|
+
|
|
194
|
+
# Specify split
|
|
195
|
+
uv run scripts/sql_manager.py query \
|
|
196
|
+
--dataset "cais/mmlu" \
|
|
197
|
+
--split "test" \
|
|
198
|
+
--sql "SELECT COUNT(*) FROM data"
|
|
199
|
+
|
|
200
|
+
# Query all splits
|
|
201
|
+
uv run scripts/sql_manager.py query \
|
|
202
|
+
--dataset "cais/mmlu" \
|
|
203
|
+
--split "*" \
|
|
204
|
+
--sql "SELECT * FROM data LIMIT 10"
|
|
205
|
+
```
|
|
206
|
+
|
|
207
|
+
### 6. Raw SQL with Full Paths
|
|
208
|
+
For complex queries or joining datasets:
|
|
209
|
+
```bash
|
|
210
|
+
uv run scripts/sql_manager.py raw --sql "
|
|
211
|
+
SELECT a.*, b.*
|
|
212
|
+
FROM 'hf://datasets/dataset1@~parquet/default/train/*.parquet' a
|
|
213
|
+
JOIN 'hf://datasets/dataset2@~parquet/default/train/*.parquet' b
|
|
214
|
+
ON a.id = b.id
|
|
215
|
+
LIMIT 100
|
|
216
|
+
"
|
|
217
|
+
```
|
|
218
|
+
|
|
219
|
+
## Python API Usage
|
|
220
|
+
|
|
221
|
+
```python
|
|
222
|
+
from sql_manager import HFDatasetSQL
|
|
223
|
+
|
|
224
|
+
sql = HFDatasetSQL()
|
|
225
|
+
|
|
226
|
+
# Query
|
|
227
|
+
results = sql.query("cais/mmlu", "SELECT * FROM data WHERE subject='nutrition' LIMIT 10")
|
|
228
|
+
|
|
229
|
+
# Get schema
|
|
230
|
+
schema = sql.describe("cais/mmlu")
|
|
231
|
+
|
|
232
|
+
# Sample
|
|
233
|
+
samples = sql.sample("cais/mmlu", n=5, seed=42)
|
|
234
|
+
|
|
235
|
+
# Count
|
|
236
|
+
count = sql.count("cais/mmlu", where="subject='nutrition'")
|
|
237
|
+
|
|
238
|
+
# Histogram
|
|
239
|
+
dist = sql.histogram("cais/mmlu", "subject")
|
|
240
|
+
|
|
241
|
+
# Filter and transform
|
|
242
|
+
results = sql.filter_and_transform(
|
|
243
|
+
"cais/mmlu",
|
|
244
|
+
select="subject, COUNT(*) as cnt",
|
|
245
|
+
group_by="subject",
|
|
246
|
+
order_by="cnt DESC",
|
|
247
|
+
limit=10
|
|
248
|
+
)
|
|
249
|
+
|
|
250
|
+
# Push to Hub
|
|
251
|
+
url = sql.push_to_hub(
|
|
252
|
+
"cais/mmlu",
|
|
253
|
+
"username/nutrition-subset",
|
|
254
|
+
sql="SELECT * FROM data WHERE subject='nutrition'",
|
|
255
|
+
private=True
|
|
256
|
+
)
|
|
257
|
+
|
|
258
|
+
# Export locally
|
|
259
|
+
sql.export_to_parquet("cais/mmlu", "output.parquet", sql="SELECT * FROM data LIMIT 100")
|
|
260
|
+
|
|
261
|
+
sql.close()
|
|
262
|
+
```
|
|
263
|
+
|
|
264
|
+
## HF Path Format
|
|
265
|
+
|
|
266
|
+
DuckDB uses the `hf://` protocol to access datasets:
|
|
267
|
+
```
|
|
268
|
+
hf://datasets/{dataset_id}@{revision}/{config}/{split}/*.parquet
|
|
269
|
+
```
|
|
270
|
+
|
|
271
|
+
Examples:
|
|
272
|
+
- `hf://datasets/cais/mmlu@~parquet/default/train/*.parquet`
|
|
273
|
+
- `hf://datasets/ibm/duorc@~parquet/ParaphraseRC/test/*.parquet`
|
|
274
|
+
|
|
275
|
+
The `@~parquet` revision provides auto-converted Parquet files for any dataset format.
|
|
276
|
+
|
|
277
|
+
## Useful DuckDB SQL Functions
|
|
278
|
+
|
|
279
|
+
```sql
|
|
280
|
+
-- String functions
|
|
281
|
+
LENGTH(column) -- String length
|
|
282
|
+
regexp_replace(col, '\n', '') -- Regex replace
|
|
283
|
+
regexp_matches(col, 'pattern') -- Regex match
|
|
284
|
+
LOWER(col), UPPER(col) -- Case conversion
|
|
285
|
+
|
|
286
|
+
-- Array functions
|
|
287
|
+
choices[1] -- Array indexing (1-based: first element)
|
|
288
|
+
array_length(choices) -- Array length
|
|
289
|
+
unnest(choices) -- Expand array to rows
|
|
290
|
+
|
|
291
|
+
-- Aggregations
|
|
292
|
+
COUNT(*), SUM(col), AVG(col)
|
|
293
|
+
GROUP BY col HAVING condition
|
|
294
|
+
|
|
295
|
+
-- Sampling
|
|
296
|
+
USING SAMPLE 10 -- Random sample
|
|
297
|
+
USING SAMPLE 10 (RESERVOIR, 42) -- Reproducible sample
|
|
298
|
+
|
|
299
|
+
-- Window functions
|
|
300
|
+
ROW_NUMBER() OVER (PARTITION BY col ORDER BY col2)
|
|
301
|
+
```
|
|
302
|
+
|
|
303
|
+
---
|
|
304
|
+
|
|
305
|
+
# Dataset Creation (dataset_manager.py)
|
|
306
|
+
|
|
307
|
+
### Recommended Workflow
|
|
308
|
+
|
|
309
|
+
**1. Discovery (Use HF MCP Server):**
|
|
310
|
+
```python
|
|
311
|
+
# Use HF MCP tools to find existing datasets
|
|
312
|
+
search_datasets("conversational AI training")
|
|
313
|
+
get_dataset_details("username/dataset-name")
|
|
314
|
+
```
|
|
315
|
+
|
|
316
|
+
**2. Creation (Use This Skill):**
|
|
317
|
+
```bash
|
|
318
|
+
# Initialize new dataset
|
|
319
|
+
uv run scripts/dataset_manager.py init --repo_id "your-username/dataset-name" [--private]
|
|
320
|
+
|
|
321
|
+
# Configure with detailed system prompt
|
|
322
|
+
uv run scripts/dataset_manager.py config --repo_id "your-username/dataset-name" --system_prompt "$(cat system_prompt.txt)"
|
|
323
|
+
```
|
|
324
|
+
|
|
325
|
+
**3. Content Management (Use This Skill):**
|
|
326
|
+
```bash
|
|
327
|
+
# Quick setup with any template
|
|
328
|
+
uv run scripts/dataset_manager.py quick_setup \
|
|
329
|
+
--repo_id "your-username/dataset-name" \
|
|
330
|
+
--template classification
|
|
331
|
+
|
|
332
|
+
# Add data with template validation
|
|
333
|
+
uv run scripts/dataset_manager.py add_rows \
|
|
334
|
+
--repo_id "your-username/dataset-name" \
|
|
335
|
+
--template qa \
|
|
336
|
+
--rows_json "$(cat your_qa_data.json)"
|
|
337
|
+
```
|
|
338
|
+
|
|
339
|
+
### Template-Based Data Structures
|
|
340
|
+
|
|
341
|
+
**1. Chat Template (`--template chat`)**
|
|
342
|
+
```json
|
|
343
|
+
{
|
|
344
|
+
"messages": [
|
|
345
|
+
{"role": "user", "content": "Natural user request"},
|
|
346
|
+
{"role": "assistant", "content": "Response with tool usage"},
|
|
347
|
+
{"role": "tool", "content": "Tool response", "tool_call_id": "call_123"}
|
|
348
|
+
],
|
|
349
|
+
"scenario": "Description of use case",
|
|
350
|
+
"complexity": "simple|intermediate|advanced"
|
|
351
|
+
}
|
|
352
|
+
```
|
|
353
|
+
|
|
354
|
+
**2. Classification Template (`--template classification`)**
|
|
355
|
+
```json
|
|
356
|
+
{
|
|
357
|
+
"text": "Input text to be classified",
|
|
358
|
+
"label": "classification_label",
|
|
359
|
+
"confidence": 0.95,
|
|
360
|
+
"metadata": {"domain": "technology", "language": "en"}
|
|
361
|
+
}
|
|
362
|
+
```
|
|
363
|
+
|
|
364
|
+
**3. QA Template (`--template qa`)**
|
|
365
|
+
```json
|
|
366
|
+
{
|
|
367
|
+
"question": "What is the question being asked?",
|
|
368
|
+
"answer": "The complete answer",
|
|
369
|
+
"context": "Additional context if needed",
|
|
370
|
+
"answer_type": "factual|explanatory|opinion",
|
|
371
|
+
"difficulty": "easy|medium|hard"
|
|
372
|
+
}
|
|
373
|
+
```
|
|
374
|
+
|
|
375
|
+
**4. Completion Template (`--template completion`)**
|
|
376
|
+
```json
|
|
377
|
+
{
|
|
378
|
+
"prompt": "The beginning text or context",
|
|
379
|
+
"completion": "The expected continuation",
|
|
380
|
+
"domain": "code|creative|technical|conversational",
|
|
381
|
+
"style": "description of writing style"
|
|
382
|
+
}
|
|
383
|
+
```
|
|
384
|
+
|
|
385
|
+
**5. Tabular Template (`--template tabular`)**
|
|
386
|
+
```json
|
|
387
|
+
{
|
|
388
|
+
"columns": [
|
|
389
|
+
{"name": "feature1", "type": "numeric", "description": "First feature"},
|
|
390
|
+
{"name": "target", "type": "categorical", "description": "Target variable"}
|
|
391
|
+
],
|
|
392
|
+
"data": [
|
|
393
|
+
{"feature1": 123, "target": "class_a"},
|
|
394
|
+
{"feature1": 456, "target": "class_b"}
|
|
395
|
+
]
|
|
396
|
+
}
|
|
397
|
+
```
|
|
398
|
+
|
|
399
|
+
### Advanced System Prompt Template
|
|
400
|
+
|
|
401
|
+
For high-quality training data generation:
|
|
402
|
+
```text
|
|
403
|
+
You are an AI assistant expert at using MCP tools effectively.
|
|
404
|
+
|
|
405
|
+
## MCP SERVER DEFINITIONS
|
|
406
|
+
[Define available servers and tools]
|
|
407
|
+
|
|
408
|
+
## TRAINING EXAMPLE STRUCTURE
|
|
409
|
+
[Specify exact JSON schema for chat templating]
|
|
410
|
+
|
|
411
|
+
## QUALITY GUIDELINES
|
|
412
|
+
[Detail requirements for realistic scenarios, progressive complexity, proper tool usage]
|
|
413
|
+
|
|
414
|
+
## EXAMPLE CATEGORIES
|
|
415
|
+
[List development workflows, debugging scenarios, data management tasks]
|
|
416
|
+
```
|
|
417
|
+
|
|
418
|
+
### Example Categories & Templates
|
|
419
|
+
|
|
420
|
+
The skill includes diverse training examples beyond just MCP usage:
|
|
421
|
+
|
|
422
|
+
**Available Example Sets:**
|
|
423
|
+
- `training_examples.json` - MCP tool usage examples (debugging, project setup, database analysis)
|
|
424
|
+
- `diverse_training_examples.json` - Broader scenarios including:
|
|
425
|
+
- **Educational Chat** - Explaining programming concepts, tutorials
|
|
426
|
+
- **Git Workflows** - Feature branches, version control guidance
|
|
427
|
+
- **Code Analysis** - Performance optimization, architecture review
|
|
428
|
+
- **Content Generation** - Professional writing, creative brainstorming
|
|
429
|
+
- **Codebase Navigation** - Legacy code exploration, systematic analysis
|
|
430
|
+
- **Conversational Support** - Problem-solving, technical discussions
|
|
431
|
+
|
|
432
|
+
**Using Different Example Sets:**
|
|
433
|
+
```bash
|
|
434
|
+
# Add MCP-focused examples
|
|
435
|
+
uv run scripts/dataset_manager.py add_rows --repo_id "your-username/dataset-name" \
|
|
436
|
+
--rows_json "$(cat examples/training_examples.json)"
|
|
437
|
+
|
|
438
|
+
# Add diverse conversational examples
|
|
439
|
+
uv run scripts/dataset_manager.py add_rows --repo_id "your-username/dataset-name" \
|
|
440
|
+
--rows_json "$(cat examples/diverse_training_examples.json)"
|
|
441
|
+
|
|
442
|
+
# Mix both for comprehensive training data
|
|
443
|
+
uv run scripts/dataset_manager.py add_rows --repo_id "your-username/dataset-name" \
|
|
444
|
+
--rows_json "$(jq -s '.[0] + .[1]' examples/training_examples.json examples/diverse_training_examples.json)"
|
|
445
|
+
```
|
|
446
|
+
|
|
447
|
+
### Commands Reference
|
|
448
|
+
|
|
449
|
+
**List Available Templates:**
|
|
450
|
+
```bash
|
|
451
|
+
uv run scripts/dataset_manager.py list_templates
|
|
452
|
+
```
|
|
453
|
+
|
|
454
|
+
**Quick Setup (Recommended):**
|
|
455
|
+
```bash
|
|
456
|
+
uv run scripts/dataset_manager.py quick_setup --repo_id "your-username/dataset-name" --template classification
|
|
457
|
+
```
|
|
458
|
+
|
|
459
|
+
**Manual Setup:**
|
|
460
|
+
```bash
|
|
461
|
+
# Initialize repository
|
|
462
|
+
uv run scripts/dataset_manager.py init --repo_id "your-username/dataset-name" [--private]
|
|
463
|
+
|
|
464
|
+
# Configure with system prompt
|
|
465
|
+
uv run scripts/dataset_manager.py config --repo_id "your-username/dataset-name" --system_prompt "Your prompt here"
|
|
466
|
+
|
|
467
|
+
# Add data with validation
|
|
468
|
+
uv run scripts/dataset_manager.py add_rows \
|
|
469
|
+
--repo_id "your-username/dataset-name" \
|
|
470
|
+
--template qa \
|
|
471
|
+
--rows_json '[{"question": "What is AI?", "answer": "Artificial Intelligence..."}]'
|
|
472
|
+
```
|
|
473
|
+
|
|
474
|
+
**View Dataset Statistics:**
|
|
475
|
+
```bash
|
|
476
|
+
uv run scripts/dataset_manager.py stats --repo_id "your-username/dataset-name"
|
|
477
|
+
```
|
|
478
|
+
|
|
479
|
+
### Error Handling
|
|
480
|
+
- **Repository exists**: Script will notify and continue with configuration
|
|
481
|
+
- **Invalid JSON**: Clear error message with parsing details
|
|
482
|
+
- **Network issues**: Automatic retry for transient failures
|
|
483
|
+
- **Token permissions**: Validation before operations begin
|
|
484
|
+
|
|
485
|
+
---
|
|
486
|
+
|
|
487
|
+
# Combined Workflow Examples
|
|
488
|
+
|
|
489
|
+
## Example 1: Create Training Subset from Existing Dataset
|
|
490
|
+
```bash
|
|
491
|
+
# 1. Explore the source dataset
|
|
492
|
+
uv run scripts/sql_manager.py describe --dataset "cais/mmlu"
|
|
493
|
+
uv run scripts/sql_manager.py histogram --dataset "cais/mmlu" --column "subject"
|
|
494
|
+
|
|
495
|
+
# 2. Query and create subset
|
|
496
|
+
uv run scripts/sql_manager.py query \
|
|
497
|
+
--dataset "cais/mmlu" \
|
|
498
|
+
--sql "SELECT * FROM data WHERE subject IN ('nutrition', 'anatomy', 'clinical_knowledge')" \
|
|
499
|
+
--push-to "username/mmlu-medical-subset" \
|
|
500
|
+
--private
|
|
501
|
+
```
|
|
502
|
+
|
|
503
|
+
## Example 2: Transform and Reshape Data
|
|
504
|
+
```bash
|
|
505
|
+
# Transform MMLU to QA format with correct answers extracted
|
|
506
|
+
uv run scripts/sql_manager.py query \
|
|
507
|
+
--dataset "cais/mmlu" \
|
|
508
|
+
--sql "SELECT question, choices[answer + 1] as correct_answer, subject FROM data" \
|
|
509
|
+
--push-to "username/mmlu-qa-format"
|
|
510
|
+
```
|
|
511
|
+
|
|
512
|
+
## Example 3: Merge Multiple Dataset Splits
|
|
513
|
+
```bash
|
|
514
|
+
# Export multiple splits and combine
|
|
515
|
+
uv run scripts/sql_manager.py export \
|
|
516
|
+
--dataset "cais/mmlu" \
|
|
517
|
+
--split "*" \
|
|
518
|
+
--output "mmlu_all.parquet"
|
|
519
|
+
```
|
|
520
|
+
|
|
521
|
+
## Example 4: Quality Filtering
|
|
522
|
+
```bash
|
|
523
|
+
# Filter for high-quality examples
|
|
524
|
+
uv run scripts/sql_manager.py query \
|
|
525
|
+
--dataset "squad" \
|
|
526
|
+
--sql "SELECT * FROM data WHERE LENGTH(context) > 500 AND LENGTH(question) > 20" \
|
|
527
|
+
--push-to "username/squad-filtered"
|
|
528
|
+
```
|
|
529
|
+
|
|
530
|
+
## Example 5: Create Custom Training Dataset
|
|
531
|
+
```bash
|
|
532
|
+
# 1. Query source data
|
|
533
|
+
uv run scripts/sql_manager.py export \
|
|
534
|
+
--dataset "cais/mmlu" \
|
|
535
|
+
--sql "SELECT question, subject FROM data WHERE subject='nutrition'" \
|
|
536
|
+
--output "nutrition_source.jsonl" \
|
|
537
|
+
--format jsonl
|
|
538
|
+
|
|
539
|
+
# 2. Process with your pipeline (add answers, format, etc.)
|
|
540
|
+
|
|
541
|
+
# 3. Push processed data
|
|
542
|
+
uv run scripts/dataset_manager.py init --repo_id "username/nutrition-training"
|
|
543
|
+
uv run scripts/dataset_manager.py add_rows \
|
|
544
|
+
--repo_id "username/nutrition-training" \
|
|
545
|
+
--template qa \
|
|
546
|
+
--rows_json "$(cat processed_data.json)"
|
|
547
|
+
```
|