@synsci/cli-darwin-x64 1.1.49
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/skills/accelerate/SKILL.md +332 -0
- package/bin/skills/accelerate/references/custom-plugins.md +453 -0
- package/bin/skills/accelerate/references/megatron-integration.md +489 -0
- package/bin/skills/accelerate/references/performance.md +525 -0
- package/bin/skills/audiocraft/SKILL.md +564 -0
- package/bin/skills/audiocraft/references/advanced-usage.md +666 -0
- package/bin/skills/audiocraft/references/troubleshooting.md +504 -0
- package/bin/skills/autogpt/SKILL.md +403 -0
- package/bin/skills/autogpt/references/advanced-usage.md +535 -0
- package/bin/skills/autogpt/references/troubleshooting.md +420 -0
- package/bin/skills/awq/SKILL.md +310 -0
- package/bin/skills/awq/references/advanced-usage.md +324 -0
- package/bin/skills/awq/references/troubleshooting.md +344 -0
- package/bin/skills/axolotl/SKILL.md +158 -0
- package/bin/skills/axolotl/references/api.md +5548 -0
- package/bin/skills/axolotl/references/dataset-formats.md +1029 -0
- package/bin/skills/axolotl/references/index.md +15 -0
- package/bin/skills/axolotl/references/other.md +3563 -0
- package/bin/skills/bigcode-evaluation-harness/SKILL.md +405 -0
- package/bin/skills/bigcode-evaluation-harness/references/benchmarks.md +393 -0
- package/bin/skills/bigcode-evaluation-harness/references/custom-tasks.md +424 -0
- package/bin/skills/bigcode-evaluation-harness/references/issues.md +394 -0
- package/bin/skills/bitsandbytes/SKILL.md +411 -0
- package/bin/skills/bitsandbytes/references/memory-optimization.md +521 -0
- package/bin/skills/bitsandbytes/references/qlora-training.md +521 -0
- package/bin/skills/bitsandbytes/references/quantization-formats.md +447 -0
- package/bin/skills/blip-2/SKILL.md +564 -0
- package/bin/skills/blip-2/references/advanced-usage.md +680 -0
- package/bin/skills/blip-2/references/troubleshooting.md +526 -0
- package/bin/skills/chroma/SKILL.md +406 -0
- package/bin/skills/chroma/references/integration.md +38 -0
- package/bin/skills/clip/SKILL.md +253 -0
- package/bin/skills/clip/references/applications.md +207 -0
- package/bin/skills/constitutional-ai/SKILL.md +290 -0
- package/bin/skills/crewai/SKILL.md +498 -0
- package/bin/skills/crewai/references/flows.md +438 -0
- package/bin/skills/crewai/references/tools.md +429 -0
- package/bin/skills/crewai/references/troubleshooting.md +480 -0
- package/bin/skills/deepspeed/SKILL.md +141 -0
- package/bin/skills/deepspeed/references/08.md +17 -0
- package/bin/skills/deepspeed/references/09.md +173 -0
- package/bin/skills/deepspeed/references/2020.md +378 -0
- package/bin/skills/deepspeed/references/2023.md +279 -0
- package/bin/skills/deepspeed/references/assets.md +179 -0
- package/bin/skills/deepspeed/references/index.md +35 -0
- package/bin/skills/deepspeed/references/mii.md +118 -0
- package/bin/skills/deepspeed/references/other.md +1191 -0
- package/bin/skills/deepspeed/references/tutorials.md +6554 -0
- package/bin/skills/dspy/SKILL.md +590 -0
- package/bin/skills/dspy/references/examples.md +663 -0
- package/bin/skills/dspy/references/modules.md +475 -0
- package/bin/skills/dspy/references/optimizers.md +566 -0
- package/bin/skills/faiss/SKILL.md +221 -0
- package/bin/skills/faiss/references/index_types.md +280 -0
- package/bin/skills/flash-attention/SKILL.md +367 -0
- package/bin/skills/flash-attention/references/benchmarks.md +215 -0
- package/bin/skills/flash-attention/references/transformers-integration.md +293 -0
- package/bin/skills/gguf/SKILL.md +427 -0
- package/bin/skills/gguf/references/advanced-usage.md +504 -0
- package/bin/skills/gguf/references/troubleshooting.md +442 -0
- package/bin/skills/gptq/SKILL.md +450 -0
- package/bin/skills/gptq/references/calibration.md +337 -0
- package/bin/skills/gptq/references/integration.md +129 -0
- package/bin/skills/gptq/references/troubleshooting.md +95 -0
- package/bin/skills/grpo-rl-training/README.md +97 -0
- package/bin/skills/grpo-rl-training/SKILL.md +572 -0
- package/bin/skills/grpo-rl-training/examples/reward_functions_library.py +393 -0
- package/bin/skills/grpo-rl-training/templates/basic_grpo_training.py +228 -0
- package/bin/skills/guidance/SKILL.md +572 -0
- package/bin/skills/guidance/references/backends.md +554 -0
- package/bin/skills/guidance/references/constraints.md +674 -0
- package/bin/skills/guidance/references/examples.md +767 -0
- package/bin/skills/hqq/SKILL.md +445 -0
- package/bin/skills/hqq/references/advanced-usage.md +528 -0
- package/bin/skills/hqq/references/troubleshooting.md +503 -0
- package/bin/skills/hugging-face-cli/SKILL.md +191 -0
- package/bin/skills/hugging-face-cli/references/commands.md +954 -0
- package/bin/skills/hugging-face-cli/references/examples.md +374 -0
- package/bin/skills/hugging-face-datasets/SKILL.md +547 -0
- package/bin/skills/hugging-face-datasets/examples/diverse_training_examples.json +239 -0
- package/bin/skills/hugging-face-datasets/examples/system_prompt_template.txt +196 -0
- package/bin/skills/hugging-face-datasets/examples/training_examples.json +176 -0
- package/bin/skills/hugging-face-datasets/scripts/dataset_manager.py +522 -0
- package/bin/skills/hugging-face-datasets/scripts/sql_manager.py +844 -0
- package/bin/skills/hugging-face-datasets/templates/chat.json +55 -0
- package/bin/skills/hugging-face-datasets/templates/classification.json +62 -0
- package/bin/skills/hugging-face-datasets/templates/completion.json +51 -0
- package/bin/skills/hugging-face-datasets/templates/custom.json +75 -0
- package/bin/skills/hugging-face-datasets/templates/qa.json +54 -0
- package/bin/skills/hugging-face-datasets/templates/tabular.json +81 -0
- package/bin/skills/hugging-face-evaluation/SKILL.md +656 -0
- package/bin/skills/hugging-face-evaluation/examples/USAGE_EXAMPLES.md +382 -0
- package/bin/skills/hugging-face-evaluation/examples/artificial_analysis_to_hub.py +141 -0
- package/bin/skills/hugging-face-evaluation/examples/example_readme_tables.md +135 -0
- package/bin/skills/hugging-face-evaluation/examples/metric_mapping.json +50 -0
- package/bin/skills/hugging-face-evaluation/requirements.txt +20 -0
- package/bin/skills/hugging-face-evaluation/scripts/evaluation_manager.py +1374 -0
- package/bin/skills/hugging-face-evaluation/scripts/inspect_eval_uv.py +104 -0
- package/bin/skills/hugging-face-evaluation/scripts/inspect_vllm_uv.py +317 -0
- package/bin/skills/hugging-face-evaluation/scripts/lighteval_vllm_uv.py +303 -0
- package/bin/skills/hugging-face-evaluation/scripts/run_eval_job.py +98 -0
- package/bin/skills/hugging-face-evaluation/scripts/run_vllm_eval_job.py +331 -0
- package/bin/skills/hugging-face-evaluation/scripts/test_extraction.py +206 -0
- package/bin/skills/hugging-face-jobs/SKILL.md +1041 -0
- package/bin/skills/hugging-face-jobs/index.html +216 -0
- package/bin/skills/hugging-face-jobs/references/hardware_guide.md +336 -0
- package/bin/skills/hugging-face-jobs/references/hub_saving.md +352 -0
- package/bin/skills/hugging-face-jobs/references/token_usage.md +546 -0
- package/bin/skills/hugging-face-jobs/references/troubleshooting.md +475 -0
- package/bin/skills/hugging-face-jobs/scripts/cot-self-instruct.py +718 -0
- package/bin/skills/hugging-face-jobs/scripts/finepdfs-stats.py +546 -0
- package/bin/skills/hugging-face-jobs/scripts/generate-responses.py +587 -0
- package/bin/skills/hugging-face-model-trainer/SKILL.md +711 -0
- package/bin/skills/hugging-face-model-trainer/references/gguf_conversion.md +296 -0
- package/bin/skills/hugging-face-model-trainer/references/hardware_guide.md +283 -0
- package/bin/skills/hugging-face-model-trainer/references/hub_saving.md +364 -0
- package/bin/skills/hugging-face-model-trainer/references/reliability_principles.md +371 -0
- package/bin/skills/hugging-face-model-trainer/references/trackio_guide.md +189 -0
- package/bin/skills/hugging-face-model-trainer/references/training_methods.md +150 -0
- package/bin/skills/hugging-face-model-trainer/references/training_patterns.md +203 -0
- package/bin/skills/hugging-face-model-trainer/references/troubleshooting.md +282 -0
- package/bin/skills/hugging-face-model-trainer/scripts/convert_to_gguf.py +424 -0
- package/bin/skills/hugging-face-model-trainer/scripts/dataset_inspector.py +417 -0
- package/bin/skills/hugging-face-model-trainer/scripts/estimate_cost.py +150 -0
- package/bin/skills/hugging-face-model-trainer/scripts/train_dpo_example.py +106 -0
- package/bin/skills/hugging-face-model-trainer/scripts/train_grpo_example.py +89 -0
- package/bin/skills/hugging-face-model-trainer/scripts/train_sft_example.py +122 -0
- package/bin/skills/hugging-face-paper-publisher/SKILL.md +627 -0
- package/bin/skills/hugging-face-paper-publisher/examples/example_usage.md +327 -0
- package/bin/skills/hugging-face-paper-publisher/references/quick_reference.md +216 -0
- package/bin/skills/hugging-face-paper-publisher/scripts/paper_manager.py +508 -0
- package/bin/skills/hugging-face-paper-publisher/templates/arxiv.md +299 -0
- package/bin/skills/hugging-face-paper-publisher/templates/ml-report.md +358 -0
- package/bin/skills/hugging-face-paper-publisher/templates/modern.md +319 -0
- package/bin/skills/hugging-face-paper-publisher/templates/standard.md +201 -0
- package/bin/skills/hugging-face-tool-builder/SKILL.md +115 -0
- package/bin/skills/hugging-face-tool-builder/references/baseline_hf_api.py +57 -0
- package/bin/skills/hugging-face-tool-builder/references/baseline_hf_api.sh +40 -0
- package/bin/skills/hugging-face-tool-builder/references/baseline_hf_api.tsx +57 -0
- package/bin/skills/hugging-face-tool-builder/references/find_models_by_paper.sh +230 -0
- package/bin/skills/hugging-face-tool-builder/references/hf_enrich_models.sh +96 -0
- package/bin/skills/hugging-face-tool-builder/references/hf_model_card_frontmatter.sh +188 -0
- package/bin/skills/hugging-face-tool-builder/references/hf_model_papers_auth.sh +171 -0
- package/bin/skills/hugging-face-trackio/SKILL.md +65 -0
- package/bin/skills/hugging-face-trackio/references/logging_metrics.md +206 -0
- package/bin/skills/hugging-face-trackio/references/retrieving_metrics.md +223 -0
- package/bin/skills/huggingface-tokenizers/SKILL.md +516 -0
- package/bin/skills/huggingface-tokenizers/references/algorithms.md +653 -0
- package/bin/skills/huggingface-tokenizers/references/integration.md +637 -0
- package/bin/skills/huggingface-tokenizers/references/pipeline.md +723 -0
- package/bin/skills/huggingface-tokenizers/references/training.md +565 -0
- package/bin/skills/instructor/SKILL.md +740 -0
- package/bin/skills/instructor/references/examples.md +107 -0
- package/bin/skills/instructor/references/providers.md +70 -0
- package/bin/skills/instructor/references/validation.md +606 -0
- package/bin/skills/knowledge-distillation/SKILL.md +458 -0
- package/bin/skills/knowledge-distillation/references/minillm.md +334 -0
- package/bin/skills/lambda-labs/SKILL.md +545 -0
- package/bin/skills/lambda-labs/references/advanced-usage.md +611 -0
- package/bin/skills/lambda-labs/references/troubleshooting.md +530 -0
- package/bin/skills/langchain/SKILL.md +480 -0
- package/bin/skills/langchain/references/agents.md +499 -0
- package/bin/skills/langchain/references/integration.md +562 -0
- package/bin/skills/langchain/references/rag.md +600 -0
- package/bin/skills/langsmith/SKILL.md +422 -0
- package/bin/skills/langsmith/references/advanced-usage.md +548 -0
- package/bin/skills/langsmith/references/troubleshooting.md +537 -0
- package/bin/skills/litgpt/SKILL.md +469 -0
- package/bin/skills/litgpt/references/custom-models.md +568 -0
- package/bin/skills/litgpt/references/distributed-training.md +451 -0
- package/bin/skills/litgpt/references/supported-models.md +336 -0
- package/bin/skills/litgpt/references/training-recipes.md +619 -0
- package/bin/skills/llama-cpp/SKILL.md +258 -0
- package/bin/skills/llama-cpp/references/optimization.md +89 -0
- package/bin/skills/llama-cpp/references/quantization.md +213 -0
- package/bin/skills/llama-cpp/references/server.md +125 -0
- package/bin/skills/llama-factory/SKILL.md +80 -0
- package/bin/skills/llama-factory/references/_images.md +23 -0
- package/bin/skills/llama-factory/references/advanced.md +1055 -0
- package/bin/skills/llama-factory/references/getting_started.md +349 -0
- package/bin/skills/llama-factory/references/index.md +19 -0
- package/bin/skills/llama-factory/references/other.md +31 -0
- package/bin/skills/llamaguard/SKILL.md +337 -0
- package/bin/skills/llamaindex/SKILL.md +569 -0
- package/bin/skills/llamaindex/references/agents.md +83 -0
- package/bin/skills/llamaindex/references/data_connectors.md +108 -0
- package/bin/skills/llamaindex/references/query_engines.md +406 -0
- package/bin/skills/llava/SKILL.md +304 -0
- package/bin/skills/llava/references/training.md +197 -0
- package/bin/skills/lm-evaluation-harness/SKILL.md +490 -0
- package/bin/skills/lm-evaluation-harness/references/api-evaluation.md +490 -0
- package/bin/skills/lm-evaluation-harness/references/benchmark-guide.md +488 -0
- package/bin/skills/lm-evaluation-harness/references/custom-tasks.md +602 -0
- package/bin/skills/lm-evaluation-harness/references/distributed-eval.md +519 -0
- package/bin/skills/long-context/SKILL.md +536 -0
- package/bin/skills/long-context/references/extension_methods.md +468 -0
- package/bin/skills/long-context/references/fine_tuning.md +611 -0
- package/bin/skills/long-context/references/rope.md +402 -0
- package/bin/skills/mamba/SKILL.md +260 -0
- package/bin/skills/mamba/references/architecture-details.md +206 -0
- package/bin/skills/mamba/references/benchmarks.md +255 -0
- package/bin/skills/mamba/references/training-guide.md +388 -0
- package/bin/skills/megatron-core/SKILL.md +366 -0
- package/bin/skills/megatron-core/references/benchmarks.md +249 -0
- package/bin/skills/megatron-core/references/parallelism-guide.md +404 -0
- package/bin/skills/megatron-core/references/production-examples.md +473 -0
- package/bin/skills/megatron-core/references/training-recipes.md +547 -0
- package/bin/skills/miles/SKILL.md +315 -0
- package/bin/skills/miles/references/api-reference.md +141 -0
- package/bin/skills/miles/references/troubleshooting.md +352 -0
- package/bin/skills/mlflow/SKILL.md +704 -0
- package/bin/skills/mlflow/references/deployment.md +744 -0
- package/bin/skills/mlflow/references/model-registry.md +770 -0
- package/bin/skills/mlflow/references/tracking.md +680 -0
- package/bin/skills/modal/SKILL.md +341 -0
- package/bin/skills/modal/references/advanced-usage.md +503 -0
- package/bin/skills/modal/references/troubleshooting.md +494 -0
- package/bin/skills/model-merging/SKILL.md +539 -0
- package/bin/skills/model-merging/references/evaluation.md +462 -0
- package/bin/skills/model-merging/references/examples.md +428 -0
- package/bin/skills/model-merging/references/methods.md +352 -0
- package/bin/skills/model-pruning/SKILL.md +495 -0
- package/bin/skills/model-pruning/references/wanda.md +347 -0
- package/bin/skills/moe-training/SKILL.md +526 -0
- package/bin/skills/moe-training/references/architectures.md +432 -0
- package/bin/skills/moe-training/references/inference.md +348 -0
- package/bin/skills/moe-training/references/training.md +425 -0
- package/bin/skills/nanogpt/SKILL.md +290 -0
- package/bin/skills/nanogpt/references/architecture.md +382 -0
- package/bin/skills/nanogpt/references/data.md +476 -0
- package/bin/skills/nanogpt/references/training.md +564 -0
- package/bin/skills/nemo-curator/SKILL.md +383 -0
- package/bin/skills/nemo-curator/references/deduplication.md +87 -0
- package/bin/skills/nemo-curator/references/filtering.md +102 -0
- package/bin/skills/nemo-evaluator/SKILL.md +494 -0
- package/bin/skills/nemo-evaluator/references/adapter-system.md +340 -0
- package/bin/skills/nemo-evaluator/references/configuration.md +447 -0
- package/bin/skills/nemo-evaluator/references/custom-benchmarks.md +315 -0
- package/bin/skills/nemo-evaluator/references/execution-backends.md +361 -0
- package/bin/skills/nemo-guardrails/SKILL.md +297 -0
- package/bin/skills/nnsight/SKILL.md +436 -0
- package/bin/skills/nnsight/references/README.md +78 -0
- package/bin/skills/nnsight/references/api.md +344 -0
- package/bin/skills/nnsight/references/tutorials.md +300 -0
- package/bin/skills/openrlhf/SKILL.md +249 -0
- package/bin/skills/openrlhf/references/algorithm-comparison.md +404 -0
- package/bin/skills/openrlhf/references/custom-rewards.md +530 -0
- package/bin/skills/openrlhf/references/hybrid-engine.md +287 -0
- package/bin/skills/openrlhf/references/multi-node-training.md +454 -0
- package/bin/skills/outlines/SKILL.md +652 -0
- package/bin/skills/outlines/references/backends.md +615 -0
- package/bin/skills/outlines/references/examples.md +773 -0
- package/bin/skills/outlines/references/json_generation.md +652 -0
- package/bin/skills/peft/SKILL.md +431 -0
- package/bin/skills/peft/references/advanced-usage.md +514 -0
- package/bin/skills/peft/references/troubleshooting.md +480 -0
- package/bin/skills/phoenix/SKILL.md +475 -0
- package/bin/skills/phoenix/references/advanced-usage.md +619 -0
- package/bin/skills/phoenix/references/troubleshooting.md +538 -0
- package/bin/skills/pinecone/SKILL.md +358 -0
- package/bin/skills/pinecone/references/deployment.md +181 -0
- package/bin/skills/pytorch-fsdp/SKILL.md +126 -0
- package/bin/skills/pytorch-fsdp/references/index.md +7 -0
- package/bin/skills/pytorch-fsdp/references/other.md +4249 -0
- package/bin/skills/pytorch-lightning/SKILL.md +346 -0
- package/bin/skills/pytorch-lightning/references/callbacks.md +436 -0
- package/bin/skills/pytorch-lightning/references/distributed.md +490 -0
- package/bin/skills/pytorch-lightning/references/hyperparameter-tuning.md +556 -0
- package/bin/skills/pyvene/SKILL.md +473 -0
- package/bin/skills/pyvene/references/README.md +73 -0
- package/bin/skills/pyvene/references/api.md +383 -0
- package/bin/skills/pyvene/references/tutorials.md +376 -0
- package/bin/skills/qdrant/SKILL.md +493 -0
- package/bin/skills/qdrant/references/advanced-usage.md +648 -0
- package/bin/skills/qdrant/references/troubleshooting.md +631 -0
- package/bin/skills/ray-data/SKILL.md +326 -0
- package/bin/skills/ray-data/references/integration.md +82 -0
- package/bin/skills/ray-data/references/transformations.md +83 -0
- package/bin/skills/ray-train/SKILL.md +406 -0
- package/bin/skills/ray-train/references/multi-node.md +628 -0
- package/bin/skills/rwkv/SKILL.md +260 -0
- package/bin/skills/rwkv/references/architecture-details.md +344 -0
- package/bin/skills/rwkv/references/rwkv7.md +386 -0
- package/bin/skills/rwkv/references/state-management.md +369 -0
- package/bin/skills/saelens/SKILL.md +386 -0
- package/bin/skills/saelens/references/README.md +70 -0
- package/bin/skills/saelens/references/api.md +333 -0
- package/bin/skills/saelens/references/tutorials.md +318 -0
- package/bin/skills/segment-anything/SKILL.md +500 -0
- package/bin/skills/segment-anything/references/advanced-usage.md +589 -0
- package/bin/skills/segment-anything/references/troubleshooting.md +484 -0
- package/bin/skills/sentence-transformers/SKILL.md +255 -0
- package/bin/skills/sentence-transformers/references/models.md +123 -0
- package/bin/skills/sentencepiece/SKILL.md +235 -0
- package/bin/skills/sentencepiece/references/algorithms.md +200 -0
- package/bin/skills/sentencepiece/references/training.md +304 -0
- package/bin/skills/sglang/SKILL.md +442 -0
- package/bin/skills/sglang/references/deployment.md +490 -0
- package/bin/skills/sglang/references/radix-attention.md +413 -0
- package/bin/skills/sglang/references/structured-generation.md +541 -0
- package/bin/skills/simpo/SKILL.md +219 -0
- package/bin/skills/simpo/references/datasets.md +478 -0
- package/bin/skills/simpo/references/hyperparameters.md +452 -0
- package/bin/skills/simpo/references/loss-functions.md +350 -0
- package/bin/skills/skypilot/SKILL.md +509 -0
- package/bin/skills/skypilot/references/advanced-usage.md +491 -0
- package/bin/skills/skypilot/references/troubleshooting.md +570 -0
- package/bin/skills/slime/SKILL.md +464 -0
- package/bin/skills/slime/references/api-reference.md +392 -0
- package/bin/skills/slime/references/troubleshooting.md +386 -0
- package/bin/skills/speculative-decoding/SKILL.md +467 -0
- package/bin/skills/speculative-decoding/references/lookahead.md +309 -0
- package/bin/skills/speculative-decoding/references/medusa.md +350 -0
- package/bin/skills/stable-diffusion/SKILL.md +519 -0
- package/bin/skills/stable-diffusion/references/advanced-usage.md +716 -0
- package/bin/skills/stable-diffusion/references/troubleshooting.md +555 -0
- package/bin/skills/tensorboard/SKILL.md +629 -0
- package/bin/skills/tensorboard/references/integrations.md +638 -0
- package/bin/skills/tensorboard/references/profiling.md +545 -0
- package/bin/skills/tensorboard/references/visualization.md +620 -0
- package/bin/skills/tensorrt-llm/SKILL.md +187 -0
- package/bin/skills/tensorrt-llm/references/multi-gpu.md +298 -0
- package/bin/skills/tensorrt-llm/references/optimization.md +242 -0
- package/bin/skills/tensorrt-llm/references/serving.md +470 -0
- package/bin/skills/tinker/SKILL.md +362 -0
- package/bin/skills/tinker/references/api-reference.md +168 -0
- package/bin/skills/tinker/references/getting-started.md +157 -0
- package/bin/skills/tinker/references/loss-functions.md +163 -0
- package/bin/skills/tinker/references/models-and-lora.md +139 -0
- package/bin/skills/tinker/references/recipes.md +280 -0
- package/bin/skills/tinker/references/reinforcement-learning.md +212 -0
- package/bin/skills/tinker/references/rendering.md +243 -0
- package/bin/skills/tinker/references/supervised-learning.md +232 -0
- package/bin/skills/tinker-training-cost/SKILL.md +187 -0
- package/bin/skills/tinker-training-cost/scripts/calculate_cost.py +123 -0
- package/bin/skills/torchforge/SKILL.md +433 -0
- package/bin/skills/torchforge/references/api-reference.md +327 -0
- package/bin/skills/torchforge/references/troubleshooting.md +409 -0
- package/bin/skills/torchtitan/SKILL.md +358 -0
- package/bin/skills/torchtitan/references/checkpoint.md +181 -0
- package/bin/skills/torchtitan/references/custom-models.md +258 -0
- package/bin/skills/torchtitan/references/float8.md +133 -0
- package/bin/skills/torchtitan/references/fsdp.md +126 -0
- package/bin/skills/transformer-lens/SKILL.md +346 -0
- package/bin/skills/transformer-lens/references/README.md +54 -0
- package/bin/skills/transformer-lens/references/api.md +362 -0
- package/bin/skills/transformer-lens/references/tutorials.md +339 -0
- package/bin/skills/trl-fine-tuning/SKILL.md +455 -0
- package/bin/skills/trl-fine-tuning/references/dpo-variants.md +227 -0
- package/bin/skills/trl-fine-tuning/references/online-rl.md +82 -0
- package/bin/skills/trl-fine-tuning/references/reward-modeling.md +122 -0
- package/bin/skills/trl-fine-tuning/references/sft-training.md +168 -0
- package/bin/skills/unsloth/SKILL.md +80 -0
- package/bin/skills/unsloth/references/index.md +7 -0
- package/bin/skills/unsloth/references/llms-full.md +16799 -0
- package/bin/skills/unsloth/references/llms-txt.md +12044 -0
- package/bin/skills/unsloth/references/llms.md +82 -0
- package/bin/skills/verl/SKILL.md +391 -0
- package/bin/skills/verl/references/api-reference.md +301 -0
- package/bin/skills/verl/references/troubleshooting.md +391 -0
- package/bin/skills/vllm/SKILL.md +364 -0
- package/bin/skills/vllm/references/optimization.md +226 -0
- package/bin/skills/vllm/references/quantization.md +284 -0
- package/bin/skills/vllm/references/server-deployment.md +255 -0
- package/bin/skills/vllm/references/troubleshooting.md +447 -0
- package/bin/skills/weights-and-biases/SKILL.md +590 -0
- package/bin/skills/weights-and-biases/references/artifacts.md +584 -0
- package/bin/skills/weights-and-biases/references/integrations.md +700 -0
- package/bin/skills/weights-and-biases/references/sweeps.md +847 -0
- package/bin/skills/whisper/SKILL.md +317 -0
- package/bin/skills/whisper/references/languages.md +189 -0
- package/bin/synsc +0 -0
- package/package.json +10 -0
|
@@ -0,0 +1,326 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: ray-data
|
|
3
|
+
description: Scalable data processing for ML workloads. Streaming execution across CPU/GPU, supports Parquet/CSV/JSON/images. Integrates with Ray Train, PyTorch, TensorFlow. Scales from single machine to 100s of nodes. Use for batch inference, data preprocessing, multi-modal data loading, or distributed ETL pipelines.
|
|
4
|
+
version: 1.0.0
|
|
5
|
+
author: Synthetic Sciences
|
|
6
|
+
license: MIT
|
|
7
|
+
tags: [Data Processing, Ray Data, Distributed Computing, ML Pipelines, Batch Inference, ETL, Scalable, Ray, PyTorch, TensorFlow]
|
|
8
|
+
dependencies: ["ray[data]", pyarrow, pandas]
|
|
9
|
+
---
|
|
10
|
+
|
|
11
|
+
# Ray Data - Scalable ML Data Processing
|
|
12
|
+
|
|
13
|
+
Distributed data processing library for ML and AI workloads.
|
|
14
|
+
|
|
15
|
+
## When to use Ray Data
|
|
16
|
+
|
|
17
|
+
**Use Ray Data when:**
|
|
18
|
+
- Processing large datasets (>100GB) for ML training
|
|
19
|
+
- Running distributed data preprocessing across a cluster
|
|
20
|
+
- Building batch inference pipelines
|
|
21
|
+
- Loading multi-modal data (images, audio, video)
|
|
22
|
+
- Scaling data processing from laptop to cluster
|
|
23
|
+
|
|
24
|
+
**Key features**:
|
|
25
|
+
- **Streaming execution**: Process data larger than memory
|
|
26
|
+
- **GPU support**: Accelerate transforms with GPUs
|
|
27
|
+
- **Framework integration**: PyTorch, TensorFlow, HuggingFace
|
|
28
|
+
- **Multi-modal**: Images, Parquet, CSV, JSON, audio, video
|
|
29
|
+
|
|
30
|
+
**Use alternatives instead**:
|
|
31
|
+
- **Pandas**: Small data (<1GB) on single machine
|
|
32
|
+
- **Dask**: Tabular data, SQL-like operations
|
|
33
|
+
- **Spark**: Enterprise ETL, SQL queries
|
|
34
|
+
|
|
35
|
+
## Quick start
|
|
36
|
+
|
|
37
|
+
### Installation
|
|
38
|
+
|
|
39
|
+
```bash
|
|
40
|
+
pip install -U 'ray[data]'
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
### Load and transform data
|
|
44
|
+
|
|
45
|
+
```python
|
|
46
|
+
import ray
|
|
47
|
+
|
|
48
|
+
# Read Parquet files
|
|
49
|
+
ds = ray.data.read_parquet("s3://bucket/data/*.parquet")
|
|
50
|
+
|
|
51
|
+
# Transform data (lazy execution)
|
|
52
|
+
ds = ds.map_batches(lambda batch: {"processed": batch["text"].str.lower().to_numpy()}, batch_format="pandas")
|
|
53
|
+
|
|
54
|
+
# Consume data
|
|
55
|
+
for batch in ds.iter_batches(batch_size=100):
|
|
56
|
+
print(batch)
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
### Integration with Ray Train
|
|
60
|
+
|
|
61
|
+
```python
|
|
62
|
+
import ray
|
|
63
|
+
from ray.train import ScalingConfig
|
|
64
|
+
from ray.train.torch import TorchTrainer
|
|
65
|
+
|
|
66
|
+
# Create dataset
|
|
67
|
+
train_ds = ray.data.read_parquet("s3://bucket/train/*.parquet")
|
|
68
|
+
|
|
69
|
+
def train_func(config):
|
|
70
|
+
# Access dataset in training
|
|
71
|
+
train_ds = ray.train.get_dataset_shard("train")
|
|
72
|
+
|
|
73
|
+
for epoch in range(10):
|
|
74
|
+
for batch in train_ds.iter_batches(batch_size=32):
|
|
75
|
+
# Train on batch
|
|
76
|
+
pass
|
|
77
|
+
|
|
78
|
+
# Train with Ray
|
|
79
|
+
trainer = TorchTrainer(
|
|
80
|
+
train_func,
|
|
81
|
+
datasets={"train": train_ds},
|
|
82
|
+
scaling_config=ScalingConfig(num_workers=4, use_gpu=True)
|
|
83
|
+
)
|
|
84
|
+
trainer.fit()
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
## Reading data
|
|
88
|
+
|
|
89
|
+
### From cloud storage
|
|
90
|
+
|
|
91
|
+
```python
|
|
92
|
+
import ray
|
|
93
|
+
|
|
94
|
+
# Parquet (recommended for ML)
|
|
95
|
+
ds = ray.data.read_parquet("s3://bucket/data/*.parquet")
|
|
96
|
+
|
|
97
|
+
# CSV
|
|
98
|
+
ds = ray.data.read_csv("s3://bucket/data/*.csv")
|
|
99
|
+
|
|
100
|
+
# JSON
|
|
101
|
+
ds = ray.data.read_json("gs://bucket/data/*.json")
|
|
102
|
+
|
|
103
|
+
# Images
|
|
104
|
+
ds = ray.data.read_images("s3://bucket/images/")
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
### From Python objects
|
|
108
|
+
|
|
109
|
+
```python
|
|
110
|
+
# From list
|
|
111
|
+
ds = ray.data.from_items([{"id": i, "value": i * 2} for i in range(1000)])
|
|
112
|
+
|
|
113
|
+
# From range
|
|
114
|
+
ds = ray.data.range(1000000) # Synthetic data
|
|
115
|
+
|
|
116
|
+
# From pandas
|
|
117
|
+
import pandas as pd
|
|
118
|
+
df = pd.DataFrame({"col1": [1, 2, 3], "col2": [4, 5, 6]})
|
|
119
|
+
ds = ray.data.from_pandas(df)
|
|
120
|
+
```
|
|
121
|
+
|
|
122
|
+
## Transformations
|
|
123
|
+
|
|
124
|
+
### Map batches (vectorized)
|
|
125
|
+
|
|
126
|
+
```python
|
|
127
|
+
# Batch transformation (fast)
|
|
128
|
+
def process_batch(batch):
|
|
129
|
+
batch["doubled"] = batch["value"] * 2
|
|
130
|
+
return batch
|
|
131
|
+
|
|
132
|
+
ds = ds.map_batches(process_batch, batch_size=1000)
|
|
133
|
+
```
|
|
134
|
+
|
|
135
|
+
### Row transformations
|
|
136
|
+
|
|
137
|
+
```python
|
|
138
|
+
# Row-by-row (slower)
|
|
139
|
+
def process_row(row):
|
|
140
|
+
row["squared"] = row["value"] ** 2
|
|
141
|
+
return row
|
|
142
|
+
|
|
143
|
+
ds = ds.map(process_row)
|
|
144
|
+
```
|
|
145
|
+
|
|
146
|
+
### Filter
|
|
147
|
+
|
|
148
|
+
```python
|
|
149
|
+
# Filter rows
|
|
150
|
+
ds = ds.filter(lambda row: row["value"] > 100)
|
|
151
|
+
```
|
|
152
|
+
|
|
153
|
+
### Group by and aggregate
|
|
154
|
+
|
|
155
|
+
```python
|
|
156
|
+
# Group by column
|
|
157
|
+
counts = ds.groupby("category").count()
|
|
158
|
+
|
|
159
|
+
# Custom aggregation
|
|
160
|
+
ds = ds.groupby("category").map_groups(lambda group: {"sum": [group["value"].sum()]})
|
|
161
|
+
```
|
|
162
|
+
|
|
163
|
+
## GPU-accelerated transforms
|
|
164
|
+
|
|
165
|
+
```python
|
|
166
|
+
# Use GPU for preprocessing
|
|
167
|
+
def preprocess_images_gpu(batch):
|
|
168
|
+
import torch
|
|
169
|
+
images = torch.tensor(batch["image"]).cuda()
|
|
170
|
+
# GPU preprocessing
|
|
171
|
+
processed = images * 255
|
|
172
|
+
return {"processed": processed.cpu().numpy()}
|
|
173
|
+
|
|
174
|
+
ds = ds.map_batches(
|
|
175
|
+
preprocess_images_gpu,
|
|
176
|
+
batch_size=64,
|
|
177
|
+
num_gpus=1 # Request GPU
|
|
178
|
+
)
|
|
179
|
+
```
|
|
180
|
+
|
|
181
|
+
## Writing data
|
|
182
|
+
|
|
183
|
+
```python
|
|
184
|
+
# Write to Parquet
|
|
185
|
+
ds.write_parquet("s3://bucket/output/")
|
|
186
|
+
|
|
187
|
+
# Write to CSV
|
|
188
|
+
ds.write_csv("output/")
|
|
189
|
+
|
|
190
|
+
# Write to JSON
|
|
191
|
+
ds.write_json("output/")
|
|
192
|
+
```
|
|
193
|
+
|
|
194
|
+
## Performance optimization
|
|
195
|
+
|
|
196
|
+
### Repartition
|
|
197
|
+
|
|
198
|
+
```python
|
|
199
|
+
# Control parallelism
|
|
200
|
+
ds = ds.repartition(100) # 100 blocks for 100-core cluster
|
|
201
|
+
```
|
|
202
|
+
|
|
203
|
+
### Batch size tuning
|
|
204
|
+
|
|
205
|
+
```python
|
|
206
|
+
# Larger batches = faster vectorized ops
|
|
207
|
+
ds.map_batches(process_fn, batch_size=10000) # vs batch_size=100
|
|
208
|
+
```
|
|
209
|
+
|
|
210
|
+
### Streaming execution
|
|
211
|
+
|
|
212
|
+
```python
|
|
213
|
+
# Process data larger than memory
|
|
214
|
+
ds = ray.data.read_parquet("s3://huge-dataset/")
|
|
215
|
+
for batch in ds.iter_batches(batch_size=1000):
|
|
216
|
+
process(batch) # Streamed, not loaded to memory
|
|
217
|
+
```
|
|
218
|
+
|
|
219
|
+
## Common patterns
|
|
220
|
+
|
|
221
|
+
### Batch inference
|
|
222
|
+
|
|
223
|
+
```python
|
|
224
|
+
import ray
|
|
225
|
+
|
|
226
|
+
# Load model
|
|
227
|
+
def load_model():
|
|
228
|
+
# Load once per worker
|
|
229
|
+
return MyModel()
|
|
230
|
+
|
|
231
|
+
# Inference function
|
|
232
|
+
class BatchInference:
|
|
233
|
+
def __init__(self):
|
|
234
|
+
self.model = load_model()
|
|
235
|
+
|
|
236
|
+
def __call__(self, batch):
|
|
237
|
+
predictions = self.model(batch["input"])
|
|
238
|
+
return {"prediction": predictions}
|
|
239
|
+
|
|
240
|
+
# Run distributed inference
|
|
241
|
+
ds = ray.data.read_parquet("s3://data/")
|
|
242
|
+
predictions = ds.map_batches(BatchInference, batch_size=32, num_gpus=1, concurrency=2)  # callable classes require `concurrency` (actor pool size)
|
|
243
|
+
predictions.write_parquet("s3://output/")
|
|
244
|
+
```
|
|
245
|
+
|
|
246
|
+
### Data preprocessing pipeline
|
|
247
|
+
|
|
248
|
+
```python
|
|
249
|
+
# Multi-step pipeline
|
|
250
|
+
(  # no assignment: write_parquet() returns None
|
|
251
|
+
ray.data.read_parquet("s3://raw/")
|
|
252
|
+
.map_batches(clean_data)
|
|
253
|
+
.map_batches(tokenize)
|
|
254
|
+
.map_batches(augment)
|
|
255
|
+
.write_parquet("s3://processed/")
|
|
256
|
+
)
|
|
257
|
+
```
|
|
258
|
+
|
|
259
|
+
## Integration with ML frameworks
|
|
260
|
+
|
|
261
|
+
### PyTorch
|
|
262
|
+
|
|
263
|
+
```python
|
|
264
|
+
# Convert to PyTorch
|
|
265
|
+
torch_ds = ds.iter_torch_batches(batch_size=32)
|
|
266
|
+
|
|
267
|
+
for batch in torch_ds:
|
|
268
|
+
# batch is dict with tensors
|
|
269
|
+
inputs, labels = batch["features"], batch["label"]
|
|
270
|
+
```
|
|
271
|
+
|
|
272
|
+
### TensorFlow
|
|
273
|
+
|
|
274
|
+
```python
|
|
275
|
+
# Convert to TensorFlow
|
|
276
|
+
tf_ds = ds.to_tf(feature_columns=["image"], label_column="label", batch_size=32)
|
|
277
|
+
|
|
278
|
+
for features, labels in tf_ds:
|
|
279
|
+
# Train model
|
|
280
|
+
pass
|
|
281
|
+
```
|
|
282
|
+
|
|
283
|
+
## Supported data formats
|
|
284
|
+
|
|
285
|
+
| Format | Read | Write | Use Case |
|
|
286
|
+
|--------|------|-------|----------|
|
|
287
|
+
| Parquet | ✅ | ✅ | ML data (recommended) |
|
|
288
|
+
| CSV | ✅ | ✅ | Tabular data |
|
|
289
|
+
| JSON | ✅ | ✅ | Semi-structured |
|
|
290
|
+
| Images | ✅ | ✅ | Computer vision |
|
|
291
|
+
| NumPy | ✅ | ✅ | Arrays |
|
|
292
|
+
| Pandas | ✅ | ❌ | DataFrames |
|
|
293
|
+
|
|
294
|
+
## Performance benchmarks
|
|
295
|
+
|
|
296
|
+
**Scaling** (processing 100GB data):
|
|
297
|
+
- 1 node (16 cores): ~30 minutes
|
|
298
|
+
- 4 nodes (64 cores): ~8 minutes
|
|
299
|
+
- 16 nodes (256 cores): ~2 minutes
|
|
300
|
+
|
|
301
|
+
**GPU acceleration** (image preprocessing):
|
|
302
|
+
- CPU only: 1,000 images/sec
|
|
303
|
+
- 1 GPU: 5,000 images/sec
|
|
304
|
+
- 4 GPUs: 18,000 images/sec
|
|
305
|
+
|
|
306
|
+
## Use cases
|
|
307
|
+
|
|
308
|
+
**Production deployments**:
|
|
309
|
+
- **Pinterest**: Last-mile data processing for model training
|
|
310
|
+
- **ByteDance**: Scaling offline inference with multi-modal LLMs
|
|
311
|
+
- **Spotify**: ML platform for batch inference
|
|
312
|
+
|
|
313
|
+
## References
|
|
314
|
+
|
|
315
|
+
- **[Transformations Guide](references/transformations.md)** - Map, filter, groupby operations
|
|
316
|
+
- **[Integration Guide](references/integration.md)** - Ray Train, PyTorch, TensorFlow
|
|
317
|
+
|
|
318
|
+
## Resources
|
|
319
|
+
|
|
320
|
+
- **Docs**: https://docs.ray.io/en/latest/data/data.html
|
|
321
|
+
- **GitHub**: https://github.com/ray-project/ray ⭐ 36,000+
|
|
322
|
+
- **Version**: Ray 2.40.0+
|
|
323
|
+
- **Examples**: https://docs.ray.io/en/latest/data/examples/overview.html
|
|
324
|
+
|
|
325
|
+
|
|
326
|
+
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
# Ray Data Integration Guide
|
|
2
|
+
|
|
3
|
+
Integration with Ray Train and ML frameworks.
|
|
4
|
+
|
|
5
|
+
## Ray Train integration
|
|
6
|
+
|
|
7
|
+
### Basic training with datasets
|
|
8
|
+
|
|
9
|
+
```python
|
|
10
|
+
import ray
|
|
11
|
+
from ray.train import ScalingConfig
|
|
12
|
+
from ray.train.torch import TorchTrainer
|
|
13
|
+
|
|
14
|
+
# Create datasets
|
|
15
|
+
train_ds = ray.data.read_parquet("s3://data/train/")
|
|
16
|
+
val_ds = ray.data.read_parquet("s3://data/val/")
|
|
17
|
+
|
|
18
|
+
def train_func(config):
|
|
19
|
+
# Get dataset shards
|
|
20
|
+
train_ds = ray.train.get_dataset_shard("train")
|
|
21
|
+
val_ds = ray.train.get_dataset_shard("val")
|
|
22
|
+
|
|
23
|
+
for epoch in range(config["epochs"]):
|
|
24
|
+
# Iterate over batches
|
|
25
|
+
for batch in train_ds.iter_batches(batch_size=32):
|
|
26
|
+
# Train on batch
|
|
27
|
+
pass
|
|
28
|
+
|
|
29
|
+
# Launch training
|
|
30
|
+
trainer = TorchTrainer(
|
|
31
|
+
train_func,
|
|
32
|
+
train_loop_config={"epochs": 10},
|
|
33
|
+
datasets={"train": train_ds, "val": val_ds},
|
|
34
|
+
scaling_config=ScalingConfig(num_workers=4, use_gpu=True)
|
|
35
|
+
)
|
|
36
|
+
|
|
37
|
+
result = trainer.fit()
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
## PyTorch integration
|
|
41
|
+
|
|
42
|
+
### Convert to PyTorch Dataset
|
|
43
|
+
|
|
44
|
+
```python
|
|
45
|
+
# Option 1: to_torch (legacy API; newer Ray releases recommend iter_torch_batches)
|
|
46
|
+
torch_ds = ds.to_torch(
|
|
47
|
+
label_column="label",
|
|
48
|
+
batch_size=32,
|
|
49
|
+
drop_last=True
|
|
50
|
+
)
|
|
51
|
+
|
|
52
|
+
for inputs, labels in torch_ds:
|
|
53
|
+
# to_torch yields (features, labels) tuples, not dicts
|
|
54
|
+
pass  # replace with the training step
|
|
55
|
+
# Train model
|
|
56
|
+
|
|
57
|
+
# Option 2: iter_torch_batches
|
|
58
|
+
for batch in ds.iter_torch_batches(batch_size=32):
|
|
59
|
+
# batch is dict of tensors
|
|
60
|
+
pass
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
## TensorFlow integration
|
|
64
|
+
|
|
65
|
+
```python
|
|
66
|
+
tf_ds = ds.to_tf(
|
|
67
|
+
feature_columns=["image", "text"],
|
|
68
|
+
label_column="label",
|
|
69
|
+
batch_size=32
|
|
70
|
+
)
|
|
71
|
+
|
|
72
|
+
for features, labels in tf_ds:
|
|
73
|
+
# Train TensorFlow model
|
|
74
|
+
pass
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
## Best practices
|
|
78
|
+
|
|
79
|
+
1. **Shard datasets in Ray Train** - Automatic with `get_dataset_shard()`
|
|
80
|
+
2. **Use streaming** - Don't load the entire dataset into memory
|
|
81
|
+
3. **Preprocess in Ray Data** - Distribute preprocessing across cluster
|
|
82
|
+
4. **Cache preprocessed data** - Write to Parquet, read in training
|
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
# Ray Data Transformations
|
|
2
|
+
|
|
3
|
+
Complete guide to data transformations in Ray Data.
|
|
4
|
+
|
|
5
|
+
## Core operations
|
|
6
|
+
|
|
7
|
+
### Map batches (vectorized)
|
|
8
|
+
|
|
9
|
+
```python
|
|
10
|
+
# Recommended for performance
|
|
11
|
+
def process_batch(batch):
|
|
12
|
+
# batch is a dict of numpy arrays by default (a pandas DataFrame if batch_format="pandas")
|
|
13
|
+
batch["doubled"] = batch["value"] * 2
|
|
14
|
+
return batch
|
|
15
|
+
|
|
16
|
+
ds = ds.map_batches(process_batch, batch_size=1000)
|
|
17
|
+
```
|
|
18
|
+
|
|
19
|
+
**Performance**: 10-100× faster than row-by-row
|
|
20
|
+
|
|
21
|
+
### Map (row-by-row)
|
|
22
|
+
|
|
23
|
+
```python
|
|
24
|
+
# Use only when vectorization not possible
|
|
25
|
+
def process_row(row):
|
|
26
|
+
row["squared"] = row["value"] ** 2
|
|
27
|
+
return row
|
|
28
|
+
|
|
29
|
+
ds = ds.map(process_row)
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
### Filter
|
|
33
|
+
|
|
34
|
+
```python
|
|
35
|
+
# Remove rows
|
|
36
|
+
ds = ds.filter(lambda row: row["score"] > 0.5)
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
### Flat map
|
|
40
|
+
|
|
41
|
+
```python
|
|
42
|
+
# One row → multiple rows
|
|
43
|
+
def expand_row(row):
|
|
44
|
+
return [{"value": row["value"] + i} for i in range(3)]
|
|
45
|
+
|
|
46
|
+
ds = ds.flat_map(expand_row)
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
## GPU-accelerated transforms
|
|
50
|
+
|
|
51
|
+
```python
|
|
52
|
+
def gpu_transform(batch):
|
|
53
|
+
import torch
|
|
54
|
+
data = torch.tensor(batch["data"]).cuda()
|
|
55
|
+
# GPU processing
|
|
56
|
+
result = data * 2
|
|
57
|
+
return {"processed": result.cpu().numpy()}
|
|
58
|
+
|
|
59
|
+
ds = ds.map_batches(gpu_transform, num_gpus=1, batch_size=64)
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
## Groupby operations
|
|
63
|
+
|
|
64
|
+
```python
|
|
65
|
+
# Group by column
|
|
66
|
+
grouped = ds.groupby("category")
|
|
67
|
+
|
|
68
|
+
# Aggregate
|
|
69
|
+
result = grouped.count()
|
|
70
|
+
|
|
71
|
+
# Custom aggregation
|
|
72
|
+
result = grouped.map_groups(lambda group: {
|
|
73
|
+
"sum": [group["value"].sum()],
|
|
74
|
+
"mean": [group["value"].mean()]
|
|
75
|
+
})
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
## Best practices
|
|
79
|
+
|
|
80
|
+
1. **Use map_batches over map** - 10-100× faster
|
|
81
|
+
2. **Tune batch_size** - Larger = faster (balance with memory)
|
|
82
|
+
3. **Use GPUs for heavy compute** - Image/audio preprocessing
|
|
83
|
+
4. **Stream large datasets** - Use iter_batches for larger-than-memory data
|