@synsci/cli-darwin-x64 1.1.49
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/skills/accelerate/SKILL.md +332 -0
- package/bin/skills/accelerate/references/custom-plugins.md +453 -0
- package/bin/skills/accelerate/references/megatron-integration.md +489 -0
- package/bin/skills/accelerate/references/performance.md +525 -0
- package/bin/skills/audiocraft/SKILL.md +564 -0
- package/bin/skills/audiocraft/references/advanced-usage.md +666 -0
- package/bin/skills/audiocraft/references/troubleshooting.md +504 -0
- package/bin/skills/autogpt/SKILL.md +403 -0
- package/bin/skills/autogpt/references/advanced-usage.md +535 -0
- package/bin/skills/autogpt/references/troubleshooting.md +420 -0
- package/bin/skills/awq/SKILL.md +310 -0
- package/bin/skills/awq/references/advanced-usage.md +324 -0
- package/bin/skills/awq/references/troubleshooting.md +344 -0
- package/bin/skills/axolotl/SKILL.md +158 -0
- package/bin/skills/axolotl/references/api.md +5548 -0
- package/bin/skills/axolotl/references/dataset-formats.md +1029 -0
- package/bin/skills/axolotl/references/index.md +15 -0
- package/bin/skills/axolotl/references/other.md +3563 -0
- package/bin/skills/bigcode-evaluation-harness/SKILL.md +405 -0
- package/bin/skills/bigcode-evaluation-harness/references/benchmarks.md +393 -0
- package/bin/skills/bigcode-evaluation-harness/references/custom-tasks.md +424 -0
- package/bin/skills/bigcode-evaluation-harness/references/issues.md +394 -0
- package/bin/skills/bitsandbytes/SKILL.md +411 -0
- package/bin/skills/bitsandbytes/references/memory-optimization.md +521 -0
- package/bin/skills/bitsandbytes/references/qlora-training.md +521 -0
- package/bin/skills/bitsandbytes/references/quantization-formats.md +447 -0
- package/bin/skills/blip-2/SKILL.md +564 -0
- package/bin/skills/blip-2/references/advanced-usage.md +680 -0
- package/bin/skills/blip-2/references/troubleshooting.md +526 -0
- package/bin/skills/chroma/SKILL.md +406 -0
- package/bin/skills/chroma/references/integration.md +38 -0
- package/bin/skills/clip/SKILL.md +253 -0
- package/bin/skills/clip/references/applications.md +207 -0
- package/bin/skills/constitutional-ai/SKILL.md +290 -0
- package/bin/skills/crewai/SKILL.md +498 -0
- package/bin/skills/crewai/references/flows.md +438 -0
- package/bin/skills/crewai/references/tools.md +429 -0
- package/bin/skills/crewai/references/troubleshooting.md +480 -0
- package/bin/skills/deepspeed/SKILL.md +141 -0
- package/bin/skills/deepspeed/references/08.md +17 -0
- package/bin/skills/deepspeed/references/09.md +173 -0
- package/bin/skills/deepspeed/references/2020.md +378 -0
- package/bin/skills/deepspeed/references/2023.md +279 -0
- package/bin/skills/deepspeed/references/assets.md +179 -0
- package/bin/skills/deepspeed/references/index.md +35 -0
- package/bin/skills/deepspeed/references/mii.md +118 -0
- package/bin/skills/deepspeed/references/other.md +1191 -0
- package/bin/skills/deepspeed/references/tutorials.md +6554 -0
- package/bin/skills/dspy/SKILL.md +590 -0
- package/bin/skills/dspy/references/examples.md +663 -0
- package/bin/skills/dspy/references/modules.md +475 -0
- package/bin/skills/dspy/references/optimizers.md +566 -0
- package/bin/skills/faiss/SKILL.md +221 -0
- package/bin/skills/faiss/references/index_types.md +280 -0
- package/bin/skills/flash-attention/SKILL.md +367 -0
- package/bin/skills/flash-attention/references/benchmarks.md +215 -0
- package/bin/skills/flash-attention/references/transformers-integration.md +293 -0
- package/bin/skills/gguf/SKILL.md +427 -0
- package/bin/skills/gguf/references/advanced-usage.md +504 -0
- package/bin/skills/gguf/references/troubleshooting.md +442 -0
- package/bin/skills/gptq/SKILL.md +450 -0
- package/bin/skills/gptq/references/calibration.md +337 -0
- package/bin/skills/gptq/references/integration.md +129 -0
- package/bin/skills/gptq/references/troubleshooting.md +95 -0
- package/bin/skills/grpo-rl-training/README.md +97 -0
- package/bin/skills/grpo-rl-training/SKILL.md +572 -0
- package/bin/skills/grpo-rl-training/examples/reward_functions_library.py +393 -0
- package/bin/skills/grpo-rl-training/templates/basic_grpo_training.py +228 -0
- package/bin/skills/guidance/SKILL.md +572 -0
- package/bin/skills/guidance/references/backends.md +554 -0
- package/bin/skills/guidance/references/constraints.md +674 -0
- package/bin/skills/guidance/references/examples.md +767 -0
- package/bin/skills/hqq/SKILL.md +445 -0
- package/bin/skills/hqq/references/advanced-usage.md +528 -0
- package/bin/skills/hqq/references/troubleshooting.md +503 -0
- package/bin/skills/hugging-face-cli/SKILL.md +191 -0
- package/bin/skills/hugging-face-cli/references/commands.md +954 -0
- package/bin/skills/hugging-face-cli/references/examples.md +374 -0
- package/bin/skills/hugging-face-datasets/SKILL.md +547 -0
- package/bin/skills/hugging-face-datasets/examples/diverse_training_examples.json +239 -0
- package/bin/skills/hugging-face-datasets/examples/system_prompt_template.txt +196 -0
- package/bin/skills/hugging-face-datasets/examples/training_examples.json +176 -0
- package/bin/skills/hugging-face-datasets/scripts/dataset_manager.py +522 -0
- package/bin/skills/hugging-face-datasets/scripts/sql_manager.py +844 -0
- package/bin/skills/hugging-face-datasets/templates/chat.json +55 -0
- package/bin/skills/hugging-face-datasets/templates/classification.json +62 -0
- package/bin/skills/hugging-face-datasets/templates/completion.json +51 -0
- package/bin/skills/hugging-face-datasets/templates/custom.json +75 -0
- package/bin/skills/hugging-face-datasets/templates/qa.json +54 -0
- package/bin/skills/hugging-face-datasets/templates/tabular.json +81 -0
- package/bin/skills/hugging-face-evaluation/SKILL.md +656 -0
- package/bin/skills/hugging-face-evaluation/examples/USAGE_EXAMPLES.md +382 -0
- package/bin/skills/hugging-face-evaluation/examples/artificial_analysis_to_hub.py +141 -0
- package/bin/skills/hugging-face-evaluation/examples/example_readme_tables.md +135 -0
- package/bin/skills/hugging-face-evaluation/examples/metric_mapping.json +50 -0
- package/bin/skills/hugging-face-evaluation/requirements.txt +20 -0
- package/bin/skills/hugging-face-evaluation/scripts/evaluation_manager.py +1374 -0
- package/bin/skills/hugging-face-evaluation/scripts/inspect_eval_uv.py +104 -0
- package/bin/skills/hugging-face-evaluation/scripts/inspect_vllm_uv.py +317 -0
- package/bin/skills/hugging-face-evaluation/scripts/lighteval_vllm_uv.py +303 -0
- package/bin/skills/hugging-face-evaluation/scripts/run_eval_job.py +98 -0
- package/bin/skills/hugging-face-evaluation/scripts/run_vllm_eval_job.py +331 -0
- package/bin/skills/hugging-face-evaluation/scripts/test_extraction.py +206 -0
- package/bin/skills/hugging-face-jobs/SKILL.md +1041 -0
- package/bin/skills/hugging-face-jobs/index.html +216 -0
- package/bin/skills/hugging-face-jobs/references/hardware_guide.md +336 -0
- package/bin/skills/hugging-face-jobs/references/hub_saving.md +352 -0
- package/bin/skills/hugging-face-jobs/references/token_usage.md +546 -0
- package/bin/skills/hugging-face-jobs/references/troubleshooting.md +475 -0
- package/bin/skills/hugging-face-jobs/scripts/cot-self-instruct.py +718 -0
- package/bin/skills/hugging-face-jobs/scripts/finepdfs-stats.py +546 -0
- package/bin/skills/hugging-face-jobs/scripts/generate-responses.py +587 -0
- package/bin/skills/hugging-face-model-trainer/SKILL.md +711 -0
- package/bin/skills/hugging-face-model-trainer/references/gguf_conversion.md +296 -0
- package/bin/skills/hugging-face-model-trainer/references/hardware_guide.md +283 -0
- package/bin/skills/hugging-face-model-trainer/references/hub_saving.md +364 -0
- package/bin/skills/hugging-face-model-trainer/references/reliability_principles.md +371 -0
- package/bin/skills/hugging-face-model-trainer/references/trackio_guide.md +189 -0
- package/bin/skills/hugging-face-model-trainer/references/training_methods.md +150 -0
- package/bin/skills/hugging-face-model-trainer/references/training_patterns.md +203 -0
- package/bin/skills/hugging-face-model-trainer/references/troubleshooting.md +282 -0
- package/bin/skills/hugging-face-model-trainer/scripts/convert_to_gguf.py +424 -0
- package/bin/skills/hugging-face-model-trainer/scripts/dataset_inspector.py +417 -0
- package/bin/skills/hugging-face-model-trainer/scripts/estimate_cost.py +150 -0
- package/bin/skills/hugging-face-model-trainer/scripts/train_dpo_example.py +106 -0
- package/bin/skills/hugging-face-model-trainer/scripts/train_grpo_example.py +89 -0
- package/bin/skills/hugging-face-model-trainer/scripts/train_sft_example.py +122 -0
- package/bin/skills/hugging-face-paper-publisher/SKILL.md +627 -0
- package/bin/skills/hugging-face-paper-publisher/examples/example_usage.md +327 -0
- package/bin/skills/hugging-face-paper-publisher/references/quick_reference.md +216 -0
- package/bin/skills/hugging-face-paper-publisher/scripts/paper_manager.py +508 -0
- package/bin/skills/hugging-face-paper-publisher/templates/arxiv.md +299 -0
- package/bin/skills/hugging-face-paper-publisher/templates/ml-report.md +358 -0
- package/bin/skills/hugging-face-paper-publisher/templates/modern.md +319 -0
- package/bin/skills/hugging-face-paper-publisher/templates/standard.md +201 -0
- package/bin/skills/hugging-face-tool-builder/SKILL.md +115 -0
- package/bin/skills/hugging-face-tool-builder/references/baseline_hf_api.py +57 -0
- package/bin/skills/hugging-face-tool-builder/references/baseline_hf_api.sh +40 -0
- package/bin/skills/hugging-face-tool-builder/references/baseline_hf_api.tsx +57 -0
- package/bin/skills/hugging-face-tool-builder/references/find_models_by_paper.sh +230 -0
- package/bin/skills/hugging-face-tool-builder/references/hf_enrich_models.sh +96 -0
- package/bin/skills/hugging-face-tool-builder/references/hf_model_card_frontmatter.sh +188 -0
- package/bin/skills/hugging-face-tool-builder/references/hf_model_papers_auth.sh +171 -0
- package/bin/skills/hugging-face-trackio/SKILL.md +65 -0
- package/bin/skills/hugging-face-trackio/references/logging_metrics.md +206 -0
- package/bin/skills/hugging-face-trackio/references/retrieving_metrics.md +223 -0
- package/bin/skills/huggingface-tokenizers/SKILL.md +516 -0
- package/bin/skills/huggingface-tokenizers/references/algorithms.md +653 -0
- package/bin/skills/huggingface-tokenizers/references/integration.md +637 -0
- package/bin/skills/huggingface-tokenizers/references/pipeline.md +723 -0
- package/bin/skills/huggingface-tokenizers/references/training.md +565 -0
- package/bin/skills/instructor/SKILL.md +740 -0
- package/bin/skills/instructor/references/examples.md +107 -0
- package/bin/skills/instructor/references/providers.md +70 -0
- package/bin/skills/instructor/references/validation.md +606 -0
- package/bin/skills/knowledge-distillation/SKILL.md +458 -0
- package/bin/skills/knowledge-distillation/references/minillm.md +334 -0
- package/bin/skills/lambda-labs/SKILL.md +545 -0
- package/bin/skills/lambda-labs/references/advanced-usage.md +611 -0
- package/bin/skills/lambda-labs/references/troubleshooting.md +530 -0
- package/bin/skills/langchain/SKILL.md +480 -0
- package/bin/skills/langchain/references/agents.md +499 -0
- package/bin/skills/langchain/references/integration.md +562 -0
- package/bin/skills/langchain/references/rag.md +600 -0
- package/bin/skills/langsmith/SKILL.md +422 -0
- package/bin/skills/langsmith/references/advanced-usage.md +548 -0
- package/bin/skills/langsmith/references/troubleshooting.md +537 -0
- package/bin/skills/litgpt/SKILL.md +469 -0
- package/bin/skills/litgpt/references/custom-models.md +568 -0
- package/bin/skills/litgpt/references/distributed-training.md +451 -0
- package/bin/skills/litgpt/references/supported-models.md +336 -0
- package/bin/skills/litgpt/references/training-recipes.md +619 -0
- package/bin/skills/llama-cpp/SKILL.md +258 -0
- package/bin/skills/llama-cpp/references/optimization.md +89 -0
- package/bin/skills/llama-cpp/references/quantization.md +213 -0
- package/bin/skills/llama-cpp/references/server.md +125 -0
- package/bin/skills/llama-factory/SKILL.md +80 -0
- package/bin/skills/llama-factory/references/_images.md +23 -0
- package/bin/skills/llama-factory/references/advanced.md +1055 -0
- package/bin/skills/llama-factory/references/getting_started.md +349 -0
- package/bin/skills/llama-factory/references/index.md +19 -0
- package/bin/skills/llama-factory/references/other.md +31 -0
- package/bin/skills/llamaguard/SKILL.md +337 -0
- package/bin/skills/llamaindex/SKILL.md +569 -0
- package/bin/skills/llamaindex/references/agents.md +83 -0
- package/bin/skills/llamaindex/references/data_connectors.md +108 -0
- package/bin/skills/llamaindex/references/query_engines.md +406 -0
- package/bin/skills/llava/SKILL.md +304 -0
- package/bin/skills/llava/references/training.md +197 -0
- package/bin/skills/lm-evaluation-harness/SKILL.md +490 -0
- package/bin/skills/lm-evaluation-harness/references/api-evaluation.md +490 -0
- package/bin/skills/lm-evaluation-harness/references/benchmark-guide.md +488 -0
- package/bin/skills/lm-evaluation-harness/references/custom-tasks.md +602 -0
- package/bin/skills/lm-evaluation-harness/references/distributed-eval.md +519 -0
- package/bin/skills/long-context/SKILL.md +536 -0
- package/bin/skills/long-context/references/extension_methods.md +468 -0
- package/bin/skills/long-context/references/fine_tuning.md +611 -0
- package/bin/skills/long-context/references/rope.md +402 -0
- package/bin/skills/mamba/SKILL.md +260 -0
- package/bin/skills/mamba/references/architecture-details.md +206 -0
- package/bin/skills/mamba/references/benchmarks.md +255 -0
- package/bin/skills/mamba/references/training-guide.md +388 -0
- package/bin/skills/megatron-core/SKILL.md +366 -0
- package/bin/skills/megatron-core/references/benchmarks.md +249 -0
- package/bin/skills/megatron-core/references/parallelism-guide.md +404 -0
- package/bin/skills/megatron-core/references/production-examples.md +473 -0
- package/bin/skills/megatron-core/references/training-recipes.md +547 -0
- package/bin/skills/miles/SKILL.md +315 -0
- package/bin/skills/miles/references/api-reference.md +141 -0
- package/bin/skills/miles/references/troubleshooting.md +352 -0
- package/bin/skills/mlflow/SKILL.md +704 -0
- package/bin/skills/mlflow/references/deployment.md +744 -0
- package/bin/skills/mlflow/references/model-registry.md +770 -0
- package/bin/skills/mlflow/references/tracking.md +680 -0
- package/bin/skills/modal/SKILL.md +341 -0
- package/bin/skills/modal/references/advanced-usage.md +503 -0
- package/bin/skills/modal/references/troubleshooting.md +494 -0
- package/bin/skills/model-merging/SKILL.md +539 -0
- package/bin/skills/model-merging/references/evaluation.md +462 -0
- package/bin/skills/model-merging/references/examples.md +428 -0
- package/bin/skills/model-merging/references/methods.md +352 -0
- package/bin/skills/model-pruning/SKILL.md +495 -0
- package/bin/skills/model-pruning/references/wanda.md +347 -0
- package/bin/skills/moe-training/SKILL.md +526 -0
- package/bin/skills/moe-training/references/architectures.md +432 -0
- package/bin/skills/moe-training/references/inference.md +348 -0
- package/bin/skills/moe-training/references/training.md +425 -0
- package/bin/skills/nanogpt/SKILL.md +290 -0
- package/bin/skills/nanogpt/references/architecture.md +382 -0
- package/bin/skills/nanogpt/references/data.md +476 -0
- package/bin/skills/nanogpt/references/training.md +564 -0
- package/bin/skills/nemo-curator/SKILL.md +383 -0
- package/bin/skills/nemo-curator/references/deduplication.md +87 -0
- package/bin/skills/nemo-curator/references/filtering.md +102 -0
- package/bin/skills/nemo-evaluator/SKILL.md +494 -0
- package/bin/skills/nemo-evaluator/references/adapter-system.md +340 -0
- package/bin/skills/nemo-evaluator/references/configuration.md +447 -0
- package/bin/skills/nemo-evaluator/references/custom-benchmarks.md +315 -0
- package/bin/skills/nemo-evaluator/references/execution-backends.md +361 -0
- package/bin/skills/nemo-guardrails/SKILL.md +297 -0
- package/bin/skills/nnsight/SKILL.md +436 -0
- package/bin/skills/nnsight/references/README.md +78 -0
- package/bin/skills/nnsight/references/api.md +344 -0
- package/bin/skills/nnsight/references/tutorials.md +300 -0
- package/bin/skills/openrlhf/SKILL.md +249 -0
- package/bin/skills/openrlhf/references/algorithm-comparison.md +404 -0
- package/bin/skills/openrlhf/references/custom-rewards.md +530 -0
- package/bin/skills/openrlhf/references/hybrid-engine.md +287 -0
- package/bin/skills/openrlhf/references/multi-node-training.md +454 -0
- package/bin/skills/outlines/SKILL.md +652 -0
- package/bin/skills/outlines/references/backends.md +615 -0
- package/bin/skills/outlines/references/examples.md +773 -0
- package/bin/skills/outlines/references/json_generation.md +652 -0
- package/bin/skills/peft/SKILL.md +431 -0
- package/bin/skills/peft/references/advanced-usage.md +514 -0
- package/bin/skills/peft/references/troubleshooting.md +480 -0
- package/bin/skills/phoenix/SKILL.md +475 -0
- package/bin/skills/phoenix/references/advanced-usage.md +619 -0
- package/bin/skills/phoenix/references/troubleshooting.md +538 -0
- package/bin/skills/pinecone/SKILL.md +358 -0
- package/bin/skills/pinecone/references/deployment.md +181 -0
- package/bin/skills/pytorch-fsdp/SKILL.md +126 -0
- package/bin/skills/pytorch-fsdp/references/index.md +7 -0
- package/bin/skills/pytorch-fsdp/references/other.md +4249 -0
- package/bin/skills/pytorch-lightning/SKILL.md +346 -0
- package/bin/skills/pytorch-lightning/references/callbacks.md +436 -0
- package/bin/skills/pytorch-lightning/references/distributed.md +490 -0
- package/bin/skills/pytorch-lightning/references/hyperparameter-tuning.md +556 -0
- package/bin/skills/pyvene/SKILL.md +473 -0
- package/bin/skills/pyvene/references/README.md +73 -0
- package/bin/skills/pyvene/references/api.md +383 -0
- package/bin/skills/pyvene/references/tutorials.md +376 -0
- package/bin/skills/qdrant/SKILL.md +493 -0
- package/bin/skills/qdrant/references/advanced-usage.md +648 -0
- package/bin/skills/qdrant/references/troubleshooting.md +631 -0
- package/bin/skills/ray-data/SKILL.md +326 -0
- package/bin/skills/ray-data/references/integration.md +82 -0
- package/bin/skills/ray-data/references/transformations.md +83 -0
- package/bin/skills/ray-train/SKILL.md +406 -0
- package/bin/skills/ray-train/references/multi-node.md +628 -0
- package/bin/skills/rwkv/SKILL.md +260 -0
- package/bin/skills/rwkv/references/architecture-details.md +344 -0
- package/bin/skills/rwkv/references/rwkv7.md +386 -0
- package/bin/skills/rwkv/references/state-management.md +369 -0
- package/bin/skills/saelens/SKILL.md +386 -0
- package/bin/skills/saelens/references/README.md +70 -0
- package/bin/skills/saelens/references/api.md +333 -0
- package/bin/skills/saelens/references/tutorials.md +318 -0
- package/bin/skills/segment-anything/SKILL.md +500 -0
- package/bin/skills/segment-anything/references/advanced-usage.md +589 -0
- package/bin/skills/segment-anything/references/troubleshooting.md +484 -0
- package/bin/skills/sentence-transformers/SKILL.md +255 -0
- package/bin/skills/sentence-transformers/references/models.md +123 -0
- package/bin/skills/sentencepiece/SKILL.md +235 -0
- package/bin/skills/sentencepiece/references/algorithms.md +200 -0
- package/bin/skills/sentencepiece/references/training.md +304 -0
- package/bin/skills/sglang/SKILL.md +442 -0
- package/bin/skills/sglang/references/deployment.md +490 -0
- package/bin/skills/sglang/references/radix-attention.md +413 -0
- package/bin/skills/sglang/references/structured-generation.md +541 -0
- package/bin/skills/simpo/SKILL.md +219 -0
- package/bin/skills/simpo/references/datasets.md +478 -0
- package/bin/skills/simpo/references/hyperparameters.md +452 -0
- package/bin/skills/simpo/references/loss-functions.md +350 -0
- package/bin/skills/skypilot/SKILL.md +509 -0
- package/bin/skills/skypilot/references/advanced-usage.md +491 -0
- package/bin/skills/skypilot/references/troubleshooting.md +570 -0
- package/bin/skills/slime/SKILL.md +464 -0
- package/bin/skills/slime/references/api-reference.md +392 -0
- package/bin/skills/slime/references/troubleshooting.md +386 -0
- package/bin/skills/speculative-decoding/SKILL.md +467 -0
- package/bin/skills/speculative-decoding/references/lookahead.md +309 -0
- package/bin/skills/speculative-decoding/references/medusa.md +350 -0
- package/bin/skills/stable-diffusion/SKILL.md +519 -0
- package/bin/skills/stable-diffusion/references/advanced-usage.md +716 -0
- package/bin/skills/stable-diffusion/references/troubleshooting.md +555 -0
- package/bin/skills/tensorboard/SKILL.md +629 -0
- package/bin/skills/tensorboard/references/integrations.md +638 -0
- package/bin/skills/tensorboard/references/profiling.md +545 -0
- package/bin/skills/tensorboard/references/visualization.md +620 -0
- package/bin/skills/tensorrt-llm/SKILL.md +187 -0
- package/bin/skills/tensorrt-llm/references/multi-gpu.md +298 -0
- package/bin/skills/tensorrt-llm/references/optimization.md +242 -0
- package/bin/skills/tensorrt-llm/references/serving.md +470 -0
- package/bin/skills/tinker/SKILL.md +362 -0
- package/bin/skills/tinker/references/api-reference.md +168 -0
- package/bin/skills/tinker/references/getting-started.md +157 -0
- package/bin/skills/tinker/references/loss-functions.md +163 -0
- package/bin/skills/tinker/references/models-and-lora.md +139 -0
- package/bin/skills/tinker/references/recipes.md +280 -0
- package/bin/skills/tinker/references/reinforcement-learning.md +212 -0
- package/bin/skills/tinker/references/rendering.md +243 -0
- package/bin/skills/tinker/references/supervised-learning.md +232 -0
- package/bin/skills/tinker-training-cost/SKILL.md +187 -0
- package/bin/skills/tinker-training-cost/scripts/calculate_cost.py +123 -0
- package/bin/skills/torchforge/SKILL.md +433 -0
- package/bin/skills/torchforge/references/api-reference.md +327 -0
- package/bin/skills/torchforge/references/troubleshooting.md +409 -0
- package/bin/skills/torchtitan/SKILL.md +358 -0
- package/bin/skills/torchtitan/references/checkpoint.md +181 -0
- package/bin/skills/torchtitan/references/custom-models.md +258 -0
- package/bin/skills/torchtitan/references/float8.md +133 -0
- package/bin/skills/torchtitan/references/fsdp.md +126 -0
- package/bin/skills/transformer-lens/SKILL.md +346 -0
- package/bin/skills/transformer-lens/references/README.md +54 -0
- package/bin/skills/transformer-lens/references/api.md +362 -0
- package/bin/skills/transformer-lens/references/tutorials.md +339 -0
- package/bin/skills/trl-fine-tuning/SKILL.md +455 -0
- package/bin/skills/trl-fine-tuning/references/dpo-variants.md +227 -0
- package/bin/skills/trl-fine-tuning/references/online-rl.md +82 -0
- package/bin/skills/trl-fine-tuning/references/reward-modeling.md +122 -0
- package/bin/skills/trl-fine-tuning/references/sft-training.md +168 -0
- package/bin/skills/unsloth/SKILL.md +80 -0
- package/bin/skills/unsloth/references/index.md +7 -0
- package/bin/skills/unsloth/references/llms-full.md +16799 -0
- package/bin/skills/unsloth/references/llms-txt.md +12044 -0
- package/bin/skills/unsloth/references/llms.md +82 -0
- package/bin/skills/verl/SKILL.md +391 -0
- package/bin/skills/verl/references/api-reference.md +301 -0
- package/bin/skills/verl/references/troubleshooting.md +391 -0
- package/bin/skills/vllm/SKILL.md +364 -0
- package/bin/skills/vllm/references/optimization.md +226 -0
- package/bin/skills/vllm/references/quantization.md +284 -0
- package/bin/skills/vllm/references/server-deployment.md +255 -0
- package/bin/skills/vllm/references/troubleshooting.md +447 -0
- package/bin/skills/weights-and-biases/SKILL.md +590 -0
- package/bin/skills/weights-and-biases/references/artifacts.md +584 -0
- package/bin/skills/weights-and-biases/references/integrations.md +700 -0
- package/bin/skills/weights-and-biases/references/sweeps.md +847 -0
- package/bin/skills/whisper/SKILL.md +317 -0
- package/bin/skills/whisper/references/languages.md +189 -0
- package/bin/synsc +0 -0
- package/package.json +10 -0
|
@@ -0,0 +1,383 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: nemo-curator
|
|
3
|
+
description: GPU-accelerated data curation for LLM training. Supports text/image/video/audio. Features fuzzy deduplication (16× faster), quality filtering (30+ heuristics), semantic deduplication, PII redaction, NSFW detection. Scales across GPUs with RAPIDS. Use for preparing high-quality training datasets, cleaning web data, or deduplicating large corpora.
|
|
4
|
+
version: 1.0.0
|
|
5
|
+
author: Synthetic Sciences
|
|
6
|
+
license: MIT
|
|
7
|
+
tags: [Data Processing, NeMo Curator, Data Curation, GPU Acceleration, Deduplication, Quality Filtering, NVIDIA, RAPIDS, PII Redaction, Multimodal, LLM Training Data]
|
|
8
|
+
dependencies: [nemo-curator, cudf, dask, rapids]
|
|
9
|
+
---
|
|
10
|
+
|
|
11
|
+
# NeMo Curator - GPU-Accelerated Data Curation
|
|
12
|
+
|
|
13
|
+
NVIDIA's toolkit for preparing high-quality training data for LLMs.
|
|
14
|
+
|
|
15
|
+
## When to use NeMo Curator
|
|
16
|
+
|
|
17
|
+
**Use NeMo Curator when:**
|
|
18
|
+
- Preparing LLM training data from web scrapes (Common Crawl)
|
|
19
|
+
- Needing fast deduplication (16× faster than CPU)
|
|
20
|
+
- Curating multi-modal datasets (text, images, video, audio)
|
|
21
|
+
- Filtering low-quality or toxic content
|
|
22
|
+
- Scaling data processing across a GPU cluster
|
|
23
|
+
|
|
24
|
+
**Performance**:
|
|
25
|
+
- **16× faster** fuzzy deduplication (8TB RedPajama v2)
|
|
26
|
+
- **40% lower TCO** vs CPU alternatives
|
|
27
|
+
- **Near-linear scaling** across GPU nodes
|
|
28
|
+
|
|
29
|
+
**Use alternatives instead**:
|
|
30
|
+
- **datatrove**: CPU-based, open-source data processing
|
|
31
|
+
- **dolma**: Allen AI's data toolkit
|
|
32
|
+
- **Ray Data**: General ML data processing (no curation focus)
|
|
33
|
+
|
|
34
|
+
## Quick start
|
|
35
|
+
|
|
36
|
+
### Installation
|
|
37
|
+
|
|
38
|
+
```bash
|
|
39
|
+
# Text curation (CUDA 12)
|
|
40
|
+
uv pip install "nemo-curator[text_cuda12]"
|
|
41
|
+
|
|
42
|
+
# All modalities
|
|
43
|
+
uv pip install "nemo-curator[all_cuda12]"
|
|
44
|
+
|
|
45
|
+
# CPU-only (slower)
|
|
46
|
+
uv pip install "nemo-curator[cpu]"
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
### Basic text curation pipeline
|
|
50
|
+
|
|
51
|
+
```python
|
|
52
|
+
from nemo_curator import ScoreFilter, Modify
|
|
53
|
+
from nemo_curator.datasets import DocumentDataset
|
|
54
|
+
import pandas as pd
|
|
55
|
+
|
|
56
|
+
# Load data
|
|
57
|
+
df = pd.DataFrame({"text": ["Good document", "Bad doc", "Excellent text"]})
|
|
58
|
+
dataset = DocumentDataset(df)
|
|
59
|
+
|
|
60
|
+
# Quality filtering
|
|
61
|
+
def quality_score(doc):
|
|
62
|
+
return len(doc["text"].split()) > 5 # Filter short docs
|
|
63
|
+
|
|
64
|
+
filtered = ScoreFilter(quality_score)(dataset)
|
|
65
|
+
|
|
66
|
+
# Deduplication
|
|
67
|
+
from nemo_curator.modules import ExactDuplicates
|
|
68
|
+
deduped = ExactDuplicates()(filtered)
|
|
69
|
+
|
|
70
|
+
# Save
|
|
71
|
+
deduped.to_parquet("curated_data/")
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
## Data curation pipeline
|
|
75
|
+
|
|
76
|
+
### Stage 1: Quality filtering
|
|
77
|
+
|
|
78
|
+
```python
|
|
79
|
+
from nemo_curator.filters import (
|
|
80
|
+
WordCountFilter,
|
|
81
|
+
RepeatedLinesFilter,
|
|
82
|
+
UrlRatioFilter,
|
|
83
|
+
NonAlphaNumericFilter
|
|
84
|
+
)
|
|
85
|
+
|
|
86
|
+
# Apply 30+ heuristic filters
|
|
87
|
+
from nemo_curator import ScoreFilter
|
|
88
|
+
|
|
89
|
+
# Word count filter
|
|
90
|
+
dataset = dataset.filter(WordCountFilter(min_words=50, max_words=100000))
|
|
91
|
+
|
|
92
|
+
# Remove repetitive content
|
|
93
|
+
dataset = dataset.filter(RepeatedLinesFilter(max_repeated_line_fraction=0.3))
|
|
94
|
+
|
|
95
|
+
# URL ratio filter
|
|
96
|
+
dataset = dataset.filter(UrlRatioFilter(max_url_ratio=0.2))
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
### Stage 2: Deduplication
|
|
100
|
+
|
|
101
|
+
**Exact deduplication**:
|
|
102
|
+
```python
|
|
103
|
+
from nemo_curator.modules import ExactDuplicates
|
|
104
|
+
|
|
105
|
+
# Remove exact duplicates
|
|
106
|
+
deduped = ExactDuplicates(id_field="id", text_field="text")(dataset)
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
**Fuzzy deduplication** (16× faster on GPU):
|
|
110
|
+
```python
|
|
111
|
+
from nemo_curator.modules import FuzzyDuplicates
|
|
112
|
+
|
|
113
|
+
# MinHash + LSH deduplication
|
|
114
|
+
fuzzy_dedup = FuzzyDuplicates(
|
|
115
|
+
id_field="id",
|
|
116
|
+
text_field="text",
|
|
117
|
+
num_hashes=260, # MinHash parameters
|
|
118
|
+
num_buckets=20,
|
|
119
|
+
hash_method="md5"
|
|
120
|
+
)
|
|
121
|
+
|
|
122
|
+
deduped = fuzzy_dedup(dataset)
|
|
123
|
+
```
|
|
124
|
+
|
|
125
|
+
**Semantic deduplication**:
|
|
126
|
+
```python
|
|
127
|
+
from nemo_curator.modules import SemanticDuplicates
|
|
128
|
+
|
|
129
|
+
# Embedding-based deduplication
|
|
130
|
+
semantic_dedup = SemanticDuplicates(
|
|
131
|
+
id_field="id",
|
|
132
|
+
text_field="text",
|
|
133
|
+
embedding_model="sentence-transformers/all-MiniLM-L6-v2",
|
|
134
|
+
threshold=0.8 # Cosine similarity threshold
|
|
135
|
+
)
|
|
136
|
+
|
|
137
|
+
deduped = semantic_dedup(dataset)
|
|
138
|
+
```
|
|
139
|
+
|
|
140
|
+
### Stage 3: PII redaction
|
|
141
|
+
|
|
142
|
+
```python
|
|
143
|
+
from nemo_curator.modules import Modify
|
|
144
|
+
from nemo_curator.modifiers import PIIRedactor
|
|
145
|
+
|
|
146
|
+
# Redact personally identifiable information
|
|
147
|
+
pii_redactor = PIIRedactor(
|
|
148
|
+
supported_entities=["EMAIL_ADDRESS", "PHONE_NUMBER", "PERSON", "LOCATION"],
|
|
149
|
+
anonymize_action="replace" # or "redact"
|
|
150
|
+
)
|
|
151
|
+
|
|
152
|
+
redacted = Modify(pii_redactor)(dataset)
|
|
153
|
+
```
|
|
154
|
+
|
|
155
|
+
### Stage 4: Classifier filtering
|
|
156
|
+
|
|
157
|
+
```python
|
|
158
|
+
from nemo_curator.classifiers import QualityClassifier
|
|
159
|
+
|
|
160
|
+
# Quality classification
|
|
161
|
+
quality_clf = QualityClassifier(
|
|
162
|
+
model_path="nvidia/quality-classifier-deberta",
|
|
163
|
+
batch_size=256,
|
|
164
|
+
device="cuda"
|
|
165
|
+
)
|
|
166
|
+
|
|
167
|
+
# Filter low-quality documents
|
|
168
|
+
high_quality = dataset.filter(lambda doc: quality_clf(doc["text"]) > 0.5)
|
|
169
|
+
```
|
|
170
|
+
|
|
171
|
+
## GPU acceleration
|
|
172
|
+
|
|
173
|
+
### GPU vs CPU performance
|
|
174
|
+
|
|
175
|
+
| Operation | CPU (16 cores) | GPU (A100) | Speedup |
|
|
176
|
+
|-----------|----------------|------------|---------|
|
|
177
|
+
| Fuzzy dedup (8TB) | 120 hours | 7.5 hours | 16× |
|
|
178
|
+
| Exact dedup (1TB) | 8 hours | 0.5 hours | 16× |
|
|
179
|
+
| Quality filtering | 2 hours | 0.2 hours | 10× |
|
|
180
|
+
|
|
181
|
+
### Multi-GPU scaling
|
|
182
|
+
|
|
183
|
+
```python
|
|
184
|
+
from nemo_curator import get_client
|
|
185
|
+
import dask_cuda
|
|
186
|
+
|
|
187
|
+
# Initialize GPU cluster
|
|
188
|
+
client = get_client(cluster_type="gpu", n_workers=8)
|
|
189
|
+
|
|
190
|
+
# Process with 8 GPUs
|
|
191
|
+
deduped = FuzzyDuplicates(...)(dataset)
|
|
192
|
+
```
|
|
193
|
+
|
|
194
|
+
## Multi-modal curation
|
|
195
|
+
|
|
196
|
+
### Image curation
|
|
197
|
+
|
|
198
|
+
```python
|
|
199
|
+
from nemo_curator.image import (
|
|
200
|
+
AestheticFilter,
|
|
201
|
+
NSFWFilter,
|
|
202
|
+
CLIPEmbedder
|
|
203
|
+
)
|
|
204
|
+
|
|
205
|
+
# Aesthetic scoring
|
|
206
|
+
aesthetic_filter = AestheticFilter(threshold=5.0)
|
|
207
|
+
filtered_images = aesthetic_filter(image_dataset)
|
|
208
|
+
|
|
209
|
+
# NSFW detection
|
|
210
|
+
nsfw_filter = NSFWFilter(threshold=0.9)
|
|
211
|
+
safe_images = nsfw_filter(filtered_images)
|
|
212
|
+
|
|
213
|
+
# Generate CLIP embeddings
|
|
214
|
+
clip_embedder = CLIPEmbedder(model="openai/clip-vit-base-patch32")
|
|
215
|
+
image_embeddings = clip_embedder(safe_images)
|
|
216
|
+
```
|
|
217
|
+
|
|
218
|
+
### Video curation
|
|
219
|
+
|
|
220
|
+
```python
|
|
221
|
+
from nemo_curator.video import (
|
|
222
|
+
SceneDetector,
|
|
223
|
+
ClipExtractor,
|
|
224
|
+
InternVideo2Embedder
|
|
225
|
+
)
|
|
226
|
+
|
|
227
|
+
# Detect scenes
|
|
228
|
+
scene_detector = SceneDetector(threshold=27.0)
|
|
229
|
+
scenes = scene_detector(video_dataset)
|
|
230
|
+
|
|
231
|
+
# Extract clips
|
|
232
|
+
clip_extractor = ClipExtractor(min_duration=2.0, max_duration=10.0)
|
|
233
|
+
clips = clip_extractor(scenes)
|
|
234
|
+
|
|
235
|
+
# Generate embeddings
|
|
236
|
+
video_embedder = InternVideo2Embedder()
|
|
237
|
+
video_embeddings = video_embedder(clips)
|
|
238
|
+
```
|
|
239
|
+
|
|
240
|
+
### Audio curation
|
|
241
|
+
|
|
242
|
+
```python
|
|
243
|
+
from nemo_curator.audio import (
|
|
244
|
+
ASRInference,
|
|
245
|
+
WERFilter,
|
|
246
|
+
DurationFilter
|
|
247
|
+
)
|
|
248
|
+
|
|
249
|
+
# ASR transcription
|
|
250
|
+
asr = ASRInference(model="nvidia/stt_en_fastconformer_hybrid_large_pc")
|
|
251
|
+
transcribed = asr(audio_dataset)
|
|
252
|
+
|
|
253
|
+
# Filter by WER (word error rate)
|
|
254
|
+
wer_filter = WERFilter(max_wer=0.3)
|
|
255
|
+
high_quality_audio = wer_filter(transcribed)
|
|
256
|
+
|
|
257
|
+
# Duration filtering
|
|
258
|
+
duration_filter = DurationFilter(min_duration=1.0, max_duration=30.0)
|
|
259
|
+
filtered_audio = duration_filter(high_quality_audio)
|
|
260
|
+
```
|
|
261
|
+
|
|
262
|
+
## Common patterns
|
|
263
|
+
|
|
264
|
+
### Web scrape curation (Common Crawl)
|
|
265
|
+
|
|
266
|
+
```python
|
|
267
|
+
from nemo_curator import ScoreFilter, Modify
|
|
268
|
+
from nemo_curator.filters import *
|
|
269
|
+
from nemo_curator.modules import *
|
|
270
|
+
from nemo_curator.datasets import DocumentDataset
|
|
271
|
+
|
|
272
|
+
# Load Common Crawl data
|
|
273
|
+
dataset = DocumentDataset.read_parquet("common_crawl/*.parquet")
|
|
274
|
+
|
|
275
|
+
# Pipeline
|
|
276
|
+
pipeline = [
|
|
277
|
+
# 1. Quality filtering
|
|
278
|
+
WordCountFilter(min_words=100, max_words=50000),
|
|
279
|
+
RepeatedLinesFilter(max_repeated_line_fraction=0.2),
|
|
280
|
+
SymbolToWordRatioFilter(max_symbol_to_word_ratio=0.3),
|
|
281
|
+
UrlRatioFilter(max_url_ratio=0.3),
|
|
282
|
+
|
|
283
|
+
# 2. Language filtering
|
|
284
|
+
LanguageIdentificationFilter(target_languages=["en"]),
|
|
285
|
+
|
|
286
|
+
# 3. Deduplication
|
|
287
|
+
ExactDuplicates(id_field="id", text_field="text"),
|
|
288
|
+
FuzzyDuplicates(id_field="id", text_field="text", num_hashes=260),
|
|
289
|
+
|
|
290
|
+
# 4. PII redaction
|
|
291
|
+
PIIRedactor(),
|
|
292
|
+
|
|
293
|
+
# 5. NSFW filtering
|
|
294
|
+
NSFWClassifier(threshold=0.8)
|
|
295
|
+
]
|
|
296
|
+
|
|
297
|
+
# Execute
|
|
298
|
+
for stage in pipeline:
|
|
299
|
+
    dataset = stage(dataset)
|
|
300
|
+
|
|
301
|
+
# Save
|
|
302
|
+
dataset.to_parquet("curated_common_crawl/")
|
|
303
|
+
```
|
|
304
|
+
|
|
305
|
+
### Distributed processing
|
|
306
|
+
|
|
307
|
+
```python
|
|
308
|
+
from nemo_curator import get_client
|
|
309
|
+
from dask_cuda import LocalCUDACluster
|
|
310
|
+
|
|
311
|
+
# Multi-GPU cluster
|
|
312
|
+
cluster = LocalCUDACluster(n_workers=8)
|
|
313
|
+
client = get_client(cluster=cluster)
|
|
314
|
+
|
|
315
|
+
# Process large dataset
|
|
316
|
+
dataset = DocumentDataset.read_parquet("s3://large_dataset/*.parquet")
|
|
317
|
+
deduped = FuzzyDuplicates(...)(dataset)
|
|
318
|
+
|
|
319
|
+
# Cleanup
|
|
320
|
+
client.close()
|
|
321
|
+
cluster.close()
|
|
322
|
+
```
|
|
323
|
+
|
|
324
|
+
## Performance benchmarks
|
|
325
|
+
|
|
326
|
+
### Fuzzy deduplication (8TB RedPajama v2)
|
|
327
|
+
|
|
328
|
+
- **CPU (256 cores)**: 120 hours
|
|
329
|
+
- **GPU (8× A100)**: 7.5 hours
|
|
330
|
+
- **Speedup**: 16×
|
|
331
|
+
|
|
332
|
+
### Exact deduplication (1TB)
|
|
333
|
+
|
|
334
|
+
- **CPU (64 cores)**: 8 hours
|
|
335
|
+
- **GPU (4× A100)**: 0.5 hours
|
|
336
|
+
- **Speedup**: 16×
|
|
337
|
+
|
|
338
|
+
### Quality filtering (100GB)
|
|
339
|
+
|
|
340
|
+
- **CPU (32 cores)**: 2 hours
|
|
341
|
+
- **GPU (2× A100)**: 0.2 hours
|
|
342
|
+
- **Speedup**: 10×
|
|
343
|
+
|
|
344
|
+
## Cost comparison
|
|
345
|
+
|
|
346
|
+
**CPU-based curation** (AWS c5.18xlarge × 10):
|
|
347
|
+
- Cost: $3.60/hour × 10 = $36/hour
|
|
348
|
+
- Time for 8TB: 120 hours
|
|
349
|
+
- **Total**: $4,320
|
|
350
|
+
|
|
351
|
+
**GPU-based curation** (AWS p4d.24xlarge × 2):
|
|
352
|
+
- Cost: $32.77/hour × 2 = $65.54/hour
|
|
353
|
+
- Time for 8TB: 7.5 hours
|
|
354
|
+
- **Total**: $491.55
|
|
355
|
+
|
|
356
|
+
**Savings**: 89% reduction ($3,828 saved)
|
|
357
|
+
|
|
358
|
+
## Supported data formats
|
|
359
|
+
|
|
360
|
+
- **Input**: Parquet, JSONL, CSV
|
|
361
|
+
- **Output**: Parquet (recommended), JSONL
|
|
362
|
+
- **WebDataset**: TAR archives for multi-modal data
|
|
363
|
+
|
|
364
|
+
## Use cases
|
|
365
|
+
|
|
366
|
+
**Production deployments**:
|
|
367
|
+
- NVIDIA used NeMo Curator to prepare Nemotron-4 training data
|
|
368
|
+
- Open-source datasets curated: RedPajama v2, The Pile
|
|
369
|
+
|
|
370
|
+
## References
|
|
371
|
+
|
|
372
|
+
- **[Filtering Guide](references/filtering.md)** - 30+ quality filters, heuristics
|
|
373
|
+
- **[Deduplication Guide](references/deduplication.md)** - Exact, fuzzy, semantic methods
|
|
374
|
+
|
|
375
|
+
## Resources
|
|
376
|
+
|
|
377
|
+
- **GitHub**: https://github.com/NVIDIA/NeMo-Curator ⭐ 500+
|
|
378
|
+
- **Docs**: https://docs.nvidia.com/nemo-framework/user-guide/latest/datacuration/
|
|
379
|
+
- **Version**: 0.4.0+
|
|
380
|
+
- **License**: Apache 2.0
|
|
381
|
+
|
|
382
|
+
|
|
383
|
+
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
# Deduplication Guide
|
|
2
|
+
|
|
3
|
+
Complete guide to exact, fuzzy, and semantic deduplication.
|
|
4
|
+
|
|
5
|
+
## Exact deduplication
|
|
6
|
+
|
|
7
|
+
Remove documents with identical content.
|
|
8
|
+
|
|
9
|
+
```python
|
|
10
|
+
from nemo_curator.modules import ExactDuplicates
|
|
11
|
+
|
|
12
|
+
# Exact deduplication
|
|
13
|
+
exact_dedup = ExactDuplicates(
|
|
14
|
+
id_field="id",
|
|
15
|
+
text_field="text",
|
|
16
|
+
hash_method="md5" # or "sha256"
|
|
17
|
+
)
|
|
18
|
+
|
|
19
|
+
deduped = exact_dedup(dataset)
|
|
20
|
+
```
|
|
21
|
+
|
|
22
|
+
**Performance**: ~16× faster on GPU vs CPU
|
|
23
|
+
|
|
24
|
+
## Fuzzy deduplication
|
|
25
|
+
|
|
26
|
+
Remove near-duplicate documents using MinHash + LSH.
|
|
27
|
+
|
|
28
|
+
```python
|
|
29
|
+
from nemo_curator.modules import FuzzyDuplicates
|
|
30
|
+
|
|
31
|
+
fuzzy_dedup = FuzzyDuplicates(
|
|
32
|
+
id_field="id",
|
|
33
|
+
text_field="text",
|
|
34
|
+
num_hashes=260, # MinHash permutations (more = more accurate)
|
|
35
|
+
num_buckets=20, # LSH buckets/bands (for a fixed num_hashes: more = higher recall, but more candidate pairs to compare)
|
|
36
|
+
hash_method="md5",
|
|
37
|
+
jaccard_threshold=0.8 # Similarity threshold
|
|
38
|
+
)
|
|
39
|
+
|
|
40
|
+
deduped = fuzzy_dedup(dataset)
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
**Parameters**:
|
|
44
|
+
- `num_hashes`: 128-512 (default 260)
|
|
45
|
+
- `num_buckets`: 10-50 (default 20)
|
|
46
|
+
- `jaccard_threshold`: 0.7-0.9 (default 0.8)
|
|
47
|
+
|
|
48
|
+
**Performance**: 16× faster on GPU than on CPU for an 8TB dataset (120h → 7.5h)
|
|
49
|
+
|
|
50
|
+
## Semantic deduplication
|
|
51
|
+
|
|
52
|
+
Remove semantically similar documents using embeddings.
|
|
53
|
+
|
|
54
|
+
```python
|
|
55
|
+
from nemo_curator.modules import SemanticDuplicates
|
|
56
|
+
|
|
57
|
+
semantic_dedup = SemanticDuplicates(
|
|
58
|
+
id_field="id",
|
|
59
|
+
text_field="text",
|
|
60
|
+
embedding_model="sentence-transformers/all-MiniLM-L6-v2",
|
|
61
|
+
embedding_batch_size=256,
|
|
62
|
+
threshold=0.85, # Cosine similarity threshold
|
|
63
|
+
device="cuda"
|
|
64
|
+
)
|
|
65
|
+
|
|
66
|
+
deduped = semantic_dedup(dataset)
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
**Models**:
|
|
70
|
+
- `all-MiniLM-L6-v2`: Fast, 384 dims
|
|
71
|
+
- `all-mpnet-base-v2`: Better quality, 768 dims
|
|
72
|
+
- Custom models supported
|
|
73
|
+
|
|
74
|
+
## Comparison
|
|
75
|
+
|
|
76
|
+
| Method | Speed | Recall | Use Case |
|
|
77
|
+
|--------|-------|--------|----------|
|
|
78
|
+
| Exact | Fastest | 100% | Exact matches only |
|
|
79
|
+
| Fuzzy | Fast | ~95% | Near-duplicates (recommended) |
|
|
80
|
+
| Semantic | Slow | ~90% | Paraphrases, rewrites |
|
|
81
|
+
|
|
82
|
+
## Best practices
|
|
83
|
+
|
|
84
|
+
1. **Start with exact dedup** - Remove obvious duplicates
|
|
85
|
+
2. **Use fuzzy for large datasets** - Best speed/quality trade-off
|
|
86
|
+
3. **Semantic for high-value data** - Expensive but thorough
|
|
87
|
+
4. **GPU acceleration required** - 10-16× speedup
|
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
# Quality Filtering Guide
|
|
2
|
+
|
|
3
|
+
Complete guide to NeMo Curator's 30+ quality filters.
|
|
4
|
+
|
|
5
|
+
## Text-based filters
|
|
6
|
+
|
|
7
|
+
### Word count
|
|
8
|
+
|
|
9
|
+
```python
|
|
10
|
+
from nemo_curator.filters import WordCountFilter
|
|
11
|
+
|
|
12
|
+
# Filter by word count
|
|
13
|
+
dataset = dataset.filter(WordCountFilter(min_words=50, max_words=100000))
|
|
14
|
+
```
|
|
15
|
+
|
|
16
|
+
### Repeated content
|
|
17
|
+
|
|
18
|
+
```python
|
|
19
|
+
from nemo_curator.filters import RepeatedLinesFilter
|
|
20
|
+
|
|
21
|
+
# Remove documents with >30% repeated lines
|
|
22
|
+
dataset = dataset.filter(RepeatedLinesFilter(max_repeated_line_fraction=0.3))
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
### Symbol ratio
|
|
26
|
+
|
|
27
|
+
```python
|
|
28
|
+
from nemo_curator.filters import SymbolToWordRatioFilter
|
|
29
|
+
|
|
30
|
+
# Remove documents with too many symbols
|
|
31
|
+
dataset = dataset.filter(SymbolToWordRatioFilter(max_symbol_to_word_ratio=0.3))
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
### URL ratio
|
|
35
|
+
|
|
36
|
+
```python
|
|
37
|
+
from nemo_curator.filters import UrlRatioFilter
|
|
38
|
+
|
|
39
|
+
# Remove documents with many URLs
|
|
40
|
+
dataset = dataset.filter(UrlRatioFilter(max_url_ratio=0.2))
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
## Language filtering
|
|
44
|
+
|
|
45
|
+
```python
|
|
46
|
+
from nemo_curator.filters import LanguageIdentificationFilter
|
|
47
|
+
|
|
48
|
+
# Keep only English documents
|
|
49
|
+
dataset = dataset.filter(LanguageIdentificationFilter(target_languages=["en"]))
|
|
50
|
+
|
|
51
|
+
# Multiple languages
|
|
52
|
+
dataset = dataset.filter(LanguageIdentificationFilter(target_languages=["en", "es", "fr"]))
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
## Classifier-based filtering
|
|
56
|
+
|
|
57
|
+
### Quality classifier
|
|
58
|
+
|
|
59
|
+
```python
|
|
60
|
+
from nemo_curator.classifiers import QualityClassifier
|
|
61
|
+
|
|
62
|
+
quality_clf = QualityClassifier(
|
|
63
|
+
model_path="nvidia/quality-classifier-deberta",
|
|
64
|
+
batch_size=256,
|
|
65
|
+
device="cuda"
|
|
66
|
+
)
|
|
67
|
+
|
|
68
|
+
# Filter out low-quality documents (a score above 0.5 counts as high quality)
|
|
69
|
+
dataset = dataset.filter(lambda doc: quality_clf(doc["text"]) > 0.5)
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
### NSFW classifier
|
|
73
|
+
|
|
74
|
+
```python
|
|
75
|
+
from nemo_curator.classifiers import NSFWClassifier
|
|
76
|
+
|
|
77
|
+
nsfw_clf = NSFWClassifier(threshold=0.9, device="cuda")
|
|
78
|
+
|
|
79
|
+
# Remove NSFW content
|
|
80
|
+
dataset = dataset.filter(lambda doc: nsfw_clf(doc["text"]) < 0.9)
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
## Heuristic filters
|
|
84
|
+
|
|
85
|
+
Partial list of the 30+ filters:
|
|
86
|
+
- WordCountFilter
|
|
87
|
+
- RepeatedLinesFilter
|
|
88
|
+
- UrlRatioFilter
|
|
89
|
+
- SymbolToWordRatioFilter
|
|
90
|
+
- NonAlphaNumericFilter
|
|
91
|
+
- BulletsFilter
|
|
92
|
+
- WhiteSpaceFilter
|
|
93
|
+
- ParenthesesFilter
|
|
94
|
+
- LongWordFilter
|
|
95
|
+
- And 20+ more...
|
|
96
|
+
|
|
97
|
+
## Best practices
|
|
98
|
+
|
|
99
|
+
1. **Apply cheap filters first** - Word count before GPU classifiers
|
|
100
|
+
2. **Tune thresholds on sample** - Test on 10k docs before full run
|
|
101
|
+
3. **Use GPU classifiers sparingly** - Expensive but effective
|
|
102
|
+
4. **Chain filters efficiently** - Order by cost (cheap → expensive)
|