@synsci/cli-darwin-x64 1.1.49
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/skills/accelerate/SKILL.md +332 -0
- package/bin/skills/accelerate/references/custom-plugins.md +453 -0
- package/bin/skills/accelerate/references/megatron-integration.md +489 -0
- package/bin/skills/accelerate/references/performance.md +525 -0
- package/bin/skills/audiocraft/SKILL.md +564 -0
- package/bin/skills/audiocraft/references/advanced-usage.md +666 -0
- package/bin/skills/audiocraft/references/troubleshooting.md +504 -0
- package/bin/skills/autogpt/SKILL.md +403 -0
- package/bin/skills/autogpt/references/advanced-usage.md +535 -0
- package/bin/skills/autogpt/references/troubleshooting.md +420 -0
- package/bin/skills/awq/SKILL.md +310 -0
- package/bin/skills/awq/references/advanced-usage.md +324 -0
- package/bin/skills/awq/references/troubleshooting.md +344 -0
- package/bin/skills/axolotl/SKILL.md +158 -0
- package/bin/skills/axolotl/references/api.md +5548 -0
- package/bin/skills/axolotl/references/dataset-formats.md +1029 -0
- package/bin/skills/axolotl/references/index.md +15 -0
- package/bin/skills/axolotl/references/other.md +3563 -0
- package/bin/skills/bigcode-evaluation-harness/SKILL.md +405 -0
- package/bin/skills/bigcode-evaluation-harness/references/benchmarks.md +393 -0
- package/bin/skills/bigcode-evaluation-harness/references/custom-tasks.md +424 -0
- package/bin/skills/bigcode-evaluation-harness/references/issues.md +394 -0
- package/bin/skills/bitsandbytes/SKILL.md +411 -0
- package/bin/skills/bitsandbytes/references/memory-optimization.md +521 -0
- package/bin/skills/bitsandbytes/references/qlora-training.md +521 -0
- package/bin/skills/bitsandbytes/references/quantization-formats.md +447 -0
- package/bin/skills/blip-2/SKILL.md +564 -0
- package/bin/skills/blip-2/references/advanced-usage.md +680 -0
- package/bin/skills/blip-2/references/troubleshooting.md +526 -0
- package/bin/skills/chroma/SKILL.md +406 -0
- package/bin/skills/chroma/references/integration.md +38 -0
- package/bin/skills/clip/SKILL.md +253 -0
- package/bin/skills/clip/references/applications.md +207 -0
- package/bin/skills/constitutional-ai/SKILL.md +290 -0
- package/bin/skills/crewai/SKILL.md +498 -0
- package/bin/skills/crewai/references/flows.md +438 -0
- package/bin/skills/crewai/references/tools.md +429 -0
- package/bin/skills/crewai/references/troubleshooting.md +480 -0
- package/bin/skills/deepspeed/SKILL.md +141 -0
- package/bin/skills/deepspeed/references/08.md +17 -0
- package/bin/skills/deepspeed/references/09.md +173 -0
- package/bin/skills/deepspeed/references/2020.md +378 -0
- package/bin/skills/deepspeed/references/2023.md +279 -0
- package/bin/skills/deepspeed/references/assets.md +179 -0
- package/bin/skills/deepspeed/references/index.md +35 -0
- package/bin/skills/deepspeed/references/mii.md +118 -0
- package/bin/skills/deepspeed/references/other.md +1191 -0
- package/bin/skills/deepspeed/references/tutorials.md +6554 -0
- package/bin/skills/dspy/SKILL.md +590 -0
- package/bin/skills/dspy/references/examples.md +663 -0
- package/bin/skills/dspy/references/modules.md +475 -0
- package/bin/skills/dspy/references/optimizers.md +566 -0
- package/bin/skills/faiss/SKILL.md +221 -0
- package/bin/skills/faiss/references/index_types.md +280 -0
- package/bin/skills/flash-attention/SKILL.md +367 -0
- package/bin/skills/flash-attention/references/benchmarks.md +215 -0
- package/bin/skills/flash-attention/references/transformers-integration.md +293 -0
- package/bin/skills/gguf/SKILL.md +427 -0
- package/bin/skills/gguf/references/advanced-usage.md +504 -0
- package/bin/skills/gguf/references/troubleshooting.md +442 -0
- package/bin/skills/gptq/SKILL.md +450 -0
- package/bin/skills/gptq/references/calibration.md +337 -0
- package/bin/skills/gptq/references/integration.md +129 -0
- package/bin/skills/gptq/references/troubleshooting.md +95 -0
- package/bin/skills/grpo-rl-training/README.md +97 -0
- package/bin/skills/grpo-rl-training/SKILL.md +572 -0
- package/bin/skills/grpo-rl-training/examples/reward_functions_library.py +393 -0
- package/bin/skills/grpo-rl-training/templates/basic_grpo_training.py +228 -0
- package/bin/skills/guidance/SKILL.md +572 -0
- package/bin/skills/guidance/references/backends.md +554 -0
- package/bin/skills/guidance/references/constraints.md +674 -0
- package/bin/skills/guidance/references/examples.md +767 -0
- package/bin/skills/hqq/SKILL.md +445 -0
- package/bin/skills/hqq/references/advanced-usage.md +528 -0
- package/bin/skills/hqq/references/troubleshooting.md +503 -0
- package/bin/skills/hugging-face-cli/SKILL.md +191 -0
- package/bin/skills/hugging-face-cli/references/commands.md +954 -0
- package/bin/skills/hugging-face-cli/references/examples.md +374 -0
- package/bin/skills/hugging-face-datasets/SKILL.md +547 -0
- package/bin/skills/hugging-face-datasets/examples/diverse_training_examples.json +239 -0
- package/bin/skills/hugging-face-datasets/examples/system_prompt_template.txt +196 -0
- package/bin/skills/hugging-face-datasets/examples/training_examples.json +176 -0
- package/bin/skills/hugging-face-datasets/scripts/dataset_manager.py +522 -0
- package/bin/skills/hugging-face-datasets/scripts/sql_manager.py +844 -0
- package/bin/skills/hugging-face-datasets/templates/chat.json +55 -0
- package/bin/skills/hugging-face-datasets/templates/classification.json +62 -0
- package/bin/skills/hugging-face-datasets/templates/completion.json +51 -0
- package/bin/skills/hugging-face-datasets/templates/custom.json +75 -0
- package/bin/skills/hugging-face-datasets/templates/qa.json +54 -0
- package/bin/skills/hugging-face-datasets/templates/tabular.json +81 -0
- package/bin/skills/hugging-face-evaluation/SKILL.md +656 -0
- package/bin/skills/hugging-face-evaluation/examples/USAGE_EXAMPLES.md +382 -0
- package/bin/skills/hugging-face-evaluation/examples/artificial_analysis_to_hub.py +141 -0
- package/bin/skills/hugging-face-evaluation/examples/example_readme_tables.md +135 -0
- package/bin/skills/hugging-face-evaluation/examples/metric_mapping.json +50 -0
- package/bin/skills/hugging-face-evaluation/requirements.txt +20 -0
- package/bin/skills/hugging-face-evaluation/scripts/evaluation_manager.py +1374 -0
- package/bin/skills/hugging-face-evaluation/scripts/inspect_eval_uv.py +104 -0
- package/bin/skills/hugging-face-evaluation/scripts/inspect_vllm_uv.py +317 -0
- package/bin/skills/hugging-face-evaluation/scripts/lighteval_vllm_uv.py +303 -0
- package/bin/skills/hugging-face-evaluation/scripts/run_eval_job.py +98 -0
- package/bin/skills/hugging-face-evaluation/scripts/run_vllm_eval_job.py +331 -0
- package/bin/skills/hugging-face-evaluation/scripts/test_extraction.py +206 -0
- package/bin/skills/hugging-face-jobs/SKILL.md +1041 -0
- package/bin/skills/hugging-face-jobs/index.html +216 -0
- package/bin/skills/hugging-face-jobs/references/hardware_guide.md +336 -0
- package/bin/skills/hugging-face-jobs/references/hub_saving.md +352 -0
- package/bin/skills/hugging-face-jobs/references/token_usage.md +546 -0
- package/bin/skills/hugging-face-jobs/references/troubleshooting.md +475 -0
- package/bin/skills/hugging-face-jobs/scripts/cot-self-instruct.py +718 -0
- package/bin/skills/hugging-face-jobs/scripts/finepdfs-stats.py +546 -0
- package/bin/skills/hugging-face-jobs/scripts/generate-responses.py +587 -0
- package/bin/skills/hugging-face-model-trainer/SKILL.md +711 -0
- package/bin/skills/hugging-face-model-trainer/references/gguf_conversion.md +296 -0
- package/bin/skills/hugging-face-model-trainer/references/hardware_guide.md +283 -0
- package/bin/skills/hugging-face-model-trainer/references/hub_saving.md +364 -0
- package/bin/skills/hugging-face-model-trainer/references/reliability_principles.md +371 -0
- package/bin/skills/hugging-face-model-trainer/references/trackio_guide.md +189 -0
- package/bin/skills/hugging-face-model-trainer/references/training_methods.md +150 -0
- package/bin/skills/hugging-face-model-trainer/references/training_patterns.md +203 -0
- package/bin/skills/hugging-face-model-trainer/references/troubleshooting.md +282 -0
- package/bin/skills/hugging-face-model-trainer/scripts/convert_to_gguf.py +424 -0
- package/bin/skills/hugging-face-model-trainer/scripts/dataset_inspector.py +417 -0
- package/bin/skills/hugging-face-model-trainer/scripts/estimate_cost.py +150 -0
- package/bin/skills/hugging-face-model-trainer/scripts/train_dpo_example.py +106 -0
- package/bin/skills/hugging-face-model-trainer/scripts/train_grpo_example.py +89 -0
- package/bin/skills/hugging-face-model-trainer/scripts/train_sft_example.py +122 -0
- package/bin/skills/hugging-face-paper-publisher/SKILL.md +627 -0
- package/bin/skills/hugging-face-paper-publisher/examples/example_usage.md +327 -0
- package/bin/skills/hugging-face-paper-publisher/references/quick_reference.md +216 -0
- package/bin/skills/hugging-face-paper-publisher/scripts/paper_manager.py +508 -0
- package/bin/skills/hugging-face-paper-publisher/templates/arxiv.md +299 -0
- package/bin/skills/hugging-face-paper-publisher/templates/ml-report.md +358 -0
- package/bin/skills/hugging-face-paper-publisher/templates/modern.md +319 -0
- package/bin/skills/hugging-face-paper-publisher/templates/standard.md +201 -0
- package/bin/skills/hugging-face-tool-builder/SKILL.md +115 -0
- package/bin/skills/hugging-face-tool-builder/references/baseline_hf_api.py +57 -0
- package/bin/skills/hugging-face-tool-builder/references/baseline_hf_api.sh +40 -0
- package/bin/skills/hugging-face-tool-builder/references/baseline_hf_api.tsx +57 -0
- package/bin/skills/hugging-face-tool-builder/references/find_models_by_paper.sh +230 -0
- package/bin/skills/hugging-face-tool-builder/references/hf_enrich_models.sh +96 -0
- package/bin/skills/hugging-face-tool-builder/references/hf_model_card_frontmatter.sh +188 -0
- package/bin/skills/hugging-face-tool-builder/references/hf_model_papers_auth.sh +171 -0
- package/bin/skills/hugging-face-trackio/SKILL.md +65 -0
- package/bin/skills/hugging-face-trackio/references/logging_metrics.md +206 -0
- package/bin/skills/hugging-face-trackio/references/retrieving_metrics.md +223 -0
- package/bin/skills/huggingface-tokenizers/SKILL.md +516 -0
- package/bin/skills/huggingface-tokenizers/references/algorithms.md +653 -0
- package/bin/skills/huggingface-tokenizers/references/integration.md +637 -0
- package/bin/skills/huggingface-tokenizers/references/pipeline.md +723 -0
- package/bin/skills/huggingface-tokenizers/references/training.md +565 -0
- package/bin/skills/instructor/SKILL.md +740 -0
- package/bin/skills/instructor/references/examples.md +107 -0
- package/bin/skills/instructor/references/providers.md +70 -0
- package/bin/skills/instructor/references/validation.md +606 -0
- package/bin/skills/knowledge-distillation/SKILL.md +458 -0
- package/bin/skills/knowledge-distillation/references/minillm.md +334 -0
- package/bin/skills/lambda-labs/SKILL.md +545 -0
- package/bin/skills/lambda-labs/references/advanced-usage.md +611 -0
- package/bin/skills/lambda-labs/references/troubleshooting.md +530 -0
- package/bin/skills/langchain/SKILL.md +480 -0
- package/bin/skills/langchain/references/agents.md +499 -0
- package/bin/skills/langchain/references/integration.md +562 -0
- package/bin/skills/langchain/references/rag.md +600 -0
- package/bin/skills/langsmith/SKILL.md +422 -0
- package/bin/skills/langsmith/references/advanced-usage.md +548 -0
- package/bin/skills/langsmith/references/troubleshooting.md +537 -0
- package/bin/skills/litgpt/SKILL.md +469 -0
- package/bin/skills/litgpt/references/custom-models.md +568 -0
- package/bin/skills/litgpt/references/distributed-training.md +451 -0
- package/bin/skills/litgpt/references/supported-models.md +336 -0
- package/bin/skills/litgpt/references/training-recipes.md +619 -0
- package/bin/skills/llama-cpp/SKILL.md +258 -0
- package/bin/skills/llama-cpp/references/optimization.md +89 -0
- package/bin/skills/llama-cpp/references/quantization.md +213 -0
- package/bin/skills/llama-cpp/references/server.md +125 -0
- package/bin/skills/llama-factory/SKILL.md +80 -0
- package/bin/skills/llama-factory/references/_images.md +23 -0
- package/bin/skills/llama-factory/references/advanced.md +1055 -0
- package/bin/skills/llama-factory/references/getting_started.md +349 -0
- package/bin/skills/llama-factory/references/index.md +19 -0
- package/bin/skills/llama-factory/references/other.md +31 -0
- package/bin/skills/llamaguard/SKILL.md +337 -0
- package/bin/skills/llamaindex/SKILL.md +569 -0
- package/bin/skills/llamaindex/references/agents.md +83 -0
- package/bin/skills/llamaindex/references/data_connectors.md +108 -0
- package/bin/skills/llamaindex/references/query_engines.md +406 -0
- package/bin/skills/llava/SKILL.md +304 -0
- package/bin/skills/llava/references/training.md +197 -0
- package/bin/skills/lm-evaluation-harness/SKILL.md +490 -0
- package/bin/skills/lm-evaluation-harness/references/api-evaluation.md +490 -0
- package/bin/skills/lm-evaluation-harness/references/benchmark-guide.md +488 -0
- package/bin/skills/lm-evaluation-harness/references/custom-tasks.md +602 -0
- package/bin/skills/lm-evaluation-harness/references/distributed-eval.md +519 -0
- package/bin/skills/long-context/SKILL.md +536 -0
- package/bin/skills/long-context/references/extension_methods.md +468 -0
- package/bin/skills/long-context/references/fine_tuning.md +611 -0
- package/bin/skills/long-context/references/rope.md +402 -0
- package/bin/skills/mamba/SKILL.md +260 -0
- package/bin/skills/mamba/references/architecture-details.md +206 -0
- package/bin/skills/mamba/references/benchmarks.md +255 -0
- package/bin/skills/mamba/references/training-guide.md +388 -0
- package/bin/skills/megatron-core/SKILL.md +366 -0
- package/bin/skills/megatron-core/references/benchmarks.md +249 -0
- package/bin/skills/megatron-core/references/parallelism-guide.md +404 -0
- package/bin/skills/megatron-core/references/production-examples.md +473 -0
- package/bin/skills/megatron-core/references/training-recipes.md +547 -0
- package/bin/skills/miles/SKILL.md +315 -0
- package/bin/skills/miles/references/api-reference.md +141 -0
- package/bin/skills/miles/references/troubleshooting.md +352 -0
- package/bin/skills/mlflow/SKILL.md +704 -0
- package/bin/skills/mlflow/references/deployment.md +744 -0
- package/bin/skills/mlflow/references/model-registry.md +770 -0
- package/bin/skills/mlflow/references/tracking.md +680 -0
- package/bin/skills/modal/SKILL.md +341 -0
- package/bin/skills/modal/references/advanced-usage.md +503 -0
- package/bin/skills/modal/references/troubleshooting.md +494 -0
- package/bin/skills/model-merging/SKILL.md +539 -0
- package/bin/skills/model-merging/references/evaluation.md +462 -0
- package/bin/skills/model-merging/references/examples.md +428 -0
- package/bin/skills/model-merging/references/methods.md +352 -0
- package/bin/skills/model-pruning/SKILL.md +495 -0
- package/bin/skills/model-pruning/references/wanda.md +347 -0
- package/bin/skills/moe-training/SKILL.md +526 -0
- package/bin/skills/moe-training/references/architectures.md +432 -0
- package/bin/skills/moe-training/references/inference.md +348 -0
- package/bin/skills/moe-training/references/training.md +425 -0
- package/bin/skills/nanogpt/SKILL.md +290 -0
- package/bin/skills/nanogpt/references/architecture.md +382 -0
- package/bin/skills/nanogpt/references/data.md +476 -0
- package/bin/skills/nanogpt/references/training.md +564 -0
- package/bin/skills/nemo-curator/SKILL.md +383 -0
- package/bin/skills/nemo-curator/references/deduplication.md +87 -0
- package/bin/skills/nemo-curator/references/filtering.md +102 -0
- package/bin/skills/nemo-evaluator/SKILL.md +494 -0
- package/bin/skills/nemo-evaluator/references/adapter-system.md +340 -0
- package/bin/skills/nemo-evaluator/references/configuration.md +447 -0
- package/bin/skills/nemo-evaluator/references/custom-benchmarks.md +315 -0
- package/bin/skills/nemo-evaluator/references/execution-backends.md +361 -0
- package/bin/skills/nemo-guardrails/SKILL.md +297 -0
- package/bin/skills/nnsight/SKILL.md +436 -0
- package/bin/skills/nnsight/references/README.md +78 -0
- package/bin/skills/nnsight/references/api.md +344 -0
- package/bin/skills/nnsight/references/tutorials.md +300 -0
- package/bin/skills/openrlhf/SKILL.md +249 -0
- package/bin/skills/openrlhf/references/algorithm-comparison.md +404 -0
- package/bin/skills/openrlhf/references/custom-rewards.md +530 -0
- package/bin/skills/openrlhf/references/hybrid-engine.md +287 -0
- package/bin/skills/openrlhf/references/multi-node-training.md +454 -0
- package/bin/skills/outlines/SKILL.md +652 -0
- package/bin/skills/outlines/references/backends.md +615 -0
- package/bin/skills/outlines/references/examples.md +773 -0
- package/bin/skills/outlines/references/json_generation.md +652 -0
- package/bin/skills/peft/SKILL.md +431 -0
- package/bin/skills/peft/references/advanced-usage.md +514 -0
- package/bin/skills/peft/references/troubleshooting.md +480 -0
- package/bin/skills/phoenix/SKILL.md +475 -0
- package/bin/skills/phoenix/references/advanced-usage.md +619 -0
- package/bin/skills/phoenix/references/troubleshooting.md +538 -0
- package/bin/skills/pinecone/SKILL.md +358 -0
- package/bin/skills/pinecone/references/deployment.md +181 -0
- package/bin/skills/pytorch-fsdp/SKILL.md +126 -0
- package/bin/skills/pytorch-fsdp/references/index.md +7 -0
- package/bin/skills/pytorch-fsdp/references/other.md +4249 -0
- package/bin/skills/pytorch-lightning/SKILL.md +346 -0
- package/bin/skills/pytorch-lightning/references/callbacks.md +436 -0
- package/bin/skills/pytorch-lightning/references/distributed.md +490 -0
- package/bin/skills/pytorch-lightning/references/hyperparameter-tuning.md +556 -0
- package/bin/skills/pyvene/SKILL.md +473 -0
- package/bin/skills/pyvene/references/README.md +73 -0
- package/bin/skills/pyvene/references/api.md +383 -0
- package/bin/skills/pyvene/references/tutorials.md +376 -0
- package/bin/skills/qdrant/SKILL.md +493 -0
- package/bin/skills/qdrant/references/advanced-usage.md +648 -0
- package/bin/skills/qdrant/references/troubleshooting.md +631 -0
- package/bin/skills/ray-data/SKILL.md +326 -0
- package/bin/skills/ray-data/references/integration.md +82 -0
- package/bin/skills/ray-data/references/transformations.md +83 -0
- package/bin/skills/ray-train/SKILL.md +406 -0
- package/bin/skills/ray-train/references/multi-node.md +628 -0
- package/bin/skills/rwkv/SKILL.md +260 -0
- package/bin/skills/rwkv/references/architecture-details.md +344 -0
- package/bin/skills/rwkv/references/rwkv7.md +386 -0
- package/bin/skills/rwkv/references/state-management.md +369 -0
- package/bin/skills/saelens/SKILL.md +386 -0
- package/bin/skills/saelens/references/README.md +70 -0
- package/bin/skills/saelens/references/api.md +333 -0
- package/bin/skills/saelens/references/tutorials.md +318 -0
- package/bin/skills/segment-anything/SKILL.md +500 -0
- package/bin/skills/segment-anything/references/advanced-usage.md +589 -0
- package/bin/skills/segment-anything/references/troubleshooting.md +484 -0
- package/bin/skills/sentence-transformers/SKILL.md +255 -0
- package/bin/skills/sentence-transformers/references/models.md +123 -0
- package/bin/skills/sentencepiece/SKILL.md +235 -0
- package/bin/skills/sentencepiece/references/algorithms.md +200 -0
- package/bin/skills/sentencepiece/references/training.md +304 -0
- package/bin/skills/sglang/SKILL.md +442 -0
- package/bin/skills/sglang/references/deployment.md +490 -0
- package/bin/skills/sglang/references/radix-attention.md +413 -0
- package/bin/skills/sglang/references/structured-generation.md +541 -0
- package/bin/skills/simpo/SKILL.md +219 -0
- package/bin/skills/simpo/references/datasets.md +478 -0
- package/bin/skills/simpo/references/hyperparameters.md +452 -0
- package/bin/skills/simpo/references/loss-functions.md +350 -0
- package/bin/skills/skypilot/SKILL.md +509 -0
- package/bin/skills/skypilot/references/advanced-usage.md +491 -0
- package/bin/skills/skypilot/references/troubleshooting.md +570 -0
- package/bin/skills/slime/SKILL.md +464 -0
- package/bin/skills/slime/references/api-reference.md +392 -0
- package/bin/skills/slime/references/troubleshooting.md +386 -0
- package/bin/skills/speculative-decoding/SKILL.md +467 -0
- package/bin/skills/speculative-decoding/references/lookahead.md +309 -0
- package/bin/skills/speculative-decoding/references/medusa.md +350 -0
- package/bin/skills/stable-diffusion/SKILL.md +519 -0
- package/bin/skills/stable-diffusion/references/advanced-usage.md +716 -0
- package/bin/skills/stable-diffusion/references/troubleshooting.md +555 -0
- package/bin/skills/tensorboard/SKILL.md +629 -0
- package/bin/skills/tensorboard/references/integrations.md +638 -0
- package/bin/skills/tensorboard/references/profiling.md +545 -0
- package/bin/skills/tensorboard/references/visualization.md +620 -0
- package/bin/skills/tensorrt-llm/SKILL.md +187 -0
- package/bin/skills/tensorrt-llm/references/multi-gpu.md +298 -0
- package/bin/skills/tensorrt-llm/references/optimization.md +242 -0
- package/bin/skills/tensorrt-llm/references/serving.md +470 -0
- package/bin/skills/tinker/SKILL.md +362 -0
- package/bin/skills/tinker/references/api-reference.md +168 -0
- package/bin/skills/tinker/references/getting-started.md +157 -0
- package/bin/skills/tinker/references/loss-functions.md +163 -0
- package/bin/skills/tinker/references/models-and-lora.md +139 -0
- package/bin/skills/tinker/references/recipes.md +280 -0
- package/bin/skills/tinker/references/reinforcement-learning.md +212 -0
- package/bin/skills/tinker/references/rendering.md +243 -0
- package/bin/skills/tinker/references/supervised-learning.md +232 -0
- package/bin/skills/tinker-training-cost/SKILL.md +187 -0
- package/bin/skills/tinker-training-cost/scripts/calculate_cost.py +123 -0
- package/bin/skills/torchforge/SKILL.md +433 -0
- package/bin/skills/torchforge/references/api-reference.md +327 -0
- package/bin/skills/torchforge/references/troubleshooting.md +409 -0
- package/bin/skills/torchtitan/SKILL.md +358 -0
- package/bin/skills/torchtitan/references/checkpoint.md +181 -0
- package/bin/skills/torchtitan/references/custom-models.md +258 -0
- package/bin/skills/torchtitan/references/float8.md +133 -0
- package/bin/skills/torchtitan/references/fsdp.md +126 -0
- package/bin/skills/transformer-lens/SKILL.md +346 -0
- package/bin/skills/transformer-lens/references/README.md +54 -0
- package/bin/skills/transformer-lens/references/api.md +362 -0
- package/bin/skills/transformer-lens/references/tutorials.md +339 -0
- package/bin/skills/trl-fine-tuning/SKILL.md +455 -0
- package/bin/skills/trl-fine-tuning/references/dpo-variants.md +227 -0
- package/bin/skills/trl-fine-tuning/references/online-rl.md +82 -0
- package/bin/skills/trl-fine-tuning/references/reward-modeling.md +122 -0
- package/bin/skills/trl-fine-tuning/references/sft-training.md +168 -0
- package/bin/skills/unsloth/SKILL.md +80 -0
- package/bin/skills/unsloth/references/index.md +7 -0
- package/bin/skills/unsloth/references/llms-full.md +16799 -0
- package/bin/skills/unsloth/references/llms-txt.md +12044 -0
- package/bin/skills/unsloth/references/llms.md +82 -0
- package/bin/skills/verl/SKILL.md +391 -0
- package/bin/skills/verl/references/api-reference.md +301 -0
- package/bin/skills/verl/references/troubleshooting.md +391 -0
- package/bin/skills/vllm/SKILL.md +364 -0
- package/bin/skills/vllm/references/optimization.md +226 -0
- package/bin/skills/vllm/references/quantization.md +284 -0
- package/bin/skills/vllm/references/server-deployment.md +255 -0
- package/bin/skills/vllm/references/troubleshooting.md +447 -0
- package/bin/skills/weights-and-biases/SKILL.md +590 -0
- package/bin/skills/weights-and-biases/references/artifacts.md +584 -0
- package/bin/skills/weights-and-biases/references/integrations.md +700 -0
- package/bin/skills/weights-and-biases/references/sweeps.md +847 -0
- package/bin/skills/whisper/SKILL.md +317 -0
- package/bin/skills/whisper/references/languages.md +189 -0
- package/bin/synsc +0 -0
- package/package.json +10 -0
|
@@ -0,0 +1,255 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: sentence-transformers
|
|
3
|
+
description: Framework for state-of-the-art sentence, text, and image embeddings. Provides 5000+ pre-trained models for semantic similarity, clustering, and retrieval. Supports multilingual, domain-specific, and multimodal models. Use for generating embeddings for RAG, semantic search, or similarity tasks. Best for production embedding generation.
|
|
4
|
+
version: 1.0.0
|
|
5
|
+
author: Synthetic Sciences
|
|
6
|
+
license: MIT
|
|
7
|
+
tags: [Sentence Transformers, Embeddings, Semantic Similarity, RAG, Multilingual, Multimodal, Pre-Trained Models, Clustering, Semantic Search, Production]
|
|
8
|
+
dependencies: [sentence-transformers, transformers, torch]
|
|
9
|
+
---
|
|
10
|
+
|
|
11
|
+
# Sentence Transformers - State-of-the-Art Embeddings
|
|
12
|
+
|
|
13
|
+
Python framework for sentence and text embeddings using transformers.
|
|
14
|
+
|
|
15
|
+
## When to use Sentence Transformers
|
|
16
|
+
|
|
17
|
+
**Use when:**
|
|
18
|
+
- Need high-quality embeddings for RAG
|
|
19
|
+
- Semantic similarity and search
|
|
20
|
+
- Text clustering and classification
|
|
21
|
+
- Multilingual embeddings (100+ languages)
|
|
22
|
+
- Running embeddings locally (no API)
|
|
23
|
+
- Cost-effective alternative to OpenAI embeddings
|
|
24
|
+
|
|
25
|
+
**Metrics**:
|
|
26
|
+
- **15,700+ GitHub stars**
|
|
27
|
+
- **5000+ pre-trained models**
|
|
28
|
+
- **100+ languages** supported
|
|
29
|
+
- Based on PyTorch/Transformers
|
|
30
|
+
|
|
31
|
+
**Use alternatives instead**:
|
|
32
|
+
- **OpenAI Embeddings**: Need API-based, highest quality
|
|
33
|
+
- **Instructor**: Task-specific instructions
|
|
34
|
+
- **Cohere Embed**: Managed service
|
|
35
|
+
|
|
36
|
+
## Quick start
|
|
37
|
+
|
|
38
|
+
### Installation
|
|
39
|
+
|
|
40
|
+
```bash
|
|
41
|
+
pip install sentence-transformers
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
### Basic usage
|
|
45
|
+
|
|
46
|
+
```python
|
|
47
|
+
from sentence_transformers import SentenceTransformer
|
|
48
|
+
|
|
49
|
+
# Load model
|
|
50
|
+
model = SentenceTransformer('all-MiniLM-L6-v2')
|
|
51
|
+
|
|
52
|
+
# Generate embeddings
|
|
53
|
+
sentences = [
|
|
54
|
+
"This is an example sentence",
|
|
55
|
+
"Each sentence is converted to a vector"
|
|
56
|
+
]
|
|
57
|
+
|
|
58
|
+
embeddings = model.encode(sentences)
|
|
59
|
+
print(embeddings.shape) # (2, 384)
|
|
60
|
+
|
|
61
|
+
# Cosine similarity
|
|
62
|
+
from sentence_transformers.util import cos_sim
|
|
63
|
+
similarity = cos_sim(embeddings[0], embeddings[1])
|
|
64
|
+
print(f"Similarity: {similarity.item():.4f}")
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
## Popular models
|
|
68
|
+
|
|
69
|
+
### General purpose
|
|
70
|
+
|
|
71
|
+
```python
|
|
72
|
+
# Fast, good quality (384 dim)
|
|
73
|
+
model = SentenceTransformer('all-MiniLM-L6-v2')
|
|
74
|
+
|
|
75
|
+
# Better quality (768 dim)
|
|
76
|
+
model = SentenceTransformer('all-mpnet-base-v2')
|
|
77
|
+
|
|
78
|
+
# Best quality (1024 dim, slower)
|
|
79
|
+
model = SentenceTransformer('all-roberta-large-v1')
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
### Multilingual
|
|
83
|
+
|
|
84
|
+
```python
|
|
85
|
+
# 50+ languages
|
|
86
|
+
model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')
|
|
87
|
+
|
|
88
|
+
# 100+ languages
|
|
89
|
+
model = SentenceTransformer('paraphrase-multilingual-mpnet-base-v2')
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
### Domain-specific
|
|
93
|
+
|
|
94
|
+
```python
|
|
95
|
+
# Legal domain
|
|
96
|
+
model = SentenceTransformer('nlpaueb/legal-bert-base-uncased')
|
|
97
|
+
|
|
98
|
+
# Scientific papers
|
|
99
|
+
model = SentenceTransformer('allenai/specter')
|
|
100
|
+
|
|
101
|
+
# Code
|
|
102
|
+
model = SentenceTransformer('microsoft/codebert-base')
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
## Semantic search
|
|
106
|
+
|
|
107
|
+
```python
|
|
108
|
+
from sentence_transformers import SentenceTransformer, util
|
|
109
|
+
|
|
110
|
+
model = SentenceTransformer('all-MiniLM-L6-v2')
|
|
111
|
+
|
|
112
|
+
# Corpus
|
|
113
|
+
corpus = [
|
|
114
|
+
"Python is a programming language",
|
|
115
|
+
"Machine learning uses algorithms",
|
|
116
|
+
"Neural networks are powerful"
|
|
117
|
+
]
|
|
118
|
+
|
|
119
|
+
# Encode corpus
|
|
120
|
+
corpus_embeddings = model.encode(corpus, convert_to_tensor=True)
|
|
121
|
+
|
|
122
|
+
# Query
|
|
123
|
+
query = "What is Python?"
|
|
124
|
+
query_embedding = model.encode(query, convert_to_tensor=True)
|
|
125
|
+
|
|
126
|
+
# Find most similar
|
|
127
|
+
hits = util.semantic_search(query_embedding, corpus_embeddings, top_k=3)
|
|
128
|
+
print(hits)
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
## Similarity computation
|
|
132
|
+
|
|
133
|
+
```python
|
|
134
|
+
# Cosine similarity
|
|
135
|
+
similarity = util.cos_sim(embedding1, embedding2)
|
|
136
|
+
|
|
137
|
+
# Dot product
|
|
138
|
+
similarity = util.dot_score(embedding1, embedding2)
|
|
139
|
+
|
|
140
|
+
# Pairwise cosine similarity
|
|
141
|
+
similarities = util.cos_sim(embeddings, embeddings)
|
|
142
|
+
```
|
|
143
|
+
|
|
144
|
+
## Batch encoding
|
|
145
|
+
|
|
146
|
+
```python
|
|
147
|
+
# Efficient batch processing
|
|
148
|
+
sentences = ["sentence 1", "sentence 2", ...] * 1000
|
|
149
|
+
|
|
150
|
+
embeddings = model.encode(
|
|
151
|
+
sentences,
|
|
152
|
+
batch_size=32,
|
|
153
|
+
show_progress_bar=True,
|
|
154
|
+
convert_to_tensor=False # or True for PyTorch tensors
|
|
155
|
+
)
|
|
156
|
+
```
|
|
157
|
+
|
|
158
|
+
## Fine-tuning
|
|
159
|
+
|
|
160
|
+
```python
|
|
161
|
+
from sentence_transformers import InputExample, losses
|
|
162
|
+
from torch.utils.data import DataLoader
|
|
163
|
+
|
|
164
|
+
# Training data
|
|
165
|
+
train_examples = [
|
|
166
|
+
InputExample(texts=['sentence 1', 'sentence 2'], label=0.8),
|
|
167
|
+
InputExample(texts=['sentence 3', 'sentence 4'], label=0.3),
|
|
168
|
+
]
|
|
169
|
+
|
|
170
|
+
train_dataloader = DataLoader(train_examples, batch_size=16)
|
|
171
|
+
|
|
172
|
+
# Loss function
|
|
173
|
+
train_loss = losses.CosineSimilarityLoss(model)
|
|
174
|
+
|
|
175
|
+
# Train
|
|
176
|
+
model.fit(
|
|
177
|
+
train_objectives=[(train_dataloader, train_loss)],
|
|
178
|
+
epochs=10,
|
|
179
|
+
warmup_steps=100
|
|
180
|
+
)
|
|
181
|
+
|
|
182
|
+
# Save
|
|
183
|
+
model.save('my-finetuned-model')
|
|
184
|
+
```
|
|
185
|
+
|
|
186
|
+
## LangChain integration
|
|
187
|
+
|
|
188
|
+
```python
|
|
189
|
+
from langchain_community.embeddings import HuggingFaceEmbeddings
|
|
190
|
+
|
|
191
|
+
embeddings = HuggingFaceEmbeddings(
|
|
192
|
+
model_name="sentence-transformers/all-mpnet-base-v2"
|
|
193
|
+
)
|
|
194
|
+
|
|
195
|
+
# Use with vector stores
|
|
196
|
+
from langchain_chroma import Chroma
|
|
197
|
+
|
|
198
|
+
vectorstore = Chroma.from_documents(
|
|
199
|
+
documents=docs,
|
|
200
|
+
embedding=embeddings
|
|
201
|
+
)
|
|
202
|
+
```
|
|
203
|
+
|
|
204
|
+
## LlamaIndex integration
|
|
205
|
+
|
|
206
|
+
```python
|
|
207
|
+
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
|
|
208
|
+
|
|
209
|
+
embed_model = HuggingFaceEmbedding(
|
|
210
|
+
model_name="sentence-transformers/all-mpnet-base-v2"
|
|
211
|
+
)
|
|
212
|
+
|
|
213
|
+
from llama_index.core import Settings, VectorStoreIndex
|
|
214
|
+
Settings.embed_model = embed_model
|
|
215
|
+
|
|
216
|
+
# Use in index
|
|
217
|
+
index = VectorStoreIndex.from_documents(documents)
|
|
218
|
+
```
|
|
219
|
+
|
|
220
|
+
## Model selection guide
|
|
221
|
+
|
|
222
|
+
| Model | Dimensions | Speed | Quality | Use Case |
|
|
223
|
+
|-------|------------|-------|---------|----------|
|
|
224
|
+
| all-MiniLM-L6-v2 | 384 | Fast | Good | General, prototyping |
|
|
225
|
+
| all-mpnet-base-v2 | 768 | Medium | Better | Production RAG |
|
|
226
|
+
| all-roberta-large-v1 | 1024 | Slow | Best | High accuracy needed |
|
|
227
|
+
| paraphrase-multilingual | 768 | Medium | Good | Multilingual |
|
|
228
|
+
|
|
229
|
+
## Best practices
|
|
230
|
+
|
|
231
|
+
1. **Start with all-MiniLM-L6-v2** - Good baseline
|
|
232
|
+
2. **Normalize embeddings** - Better for cosine similarity
|
|
233
|
+
3. **Use GPU if available** - 10× faster encoding
|
|
234
|
+
4. **Batch encoding** - More efficient
|
|
235
|
+
5. **Cache embeddings** - Expensive to recompute
|
|
236
|
+
6. **Fine-tune for domain** - Improves quality
|
|
237
|
+
7. **Test different models** - Quality varies by task
|
|
238
|
+
8. **Monitor memory** - Large models need more RAM
|
|
239
|
+
|
|
240
|
+
## Performance
|
|
241
|
+
|
|
242
|
+
| Model | Speed (sentences/sec) | Memory | Dimension |
|
|
243
|
+
|-------|----------------------|---------|-----------|
|
|
244
|
+
| MiniLM | ~2000 | 120MB | 384 |
|
|
245
|
+
| MPNet | ~600 | 420MB | 768 |
|
|
246
|
+
| RoBERTa | ~300 | 1.3GB | 1024 |
|
|
247
|
+
|
|
248
|
+
## Resources
|
|
249
|
+
|
|
250
|
+
- **GitHub**: https://github.com/UKPLab/sentence-transformers ⭐ 15,700+
|
|
251
|
+
- **Models**: https://huggingface.co/sentence-transformers
|
|
252
|
+
- **Docs**: https://www.sbert.net
|
|
253
|
+
- **License**: Apache 2.0
|
|
254
|
+
|
|
255
|
+
|
|
@@ -0,0 +1,123 @@
|
|
|
1
|
+
# Sentence Transformers Models Guide
|
|
2
|
+
|
|
3
|
+
Guide to selecting and using sentence-transformers models.
|
|
4
|
+
|
|
5
|
+
## Top recommended models
|
|
6
|
+
|
|
7
|
+
### General purpose
|
|
8
|
+
|
|
9
|
+
**all-MiniLM-L6-v2** (Default recommendation)
|
|
10
|
+
- Dimensions: 384
|
|
11
|
+
- Speed: ~2000 sentences/sec
|
|
12
|
+
- Quality: Good
|
|
13
|
+
- Use: Prototyping, general tasks
|
|
14
|
+
|
|
15
|
+
**all-mpnet-base-v2** (Best quality)
|
|
16
|
+
- Dimensions: 768
|
|
17
|
+
- Speed: ~600 sentences/sec
|
|
18
|
+
- Quality: Better
|
|
19
|
+
- Use: Production RAG
|
|
20
|
+
|
|
21
|
+
**all-roberta-large-v1** (Highest quality)
|
|
22
|
+
- Dimensions: 1024
|
|
23
|
+
- Speed: ~300 sentences/sec
|
|
24
|
+
- Quality: Best
|
|
25
|
+
- Use: When accuracy critical
|
|
26
|
+
|
|
27
|
+
### Multilingual (50+ languages)
|
|
28
|
+
|
|
29
|
+
**paraphrase-multilingual-MiniLM-L12-v2**
|
|
30
|
+
- Languages: 50+
|
|
31
|
+
- Dimensions: 384
|
|
32
|
+
- Speed: Fast
|
|
33
|
+
- Use: Multilingual semantic search
|
|
34
|
+
|
|
35
|
+
**paraphrase-multilingual-mpnet-base-v2**
|
|
36
|
+
- Languages: 50+
|
|
37
|
+
- Dimensions: 768
|
|
38
|
+
- Speed: Medium
|
|
39
|
+
- Use: Better multilingual quality
|
|
40
|
+
|
|
41
|
+
**LaBSE** (109 languages)
|
|
42
|
+
- Languages: 109
|
|
43
|
+
- Dimensions: 768
|
|
44
|
+
- Speed: Medium
|
|
45
|
+
- Use: Maximum language coverage
|
|
46
|
+
|
|
47
|
+
### Domain-specific
|
|
48
|
+
|
|
49
|
+
**allenai/specter** (Scientific papers)
|
|
50
|
+
- Domain: Academic papers
|
|
51
|
+
- Use: Paper similarity, citations
|
|
52
|
+
|
|
53
|
+
**nlpaueb/legal-bert-base-uncased** (Legal)
|
|
54
|
+
- Domain: Legal documents
|
|
55
|
+
- Use: Legal document analysis
|
|
56
|
+
|
|
57
|
+
**microsoft/codebert-base** (Code)
|
|
58
|
+
- Domain: Source code
|
|
59
|
+
- Use: Code similarity, search
|
|
60
|
+
|
|
61
|
+
## Model selection matrix
|
|
62
|
+
|
|
63
|
+
| Task | Model | Dimensions | Speed | Quality |
|
|
64
|
+
|------|-------|------------|-------|---------|
|
|
65
|
+
| Quick prototyping | MiniLM-L6 | 384 | Fast | Good |
|
|
66
|
+
| Production RAG | mpnet-base | 768 | Medium | Better |
|
|
67
|
+
| Highest accuracy | roberta-large | 1024 | Slow | Best |
|
|
68
|
+
| Multilingual | paraphrase-multi-mpnet | 768 | Medium | Good |
|
|
69
|
+
| Scientific papers | specter | 768 | Medium | Domain |
|
|
70
|
+
| Legal docs | legal-bert | 768 | Medium | Domain |
|
|
71
|
+
|
|
72
|
+
## Performance benchmarks
|
|
73
|
+
|
|
74
|
+
### Speed comparison (CPU)
|
|
75
|
+
|
|
76
|
+
| Model | Sentences/sec | Memory |
|
|
77
|
+
|-------|---------------|--------|
|
|
78
|
+
| MiniLM-L6 | 2000 | 120 MB |
|
|
79
|
+
| MPNet-base | 600 | 420 MB |
|
|
80
|
+
| RoBERTa-large | 300 | 1.3 GB |
|
|
81
|
+
|
|
82
|
+
### Quality comparison (STS Benchmark)
|
|
83
|
+
|
|
84
|
+
| Model | Cosine Similarity | Spearman |
|
|
85
|
+
|-------|-------------------|----------|
|
|
86
|
+
| MiniLM-L6 | 82.4 | - |
|
|
87
|
+
| MPNet-base | 84.1 | - |
|
|
88
|
+
| RoBERTa-large | 85.4 | - |
|
|
89
|
+
|
|
90
|
+
## Usage examples
|
|
91
|
+
|
|
92
|
+
### Load and use model
|
|
93
|
+
|
|
94
|
+
```python
|
|
95
|
+
from sentence_transformers import SentenceTransformer
|
|
96
|
+
|
|
97
|
+
# Load model
|
|
98
|
+
model = SentenceTransformer('all-mpnet-base-v2')
|
|
99
|
+
|
|
100
|
+
# Generate embeddings
|
|
101
|
+
sentences = ["This is a sentence", "This is another sentence"]
|
|
102
|
+
embeddings = model.encode(sentences)
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
### Compare different models
|
|
106
|
+
|
|
107
|
+
```python
|
|
108
|
+
models = {
|
|
109
|
+
'MiniLM': 'all-MiniLM-L6-v2',
|
|
110
|
+
'MPNet': 'all-mpnet-base-v2',
|
|
111
|
+
'RoBERTa': 'all-roberta-large-v1'
|
|
112
|
+
}
|
|
113
|
+
|
|
114
|
+
for name, model_name in models.items():
|
|
115
|
+
model = SentenceTransformer(model_name)
|
|
116
|
+
embeddings = model.encode(["Test sentence"])
|
|
117
|
+
print(f"{name}: {embeddings.shape}")
|
|
118
|
+
```
|
|
119
|
+
|
|
120
|
+
## Resources
|
|
121
|
+
|
|
122
|
+
- **Models**: https://huggingface.co/sentence-transformers
|
|
123
|
+
- **Docs**: https://www.sbert.net/docs/pretrained_models.html
|
|
@@ -0,0 +1,235 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: sentencepiece
|
|
3
|
+
description: Language-independent tokenizer treating text as raw Unicode. Supports BPE and Unigram algorithms. Fast (50k sentences/sec), lightweight (6MB memory), deterministic vocabulary. Used by T5, ALBERT, XLNet, mBART. Train on raw text without pre-tokenization. Use when you need multilingual support, CJK languages, or reproducible tokenization.
|
|
4
|
+
version: 1.0.0
|
|
5
|
+
author: Synthetic Sciences
|
|
6
|
+
license: MIT
|
|
7
|
+
tags: [Tokenization, SentencePiece, Language-Independent, BPE, Unigram, Multilingual, CJK Languages, Unicode, Deterministic, Google]
|
|
8
|
+
dependencies: [sentencepiece, transformers]
|
|
9
|
+
---
|
|
10
|
+
|
|
11
|
+
# SentencePiece - Language-Independent Tokenization
|
|
12
|
+
|
|
13
|
+
Unsupervised tokenizer that works on raw text without language-specific preprocessing.
|
|
14
|
+
|
|
15
|
+
## When to use SentencePiece
|
|
16
|
+
|
|
17
|
+
**Use SentencePiece when:**
|
|
18
|
+
- Building multilingual models (no language-specific rules)
|
|
19
|
+
- Working with CJK languages (Chinese, Japanese, Korean)
|
|
20
|
+
- Need reproducible tokenization (deterministic vocabulary)
|
|
21
|
+
- Want to train on raw text (no pre-tokenization needed)
|
|
22
|
+
- Require lightweight deployment (6MB memory, 50k sentences/sec)
|
|
23
|
+
|
|
24
|
+
**Performance**:
|
|
25
|
+
- **Speed**: 50,000 sentences/sec
|
|
26
|
+
- **Memory**: ~6MB for loaded model
|
|
27
|
+
- **Languages**: All (language-independent)
|
|
28
|
+
|
|
29
|
+
**Use alternatives instead**:
|
|
30
|
+
- **HuggingFace Tokenizers**: Faster training, more flexibility
|
|
31
|
+
- **tiktoken**: OpenAI models (GPT-3.5/4)
|
|
32
|
+
- **BERT WordPiece**: English-centric tasks
|
|
33
|
+
|
|
34
|
+
## Quick start
|
|
35
|
+
|
|
36
|
+
### Installation
|
|
37
|
+
|
|
38
|
+
```bash
|
|
39
|
+
# Python
|
|
40
|
+
pip install sentencepiece
|
|
41
|
+
|
|
42
|
+
# C++ (requires CMake)
|
|
43
|
+
git clone https://github.com/google/sentencepiece.git
|
|
44
|
+
cd sentencepiece
|
|
45
|
+
mkdir build && cd build
|
|
46
|
+
cmake .. && make -j $(nproc)
|
|
47
|
+
sudo make install
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
### Train model
|
|
51
|
+
|
|
52
|
+
```bash
|
|
53
|
+
# Command-line (BPE with 8000 vocab)
|
|
54
|
+
spm_train --input=data.txt --model_prefix=m --vocab_size=8000 --model_type=bpe
|
|
55
|
+
|
|
56
|
+
# Python API
|
|
57
|
+
import sentencepiece as spm
|
|
58
|
+
|
|
59
|
+
spm.SentencePieceTrainer.train(
|
|
60
|
+
input='data.txt',
|
|
61
|
+
model_prefix='m',
|
|
62
|
+
vocab_size=8000,
|
|
63
|
+
model_type='bpe'
|
|
64
|
+
)
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
**Training time**: ~1-2 minutes for 100MB corpus
|
|
68
|
+
|
|
69
|
+
### Encode and decode
|
|
70
|
+
|
|
71
|
+
```python
|
|
72
|
+
import sentencepiece as spm
|
|
73
|
+
|
|
74
|
+
# Load model
|
|
75
|
+
sp = spm.SentencePieceProcessor(model_file='m.model')
|
|
76
|
+
|
|
77
|
+
# Encode to pieces
|
|
78
|
+
pieces = sp.encode('This is a test', out_type=str)
|
|
79
|
+
print(pieces) # ['▁This', '▁is', '▁a', '▁test']
|
|
80
|
+
|
|
81
|
+
# Encode to IDs
|
|
82
|
+
ids = sp.encode('This is a test', out_type=int)
|
|
83
|
+
print(ids) # [284, 47, 11, 1243]
|
|
84
|
+
|
|
85
|
+
# Decode
|
|
86
|
+
text = sp.decode(ids)
|
|
87
|
+
print(text) # "This is a test"
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
## Language-independent design
|
|
91
|
+
|
|
92
|
+
### Whitespace as symbol (▁)
|
|
93
|
+
|
|
94
|
+
```python
|
|
95
|
+
text = "Hello world"
|
|
96
|
+
pieces = sp.encode(text, out_type=str)
|
|
97
|
+
print(pieces) # ['▁Hello', '▁world']
|
|
98
|
+
|
|
99
|
+
# Decode preserves spaces
|
|
100
|
+
decoded = sp.decode_pieces(pieces)
|
|
101
|
+
print(decoded) # "Hello world"
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
**Key principle**: Treat text as raw Unicode, whitespace = ▁ (meta symbol)
|
|
105
|
+
|
|
106
|
+
## Tokenization algorithms
|
|
107
|
+
|
|
108
|
+
### BPE (Byte-Pair Encoding)
|
|
109
|
+
|
|
110
|
+
```python
|
|
111
|
+
spm.SentencePieceTrainer.train(
|
|
112
|
+
input='data.txt',
|
|
113
|
+
model_prefix='bpe_model',
|
|
114
|
+
vocab_size=16000,
|
|
115
|
+
model_type='bpe'
|
|
116
|
+
)
|
|
117
|
+
```
|
|
118
|
+
|
|
119
|
+
**Used by**: mBART
|
|
120
|
+
|
|
121
|
+
### Unigram (default)
|
|
122
|
+
|
|
123
|
+
```python
|
|
124
|
+
spm.SentencePieceTrainer.train(
|
|
125
|
+
input='data.txt',
|
|
126
|
+
model_prefix='unigram_model',
|
|
127
|
+
vocab_size=8000,
|
|
128
|
+
model_type='unigram'
|
|
129
|
+
)
|
|
130
|
+
```
|
|
131
|
+
|
|
132
|
+
**Used by**: T5, ALBERT, XLNet
|
|
133
|
+
|
|
134
|
+
## Training configuration
|
|
135
|
+
|
|
136
|
+
### Essential parameters
|
|
137
|
+
|
|
138
|
+
```python
|
|
139
|
+
spm.SentencePieceTrainer.train(
|
|
140
|
+
input='corpus.txt',
|
|
141
|
+
model_prefix='m',
|
|
142
|
+
vocab_size=32000,
|
|
143
|
+
model_type='unigram',
|
|
144
|
+
character_coverage=0.9995, # 1.0 for CJK
|
|
145
|
+
user_defined_symbols=['[SEP]', '[CLS]'],
|
|
146
|
+
unk_piece='<unk>',
|
|
147
|
+
num_threads=16
|
|
148
|
+
)
|
|
149
|
+
```
|
|
150
|
+
|
|
151
|
+
### Character coverage
|
|
152
|
+
|
|
153
|
+
| Language Type | Coverage | Rationale |
|
|
154
|
+
|---------------|----------|-----------|
|
|
155
|
+
| English | 0.9995 | Most common chars |
|
|
156
|
+
| CJK (Chinese) | 1.0 | All characters needed |
|
|
157
|
+
| Multilingual | 0.9995 | Balance |
|
|
158
|
+
|
|
159
|
+
## Encoding options
|
|
160
|
+
|
|
161
|
+
### Subword regularization
|
|
162
|
+
|
|
163
|
+
```python
|
|
164
|
+
# Sample different tokenizations
|
|
165
|
+
for _ in range(3):
|
|
166
|
+
pieces = sp.encode('tokenization', out_type=str, enable_sampling=True, alpha=0.1)
|
|
167
|
+
print(pieces)
|
|
168
|
+
|
|
169
|
+
# Output (different each time):
|
|
170
|
+
# ['▁token', 'ization']
|
|
171
|
+
# ['▁tok', 'en', 'ization']
|
|
172
|
+
```
|
|
173
|
+
|
|
174
|
+
**Use case**: Data augmentation for robustness.
|
|
175
|
+
|
|
176
|
+
## Common patterns
|
|
177
|
+
|
|
178
|
+
### T5-style training
|
|
179
|
+
|
|
180
|
+
```python
|
|
181
|
+
spm.SentencePieceTrainer.train(
|
|
182
|
+
input='c4_corpus.txt',
|
|
183
|
+
model_prefix='t5',
|
|
184
|
+
vocab_size=32000,
|
|
185
|
+
model_type='unigram',
|
|
186
|
+
user_defined_symbols=[f'<extra_id_{i}>' for i in range(100)],
|
|
187
|
+
unk_id=2,
|
|
188
|
+
eos_id=1,
|
|
189
|
+
pad_id=0
|
|
190
|
+
)
|
|
191
|
+
```
|
|
192
|
+
|
|
193
|
+
### Integration with transformers
|
|
194
|
+
|
|
195
|
+
```python
|
|
196
|
+
from transformers import T5Tokenizer
|
|
197
|
+
|
|
198
|
+
# T5 uses SentencePiece internally
|
|
199
|
+
tokenizer = T5Tokenizer.from_pretrained('t5-base')
|
|
200
|
+
inputs = tokenizer('translate English to French: Hello', return_tensors='pt')
|
|
201
|
+
```
|
|
202
|
+
|
|
203
|
+
## Performance benchmarks
|
|
204
|
+
|
|
205
|
+
### Training speed
|
|
206
|
+
|
|
207
|
+
| Corpus | BPE (16k) | Unigram (8k) |
|
|
208
|
+
|--------|-----------|--------------|
|
|
209
|
+
| 100 MB | 1-2 min | 3-4 min |
|
|
210
|
+
| 1 GB | 10-15 min | 30-40 min |
|
|
211
|
+
|
|
212
|
+
### Tokenization speed
|
|
213
|
+
|
|
214
|
+
- **SentencePiece**: 50,000 sentences/sec
|
|
215
|
+
- **HF Tokenizers**: 200,000 sentences/sec (4× faster)
|
|
216
|
+
|
|
217
|
+
## Supported models
|
|
218
|
+
|
|
219
|
+
**T5 family**: `t5-base`, `t5-large` (32k vocab, Unigram)
|
|
220
|
+
**ALBERT**: `albert-base-v2` (30k vocab, Unigram)
|
|
221
|
+
**XLNet**: `xlnet-base-cased` (32k vocab, Unigram)
|
|
222
|
+
**mBART**: `facebook/mbart-large-50` (250k vocab, BPE)
|
|
223
|
+
|
|
224
|
+
## References
|
|
225
|
+
|
|
226
|
+
- **[Training Guide](references/training.md)** - Detailed options, corpus preparation
|
|
227
|
+
- **[Algorithms](references/algorithms.md)** - BPE vs Unigram, subword regularization
|
|
228
|
+
|
|
229
|
+
## Resources
|
|
230
|
+
|
|
231
|
+
- **GitHub**: https://github.com/google/sentencepiece ⭐ 10,000+
|
|
232
|
+
- **Paper**: https://arxiv.org/abs/1808.06226 (EMNLP 2018)
|
|
233
|
+
- **Version**: 0.2.0+
|
|
234
|
+
|
|
235
|
+
|