@synsci/cli-darwin-x64 1.1.49
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/skills/accelerate/SKILL.md +332 -0
- package/bin/skills/accelerate/references/custom-plugins.md +453 -0
- package/bin/skills/accelerate/references/megatron-integration.md +489 -0
- package/bin/skills/accelerate/references/performance.md +525 -0
- package/bin/skills/audiocraft/SKILL.md +564 -0
- package/bin/skills/audiocraft/references/advanced-usage.md +666 -0
- package/bin/skills/audiocraft/references/troubleshooting.md +504 -0
- package/bin/skills/autogpt/SKILL.md +403 -0
- package/bin/skills/autogpt/references/advanced-usage.md +535 -0
- package/bin/skills/autogpt/references/troubleshooting.md +420 -0
- package/bin/skills/awq/SKILL.md +310 -0
- package/bin/skills/awq/references/advanced-usage.md +324 -0
- package/bin/skills/awq/references/troubleshooting.md +344 -0
- package/bin/skills/axolotl/SKILL.md +158 -0
- package/bin/skills/axolotl/references/api.md +5548 -0
- package/bin/skills/axolotl/references/dataset-formats.md +1029 -0
- package/bin/skills/axolotl/references/index.md +15 -0
- package/bin/skills/axolotl/references/other.md +3563 -0
- package/bin/skills/bigcode-evaluation-harness/SKILL.md +405 -0
- package/bin/skills/bigcode-evaluation-harness/references/benchmarks.md +393 -0
- package/bin/skills/bigcode-evaluation-harness/references/custom-tasks.md +424 -0
- package/bin/skills/bigcode-evaluation-harness/references/issues.md +394 -0
- package/bin/skills/bitsandbytes/SKILL.md +411 -0
- package/bin/skills/bitsandbytes/references/memory-optimization.md +521 -0
- package/bin/skills/bitsandbytes/references/qlora-training.md +521 -0
- package/bin/skills/bitsandbytes/references/quantization-formats.md +447 -0
- package/bin/skills/blip-2/SKILL.md +564 -0
- package/bin/skills/blip-2/references/advanced-usage.md +680 -0
- package/bin/skills/blip-2/references/troubleshooting.md +526 -0
- package/bin/skills/chroma/SKILL.md +406 -0
- package/bin/skills/chroma/references/integration.md +38 -0
- package/bin/skills/clip/SKILL.md +253 -0
- package/bin/skills/clip/references/applications.md +207 -0
- package/bin/skills/constitutional-ai/SKILL.md +290 -0
- package/bin/skills/crewai/SKILL.md +498 -0
- package/bin/skills/crewai/references/flows.md +438 -0
- package/bin/skills/crewai/references/tools.md +429 -0
- package/bin/skills/crewai/references/troubleshooting.md +480 -0
- package/bin/skills/deepspeed/SKILL.md +141 -0
- package/bin/skills/deepspeed/references/08.md +17 -0
- package/bin/skills/deepspeed/references/09.md +173 -0
- package/bin/skills/deepspeed/references/2020.md +378 -0
- package/bin/skills/deepspeed/references/2023.md +279 -0
- package/bin/skills/deepspeed/references/assets.md +179 -0
- package/bin/skills/deepspeed/references/index.md +35 -0
- package/bin/skills/deepspeed/references/mii.md +118 -0
- package/bin/skills/deepspeed/references/other.md +1191 -0
- package/bin/skills/deepspeed/references/tutorials.md +6554 -0
- package/bin/skills/dspy/SKILL.md +590 -0
- package/bin/skills/dspy/references/examples.md +663 -0
- package/bin/skills/dspy/references/modules.md +475 -0
- package/bin/skills/dspy/references/optimizers.md +566 -0
- package/bin/skills/faiss/SKILL.md +221 -0
- package/bin/skills/faiss/references/index_types.md +280 -0
- package/bin/skills/flash-attention/SKILL.md +367 -0
- package/bin/skills/flash-attention/references/benchmarks.md +215 -0
- package/bin/skills/flash-attention/references/transformers-integration.md +293 -0
- package/bin/skills/gguf/SKILL.md +427 -0
- package/bin/skills/gguf/references/advanced-usage.md +504 -0
- package/bin/skills/gguf/references/troubleshooting.md +442 -0
- package/bin/skills/gptq/SKILL.md +450 -0
- package/bin/skills/gptq/references/calibration.md +337 -0
- package/bin/skills/gptq/references/integration.md +129 -0
- package/bin/skills/gptq/references/troubleshooting.md +95 -0
- package/bin/skills/grpo-rl-training/README.md +97 -0
- package/bin/skills/grpo-rl-training/SKILL.md +572 -0
- package/bin/skills/grpo-rl-training/examples/reward_functions_library.py +393 -0
- package/bin/skills/grpo-rl-training/templates/basic_grpo_training.py +228 -0
- package/bin/skills/guidance/SKILL.md +572 -0
- package/bin/skills/guidance/references/backends.md +554 -0
- package/bin/skills/guidance/references/constraints.md +674 -0
- package/bin/skills/guidance/references/examples.md +767 -0
- package/bin/skills/hqq/SKILL.md +445 -0
- package/bin/skills/hqq/references/advanced-usage.md +528 -0
- package/bin/skills/hqq/references/troubleshooting.md +503 -0
- package/bin/skills/hugging-face-cli/SKILL.md +191 -0
- package/bin/skills/hugging-face-cli/references/commands.md +954 -0
- package/bin/skills/hugging-face-cli/references/examples.md +374 -0
- package/bin/skills/hugging-face-datasets/SKILL.md +547 -0
- package/bin/skills/hugging-face-datasets/examples/diverse_training_examples.json +239 -0
- package/bin/skills/hugging-face-datasets/examples/system_prompt_template.txt +196 -0
- package/bin/skills/hugging-face-datasets/examples/training_examples.json +176 -0
- package/bin/skills/hugging-face-datasets/scripts/dataset_manager.py +522 -0
- package/bin/skills/hugging-face-datasets/scripts/sql_manager.py +844 -0
- package/bin/skills/hugging-face-datasets/templates/chat.json +55 -0
- package/bin/skills/hugging-face-datasets/templates/classification.json +62 -0
- package/bin/skills/hugging-face-datasets/templates/completion.json +51 -0
- package/bin/skills/hugging-face-datasets/templates/custom.json +75 -0
- package/bin/skills/hugging-face-datasets/templates/qa.json +54 -0
- package/bin/skills/hugging-face-datasets/templates/tabular.json +81 -0
- package/bin/skills/hugging-face-evaluation/SKILL.md +656 -0
- package/bin/skills/hugging-face-evaluation/examples/USAGE_EXAMPLES.md +382 -0
- package/bin/skills/hugging-face-evaluation/examples/artificial_analysis_to_hub.py +141 -0
- package/bin/skills/hugging-face-evaluation/examples/example_readme_tables.md +135 -0
- package/bin/skills/hugging-face-evaluation/examples/metric_mapping.json +50 -0
- package/bin/skills/hugging-face-evaluation/requirements.txt +20 -0
- package/bin/skills/hugging-face-evaluation/scripts/evaluation_manager.py +1374 -0
- package/bin/skills/hugging-face-evaluation/scripts/inspect_eval_uv.py +104 -0
- package/bin/skills/hugging-face-evaluation/scripts/inspect_vllm_uv.py +317 -0
- package/bin/skills/hugging-face-evaluation/scripts/lighteval_vllm_uv.py +303 -0
- package/bin/skills/hugging-face-evaluation/scripts/run_eval_job.py +98 -0
- package/bin/skills/hugging-face-evaluation/scripts/run_vllm_eval_job.py +331 -0
- package/bin/skills/hugging-face-evaluation/scripts/test_extraction.py +206 -0
- package/bin/skills/hugging-face-jobs/SKILL.md +1041 -0
- package/bin/skills/hugging-face-jobs/index.html +216 -0
- package/bin/skills/hugging-face-jobs/references/hardware_guide.md +336 -0
- package/bin/skills/hugging-face-jobs/references/hub_saving.md +352 -0
- package/bin/skills/hugging-face-jobs/references/token_usage.md +546 -0
- package/bin/skills/hugging-face-jobs/references/troubleshooting.md +475 -0
- package/bin/skills/hugging-face-jobs/scripts/cot-self-instruct.py +718 -0
- package/bin/skills/hugging-face-jobs/scripts/finepdfs-stats.py +546 -0
- package/bin/skills/hugging-face-jobs/scripts/generate-responses.py +587 -0
- package/bin/skills/hugging-face-model-trainer/SKILL.md +711 -0
- package/bin/skills/hugging-face-model-trainer/references/gguf_conversion.md +296 -0
- package/bin/skills/hugging-face-model-trainer/references/hardware_guide.md +283 -0
- package/bin/skills/hugging-face-model-trainer/references/hub_saving.md +364 -0
- package/bin/skills/hugging-face-model-trainer/references/reliability_principles.md +371 -0
- package/bin/skills/hugging-face-model-trainer/references/trackio_guide.md +189 -0
- package/bin/skills/hugging-face-model-trainer/references/training_methods.md +150 -0
- package/bin/skills/hugging-face-model-trainer/references/training_patterns.md +203 -0
- package/bin/skills/hugging-face-model-trainer/references/troubleshooting.md +282 -0
- package/bin/skills/hugging-face-model-trainer/scripts/convert_to_gguf.py +424 -0
- package/bin/skills/hugging-face-model-trainer/scripts/dataset_inspector.py +417 -0
- package/bin/skills/hugging-face-model-trainer/scripts/estimate_cost.py +150 -0
- package/bin/skills/hugging-face-model-trainer/scripts/train_dpo_example.py +106 -0
- package/bin/skills/hugging-face-model-trainer/scripts/train_grpo_example.py +89 -0
- package/bin/skills/hugging-face-model-trainer/scripts/train_sft_example.py +122 -0
- package/bin/skills/hugging-face-paper-publisher/SKILL.md +627 -0
- package/bin/skills/hugging-face-paper-publisher/examples/example_usage.md +327 -0
- package/bin/skills/hugging-face-paper-publisher/references/quick_reference.md +216 -0
- package/bin/skills/hugging-face-paper-publisher/scripts/paper_manager.py +508 -0
- package/bin/skills/hugging-face-paper-publisher/templates/arxiv.md +299 -0
- package/bin/skills/hugging-face-paper-publisher/templates/ml-report.md +358 -0
- package/bin/skills/hugging-face-paper-publisher/templates/modern.md +319 -0
- package/bin/skills/hugging-face-paper-publisher/templates/standard.md +201 -0
- package/bin/skills/hugging-face-tool-builder/SKILL.md +115 -0
- package/bin/skills/hugging-face-tool-builder/references/baseline_hf_api.py +57 -0
- package/bin/skills/hugging-face-tool-builder/references/baseline_hf_api.sh +40 -0
- package/bin/skills/hugging-face-tool-builder/references/baseline_hf_api.tsx +57 -0
- package/bin/skills/hugging-face-tool-builder/references/find_models_by_paper.sh +230 -0
- package/bin/skills/hugging-face-tool-builder/references/hf_enrich_models.sh +96 -0
- package/bin/skills/hugging-face-tool-builder/references/hf_model_card_frontmatter.sh +188 -0
- package/bin/skills/hugging-face-tool-builder/references/hf_model_papers_auth.sh +171 -0
- package/bin/skills/hugging-face-trackio/SKILL.md +65 -0
- package/bin/skills/hugging-face-trackio/references/logging_metrics.md +206 -0
- package/bin/skills/hugging-face-trackio/references/retrieving_metrics.md +223 -0
- package/bin/skills/huggingface-tokenizers/SKILL.md +516 -0
- package/bin/skills/huggingface-tokenizers/references/algorithms.md +653 -0
- package/bin/skills/huggingface-tokenizers/references/integration.md +637 -0
- package/bin/skills/huggingface-tokenizers/references/pipeline.md +723 -0
- package/bin/skills/huggingface-tokenizers/references/training.md +565 -0
- package/bin/skills/instructor/SKILL.md +740 -0
- package/bin/skills/instructor/references/examples.md +107 -0
- package/bin/skills/instructor/references/providers.md +70 -0
- package/bin/skills/instructor/references/validation.md +606 -0
- package/bin/skills/knowledge-distillation/SKILL.md +458 -0
- package/bin/skills/knowledge-distillation/references/minillm.md +334 -0
- package/bin/skills/lambda-labs/SKILL.md +545 -0
- package/bin/skills/lambda-labs/references/advanced-usage.md +611 -0
- package/bin/skills/lambda-labs/references/troubleshooting.md +530 -0
- package/bin/skills/langchain/SKILL.md +480 -0
- package/bin/skills/langchain/references/agents.md +499 -0
- package/bin/skills/langchain/references/integration.md +562 -0
- package/bin/skills/langchain/references/rag.md +600 -0
- package/bin/skills/langsmith/SKILL.md +422 -0
- package/bin/skills/langsmith/references/advanced-usage.md +548 -0
- package/bin/skills/langsmith/references/troubleshooting.md +537 -0
- package/bin/skills/litgpt/SKILL.md +469 -0
- package/bin/skills/litgpt/references/custom-models.md +568 -0
- package/bin/skills/litgpt/references/distributed-training.md +451 -0
- package/bin/skills/litgpt/references/supported-models.md +336 -0
- package/bin/skills/litgpt/references/training-recipes.md +619 -0
- package/bin/skills/llama-cpp/SKILL.md +258 -0
- package/bin/skills/llama-cpp/references/optimization.md +89 -0
- package/bin/skills/llama-cpp/references/quantization.md +213 -0
- package/bin/skills/llama-cpp/references/server.md +125 -0
- package/bin/skills/llama-factory/SKILL.md +80 -0
- package/bin/skills/llama-factory/references/_images.md +23 -0
- package/bin/skills/llama-factory/references/advanced.md +1055 -0
- package/bin/skills/llama-factory/references/getting_started.md +349 -0
- package/bin/skills/llama-factory/references/index.md +19 -0
- package/bin/skills/llama-factory/references/other.md +31 -0
- package/bin/skills/llamaguard/SKILL.md +337 -0
- package/bin/skills/llamaindex/SKILL.md +569 -0
- package/bin/skills/llamaindex/references/agents.md +83 -0
- package/bin/skills/llamaindex/references/data_connectors.md +108 -0
- package/bin/skills/llamaindex/references/query_engines.md +406 -0
- package/bin/skills/llava/SKILL.md +304 -0
- package/bin/skills/llava/references/training.md +197 -0
- package/bin/skills/lm-evaluation-harness/SKILL.md +490 -0
- package/bin/skills/lm-evaluation-harness/references/api-evaluation.md +490 -0
- package/bin/skills/lm-evaluation-harness/references/benchmark-guide.md +488 -0
- package/bin/skills/lm-evaluation-harness/references/custom-tasks.md +602 -0
- package/bin/skills/lm-evaluation-harness/references/distributed-eval.md +519 -0
- package/bin/skills/long-context/SKILL.md +536 -0
- package/bin/skills/long-context/references/extension_methods.md +468 -0
- package/bin/skills/long-context/references/fine_tuning.md +611 -0
- package/bin/skills/long-context/references/rope.md +402 -0
- package/bin/skills/mamba/SKILL.md +260 -0
- package/bin/skills/mamba/references/architecture-details.md +206 -0
- package/bin/skills/mamba/references/benchmarks.md +255 -0
- package/bin/skills/mamba/references/training-guide.md +388 -0
- package/bin/skills/megatron-core/SKILL.md +366 -0
- package/bin/skills/megatron-core/references/benchmarks.md +249 -0
- package/bin/skills/megatron-core/references/parallelism-guide.md +404 -0
- package/bin/skills/megatron-core/references/production-examples.md +473 -0
- package/bin/skills/megatron-core/references/training-recipes.md +547 -0
- package/bin/skills/miles/SKILL.md +315 -0
- package/bin/skills/miles/references/api-reference.md +141 -0
- package/bin/skills/miles/references/troubleshooting.md +352 -0
- package/bin/skills/mlflow/SKILL.md +704 -0
- package/bin/skills/mlflow/references/deployment.md +744 -0
- package/bin/skills/mlflow/references/model-registry.md +770 -0
- package/bin/skills/mlflow/references/tracking.md +680 -0
- package/bin/skills/modal/SKILL.md +341 -0
- package/bin/skills/modal/references/advanced-usage.md +503 -0
- package/bin/skills/modal/references/troubleshooting.md +494 -0
- package/bin/skills/model-merging/SKILL.md +539 -0
- package/bin/skills/model-merging/references/evaluation.md +462 -0
- package/bin/skills/model-merging/references/examples.md +428 -0
- package/bin/skills/model-merging/references/methods.md +352 -0
- package/bin/skills/model-pruning/SKILL.md +495 -0
- package/bin/skills/model-pruning/references/wanda.md +347 -0
- package/bin/skills/moe-training/SKILL.md +526 -0
- package/bin/skills/moe-training/references/architectures.md +432 -0
- package/bin/skills/moe-training/references/inference.md +348 -0
- package/bin/skills/moe-training/references/training.md +425 -0
- package/bin/skills/nanogpt/SKILL.md +290 -0
- package/bin/skills/nanogpt/references/architecture.md +382 -0
- package/bin/skills/nanogpt/references/data.md +476 -0
- package/bin/skills/nanogpt/references/training.md +564 -0
- package/bin/skills/nemo-curator/SKILL.md +383 -0
- package/bin/skills/nemo-curator/references/deduplication.md +87 -0
- package/bin/skills/nemo-curator/references/filtering.md +102 -0
- package/bin/skills/nemo-evaluator/SKILL.md +494 -0
- package/bin/skills/nemo-evaluator/references/adapter-system.md +340 -0
- package/bin/skills/nemo-evaluator/references/configuration.md +447 -0
- package/bin/skills/nemo-evaluator/references/custom-benchmarks.md +315 -0
- package/bin/skills/nemo-evaluator/references/execution-backends.md +361 -0
- package/bin/skills/nemo-guardrails/SKILL.md +297 -0
- package/bin/skills/nnsight/SKILL.md +436 -0
- package/bin/skills/nnsight/references/README.md +78 -0
- package/bin/skills/nnsight/references/api.md +344 -0
- package/bin/skills/nnsight/references/tutorials.md +300 -0
- package/bin/skills/openrlhf/SKILL.md +249 -0
- package/bin/skills/openrlhf/references/algorithm-comparison.md +404 -0
- package/bin/skills/openrlhf/references/custom-rewards.md +530 -0
- package/bin/skills/openrlhf/references/hybrid-engine.md +287 -0
- package/bin/skills/openrlhf/references/multi-node-training.md +454 -0
- package/bin/skills/outlines/SKILL.md +652 -0
- package/bin/skills/outlines/references/backends.md +615 -0
- package/bin/skills/outlines/references/examples.md +773 -0
- package/bin/skills/outlines/references/json_generation.md +652 -0
- package/bin/skills/peft/SKILL.md +431 -0
- package/bin/skills/peft/references/advanced-usage.md +514 -0
- package/bin/skills/peft/references/troubleshooting.md +480 -0
- package/bin/skills/phoenix/SKILL.md +475 -0
- package/bin/skills/phoenix/references/advanced-usage.md +619 -0
- package/bin/skills/phoenix/references/troubleshooting.md +538 -0
- package/bin/skills/pinecone/SKILL.md +358 -0
- package/bin/skills/pinecone/references/deployment.md +181 -0
- package/bin/skills/pytorch-fsdp/SKILL.md +126 -0
- package/bin/skills/pytorch-fsdp/references/index.md +7 -0
- package/bin/skills/pytorch-fsdp/references/other.md +4249 -0
- package/bin/skills/pytorch-lightning/SKILL.md +346 -0
- package/bin/skills/pytorch-lightning/references/callbacks.md +436 -0
- package/bin/skills/pytorch-lightning/references/distributed.md +490 -0
- package/bin/skills/pytorch-lightning/references/hyperparameter-tuning.md +556 -0
- package/bin/skills/pyvene/SKILL.md +473 -0
- package/bin/skills/pyvene/references/README.md +73 -0
- package/bin/skills/pyvene/references/api.md +383 -0
- package/bin/skills/pyvene/references/tutorials.md +376 -0
- package/bin/skills/qdrant/SKILL.md +493 -0
- package/bin/skills/qdrant/references/advanced-usage.md +648 -0
- package/bin/skills/qdrant/references/troubleshooting.md +631 -0
- package/bin/skills/ray-data/SKILL.md +326 -0
- package/bin/skills/ray-data/references/integration.md +82 -0
- package/bin/skills/ray-data/references/transformations.md +83 -0
- package/bin/skills/ray-train/SKILL.md +406 -0
- package/bin/skills/ray-train/references/multi-node.md +628 -0
- package/bin/skills/rwkv/SKILL.md +260 -0
- package/bin/skills/rwkv/references/architecture-details.md +344 -0
- package/bin/skills/rwkv/references/rwkv7.md +386 -0
- package/bin/skills/rwkv/references/state-management.md +369 -0
- package/bin/skills/saelens/SKILL.md +386 -0
- package/bin/skills/saelens/references/README.md +70 -0
- package/bin/skills/saelens/references/api.md +333 -0
- package/bin/skills/saelens/references/tutorials.md +318 -0
- package/bin/skills/segment-anything/SKILL.md +500 -0
- package/bin/skills/segment-anything/references/advanced-usage.md +589 -0
- package/bin/skills/segment-anything/references/troubleshooting.md +484 -0
- package/bin/skills/sentence-transformers/SKILL.md +255 -0
- package/bin/skills/sentence-transformers/references/models.md +123 -0
- package/bin/skills/sentencepiece/SKILL.md +235 -0
- package/bin/skills/sentencepiece/references/algorithms.md +200 -0
- package/bin/skills/sentencepiece/references/training.md +304 -0
- package/bin/skills/sglang/SKILL.md +442 -0
- package/bin/skills/sglang/references/deployment.md +490 -0
- package/bin/skills/sglang/references/radix-attention.md +413 -0
- package/bin/skills/sglang/references/structured-generation.md +541 -0
- package/bin/skills/simpo/SKILL.md +219 -0
- package/bin/skills/simpo/references/datasets.md +478 -0
- package/bin/skills/simpo/references/hyperparameters.md +452 -0
- package/bin/skills/simpo/references/loss-functions.md +350 -0
- package/bin/skills/skypilot/SKILL.md +509 -0
- package/bin/skills/skypilot/references/advanced-usage.md +491 -0
- package/bin/skills/skypilot/references/troubleshooting.md +570 -0
- package/bin/skills/slime/SKILL.md +464 -0
- package/bin/skills/slime/references/api-reference.md +392 -0
- package/bin/skills/slime/references/troubleshooting.md +386 -0
- package/bin/skills/speculative-decoding/SKILL.md +467 -0
- package/bin/skills/speculative-decoding/references/lookahead.md +309 -0
- package/bin/skills/speculative-decoding/references/medusa.md +350 -0
- package/bin/skills/stable-diffusion/SKILL.md +519 -0
- package/bin/skills/stable-diffusion/references/advanced-usage.md +716 -0
- package/bin/skills/stable-diffusion/references/troubleshooting.md +555 -0
- package/bin/skills/tensorboard/SKILL.md +629 -0
- package/bin/skills/tensorboard/references/integrations.md +638 -0
- package/bin/skills/tensorboard/references/profiling.md +545 -0
- package/bin/skills/tensorboard/references/visualization.md +620 -0
- package/bin/skills/tensorrt-llm/SKILL.md +187 -0
- package/bin/skills/tensorrt-llm/references/multi-gpu.md +298 -0
- package/bin/skills/tensorrt-llm/references/optimization.md +242 -0
- package/bin/skills/tensorrt-llm/references/serving.md +470 -0
- package/bin/skills/tinker/SKILL.md +362 -0
- package/bin/skills/tinker/references/api-reference.md +168 -0
- package/bin/skills/tinker/references/getting-started.md +157 -0
- package/bin/skills/tinker/references/loss-functions.md +163 -0
- package/bin/skills/tinker/references/models-and-lora.md +139 -0
- package/bin/skills/tinker/references/recipes.md +280 -0
- package/bin/skills/tinker/references/reinforcement-learning.md +212 -0
- package/bin/skills/tinker/references/rendering.md +243 -0
- package/bin/skills/tinker/references/supervised-learning.md +232 -0
- package/bin/skills/tinker-training-cost/SKILL.md +187 -0
- package/bin/skills/tinker-training-cost/scripts/calculate_cost.py +123 -0
- package/bin/skills/torchforge/SKILL.md +433 -0
- package/bin/skills/torchforge/references/api-reference.md +327 -0
- package/bin/skills/torchforge/references/troubleshooting.md +409 -0
- package/bin/skills/torchtitan/SKILL.md +358 -0
- package/bin/skills/torchtitan/references/checkpoint.md +181 -0
- package/bin/skills/torchtitan/references/custom-models.md +258 -0
- package/bin/skills/torchtitan/references/float8.md +133 -0
- package/bin/skills/torchtitan/references/fsdp.md +126 -0
- package/bin/skills/transformer-lens/SKILL.md +346 -0
- package/bin/skills/transformer-lens/references/README.md +54 -0
- package/bin/skills/transformer-lens/references/api.md +362 -0
- package/bin/skills/transformer-lens/references/tutorials.md +339 -0
- package/bin/skills/trl-fine-tuning/SKILL.md +455 -0
- package/bin/skills/trl-fine-tuning/references/dpo-variants.md +227 -0
- package/bin/skills/trl-fine-tuning/references/online-rl.md +82 -0
- package/bin/skills/trl-fine-tuning/references/reward-modeling.md +122 -0
- package/bin/skills/trl-fine-tuning/references/sft-training.md +168 -0
- package/bin/skills/unsloth/SKILL.md +80 -0
- package/bin/skills/unsloth/references/index.md +7 -0
- package/bin/skills/unsloth/references/llms-full.md +16799 -0
- package/bin/skills/unsloth/references/llms-txt.md +12044 -0
- package/bin/skills/unsloth/references/llms.md +82 -0
- package/bin/skills/verl/SKILL.md +391 -0
- package/bin/skills/verl/references/api-reference.md +301 -0
- package/bin/skills/verl/references/troubleshooting.md +391 -0
- package/bin/skills/vllm/SKILL.md +364 -0
- package/bin/skills/vllm/references/optimization.md +226 -0
- package/bin/skills/vllm/references/quantization.md +284 -0
- package/bin/skills/vllm/references/server-deployment.md +255 -0
- package/bin/skills/vllm/references/troubleshooting.md +447 -0
- package/bin/skills/weights-and-biases/SKILL.md +590 -0
- package/bin/skills/weights-and-biases/references/artifacts.md +584 -0
- package/bin/skills/weights-and-biases/references/integrations.md +700 -0
- package/bin/skills/weights-and-biases/references/sweeps.md +847 -0
- package/bin/skills/whisper/SKILL.md +317 -0
- package/bin/skills/whisper/references/languages.md +189 -0
- package/bin/synsc +0 -0
- package/package.json +10 -0
|
@@ -0,0 +1,565 @@
|
|
|
1
|
+
# Training Custom Tokenizers
|
|
2
|
+
|
|
3
|
+
Complete guide to training tokenizers from scratch.
|
|
4
|
+
|
|
5
|
+
## Training workflow
|
|
6
|
+
|
|
7
|
+
### Step 1: Choose tokenization algorithm
|
|
8
|
+
|
|
9
|
+
**Decision tree**:
|
|
10
|
+
- **GPT-style model** → BPE
|
|
11
|
+
- **BERT-style model** → WordPiece
|
|
12
|
+
- **Multilingual/No word boundaries** → Unigram
|
|
13
|
+
|
|
14
|
+
### Step 2: Prepare training data
|
|
15
|
+
|
|
16
|
+
```python
|
|
17
|
+
# Option 1: From files
|
|
18
|
+
files = ["train.txt", "validation.txt"]
|
|
19
|
+
|
|
20
|
+
# Option 2: From Python list
|
|
21
|
+
texts = [
|
|
22
|
+
"This is the first sentence.",
|
|
23
|
+
"This is the second sentence.",
|
|
24
|
+
# ... more texts
|
|
25
|
+
]
|
|
26
|
+
|
|
27
|
+
# Option 3: From dataset iterator
|
|
28
|
+
from datasets import load_dataset
|
|
29
|
+
|
|
30
|
+
dataset = load_dataset("wikitext", "wikitext-103-raw-v1", split="train")
|
|
31
|
+
|
|
32
|
+
def batch_iterator(batch_size=1000):
|
|
33
|
+
for i in range(0, len(dataset), batch_size):
|
|
34
|
+
yield dataset[i:i + batch_size]["text"]
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
### Step 3: Initialize tokenizer
|
|
38
|
+
|
|
39
|
+
**BPE example**:
|
|
40
|
+
```python
|
|
41
|
+
from tokenizers import Tokenizer
|
|
42
|
+
from tokenizers.models import BPE
|
|
43
|
+
from tokenizers.trainers import BpeTrainer
|
|
44
|
+
from tokenizers.pre_tokenizers import ByteLevel
|
|
45
|
+
from tokenizers.decoders import ByteLevel as ByteLevelDecoder
|
|
46
|
+
|
|
47
|
+
tokenizer = Tokenizer(BPE())
|
|
48
|
+
tokenizer.pre_tokenizer = ByteLevel()
|
|
49
|
+
tokenizer.decoder = ByteLevelDecoder()
|
|
50
|
+
|
|
51
|
+
trainer = BpeTrainer(
|
|
52
|
+
vocab_size=50000,
|
|
53
|
+
min_frequency=2,
|
|
54
|
+
special_tokens=["<|endoftext|>", "<|padding|>"],
|
|
55
|
+
show_progress=True
|
|
56
|
+
)
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
**WordPiece example**:
|
|
60
|
+
```python
|
|
61
|
+
from tokenizers.models import WordPiece
|
|
62
|
+
from tokenizers.trainers import WordPieceTrainer
|
|
63
|
+
from tokenizers.normalizers import BertNormalizer
|
|
64
|
+
from tokenizers.pre_tokenizers import BertPreTokenizer
|
|
65
|
+
|
|
66
|
+
tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
|
|
67
|
+
tokenizer.normalizer = BertNormalizer(lowercase=True)
|
|
68
|
+
tokenizer.pre_tokenizer = BertPreTokenizer()
|
|
69
|
+
|
|
70
|
+
trainer = WordPieceTrainer(
|
|
71
|
+
vocab_size=30522,
|
|
72
|
+
min_frequency=2,
|
|
73
|
+
special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
|
|
74
|
+
continuing_subword_prefix="##",
|
|
75
|
+
show_progress=True
|
|
76
|
+
)
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
**Unigram example**:
|
|
80
|
+
```python
|
|
81
|
+
from tokenizers.models import Unigram
|
|
82
|
+
from tokenizers.trainers import UnigramTrainer
|
|
83
|
+
|
|
84
|
+
tokenizer = Tokenizer(Unigram())
|
|
85
|
+
|
|
86
|
+
trainer = UnigramTrainer(
|
|
87
|
+
vocab_size=8000,
|
|
88
|
+
special_tokens=["<unk>", "<s>", "</s>", "<pad>"],
|
|
89
|
+
unk_token="<unk>",
|
|
90
|
+
show_progress=True
|
|
91
|
+
)
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
### Step 4: Train
|
|
95
|
+
|
|
96
|
+
```python
|
|
97
|
+
# From files
|
|
98
|
+
tokenizer.train(files=files, trainer=trainer)
|
|
99
|
+
|
|
100
|
+
# From iterator (recommended for large datasets)
|
|
101
|
+
tokenizer.train_from_iterator(
|
|
102
|
+
batch_iterator(),
|
|
103
|
+
trainer=trainer,
|
|
104
|
+
length=len(dataset) # Optional, for progress bar
|
|
105
|
+
)
|
|
106
|
+
```
|
|
107
|
+
|
|
108
|
+
**Training time** (30k vocab on 16-core CPU):
|
|
109
|
+
- 10 MB: 15-30 seconds
|
|
110
|
+
- 100 MB: 1-3 minutes
|
|
111
|
+
- 1 GB: 15-30 minutes
|
|
112
|
+
- 10 GB: 2-4 hours
|
|
113
|
+
|
|
114
|
+
### Step 5: Add post-processing
|
|
115
|
+
|
|
116
|
+
```python
|
|
117
|
+
from tokenizers.processors import TemplateProcessing
|
|
118
|
+
|
|
119
|
+
# BERT-style
|
|
120
|
+
tokenizer.post_processor = TemplateProcessing(
|
|
121
|
+
single="[CLS] $A [SEP]",
|
|
122
|
+
pair="[CLS] $A [SEP] $B [SEP]",
|
|
123
|
+
special_tokens=[
|
|
124
|
+
("[CLS]", tokenizer.token_to_id("[CLS]")),
|
|
125
|
+
("[SEP]", tokenizer.token_to_id("[SEP]")),
|
|
126
|
+
],
|
|
127
|
+
)
|
|
128
|
+
|
|
129
|
+
# GPT-2 style
|
|
130
|
+
tokenizer.post_processor = TemplateProcessing(
|
|
131
|
+
single="$A <|endoftext|>",
|
|
132
|
+
special_tokens=[
|
|
133
|
+
("<|endoftext|>", tokenizer.token_to_id("<|endoftext|>")),
|
|
134
|
+
],
|
|
135
|
+
)
|
|
136
|
+
```
|
|
137
|
+
|
|
138
|
+
### Step 6: Save
|
|
139
|
+
|
|
140
|
+
```python
|
|
141
|
+
# Save to JSON
|
|
142
|
+
tokenizer.save("my-tokenizer.json")
|
|
143
|
+
|
|
144
|
+
# Save to directory (for transformers)
|
|
145
|
+
tokenizer.save("my-tokenizer-dir/tokenizer.json")
|
|
146
|
+
|
|
147
|
+
# Convert to transformers format
|
|
148
|
+
from transformers import PreTrainedTokenizerFast
|
|
149
|
+
|
|
150
|
+
transformers_tokenizer = PreTrainedTokenizerFast(
|
|
151
|
+
tokenizer_object=tokenizer,
|
|
152
|
+
unk_token="[UNK]",
|
|
153
|
+
pad_token="[PAD]",
|
|
154
|
+
cls_token="[CLS]",
|
|
155
|
+
sep_token="[SEP]",
|
|
156
|
+
mask_token="[MASK]"
|
|
157
|
+
)
|
|
158
|
+
|
|
159
|
+
transformers_tokenizer.save_pretrained("my-tokenizer-dir")
|
|
160
|
+
```
|
|
161
|
+
|
|
162
|
+
## Trainer configuration
|
|
163
|
+
|
|
164
|
+
### BpeTrainer parameters
|
|
165
|
+
|
|
166
|
+
```python
|
|
167
|
+
from tokenizers.trainers import BpeTrainer
|
|
168
|
+
|
|
169
|
+
trainer = BpeTrainer(
|
|
170
|
+
    vocab_size=30000,                # Target vocabulary size
    min_frequency=2,                 # Minimum frequency for merges
    special_tokens=["[UNK]"],        # Special tokens (added first)
    limit_alphabet=1000,             # Limit initial alphabet size
    initial_alphabet=[],             # Pre-defined initial characters
    show_progress=True,              # Show progress bar
    continuing_subword_prefix="",    # Prefix for continuing subwords
    end_of_word_suffix=""            # Suffix for end of words
)
```

**Parameter tuning**:
- **vocab_size**: Start with 30k for English, 50k for multilingual
- **min_frequency**: 2-5 for large corpora, 1 for small
- **limit_alphabet**: Reduce for non-English (CJK languages)

### WordPieceTrainer parameters

```python
from tokenizers.trainers import WordPieceTrainer

trainer = WordPieceTrainer(
    vocab_size=30522,  # BERT uses 30,522
    min_frequency=2,
    special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
    limit_alphabet=1000,
    continuing_subword_prefix="##",  # BERT-style prefix
    show_progress=True
)
```

### UnigramTrainer parameters

```python
from tokenizers.trainers import UnigramTrainer

trainer = UnigramTrainer(
    vocab_size=8000,          # Typically smaller than BPE/WordPiece
    special_tokens=["<unk>", "<s>", "</s>"],
    unk_token="<unk>",
    max_piece_length=16,      # Maximum token length
    n_sub_iterations=2,       # EM algorithm iterations
    shrinking_factor=0.75,    # Vocabulary reduction rate
    show_progress=True
)
```

## Training from large datasets

### Memory-efficient training

```python
from datasets import load_dataset
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer

# Load dataset
dataset = load_dataset("wikipedia", "20220301.en", split="train", streaming=True)

# Create iterator (yields batches)
def batch_iterator(batch_size=1000):
    batch = []
    for sample in dataset:
        batch.append(sample["text"])
        if len(batch) >= batch_size:
            yield batch
            batch = []
    if batch:
        yield batch

# Initialize tokenizer
tokenizer = Tokenizer(BPE())
trainer = BpeTrainer(vocab_size=50000, special_tokens=["<|endoftext|>"])

# Train (memory efficient - streams data)
tokenizer.train_from_iterator(
    batch_iterator(),
    trainer=trainer
)
```

**Memory usage**: ~200 MB (vs 10+ GB loading full dataset)

### Multi-file training

```python
import glob

# Find all training files
files = glob.glob("data/train/*.txt")
print(f"Training on {len(files)} files")

# Train on all files
tokenizer.train(files=files, trainer=trainer)
```

### Parallel training (multi-processing)

```python
from multiprocessing import Pool, cpu_count
import os

def train_shard(shard_files):
    """Train tokenizer on a shard of files."""
    tokenizer = Tokenizer(BPE())
    trainer = BpeTrainer(vocab_size=50000)
    tokenizer.train(files=shard_files, trainer=trainer)
    return tokenizer.get_vocab()

# Split files into shards
num_shards = cpu_count()
file_shards = [files[i::num_shards] for i in range(num_shards)]

# Train shards in parallel
with Pool(num_shards) as pool:
    vocab_shards = pool.map(train_shard, file_shards)

# Merge vocabularies (custom logic needed)
# This is a simplified example - real implementation would merge intelligently
final_vocab = {}
for vocab in vocab_shards:
    final_vocab.update(vocab)
```

## Domain-specific tokenizers

### Code tokenizer

```python
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import ByteLevel
from tokenizers.normalizers import Sequence, NFC

# Code-optimized configuration
tokenizer = Tokenizer(BPE())

# Minimal normalization (preserve case, whitespace)
tokenizer.normalizer = NFC()  # Only normalize Unicode

# Byte-level pre-tokenization (handles all characters)
tokenizer.pre_tokenizer = ByteLevel()

# Train on code corpus
trainer = BpeTrainer(
    vocab_size=50000,
    special_tokens=["<|endoftext|>", "<|pad|>"],
    min_frequency=2
)

tokenizer.train(files=["code_corpus.txt"], trainer=trainer)
```

### Medical/scientific tokenizer

```python
# Preserve case and special characters
from tokenizers.normalizers import NFKC
from tokenizers.pre_tokenizers import Whitespace, Punctuation, Sequence

tokenizer = Tokenizer(BPE())

# Minimal normalization
tokenizer.normalizer = NFKC()

# Preserve medical terms
tokenizer.pre_tokenizer = Sequence([
    Whitespace(),
    Punctuation(behavior="isolated")  # Keep punctuation separate
])

trainer = BpeTrainer(
    vocab_size=50000,
    special_tokens=["[UNK]", "[CLS]", "[SEP]"],
    min_frequency=3  # Higher threshold for rare medical terms
)

tokenizer.train(files=["pubmed_corpus.txt"], trainer=trainer)
```

### Multilingual tokenizer

```python
# Handle multiple scripts
from tokenizers.normalizers import NFKC, Lowercase, Sequence

tokenizer = Tokenizer(BPE())

# Normalize but don't lowercase (preserves script differences)
tokenizer.normalizer = NFKC()

# Byte-level handles all Unicode
from tokenizers.pre_tokenizers import ByteLevel
tokenizer.pre_tokenizer = ByteLevel()

trainer = BpeTrainer(
    vocab_size=100000,   # Larger vocab for multiple languages
    special_tokens=["<unk>", "<s>", "</s>"],
    limit_alphabet=None  # No limit (handles all scripts)
)

# Train on multilingual corpus
tokenizer.train(files=["multilingual_corpus.txt"], trainer=trainer)
```

## Vocabulary size selection

### Guidelines by task

| Task | Recommended Vocab Size | Rationale |
|-----------------------|------------------------|-----------|
| English (monolingual) | 30,000 - 50,000 | Balanced coverage |
| Multilingual | 50,000 - 250,000 | More languages = more tokens |
| Code | 30,000 - 50,000 | Similar to English |
| Domain-specific | 10,000 - 30,000 | Smaller, focused vocabulary |
| Character-level tasks | 1,000 - 5,000 | Only characters + subwords |

### Vocabulary size impact

**Small vocab (10k)**:
- Pros: Faster training, smaller model, less memory
- Cons: More tokens per sentence, worse OOV handling

**Medium vocab (30k-50k)**:
- Pros: Good balance, standard choice
- Cons: None (recommended default)

**Large vocab (100k+)**:
- Pros: Fewer tokens per sentence, better OOV
- Cons: Slower training, larger embedding table

### Empirical testing

```python
# Train multiple tokenizers with different vocab sizes
vocab_sizes = [10000, 30000, 50000, 100000]

for vocab_size in vocab_sizes:
    tokenizer = Tokenizer(BPE())
    trainer = BpeTrainer(vocab_size=vocab_size)
    tokenizer.train(files=["sample.txt"], trainer=trainer)

    # Evaluate on test set
    test_text = "Test sentence for evaluation..."
    tokens = tokenizer.encode(test_text).ids

    print(f"Vocab: {vocab_size:6d} | Tokens: {len(tokens):3d} | Avg: {len(test_text)/len(tokens):.2f} chars/token")

# Example output:
# Vocab:  10000 | Tokens:  12 | Avg: 2.33 chars/token
# Vocab:  30000 | Tokens:   8 | Avg: 3.50 chars/token
# Vocab:  50000 | Tokens:   7 | Avg: 4.00 chars/token
# Vocab: 100000 | Tokens:   6 | Avg: 4.67 chars/token
```

## Testing tokenizer quality

### Coverage test

```python
# Test on held-out data
test_corpus = load_dataset("wikitext", "wikitext-103-raw-v1", split="test")

total_tokens = 0
unk_tokens = 0
unk_id = tokenizer.token_to_id("[UNK]")

for text in test_corpus["text"]:
    if text.strip():
        encoding = tokenizer.encode(text)
        total_tokens += len(encoding.ids)
        unk_tokens += encoding.ids.count(unk_id)

unk_rate = unk_tokens / total_tokens
print(f"Unknown token rate: {unk_rate:.2%}")

# Good quality: <1% unknown tokens
# Acceptable: 1-5%
# Poor: >5%
```

### Compression test

```python
# Measure tokenization efficiency
import numpy as np

token_lengths = []

for text in test_corpus["text"][:1000]:
    if text.strip():
        encoding = tokenizer.encode(text)
        chars_per_token = len(text) / len(encoding.ids)
        token_lengths.append(chars_per_token)

avg_chars_per_token = np.mean(token_lengths)
print(f"Average characters per token: {avg_chars_per_token:.2f}")

# Good: 4-6 chars/token (English)
# Acceptable: 3-4 chars/token
# Poor: <3 chars/token (under-compression)
```

### Semantic test

```python
# Manually inspect tokenization of common words/phrases
test_phrases = [
    "tokenization",
    "machine learning",
    "artificial intelligence",
    "preprocessing",
    "hello world"
]

for phrase in test_phrases:
    tokens = tokenizer.encode(phrase).tokens
    print(f"{phrase:25s} → {tokens}")

# Good tokenization:
# tokenization → ['token', 'ization']
# machine learning → ['machine', 'learning']
# artificial intelligence → ['artificial', 'intelligence']
```

## Troubleshooting

### Issue: Training too slow

**Solutions**:
1. Reduce vocabulary size
2. Increase `min_frequency`
3. Use `limit_alphabet` to reduce initial alphabet
4. Train on subset first

```python
# Fast training configuration
trainer = BpeTrainer(
    vocab_size=20000,     # Smaller vocab
    min_frequency=5,      # Higher threshold
    limit_alphabet=500,   # Limit alphabet
    show_progress=True
)
```

### Issue: High unknown token rate

**Solutions**:
1. Increase vocabulary size
2. Decrease `min_frequency`
3. Check normalization (might be too aggressive)

```python
# Better coverage configuration
trainer = BpeTrainer(
    vocab_size=50000,   # Larger vocab
    min_frequency=1,    # Lower threshold
)
```

### Issue: Poor quality tokenization

**Solutions**:
1. Verify normalization matches your use case
2. Check pre-tokenization splits correctly
3. Ensure training data is representative
4. Try different algorithm (BPE vs WordPiece vs Unigram)

```python
# Debug tokenization pipeline
text = "Sample text to debug"

# Check normalization
normalized = tokenizer.normalizer.normalize_str(text)
print(f"Normalized: {normalized}")

# Check pre-tokenization
pre_tokens = tokenizer.pre_tokenizer.pre_tokenize_str(text)
print(f"Pre-tokens: {pre_tokens}")

# Check final tokenization
tokens = tokenizer.encode(text).tokens
print(f"Tokens: {tokens}")
```

## Best practices

1. **Use representative training data** - Match your target domain
2. **Start with standard configs** - BERT WordPiece or GPT-2 BPE
3. **Test on held-out data** - Measure unknown token rate
4. **Iterate on vocabulary size** - Test 30k, 50k, 100k
5. **Save tokenizer with model** - Ensure reproducibility
6. **Version your tokenizers** - Track changes for reproducibility
7. **Document special tokens** - Critical for model training