twinkle-kit 0.2.dev0__tar.gz → 0.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/PKG-INFO +73 -39
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/README.md +70 -37
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/pyproject.toml +3 -3
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/setup.cfg +1 -1
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle/checkpoint_engine/base.py +3 -1
- twinkle_kit-0.3.0/src/twinkle/checkpoint_engine/hccl_checkpoint_engine.py +478 -0
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle/checkpoint_engine/manager.py +37 -7
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle/checkpoint_engine/mixin.py +9 -1
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle/data_format/message.py +1 -4
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle/data_format/output.py +4 -2
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle/data_format/sampling.py +56 -12
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle/data_format/trajectory.py +5 -3
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle/dataloader/dataloader.py +76 -11
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle/dataloader/device_mesh_fetcher.py +4 -0
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle/dataloader/device_mesh_sampler.py +14 -1
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle/dataloader/retry_sampler.py +26 -10
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle/dataset/base.py +48 -12
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle/dataset/iterable_dataset.py +3 -2
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle/dataset/iterable_packing_dataset.py +5 -4
- twinkle_kit-0.3.0/src/twinkle/dataset/lazy_dataset.py +209 -0
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle/dataset/packing_dataset.py +1 -1
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle/hub/hub.py +23 -12
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle/infra/__init__.py +31 -1
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle/infra/_ray/ray_helper.py +1 -1
- twinkle_kit-0.3.0/src/twinkle/infra/collectors.py +55 -0
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle/kernel/__init__.py +1 -0
- twinkle_kit-0.3.0/src/twinkle/kernel/monkey_patch_npu.py +86 -0
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle/loss/__init__.py +10 -3
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle/loss/base.py +2 -0
- twinkle_kit-0.3.0/src/twinkle/loss/cross_entropy.py +40 -0
- twinkle_kit-0.3.0/src/twinkle/loss/dpo.py +551 -0
- twinkle_kit-0.3.0/src/twinkle/loss/gkd.py +233 -0
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle/loss/grpo.py +4 -105
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle/loss/mse.py +2 -0
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle/metric/__init__.py +1 -0
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle/metric/accuracy.py +13 -7
- twinkle_kit-0.3.0/src/twinkle/metric/dpo.py +214 -0
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle/metric/loss.py +16 -11
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle/metric/train_metric.py +4 -2
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle/model/base.py +21 -2
- twinkle_kit-0.3.0/src/twinkle/model/megatron/_mindspeed_runtime.py +221 -0
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle/model/megatron/megatron.py +396 -409
- twinkle_kit-0.3.0/src/twinkle/model/megatron/multi_lora_megatron.py +423 -0
- twinkle_kit-0.3.0/src/twinkle/model/megatron/strategy/megatron.py +325 -0
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle/model/multi_lora.py +153 -52
- twinkle_kit-0.3.0/src/twinkle/model/optimizer_group.py +85 -0
- twinkle_kit-0.3.0/src/twinkle/model/transformers/moe/__init__.py +4 -0
- twinkle_kit-0.3.0/src/twinkle/model/transformers/moe/ep_utils.py +291 -0
- twinkle_kit-0.3.0/src/twinkle/model/transformers/moe/expert_parallel.py +501 -0
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle/model/transformers/multi_lora_transformers.py +58 -16
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle/model/transformers/strategy/accelerate.py +73 -7
- twinkle_kit-0.3.0/src/twinkle/model/transformers/strategy/load_context.py +27 -0
- twinkle_kit-0.3.0/src/twinkle/model/transformers/strategy/native_fsdp.py +546 -0
- twinkle_kit-0.2.dev0/src/twinkle/model/transformers/strategy/sequence_parallel.py → twinkle_kit-0.3.0/src/twinkle/model/transformers/strategy/sequence_parallel/__init__.py +421 -500
- twinkle_kit-0.3.0/src/twinkle/model/transformers/strategy/sequence_parallel/linear_attention_sp.py +278 -0
- twinkle_kit-0.3.0/src/twinkle/model/transformers/strategy/sequence_parallel/utils.py +383 -0
- twinkle_kit-0.3.0/src/twinkle/model/transformers/strategy/sequence_parallel/zigzag_ring_attn.py +642 -0
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle/model/transformers/transformers.py +421 -172
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle/patch/base.py +2 -2
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle/patch/megatron_peft.py +5 -0
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle/patch/vllm_lora_weights.py +4 -0
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle/patch/vllm_moe_loader.py +10 -0
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle/preprocessor/__init__.py +4 -1
- twinkle_kit-0.3.0/src/twinkle/preprocessor/base.py +42 -0
- twinkle_kit-0.3.0/src/twinkle/preprocessor/dpo.py +76 -0
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle/preprocessor/llm.py +56 -13
- twinkle_kit-0.3.0/src/twinkle/preprocessor/mm.py +67 -0
- twinkle_kit-0.3.0/src/twinkle/preprocessor/olympiad_bench.py +132 -0
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle/processor/__init__.py +0 -1
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle/processor/base.py +289 -34
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle/reward/__init__.py +2 -0
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle/reward/gsm8k.py +35 -9
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle/reward/math_reward.py +15 -5
- twinkle_kit-0.3.0/src/twinkle/reward/mm_reward.py +70 -0
- twinkle_kit-0.3.0/src/twinkle/reward/olympiad_bench.py +420 -0
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle/sampler/__init__.py +0 -2
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle/sampler/base.py +6 -1
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle/sampler/vllm_sampler/vllm_engine.py +220 -145
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle/sampler/vllm_sampler/vllm_sampler.py +149 -127
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle/sampler/vllm_sampler/vllm_worker_extension.py +185 -72
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle/server/__main__.py +8 -24
- twinkle_kit-0.3.0/src/twinkle/server/common/__init__.py +13 -0
- twinkle_kit-0.3.0/src/twinkle/server/common/checkpoint_factory.py +39 -0
- {twinkle_kit-0.2.dev0/src/twinkle/server/tinker → twinkle_kit-0.3.0/src/twinkle/server}/common/datum.py +55 -12
- {twinkle_kit-0.2.dev0/src/twinkle/server/tinker → twinkle_kit-0.3.0/src/twinkle/server}/common/router.py +3 -1
- twinkle_kit-0.3.0/src/twinkle/server/common/tinker_checkpoint.py +134 -0
- twinkle_kit-0.2.dev0/src/twinkle/server/twinkle/common/io_utils.py → twinkle_kit-0.3.0/src/twinkle/server/common/twinkle_checkpoint.py +22 -118
- twinkle_kit-0.3.0/src/twinkle/server/gateway/__init__.py +3 -0
- {twinkle_kit-0.2.dev0/src/twinkle/server/tinker → twinkle_kit-0.3.0/src/twinkle/server/gateway}/proxy.py +25 -58
- twinkle_kit-0.3.0/src/twinkle/server/gateway/server.py +119 -0
- twinkle_kit-0.3.0/src/twinkle/server/gateway/tinker_gateway_handlers.py +273 -0
- twinkle_kit-0.3.0/src/twinkle/server/gateway/twinkle_gateway_handlers.py +141 -0
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle/server/launcher.py +78 -102
- twinkle_kit-0.3.0/src/twinkle/server/model/__init__.py +3 -0
- twinkle_kit-0.3.0/src/twinkle/server/model/app.py +193 -0
- twinkle_kit-0.3.0/src/twinkle/server/model/backends/common.py +274 -0
- twinkle_kit-0.3.0/src/twinkle/server/model/backends/megatron_model.py +110 -0
- twinkle_kit-0.3.0/src/twinkle/server/model/backends/transformers_model.py +105 -0
- twinkle_kit-0.3.0/src/twinkle/server/model/tinker_handlers.py +306 -0
- twinkle_kit-0.3.0/src/twinkle/server/model/twinkle_handlers.py +607 -0
- twinkle_kit-0.3.0/src/twinkle/server/processor/__init__.py +3 -0
- twinkle_kit-0.3.0/src/twinkle/server/processor/app.py +138 -0
- twinkle_kit-0.3.0/src/twinkle/server/processor/twinkle_handlers.py +130 -0
- twinkle_kit-0.3.0/src/twinkle/server/sampler/__init__.py +3 -0
- twinkle_kit-0.3.0/src/twinkle/server/sampler/app.py +144 -0
- twinkle_kit-0.3.0/src/twinkle/server/sampler/tinker_handlers.py +129 -0
- twinkle_kit-0.3.0/src/twinkle/server/sampler/twinkle_handlers.py +196 -0
- twinkle_kit-0.3.0/src/twinkle/server/utils/__init__.py +7 -0
- twinkle_kit-0.2.dev0/src/twinkle/server/utils/io_utils.py → twinkle_kit-0.3.0/src/twinkle/server/utils/checkpoint_base.py +40 -35
- twinkle_kit-0.3.0/src/twinkle/server/utils/lifecycle/__init__.py +8 -0
- twinkle_kit-0.3.0/src/twinkle/server/utils/lifecycle/adapter.py +109 -0
- twinkle_kit-0.3.0/src/twinkle/server/utils/lifecycle/base.py +328 -0
- twinkle_kit-0.3.0/src/twinkle/server/utils/lifecycle/processor.py +109 -0
- twinkle_kit-0.3.0/src/twinkle/server/utils/metrics.py +267 -0
- twinkle_kit-0.3.0/src/twinkle/server/utils/ray_serve_patch.py +141 -0
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle/server/utils/state/server_state.py +91 -102
- twinkle_kit-0.3.0/src/twinkle/server/utils/task_queue/__init__.py +26 -0
- twinkle_kit-0.3.0/src/twinkle/server/utils/task_queue/config.py +79 -0
- twinkle_kit-0.3.0/src/twinkle/server/utils/task_queue/mixin.py +362 -0
- {twinkle_kit-0.2.dev0/src/twinkle/server/utils → twinkle_kit-0.3.0/src/twinkle/server/utils/task_queue}/rate_limiter.py +23 -54
- twinkle_kit-0.3.0/src/twinkle/server/utils/task_queue/types.py +49 -0
- twinkle_kit-0.3.0/src/twinkle/server/utils/task_queue/worker.py +292 -0
- twinkle_kit-0.3.0/src/twinkle/server/utils/template_utils.py +47 -0
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle/server/utils/validation.py +25 -6
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle/template/__init__.py +1 -1
- twinkle_kit-0.3.0/src/twinkle/template/base.py +743 -0
- twinkle_kit-0.3.0/src/twinkle/template/qwen3_5_vl.py +166 -0
- twinkle_kit-0.3.0/src/twinkle/template/utils.py +370 -0
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle/utils/__init__.py +3 -1
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle/utils/device_mesh.py +49 -1
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle/utils/framework.py +16 -3
- twinkle_kit-0.3.0/src/twinkle/utils/grad_clip.py +240 -0
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle/utils/logger.py +72 -5
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle/utils/parallel.py +17 -2
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle/utils/torch_utils.py +67 -8
- twinkle_kit-0.3.0/src/twinkle/utils/vision_tools.py +54 -0
- twinkle_kit-0.3.0/src/twinkle/utils/zmq_utils.py +26 -0
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle/version.py +1 -1
- twinkle_kit-0.3.0/src/twinkle_client/__init__.py +75 -0
- {twinkle_kit-0.2.dev0/src/twinkle/server/twinkle → twinkle_kit-0.3.0/src/twinkle_client}/common/serialize.py +12 -5
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle_client/dataloader/dataloader.py +47 -14
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle_client/dataset/base.py +25 -19
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle_client/dataset/iterable_dataset.py +8 -15
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle_client/dataset/iterable_packing_dataset.py +7 -14
- twinkle_kit-0.3.0/src/twinkle_client/dataset/lazy_dataset.py +146 -0
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle_client/dataset/packing_dataset.py +6 -13
- twinkle_kit-0.3.0/src/twinkle_client/http/__init__.py +19 -0
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle_client/http/http_utils.py +23 -6
- twinkle_kit-0.3.0/src/twinkle_client/http/utils.py +64 -0
- twinkle_kit-0.3.0/src/twinkle_client/manager.py +403 -0
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle_client/model/multi_lora_transformers.py +118 -78
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle_client/processor/base.py +4 -11
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle_client/sampler/vllm_sampler.py +21 -37
- twinkle_kit-0.3.0/src/twinkle_client/types/__init__.py +93 -0
- twinkle_kit-0.3.0/src/twinkle_client/types/checkpoint.py +23 -0
- twinkle_kit-0.3.0/src/twinkle_client/types/model.py +327 -0
- twinkle_kit-0.3.0/src/twinkle_client/types/processor.py +46 -0
- twinkle_kit-0.3.0/src/twinkle_client/types/sampler.py +74 -0
- twinkle_kit-0.3.0/src/twinkle_client/types/server.py +42 -0
- twinkle_kit-0.3.0/src/twinkle_client/types/session.py +24 -0
- twinkle_kit-0.3.0/src/twinkle_client/types/training.py +91 -0
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle_client/utils/patch_tinker.py +4 -3
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle_kit.egg-info/PKG-INFO +73 -39
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle_kit.egg-info/SOURCES.txt +70 -56
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle_kit.egg-info/requires.txt +2 -1
- twinkle_kit-0.2.dev0/src/twinkle/checkpoint_engine/hccl_checkpoint_engine.py +0 -439
- twinkle_kit-0.2.dev0/src/twinkle/dataset/lazy_dataset.py +0 -43
- twinkle_kit-0.2.dev0/src/twinkle/loss/cross_entropy.py +0 -20
- twinkle_kit-0.2.dev0/src/twinkle/loss/vocab_parallel_cross_entropy.py +0 -20
- twinkle_kit-0.2.dev0/src/twinkle/model/megatron/args.py +0 -692
- twinkle_kit-0.2.dev0/src/twinkle/model/megatron/model/__init__.py +0 -4
- twinkle_kit-0.2.dev0/src/twinkle/model/megatron/model/constant.py +0 -39
- twinkle_kit-0.2.dev0/src/twinkle/model/megatron/model/gpt_bridge.py +0 -1651
- twinkle_kit-0.2.dev0/src/twinkle/model/megatron/model/gpt_model.py +0 -465
- twinkle_kit-0.2.dev0/src/twinkle/model/megatron/model/gpts/__init__.py +0 -14
- twinkle_kit-0.2.dev0/src/twinkle/model/megatron/model/gpts/qwen3_next.py +0 -512
- twinkle_kit-0.2.dev0/src/twinkle/model/megatron/model/mm_gpt_model.py +0 -136
- twinkle_kit-0.2.dev0/src/twinkle/model/megatron/model/mm_gpts/__init__.py +0 -2
- twinkle_kit-0.2.dev0/src/twinkle/model/megatron/model/mm_gpts/qwen.py +0 -121
- twinkle_kit-0.2.dev0/src/twinkle/model/megatron/model/mm_gpts/qwen3_5.py +0 -174
- twinkle_kit-0.2.dev0/src/twinkle/model/megatron/model/mm_gpts/qwen3_vl.py +0 -450
- twinkle_kit-0.2.dev0/src/twinkle/model/megatron/model/mm_gpts/utils.py +0 -83
- twinkle_kit-0.2.dev0/src/twinkle/model/megatron/model/register.py +0 -98
- twinkle_kit-0.2.dev0/src/twinkle/model/megatron/model/rope.py +0 -175
- twinkle_kit-0.2.dev0/src/twinkle/model/megatron/multi_lora_megatron.py +0 -272
- twinkle_kit-0.2.dev0/src/twinkle/model/megatron/strategy/megatron.py +0 -176
- twinkle_kit-0.2.dev0/src/twinkle/model/megatron/tuners/__init__.py +0 -8
- twinkle_kit-0.2.dev0/src/twinkle/model/megatron/tuners/lora.py +0 -583
- twinkle_kit-0.2.dev0/src/twinkle/model/megatron/tuners/utils.py +0 -206
- twinkle_kit-0.2.dev0/src/twinkle/model/megatron/utils/__init__.py +0 -2
- twinkle_kit-0.2.dev0/src/twinkle/model/megatron/utils/config.py +0 -201
- twinkle_kit-0.2.dev0/src/twinkle/model/megatron/utils/utils.py +0 -32
- twinkle_kit-0.2.dev0/src/twinkle/model/transformers/moe/__init__.py +0 -4
- twinkle_kit-0.2.dev0/src/twinkle/model/transformers/moe/expert_parallel.py +0 -379
- twinkle_kit-0.2.dev0/src/twinkle/model/transformers/strategy/native_fsdp.py +0 -178
- twinkle_kit-0.2.dev0/src/twinkle/preprocessor/base.py +0 -15
- twinkle_kit-0.2.dev0/src/twinkle/processor/grpo.py +0 -34
- twinkle_kit-0.2.dev0/src/twinkle/sampler/torch_sampler/__init__.py +0 -1
- twinkle_kit-0.2.dev0/src/twinkle/sampler/torch_sampler/torch_sampler.py +0 -157
- twinkle_kit-0.2.dev0/src/twinkle/sampler/torch_sampler/transformers_engine.py +0 -298
- twinkle_kit-0.2.dev0/src/twinkle/server/tinker/__init__.py +0 -18
- twinkle_kit-0.2.dev0/src/twinkle/server/tinker/common/__init__.py +0 -3
- twinkle_kit-0.2.dev0/src/twinkle/server/tinker/common/compat_base.py +0 -151
- twinkle_kit-0.2.dev0/src/twinkle/server/tinker/common/io_utils.py +0 -181
- twinkle_kit-0.2.dev0/src/twinkle/server/tinker/common/megatron_model.py +0 -189
- twinkle_kit-0.2.dev0/src/twinkle/server/tinker/common/transformers_model.py +0 -148
- twinkle_kit-0.2.dev0/src/twinkle/server/tinker/model.py +0 -659
- twinkle_kit-0.2.dev0/src/twinkle/server/tinker/sampler.py +0 -251
- twinkle_kit-0.2.dev0/src/twinkle/server/tinker/server.py +0 -613
- twinkle_kit-0.2.dev0/src/twinkle/server/twinkle/__init__.py +0 -20
- twinkle_kit-0.2.dev0/src/twinkle/server/twinkle/model.py +0 -584
- twinkle_kit-0.2.dev0/src/twinkle/server/twinkle/processor.py +0 -188
- twinkle_kit-0.2.dev0/src/twinkle/server/twinkle/sampler.py +0 -308
- twinkle_kit-0.2.dev0/src/twinkle/server/twinkle/server.py +0 -270
- twinkle_kit-0.2.dev0/src/twinkle/server/utils/__init__.py +0 -7
- twinkle_kit-0.2.dev0/src/twinkle/server/utils/adapter_manager.py +0 -341
- twinkle_kit-0.2.dev0/src/twinkle/server/utils/task_queue.py +0 -570
- twinkle_kit-0.2.dev0/src/twinkle/template/base.py +0 -441
- twinkle_kit-0.2.dev0/src/twinkle/template/qwen3_vl.py +0 -120
- twinkle_kit-0.2.dev0/src/twinkle/template/utils.py +0 -222
- twinkle_kit-0.2.dev0/src/twinkle/utils/grad_clip.py +0 -95
- twinkle_kit-0.2.dev0/src/twinkle_client/__init__.py +0 -51
- twinkle_kit-0.2.dev0/src/twinkle_client/dataset/lazy_dataset.py +0 -95
- twinkle_kit-0.2.dev0/src/twinkle_client/http/__init__.py +0 -22
- twinkle_kit-0.2.dev0/src/twinkle_client/http/heartbeat.py +0 -177
- twinkle_kit-0.2.dev0/src/twinkle_client/http/utils.py +0 -68
- twinkle_kit-0.2.dev0/src/twinkle_client/manager.py +0 -294
- twinkle_kit-0.2.dev0/src/twinkle_client/processor/grpo.py +0 -48
- twinkle_kit-0.2.dev0/src/twinkle_client/reward/__init__.py +0 -11
- twinkle_kit-0.2.dev0/src/twinkle_client/reward/math_reward.py +0 -56
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/LICENSE +0 -0
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle/__init__.py +0 -0
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle/advantage/__init__.py +0 -0
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle/advantage/base.py +0 -0
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle/advantage/grpo.py +0 -0
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle/advantage/rloo.py +0 -0
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle/checkpoint_engine/__init__.py +0 -0
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle/checkpoint_engine/nccl_checkpoint_engine.py +0 -0
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle/data_format/__init__.py +0 -0
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle/data_format/input_feature.py +0 -0
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle/dataloader/__init__.py +0 -0
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle/dataset/__init__.py +0 -0
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle/gym/__init__.py +0 -0
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle/gym/base.py +0 -0
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle/hub/__init__.py +0 -0
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle/infra/_ray/__init__.py +0 -0
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle/infra/_ray/resource_manager.py +0 -0
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle/kernel/base.py +0 -0
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle/kernel/function.py +0 -0
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle/kernel/layer.py +0 -0
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle/kernel/registry.py +0 -0
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle/loss/chunked_cross_entropy.py +0 -0
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle/loss_scale/__init__.py +0 -0
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle/loss_scale/base.py +0 -0
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle/metric/base.py +0 -0
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle/metric/completion_and_reward.py +0 -0
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle/model/__init__.py +0 -0
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle/model/megatron/__init__.py +0 -0
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle/model/megatron/strategy/__init__.py +0 -0
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle/model/transformers/__init__.py +0 -0
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle/model/transformers/strategy/__init__.py +0 -0
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle/module/__init__.py +0 -0
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle/module/scheduler/__init__.py +0 -0
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle/module/scheduler/cosine_warmup.py +0 -0
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle/module/scheduler/linear_warmup.py +0 -0
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle/patch/__init__.py +0 -0
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle/reward/base.py +0 -0
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle/reward/format_reward.py +0 -0
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle/sampler/base_engine.py +0 -0
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle/sampler/vllm_sampler/__init__.py +0 -0
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle/server/__init__.py +0 -0
- {twinkle_kit-0.2.dev0/src/twinkle/server/twinkle/common → twinkle_kit-0.3.0/src/twinkle/server/model/backends}/__init__.py +0 -0
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle/server/utils/device_utils.py +0 -0
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle/server/utils/state/__init__.py +0 -0
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle/server/utils/state/base.py +0 -0
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle/server/utils/state/config_manager.py +0 -0
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle/server/utils/state/future_manager.py +0 -0
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle/server/utils/state/model_manager.py +0 -0
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle/server/utils/state/models.py +0 -0
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle/server/utils/state/sampling_manager.py +0 -0
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle/server/utils/state/session_manager.py +0 -0
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle/utils/dequantizer.py +0 -0
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle/utils/import_utils.py +0 -0
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle/utils/loader.py +0 -0
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle/utils/network.py +0 -0
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle/utils/platforms/__init__.py +0 -0
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle/utils/platforms/base.py +0 -0
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle/utils/platforms/gpu.py +0 -0
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle/utils/platforms/mps.py +0 -0
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle/utils/platforms/npu.py +0 -0
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle/utils/safetensors.py +0 -0
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle/utils/transformers_utils.py +0 -0
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle/utils/unsafe.py +0 -0
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle/utils/utils.py +0 -0
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle_client/dataloader/__init__.py +0 -0
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle_client/dataset/__init__.py +0 -0
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle_client/model/__init__.py +0 -0
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle_client/processor/__init__.py +0 -0
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle_client/sampler/__init__.py +0 -0
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle_kit.egg-info/dependency_links.txt +0 -0
- {twinkle_kit-0.2.dev0 → twinkle_kit-0.3.0}/src/twinkle_kit.egg-info/top_level.txt +0 -0
|
@@ -1,13 +1,13 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: twinkle-kit
|
|
3
|
-
Version: 0.2.dev0
|
|
3
|
+
Version: 0.3.0
|
|
4
4
|
Summary: Training API for large language models with efficient data handling and advanced optimization techniques.
|
|
5
5
|
Author-email: ModelScope <contact@modelscope.cn>
|
|
6
6
|
Requires-Python: <3.13,>=3.11
|
|
7
7
|
Description-Content-Type: text/markdown
|
|
8
8
|
License-File: LICENSE
|
|
9
|
-
Requires-Dist: datasets<4.0,>=3.0
|
|
10
9
|
Requires-Dist: numpy<2.3.0,>=2.0.0
|
|
10
|
+
Requires-Dist: datasets
|
|
11
11
|
Requires-Dist: omegaconf<3.0.0,>=2.3.0
|
|
12
12
|
Requires-Dist: fastapi
|
|
13
13
|
Requires-Dist: modelscope[framework]>=1.34.0
|
|
@@ -23,6 +23,7 @@ Requires-Dist: kernels; extra == "kernels"
|
|
|
23
23
|
Provides-Extra: megatron
|
|
24
24
|
Requires-Dist: megatron-core>=0.12.0; extra == "megatron"
|
|
25
25
|
Requires-Dist: transformer-engine[pytorch]; extra == "megatron"
|
|
26
|
+
Requires-Dist: mcore_bridge; extra == "megatron"
|
|
26
27
|
Provides-Extra: vllm
|
|
27
28
|
Requires-Dist: vllm>=0.11; extra == "vllm"
|
|
28
29
|
Provides-Extra: ray
|
|
@@ -45,7 +46,7 @@ Dynamic: license-file
|
|
|
45
46
|
|
|
46
47
|
<p align="center">
|
|
47
48
|
<img src="assets/slogan.png" width="200"/>
|
|
48
|
-
|
|
49
|
+
</p>
|
|
49
50
|
<p align="center">
|
|
50
51
|
by <a href="https://modelscope.cn/home">ModelScope</a>
|
|
51
52
|
<br>
|
|
@@ -62,7 +63,7 @@ by <a href="https://modelscope.cn/home">ModelScope</a>
|
|
|
62
63
|
</p>
|
|
63
64
|
|
|
64
65
|
<p align="center">
|
|
65
|
-
<a href="https://twinkle-kit.readthedocs.io/en/latest/">English Documentation</a>   |   <a href="https://twinkle-kit.readthedocs.io/zh-cn/latest/">中文文档</a>  
|
|
66
|
+
<a href="https://twinkle-kit.readthedocs.io/en/latest/">English Documentation</a>   |   <a href="https://twinkle-kit.readthedocs.io/zh-cn/latest/">中文文档</a>   |   <a href="https://modelscope.github.io/twinkle-web/">Twinkle Web</a>  
|
|
66
67
|
</p>
|
|
67
68
|
|
|
68
69
|
## ✨ What is Twinkle?
|
|
@@ -73,8 +74,8 @@ with `torchrun`, or scaling training across Ray clusters,
|
|
|
73
74
|
Twinkle✨ eliminates infrastructure friction by encapsulating
|
|
74
75
|
training logic into standardized APIs. Beyond simple
|
|
75
76
|
abstraction, Twinkle✨ serves as a robust backend and gateway to enable serverless Training-as-a-Service (TaaS).
|
|
76
|
-
It offers interfaces that constitute a _superset_ of
|
|
77
|
-
thereby making it possible to access a Twinkle✨ training service via Tinker client or native Twinkle✨ client
|
|
77
|
+
It offers interfaces that constitute a _superset_ of [Tinker](https://thinkingmachines.ai/tinker/) APIs,
|
|
78
|
+
thereby making it possible to access a Twinkle✨ training service via Tinker client or the native Twinkle✨ client,
|
|
78
79
|
which offers more functionalities.
|
|
79
80
|
|
|
80
81
|
🧩 <b>Decoupled Architecture</b>: Standardized Interfaces, backward compatible with Tinker APIs.<br>
|
|
@@ -82,13 +83,13 @@ which offers more functionalities.
|
|
|
82
83
|
🔌 <b>Versatile Backends</b>: Transformers / Megatron.<br>
|
|
83
84
|
👥 <b>Multi-Tenancy Training Service</b>: Train multiple LoRAs that share one base model deployment.<br>
|
|
84
85
|
|
|
85
|
-
Note: Twinkle✨is built by the team behind [ms-swift](https://github.com/modelscope/ms-swift), and
|
|
86
|
+
Note: Twinkle✨ is built by the team behind [ms-swift](https://github.com/modelscope/ms-swift), and
|
|
86
87
|
we expect the two projects to evolve together. We expect that some fundamental components in Twinkle✨ will likely
|
|
87
88
|
be reused in [ms-swift](https://github.com/modelscope/ms-swift).
|
|
88
89
|
|
|
89
|
-
| Twinkle Wechat Group |
|
|
90
|
-
|
|
91
|
-
| <img src="assets/wechat.jpg" width="200" height="200"> |
|
|
90
|
+
[Discord Group](https://discord.gg/yeN59wxjwe) | Twinkle Wechat Group |
|
|
91
|
+
:------------------------------------------------------:|:------------------------------------------------------:|
|
|
92
|
+
<img src="assets/discord_qr.jpg" width="200" height="200"> | <img src="assets/wechat.jpg" width="200" height="200"> |
|
|
92
93
|
|
|
93
94
|
## Installation
|
|
94
95
|
|
|
@@ -106,32 +107,67 @@ cd twinkle
|
|
|
106
107
|
pip install -e .
|
|
107
108
|
```
|
|
108
109
|
|
|
110
|
+
### Use our docker image:
|
|
111
|
+
|
|
112
|
+
```text
|
|
113
|
+
modelscope-registry.cn-hangzhou.cr.aliyuncs.com/modelscope-repo/modelscope:twinkle-0.2.1
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
If you need to use Twinkle's Client, you can use our one-click installation script:
|
|
117
|
+
|
|
118
|
+
```shell
|
|
119
|
+
# Mac or Linux
|
|
120
|
+
sh INSTALL_CLIENT.sh
|
|
121
|
+
# Windows: open with PowerShell
|
|
122
|
+
Set-ExecutionPolicy -ExecutionPolicy RemoteSigned -Scope CurrentUser
|
|
123
|
+
.\INSTALL_CLIENT.ps1
|
|
124
|
+
```
|
|
125
|
+
|
|
126
|
+
This script will download or utilize conda to create a virtual environment called `twinkle-client`, which can be directly used for remote training.
|
|
127
|
+
|
|
128
|
+
If you need to install Megatron-related dependencies, you can use the following script:
|
|
129
|
+
|
|
130
|
+
```shell
|
|
131
|
+
sh INSTALL_MEGATRON.sh
|
|
132
|
+
```
|
|
133
|
+
|
|
109
134
|
## Tutorials
|
|
110
135
|
|
|
111
|
-
| Training Type
|
|
112
|
-
|
|
|
113
|
-
| FSDP finetuning
|
|
114
|
-
| FSDP MoE finetuning
|
|
115
|
-
|
|
|
116
|
-
|
|
|
117
|
-
|
|
|
118
|
-
| pp/tp/cp finetuning
|
|
119
|
-
|
|
|
120
|
-
|
|
|
121
|
-
|
|
|
122
|
-
|
|
|
123
|
-
|
|
|
136
|
+
| Training Type | Model Framework | Cookbook Path |
|
|
137
|
+
| ------------------------------------ | --------------- | ----------------------------------------------------- |
|
|
138
|
+
| FSDP finetuning | transformers | [Script](cookbook/transformers/fsdp2.py) |
|
|
139
|
+
| FSDP MoE finetuning | transformers | [Script](cookbook/transformers/fsdp2_moe.py) |
|
|
140
|
+
| EP FSDP MoE finetuning | transformers | [Script](cookbook/transformers/ep_fsdp_qwen3_moe.py) |
|
|
141
|
+
| SP FSDP finetuning | transformers | [Script](cookbook/transformers/sp_fsdp_dense.py) |
|
|
142
|
+
| pp/tp/cp finetuning | megatron | [Script](cookbook/megatron/tp.py) |
|
|
143
|
+
| pp/tp/cp MoE finetuning | megatron | [Script](cookbook/megatron/tp_moe.py) |
|
|
144
|
+
| Multimodal FSDP finetuning | transformers | [Script](cookbook/mm/fsdp2.py) |
|
|
145
|
+
| GRPO RL training | megatron | [Script](cookbook/rl/grpo.py) |
|
|
146
|
+
| GRPO Multimodal RL training | megatron | [Script](cookbook/rl/grpo_mm.py) |
|
|
147
|
+
| GRPO Math RL training | megatron | [Script](cookbook/rl/short_math_grpo.py) |
|
|
148
|
+
| DPO full-parameter training | transformers | [Script](cookbook/rl/dpo_full.py) |
|
|
149
|
+
| DPO LoRA training | transformers | [Script](cookbook/rl/dpo_lora.py) |
|
|
150
|
+
| DPO multi-LoRA training | transformers | [Script](cookbook/rl/dpo_multi_lora.py) |
|
|
151
|
+
| GKD on-policy distillation | megatron | [Script](cookbook/rl/gkd_on_policy.py) |
|
|
152
|
+
| GKD off-policy distillation | megatron | [Script](cookbook/rl/gkd_off_policy.py) |
|
|
153
|
+
| Tinker client finetuning (self-host) | transformers | [Script](cookbook/client/tinker/self_host) |
|
|
154
|
+
| Tinker client finetuning (ModelScope) | transformers | [Script](cookbook/client/tinker/modelscope) |
|
|
155
|
+
| Twinkle client finetuning (self-host) | transformers | [Script](cookbook/client/twinkle/self_host) |
|
|
156
|
+
| Twinkle client finetuning (ModelScope) | transformers | [Script](cookbook/client/twinkle/modelscope) |
|
|
157
|
+
| Server startup scripts | transformers/megatron | [Script](cookbook/client/server) |
|
|
124
158
|
|
|
125
159
|
## Changelog
|
|
126
|
-
|
|
160
|
+
- 🎉2026-04-27 Support the `padding_free` operation for sft/dpo/grpo/gkd, use `set_processor('InputProcessor', padding_free=True)` to train with it.
|
|
161
|
+
- 🎉2026-04-22 The ModelScope service has been deployed to [Qwen/Qwen3.6-27B](https://www.modelscope.cn/models/Qwen/Qwen3.6-27B) with a new release 0.2.1.
|
|
162
|
+
- 🎉2026-04-14 The ModelScope service has been deployed to [Qwen/Qwen3.6-35B-A3B](https://www.modelscope.cn/models/Qwen/Qwen3.6-35B-A3B) with a new release 0.2.0.
|
|
163
|
+
- 🎉2026-03-28 Support DPO training with both Transformers and Megatron backends. See [dpo_full.py](cookbook/rl/dpo_full.py) and [dpo_lora.py](cookbook/rl/dpo_lora.py).
|
|
164
|
+
- 🎉2026-03-24 Twinkle Web site is now live at https://modelscope.github.io/twinkle-web/
|
|
165
|
+
- 🎉2026-03-19 Support GKD training, please refer to this [cookbook](cookbook/rl/gkd_on_policy.py).
|
|
127
166
|
- 🎉2026-02-13 Initial version of Twinkle✨ released, including SFT/PT/RL support for text models.
|
|
128
|
-
We also made available serverless training capabilities on [ModelScope](https://modelscope.cn) via
|
|
129
|
-
Tinker-compatible APIs.
|
|
130
167
|
|
|
131
168
|
## Training as a Service on ModelScope
|
|
132
169
|
|
|
133
|
-
We are rolling out training service built atop Twinkle✨ on ModelScope.
|
|
134
|
-
sign up for free access by joining the [Twinkle-Explorers](https://modelscope.cn/organization/twinkle-explorers) organization, and
|
|
170
|
+
We are rolling out a training service built atop Twinkle✨ on ModelScope. You may
|
|
135
171
|
train via API endpoint `base_url=https://www.modelscope.cn/twinkle`. For more details, please refer to
|
|
136
172
|
our [documentation](docs/source_en/Usage%20Guide/Train-as-a-Service.md).
|
|
137
173
|
|
|
@@ -140,7 +176,7 @@ our [documentation](docs/source_en/Usage%20Guide/Train-as-a-Service.md).
|
|
|
140
176
|
| Hardware Environment | Notes |
|
|
141
177
|
| -------------------- | ---------------------------------------------------------------- |
|
|
142
178
|
| Nvidia GPUs | ✅ Support for BF16/Flash-Attn may be incomplete in earlier GPUs |
|
|
143
|
-
| Ascend NPU | ✅ Some operators may not supported
|
|
179
|
+
| Ascend NPU | ✅ Some operators may not be supported |
|
|
144
180
|
| PPU | ✅ |
|
|
145
181
|
| CPU | Supports partial components like dataset, dataloader |
|
|
146
182
|
|
|
@@ -153,15 +189,15 @@ supported on Twinkle✨ framework.
|
|
|
153
189
|
> For serverless training service accessed via `base_url=https://www.modelscope.cn/twinkle`, it
|
|
154
190
|
> is currently provided via the Tinker-compatible APIs. We will be rolling out services that support
|
|
155
191
|
> both Tinker APIs, as well as the full-fledged Twinkle✨ native APIs. The serverless endpoint is backed
|
|
156
|
-
> by one training base at a time, and currently it is [Qwen3-
|
|
192
|
+
> by one training base at a time, and currently it is [Qwen3.6-27B](https://modelscope.cn/models/Qwen/Qwen3.6-27B).
|
|
157
193
|
|
|
158
194
|
| Model Type | Model ID on [ModelScope](https://modelscope.cn) | Model Size | Requires | Support Megatron | HF Model ID |
|
|
159
195
|
|---------------------|-----------------------------------------------------------------------------------------------------------------|:---------------------------------------:|----------------------|:----------------:|:---------------------------------------------------------------------------------------------------------:|
|
|
160
196
|
| qwen3 series | [Qwen/Qwen3-14B-Base](https://modelscope.cn/models/Qwen/Qwen3-14B-Base) | 0.6B/1.7B/4B/8B/14B | transformers>=4.51 | ✔ | [Qwen/Qwen3-14B-Base](https://huggingface.co/Qwen/Qwen3-14B-Base) |
|
|
161
197
|
| | [Qwen/Qwen3-32B](https://modelscope.cn/models/Qwen/Qwen3-32B) | 0.6B/1.7B/4B/8B/14B/32B | transformers>=4.51 | ✔ | [Qwen/Qwen3-32B](https://huggingface.co/Qwen/Qwen3-32B) |
|
|
162
198
|
| qwen3_moe series | [Qwen/Qwen3-30B-A3B-Base](https://modelscope.cn/models/Qwen/Qwen3-30B-A3B-Base) | 30B-A3B/A3B-Base,235B-A22B | transformers>=4.51 | ✔ | [Qwen/Qwen3-30B-A3B-Base](https://huggingface.co/Qwen/Qwen3-30B-A3B-Base) |
|
|
163
|
-
| qwen3.5 moe series | [Qwen/Qwen3.5-35B-A3B](https://www.modelscope.cn/models/Qwen/Qwen3.5-35B-A3B) | 35B-A3B,122B-A10B, etc. | transformers>=5.
|
|
164
|
-
| qwen3.5 series | [Qwen/Qwen3.5-9B](https://www.modelscope.cn/models/Qwen/Qwen3.5-9B) | 2B ~ 27B | transformers>=5.
|
|
199
|
+
| qwen3.5 moe series | [Qwen/Qwen3.5-35B-A3B](https://www.modelscope.cn/models/Qwen/Qwen3.5-35B-A3B) | 35B-A3B,122B-A10B, etc. | transformers>=5.2.0 | ✔ | [Qwen/Qwen3.5-35B-A3B](https://huggingface.co/Qwen/Qwen3.5-35B-A3B) |
|
|
200
|
+
| qwen3.5 series | [Qwen/Qwen3.5-9B](https://www.modelscope.cn/models/Qwen/Qwen3.5-9B) | 2B ~ 27B | transformers>=5.2.0 | ✔ | [Qwen/Qwen3.5-9B](https://huggingface.co/Qwen/Qwen3.5-9B) |
|
|
165
201
|
| qwen2 series | [Qwen/Qwen2-0.5B-Instruct](https://modelscope.cn/models/Qwen/Qwen2-0.5B-Instruct) | 0.5B/1.5B/7B/72B | transformers>=4.37 | ✔ | [Qwen/Qwen2-0.5B-Instruct](https://huggingface.co/Qwen/Qwen2-0.5B-Instruct) |
|
|
166
202
|
| | [Qwen/Qwen2-1.5B](https://modelscope.cn/models/Qwen/Qwen2-1.5B) | 0.5B/1.5B/7B/72B | transformers>=4.37 | ✔ | [Qwen/Qwen2-1.5B](https://huggingface.co/Qwen/Qwen2-1.5B) |
|
|
167
203
|
| | [Qwen/Qwen2.5-1.5B-Instruct](https://modelscope.cn/models/Qwen/Qwen2.5-1.5B-Instruct) | 0.5B/1.5B/3B/7B/14B/32B/72B | transformers>=4.37 | ✔ | [Qwen/Qwen2.5-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct) |
|
|
@@ -178,8 +214,6 @@ supported on Twinkle✨ framework.
|
|
|
178
214
|
| | [deepseek-ai/DeepSeek-R1](https://modelscope.cn/models/deepseek-ai/DeepSeek-R1) | - | transformers>=4.39.3 | ✔ | [deepseek-ai/DeepSeek-R1](https://huggingface.co/deepseek-ai/DeepSeek-R1) |
|
|
179
215
|
| deepSeek-r1-distill | [deepseek-ai/DeepSeek-R1-Distill-Qwen-7B](https://modelscope.cn/models/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B) | 1.5B/7B/14B/32B | transformers>=4.37 | ✔ | [deepseek-ai/DeepSeek-R1-Distill-Qwen-7B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B) |
|
|
180
216
|
|
|
181
|
-
For more detailed model support list 👉 [Quick Start](docs/source_en/Usage%20Guide/Quick-Start.md)
|
|
182
|
-
|
|
183
217
|
## Sample Code
|
|
184
218
|
|
|
185
219
|
Below are some of the capabilities demonstrated in the example code. For a complete introduction to training capabilities,
|
|
@@ -204,11 +238,11 @@ twinkle.initialize(mode='ray', groups=device_group, global_device_mesh=device_me
|
|
|
204
238
|
|
|
205
239
|
def train():
|
|
206
240
|
# to load model from Hugging Face, use 'hf://...'
|
|
207
|
-
base_model = 'ms://Qwen/Qwen3.
|
|
241
|
+
base_model = 'ms://Qwen/Qwen3.6-27B'
|
|
208
242
|
# 1000 samples
|
|
209
243
|
dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(1000)))
|
|
210
244
|
# Set template to prepare encoding
|
|
211
|
-
dataset.set_template('
|
|
245
|
+
dataset.set_template('Qwen3_5Template', model_id=base_model)
|
|
212
246
|
# Preprocess the dataset to standard format
|
|
213
247
|
dataset.map(SelfCognitionProcessor('twinkle LLM', 'ModelScope Community'))
|
|
214
248
|
# Encode dataset
|
|
@@ -258,15 +292,15 @@ from twinkle import init_tinker_client
|
|
|
258
292
|
from twinkle.dataloader import DataLoader
|
|
259
293
|
from twinkle.dataset import Dataset, DatasetMeta
|
|
260
294
|
from twinkle.preprocessor import SelfCognitionProcessor
|
|
261
|
-
from twinkle.server.
|
|
295
|
+
from twinkle.server.common import input_feature_to_datum
|
|
262
296
|
|
|
263
|
-
base_model = 'ms://Qwen/Qwen3-
|
|
297
|
+
base_model = 'ms://Qwen/Qwen3.6-27B'
|
|
264
298
|
base_url='your-base-url'
|
|
265
299
|
api_key='your-api-key'
|
|
266
300
|
|
|
267
301
|
# Use twinkle dataset to load the data
|
|
268
302
|
dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(500)))
|
|
269
|
-
dataset.set_template('
|
|
303
|
+
dataset.set_template('Qwen3_5Template', model_id=base_model, max_length=256)
|
|
270
304
|
dataset.map(SelfCognitionProcessor('twinkle Model', 'ModelScope Team'), load_from_cache_file=False)
|
|
271
305
|
dataset.encode(batched=True, load_from_cache_file=False)
|
|
272
306
|
dataloader = DataLoader(dataset=dataset, batch_size=8)
|
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
<p align="center">
|
|
4
4
|
<img src="assets/slogan.png" width="200"/>
|
|
5
|
-
|
|
5
|
+
</p>
|
|
6
6
|
<p align="center">
|
|
7
7
|
by <a href="https://modelscope.cn/home">ModelScope</a>
|
|
8
8
|
<br>
|
|
@@ -19,7 +19,7 @@ by <a href="https://modelscope.cn/home">ModelScope</a>
|
|
|
19
19
|
</p>
|
|
20
20
|
|
|
21
21
|
<p align="center">
|
|
22
|
-
<a href="https://twinkle-kit.readthedocs.io/en/latest/">English Documentation</a>   |   <a href="https://twinkle-kit.readthedocs.io/zh-cn/latest/">中文文档</a>  
|
|
22
|
+
<a href="https://twinkle-kit.readthedocs.io/en/latest/">English Documentation</a>   |   <a href="https://twinkle-kit.readthedocs.io/zh-cn/latest/">中文文档</a>   |   <a href="https://modelscope.github.io/twinkle-web/">Twinkle Web</a>  
|
|
23
23
|
</p>
|
|
24
24
|
|
|
25
25
|
## ✨ What is Twinkle?
|
|
@@ -30,8 +30,8 @@ with `torchrun`, or scaling training across Ray clusters,
|
|
|
30
30
|
Twinkle✨ eliminates infrastructure friction by encapsulating
|
|
31
31
|
training logic into standardized APIs. Beyond simple
|
|
32
32
|
abstraction, Twinkle✨ serves as a robust backend and gateway to enable serverless Training-as-a-Service (TaaS).
|
|
33
|
-
It offers interfaces that constitute a _superset_ of
|
|
34
|
-
thereby making it possible to access a Twinkle✨ training service via Tinker client or native Twinkle✨ client
|
|
33
|
+
It offers interfaces that constitute a _superset_ of [Tinker](https://thinkingmachines.ai/tinker/) APIs,
|
|
34
|
+
thereby making it possible to access a Twinkle✨ training service via Tinker client or the native Twinkle✨ client,
|
|
35
35
|
which offers more functionalities.
|
|
36
36
|
|
|
37
37
|
🧩 <b>Decoupled Architecture</b>: Standardized Interfaces, backward compatible with Tinker APIs.<br>
|
|
@@ -39,13 +39,13 @@ which offers more functionalities.
|
|
|
39
39
|
🔌 <b>Versatile Backends</b>: Transformers / Megatron.<br>
|
|
40
40
|
👥 <b>Multi-Tenancy Training Service</b>: Train multiple LoRAs that share one base model deployment.<br>
|
|
41
41
|
|
|
42
|
-
Note: Twinkle✨is built by the team behind [ms-swift](https://github.com/modelscope/ms-swift), and
|
|
42
|
+
Note: Twinkle✨ is built by the team behind [ms-swift](https://github.com/modelscope/ms-swift), and
|
|
43
43
|
we expect the two projects to evolve together. We expect that some fundamental components in Twinkle✨ will likely
|
|
44
44
|
be reused in [ms-swift](https://github.com/modelscope/ms-swift).
|
|
45
45
|
|
|
46
|
-
| Twinkle Wechat Group |
|
|
47
|
-
|
|
48
|
-
| <img src="assets/wechat.jpg" width="200" height="200"> |
|
|
46
|
+
[Discord Group](https://discord.gg/yeN59wxjwe) | Twinkle Wechat Group |
|
|
47
|
+
:------------------------------------------------------:|:------------------------------------------------------:|
|
|
48
|
+
<img src="assets/discord_qr.jpg" width="200" height="200"> | <img src="assets/wechat.jpg" width="200" height="200"> |
|
|
49
49
|
|
|
50
50
|
## Installation
|
|
51
51
|
|
|
@@ -63,32 +63,67 @@ cd twinkle
|
|
|
63
63
|
pip install -e .
|
|
64
64
|
```
|
|
65
65
|
|
|
66
|
+
### Use our docker image:
|
|
67
|
+
|
|
68
|
+
```text
|
|
69
|
+
modelscope-registry.cn-hangzhou.cr.aliyuncs.com/modelscope-repo/modelscope:twinkle-0.2.1
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
If you need to use Twinkle's Client, you can use our one-click installation script:
|
|
73
|
+
|
|
74
|
+
```shell
|
|
75
|
+
# Mac or Linux
|
|
76
|
+
sh INSTALL_CLIENT.sh
|
|
77
|
+
# Windows: open with PowerShell
|
|
78
|
+
Set-ExecutionPolicy -ExecutionPolicy RemoteSigned -Scope CurrentUser
|
|
79
|
+
.\INSTALL_CLIENT.ps1
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
This script will download or utilize conda to create a virtual environment called `twinkle-client`, which can be directly used for remote training.
|
|
83
|
+
|
|
84
|
+
If you need to install Megatron-related dependencies, you can use the following script:
|
|
85
|
+
|
|
86
|
+
```shell
|
|
87
|
+
sh INSTALL_MEGATRON.sh
|
|
88
|
+
```
|
|
89
|
+
|
|
66
90
|
## Tutorials
|
|
67
91
|
|
|
68
|
-
| Training Type
|
|
69
|
-
|
|
|
70
|
-
| FSDP finetuning
|
|
71
|
-
| FSDP MoE finetuning
|
|
72
|
-
|
|
|
73
|
-
|
|
|
74
|
-
|
|
|
75
|
-
| pp/tp/cp finetuning
|
|
76
|
-
|
|
|
77
|
-
|
|
|
78
|
-
|
|
|
79
|
-
|
|
|
80
|
-
|
|
|
92
|
+
| Training Type | Model Framework | Cookbook Path |
|
|
93
|
+
| ------------------------------------ | --------------- | ----------------------------------------------------- |
|
|
94
|
+
| FSDP finetuning | transformers | [Script](cookbook/transformers/fsdp2.py) |
|
|
95
|
+
| FSDP MoE finetuning | transformers | [Script](cookbook/transformers/fsdp2_moe.py) |
|
|
96
|
+
| EP FSDP MoE finetuning | transformers | [Script](cookbook/transformers/ep_fsdp_qwen3_moe.py) |
|
|
97
|
+
| SP FSDP finetuning | transformers | [Script](cookbook/transformers/sp_fsdp_dense.py) |
|
|
98
|
+
| pp/tp/cp finetuning | megatron | [Script](cookbook/megatron/tp.py) |
|
|
99
|
+
| pp/tp/cp MoE finetuning | megatron | [Script](cookbook/megatron/tp_moe.py) |
|
|
100
|
+
| Multimodal FSDP finetuning | transformers | [Script](cookbook/mm/fsdp2.py) |
|
|
101
|
+
| GRPO RL training | megatron | [Script](cookbook/rl/grpo.py) |
|
|
102
|
+
| GRPO Multimodal RL training | megatron | [Script](cookbook/rl/grpo_mm.py) |
|
|
103
|
+
| GRPO Math RL training | megatron | [Script](cookbook/rl/short_math_grpo.py) |
|
|
104
|
+
| DPO full-parameter training | transformers | [Script](cookbook/rl/dpo_full.py) |
|
|
105
|
+
| DPO LoRA training | transformers | [Script](cookbook/rl/dpo_lora.py) |
|
|
106
|
+
| DPO multi-LoRA training | transformers | [Script](cookbook/rl/dpo_multi_lora.py) |
|
|
107
|
+
| GKD on-policy distillation | megatron | [Script](cookbook/rl/gkd_on_policy.py) |
|
|
108
|
+
| GKD off-policy distillation | megatron | [Script](cookbook/rl/gkd_off_policy.py) |
|
|
109
|
+
| Tinker client finetuning (self-host) | transformers | [Script](cookbook/client/tinker/self_host) |
|
|
110
|
+
| Tinker client finetuning (ModelScope) | transformers | [Script](cookbook/client/tinker/modelscope) |
|
|
111
|
+
| Twinkle client finetuning (self-host) | transformers | [Script](cookbook/client/twinkle/self_host) |
|
|
112
|
+
| Twinkle client finetuning (ModelScope) | transformers | [Script](cookbook/client/twinkle/modelscope) |
|
|
113
|
+
| Server startup scripts | transformers/megatron | [Script](cookbook/client/server) |
|
|
81
114
|
|
|
82
115
|
## Changelog
|
|
83
|
-
|
|
116
|
+
- 🎉2026-04-27 Support the `padding_free` operation for sft/dpo/grpo/gkd; use `set_processor('InputProcessor', padding_free=True)` to train with it.
|
|
117
|
+
- 🎉2026-04-22 The ModelScope service has been deployed to [Qwen/Qwen3.6-27B](https://www.modelscope.cn/models/Qwen/Qwen3.6-27B) with a new release 0.2.1.
|
|
118
|
+
- 🎉2026-04-14 The ModelScope service has been deployed to [Qwen/Qwen3.6-35B-A3B](https://www.modelscope.cn/models/Qwen/Qwen3.6-35B-A3B) with a new release 0.2.0.
|
|
119
|
+
- 🎉2026-03-28 Support DPO training with both Transformers and Megatron backends. See [dpo_full.py](cookbook/rl/dpo_full.py) and [dpo_lora.py](cookbook/rl/dpo_lora.py).
|
|
120
|
+
- 🎉2026-03-24 Twinkle Web site is now live at https://modelscope.github.io/twinkle-web/
|
|
121
|
+
- 🎉2026-03-19 Support GKD training; please refer to this [cookbook](cookbook/rl/gkd_on_policy.py).
|
|
84
122
|
- 🎉2026-02-13 Initial version of Twinkle✨ released, including SFT/PT/RL support for text models.
|
|
85
|
-
We also made available serverless training capabilities on [ModelScope](https://modelscope.cn) via
|
|
86
|
-
Tinker-compatible APIs.
|
|
87
123
|
|
|
88
124
|
## Training as a Service on ModelScope
|
|
89
125
|
|
|
90
|
-
We are rolling out training service built atop Twinkle✨ on ModelScope.
|
|
91
|
-
sign up for free access by joining the [Twinkle-Explorers](https://modelscope.cn/organization/twinkle-explorers) organization, and
|
|
126
|
+
We are rolling out training service built atop Twinkle✨ on ModelScope. You may
|
|
92
127
|
train via API endpoint `base_url=https://www.modelscope.cn/twinkle`. For more details, please refer to
|
|
93
128
|
our [documentation](docs/source_en/Usage%20Guide/Train-as-a-Service.md).
|
|
94
129
|
|
|
@@ -97,7 +132,7 @@ our [documentation](docs/source_en/Usage%20Guide/Train-as-a-Service.md).
|
|
|
97
132
|
| Hardware Environment | Notes |
|
|
98
133
|
| -------------------- | ---------------------------------------------------------------- |
|
|
99
134
|
| Nvidia GPUs | ✅ Support for BF16/Flash-Attn may be incomplete in earlier GPUs |
|
|
100
|
-
| Ascend NPU | ✅ Some operators may not supported
|
|
135
|
+
| Ascend NPU | ✅ Some operators may not be supported |
|
|
101
136
|
| PPU | ✅ |
|
|
102
137
|
| CPU | Supports partial components like dataset, dataloader |
|
|
103
138
|
|
|
@@ -110,15 +145,15 @@ supported on Twinkle✨ framework.
|
|
|
110
145
|
> For serverless training service accessed via `base_url=https://www.modelscope.cn/twinkle`, it
|
|
111
146
|
> is currently provided via the Tinker-compatible APIs. We will be rolling out services that support
|
|
112
147
|
> both Tinker APIs, as well as the full-fledged Twinkle✨ native APIs. The serverless endpoint is backed
|
|
113
|
-
> by one training base at a time, and currently it is [Qwen3-
|
|
148
|
+
> by one training base at a time, and currently it is [Qwen3.6-27B](https://modelscope.cn/models/Qwen/Qwen3.6-27B).
|
|
114
149
|
|
|
115
150
|
| Model Type | Model ID on [ModelScope](https://modelscope.cn) | Model Size | Requires | Support Megatron | HF Model ID |
|
|
116
151
|
|---------------------|-----------------------------------------------------------------------------------------------------------------|:---------------------------------------:|----------------------|:----------------:|:---------------------------------------------------------------------------------------------------------:|
|
|
117
152
|
| qwen3 series | [Qwen/Qwen3-14B-Base](https://modelscope.cn/models/Qwen/Qwen3-14B-Base) | 0.6B/1.7B/4B/8B/14B | transformers>=4.51 | ✔ | [Qwen/Qwen3-14B-Base](https://huggingface.co/Qwen/Qwen3-14B-Base) |
|
|
118
153
|
| | [Qwen/Qwen3-32B](https://modelscope.cn/models/Qwen/Qwen3-32B) | 0.6B/1.7B/4B/8B/14B/32B | transformers>=4.51 | ✔ | [Qwen/Qwen3-32B](https://huggingface.co/Qwen/Qwen3-32B) |
|
|
119
154
|
| qwen3_moe series | [Qwen/Qwen3-30B-A3B-Base](https://modelscope.cn/models/Qwen/Qwen3-30B-A3B-Base) | 30B-A3B/A3B-Base,235B-A22B | transformers>=4.51 | ✔ | [Qwen/Qwen3-30B-A3B-Base](https://huggingface.co/Qwen/Qwen3-30B-A3B-Base) |
|
|
120
|
-
| qwen3.5 moe series | [Qwen/Qwen3.5-35B-A3B](https://www.modelscope.cn/models/Qwen/Qwen3.5-35B-A3B) | 35B-A3B,122B-A10B, etc. | transformers>=5.
|
|
121
|
-
| qwen3.5 series | [Qwen/Qwen3.5-9B](https://www.modelscope.cn/models/Qwen/Qwen3.5-9B) | 2B ~ 27B | transformers>=5.
|
|
155
|
+
| qwen3.5 moe series | [Qwen/Qwen3.5-35B-A3B](https://www.modelscope.cn/models/Qwen/Qwen3.5-35B-A3B) | 35B-A3B,122B-A10B, etc. | transformers>=5.2.0 | ✔ | [Qwen/Qwen3.5-35B-A3B](https://huggingface.co/Qwen/Qwen3.5-35B-A3B) |
|
|
156
|
+
| qwen3.5 series | [Qwen/Qwen3.5-9B](https://www.modelscope.cn/models/Qwen/Qwen3.5-9B) | 2B ~ 27B | transformers>=5.2.0 | ✔ | [Qwen/Qwen3.5-9B](https://huggingface.co/Qwen/Qwen3.5-9B) |
|
|
122
157
|
| qwen2 series | [Qwen/Qwen2-0.5B-Instruct](https://modelscope.cn/models/Qwen/Qwen2-0.5B-Instruct) | 0.5B/1.5B/7B/72B | transformers>=4.37 | ✔ | [Qwen/Qwen2-0.5B-Instruct](https://huggingface.co/Qwen/Qwen2-0.5B-Instruct) |
|
|
123
158
|
| | [Qwen/Qwen2-1.5B](https://modelscope.cn/models/Qwen/Qwen2-1.5B) | 0.5B/1.5B/7B/72B | transformers>=4.37 | ✔ | [Qwen/Qwen2-1.5B](https://huggingface.co/Qwen/Qwen2-1.5B) |
|
|
124
159
|
| | [Qwen/Qwen2.5-1.5B-Instruct](https://modelscope.cn/models/Qwen/Qwen2.5-1.5B-Instruct) | 0.5B/1.5B/3B/7B/14B/32B/72B | transformers>=4.37 | ✔ | [Qwen/Qwen2.5-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct) |
|
|
@@ -135,8 +170,6 @@ supported on Twinkle✨ framework.
|
|
|
135
170
|
| | [deepseek-ai/DeepSeek-R1](https://modelscope.cn/models/deepseek-ai/DeepSeek-R1) | - | transformers>=4.39.3 | ✔ | [deepseek-ai/DeepSeek-R1](https://huggingface.co/deepseek-ai/DeepSeek-R1) |
|
|
136
171
|
| deepSeek-r1-distill | [deepseek-ai/DeepSeek-R1-Distill-Qwen-7B](https://modelscope.cn/models/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B) | 1.5B/7B/14B/32B | transformers>=4.37 | ✔ | [deepseek-ai/DeepSeek-R1-Distill-Qwen-7B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B) |
|
|
137
172
|
|
|
138
|
-
For more detailed model support list 👉 [Quick Start](docs/source_en/Usage%20Guide/Quick-Start.md)
|
|
139
|
-
|
|
140
173
|
## Sample Code
|
|
141
174
|
|
|
142
175
|
Below are some of the capabilities demonstrated in the example code. For a complete introduction to training capabilities,
|
|
@@ -161,11 +194,11 @@ twinkle.initialize(mode='ray', groups=device_group, global_device_mesh=device_me
|
|
|
161
194
|
|
|
162
195
|
def train():
|
|
163
196
|
# to load model from Hugging Face, use 'hf://...'
|
|
164
|
-
base_model = 'ms://Qwen/Qwen3.
|
|
197
|
+
base_model = 'ms://Qwen/Qwen3.6-27B'
|
|
165
198
|
# 1000 samples
|
|
166
199
|
dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(1000)))
|
|
167
200
|
# Set template to prepare encoding
|
|
168
|
-
dataset.set_template('
|
|
201
|
+
dataset.set_template('Qwen3_5Template', model_id=base_model)
|
|
169
202
|
# Preprocess the dataset to standard format
|
|
170
203
|
dataset.map(SelfCognitionProcessor('twinkle LLM', 'ModelScope Community'))
|
|
171
204
|
# Encode dataset
|
|
@@ -215,15 +248,15 @@ from twinkle import init_tinker_client
|
|
|
215
248
|
from twinkle.dataloader import DataLoader
|
|
216
249
|
from twinkle.dataset import Dataset, DatasetMeta
|
|
217
250
|
from twinkle.preprocessor import SelfCognitionProcessor
|
|
218
|
-
from twinkle.server.
|
|
251
|
+
from twinkle.server.common import input_feature_to_datum
|
|
219
252
|
|
|
220
|
-
base_model = 'ms://Qwen/Qwen3-
|
|
253
|
+
base_model = 'ms://Qwen/Qwen3.6-27B'
|
|
221
254
|
base_url='your-base-url'
|
|
222
255
|
api_key='your-api-key'
|
|
223
256
|
|
|
224
257
|
# Use twinkle dataset to load the data
|
|
225
258
|
dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(500)))
|
|
226
|
-
dataset.set_template('
|
|
259
|
+
dataset.set_template('Qwen3_5Template', model_id=base_model, max_length=256)
|
|
227
260
|
dataset.map(SelfCognitionProcessor('twinkle Model', 'ModelScope Team'), load_from_cache_file=False)
|
|
228
261
|
dataset.encode(batched=True, load_from_cache_file=False)
|
|
229
262
|
dataloader = DataLoader(dataset=dataset, batch_size=8)
|
|
@@ -1,13 +1,13 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "twinkle-kit"
|
|
3
|
-
version = "0.
|
|
3
|
+
version = "0.3.0"
|
|
4
4
|
description = "Training API for large language models with efficient data handling and advanced optimization techniques."
|
|
5
5
|
readme = "README.md"
|
|
6
6
|
authors = [{ name = "ModelScope", email = "contact@modelscope.cn" }]
|
|
7
7
|
requires-python = ">=3.11,<3.13"
|
|
8
8
|
dependencies = [
|
|
9
|
-
"datasets>=3.0,<4.0",
|
|
10
9
|
"numpy>=2.0.0,<2.3.0",
|
|
10
|
+
"datasets",
|
|
11
11
|
"omegaconf>=2.3.0,<3.0.0",
|
|
12
12
|
"fastapi",
|
|
13
13
|
"modelscope[framework]>=1.34.0",
|
|
@@ -23,7 +23,7 @@ transformers = [
|
|
|
23
23
|
"torchvision",
|
|
24
24
|
]
|
|
25
25
|
kernels = ["kernels"]
|
|
26
|
-
megatron = ["megatron-core>=0.12.0", "transformer-engine[pytorch]"]
|
|
26
|
+
megatron = ["megatron-core>=0.12.0", "transformer-engine[pytorch]", "mcore_bridge"]
|
|
27
27
|
vllm = ["vllm>=0.11"]
|
|
28
28
|
ray = ["ray[serve]"]
|
|
29
29
|
tinker = ["tinker==0.14.0"]
|
|
@@ -22,7 +22,7 @@ ignore-words-list = patten,nd,ty,mot,hist,formating,winn,gool,datas,wan,confids
|
|
|
22
22
|
[flake8]
|
|
23
23
|
max-line-length = 120
|
|
24
24
|
select = B,E,F,P,T4,W,B9
|
|
25
|
-
ignore = F401,F403,F405,F821,W503,E251,W504,E126
|
|
25
|
+
ignore = F401,F403,F405,F821,W503,E251,W504,E126,E125
|
|
26
26
|
exclude = docs/src,*.pyi,.git,peft.py
|
|
27
27
|
|
|
28
28
|
[darglint]
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
# Copyright (c) ModelScope Contributors. All rights reserved.
|
|
2
2
|
# Adapted from https://github.com/volcengine/verl/blob/main/verl/checkpoint_engine/base.py
|
|
3
3
|
from abc import ABC, abstractmethod
|
|
4
|
-
from typing import TYPE_CHECKING, Any, AsyncGenerator, Generator, TypedDict
|
|
4
|
+
from typing import TYPE_CHECKING, Any, AsyncGenerator, Generator, Optional, TypedDict
|
|
5
5
|
|
|
6
6
|
if TYPE_CHECKING:
|
|
7
7
|
import torch
|
|
@@ -38,6 +38,8 @@ class CheckpointEngine(ABC):
|
|
|
38
38
|
>>> engine.finalize()
|
|
39
39
|
"""
|
|
40
40
|
|
|
41
|
+
rank: Optional[int] = None
|
|
42
|
+
|
|
41
43
|
@abstractmethod
|
|
42
44
|
def prepare(self) -> dict[str, Any]:
|
|
43
45
|
"""Prepare the checkpoint engine before weight synchronization.
|