torchtitan-npu 0.2.2.post1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70) hide show
  1. torchtitan_npu-0.2.2.post1/LICENSE +29 -0
  2. torchtitan_npu-0.2.2.post1/NOTICE +128 -0
  3. torchtitan_npu-0.2.2.post1/PKG-INFO +247 -0
  4. torchtitan_npu-0.2.2.post1/README.md +187 -0
  5. torchtitan_npu-0.2.2.post1/assets/version.txt +1 -0
  6. torchtitan_npu-0.2.2.post1/pyproject.toml +80 -0
  7. torchtitan_npu-0.2.2.post1/setup.cfg +4 -0
  8. torchtitan_npu-0.2.2.post1/torchtitan_npu/__init__.py +82 -0
  9. torchtitan_npu-0.2.2.post1/torchtitan_npu/config/custom_config.py +145 -0
  10. torchtitan_npu-0.2.2.post1/torchtitan_npu/converters/__init__.py +31 -0
  11. torchtitan_npu-0.2.2.post1/torchtitan_npu/converters/base_converter.py +35 -0
  12. torchtitan_npu-0.2.2.post1/torchtitan_npu/converters/convert_utils.py +173 -0
  13. torchtitan_npu-0.2.2.post1/torchtitan_npu/converters/features/bypass_triton_codegen.py +68 -0
  14. torchtitan_npu-0.2.2.post1/torchtitan_npu/converters/kernels/dsa.py +329 -0
  15. torchtitan_npu-0.2.2.post1/torchtitan_npu/converters/kernels/expert_parallel.py +148 -0
  16. torchtitan_npu-0.2.2.post1/torchtitan_npu/converters/kernels/fusion_attention.py +78 -0
  17. torchtitan_npu-0.2.2.post1/torchtitan_npu/converters/kernels/gmm.py +237 -0
  18. torchtitan_npu-0.2.2.post1/torchtitan_npu/converters/kernels/permutation.py +58 -0
  19. torchtitan_npu-0.2.2.post1/torchtitan_npu/converters/kernels/permute.py +65 -0
  20. torchtitan_npu-0.2.2.post1/torchtitan_npu/converters/kernels/quant_gmm.py +200 -0
  21. torchtitan_npu-0.2.2.post1/torchtitan_npu/converters/kernels/quant_linear.py +194 -0
  22. torchtitan_npu-0.2.2.post1/torchtitan_npu/converters/kernels/rms_norm.py +65 -0
  23. torchtitan_npu-0.2.2.post1/torchtitan_npu/converters/kernels/rope.py +159 -0
  24. torchtitan_npu-0.2.2.post1/torchtitan_npu/converters/npu_converter.py +61 -0
  25. torchtitan_npu-0.2.2.post1/torchtitan_npu/converters/quant_converter.py +142 -0
  26. torchtitan_npu-0.2.2.post1/torchtitan_npu/converters/registry.py +76 -0
  27. torchtitan_npu-0.2.2.post1/torchtitan_npu/distributed/context_parallel/dsa_cp.py +267 -0
  28. torchtitan_npu-0.2.2.post1/torchtitan_npu/distributed/context_parallel/ulysses_cp.py +111 -0
  29. torchtitan_npu-0.2.2.post1/torchtitan_npu/entry.py +121 -0
  30. torchtitan_npu-0.2.2.post1/torchtitan_npu/models/deepseek_v3/__init__.py +113 -0
  31. torchtitan_npu-0.2.2.post1/torchtitan_npu/models/deepseek_v3/infra/parallelize.py +59 -0
  32. torchtitan_npu-0.2.2.post1/torchtitan_npu/models/deepseek_v3/model/args.py +19 -0
  33. torchtitan_npu-0.2.2.post1/torchtitan_npu/models/deepseek_v3/model/state_dict_adapter.py +54 -0
  34. torchtitan_npu-0.2.2.post1/torchtitan_npu/models/deepseek_v32/__init__.py +166 -0
  35. torchtitan_npu-0.2.2.post1/torchtitan_npu/models/deepseek_v32/infra/parallelize.py +788 -0
  36. torchtitan_npu-0.2.2.post1/torchtitan_npu/models/deepseek_v32/model/args.py +35 -0
  37. torchtitan_npu-0.2.2.post1/torchtitan_npu/models/deepseek_v32/model/model.py +908 -0
  38. torchtitan_npu-0.2.2.post1/torchtitan_npu/models/deepseek_v32/model/state_dict_adapter.py +172 -0
  39. torchtitan_npu-0.2.2.post1/torchtitan_npu/models/llama4/model/state_dict_adapter.py +82 -0
  40. torchtitan_npu-0.2.2.post1/torchtitan_npu/patches/distributed/cp_input_sharding.py +57 -0
  41. torchtitan_npu-0.2.2.post1/torchtitan_npu/patches/distributed/custom_context_parallel.py +155 -0
  42. torchtitan_npu-0.2.2.post1/torchtitan_npu/patches/distributed/mtp_context_parallel.py +130 -0
  43. torchtitan_npu-0.2.2.post1/torchtitan_npu/patches/distributed/utils.py +86 -0
  44. torchtitan_npu-0.2.2.post1/torchtitan_npu/patches/optimizer/muon_optimizer.py +434 -0
  45. torchtitan_npu-0.2.2.post1/torchtitan_npu/patches/optimizer/swap_optimizer.py +397 -0
  46. torchtitan_npu-0.2.2.post1/torchtitan_npu/patches/quantization/quant_config.py +84 -0
  47. torchtitan_npu-0.2.2.post1/torchtitan_npu/patches/quantization/quantize.py +130 -0
  48. torchtitan_npu-0.2.2.post1/torchtitan_npu/patches/tools/metrics.py +136 -0
  49. torchtitan_npu-0.2.2.post1/torchtitan_npu/patches/torch/_inductor/graph.py +125 -0
  50. torchtitan_npu-0.2.2.post1/torchtitan_npu/patches/torch/clip_grad.py +164 -0
  51. torchtitan_npu-0.2.2.post1/torchtitan_npu/patches/torch/micro_pipeline_tp.py +154 -0
  52. torchtitan_npu-0.2.2.post1/torchtitan_npu/patches/torch/pipelining.py +109 -0
  53. torchtitan_npu-0.2.2.post1/torchtitan_npu/patches/torch/testing/_internal/distributed/fake_pg.py +37 -0
  54. torchtitan_npu-0.2.2.post1/torchtitan_npu/patches/torch_npu/_inductor/lowering.py +33 -0
  55. torchtitan_npu-0.2.2.post1/torchtitan_npu/patches/torch_npu/_meta_registrations.py +70 -0
  56. torchtitan_npu-0.2.2.post1/torchtitan_npu/patches/torch_npu/custom_shardings.py +318 -0
  57. torchtitan_npu-0.2.2.post1/torchtitan_npu/patches/torchtitan/activation_checkpoint.py +88 -0
  58. torchtitan_npu-0.2.2.post1/torchtitan_npu/patches/torchtitan/hf_datasets.py +83 -0
  59. torchtitan_npu-0.2.2.post1/torchtitan_npu/patches/torchtitan/loss.py +66 -0
  60. torchtitan_npu-0.2.2.post1/torchtitan_npu/patches/torchtitan/lr_scheduler.py +76 -0
  61. torchtitan_npu-0.2.2.post1/torchtitan_npu/tools/checkpoint_patch.py +371 -0
  62. torchtitan_npu-0.2.2.post1/torchtitan_npu/tools/flight_recorder.py +46 -0
  63. torchtitan_npu-0.2.2.post1/torchtitan_npu/tools/profiling.py +132 -0
  64. torchtitan_npu-0.2.2.post1/torchtitan_npu/tools/weight_utils.py +247 -0
  65. torchtitan_npu-0.2.2.post1/torchtitan_npu/train.py +158 -0
  66. torchtitan_npu-0.2.2.post1/torchtitan_npu.egg-info/PKG-INFO +247 -0
  67. torchtitan_npu-0.2.2.post1/torchtitan_npu.egg-info/SOURCES.txt +68 -0
  68. torchtitan_npu-0.2.2.post1/torchtitan_npu.egg-info/dependency_links.txt +1 -0
  69. torchtitan_npu-0.2.2.post1/torchtitan_npu.egg-info/requires.txt +16 -0
  70. torchtitan_npu-0.2.2.post1/torchtitan_npu.egg-info/top_level.txt +1 -0
@@ -0,0 +1,29 @@
1
+ BSD 3-Clause License
2
+
3
+ Copyright (c) 2026 Huawei Technologies Co., Ltd. All rights reserved.
4
+ (c) Meta Platforms, Inc. and affiliates.
5
+
6
+ Redistribution and use in source and binary forms, with or without modification,
7
+ are permitted provided that the following conditions are met:
8
+
9
+ 1. Redistributions of source code must retain the above copyright notice, this list
10
+ of conditions and the following disclaimer.
11
+
12
+ 2. Redistributions in binary form must reproduce the above copyright notice, this
13
+ list of conditions and the following disclaimer in the documentation
14
+ and/or other materials provided with the distribution.
15
+
16
+ 3. Neither the name of the copyright holder nor the names of its contributors may
17
+ be used to endorse or promote products derived from this software without specific
18
+ prior written permission.
19
+
20
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS “AS IS” AND ANY
21
+ EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
22
+ OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
23
+ SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
24
+ INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
25
+ TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
26
+ BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27
+ CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
28
+ ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
29
+ DAMAGE.
@@ -0,0 +1,128 @@
1
+ This project constitutes a derivative work of pytorch/torchtitan.
2
+
3
+ Original Project: torchtitan
4
+ Original License: BSD-3-Clause
5
+ Copyright (c) Meta Platforms, Inc. and affiliates.
6
+
7
+ The original license text of torchtitan can be found below:
8
+ ```
9
+ BSD 3-Clause License
10
+
11
+ (c) Meta Platforms, Inc. and affiliates.
12
+
13
+ Redistribution and use in source and binary forms, with or without modification,
14
+ are permitted provided that the following conditions are met:
15
+
16
+ 1. Redistributions of source code must retain the above copyright notice,this list
17
+ of conditions and the following disclaimer.
18
+
19
+ 2. Redistributions in binary form must reproduce the above copyright notice, this
20
+ list of conditions and the following disclaimer in the documentation
21
+ and/or other materials provided with the distribution.
22
+
23
+ 3. Neither the name of the copyright holder nor the names of its contributors may
24
+ be used to endorse or promote products derived from this software without specific
25
+ prior written permission.
26
+
27
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS “AS IS” AND ANY
28
+ EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
29
+ OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
30
+ SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
31
+ INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
32
+ TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
33
+ BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34
+ CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
35
+ ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
36
+ DAMAGE.
37
+ ```
38
+
39
+ Original Project: pytorch
40
+ Original License: BSD-3-Clause
41
+
42
+ The original license text of pytorch can be found below:
43
+ ```
44
+ From PyTorch:
45
+
46
+ Copyright (c) 2016- Facebook, Inc (Adam Paszke)
47
+ Copyright (c) 2014- Facebook, Inc (Soumith Chintala)
48
+ Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert)
49
+ Copyright (c) 2012-2014 Deepmind Technologies (Koray Kavukcuoglu)
50
+ Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu)
51
+ Copyright (c) 2011-2013 NYU (Clement Farabet)
52
+ Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston)
53
+ Copyright (c) 2006 Idiap Research Institute (Samy Bengio)
54
+ Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz)
55
+
56
+ From Caffe2:
57
+
58
+ Copyright (c) 2016-present, Facebook Inc. All rights reserved.
59
+
60
+ All contributions by Facebook:
61
+ Copyright (c) 2016 Facebook Inc.
62
+
63
+ All contributions by Google:
64
+ Copyright (c) 2015 Google Inc.
65
+ All rights reserved.
66
+
67
+ All contributions by Yangqing Jia:
68
+ Copyright (c) 2015 Yangqing Jia
69
+ All rights reserved.
70
+
71
+ All contributions by Kakao Brain:
72
+ Copyright 2019-2020 Kakao Brain
73
+
74
+ All contributions by Cruise LLC:
75
+ Copyright (c) 2022 Cruise LLC.
76
+ All rights reserved.
77
+
78
+ All contributions by Tri Dao:
79
+ Copyright (c) 2024 Tri Dao.
80
+ All rights reserved.
81
+
82
+ All contributions by Arm:
83
+ Copyright (c) 2021, 2023-2025 Arm Limited and/or its affiliates
84
+
85
+ All contributions from Caffe:
86
+ Copyright(c) 2013, 2014, 2015, the respective contributors
87
+ All rights reserved.
88
+
89
+ All other contributions:
90
+ Copyright(c) 2015, 2016 the respective contributors
91
+ All rights reserved.
92
+
93
+ Caffe2 uses a copyright model similar to Caffe: each contributor holds
94
+ copyright over their contributions to Caffe2. The project versioning records
95
+ all such contribution and copyright details. If a contributor wants to further
96
+ mark their specific copyright on a particular contribution, they should
97
+ indicate their copyright solely in the commit message of the change when it is
98
+ committed.
99
+
100
+ All rights reserved.
101
+
102
+ Redistribution and use in source and binary forms, with or without
103
+ modification, are permitted provided that the following conditions are met:
104
+
105
+ 1. Redistributions of source code must retain the above copyright
106
+ notice, this list of conditions and the following disclaimer.
107
+
108
+ 2. Redistributions in binary form must reproduce the above copyright
109
+ notice, this list of conditions and the following disclaimer in the
110
+ documentation and/or other materials provided with the distribution.
111
+
112
+ 3. Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories America
113
+ and IDIAP Research Institute nor the names of its contributors may be
114
+ used to endorse or promote products derived from this software without
115
+ specific prior written permission.
116
+
117
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
118
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
119
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
120
+ ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
121
+ LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
122
+ CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
123
+ SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
124
+ INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
125
+ CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
126
+ ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
127
+ POSSIBILITY OF SUCH DAMAGE.
128
+ ```
@@ -0,0 +1,247 @@
1
+ Metadata-Version: 2.4
2
+ Name: torchtitan_npu
3
+ Version: 0.2.2.post1
4
+ Summary: Ascend End-to-End Large Model Training Adaptation Framework Based on torchtitan
5
+ Author: Huawei Hisilicon
6
+ License: BSD 3-Clause License
7
+
8
+ Copyright (c) 2026 Huawei Technologies Co., Ltd. All rights reserved.
9
+ (c) Meta Platforms, Inc. and affiliates.
10
+
11
+ Redistribution and use in source and binary forms, with or without modification,
12
+ are permitted provided that the following conditions are met:
13
+
14
+ 1. Redistributions of source code must retain the above copyright notice, this list
15
+ of conditions and the following disclaimer.
16
+
17
+ 2. Redistributions in binary form must reproduce the above copyright notice, this
18
+ list of conditions and the following disclaimer in the documentation
19
+ and/or other materials provided with the distribution.
20
+
21
+ 3. Neither the name of the copyright holder nor the names of its contributors may
22
+ be used to endorse or promote products derived from this software without specific
23
+ prior written permission.
24
+
25
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS “AS IS” AND ANY
26
+ EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
27
+ OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
28
+ SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
29
+ INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
30
+ TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
31
+ BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
32
+ CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
33
+ ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
34
+ DAMAGE.
35
+
36
+ Project-URL: Homepage, https://gitcode.com/cann/torchtitan-npu
37
+ Project-URL: Documentation, https://gitcode.com/cann/torchtitan-npu/tree/master/docs
38
+ Project-URL: Issues, https://gitcode.com/cann/torchtitan-npu/issues
39
+ Project-URL: Repository, https://gitcode.com/cann/torchtitan-npu.git
40
+ Keywords: pytorch,training,llm,npu,ascend
41
+ Requires-Python: >=3.10
42
+ Description-Content-Type: text/markdown
43
+ License-File: LICENSE
44
+ License-File: NOTICE
45
+ Requires-Dist: torch
46
+ Requires-Dist: torch_npu
47
+ Provides-Extra: testing
48
+ Requires-Dist: pytest>=7.0; extra == "testing"
49
+ Requires-Dist: pytest-html>=4.0; extra == "testing"
50
+ Requires-Dist: pytest-cov>=4.0; extra == "testing"
51
+ Requires-Dist: pytest-mock>=3.10; extra == "testing"
52
+ Requires-Dist: hypothesis>=6.0; extra == "testing"
53
+ Provides-Extra: dev
54
+ Requires-Dist: pre-commit; extra == "dev"
55
+ Requires-Dist: pytest; extra == "dev"
56
+ Requires-Dist: pytest-cov; extra == "dev"
57
+ Requires-Dist: expecttest; extra == "dev"
58
+ Requires-Dist: pyrefly==0.45.1; extra == "dev"
59
+ Dynamic: license-file
60
+
61
+ <div align="center">
62
+
63
+ # torchtitan-npu
64
+
65
+ <h4>基于 torchtitan 的昇腾全流程大模型训练适配插件</h4>
66
+
67
+ [![Documentation](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](#特性支持概览)
68
+ [![license](https://img.shields.io/badge/license-BSD_3--Clause-lightgrey.svg)](https://gitcode.com/cann/torchtitan-npu/blob/master/LICENSE)
69
+ [![contributing](https://img.shields.io/badge/CONTRIBUTING-teal)](https://gitcode.com/cann/torchtitan-npu/blob/master/CONTRIBUTING.md)
70
+ [![SIG](https://img.shields.io/badge/SIG-framework--adapter-yellow)](https://gitcode.com/cann/community/tree/master/CANN/sigs/framework-adapter)
71
+ [![pypi](https://img.shields.io/badge/pypi-0.2.2-blue)](https://gitcode.com/cann/torchtitan-npu/blob/master/CONTRIBUTING.md)
72
+
73
+ </div>
74
+
75
+ # 简介
76
+
77
+ ---
78
+
79
+ `torchtitan-npu`定位为`torchtitan`的昇腾(Ascend)后端扩展插件,通过即插即用的硬件亲和性优化,充分释放NPU算力,助力`PyTorch native`训练在昇腾平台无缝、高效、稳定地运行。
80
+
81
+ 本插件基于社区 `ModelConverter` 拓展机制构建,已支持多维度训练优化,涵盖 NPU融合算子、图优化、图下沉、**算子自动融合**、显存管理、分布式并行以及调试维测能力等等。
82
+
83
+ ## 社群
84
+ [![SIG](https://img.shields.io/badge/SIG-framework--adapter-yellow)](https://gitcode.com/cann/community/tree/master/CANN/sigs/framework-adapter)
85
+
86
+ SIG 例会:[sig-framework-adapter](https://meeting.osinfra.cn/cann?sig=sig-framework-adapter)
87
+
88
+ # 最新消息
89
+
90
+ ---
91
+
92
+ - [Apr. 2026]: 🚀 **【重要特性支持】算子自动融合**:基于AscendC AutoFuse的能力,支持torch.compile + Inductor后端的算子自动融合。
93
+ - [Apr. 2026]: 🚀 **torchtitan‑npu 正式开源**:在 NPU 上支持 4D 并行等 torchtitan 原生特性,并引入 Swap Optimizer 等 NPU 亲和优化。
94
+
95
+ # Roadmap
96
+
97
+ ---
98
+
99
+ 当前季度的规划见 `torchtitan-npu` [Roadmap](https://gitcode.com/cann/torchtitan-npu/issues/5)。欢迎访问。
100
+
101
+ # 安装
102
+
103
+ 源码安装:
104
+
105
+ ```shell
106
+ git clone https://gitcode.com/cann/torchtitan-npu.git
107
+ cd torchtitan-npu
108
+ pip install -e .
109
+ ```
110
+
111
+ 详情请参考 [部署文档](https://gitcode.com/cann/torchtitan-npu/blob/master/docs/user-guide/installation.md) 安装torchtitan-npu及其依赖。
112
+
113
+
114
+ # 快速上手
115
+ 指导开发者快速启动大语言模型的训练任务,具体的操作请参考:
116
+ [快速入门(基于PyTorch框架)](https://gitcode.com/cann/torchtitan-npu/blob/master/docs/user-guide/quick-start.md)
117
+
118
+
119
+ # 特性支持概览
120
+
121
+ ---
122
+
123
+ <table>
124
+ <thead>
125
+ <tr>
126
+ <th>场景</th>
127
+ <th>特性名称</th>
128
+ <th>原生支持</th>
129
+ <th>NPU支持</th>
130
+ </tr>
131
+ </thead>
132
+ <tbody>
133
+ <!-- 并行能力 -->
134
+ <tr>
135
+ <td rowspan="3">并行能力</td>
136
+ <td>4D 并行 (FSDP2/TP/CP/PP)</td>
137
+ <td>✅</td>
138
+ <td>✅</td>
139
+ </tr>
140
+ <tr>
141
+ <td>专家并行 (EP/ETP)</td>
142
+ <td>✅</td>
143
+ <td>✅</td>
144
+ </tr>
145
+ <tr>
146
+ <td><a href="https://gitcode.com/cann/torchtitan-npu/blob/master/docs/feature_guides/parallelism/custom_cp.md">自定义 CP (DeepSeek V3.2 CP/SDPA Ulysses CP)</a></td>
147
+ <td>❌</td>
148
+ <td>✅</td>
149
+ </tr>
150
+ <!-- torch.compile -->
151
+ <tr>
152
+ <td>torch.compile</td>
153
+ <td><a href="https://gitcode.com/cann/torchtitan-npu/blob/master/docs/feature_guides/torch_compile.md">torch.compile</a></td>
154
+ <td>✅</td>
155
+ <td>✅</td>
156
+ </tr>
157
+ <!-- 训练精度 -->
158
+ <tr>
159
+ <td rowspan="2">训练精度</td>
160
+ <td>MxFP8 量化</td>
161
+ <td>✅</td>
162
+ <td>✅ (Ascend 950)</td>
163
+ </tr>
164
+ <tr>
165
+ <td><a href="https://gitcode.com/cann/torchtitan-npu/blob/master/docs/feature_guides/low_precision_training.md">HiF8 量化</a></td>
166
+ <td>❌</td>
167
+ <td>✅ (Ascend 950)</td>
168
+ </tr>
169
+ <!-- 训练调试与监控 -->
170
+ <tr>
171
+ <td rowspan="2">训练调试与监控</td>
172
+ <td>分布式 Checkpoint</td>
173
+ <td>✅</td>
174
+ <td>✅</td>
175
+ </tr>
176
+ <tr>
177
+ <td><a href="https://gitcode.com/cann/torchtitan-npu/blob/master/docs/feature_guides/metrics_and_debugging.md">调试工具</a></td>
178
+ <td>✅</td>
179
+ <td>✅</td>
180
+ </tr>
181
+ <!-- 性能优化 -->
182
+ <tr>
183
+ <td rowspan="2">性能优化</td>
184
+ <td><a href="https://gitcode.com/cann/torchtitan-npu/blob/master/docs/feature_guides/swap_optimizer.md">Swap Optimizer</a></td>
185
+ <td>❌</td>
186
+ <td>✅</td>
187
+ </tr>
188
+ <tr>
189
+ <td><a href="https://gitcode.com/cann/torchtitan-npu/blob/master/docs/feature_guides/npu_fused_ops.md">NPU 融合算子适配</a></td>
190
+ <td>❌</td>
191
+ <td>✅</td>
192
+ </tr>
193
+ </tbody>
194
+ </table>
195
+
196
+ # 项目结构
197
+ torchtitan-npu 充分利用了 torchtitan 提供的 ModelConverter 插件化机制。该机制介入模型定义之后、并行策略(如 TP/FSDP)应用之前,支持以非侵入式的方式,通过注册机制对特定模块进行替换或重写。基于此方案,我们实现了融合算子优化、量化支持以及优化器增强等功能。见以下项目结构:
198
+ ```
199
+ torchtitan-npu/
200
+ ├── torchtitan_npu/ # torchtitan_npu核心源代码
201
+ │ ├── config/ # 对Config的补丁
202
+ │ ├── converters/ # 基于torchtitan ModelConverter机制的补丁
203
+ │ ├── distributed/ # 自定义分布式代码
204
+ │ ├── models/ # 基于torchtitan-npu的模型 (如Deepseek-V3.2)
205
+ │ ├── patches/ # 其他补丁
206
+ │ ├── tools/ # 工具补丁
207
+ │ ├── entry.py # 启动训练
208
+ │ ├── train.py # 训练主流程补丁
209
+ │ └── __init__.py # torchtitan-npu 插件修改注入点
210
+ ├── docs/ # 文档
211
+
212
+ ```
213
+
214
+ # 性能基准
215
+
216
+ ---
217
+
218
+ ### 2026.04
219
+
220
+ System: Atlas 800T A3
221
+ | Model | Number of NPUs | Precision | GBS | Local BS | Sequence Length | FSDP | TP | PP | CP | EP | Throughput (tokens/p/s) |
222
+ | :----------------- | :------------- | :-------- | :-- | :------- | :-------------- | :--- | :-- | :-- | :-- | :-- | :----------- |
223
+ | [Deepseek V3.2-671B](https://gitcode.com/cann/torchtitan-npu/blob/master/torchtitan_npu/models/deepseek_v32/train_configs/deepseek_v32_671b_61layers_32k_128die.toml) | 64 | BF16 | 128 | 1 | 32768 | 4 | 4 | 1 | 8 | 64 | 103 |
224
+ | [Deepseek V3.2-671B](https://gitcode.com/cann/torchtitan-npu/blob/master/torchtitan_npu/models/deepseek_v32/train_configs/deepseek_v32_671b_61layers_4k_128die.toml) | 64 | BF16 | 512 | 1 | 4096 | 32 | 4 | 1 | 1 | 64 | 146 |
225
+ | [Deepseek V3-671B](https://gitcode.com/cann/torchtitan-npu/blob/master/torchtitan_npu/models/deepseek_v3/train_configs/deepseek_v3_671b_61layers_4k_128die.toml) | 64 | BF16 | 1024 | 1 | 4096 | 32 | 4 | 1 | 1 | 128 | 546 |
226
+ | [Deepseek V3-671B + compile(Autofuse)](https://gitcode.com/cann/torchtitan-npu/blob/master/torchtitan_npu/models/deepseek_v3/train_configs/deepseek_v3_671b_61layers_4k_128die.toml) | 64 | BF16 | 1024 | 1 | 4096 | 32 | 4 | 1 | 1 | 128 | 576 |
227
+ > 注:以上MoE模型的性能数据均开启负载均衡配置moe_force_load_balance=true。
228
+
229
+ # 免责声明
230
+
231
+ ---
232
+
233
+ ## 致 torchtitan‑npu 使用者
234
+
235
+ 1. torchtitan‑npu 提供的所有内容仅供您用于非商业目的。
236
+ 2. 对于 torchtitan‑npu 测试用例以及示例文件中所涉及的各模型和数据集,平台仅用于功能测试,华为不提供任何模型权重和数据集。如您使用这些数据进行训练,请您特别注意应遵守对应模型和数据集的 License,如您因使用这些模型和数据集而产生侵权纠纷,华为不承担任何责任。
237
+ 3. 如您在使用 torchtitan‑npu 过程中,发现任何问题(包括但不限于功能问题、合规问题),请在 GitCode 提交 issue,我们将及时审视并解决。
238
+
239
+ torchtitan‑npu 功能依赖的 PyTorch 等第三方开源软件,均由第三方社区提供和维护,因第三方开源软件导致的问题的修复依赖相关社区的贡献和反馈。您应理解,torchtitan‑npu 仓库不保证对第三方开源软件本身的问题进行修复,也不保证会测试、纠正所有第三方开源软件的漏洞和错误。
240
+
241
+
242
+ # License 声明
243
+
244
+ ---
245
+
246
+ - torchtitan‑npu 产品的使用许可证,具体请参见 [LICENSE](https://gitcode.com/cann/torchtitan-npu/blob/master/LICENSE)。
247
+ - torchtitan‑npu 工具 docs 目录下的文档适用相应许可证,具体请参见文档目录下的 LICENSE 文件。
@@ -0,0 +1,187 @@
1
+ <div align="center">
2
+
3
+ # torchtitan-npu
4
+
5
+ <h4>基于 torchtitan 的昇腾全流程大模型训练适配插件</h4>
6
+
7
+ [![Documentation](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](#特性支持概览)
8
+ [![license](https://img.shields.io/badge/license-BSD_3--Clause-lightgrey.svg)](https://gitcode.com/cann/torchtitan-npu/blob/master/LICENSE)
9
+ [![contributing](https://img.shields.io/badge/CONTRIBUTING-teal)](https://gitcode.com/cann/torchtitan-npu/blob/master/CONTRIBUTING.md)
10
+ [![SIG](https://img.shields.io/badge/SIG-framework--adapter-yellow)](https://gitcode.com/cann/community/tree/master/CANN/sigs/framework-adapter)
11
+ [![pypi](https://img.shields.io/badge/pypi-0.2.2-blue)](https://gitcode.com/cann/torchtitan-npu/blob/master/CONTRIBUTING.md)
12
+
13
+ </div>
14
+
15
+ # 简介
16
+
17
+ ---
18
+
19
+ `torchtitan-npu`定位为`torchtitan`的昇腾(Ascend)后端扩展插件,通过即插即用的硬件亲和性优化,充分释放NPU算力,助力`PyTorch native`训练在昇腾平台无缝、高效、稳定地运行。
20
+
21
+ 本插件基于社区 `ModelConverter` 拓展机制构建,已支持多维度训练优化,涵盖 NPU融合算子、图优化、图下沉、**算子自动融合**、显存管理、分布式并行以及调试维测能力等等。
22
+
23
+ ## 社群
24
+ [![SIG](https://img.shields.io/badge/SIG-framework--adapter-yellow)](https://gitcode.com/cann/community/tree/master/CANN/sigs/framework-adapter)
25
+
26
+ SIG 例会:[sig-framework-adapter](https://meeting.osinfra.cn/cann?sig=sig-framework-adapter)
27
+
28
+ # 最新消息
29
+
30
+ ---
31
+
32
+ - [Apr. 2026]: 🚀 **【重要特性支持】算子自动融合**:基于AscendC AutoFuse的能力,支持torch.compile + Inductor后端的算子自动融合。
33
+ - [Apr. 2026]: 🚀 **torchtitan‑npu 正式开源**:在 NPU 上支持 4D 并行等 torchtitan 原生特性,并引入 Swap Optimizer 等 NPU 亲和优化。
34
+
35
+ # Roadmap
36
+
37
+ ---
38
+
39
+ 当前季度的规划见 `torchtitan-npu` [Roadmap](https://gitcode.com/cann/torchtitan-npu/issues/5)。欢迎访问。
40
+
41
+ # 安装
42
+
43
+ 源码安装:
44
+
45
+ ```shell
46
+ git clone https://gitcode.com/cann/torchtitan-npu.git
47
+ cd torchtitan-npu
48
+ pip install -e .
49
+ ```
50
+
51
+ 详情请参考 [部署文档](https://gitcode.com/cann/torchtitan-npu/blob/master/docs/user-guide/installation.md) 安装torchtitan-npu及其依赖。
52
+
53
+
54
+ # 快速上手
55
+ 指导开发者快速启动大语言模型的训练任务,具体的操作请参考:
56
+ [快速入门(基于PyTorch框架)](https://gitcode.com/cann/torchtitan-npu/blob/master/docs/user-guide/quick-start.md)
57
+
58
+
59
+ # 特性支持概览
60
+
61
+ ---
62
+
63
+ <table>
64
+ <thead>
65
+ <tr>
66
+ <th>场景</th>
67
+ <th>特性名称</th>
68
+ <th>原生支持</th>
69
+ <th>NPU支持</th>
70
+ </tr>
71
+ </thead>
72
+ <tbody>
73
+ <!-- 并行能力 -->
74
+ <tr>
75
+ <td rowspan="3">并行能力</td>
76
+ <td>4D 并行 (FSDP2/TP/CP/PP)</td>
77
+ <td>✅</td>
78
+ <td>✅</td>
79
+ </tr>
80
+ <tr>
81
+ <td>专家并行 (EP/ETP)</td>
82
+ <td>✅</td>
83
+ <td>✅</td>
84
+ </tr>
85
+ <tr>
86
+ <td><a href="https://gitcode.com/cann/torchtitan-npu/blob/master/docs/feature_guides/parallelism/custom_cp.md">自定义 CP (DeepSeek V3.2 CP/SDPA Ulysses CP)</a></td>
87
+ <td>❌</td>
88
+ <td>✅</td>
89
+ </tr>
90
+ <!-- torch.compile -->
91
+ <tr>
92
+ <td>torch.compile</td>
93
+ <td><a href="https://gitcode.com/cann/torchtitan-npu/blob/master/docs/feature_guides/torch_compile.md">torch.compile</a></td>
94
+ <td>✅</td>
95
+ <td>✅</td>
96
+ </tr>
97
+ <!-- 训练精度 -->
98
+ <tr>
99
+ <td rowspan="2">训练精度</td>
100
+ <td>MxFP8 量化</td>
101
+ <td>✅</td>
102
+ <td>✅ (Ascend 950)</td>
103
+ </tr>
104
+ <tr>
105
+ <td><a href="https://gitcode.com/cann/torchtitan-npu/blob/master/docs/feature_guides/low_precision_training.md">HiF8 量化</a></td>
106
+ <td>❌</td>
107
+ <td>✅ (Ascend 950)</td>
108
+ </tr>
109
+ <!-- 训练调试与监控 -->
110
+ <tr>
111
+ <td rowspan="2">训练调试与监控</td>
112
+ <td>分布式 Checkpoint</td>
113
+ <td>✅</td>
114
+ <td>✅</td>
115
+ </tr>
116
+ <tr>
117
+ <td><a href="https://gitcode.com/cann/torchtitan-npu/blob/master/docs/feature_guides/metrics_and_debugging.md">调试工具</a></td>
118
+ <td>✅</td>
119
+ <td>✅</td>
120
+ </tr>
121
+ <!-- 性能优化 -->
122
+ <tr>
123
+ <td rowspan="2">性能优化</td>
124
+ <td><a href="https://gitcode.com/cann/torchtitan-npu/blob/master/docs/feature_guides/swap_optimizer.md">Swap Optimizer</a></td>
125
+ <td>❌</td>
126
+ <td>✅</td>
127
+ </tr>
128
+ <tr>
129
+ <td><a href="https://gitcode.com/cann/torchtitan-npu/blob/master/docs/feature_guides/npu_fused_ops.md">NPU 融合算子适配</a></td>
130
+ <td>❌</td>
131
+ <td>✅</td>
132
+ </tr>
133
+ </tbody>
134
+ </table>
135
+
136
+ # 项目结构
137
+ torchtitan-npu 充分利用了 torchtitan 提供的 ModelConverter 插件化机制。该机制介入模型定义之后、并行策略(如 TP/FSDP)应用之前,支持以非侵入式的方式,通过注册机制对特定模块进行替换或重写。基于此方案,我们实现了融合算子优化、量化支持以及优化器增强等功能。见以下项目结构:
138
+ ```
139
+ torchtitan-npu/
140
+ ├── torchtitan_npu/ # torchtitan_npu核心源代码
141
+ │ ├── config/ # 对Config的补丁
142
+ │ ├── converters/ # 基于torchtitan ModelConverter机制的补丁
143
+ │ ├── distributed/ # 自定义分布式代码
144
+ │ ├── models/ # 基于torchtitan-npu的模型 (如Deepseek-V3.2)
145
+ │ ├── patches/ # 其他补丁
146
+ │ ├── tools/ # 工具补丁
147
+ │ ├── entry.py # 启动训练
148
+ │ ├── train.py # 训练主流程补丁
149
+ │ └── __init__.py # torchtitan-npu 插件修改注入点
150
+ ├── docs/ # 文档
151
+
152
+ ```
153
+
154
+ # 性能基准
155
+
156
+ ---
157
+
158
+ ### 2026.04
159
+
160
+ System: Atlas 800T A3
161
+ | Model | Number of NPUs | Precision | GBS | Local BS | Sequence Length | FSDP | TP | PP | CP | EP | Throughput (tokens/p/s) |
162
+ | :----------------- | :------------- | :-------- | :-- | :------- | :-------------- | :--- | :-- | :-- | :-- | :-- | :----------- |
163
+ | [Deepseek V3.2-671B](https://gitcode.com/cann/torchtitan-npu/blob/master/torchtitan_npu/models/deepseek_v32/train_configs/deepseek_v32_671b_61layers_32k_128die.toml) | 64 | BF16 | 128 | 1 | 32768 | 4 | 4 | 1 | 8 | 64 | 103 |
164
+ | [Deepseek V3.2-671B](https://gitcode.com/cann/torchtitan-npu/blob/master/torchtitan_npu/models/deepseek_v32/train_configs/deepseek_v32_671b_61layers_4k_128die.toml) | 64 | BF16 | 512 | 1 | 4096 | 32 | 4 | 1 | 1 | 64 | 146 |
165
+ | [Deepseek V3-671B](https://gitcode.com/cann/torchtitan-npu/blob/master/torchtitan_npu/models/deepseek_v3/train_configs/deepseek_v3_671b_61layers_4k_128die.toml) | 64 | BF16 | 1024 | 1 | 4096 | 32 | 4 | 1 | 1 | 128 | 546 |
166
+ | [Deepseek V3-671B + compile(Autofuse)](https://gitcode.com/cann/torchtitan-npu/blob/master/torchtitan_npu/models/deepseek_v3/train_configs/deepseek_v3_671b_61layers_4k_128die.toml) | 64 | BF16 | 1024 | 1 | 4096 | 32 | 4 | 1 | 1 | 128 | 576 |
167
+ > 注:以上MoE模型的性能数据均开启负载均衡配置moe_force_load_balance=true。
168
+
169
+ # 免责声明
170
+
171
+ ---
172
+
173
+ ## 致 torchtitan‑npu 使用者
174
+
175
+ 1. torchtitan‑npu 提供的所有内容仅供您用于非商业目的。
176
+ 2. 对于 torchtitan‑npu 测试用例以及示例文件中所涉及的各模型和数据集,平台仅用于功能测试,华为不提供任何模型权重和数据集。如您使用这些数据进行训练,请您特别注意应遵守对应模型和数据集的 License,如您因使用这些模型和数据集而产生侵权纠纷,华为不承担任何责任。
177
+ 3. 如您在使用 torchtitan‑npu 过程中,发现任何问题(包括但不限于功能问题、合规问题),请在 GitCode 提交 issue,我们将及时审视并解决。
178
+
179
+ torchtitan‑npu 功能依赖的 PyTorch 等第三方开源软件,均由第三方社区提供和维护,因第三方开源软件导致的问题的修复依赖相关社区的贡献和反馈。您应理解,torchtitan‑npu 仓库不保证对第三方开源软件本身的问题进行修复,也不保证会测试、纠正所有第三方开源软件的漏洞和错误。
180
+
181
+
182
+ # License 声明
183
+
184
+ ---
185
+
186
+ - torchtitan‑npu 产品的使用许可证,具体请参见 [LICENSE](https://gitcode.com/cann/torchtitan-npu/blob/master/LICENSE)。
187
+ - torchtitan‑npu 工具 docs 目录下的文档适用相应许可证,具体请参见文档目录下的 LICENSE 文件。
@@ -0,0 +1 @@
1
+ 0.2.2.post1
@@ -0,0 +1,80 @@
1
+ # Copyright (c) 2026 Huawei Technologies Co., Ltd. All rights reserved.
2
+ #
3
+ # This source code is licensed under the BSD-style license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ # ---- All project specifications ---- #
7
+ [project]
8
+ name = "torchtitan_npu"
9
+ description = "Ascend End-to-End Large Model Training Adaptation Framework Based on torchtitan"
10
+ readme = "README.md"
11
+ requires-python = ">=3.10"
12
+ license = {file = "LICENSE"}
13
+ authors = [
14
+ { name = "Huawei Hisilicon" },
15
+ ]
16
+ keywords = ["pytorch", "training", "llm", "npu", "ascend"]
17
+ dependencies = [
18
+ "torch",
19
+ "torch_npu",
20
+ ]
21
+ dynamic = ["version"]
22
+
23
+ [project.urls]
24
+ Homepage = "https://gitcode.com/cann/torchtitan-npu"
25
+ Documentation = "https://gitcode.com/cann/torchtitan-npu/tree/master/docs"
26
+ Issues = "https://gitcode.com/cann/torchtitan-npu/issues"
27
+ Repository = "https://gitcode.com/cann/torchtitan-npu.git"
28
+
29
+ [project.optional-dependencies]
30
+ testing = [
31
+ "pytest>=7.0",
32
+ "pytest-html>=4.0",
33
+ "pytest-cov>=4.0",
34
+ "pytest-mock>=3.10",
35
+ "hypothesis>=6.0",
36
+ ]
37
+ dev = [
38
+ "pre-commit",
39
+ "pytest",
40
+ "pytest-cov",
41
+ "expecttest", # test_tokenizer
42
+ "pyrefly==0.45.1",
43
+ ]
44
+
45
+ [tool.setuptools.dynamic]
46
+ version = {file = "assets/version.txt"}
47
+
48
+ [tool.codespell]
49
+ skip = ["*.json"]
50
+ ignore-words-list = ["cann"]
51
+
52
+ # ---- Explicit project build information ---- #
53
+ [build-system]
54
+ requires = ["setuptools>=61.0"]
55
+ build-backend = "setuptools.build_meta"
56
+
57
+ [tool.setuptools.packages.find]
58
+ where = [""]
59
+ include = ["torchtitan_npu*"]
60
+ exclude = ["CI*", "test_reports*", "cloudcache*", "third_party*"]
61
+
62
+ [tool.pytest.ini_options]
63
+ addopts = ["--showlocals", "-v", "--tb=short", "--import-mode=importlib"]
64
+ testpaths = ["tests"]
65
+ python_files = ["test_*.py"]
66
+ python_classes = ["Test*"]
67
+ python_functions = ["test_*"]
68
+ markers = [
69
+ "nightly: heavy-weight validation reserved for nightly or dedicated multi-card environments",
70
+ "smoke: integration/e2e smoke coverage for NPU features and training paths",
71
+ ]
72
+
73
+ [tool.pyrefly]
74
+ project-excludes = ["**/tests/**", "CI"]
75
+ ignore-missing-imports = ["torchao.*", "torchft"] # optional dependencies
76
+ search-path = ["../pytorch"] # local built pytorch
77
+ replace-imports-with-any = ["torch_npu", "torch.npu"]
78
+
79
+ [tool.usort]
80
+ known_first_party = ["torchtitan_npu", "tests"]