ctranslate2-4.6.2-cp314-cp314t-win_amd64.whl → ctranslate2-4.6.3-cp314-cp314t-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ctranslate2/_ext.cp314t-win_amd64.pyd +0 -0
- ctranslate2/converters/fairseq.py +3 -1
- ctranslate2/converters/opennmt_py.py +3 -1
- ctranslate2/converters/transformers.py +436 -30
- ctranslate2/ctranslate2.dll +0 -0
- ctranslate2/cudnn64_9.dll +0 -0
- ctranslate2/extensions.py +17 -13
- ctranslate2/specs/attention_spec.py +3 -1
- ctranslate2/specs/transformer_spec.py +93 -10
- ctranslate2/version.py +1 -1
- {ctranslate2-4.6.2.dist-info → ctranslate2-4.6.3.dist-info}/METADATA +14 -3
- {ctranslate2-4.6.2.dist-info → ctranslate2-4.6.3.dist-info}/RECORD +15 -15
- {ctranslate2-4.6.2.dist-info → ctranslate2-4.6.3.dist-info}/WHEEL +0 -0
- {ctranslate2-4.6.2.dist-info → ctranslate2-4.6.3.dist-info}/entry_points.txt +0 -0
- {ctranslate2-4.6.2.dist-info → ctranslate2-4.6.3.dist-info}/top_level.txt +0 -0
ctranslate2/_ext.cp314t-win_amd64.pyd
CHANGED

Binary file
ctranslate2/converters/fairseq.py
CHANGED

@@ -146,7 +146,9 @@ class FairseqConverter(Converter):
         import_user_module(argparse.Namespace(user_dir=self._user_dir))

         with torch.no_grad():
-            checkpoint =
+            checkpoint = torch.load(
+                self._model_path, map_location=torch.device("cpu"), weights_only=False
+            )
             args = checkpoint["args"] or checkpoint["cfg"]["model"]

             args.data = self._data_dir
ctranslate2/converters/opennmt_py.py
CHANGED

@@ -174,7 +174,9 @@ class OpenNMTPyConverter(Converter):
     def _load(self):
         import torch

-        checkpoint = torch.load(
+        checkpoint = torch.load(
+            self._model_path, map_location="cpu", weights_only=False
+        )

         src_vocabs, tgt_vocabs = get_vocabs(checkpoint["vocab"])

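Both converter fixes address the same upstream change: starting with PyTorch 2.6, torch.load defaults to weights_only=True, which refuses to unpickle the argument namespaces and vocabulary objects stored inside fairseq and OpenNMT-py checkpoints. A minimal sketch of the behaviour, with a hypothetical local checkpoint path; only pass weights_only=False for checkpoints you trust, since full unpickling can execute arbitrary code.

import torch

# "model.pt" is a placeholder for a fairseq or OpenNMT-py checkpoint that
# pickles Python objects (argparse.Namespace, vocabularies) next to tensors.
try:
    # Default since PyTorch 2.6: weights_only=True, which can raise an
    # UnpicklingError on checkpoints containing non-tensor objects.
    checkpoint = torch.load("model.pt", map_location="cpu")
except Exception:
    # Explicit opt-out, matching the converter changes above.
    checkpoint = torch.load("model.pt", map_location="cpu", weights_only=False)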
ctranslate2/converters/transformers.py
CHANGED

@@ -89,7 +89,7 @@ class TransformersConverter(Converter):
           copy_files: List of filenames to copy from the Hugging Face model to the
             converted model directory.
           load_as_float16: Load the model weights as float16. More precisely, the model
-            will be loaded with ``from_pretrained(...,
+            will be loaded with ``from_pretrained(..., dtype=torch.float16)``.
           revision: Revision of the model to download from the Hugging Face Hub.
           low_cpu_mem_usage: Enable the flag ``low_cpu_mem_usage`` when loading the model
             with ``from_pretrained``.

@@ -123,10 +123,11 @@ class TransformersConverter(Converter):
             tokenizer_class = transformers.AutoTokenizer

             kwargs = {
-                "
+                "dtype": (
                     torch.float16
                     if self._load_as_float16
-                    else getattr(config, "
+                    else getattr(config, "dtype", None)
+                    or getattr(config, "torch_dtype", None)
                 )
             }

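This kwargs change follows the Transformers library renaming the torch_dtype argument and config field to dtype. A sketch of the resulting resolution order, written as a standalone helper for illustration (the helper name is not part of the package):

import torch

def resolve_load_dtype(config, load_as_float16):
    # Explicit float16 request wins; otherwise prefer the new-style config
    # field and fall back to the legacy name. None lets from_pretrained pick
    # its own default.
    if load_as_float16:
        return torch.float16
    return getattr(config, "dtype", None) or getattr(config, "torch_dtype", None)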
@@ -235,7 +236,7 @@ class ModelLoader(abc.ABC):

         if isinstance(module, transformers.Conv1D):
             spec.weight = spec.weight.transpose(0, 1)
-        if module.bias is not None:
+        if hasattr(module, "bias") and module.bias is not None:
             spec.bias = module.bias

     def set_embeddings(self, spec, module):
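The extra hasattr guard makes set_linear safe for prequantized modules (such as AWQ linear layers) that may not define a bias attribute at all, while a regular torch.nn.Linear always defines one, possibly set to None. A small illustration with a stand-in object instead of a real quantized layer:

from types import SimpleNamespace

import torch

plain = torch.nn.Linear(4, 4, bias=False)            # .bias exists and is None
packed = SimpleNamespace(weight=torch.zeros(4, 4))   # hypothetical module with no .bias at all

for module in (plain, packed):
    # Mirrors the guarded check above: never raises AttributeError.
    has_bias = hasattr(module, "bias") and module.bias is not None
    print(type(module).__name__, "->", "copy bias" if has_bias else "skip bias")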
@@ -2182,6 +2183,28 @@ class Qwen2Loader(ModelLoader):
         rotary_scaling_type = None
         rotary_scaling_factor = 1

+        # Check for AWQ quantization config
+        quantization_config = getattr(model.config, "quantization_config", None)
+        if quantization_config:
+            quant_type = None
+            if quantization_config.quant_method == "awq":
+                quant_type = _SUPPORTED_QUANTIZATION.get(quantization_config.version)
+            if quant_type is None:
+                raise NotImplementedError(
+                    "Quantization type '%s' is not yet implemented. "
+                    "The following Quantization types are currently supported: %s"
+                    % (
+                        quantization_config.quant_method,
+                        ", ".join(_SUPPORTED_QUANTIZATION.keys()),
+                    )
+                )
+            quant_group_size = quantization_config.group_size
+            quant_bits = quantization_config.bits
+        else:
+            quant_type = common_spec.Quantization.CT2
+            quant_group_size = None
+            quant_bits = None
+
         spec = transformer_spec.TransformerDecoderModelSpec.from_config(
             num_layers,
             num_heads,

@@ -2195,9 +2218,12 @@ class Qwen2Loader(ModelLoader):
             rotary_scaling_factor=rotary_scaling_factor,
             rotary_base=getattr(model.config, "rope_theta", 10000),
             num_heads_kv=num_heads_kv,
+            quant_type=quant_type,
+            quant_group_size=quant_group_size,
+            quant_bits=quant_bits,
         )

-        self.set_decoder(spec.decoder, model.model)
+        self.set_decoder(spec.decoder, model.model, quant_type)
         self.set_linear(spec.decoder.projection, model.lm_head)
         return spec

@@ -2227,7 +2253,7 @@ class Qwen2Loader(ModelLoader):
     def set_layer_norm(self, spec, layer_norm):
         spec.gamma = layer_norm.weight

-    def set_decoder(self, spec, module):
+    def set_decoder(self, spec, module, quant_type=common_spec.Quantization.CT2):
         spec.scale_embeddings = False
         self.set_embeddings(spec.embeddings, module.embed_tokens)
         self.set_layer_norm(spec.layer_norm, module.norm)

@@ -2241,19 +2267,39 @@ class Qwen2Loader(ModelLoader):
             )

             split_layers = [common_spec.LinearSpec() for _ in range(3)]
-            self.set_linear(
-
-
+            self.set_linear(
+                split_layers[0], layer.self_attn.q_proj, quant_type=quant_type
+            )
+            self.set_linear(
+                split_layers[1], layer.self_attn.k_proj, quant_type=quant_type
+            )
+            self.set_linear(
+                split_layers[2], layer.self_attn.v_proj, quant_type=quant_type
+            )
+
+            if quant_type == common_spec.Quantization.CT2:
+                utils.fuse_linear(layer_spec.self_attention.linear[0], split_layers)
+            else:
+                cc_dim = 1 if quant_type == common_spec.Quantization.AWQ_GEMM else 0
+                utils.fuse_linear_prequant(
+                    layer_spec.self_attention.linear[0], split_layers, cc_dim
+                )

-            utils.fuse_linear(layer_spec.self_attention.linear[0], split_layers)
             self.set_linear(
                 layer_spec.self_attention.linear[1],
                 layer.self_attn.o_proj,
+                quant_type=quant_type,
             )

-            self.set_linear(
-
-
+            self.set_linear(
+                layer_spec.ffn.linear_0, layer.mlp.gate_proj, quant_type=quant_type
+            )
+            self.set_linear(
+                layer_spec.ffn.linear_0_noact, layer.mlp.up_proj, quant_type=quant_type
+            )
+            self.set_linear(
+                layer_spec.ffn.linear_1, layer.mlp.down_proj, quant_type=quant_type
+            )

             delattr(layer, "self_attn")
             delattr(layer, "mlp")
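The new branch keys off the Hugging Face quantization_config: its version field is looked up in _SUPPORTED_QUANTIZATION (defined elsewhere in this module and not shown in the diff), and prequantized q/k/v projections are fused with fuse_linear_prequant instead of fuse_linear. A hedged sketch of the dimension choice; the exact contents of the mapping below are an assumption, only AWQ_GEMM and CT2 are visible in the diff itself:

# Assumed shape of the lookup table referenced above.
_SUPPORTED_QUANTIZATION = {
    "gemm": "AWQ_GEMM",  # qweight shaped (in_features, packed_out): concat q/k/v on dim 1
    "gemv": "AWQ_GEMV",  # qweight shaped (out_features, packed_in): concat q/k/v on dim 0
}

def concat_dim(quant_type):
    # Mirrors the cc_dim selection in the diff: AWQ GEMM fuses on dim 1,
    # everything else on dim 0.
    return 1 if quant_type == "AWQ_GEMM" else 0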
@@ -2292,6 +2338,28 @@ class Qwen3Loader(ModelLoader):
         rotary_scaling_type = None
         rotary_scaling_factor = 1

+        # Check for AWQ quantization config
+        quantization_config = getattr(model.config, "quantization_config", None)
+        if quantization_config:
+            quant_type = None
+            if quantization_config.quant_method == "awq":
+                quant_type = _SUPPORTED_QUANTIZATION.get(quantization_config.version)
+            if quant_type is None:
+                raise NotImplementedError(
+                    "Quantization type '%s' is not yet implemented. "
+                    "The following Quantization types are currently supported: %s"
+                    % (
+                        quantization_config.quant_method,
+                        ", ".join(_SUPPORTED_QUANTIZATION.keys()),
+                    )
+                )
+            quant_group_size = quantization_config.group_size
+            quant_bits = quantization_config.bits
+        else:
+            quant_type = common_spec.Quantization.CT2
+            quant_group_size = None
+            quant_bits = None
+
         spec = transformer_spec.TransformerDecoderModelSpec.from_config(
             num_layers,
             num_heads,

@@ -2307,9 +2375,12 @@ class Qwen3Loader(ModelLoader):
             num_heads_kv=num_heads_kv,
             head_dim=head_dim,
             qk_norm=True,
+            quant_type=quant_type,
+            quant_group_size=quant_group_size,
+            quant_bits=quant_bits,
         )

-        self.set_decoder(spec.decoder, model.model)
+        self.set_decoder(spec.decoder, model.model, quant_type)
         self.set_linear(spec.decoder.projection, model.lm_head)
         return spec

@@ -2338,7 +2409,7 @@ class Qwen3Loader(ModelLoader):
     def set_layer_norm(self, spec, layer_norm):
         spec.gamma = layer_norm.weight

-    def set_decoder(self, spec, module):
+    def set_decoder(self, spec, module, quant_type=common_spec.Quantization.CT2):
         spec.scale_embeddings = False
         self.set_embeddings(spec.embeddings, module.embed_tokens)
         self.set_layer_norm(spec.layer_norm, module.norm)

@@ -2359,22 +2430,43 @@ class Qwen3Loader(ModelLoader):
             )

             split_layers = [common_spec.LinearSpec() for _ in range(3)]
-            self.set_linear(
-
-
-
+            self.set_linear(
+                split_layers[0], layer.self_attn.q_proj, quant_type=quant_type
+            )
+            self.set_linear(
+                split_layers[1], layer.self_attn.k_proj, quant_type=quant_type
+            )
+            self.set_linear(
+                split_layers[2], layer.self_attn.v_proj, quant_type=quant_type
+            )
+
+            if quant_type == common_spec.Quantization.CT2:
+                utils.fuse_linear(layer_spec.self_attention.linear[0], split_layers)
+            else:
+                cc_dim = 1 if quant_type == common_spec.Quantization.AWQ_GEMM else 0
+                utils.fuse_linear_prequant(
+                    layer_spec.self_attention.linear[0], split_layers, cc_dim
+                )

             self.set_linear(
                 layer_spec.self_attention.linear[1],
                 layer.self_attn.o_proj,
+                quant_type=quant_type,
             )

-            self.set_linear(
-
-
+            self.set_linear(
+                layer_spec.ffn.linear_0, layer.mlp.gate_proj, quant_type=quant_type
+            )
+            self.set_linear(
+                layer_spec.ffn.linear_0_noact, layer.mlp.up_proj, quant_type=quant_type
+            )
+            self.set_linear(
+                layer_spec.ffn.linear_1, layer.mlp.down_proj, quant_type=quant_type
+            )

             delattr(layer, "self_attn")
             delattr(layer, "mlp")
+            gc.collect()


 @register_loader("MixFormerSequentialConfig")
@@ -2514,6 +2606,28 @@ class Phi3Loader(ModelLoader):
         rotary_scaling_type = None
         rotary_scaling_factor = 1

+        # Check for AWQ quantization config
+        quantization_config = getattr(model.config, "quantization_config", None)
+        if quantization_config:
+            quant_type = None
+            if quantization_config.quant_method == "awq":
+                quant_type = _SUPPORTED_QUANTIZATION.get(quantization_config.version)
+            if quant_type is None:
+                raise NotImplementedError(
+                    "Quantization type '%s' is not yet implemented. "
+                    "The following Quantization types are currently supported: %s"
+                    % (
+                        quantization_config.quant_method,
+                        ", ".join(_SUPPORTED_QUANTIZATION.keys()),
+                    )
+                )
+            quant_group_size = quantization_config.group_size
+            quant_bits = quantization_config.bits
+        else:
+            quant_type = common_spec.Quantization.CT2
+            quant_group_size = None
+            quant_bits = None
+
         spec = transformer_spec.TransformerDecoderModelSpec.from_config(
             num_layers,
             num_heads,

@@ -2529,9 +2643,12 @@ class Phi3Loader(ModelLoader):
             original_max_position_embeddings=original_max_position_embeddings,
             max_position_embeddings=max_position_embeddings,
             num_heads_kv=num_heads_kv,
+            quant_type=quant_type,
+            quant_group_size=quant_group_size,
+            quant_bits=quant_bits,
         )

-        self.set_decoder(spec.decoder, model.model)
+        self.set_decoder(spec.decoder, model.model, quant_type)
         self.set_linear(spec.decoder.projection, model.lm_head)
         return spec

@@ -2565,7 +2682,7 @@ class Phi3Loader(ModelLoader):
                 rotary_scaling_short_factor, dtype=torch.float32
             )

-    def set_decoder(self, spec, module):
+    def set_decoder(self, spec, module, quant_type=common_spec.Quantization.CT2):
         spec.scale_embeddings = False
         self.set_embeddings(spec.embeddings, module.embed_tokens)
         self.set_layer_norm(spec.layer_norm, module.norm)

@@ -2579,9 +2696,15 @@ class Phi3Loader(ModelLoader):
             )

             self.set_linear(
-                layer_spec.self_attention.linear[0],
+                layer_spec.self_attention.linear[0],
+                layer.self_attn.qkv_proj,
+                quant_type=quant_type,
+            )
+            self.set_linear(
+                layer_spec.self_attention.linear[1],
+                layer.self_attn.o_proj,
+                quant_type=quant_type,
             )
-            self.set_linear(layer_spec.self_attention.linear[1], layer.self_attn.o_proj)
             if (
                 layer.self_attn.rotary_emb.long_factor is not None
                 and layer.self_attn.rotary_emb.short_factor is not None

@@ -2592,10 +2715,30 @@ class Phi3Loader(ModelLoader):
                     layer.self_attn.rotary_emb.short_factor,
                 )

-
-
-
-
+            # Handle gate_up_proj differently for AWQ vs regular models
+            if quant_type == common_spec.Quantization.CT2:
+                gate_proj, up_proj = layer.mlp.gate_up_proj.weight.chunk(2, dim=0)
+                layer_spec.ffn.linear_0.weight = gate_proj
+                layer_spec.ffn.linear_0_noact.weight = up_proj
+            else:
+                # AWQ: chunk qweight, scales, and qzeros
+                gate_qweight, up_qweight = layer.mlp.gate_up_proj.qweight.chunk(
+                    2, dim=1
+                )
+                gate_scales, up_scales = layer.mlp.gate_up_proj.scales.chunk(2, dim=1)
+                gate_qzeros, up_qzeros = layer.mlp.gate_up_proj.qzeros.chunk(2, dim=1)
+
+                layer_spec.ffn.linear_0.weight = gate_qweight
+                layer_spec.ffn.linear_0.weight_scale = gate_scales
+                layer_spec.ffn.linear_0.weight_zero = gate_qzeros
+
+                layer_spec.ffn.linear_0_noact.weight = up_qweight
+                layer_spec.ffn.linear_0_noact.weight_scale = up_scales
+                layer_spec.ffn.linear_0_noact.weight_zero = up_qzeros
+
+            self.set_linear(
+                layer_spec.ffn.linear_1, layer.mlp.down_proj, quant_type=quant_type
+            )

             delattr(layer, "self_attn")
             delattr(layer, "mlp")
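Phi-3 stores the gate and up projections fused in a single gate_up_proj, so the converter has to split it itself: a float weight stacks the two halves along dim 0, while AWQ's packed qweight/scales/qzeros put output features on dim 1. A shape-only sketch of the non-quantized case with made-up sizes:

import torch

hidden_size, intermediate_size = 8, 32

# Fused gate/up weight as stored by the Hugging Face model: both halves stacked on dim 0.
gate_up_weight = torch.randn(2 * intermediate_size, hidden_size)

gate_proj, up_proj = gate_up_weight.chunk(2, dim=0)
assert gate_proj.shape == up_proj.shape == (intermediate_size, hidden_size)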
@@ -3325,3 +3468,266 @@ _WHISPER_ALIGNMENT_HEADS = {
         (25, 6),
     ],
 }
+
+
+# Paper: https://arxiv.org/pdf/2504.06225
+@register_loader("T5GemmaConfig")
+class T5GemmaLoader(ModelLoader):
+    @property
+    def architecture_name(self):
+        return "T5GemmaForConditionalGeneration"
+
+    def set_layer_norm(self, spec, layer_norm):
+        spec.gamma = layer_norm.weight.data + 1.0
+
+    def get_model_spec(self, model):
+        encoder_config = model.config.encoder
+        decoder_config = model.config.decoder
+        sliding_window = getattr(model.config, "sliding_window", 4096)
+
+        encoder_num_heads = encoder_config.num_attention_heads
+        encoder_num_heads_kv = getattr(
+            encoder_config, "num_key_value_heads", encoder_num_heads
+        )
+        if encoder_num_heads_kv == encoder_num_heads:
+            encoder_num_heads_kv = None
+
+        encoder = transformer_spec.TransformerEncoderSpec(
+            encoder_config.num_hidden_layers,
+            encoder_config.num_attention_heads,
+            pre_norm=True,
+            activation=_SUPPORTED_ACTIVATIONS[encoder_config.hidden_activation],
+            ffn_glu=True,
+            rms_norm=True,
+            rotary_dim=encoder_config.head_dim,
+            rotary_interleave=False,
+            rotary_base=getattr(encoder_config, "rope_theta", 10000),
+            sliding_window=sliding_window,
+            pre_post_layer_norm=True,
+            num_heads_kv=encoder_num_heads_kv,
+            head_dim=encoder_config.head_dim,
+        )
+
+        decoder_num_heads = decoder_config.num_attention_heads
+        decoder_num_heads_kv = getattr(
+            decoder_config, "num_key_value_heads", decoder_num_heads
+        )
+        if decoder_num_heads_kv == decoder_num_heads:
+            decoder_num_heads_kv = None
+
+        decoder = transformer_spec.TransformerDecoderSpec(
+            decoder_config.num_hidden_layers,
+            decoder_config.num_attention_heads,
+            pre_norm=True,
+            activation=_SUPPORTED_ACTIVATIONS[decoder_config.hidden_activation],
+            ffn_glu=True,
+            rms_norm=True,
+            with_encoder_attention=True,
+            rotary_dim=decoder_config.head_dim,
+            rotary_interleave=False,
+            rotary_base=getattr(decoder_config, "rope_theta", 10000),
+            sliding_window=sliding_window,
+            pre_post_layer_norm=True,
+            external_pre_post_encoder_layers=True,
+            num_heads_kv=decoder_num_heads_kv,
+            head_dim=decoder_config.head_dim,
+        )
+
+        spec = transformer_spec.TransformerSpec(encoder, decoder)
+
+        self.set_encoder(spec.encoder, model.model.encoder, encoder_config)
+
+        self.set_decoder(
+            spec.decoder,
+            model.model.decoder,
+            decoder_config,
+            common_spec.Quantization.CT2,
+        )
+
+        # Tie_word_embeddings
+        self.set_linear(spec.decoder.projection, model.model.decoder.embed_tokens)
+        return spec
+
+    def set_vocabulary(self, spec, tokens):
+        spec.register_source_vocabulary(tokens)
+        spec.register_target_vocabulary(tokens)
+
+    def set_config(self, config, model, tokenizer):
+        config.bos_token = tokenizer.bos_token
+        config.eos_token = tokenizer.eos_token
+        config.unk_token = tokenizer.unk_token
+
+        if hasattr(model.config, "encoder"):
+            config.layer_norm_epsilon = model.config.encoder.rms_norm_eps
+        elif hasattr(model.config, "rms_norm_eps"):
+            config.layer_norm_epsilon = model.config.rms_norm_eps
+        else:
+            config.layer_norm_epsilon = 1e-6
+
+        config.decoder_start_token = tokenizer.bos_token
+
+    def set_encoder(
+        self, spec, encoder, encoder_config, quant_type=common_spec.Quantization.CT2
+    ):
+        spec.scale_embeddings = True
+
+        encoder_emb_spec = (
+            spec.embeddings[0] if isinstance(spec.embeddings, list) else spec.embeddings
+        )
+
+        self.set_embeddings(encoder_emb_spec, encoder.embed_tokens)
+        encoder_emb_spec.multiply_by_sqrt_depth = encoder_config.hidden_size**0.5
+        self.set_layer_norm(spec.layer_norm, encoder.norm)
+
+        module = encoder
+        for i, (layer_spec, layer) in enumerate(zip(spec.layer, module.layers)):
+            self.set_layer_norm(
+                layer_spec.input_layer_norm, layer.pre_self_attn_layernorm
+            )
+            self.set_layer_norm(
+                layer_spec.post_attention_layer_norm, layer.post_self_attn_layernorm
+            )
+
+            # T5GemmaSelfAttention
+            qkv_split_layers = [common_spec.LinearSpec() for _ in range(3)]
+            self.set_linear(
+                qkv_split_layers[0], layer.self_attn.q_proj, quant_type=quant_type
+            )
+            self.set_linear(
+                qkv_split_layers[1], layer.self_attn.k_proj, quant_type=quant_type
+            )
+            self.set_linear(
+                qkv_split_layers[2], layer.self_attn.v_proj, quant_type=quant_type
+            )
+            utils.fuse_linear(layer_spec.self_attention.linear[0], qkv_split_layers)
+            self.set_linear(
+                layer_spec.self_attention.linear[1],
+                layer.self_attn.o_proj,
+                quant_type=quant_type,
+            )
+
+            # T5GemmaRMSNorm
+            self.set_layer_norm(
+                layer_spec.pre_feedforward_layer_norm, layer.pre_feedforward_layernorm
+            )
+            # T5GemmaRMSNorm
+            self.set_layer_norm(
+                layer_spec.post_feedforward_layer_norm, layer.post_feedforward_layernorm
+            )
+
+            # T5GemmaMLP
+            self.set_linear(
+                layer_spec.ffn.linear_0, layer.mlp.gate_proj, quant_type=quant_type
+            )
+            self.set_linear(
+                layer_spec.ffn.linear_0_noact, layer.mlp.up_proj, quant_type=quant_type
+            )
+            self.set_linear(
+                layer_spec.ffn.linear_1, layer.mlp.down_proj, quant_type=quant_type
+            )
+
+            # Clean up
+            delattr(layer, "self_attn")
+            delattr(layer, "mlp")
+            gc.collect()
+
+    def set_decoder(
+        self, spec, module, decoder_config, quant_type=common_spec.Quantization.CT2
+    ):
+        spec.scale_embeddings = True
+        spec.start_from_zero_embedding = False
+
+        self.set_embeddings(spec.embeddings, module.embed_tokens)
+        spec.embeddings.multiply_by_sqrt_depth = decoder_config.hidden_size**0.5
+        self.set_layer_norm(spec.layer_norm, module.norm)
+
+        for i, (layer_spec, layer) in enumerate(zip(spec.layer, module.layers)):
+            # Self-attention block
+            self.set_layer_norm(
+                layer_spec.input_layer_norm, layer.pre_self_attn_layernorm
+            )
+            self.set_layer_norm(
+                layer_spec.post_attention_layer_norm, layer.post_self_attn_layernorm
+            )
+
+            # T5GemmaSelfAttention - QKV projections
+            qkv_split_layers = [common_spec.LinearSpec() for _ in range(3)]
+            self.set_linear(
+                qkv_split_layers[0], layer.self_attn.q_proj, quant_type=quant_type
+            )
+            self.set_linear(
+                qkv_split_layers[1], layer.self_attn.k_proj, quant_type=quant_type
+            )
+            self.set_linear(
+                qkv_split_layers[2], layer.self_attn.v_proj, quant_type=quant_type
+            )
+            utils.fuse_linear(layer_spec.self_attention.linear[0], qkv_split_layers)
+            self.set_linear(
+                layer_spec.self_attention.linear[1],
+                layer.self_attn.o_proj,
+                quant_type=quant_type,
+            )
+
+            # Pre and post cross-attention layer norm
+            self.set_layer_norm(
+                layer_spec.external_pre_encoder_attention_layer_norm,
+                layer.pre_cross_attn_layernorm,
+            )
+
+            self.set_layer_norm(
+                layer_spec.external_post_encoder_attention_layer_norm,
+                layer.post_cross_attn_layernorm,
+            )
+
+            # Cross-attention Q projection
+            self.set_linear(
+                layer_spec.attention.linear[0],
+                layer.cross_attn.q_proj,
+                quant_type=quant_type,
+            )
+
+            # Cross-attention K+V fused
+            kv_split_layers = [common_spec.LinearSpec() for _ in range(2)]
+            self.set_linear(
+                kv_split_layers[0],
+                layer.cross_attn.k_proj,
+                quant_type=quant_type,
+            )
+            self.set_linear(
+                kv_split_layers[1],
+                layer.cross_attn.v_proj,
+                quant_type=quant_type,
+            )
+            utils.fuse_linear(layer_spec.attention.linear[1], kv_split_layers)
+
+            # Cross-attention output projection
+            self.set_linear(
+                layer_spec.attention.linear[2],
+                layer.cross_attn.o_proj,
+                quant_type=quant_type,
+            )
+
+            # Feed-forward block
+            self.set_layer_norm(
+                layer_spec.pre_feedforward_layer_norm, layer.pre_feedforward_layernorm
+            )
+            self.set_layer_norm(
+                layer_spec.post_feedforward_layer_norm, layer.post_feedforward_layernorm
+            )
+
+            # T5GemmaMLP
+            self.set_linear(
+                layer_spec.ffn.linear_0, layer.mlp.gate_proj, quant_type=quant_type
+            )
+            self.set_linear(
+                layer_spec.ffn.linear_0_noact, layer.mlp.up_proj, quant_type=quant_type
+            )
+            self.set_linear(
+                layer_spec.ffn.linear_1, layer.mlp.down_proj, quant_type=quant_type
+            )
+
+            # Clean up
+            delattr(layer, "self_attn")
+            delattr(layer, "cross_attn")
+            delattr(layer, "mlp")
+            gc.collect()
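With T5GemmaLoader registered, a T5Gemma checkpoint converts through the standard Transformers entry point. A usage sketch; the model id and output directory below are placeholders, not values taken from this diff:

from ctranslate2.converters import TransformersConverter

converter = TransformersConverter("google/t5gemma-2b-2b-ul2")  # placeholder model id
converter.convert("t5gemma-ct2", quantization="int8", force=True)

The converted directory can then be loaded with ctranslate2.Translator, since the resulting spec is an encoder-decoder TransformerSpec.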
ctranslate2/ctranslate2.dll
CHANGED

Binary file

ctranslate2/cudnn64_9.dll
CHANGED

Binary file
ctranslate2/extensions.py
CHANGED

@@ -556,12 +556,28 @@ def _process_iterable(process_func, iterables, max_batch_size, batch_type, **kwa

 def _batch_iterator(iterable, batch_size, batch_type):
     streams = None
-
+    max_length = 0

     for example in iterable:
         if not isinstance(example, tuple):
             example = (example,)

+        if batch_type == "examples":
+            if streams and len(streams[0]) == batch_size:
+                yield streams
+                streams = None
+
+        elif batch_type == "tokens":
+            max_length = max(max_length, len(example[0]))
+
+            if streams and (len(streams[0]) + 1) * max_length > batch_size:
+                yield streams
+                streams = None
+                max_length = len(example[0])
+
+        else:
+            raise ValueError("Invalid batch type %s" % batch_type)
+
         if streams is None:
             streams = tuple([] for _ in example)
         for batch, element in zip(streams, example):

@@ -569,17 +585,5 @@ def _batch_iterator(iterable, batch_size, batch_type):
                 raise ValueError("Input iterables do not have the same length")
             batch.append(element)

-        if batch_type == "examples":
-            cur_batch_size += 1
-        elif batch_type == "tokens":
-            cur_batch_size += len(example[0])
-        else:
-            raise ValueError("Invalid batch type %s" % batch_type)
-
-        if cur_batch_size >= batch_size:
-            yield streams
-            streams = None
-            cur_batch_size = 0
-
     if streams is not None:
         yield streams
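The rewritten _batch_iterator now checks the limit before appending the next example, flushing the current batch as soon as adding one more element would exceed batch_size, counted either in examples or in padded tokens (batch length times the longest example). A standalone re-implementation of the token rule for illustration:

def batch_by_tokens(examples, max_tokens):
    # Flush once (len(batch) + 1) * longest_example would exceed max_tokens,
    # matching the "tokens" branch above.
    batch, max_length = [], 0
    for tokens in examples:
        new_max = max(max_length, len(tokens))
        if batch and (len(batch) + 1) * new_max > max_tokens:
            yield batch
            batch, new_max = [], len(tokens)
        batch.append(tokens)
        max_length = new_max
    if batch:
        yield batch

# Examples of length 3, 3, 3 and 2 with max_tokens=8 group into two batches
# of two examples each, so this prints [2, 2].
print([len(b) for b in batch_by_tokens([[0] * 3, [0] * 3, [0] * 3, [0] * 2], 8)])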
ctranslate2/specs/attention_spec.py
CHANGED

@@ -34,10 +34,12 @@ class MultiHeadAttentionSpec(model_spec.LayerSpec):
         sliding_window=None,
         qk_norm=False,
         qk_norm_rms=True,
+        has_norm=True,
     ):
         self.queries_scale = model_spec.OPTIONAL

-
+        if has_norm:
+            self.layer_norm = common_spec.LayerNormSpec(rms_norm=rms_norm)
         self.linear = [
             common_spec.LinearSpec() for _ in range(2 if self_attention else 3)
         ]
ctranslate2/specs/transformer_spec.py
CHANGED

@@ -23,6 +23,16 @@ class TransformerEncoderSpec(model_spec.LayerSpec):
         ffn_glu: bool = False,
         rms_norm: bool = False,
         multi_query_attention: bool = False,
+        num_heads_kv: Optional[int] = None,
+        head_dim: Optional[int] = None,
+        rotary_dim: Optional[int] = None,
+        rotary_interleave: bool = True,
+        rotary_scaling_type: Optional[attention_spec.RotaryScalingType] = None,
+        rotary_scaling_factor: float = 1,
+        rotary_base: float = 10000,
+        sliding_window: Optional[int] = None,
+        qk_norm: Optional[bool] = False,
+        pre_post_layer_norm: bool = False,
     ):
         """Initializes a Transformer encoder specification.


@@ -43,8 +53,28 @@ class TransformerEncoderSpec(model_spec.LayerSpec):
           ffn_glu: Use gated linear units in the FFN layers as described in
             https://arxiv.org/abs/2002.05202.
           rms_norm: Use the root mean square layer normalization.
-          multi_query_attention: Use multi-query attention.
+          multi_query_attention: Use multi-query attention (alias for num_heads_kv=1).
+          num_heads_kv: Number of attention heads for the key and value.
+          head_dim: Number of dimensions per attention head.
+          rotary_dim: Apply rotary embeddings to these first N dimensions. If 0, rotary
+            embeddings are applied to all dimensions.
+          rotary_interleave: Interleave the head dimensions when rotary embeddings are applied.
+            Otherwise the head dimensions are sliced in half.
+          rotary_scaling_type: Type of RoPE scaling.
+          rotary_scaling_factor: Factor used in the RoPE scaling.
+          rotary_base: The base period of the rotary embeddings.
+          sliding_window: Max sequence length to retain in KV Cache.
+          qk_norm: Apply layer normalization to the query and key projections.
+          pre_post_layer_norm: Add post layer norm for each pre norm layer.
         """
+
+        if multi_query_attention:
+            if num_heads_kv is not None and num_heads_kv != 1:
+                raise ValueError(
+                    "Enabling multi_query_attention implies num_heads_kv=1"
+                )
+            num_heads_kv = 1
+
         self.multi_query_attention = multi_query_attention
         self.num_heads = np.dtype("int16").type(num_heads)
         self.pre_norm = pre_norm

@@ -60,13 +90,24 @@ class TransformerEncoderSpec(model_spec.LayerSpec):
         self.layer_norm = common_spec.LayerNormSpec(rms_norm=rms_norm)
         if layernorm_embedding:
             self.layernorm_embedding = common_spec.LayerNormSpec(rms_norm=rms_norm)
+        if sliding_window is not None:
+            self.sliding_window = np.dtype("int32").type(sliding_window)
+
         self.layer = [
             TransformerEncoderLayerSpec(
                 relative_position=relative_position,
                 relative_attention_bias=relative_attention_bias,
                 ffn_glu=ffn_glu,
                 rms_norm=rms_norm,
-                num_heads_kv=
+                num_heads_kv=num_heads_kv,
+                head_dim=head_dim,
+                rotary_dim=rotary_dim,
+                rotary_interleave=rotary_interleave,
+                rotary_scaling_type=rotary_scaling_type,
+                rotary_scaling_factor=rotary_scaling_factor,
+                rotary_base=rotary_base,
+                qk_norm=qk_norm,
+                pre_post_layer_norm=pre_post_layer_norm,
             )
             for _ in range(num_layers)
         ]

@@ -109,7 +150,8 @@ class TransformerDecoderSpec(model_spec.LayerSpec):
         quant_type: Optional[common_spec.Quantization] = None,
         quant_group_size: Optional[int] = None,
         quant_bits: Optional[int] = None,
-        qk_norm:
+        qk_norm: bool = False,
+        external_pre_post_encoder_layers: Optional[bool] = False,
     ):
         """Initializes a Transformer decoder specification.


@@ -156,6 +198,8 @@ class TransformerDecoderSpec(model_spec.LayerSpec):
           quant_type: quantization type used (like awq... for lower bit quantization)
           quant_group_size: group size of the lower bit quantization
           quant_bits: number of bit of the quantization (ex: 4bit)
+          external_pre_post_encoder_layers: if the encoder attention pre and processing
+            is done outside the attention.
         """

         self._config = dict()

@@ -172,12 +216,6 @@ class TransformerDecoderSpec(model_spec.LayerSpec):
             )
             num_heads_kv = 1

-        if with_encoder_attention and num_heads_kv not in (None, 1, num_heads):
-            raise ValueError(
-                "num_heads_kv=%d is not supported in the cross-attention layers"
-                % num_heads_kv
-            )
-
         self.num_heads = np.dtype("int16").type(num_heads)
         self.pre_norm = pre_norm
         self.activation = np.dtype("int8").type(activation)

@@ -224,6 +262,7 @@ class TransformerDecoderSpec(model_spec.LayerSpec):
                 head_dim=head_dim,
                 sliding_window=sliding_window,
                 qk_norm=qk_norm,
+                external_pre_post_encoder_layers=external_pre_post_encoder_layers,
             )
             for _ in range(num_layers)
         ]

@@ -254,7 +293,15 @@ class TransformerEncoderLayerSpec(model_spec.LayerSpec):
         ffn_glu=False,
         rms_norm=False,
         num_heads_kv=None,
+        head_dim=None,
         sliding_window=None,
+        rotary_dim: Optional[int] = None,
+        rotary_interleave: bool = True,
+        rotary_scaling_type: Optional[attention_spec.RotaryScalingType] = None,
+        rotary_scaling_factor: float = 1,
+        rotary_base: float = 10000,
+        qk_norm=False,
+        pre_post_layer_norm: bool = False,
     ):
         self.self_attention = attention_spec.MultiHeadAttentionSpec(
             self_attention=True,

@@ -262,10 +309,32 @@ class TransformerEncoderLayerSpec(model_spec.LayerSpec):
             relative_attention_bias=relative_attention_bias,
             rms_norm=rms_norm,
             num_heads_kv=num_heads_kv,
+            head_dim=head_dim,
             sliding_window=sliding_window,
+            rotary_dim=rotary_dim,
+            rotary_interleave=rotary_interleave,
+            rotary_scaling_type=rotary_scaling_type,
+            rotary_scaling_factor=rotary_scaling_factor,
+            rotary_base=rotary_base,
+            qk_norm=qk_norm,
         )
         self.ffn = FeedForwardSpec(glu=ffn_glu, rms_norm=rms_norm)

+        if pre_post_layer_norm:
+            self.input_layer_norm = common_spec.LayerNormSpec(rms_norm=rms_norm)
+            self.post_attention_layer_norm = common_spec.LayerNormSpec(
+                rms_norm=rms_norm
+            )
+            self.pre_feedforward_layer_norm = common_spec.LayerNormSpec(
+                rms_norm=rms_norm
+            )
+            self.post_feedforward_layer_norm = common_spec.LayerNormSpec(
+                rms_norm=rms_norm
+            )
+
+            delattr(self.self_attention, "layer_norm")
+            delattr(self.ffn, "layer_norm")
+

 class TransformerDecoderLayerSpec(model_spec.LayerSpec):
     def __init__(

@@ -289,6 +358,7 @@ class TransformerDecoderLayerSpec(model_spec.LayerSpec):
         head_dim=None,
         sliding_window=None,
         qk_norm=False,
+        external_pre_post_encoder_layers=False,
     ):
         self.self_attention = attention_spec.MultiHeadAttentionSpec(
             self_attention=True,

@@ -312,8 +382,10 @@ class TransformerDecoderLayerSpec(model_spec.LayerSpec):
         self.attention = attention_spec.MultiHeadAttentionSpec(
             rms_norm=rms_norm,
             num_heads_kv=num_heads_kv,
+            head_dim=head_dim,
             sliding_window=sliding_window,
             qk_norm=qk_norm,
+            has_norm=external_pre_post_encoder_layers is False,
         )

         self.ffn = FeedForwardSpec(glu=ffn_glu, rms_norm=rms_norm)

@@ -329,10 +401,21 @@ class TransformerDecoderLayerSpec(model_spec.LayerSpec):
             delattr(self.ffn, "layer_norm")

         if pre_post_layer_norm:
+            # Self-attention layer norms
             self.input_layer_norm = common_spec.LayerNormSpec(rms_norm=rms_norm)
             self.post_attention_layer_norm = common_spec.LayerNormSpec(
                 rms_norm=rms_norm
             )
+
+            if with_encoder_attention and external_pre_post_encoder_layers:
+                self.external_post_encoder_attention_layer_norm = (
+                    common_spec.LayerNormSpec(rms_norm=rms_norm)
+                )
+                self.external_pre_encoder_attention_layer_norm = (
+                    common_spec.LayerNormSpec(rms_norm=rms_norm)
+                )
+
+            # Feed-forward layer norms
             self.pre_feedforward_layer_norm = common_spec.LayerNormSpec(
                 rms_norm=rms_norm
             )

@@ -562,7 +645,7 @@ class TransformerDecoderModelSpec(model_spec.LanguageModelSpec):
         quant_type: Optional[common_spec.Quantization] = None,
         quant_group_size: Optional[int] = None,
         quant_bits: Optional[int] = None,
-        qk_norm:
+        qk_norm: bool = False,
     ):
         """Creates a Transformer decoder model specification.

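Most of these encoder-side additions mirror options the decoder spec already had; they are what lets the T5Gemma encoder above request grouped-query attention, rotary embeddings and pre/post layer norms. A hedged construction sketch with arbitrary sizes (the keyword names come from the diff, the values are illustrative):

from ctranslate2.specs import transformer_spec

# Purely illustrative values; a converter would derive them from the model config.
encoder = transformer_spec.TransformerEncoderSpec(
    2,  # num_layers
    8,  # num_heads
    ffn_glu=True,
    rms_norm=True,
    num_heads_kv=4,
    head_dim=64,
    rotary_dim=64,
    rotary_interleave=False,
    rotary_base=10000,
    pre_post_layer_norm=True,
)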
ctranslate2/version.py
CHANGED

{ctranslate2-4.6.2.dist-info → ctranslate2-4.6.3.dist-info}/METADATA
CHANGED

@@ -1,9 +1,10 @@
 Metadata-Version: 2.4
 Name: ctranslate2
-Version: 4.6.
+Version: 4.6.3
 Summary: Fast inference engine for Transformer models
 Home-page: https://opennmt.net
 Author: OpenNMT
+License: MIT
 Project-URL: Documentation, https://opennmt.net/CTranslate2
 Project-URL: Forum, https://forum.opennmt.net
 Project-URL: Gitter, https://gitter.im/OpenNMT/CTranslate2

@@ -13,7 +14,6 @@ Classifier: Development Status :: 5 - Production/Stable
 Classifier: Environment :: GPU :: NVIDIA CUDA :: 12 :: 12.4
 Classifier: Intended Audience :: Developers
 Classifier: Intended Audience :: Science/Research
-Classifier: License :: OSI Approved :: MIT License
 Classifier: Programming Language :: Python :: 3
 Classifier: Programming Language :: Python :: 3 :: Only
 Classifier: Programming Language :: Python :: 3.9

@@ -34,6 +34,7 @@ Dynamic: description
 Dynamic: description-content-type
 Dynamic: home-page
 Dynamic: keywords
+Dynamic: license
 Dynamic: project-url
 Dynamic: requires-dist
 Dynamic: requires-python

@@ -49,7 +50,7 @@ The project implements a custom runtime that applies many performance optimizati

 The following model types are currently supported:

-* Encoder-decoder models: Transformer base/big, M2M-100, NLLB, BART, mBART, Pegasus, T5, Whisper
+* Encoder-decoder models: Transformer base/big, M2M-100, NLLB, BART, mBART, Pegasus, T5, Whisper T5Gemma
 * Decoder-only models: GPT-2, GPT-J, GPT-NeoX, OPT, BLOOM, MPT, Llama, Mistral, Gemma, CodeGen, GPTBigCode, Falcon, Qwen2
 * Encoder-only models: BERT, DistilBERT, XLM-RoBERTa


@@ -160,6 +161,16 @@ Executed with 4 threads on a [*c5.2xlarge*](https://aws.amazon.com/ec2/instance-

 Executed with CUDA 11 on a [*g5.xlarge*](https://aws.amazon.com/ec2/instance-types/g5/) Amazon EC2 instance equipped with a NVIDIA A10G GPU (driver version: 510.47.03).

+## Contributing
+
+CTranslate2 is a community-driven project. We welcome contributions of all kinds:
+* **New Model Support:** Help us implement more Transformer architectures.
+* **Performance:** Propose optimizations for CPU or GPU kernels.
+* **Bug Reports:** Open an issue if you find something not working as expected.
+* **Documentation:** Improve our guides or add new examples.
+
+Check out our [Contributing Guide](CONTRIBUTING.md) to learn how to set up your development environment.
+
 ## Additional resources

 * [Documentation](https://opennmt.net/CTranslate2)
{ctranslate2-4.6.2.dist-info → ctranslate2-4.6.3.dist-info}/RECORD
CHANGED

@@ -1,33 +1,33 @@
 ctranslate2/__init__.py,sha256=CGqShDaFxQ-u-aCtVq99T4HKuBdMB8b49l2KSxnQb8M,1735
-ctranslate2/_ext.cp314t-win_amd64.pyd,sha256=
-ctranslate2/ctranslate2.dll,sha256=
-ctranslate2/cudnn64_9.dll,sha256=
-ctranslate2/extensions.py,sha256=
+ctranslate2/_ext.cp314t-win_amd64.pyd,sha256=SJzyI2umi8fNXsvbshO65rCof-2B7QCfPf0t44_NEGY,775168
+ctranslate2/ctranslate2.dll,sha256=9TRGHvoyNSzXe9eEc3jKJa5-21-AeSENgp3DXvhCZ4M,58590720
+ctranslate2/cudnn64_9.dll,sha256=ntvN_3OwrwcOsWCyzmbln-ygSqAXNR2O7cxejhSZZ9I,266288
+ctranslate2/extensions.py,sha256=kDNt0H9KvfNCc3PrRGzfkj9Fkvna84i2O5Y-rav6UkU,21940
 ctranslate2/libiomp5md.dll,sha256=mCIzNmsK_NoeD1WgsTQJfjW3eWE_VN22nmhebNBrdV8,1614192
 ctranslate2/logging.py,sha256=P9evHdxuMx_iHvwJjEASEq-j5062H64Pl5-fJjxEuHk,1221
-ctranslate2/version.py,sha256=
+ctranslate2/version.py,sha256=TboXlbA67GNmSOm1v2u_U8AKgYh5iminMMLTvi3Xho4,53
 ctranslate2/converters/__init__.py,sha256=ufYjcXf2sK4fiXAUU6tIJyWmNuLjKFf_KH3GWLXe4ls,507
 ctranslate2/converters/converter.py,sha256=Qkb8NGLLmgqMT6HZkFq61zwbxyq3NlWcaxLZ6Ap-YOQ,3601
 ctranslate2/converters/eole_ct2.py,sha256=RUcDJH_2AUt0jDs5oAqccE6tQPbO9LQ6JmVriC1DTy8,12564
-ctranslate2/converters/fairseq.py,sha256=
+ctranslate2/converters/fairseq.py,sha256=2vlBk4AVCHwXxKkwPHVmcjyfo1dAV0_DJS1i6q-44NE,12822
 ctranslate2/converters/marian.py,sha256=1_7P3EbIDPOdyJbtb_Lp-LCBPBb9A8E9OhzoyFwTb64,11274
 ctranslate2/converters/openai_gpt2.py,sha256=1rXKM2ZURZHWRv4XZ135fPkVWpM4rTG-q7VR7OD6d-A,3304
-ctranslate2/converters/opennmt_py.py,sha256=
+ctranslate2/converters/opennmt_py.py,sha256=zex4TbHiiJMy0tkqQg39oNjxmSZKf8dnRLH3iQ1H4z0,13227
 ctranslate2/converters/opennmt_tf.py,sha256=uBRp2wz5xriSQcA_c0S0ekY7ws6RpRX_0EKeMRdM7-s,16222
 ctranslate2/converters/opus_mt.py,sha256=5KbPaTiBhhorPzMpTugIfIJ8SgcqHfJUbJrWKBN-Djs,1254
-ctranslate2/converters/transformers.py,sha256=
+ctranslate2/converters/transformers.py,sha256=VRal3vKSQrAOvcNPwewjVMtgvWskz0KD5bdIrpNrZNA,142380
 ctranslate2/converters/utils.py,sha256=w7NG39lx-9dOdL57OqKVTdC__opkuP8RACg1TLlUJwM,3817
 ctranslate2/models/__init__.py,sha256=53p98uemtuvVPz8xK7_LbOhBiUJJu-c-NdmOHJgdXus,497
 ctranslate2/specs/__init__.py,sha256=9GabtSyczznYqiqUS6XvULi8pQ3_3RNRogXobGP0G80,653
-ctranslate2/specs/attention_spec.py,sha256=
+ctranslate2/specs/attention_spec.py,sha256=FnaSiQREWQw_cURgsCb9_aIpGOCxyVGTCpIOdd-08v8,3492
 ctranslate2/specs/common_spec.py,sha256=freTDhQMy5PYofBrij4_FDgrKokMYApWSPIpASZIlJc,1608
 ctranslate2/specs/model_spec.py,sha256=atCAYzDEIzyJ1TCayFGZVutHqSWa1ww-vbZ0OiIJqh8,25736
-ctranslate2/specs/transformer_spec.py,sha256
+ctranslate2/specs/transformer_spec.py,sha256=-GJ0oSjI3ns-Ei_-xXIM_P2GaZxt5Z-g03zJ0m_4ciU,34317
 ctranslate2/specs/wav2vec2_spec.py,sha256=NITsuOuf2F5bU1-aXit8-WEtWV9fH2Eq7A7857UyYho,2106
 ctranslate2/specs/wav2vec2bert_spec.py,sha256=UgtsJWC9mMgJ7bn4T_xg1uXK0rqA4-9tT2KMGVgPKnw,3529
 ctranslate2/specs/whisper_spec.py,sha256=_vm1sc5yOowOJ4iyvcxMXrgt-UcLJrZT8OtPscUXcQQ,2447
-ctranslate2-4.6.
-ctranslate2-4.6.
-ctranslate2-4.6.
-ctranslate2-4.6.
-ctranslate2-4.6.
+ctranslate2-4.6.3.dist-info/METADATA,sha256=awoc6t4JSxpv51lmfAG28ZG91FhGQ8DHspyLzLqLo_Q,10839
+ctranslate2-4.6.3.dist-info/WHEEL,sha256=IxxXYqBIlKEzFy9ulBJ928Gdqg6XQ3DHti4avqq3myk,102
+ctranslate2-4.6.3.dist-info/entry_points.txt,sha256=ZHkojut_TmVRHl0bJIGm2b9wqr98GAJqxN9rlJtQshs,466
+ctranslate2-4.6.3.dist-info/top_level.txt,sha256=1hUaWzcFIuSo2BAIUHFA3Osgsu6S1giq0y6Rosv8HOQ,12
+ctranslate2-4.6.3.dist-info/RECORD,,

{ctranslate2-4.6.2.dist-info → ctranslate2-4.6.3.dist-info}/WHEEL
File without changes

{ctranslate2-4.6.2.dist-info → ctranslate2-4.6.3.dist-info}/entry_points.txt
File without changes

{ctranslate2-4.6.2.dist-info → ctranslate2-4.6.3.dist-info}/top_level.txt
File without changes