ctranslate2 4.6.2-cp314-cp314t-win_amd64.whl → 4.6.3-cp314-cp314t-win_amd64.whl

This diff compares the contents of two publicly released versions of the package, as they appear in their respective public registries. The information is provided for informational purposes only.
Binary file
ctranslate2/converters/fairseq.py CHANGED
@@ -146,7 +146,9 @@ class FairseqConverter(Converter):
         import_user_module(argparse.Namespace(user_dir=self._user_dir))

         with torch.no_grad():
-            checkpoint = checkpoint_utils.load_checkpoint_to_cpu(self._model_path)
+            checkpoint = torch.load(
+                self._model_path, map_location=torch.device("cpu"), weights_only=False
+            )
             args = checkpoint["args"] or checkpoint["cfg"]["model"]

             args.data = self._data_dir
ctranslate2/converters/opennmt_py.py CHANGED
@@ -174,7 +174,9 @@ class OpenNMTPyConverter(Converter):
     def _load(self):
         import torch

-        checkpoint = torch.load(self._model_path, map_location="cpu")
+        checkpoint = torch.load(
+            self._model_path, map_location="cpu", weights_only=False
+        )

         src_vocabs, tgt_vocabs = get_vocabs(checkpoint["vocab"])

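
Both converter fixes above address the same upstream change: PyTorch 2.6 flipped the default of torch.load() to weights_only=True, which refuses to unpickle arbitrary Python objects such as the argparse.Namespace that fairseq and OpenNMT-py checkpoints embed. A minimal sketch of the failure and the opt-out, assuming a trusted local checkpoint:

    import argparse

    import torch

    # A checkpoint that embeds a non-tensor Python object, as fairseq and
    # OpenNMT-py checkpoints do.
    checkpoint = {"args": argparse.Namespace(arch="transformer"), "model": {}}
    torch.save(checkpoint, "checkpoint.pt")

    try:
        # PyTorch >= 2.6 defaults to weights_only=True, which rejects
        # argparse.Namespace during unpickling.
        torch.load("checkpoint.pt")
    except Exception as err:
        print("default load failed:", type(err).__name__)

    # Opting out restores the old behavior; only do this for trusted files.
    restored = torch.load("checkpoint.pt", map_location="cpu", weights_only=False)
    print(restored["args"].arch)
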
ctranslate2/converters/transformers.py CHANGED
@@ -89,7 +89,7 @@ class TransformersConverter(Converter):
           copy_files: List of filenames to copy from the Hugging Face model to the
             converted model directory.
           load_as_float16: Load the model weights as float16. More precisely, the model
-            will be loaded with ``from_pretrained(..., torch_dtype=torch.float16)``.
+            will be loaded with ``from_pretrained(..., dtype=torch.float16)``.
           revision: Revision of the model to download from the Hugging Face Hub.
           low_cpu_mem_usage: Enable the flag ``low_cpu_mem_usage`` when loading the model
             with ``from_pretrained``.
@@ -123,10 +123,11 @@ class TransformersConverter(Converter):
             tokenizer_class = transformers.AutoTokenizer

         kwargs = {
-            "torch_dtype": (
+            "dtype": (
                 torch.float16
                 if self._load_as_float16
-                else getattr(config, "torch_dtype", None)
+                else getattr(config, "dtype", None)
+                or getattr(config, "torch_dtype", None)
             )
         }

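
Both hunks above track a keyword rename in transformers: newer releases accept dtype in from_pretrained() and deprecate torch_dtype, and newer configs expose config.dtype instead of config.torch_dtype. A sketch of the fallback chain, assuming a transformers release that already accepts the dtype keyword (the model name is illustrative):

    import torch
    import transformers

    config = transformers.AutoConfig.from_pretrained("gpt2")

    # Mirror the converter: prefer the new attribute, fall back to the
    # legacy one when it is absent or None.
    dtype = getattr(config, "dtype", None) or getattr(config, "torch_dtype", None)

    model = transformers.AutoModelForCausalLM.from_pretrained(
        "gpt2", dtype=dtype or torch.float32
    )
    print(next(model.parameters()).dtype)
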
@@ -235,7 +236,7 @@ class ModelLoader(abc.ABC):

         if isinstance(module, transformers.Conv1D):
             spec.weight = spec.weight.transpose(0, 1)
-        if module.bias is not None:
+        if hasattr(module, "bias") and module.bias is not None:
             spec.bias = module.bias

     def set_embeddings(self, spec, module):
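
The added hasattr() guard matters because not every module exposes a bias attribute: torch.nn.Linear always has one (possibly None), but replacement linear classes used by some quantized checkpoints define no bias attribute at all, so module.bias would raise AttributeError. A stand-in illustration (PackedLinear is hypothetical):

    import torch


    class PackedLinear(torch.nn.Module):  # hypothetical bias-less linear
        def __init__(self):
            super().__init__()
            self.weight = torch.nn.Parameter(torch.zeros(4, 4))


    module = PackedLinear()
    if hasattr(module, "bias") and module.bias is not None:
        print("copying bias")
    else:
        print("no bias to copy")  # reached without raising AttributeError
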
@@ -2182,6 +2183,28 @@ class Qwen2Loader(ModelLoader):
         rotary_scaling_type = None
         rotary_scaling_factor = 1

+        # Check for AWQ quantization config
+        quantization_config = getattr(model.config, "quantization_config", None)
+        if quantization_config:
+            quant_type = None
+            if quantization_config.quant_method == "awq":
+                quant_type = _SUPPORTED_QUANTIZATION.get(quantization_config.version)
+            if quant_type is None:
+                raise NotImplementedError(
+                    "Quantization type '%s' is not yet implemented. "
+                    "The following Quantization types are currently supported: %s"
+                    % (
+                        quantization_config.quant_method,
+                        ", ".join(_SUPPORTED_QUANTIZATION.keys()),
+                    )
+                )
+            quant_group_size = quantization_config.group_size
+            quant_bits = quantization_config.bits
+        else:
+            quant_type = common_spec.Quantization.CT2
+            quant_group_size = None
+            quant_bits = None
+
         spec = transformer_spec.TransformerDecoderModelSpec.from_config(
             num_layers,
             num_heads,
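
For context, an AWQ checkpoint exported for transformers carries a quantization_config block in config.json; the fields read above are quant_method, version, bits, and group_size. A representative block and the same lookup against a stand-in table (both illustrative, not taken from a real model):

    from types import SimpleNamespace

    # Stand-in for the converter's _SUPPORTED_QUANTIZATION table, which maps
    # AWQ "version" strings to common_spec.Quantization members.
    _SUPPORTED = {"gemm": "AWQ_GEMM", "gemv": "AWQ_GEMV"}

    # Shape of a typical AWQ quantization_config (illustrative values).
    qcfg = SimpleNamespace(quant_method="awq", version="gemm", bits=4, group_size=128)

    quant_type = _SUPPORTED.get(qcfg.version) if qcfg.quant_method == "awq" else None
    if quant_type is None:
        raise NotImplementedError("unsupported quantization: %s" % qcfg.quant_method)
    print(quant_type, qcfg.bits, qcfg.group_size)
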
@@ -2195,9 +2218,12 @@ class Qwen2Loader(ModelLoader):
             rotary_scaling_factor=rotary_scaling_factor,
             rotary_base=getattr(model.config, "rope_theta", 10000),
             num_heads_kv=num_heads_kv,
+            quant_type=quant_type,
+            quant_group_size=quant_group_size,
+            quant_bits=quant_bits,
         )

-        self.set_decoder(spec.decoder, model.model)
+        self.set_decoder(spec.decoder, model.model, quant_type)
         self.set_linear(spec.decoder.projection, model.lm_head)
         return spec

@@ -2227,7 +2253,7 @@ class Qwen2Loader(ModelLoader):
     def set_layer_norm(self, spec, layer_norm):
         spec.gamma = layer_norm.weight

-    def set_decoder(self, spec, module):
+    def set_decoder(self, spec, module, quant_type=common_spec.Quantization.CT2):
         spec.scale_embeddings = False
         self.set_embeddings(spec.embeddings, module.embed_tokens)
         self.set_layer_norm(spec.layer_norm, module.norm)
@@ -2241,19 +2267,39 @@ class Qwen2Loader(ModelLoader):
             )

             split_layers = [common_spec.LinearSpec() for _ in range(3)]
-            self.set_linear(split_layers[0], layer.self_attn.q_proj)
-            self.set_linear(split_layers[1], layer.self_attn.k_proj)
-            self.set_linear(split_layers[2], layer.self_attn.v_proj)
+            self.set_linear(
+                split_layers[0], layer.self_attn.q_proj, quant_type=quant_type
+            )
+            self.set_linear(
+                split_layers[1], layer.self_attn.k_proj, quant_type=quant_type
+            )
+            self.set_linear(
+                split_layers[2], layer.self_attn.v_proj, quant_type=quant_type
+            )
+
+            if quant_type == common_spec.Quantization.CT2:
+                utils.fuse_linear(layer_spec.self_attention.linear[0], split_layers)
+            else:
+                cc_dim = 1 if quant_type == common_spec.Quantization.AWQ_GEMM else 0
+                utils.fuse_linear_prequant(
+                    layer_spec.self_attention.linear[0], split_layers, cc_dim
+                )

-            utils.fuse_linear(layer_spec.self_attention.linear[0], split_layers)
             self.set_linear(
                 layer_spec.self_attention.linear[1],
                 layer.self_attn.o_proj,
+                quant_type=quant_type,
             )

-            self.set_linear(layer_spec.ffn.linear_0, layer.mlp.gate_proj)
-            self.set_linear(layer_spec.ffn.linear_0_noact, layer.mlp.up_proj)
-            self.set_linear(layer_spec.ffn.linear_1, layer.mlp.down_proj)
+            self.set_linear(
+                layer_spec.ffn.linear_0, layer.mlp.gate_proj, quant_type=quant_type
+            )
+            self.set_linear(
+                layer_spec.ffn.linear_0_noact, layer.mlp.up_proj, quant_type=quant_type
+            )
+            self.set_linear(
+                layer_spec.ffn.linear_1, layer.mlp.down_proj, quant_type=quant_type
+            )

             delattr(layer, "self_attn")
             delattr(layer, "mlp")
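
The cc_dim switch reflects the tensor layouts involved: CT2-managed weights are [out_features, in_features], so the fused QKV projection concatenates along dim 0, while AWQ GEMM stores packed qweights as [in_features, out_features // pack_factor], putting the output axis on dim 1. A toy illustration of the flipped concatenation axis:

    import torch

    in_features, out_features = 8, 4

    # Regular layout: one [out, in] matrix per projection -> concat on dim 0.
    q = torch.randn(out_features, in_features)
    k = torch.randn(out_features, in_features)
    v = torch.randn(out_features, in_features)
    print(torch.cat([q, k, v], dim=0).shape)  # torch.Size([12, 8])

    # AWQ GEMM layout: packed [in, out // pack] int tensors -> concat on dim 1.
    pack_factor = 2
    q_packed = torch.zeros(in_features, out_features // pack_factor, dtype=torch.int32)
    k_packed = torch.zeros_like(q_packed)
    v_packed = torch.zeros_like(q_packed)
    print(torch.cat([q_packed, k_packed, v_packed], dim=1).shape)  # torch.Size([8, 6])
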
@@ -2292,6 +2338,28 @@ class Qwen3Loader(ModelLoader):
         rotary_scaling_type = None
         rotary_scaling_factor = 1

+        # Check for AWQ quantization config
+        quantization_config = getattr(model.config, "quantization_config", None)
+        if quantization_config:
+            quant_type = None
+            if quantization_config.quant_method == "awq":
+                quant_type = _SUPPORTED_QUANTIZATION.get(quantization_config.version)
+            if quant_type is None:
+                raise NotImplementedError(
+                    "Quantization type '%s' is not yet implemented. "
+                    "The following Quantization types are currently supported: %s"
+                    % (
+                        quantization_config.quant_method,
+                        ", ".join(_SUPPORTED_QUANTIZATION.keys()),
+                    )
+                )
+            quant_group_size = quantization_config.group_size
+            quant_bits = quantization_config.bits
+        else:
+            quant_type = common_spec.Quantization.CT2
+            quant_group_size = None
+            quant_bits = None
+
         spec = transformer_spec.TransformerDecoderModelSpec.from_config(
             num_layers,
             num_heads,
@@ -2307,9 +2375,12 @@ class Qwen3Loader(ModelLoader):
             num_heads_kv=num_heads_kv,
             head_dim=head_dim,
             qk_norm=True,
+            quant_type=quant_type,
+            quant_group_size=quant_group_size,
+            quant_bits=quant_bits,
         )

-        self.set_decoder(spec.decoder, model.model)
+        self.set_decoder(spec.decoder, model.model, quant_type)
         self.set_linear(spec.decoder.projection, model.lm_head)
         return spec

@@ -2338,7 +2409,7 @@ class Qwen3Loader(ModelLoader):
     def set_layer_norm(self, spec, layer_norm):
         spec.gamma = layer_norm.weight

-    def set_decoder(self, spec, module):
+    def set_decoder(self, spec, module, quant_type=common_spec.Quantization.CT2):
         spec.scale_embeddings = False
         self.set_embeddings(spec.embeddings, module.embed_tokens)
         self.set_layer_norm(spec.layer_norm, module.norm)
@@ -2359,22 +2430,43 @@ class Qwen3Loader(ModelLoader):
             )

             split_layers = [common_spec.LinearSpec() for _ in range(3)]
-            self.set_linear(split_layers[0], layer.self_attn.q_proj)
-            self.set_linear(split_layers[1], layer.self_attn.k_proj)
-            self.set_linear(split_layers[2], layer.self_attn.v_proj)
-            utils.fuse_linear(layer_spec.self_attention.linear[0], split_layers)
+            self.set_linear(
+                split_layers[0], layer.self_attn.q_proj, quant_type=quant_type
+            )
+            self.set_linear(
+                split_layers[1], layer.self_attn.k_proj, quant_type=quant_type
+            )
+            self.set_linear(
+                split_layers[2], layer.self_attn.v_proj, quant_type=quant_type
+            )
+
+            if quant_type == common_spec.Quantization.CT2:
+                utils.fuse_linear(layer_spec.self_attention.linear[0], split_layers)
+            else:
+                cc_dim = 1 if quant_type == common_spec.Quantization.AWQ_GEMM else 0
+                utils.fuse_linear_prequant(
+                    layer_spec.self_attention.linear[0], split_layers, cc_dim
+                )

             self.set_linear(
                 layer_spec.self_attention.linear[1],
                 layer.self_attn.o_proj,
+                quant_type=quant_type,
             )

-            self.set_linear(layer_spec.ffn.linear_0, layer.mlp.gate_proj)
-            self.set_linear(layer_spec.ffn.linear_0_noact, layer.mlp.up_proj)
-            self.set_linear(layer_spec.ffn.linear_1, layer.mlp.down_proj)
+            self.set_linear(
+                layer_spec.ffn.linear_0, layer.mlp.gate_proj, quant_type=quant_type
+            )
+            self.set_linear(
+                layer_spec.ffn.linear_0_noact, layer.mlp.up_proj, quant_type=quant_type
+            )
+            self.set_linear(
+                layer_spec.ffn.linear_1, layer.mlp.down_proj, quant_type=quant_type
+            )

             delattr(layer, "self_attn")
             delattr(layer, "mlp")
+            gc.collect()


 @register_loader("MixFormerSequentialConfig")
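
The gc.collect() added at the end of the loop pairs with the existing delattr() calls: each source layer is released as soon as its weights have been copied into the spec, keeping peak memory closer to one layer than to the whole model. A sketch of the pattern:

    import gc

    import torch


    class Layer(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.mlp = torch.nn.Linear(1024, 1024)


    layer = Layer()
    converted = layer.mlp.weight.detach().clone()  # "copy into the spec"
    delattr(layer, "mlp")  # drop the module's reference to the weights
    gc.collect()  # reclaim promptly instead of waiting for the collector
    print(converted.shape)
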
@@ -2514,6 +2606,28 @@ class Phi3Loader(ModelLoader):
         rotary_scaling_type = None
         rotary_scaling_factor = 1

+        # Check for AWQ quantization config
+        quantization_config = getattr(model.config, "quantization_config", None)
+        if quantization_config:
+            quant_type = None
+            if quantization_config.quant_method == "awq":
+                quant_type = _SUPPORTED_QUANTIZATION.get(quantization_config.version)
+            if quant_type is None:
+                raise NotImplementedError(
+                    "Quantization type '%s' is not yet implemented. "
+                    "The following Quantization types are currently supported: %s"
+                    % (
+                        quantization_config.quant_method,
+                        ", ".join(_SUPPORTED_QUANTIZATION.keys()),
+                    )
+                )
+            quant_group_size = quantization_config.group_size
+            quant_bits = quantization_config.bits
+        else:
+            quant_type = common_spec.Quantization.CT2
+            quant_group_size = None
+            quant_bits = None
+
         spec = transformer_spec.TransformerDecoderModelSpec.from_config(
             num_layers,
             num_heads,
@@ -2529,9 +2643,12 @@ class Phi3Loader(ModelLoader):
             original_max_position_embeddings=original_max_position_embeddings,
             max_position_embeddings=max_position_embeddings,
             num_heads_kv=num_heads_kv,
+            quant_type=quant_type,
+            quant_group_size=quant_group_size,
+            quant_bits=quant_bits,
         )

-        self.set_decoder(spec.decoder, model.model)
+        self.set_decoder(spec.decoder, model.model, quant_type)
         self.set_linear(spec.decoder.projection, model.lm_head)
         return spec

@@ -2565,7 +2682,7 @@ class Phi3Loader(ModelLoader):
             rotary_scaling_short_factor, dtype=torch.float32
         )

-    def set_decoder(self, spec, module):
+    def set_decoder(self, spec, module, quant_type=common_spec.Quantization.CT2):
         spec.scale_embeddings = False
         self.set_embeddings(spec.embeddings, module.embed_tokens)
         self.set_layer_norm(spec.layer_norm, module.norm)
@@ -2579,9 +2696,15 @@ class Phi3Loader(ModelLoader):
             )

             self.set_linear(
-                layer_spec.self_attention.linear[0], layer.self_attn.qkv_proj
+                layer_spec.self_attention.linear[0],
+                layer.self_attn.qkv_proj,
+                quant_type=quant_type,
+            )
+            self.set_linear(
+                layer_spec.self_attention.linear[1],
+                layer.self_attn.o_proj,
+                quant_type=quant_type,
             )
-            self.set_linear(layer_spec.self_attention.linear[1], layer.self_attn.o_proj)
             if (
                 layer.self_attn.rotary_emb.long_factor is not None
                 and layer.self_attn.rotary_emb.short_factor is not None
@@ -2592,10 +2715,30 @@ class Phi3Loader(ModelLoader):
                     layer.self_attn.rotary_emb.short_factor,
                 )

-            gate_proj, up_proj = layer.mlp.gate_up_proj.weight.chunk(2, dim=0)
-            layer_spec.ffn.linear_0.weight = gate_proj
-            layer_spec.ffn.linear_0_noact.weight = up_proj
-            self.set_linear(layer_spec.ffn.linear_1, layer.mlp.down_proj)
+            # Handle gate_up_proj differently for AWQ vs regular models
+            if quant_type == common_spec.Quantization.CT2:
+                gate_proj, up_proj = layer.mlp.gate_up_proj.weight.chunk(2, dim=0)
+                layer_spec.ffn.linear_0.weight = gate_proj
+                layer_spec.ffn.linear_0_noact.weight = up_proj
+            else:
+                # AWQ: chunk qweight, scales, and qzeros
+                gate_qweight, up_qweight = layer.mlp.gate_up_proj.qweight.chunk(
+                    2, dim=1
+                )
+                gate_scales, up_scales = layer.mlp.gate_up_proj.scales.chunk(2, dim=1)
+                gate_qzeros, up_qzeros = layer.mlp.gate_up_proj.qzeros.chunk(2, dim=1)
+
+                layer_spec.ffn.linear_0.weight = gate_qweight
+                layer_spec.ffn.linear_0.weight_scale = gate_scales
+                layer_spec.ffn.linear_0.weight_zero = gate_qzeros
+
+                layer_spec.ffn.linear_0_noact.weight = up_qweight
+                layer_spec.ffn.linear_0_noact.weight_scale = up_scales
+                layer_spec.ffn.linear_0_noact.weight_zero = up_qzeros
+
+            self.set_linear(
+                layer_spec.ffn.linear_1, layer.mlp.down_proj, quant_type=quant_type
+            )

             delattr(layer, "self_attn")
             delattr(layer, "mlp")
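
Phi-3 fuses the gate and up projections into a single gate_up_proj. With float weights the layout is [2 * intermediate_size, hidden_size], so the split is chunk(2, dim=0); AWQ packs qweight, scales, and qzeros with the output axis on dim 1, so each tensor is chunked on dim=1 instead. Toy shapes:

    import torch

    hidden, intermediate = 8, 6

    w = torch.randn(2 * intermediate, hidden)  # float gate_up_proj layout
    gate_proj, up_proj = w.chunk(2, dim=0)
    print(gate_proj.shape, up_proj.shape)  # torch.Size([6, 8]) torch.Size([6, 8])

    pack_factor = 2  # illustrative AWQ packing factor
    qweight = torch.zeros(hidden, 2 * intermediate // pack_factor, dtype=torch.int32)
    gate_q, up_q = qweight.chunk(2, dim=1)
    print(gate_q.shape, up_q.shape)  # torch.Size([8, 3]) torch.Size([8, 3])
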
@@ -3325,3 +3468,266 @@ _WHISPER_ALIGNMENT_HEADS = {
         (25, 6),
     ],
 }
+
+
+# Paper: https://arxiv.org/pdf/2504.06225
+@register_loader("T5GemmaConfig")
+class T5GemmaLoader(ModelLoader):
+    @property
+    def architecture_name(self):
+        return "T5GemmaForConditionalGeneration"
+
+    def set_layer_norm(self, spec, layer_norm):
+        spec.gamma = layer_norm.weight.data + 1.0
+
+    def get_model_spec(self, model):
+        encoder_config = model.config.encoder
+        decoder_config = model.config.decoder
+        sliding_window = getattr(model.config, "sliding_window", 4096)
+
+        encoder_num_heads = encoder_config.num_attention_heads
+        encoder_num_heads_kv = getattr(
+            encoder_config, "num_key_value_heads", encoder_num_heads
+        )
+        if encoder_num_heads_kv == encoder_num_heads:
+            encoder_num_heads_kv = None
+
+        encoder = transformer_spec.TransformerEncoderSpec(
+            encoder_config.num_hidden_layers,
+            encoder_config.num_attention_heads,
+            pre_norm=True,
+            activation=_SUPPORTED_ACTIVATIONS[encoder_config.hidden_activation],
+            ffn_glu=True,
+            rms_norm=True,
+            rotary_dim=encoder_config.head_dim,
+            rotary_interleave=False,
+            rotary_base=getattr(encoder_config, "rope_theta", 10000),
+            sliding_window=sliding_window,
+            pre_post_layer_norm=True,
+            num_heads_kv=encoder_num_heads_kv,
+            head_dim=encoder_config.head_dim,
+        )
+
+        decoder_num_heads = decoder_config.num_attention_heads
+        decoder_num_heads_kv = getattr(
+            decoder_config, "num_key_value_heads", decoder_num_heads
+        )
+        if decoder_num_heads_kv == decoder_num_heads:
+            decoder_num_heads_kv = None
+
+        decoder = transformer_spec.TransformerDecoderSpec(
+            decoder_config.num_hidden_layers,
+            decoder_config.num_attention_heads,
+            pre_norm=True,
+            activation=_SUPPORTED_ACTIVATIONS[decoder_config.hidden_activation],
+            ffn_glu=True,
+            rms_norm=True,
+            with_encoder_attention=True,
+            rotary_dim=decoder_config.head_dim,
+            rotary_interleave=False,
+            rotary_base=getattr(decoder_config, "rope_theta", 10000),
+            sliding_window=sliding_window,
+            pre_post_layer_norm=True,
+            external_pre_post_encoder_layers=True,
+            num_heads_kv=decoder_num_heads_kv,
+            head_dim=decoder_config.head_dim,
+        )
+
+        spec = transformer_spec.TransformerSpec(encoder, decoder)
+
+        self.set_encoder(spec.encoder, model.model.encoder, encoder_config)
+
+        self.set_decoder(
+            spec.decoder,
+            model.model.decoder,
+            decoder_config,
+            common_spec.Quantization.CT2,
+        )
+
+        # Tie_word_embeddings
+        self.set_linear(spec.decoder.projection, model.model.decoder.embed_tokens)
+        return spec
+
+    def set_vocabulary(self, spec, tokens):
+        spec.register_source_vocabulary(tokens)
+        spec.register_target_vocabulary(tokens)
+
+    def set_config(self, config, model, tokenizer):
+        config.bos_token = tokenizer.bos_token
+        config.eos_token = tokenizer.eos_token
+        config.unk_token = tokenizer.unk_token
+
+        if hasattr(model.config, "encoder"):
+            config.layer_norm_epsilon = model.config.encoder.rms_norm_eps
+        elif hasattr(model.config, "rms_norm_eps"):
+            config.layer_norm_epsilon = model.config.rms_norm_eps
+        else:
+            config.layer_norm_epsilon = 1e-6
+
+        config.decoder_start_token = tokenizer.bos_token
+
+    def set_encoder(
+        self, spec, encoder, encoder_config, quant_type=common_spec.Quantization.CT2
+    ):
+        spec.scale_embeddings = True
+
+        encoder_emb_spec = (
+            spec.embeddings[0] if isinstance(spec.embeddings, list) else spec.embeddings
+        )
+
+        self.set_embeddings(encoder_emb_spec, encoder.embed_tokens)
+        encoder_emb_spec.multiply_by_sqrt_depth = encoder_config.hidden_size**0.5
+        self.set_layer_norm(spec.layer_norm, encoder.norm)
+
+        module = encoder
+        for i, (layer_spec, layer) in enumerate(zip(spec.layer, module.layers)):
+            self.set_layer_norm(
+                layer_spec.input_layer_norm, layer.pre_self_attn_layernorm
+            )
+            self.set_layer_norm(
+                layer_spec.post_attention_layer_norm, layer.post_self_attn_layernorm
+            )
+
+            # T5GemmaSelfAttention
+            qkv_split_layers = [common_spec.LinearSpec() for _ in range(3)]
+            self.set_linear(
+                qkv_split_layers[0], layer.self_attn.q_proj, quant_type=quant_type
+            )
+            self.set_linear(
+                qkv_split_layers[1], layer.self_attn.k_proj, quant_type=quant_type
+            )
+            self.set_linear(
+                qkv_split_layers[2], layer.self_attn.v_proj, quant_type=quant_type
+            )
+            utils.fuse_linear(layer_spec.self_attention.linear[0], qkv_split_layers)
+            self.set_linear(
+                layer_spec.self_attention.linear[1],
+                layer.self_attn.o_proj,
+                quant_type=quant_type,
+            )
+
+            # T5GemmaRMSNorm
+            self.set_layer_norm(
+                layer_spec.pre_feedforward_layer_norm, layer.pre_feedforward_layernorm
+            )
+            # T5GemmaRMSNorm
+            self.set_layer_norm(
+                layer_spec.post_feedforward_layer_norm, layer.post_feedforward_layernorm
+            )
+
+            # T5GemmaMLP
+            self.set_linear(
+                layer_spec.ffn.linear_0, layer.mlp.gate_proj, quant_type=quant_type
+            )
+            self.set_linear(
+                layer_spec.ffn.linear_0_noact, layer.mlp.up_proj, quant_type=quant_type
+            )
+            self.set_linear(
+                layer_spec.ffn.linear_1, layer.mlp.down_proj, quant_type=quant_type
+            )
+
+            # Clean up
+            delattr(layer, "self_attn")
+            delattr(layer, "mlp")
+            gc.collect()
+
+    def set_decoder(
+        self, spec, module, decoder_config, quant_type=common_spec.Quantization.CT2
+    ):
+        spec.scale_embeddings = True
+        spec.start_from_zero_embedding = False
+
+        self.set_embeddings(spec.embeddings, module.embed_tokens)
+        spec.embeddings.multiply_by_sqrt_depth = decoder_config.hidden_size**0.5
+        self.set_layer_norm(spec.layer_norm, module.norm)
+
+        for i, (layer_spec, layer) in enumerate(zip(spec.layer, module.layers)):
+            # Self-attention block
+            self.set_layer_norm(
+                layer_spec.input_layer_norm, layer.pre_self_attn_layernorm
+            )
+            self.set_layer_norm(
+                layer_spec.post_attention_layer_norm, layer.post_self_attn_layernorm
+            )
+
+            # T5GemmaSelfAttention - QKV projections
+            qkv_split_layers = [common_spec.LinearSpec() for _ in range(3)]
+            self.set_linear(
+                qkv_split_layers[0], layer.self_attn.q_proj, quant_type=quant_type
+            )
+            self.set_linear(
+                qkv_split_layers[1], layer.self_attn.k_proj, quant_type=quant_type
+            )
+            self.set_linear(
+                qkv_split_layers[2], layer.self_attn.v_proj, quant_type=quant_type
+            )
+            utils.fuse_linear(layer_spec.self_attention.linear[0], qkv_split_layers)
+            self.set_linear(
+                layer_spec.self_attention.linear[1],
+                layer.self_attn.o_proj,
+                quant_type=quant_type,
+            )
+
+            # Pre and post cross-attention layer norm
+            self.set_layer_norm(
+                layer_spec.external_pre_encoder_attention_layer_norm,
+                layer.pre_cross_attn_layernorm,
+            )
+
+            self.set_layer_norm(
+                layer_spec.external_post_encoder_attention_layer_norm,
+                layer.post_cross_attn_layernorm,
+            )
+
+            # Cross-attention Q projection
+            self.set_linear(
+                layer_spec.attention.linear[0],
+                layer.cross_attn.q_proj,
+                quant_type=quant_type,
+            )
+
+            # Cross-attention K+V fused
+            kv_split_layers = [common_spec.LinearSpec() for _ in range(2)]
+            self.set_linear(
+                kv_split_layers[0],
+                layer.cross_attn.k_proj,
+                quant_type=quant_type,
+            )
+            self.set_linear(
+                kv_split_layers[1],
+                layer.cross_attn.v_proj,
+                quant_type=quant_type,
+            )
+            utils.fuse_linear(layer_spec.attention.linear[1], kv_split_layers)
+
+            # Cross-attention output projection
+            self.set_linear(
+                layer_spec.attention.linear[2],
+                layer.cross_attn.o_proj,
+                quant_type=quant_type,
+            )
+
+            # Feed-forward block
+            self.set_layer_norm(
+                layer_spec.pre_feedforward_layer_norm, layer.pre_feedforward_layernorm
+            )
+            self.set_layer_norm(
+                layer_spec.post_feedforward_layer_norm, layer.post_feedforward_layernorm
+            )
+
+            # T5GemmaMLP
+            self.set_linear(
+                layer_spec.ffn.linear_0, layer.mlp.gate_proj, quant_type=quant_type
+            )
+            self.set_linear(
+                layer_spec.ffn.linear_0_noact, layer.mlp.up_proj, quant_type=quant_type
+            )
+            self.set_linear(
+                layer_spec.ffn.linear_1, layer.mlp.down_proj, quant_type=quant_type
+            )
+
+            # Clean up
+            delattr(layer, "self_attn")
+            delattr(layer, "cross_attn")
+            delattr(layer, "mlp")
+            gc.collect()
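
One detail worth noting in the loader above: set_layer_norm exports spec.gamma = layer_norm.weight.data + 1.0 because Gemma-family RMSNorm stores its scale as (gamma - 1) and applies norm * (1 + weight); adding 1 converts it to the plain gamma that CTranslate2 applies. A small equivalence check, assuming the Hugging Face Gemma formulation:

    import torch


    def gemma_rmsnorm(x, weight, eps=1e-6):
        norm = x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + eps)
        return norm * (1.0 + weight)  # HF Gemma applies (1 + weight)


    def plain_rmsnorm(x, gamma, eps=1e-6):
        norm = x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + eps)
        return norm * gamma  # CTranslate2-style gamma


    x = torch.randn(2, 4)
    weight = torch.randn(4)
    assert torch.allclose(gemma_rmsnorm(x, weight), plain_rmsnorm(x, weight + 1.0))
    print("gamma = weight + 1 reproduces Gemma RMSNorm")
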
Binary file
ctranslate2/cudnn64_9.dll CHANGED
Binary file
ctranslate2/extensions.py CHANGED
@@ -556,12 +556,28 @@ def _process_iterable(process_func, iterables, max_batch_size, batch_type, **kwa

 def _batch_iterator(iterable, batch_size, batch_type):
     streams = None
-    cur_batch_size = 0
+    max_length = 0

     for example in iterable:
         if not isinstance(example, tuple):
             example = (example,)

+        if batch_type == "examples":
+            if streams and len(streams[0]) == batch_size:
+                yield streams
+                streams = None
+
+        elif batch_type == "tokens":
+            max_length = max(max_length, len(example[0]))
+
+            if streams and (len(streams[0]) + 1) * max_length > batch_size:
+                yield streams
+                streams = None
+                max_length = len(example[0])
+
+        else:
+            raise ValueError("Invalid batch type %s" % batch_type)
+
         if streams is None:
             streams = tuple([] for _ in example)
         for batch, element in zip(streams, example):
@@ -569,17 +585,5 @@ def _batch_iterator(iterable, batch_size, batch_type):
                 raise ValueError("Input iterables do not have the same length")
             batch.append(element)

-        if batch_type == "examples":
-            cur_batch_size += 1
-        elif batch_type == "tokens":
-            cur_batch_size += len(example[0])
-        else:
-            raise ValueError("Invalid batch type %s" % batch_type)
-
-        if cur_batch_size >= batch_size:
-            yield streams
-            streams = None
-            cur_batch_size = 0
-
     if streams is not None:
         yield streams
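
The rework changes what "tokens" batching bounds: instead of summing raw token counts after appending, the iterator now yields before an example would push the padded batch size, (examples + 1) * longest-length-so-far, past batch_size, which better matches the memory cost of a padded batch. A standalone sketch of the new policy for a single stream:

    def batch_by_tokens(examples, batch_size):
        batch, max_length = [], 0
        for example in examples:
            max_length = max(max_length, len(example))
            if batch and (len(batch) + 1) * max_length > batch_size:
                yield batch
                batch, max_length = [], len(example)
            batch.append(example)
        if batch:
            yield batch


    sentences = [["tok"] * n for n in (3, 4, 2, 9, 1)]
    for batch in batch_by_tokens(sentences, batch_size=12):
        print([len(s) for s in batch])  # [3, 4, 2] then [9] then [1]
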
ctranslate2/specs/attention_spec.py CHANGED
@@ -34,10 +34,12 @@ class MultiHeadAttentionSpec(model_spec.LayerSpec):
         sliding_window=None,
         qk_norm=False,
         qk_norm_rms=True,
+        has_norm=True,
     ):
         self.queries_scale = model_spec.OPTIONAL

-        self.layer_norm = common_spec.LayerNormSpec(rms_norm=rms_norm)
+        if has_norm:
+            self.layer_norm = common_spec.LayerNormSpec(rms_norm=rms_norm)
         self.linear = [
             common_spec.LinearSpec() for _ in range(2 if self_attention else 3)
         ]
ctranslate2/specs/transformer_spec.py CHANGED
@@ -23,6 +23,16 @@ class TransformerEncoderSpec(model_spec.LayerSpec):
         ffn_glu: bool = False,
         rms_norm: bool = False,
         multi_query_attention: bool = False,
+        num_heads_kv: Optional[int] = None,
+        head_dim: Optional[int] = None,
+        rotary_dim: Optional[int] = None,
+        rotary_interleave: bool = True,
+        rotary_scaling_type: Optional[attention_spec.RotaryScalingType] = None,
+        rotary_scaling_factor: float = 1,
+        rotary_base: float = 10000,
+        sliding_window: Optional[int] = None,
+        qk_norm: Optional[bool] = False,
+        pre_post_layer_norm: bool = False,
     ):
         """Initializes a Transformer encoder specification.

@@ -43,8 +53,28 @@ class TransformerEncoderSpec(model_spec.LayerSpec):
           ffn_glu: Use gated linear units in the FFN layers as described in
             https://arxiv.org/abs/2002.05202.
          rms_norm: Use the root mean square layer normalization.
-          multi_query_attention: Use multi-query attention.
+          multi_query_attention: Use multi-query attention (alias for num_heads_kv=1).
+          num_heads_kv: Number of attention heads for the key and value.
+          head_dim: Number of dimensions per attention head.
+          rotary_dim: Apply rotary embeddings to these first N dimensions. If 0, rotary
+            embeddings are applied to all dimensions.
+          rotary_interleave: Interleave the head dimensions when rotary embeddings are
+            applied. Otherwise the head dimensions are sliced in half.
+          rotary_scaling_type: Type of RoPE scaling.
+          rotary_scaling_factor: Factor used in the RoPE scaling.
+          rotary_base: The base period of the rotary embeddings.
+          sliding_window: Max sequence length to retain in KV Cache.
+          qk_norm: Apply layer normalization to the query and key projections.
+          pre_post_layer_norm: Add post layer norm for each pre norm layer.
        """
+
+        if multi_query_attention:
+            if num_heads_kv is not None and num_heads_kv != 1:
+                raise ValueError(
+                    "Enabling multi_query_attention implies num_heads_kv=1"
+                )
+            num_heads_kv = 1
+
        self.multi_query_attention = multi_query_attention
        self.num_heads = np.dtype("int16").type(num_heads)
        self.pre_norm = pre_norm
@@ -60,13 +90,24 @@ class TransformerEncoderSpec(model_spec.LayerSpec):
         self.layer_norm = common_spec.LayerNormSpec(rms_norm=rms_norm)
         if layernorm_embedding:
             self.layernorm_embedding = common_spec.LayerNormSpec(rms_norm=rms_norm)
+        if sliding_window is not None:
+            self.sliding_window = np.dtype("int32").type(sliding_window)
+
         self.layer = [
             TransformerEncoderLayerSpec(
                 relative_position=relative_position,
                 relative_attention_bias=relative_attention_bias,
                 ffn_glu=ffn_glu,
                 rms_norm=rms_norm,
-                num_heads_kv=1 if multi_query_attention else None,
+                num_heads_kv=num_heads_kv,
+                head_dim=head_dim,
+                rotary_dim=rotary_dim,
+                rotary_interleave=rotary_interleave,
+                rotary_scaling_type=rotary_scaling_type,
+                rotary_scaling_factor=rotary_scaling_factor,
+                rotary_base=rotary_base,
+                qk_norm=qk_norm,
+                pre_post_layer_norm=pre_post_layer_norm,
             )
             for _ in range(num_layers)
         ]
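
TransformerEncoderSpec now takes num_heads_kv directly, keeping multi_query_attention as an alias for num_heads_kv=1; values strictly between 1 and num_heads give grouped-query attention, where each group of query heads shares one key/value head. A toy sketch of the sharing:

    import torch

    num_heads, num_heads_kv, head_dim, seq = 8, 2, 4, 3
    q = torch.randn(num_heads, seq, head_dim)
    kv = torch.randn(num_heads_kv, seq, head_dim)

    # Each group of num_heads // num_heads_kv query heads reuses one KV head.
    kv_expanded = kv.repeat_interleave(num_heads // num_heads_kv, dim=0)
    attn = torch.softmax(q @ kv_expanded.transpose(-1, -2) / head_dim**0.5, dim=-1)
    print(attn.shape)  # torch.Size([8, 3, 3])
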
@@ -109,7 +150,8 @@ class TransformerDecoderSpec(model_spec.LayerSpec):
         quant_type: Optional[common_spec.Quantization] = None,
         quant_group_size: Optional[int] = None,
         quant_bits: Optional[int] = None,
-        qk_norm: Optional[bool] = False,
+        qk_norm: bool = False,
+        external_pre_post_encoder_layers: Optional[bool] = False,
     ):
         """Initializes a Transformer decoder specification.

@@ -156,6 +198,8 @@ class TransformerDecoderSpec(model_spec.LayerSpec):
             quant_type: quantization type used (like awq... for lower bit quantization)
             quant_group_size: group size of the lower bit quantization
             quant_bits: number of bit of the quantization (ex: 4bit)
+            external_pre_post_encoder_layers: if the encoder attention pre and post
+                processing is done outside the attention.
         """

         self._config = dict()
@@ -172,12 +216,6 @@ class TransformerDecoderSpec(model_spec.LayerSpec):
             )
             num_heads_kv = 1

-        if with_encoder_attention and num_heads_kv not in (None, 1, num_heads):
-            raise ValueError(
-                "num_heads_kv=%d is not supported in the cross-attention layers"
-                % num_heads_kv
-            )
-
         self.num_heads = np.dtype("int16").type(num_heads)
         self.pre_norm = pre_norm
         self.activation = np.dtype("int8").type(activation)
@@ -224,6 +262,7 @@ class TransformerDecoderSpec(model_spec.LayerSpec):
                 head_dim=head_dim,
                 sliding_window=sliding_window,
                 qk_norm=qk_norm,
+                external_pre_post_encoder_layers=external_pre_post_encoder_layers,
             )
             for _ in range(num_layers)
         ]
@@ -254,7 +293,15 @@ class TransformerEncoderLayerSpec(model_spec.LayerSpec):
         ffn_glu=False,
         rms_norm=False,
         num_heads_kv=None,
+        head_dim=None,
         sliding_window=None,
+        rotary_dim: Optional[int] = None,
+        rotary_interleave: bool = True,
+        rotary_scaling_type: Optional[attention_spec.RotaryScalingType] = None,
+        rotary_scaling_factor: float = 1,
+        rotary_base: float = 10000,
+        qk_norm=False,
+        pre_post_layer_norm: bool = False,
     ):
         self.self_attention = attention_spec.MultiHeadAttentionSpec(
             self_attention=True,
@@ -262,10 +309,32 @@ class TransformerEncoderLayerSpec(model_spec.LayerSpec):
             relative_attention_bias=relative_attention_bias,
             rms_norm=rms_norm,
             num_heads_kv=num_heads_kv,
+            head_dim=head_dim,
             sliding_window=sliding_window,
+            rotary_dim=rotary_dim,
+            rotary_interleave=rotary_interleave,
+            rotary_scaling_type=rotary_scaling_type,
+            rotary_scaling_factor=rotary_scaling_factor,
+            rotary_base=rotary_base,
+            qk_norm=qk_norm,
         )
         self.ffn = FeedForwardSpec(glu=ffn_glu, rms_norm=rms_norm)

+        if pre_post_layer_norm:
+            self.input_layer_norm = common_spec.LayerNormSpec(rms_norm=rms_norm)
+            self.post_attention_layer_norm = common_spec.LayerNormSpec(
+                rms_norm=rms_norm
+            )
+            self.pre_feedforward_layer_norm = common_spec.LayerNormSpec(
+                rms_norm=rms_norm
+            )
+            self.post_feedforward_layer_norm = common_spec.LayerNormSpec(
+                rms_norm=rms_norm
+            )
+
+            delattr(self.self_attention, "layer_norm")
+            delattr(self.ffn, "layer_norm")
+

 class TransformerDecoderLayerSpec(model_spec.LayerSpec):
     def __init__(
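
pre_post_layer_norm builds the Gemma-style "sandwich" arrangement introduced above: each sublayer gets its own pre- and post-norm owned by the layer, and the norms that normally live inside the attention/FFN specs are deleted. A dataflow sketch, assuming Gemma-2-style placement of the residual connections:

    def encoder_layer(x, attn, ffn, norms):
        """norms = (input, post_attention, pre_feedforward, post_feedforward)."""
        h = x + norms[1](attn(norms[0](x)))
        return h + norms[3](ffn(norms[2](h)))


    identity = lambda v: v
    print(encoder_layer(1.0, identity, identity, (identity,) * 4))  # 4.0
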
@@ -289,6 +358,7 @@ class TransformerDecoderLayerSpec(model_spec.LayerSpec):
         head_dim=None,
         sliding_window=None,
         qk_norm=False,
+        external_pre_post_encoder_layers=False,
     ):
         self.self_attention = attention_spec.MultiHeadAttentionSpec(
             self_attention=True,
@@ -312,8 +382,10 @@ class TransformerDecoderLayerSpec(model_spec.LayerSpec):
             self.attention = attention_spec.MultiHeadAttentionSpec(
                 rms_norm=rms_norm,
                 num_heads_kv=num_heads_kv,
+                head_dim=head_dim,
                 sliding_window=sliding_window,
                 qk_norm=qk_norm,
+                has_norm=external_pre_post_encoder_layers is False,
             )

         self.ffn = FeedForwardSpec(glu=ffn_glu, rms_norm=rms_norm)
@@ -329,10 +401,21 @@ class TransformerDecoderLayerSpec(model_spec.LayerSpec):
             delattr(self.ffn, "layer_norm")

         if pre_post_layer_norm:
+            # Self-attention layer norms
             self.input_layer_norm = common_spec.LayerNormSpec(rms_norm=rms_norm)
             self.post_attention_layer_norm = common_spec.LayerNormSpec(
                 rms_norm=rms_norm
             )
+
+            if with_encoder_attention and external_pre_post_encoder_layers:
+                self.external_post_encoder_attention_layer_norm = (
+                    common_spec.LayerNormSpec(rms_norm=rms_norm)
+                )
+                self.external_pre_encoder_attention_layer_norm = (
+                    common_spec.LayerNormSpec(rms_norm=rms_norm)
+                )
+
+            # Feed-forward layer norms
             self.pre_feedforward_layer_norm = common_spec.LayerNormSpec(
                 rms_norm=rms_norm
             )
@@ -562,7 +645,7 @@ class TransformerDecoderModelSpec(model_spec.LanguageModelSpec):
         quant_type: Optional[common_spec.Quantization] = None,
         quant_group_size: Optional[int] = None,
         quant_bits: Optional[int] = None,
-        qk_norm: Optional[bool] = False,
+        qk_norm: bool = False,
     ):
         """Creates a Transformer decoder model specification.

ctranslate2/version.py CHANGED
@@ -1,3 +1,3 @@
 """Version information."""

-__version__ = "4.6.2"
+__version__ = "4.6.3"
ctranslate2-4.6.3.dist-info/METADATA CHANGED
@@ -1,9 +1,10 @@
 Metadata-Version: 2.4
 Name: ctranslate2
-Version: 4.6.2
+Version: 4.6.3
 Summary: Fast inference engine for Transformer models
 Home-page: https://opennmt.net
 Author: OpenNMT
+License: MIT
 Project-URL: Documentation, https://opennmt.net/CTranslate2
 Project-URL: Forum, https://forum.opennmt.net
 Project-URL: Gitter, https://gitter.im/OpenNMT/CTranslate2
@@ -13,7 +14,6 @@ Classifier: Development Status :: 5 - Production/Stable
 Classifier: Environment :: GPU :: NVIDIA CUDA :: 12 :: 12.4
 Classifier: Intended Audience :: Developers
 Classifier: Intended Audience :: Science/Research
-Classifier: License :: OSI Approved :: MIT License
 Classifier: Programming Language :: Python :: 3
 Classifier: Programming Language :: Python :: 3 :: Only
 Classifier: Programming Language :: Python :: 3.9
@@ -34,6 +34,7 @@ Dynamic: description
 Dynamic: description-content-type
 Dynamic: home-page
 Dynamic: keywords
+Dynamic: license
 Dynamic: project-url
 Dynamic: requires-dist
 Dynamic: requires-python
@@ -49,7 +50,7 @@ The project implements a custom runtime that applies many performance optimizati

 The following model types are currently supported:

-* Encoder-decoder models: Transformer base/big, M2M-100, NLLB, BART, mBART, Pegasus, T5, Whisper
+* Encoder-decoder models: Transformer base/big, M2M-100, NLLB, BART, mBART, Pegasus, T5, Whisper, T5Gemma
 * Decoder-only models: GPT-2, GPT-J, GPT-NeoX, OPT, BLOOM, MPT, Llama, Mistral, Gemma, CodeGen, GPTBigCode, Falcon, Qwen2
 * Encoder-only models: BERT, DistilBERT, XLM-RoBERTa

@@ -160,6 +161,16 @@ Executed with 4 threads on a [*c5.2xlarge*](https://aws.amazon.com/ec2/instance-

 Executed with CUDA 11 on a [*g5.xlarge*](https://aws.amazon.com/ec2/instance-types/g5/) Amazon EC2 instance equipped with a NVIDIA A10G GPU (driver version: 510.47.03).

+## Contributing
+
+CTranslate2 is a community-driven project. We welcome contributions of all kinds:
+* **New Model Support:** Help us implement more Transformer architectures.
+* **Performance:** Propose optimizations for CPU or GPU kernels.
+* **Bug Reports:** Open an issue if you find something not working as expected.
+* **Documentation:** Improve our guides or add new examples.
+
+Check out our [Contributing Guide](CONTRIBUTING.md) to learn how to set up your development environment.
+
 ## Additional resources

 * [Documentation](https://opennmt.net/CTranslate2)
ctranslate2-4.6.3.dist-info/RECORD CHANGED
@@ -1,33 +1,33 @@
 ctranslate2/__init__.py,sha256=CGqShDaFxQ-u-aCtVq99T4HKuBdMB8b49l2KSxnQb8M,1735
-ctranslate2/_ext.cp314t-win_amd64.pyd,sha256=ryUxQaSEW8-dMZYWQsHYQXoi-2dmaTbqkSnFIeC3z0I,774656
-ctranslate2/ctranslate2.dll,sha256=9zIz4dY3yV1kTTKaipyQwjcGDwzZ3OzKiOkNpXdcQ1U,58389504
-ctranslate2/cudnn64_9.dll,sha256=wHzEfy-kpWZZPHr0qn5X7fCamFoP3dFMuNb0VuJSrwU,438840
-ctranslate2/extensions.py,sha256=axO2FI8ddiFmlko2AzQ6VcdtF-3hDA7VmPGnTIkrPkI,21782
+ctranslate2/_ext.cp314t-win_amd64.pyd,sha256=SJzyI2umi8fNXsvbshO65rCof-2B7QCfPf0t44_NEGY,775168
+ctranslate2/ctranslate2.dll,sha256=9TRGHvoyNSzXe9eEc3jKJa5-21-AeSENgp3DXvhCZ4M,58590720
+ctranslate2/cudnn64_9.dll,sha256=ntvN_3OwrwcOsWCyzmbln-ygSqAXNR2O7cxejhSZZ9I,266288
+ctranslate2/extensions.py,sha256=kDNt0H9KvfNCc3PrRGzfkj9Fkvna84i2O5Y-rav6UkU,21940
 ctranslate2/libiomp5md.dll,sha256=mCIzNmsK_NoeD1WgsTQJfjW3eWE_VN22nmhebNBrdV8,1614192
 ctranslate2/logging.py,sha256=P9evHdxuMx_iHvwJjEASEq-j5062H64Pl5-fJjxEuHk,1221
-ctranslate2/version.py,sha256=f2Hk9NHTYgXftujV8JVkeOzenykZ9QzbsZ-nIt9U1uc,53
+ctranslate2/version.py,sha256=TboXlbA67GNmSOm1v2u_U8AKgYh5iminMMLTvi3Xho4,53
 ctranslate2/converters/__init__.py,sha256=ufYjcXf2sK4fiXAUU6tIJyWmNuLjKFf_KH3GWLXe4ls,507
 ctranslate2/converters/converter.py,sha256=Qkb8NGLLmgqMT6HZkFq61zwbxyq3NlWcaxLZ6Ap-YOQ,3601
 ctranslate2/converters/eole_ct2.py,sha256=RUcDJH_2AUt0jDs5oAqccE6tQPbO9LQ6JmVriC1DTy8,12564
-ctranslate2/converters/fairseq.py,sha256=uQpd-ftYSO4c6WdEwCUyuZWhzWX1UTG7dGOC6EtcDVE,12765
+ctranslate2/converters/fairseq.py,sha256=2vlBk4AVCHwXxKkwPHVmcjyfo1dAV0_DJS1i6q-44NE,12822
 ctranslate2/converters/marian.py,sha256=1_7P3EbIDPOdyJbtb_Lp-LCBPBb9A8E9OhzoyFwTb64,11274
 ctranslate2/converters/openai_gpt2.py,sha256=1rXKM2ZURZHWRv4XZ135fPkVWpM4rTG-q7VR7OD6d-A,3304
-ctranslate2/converters/opennmt_py.py,sha256=Vva60az6tGqlQXs0UgC09r_fCD3u2u6wUJB-8V4OUFQ,13183
+ctranslate2/converters/opennmt_py.py,sha256=zex4TbHiiJMy0tkqQg39oNjxmSZKf8dnRLH3iQ1H4z0,13227
 ctranslate2/converters/opennmt_tf.py,sha256=uBRp2wz5xriSQcA_c0S0ekY7ws6RpRX_0EKeMRdM7-s,16222
 ctranslate2/converters/opus_mt.py,sha256=5KbPaTiBhhorPzMpTugIfIJ8SgcqHfJUbJrWKBN-Djs,1254
-ctranslate2/converters/transformers.py,sha256=zwqUFFFwLpam6z5lpBz2rgfYj065CbsdT9S_xVqPjCk,126110
+ctranslate2/converters/transformers.py,sha256=VRal3vKSQrAOvcNPwewjVMtgvWskz0KD5bdIrpNrZNA,142380
 ctranslate2/converters/utils.py,sha256=w7NG39lx-9dOdL57OqKVTdC__opkuP8RACg1TLlUJwM,3817
 ctranslate2/models/__init__.py,sha256=53p98uemtuvVPz8xK7_LbOhBiUJJu-c-NdmOHJgdXus,497
 ctranslate2/specs/__init__.py,sha256=9GabtSyczznYqiqUS6XvULi8pQ3_3RNRogXobGP0G80,653
-ctranslate2/specs/attention_spec.py,sha256=0JhCBrbb20G07UFnUAYIUtfcqn4VtflJHYWGIunwKDw,3442
+ctranslate2/specs/attention_spec.py,sha256=FnaSiQREWQw_cURgsCb9_aIpGOCxyVGTCpIOdd-08v8,3492
 ctranslate2/specs/common_spec.py,sha256=freTDhQMy5PYofBrij4_FDgrKokMYApWSPIpASZIlJc,1608
 ctranslate2/specs/model_spec.py,sha256=atCAYzDEIzyJ1TCayFGZVutHqSWa1ww-vbZ0OiIJqh8,25736
-ctranslate2/specs/transformer_spec.py,sha256=43jOIvCSbAvqZJ1IyvRdGUa4f-zhdKhQBOXvp0T8YLE,30360
+ctranslate2/specs/transformer_spec.py,sha256=-GJ0oSjI3ns-Ei_-xXIM_P2GaZxt5Z-g03zJ0m_4ciU,34317
 ctranslate2/specs/wav2vec2_spec.py,sha256=NITsuOuf2F5bU1-aXit8-WEtWV9fH2Eq7A7857UyYho,2106
 ctranslate2/specs/wav2vec2bert_spec.py,sha256=UgtsJWC9mMgJ7bn4T_xg1uXK0rqA4-9tT2KMGVgPKnw,3529
 ctranslate2/specs/whisper_spec.py,sha256=_vm1sc5yOowOJ4iyvcxMXrgt-UcLJrZT8OtPscUXcQQ,2447
-ctranslate2-4.6.2.dist-info/METADATA,sha256=r5HnmZE0BMI60j3N0GmDdM6l7Q7KW3w5nLLOX_AKCRY,10354
-ctranslate2-4.6.2.dist-info/WHEEL,sha256=IxxXYqBIlKEzFy9ulBJ928Gdqg6XQ3DHti4avqq3myk,102
-ctranslate2-4.6.2.dist-info/entry_points.txt,sha256=ZHkojut_TmVRHl0bJIGm2b9wqr98GAJqxN9rlJtQshs,466
-ctranslate2-4.6.2.dist-info/top_level.txt,sha256=1hUaWzcFIuSo2BAIUHFA3Osgsu6S1giq0y6Rosv8HOQ,12
-ctranslate2-4.6.2.dist-info/RECORD,,
+ctranslate2-4.6.3.dist-info/METADATA,sha256=awoc6t4JSxpv51lmfAG28ZG91FhGQ8DHspyLzLqLo_Q,10839
+ctranslate2-4.6.3.dist-info/WHEEL,sha256=IxxXYqBIlKEzFy9ulBJ928Gdqg6XQ3DHti4avqq3myk,102
+ctranslate2-4.6.3.dist-info/entry_points.txt,sha256=ZHkojut_TmVRHl0bJIGm2b9wqr98GAJqxN9rlJtQshs,466
+ctranslate2-4.6.3.dist-info/top_level.txt,sha256=1hUaWzcFIuSo2BAIUHFA3Osgsu6S1giq0y6Rosv8HOQ,12
+ctranslate2-4.6.3.dist-info/RECORD,,