InvokeAI 6.10.0rc1__py3-none-any.whl → 6.11.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (83)
  1. invokeai/app/api/routers/model_manager.py +43 -1
  2. invokeai/app/invocations/fields.py +1 -1
  3. invokeai/app/invocations/flux2_denoise.py +499 -0
  4. invokeai/app/invocations/flux2_klein_model_loader.py +222 -0
  5. invokeai/app/invocations/flux2_klein_text_encoder.py +222 -0
  6. invokeai/app/invocations/flux2_vae_decode.py +106 -0
  7. invokeai/app/invocations/flux2_vae_encode.py +88 -0
  8. invokeai/app/invocations/flux_denoise.py +77 -3
  9. invokeai/app/invocations/flux_lora_loader.py +1 -1
  10. invokeai/app/invocations/flux_model_loader.py +2 -5
  11. invokeai/app/invocations/ideal_size.py +6 -1
  12. invokeai/app/invocations/metadata.py +4 -0
  13. invokeai/app/invocations/metadata_linked.py +47 -0
  14. invokeai/app/invocations/model.py +1 -0
  15. invokeai/app/invocations/pbr_maps.py +59 -0
  16. invokeai/app/invocations/z_image_denoise.py +244 -84
  17. invokeai/app/invocations/z_image_image_to_latents.py +9 -1
  18. invokeai/app/invocations/z_image_latents_to_image.py +9 -1
  19. invokeai/app/invocations/z_image_seed_variance_enhancer.py +110 -0
  20. invokeai/app/services/config/config_default.py +3 -1
  21. invokeai/app/services/invocation_stats/invocation_stats_common.py +6 -6
  22. invokeai/app/services/invocation_stats/invocation_stats_default.py +9 -4
  23. invokeai/app/services/model_manager/model_manager_default.py +7 -0
  24. invokeai/app/services/model_records/model_records_base.py +4 -2
  25. invokeai/app/services/shared/invocation_context.py +15 -0
  26. invokeai/app/services/shared/sqlite/sqlite_util.py +2 -0
  27. invokeai/app/services/shared/sqlite_migrator/migrations/migration_25.py +61 -0
  28. invokeai/app/util/step_callback.py +58 -2
  29. invokeai/backend/flux/denoise.py +338 -118
  30. invokeai/backend/flux/dype/__init__.py +31 -0
  31. invokeai/backend/flux/dype/base.py +260 -0
  32. invokeai/backend/flux/dype/embed.py +116 -0
  33. invokeai/backend/flux/dype/presets.py +148 -0
  34. invokeai/backend/flux/dype/rope.py +110 -0
  35. invokeai/backend/flux/extensions/dype_extension.py +91 -0
  36. invokeai/backend/flux/schedulers.py +62 -0
  37. invokeai/backend/flux/util.py +35 -1
  38. invokeai/backend/flux2/__init__.py +4 -0
  39. invokeai/backend/flux2/denoise.py +280 -0
  40. invokeai/backend/flux2/ref_image_extension.py +294 -0
  41. invokeai/backend/flux2/sampling_utils.py +209 -0
  42. invokeai/backend/image_util/pbr_maps/architecture/block.py +367 -0
  43. invokeai/backend/image_util/pbr_maps/architecture/pbr_rrdb_net.py +70 -0
  44. invokeai/backend/image_util/pbr_maps/pbr_maps.py +141 -0
  45. invokeai/backend/image_util/pbr_maps/utils/image_ops.py +93 -0
  46. invokeai/backend/model_manager/configs/factory.py +19 -1
  47. invokeai/backend/model_manager/configs/lora.py +36 -0
  48. invokeai/backend/model_manager/configs/main.py +395 -3
  49. invokeai/backend/model_manager/configs/qwen3_encoder.py +116 -7
  50. invokeai/backend/model_manager/configs/vae.py +104 -2
  51. invokeai/backend/model_manager/load/model_cache/model_cache.py +107 -2
  52. invokeai/backend/model_manager/load/model_loaders/cogview4.py +2 -1
  53. invokeai/backend/model_manager/load/model_loaders/flux.py +1020 -8
  54. invokeai/backend/model_manager/load/model_loaders/generic_diffusers.py +4 -2
  55. invokeai/backend/model_manager/load/model_loaders/onnx.py +1 -0
  56. invokeai/backend/model_manager/load/model_loaders/stable_diffusion.py +2 -1
  57. invokeai/backend/model_manager/load/model_loaders/z_image.py +158 -31
  58. invokeai/backend/model_manager/starter_models.py +141 -4
  59. invokeai/backend/model_manager/taxonomy.py +31 -4
  60. invokeai/backend/model_manager/util/select_hf_files.py +3 -2
  61. invokeai/backend/patches/lora_conversions/z_image_lora_conversion_utils.py +39 -5
  62. invokeai/backend/quantization/gguf/ggml_tensor.py +15 -4
  63. invokeai/backend/util/vae_working_memory.py +0 -2
  64. invokeai/backend/z_image/extensions/regional_prompting_extension.py +10 -12
  65. invokeai/frontend/web/dist/assets/App-D13dX7be.js +161 -0
  66. invokeai/frontend/web/dist/assets/{browser-ponyfill-DHZxq1nk.js → browser-ponyfill-u_ZjhQTI.js} +1 -1
  67. invokeai/frontend/web/dist/assets/index-BB0nHmDe.js +530 -0
  68. invokeai/frontend/web/dist/index.html +1 -1
  69. invokeai/frontend/web/dist/locales/en-GB.json +1 -0
  70. invokeai/frontend/web/dist/locales/en.json +85 -6
  71. invokeai/frontend/web/dist/locales/it.json +135 -15
  72. invokeai/frontend/web/dist/locales/ru.json +11 -11
  73. invokeai/version/invokeai_version.py +1 -1
  74. {invokeai-6.10.0rc1.dist-info → invokeai-6.11.0.dist-info}/METADATA +8 -2
  75. {invokeai-6.10.0rc1.dist-info → invokeai-6.11.0.dist-info}/RECORD +81 -57
  76. {invokeai-6.10.0rc1.dist-info → invokeai-6.11.0.dist-info}/WHEEL +1 -1
  77. invokeai/frontend/web/dist/assets/App-CYhlZO3Q.js +0 -161
  78. invokeai/frontend/web/dist/assets/index-dgSJAY--.js +0 -530
  79. {invokeai-6.10.0rc1.dist-info → invokeai-6.11.0.dist-info}/entry_points.txt +0 -0
  80. {invokeai-6.10.0rc1.dist-info → invokeai-6.11.0.dist-info}/licenses/LICENSE +0 -0
  81. {invokeai-6.10.0rc1.dist-info → invokeai-6.11.0.dist-info}/licenses/LICENSE-SD1+SD2.txt +0 -0
  82. {invokeai-6.10.0rc1.dist-info → invokeai-6.11.0.dist-info}/licenses/LICENSE-SDXL.txt +0 -0
  83. {invokeai-6.10.0rc1.dist-info → invokeai-6.11.0.dist-info}/top_level.txt +0 -0
@@ -37,12 +37,14 @@ class GenericDiffusersLoader(ModelLoader):
         repo_variant = config.repo_variant if isinstance(config, Diffusers_Config_Base) else None
         variant = repo_variant.value if repo_variant else None
         try:
-            result: AnyModel = model_class.from_pretrained(model_path, torch_dtype=self._torch_dtype, variant=variant)
+            result: AnyModel = model_class.from_pretrained(
+                model_path, torch_dtype=self._torch_dtype, variant=variant, local_files_only=True
+            )
         except OSError as e:
             if variant and "no file named" in str(
                 e
             ):  # try without the variant, just in case user's preferences changed
-                result = model_class.from_pretrained(model_path, torch_dtype=self._torch_dtype)
+                result = model_class.from_pretrained(model_path, torch_dtype=self._torch_dtype, local_files_only=True)
             else:
                 raise e
         return result
@@ -38,5 +38,6 @@ class OnnyxDiffusersModel(GenericDiffusersLoader):
             model_path,
             torch_dtype=self._torch_dtype,
             variant=variant,
+            local_files_only=True,
         )
         return result
@@ -80,12 +80,13 @@ class StableDiffusionDiffusersModel(GenericDiffusersLoader):
                 model_path,
                 torch_dtype=self._torch_dtype,
                 variant=variant,
+                local_files_only=True,
             )
         except OSError as e:
             if variant and "no file named" in str(
                 e
             ):  # try without the variant, just in case user's preferences changed
-                result = load_class.from_pretrained(model_path, torch_dtype=self._torch_dtype)
+                result = load_class.from_pretrained(model_path, torch_dtype=self._torch_dtype, local_files_only=True)
             else:
                 raise e
 
@@ -384,15 +384,19 @@ class Qwen3EncoderLoader(ModelLoader):
 
         match submodel_type:
             case SubModelType.Tokenizer:
-                return AutoTokenizer.from_pretrained(tokenizer_path)
+                # Use local_files_only=True to prevent network requests for validation
+                # The tokenizer files should already exist locally in the model directory
+                return AutoTokenizer.from_pretrained(tokenizer_path, local_files_only=True)
             case SubModelType.TextEncoder:
                 # Determine safe dtype based on target device capabilities
                 target_device = TorchDevice.choose_torch_device()
                 model_dtype = TorchDevice.choose_bfloat16_safe_dtype(target_device)
+                # Use local_files_only=True to prevent network requests for validation
                 return Qwen3ForCausalLM.from_pretrained(
                     text_encoder_path,
                     torch_dtype=model_dtype,
                     low_cpu_mem_usage=True,
+                    local_files_only=True,
                 )
 
         raise ValueError(
@@ -526,12 +530,27 @@ class Qwen3EncoderCheckpointLoader(ModelLoader):
                 return self._load_from_singlefile(config)
             case SubModelType.Tokenizer:
                 # For single-file Qwen3, load tokenizer from HuggingFace
-                return AutoTokenizer.from_pretrained(self.DEFAULT_TOKENIZER_SOURCE)
+                # Try local cache first to support offline usage after initial download
+                return self._load_tokenizer_with_offline_fallback()
 
         raise ValueError(
             f"Only TextEncoder and Tokenizer submodels are supported. Received: {submodel_type.value if submodel_type else 'None'}"
         )
 
+    def _load_tokenizer_with_offline_fallback(self) -> AnyModel:
+        """Load tokenizer with local_files_only fallback for offline support.
+
+        First tries to load from local cache (offline), falling back to network download
+        if the tokenizer hasn't been cached yet. This ensures offline operation after
+        the initial download.
+        """
+        try:
+            # Try loading from local cache first (supports offline usage)
+            return AutoTokenizer.from_pretrained(self.DEFAULT_TOKENIZER_SOURCE, local_files_only=True)
+        except OSError:
+            # Not in cache yet, download from HuggingFace
+            return AutoTokenizer.from_pretrained(self.DEFAULT_TOKENIZER_SOURCE)
+
     def _load_from_singlefile(
         self,
         config: AnyModelConfig,
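The helper added above follows a simple cache-first pattern that can be reproduced outside InvokeAI. A minimal standalone sketch (the function name and the example model id are illustrative, not the loader's DEFAULT_TOKENIZER_SOURCE):

from transformers import AutoTokenizer


def load_tokenizer_cache_first(source: str):
    """Prefer the local Hugging Face cache; download only if it is not cached yet."""
    try:
        # Succeeds only if the tokenizer files are already in the local HF cache.
        return AutoTokenizer.from_pretrained(source, local_files_only=True)
    except OSError:
        # First run: download from the Hub, which also populates the cache for later offline use.
        return AutoTokenizer.from_pretrained(source)


# Example (illustrative model id): tokenizer = load_tokenizer_cache_first("Qwen/Qwen3-4B")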
@@ -557,7 +576,54 @@ class Qwen3EncoderCheckpointLoader(ModelLoader):
         # Load the state dict from safetensors file
         sd = load_file(model_path)
 
-        # Determine Qwen model configuration from state dict
+        # Handle ComfyUI quantized checkpoints
+        # ComfyUI stores quantized weights with accompanying scale factors:
+        # - layer.weight: quantized data (FP8)
+        # - layer.weight_scale: scale factor (FP32 scalar)
+        # Dequantization formula: dequantized = weight.to(dtype) * weight_scale
+        # Reference: https://github.com/Comfy-Org/ComfyUI/blob/master/QUANTIZATION.md
+        original_key_count = len(sd)
+        weight_scale_keys = [k for k in sd.keys() if k.endswith(".weight_scale")]
+        dequantized_count = 0
+
+        for scale_key in weight_scale_keys:
+            # Get the corresponding weight key (remove "_scale" suffix)
+            weight_key = scale_key.replace(".weight_scale", ".weight")
+            if weight_key in sd:
+                weight = sd[weight_key]
+                scale = sd[scale_key]
+                # Dequantize: convert to float and multiply by scale
+                # Handle block-wise quantization (e.g., FP4 with block_size=8)
+                # where scale has shape [weight_dim / block_size, ...]
+                # Note: Float8 types (e.g., float8_e4m3fn) require .float() instead of .to(torch.float32)
+                # as PyTorch doesn't support direct type promotion for Float8 types
+                weight_float = weight.float()
+                scale = scale.float()
+                if scale.shape != weight_float.shape and scale.numel() > 1:
+                    # Block-wise quantization: need to expand scale to match weight shape
+                    # Find which dimension differs and repeat scale along that dimension
+                    for dim in range(len(weight_float.shape)):
+                        if dim < len(scale.shape) and scale.shape[dim] != weight_float.shape[dim]:
+                            block_size = weight_float.shape[dim] // scale.shape[dim]
+                            if block_size > 1:
+                                # Repeat scale along this dimension to match weight shape
+                                scale = scale.repeat_interleave(block_size, dim=dim)
+                sd[weight_key] = weight_float * scale
+                dequantized_count += 1
+
+        if dequantized_count > 0:
+            logger.info(f"Dequantized {dequantized_count} ComfyUI quantized weights")
+
+        # Filter out ComfyUI quantization metadata keys (comfy_quant, weight_scale)
+        # These are no longer needed after dequantization
+        comfy_metadata_keys = [k for k in sd.keys() if "comfy_quant" in k or "weight_scale" in k]
+        for k in comfy_metadata_keys:
+            del sd[k]
+        if comfy_metadata_keys:
+            logger.info(f"Filtered out {len(comfy_metadata_keys)} ComfyUI quantization metadata keys")
+
+        logger.info(f"Loaded state dict with {len(sd)} keys (originally {original_key_count})")
+
         # Count the number of layers by looking at layer keys
         layer_count = 0
         for key in sd.keys():
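The dequantization added in this hunk reduces to weight.float() * scale, with the scale repeated along the blocked dimension when the checkpoint uses block-wise quantization. A self-contained sketch of that arithmetic (the helper name and tensor shapes are made up for illustration):

import torch


def dequantize(weight: torch.Tensor, scale: torch.Tensor) -> torch.Tensor:
    # Float8 weights need .float(); PyTorch does not promote Float8 types implicitly.
    weight_f = weight.float()
    scale_f = scale.float()
    if scale_f.shape != weight_f.shape and scale_f.numel() > 1:
        # Block-wise scale: repeat it along each dimension it was reduced over.
        for dim in range(weight_f.ndim):
            if dim < scale_f.ndim and scale_f.shape[dim] != weight_f.shape[dim]:
                block_size = weight_f.shape[dim] // scale_f.shape[dim]
                if block_size > 1:
                    scale_f = scale_f.repeat_interleave(block_size, dim=dim)
    return weight_f * scale_f


w = torch.randn(4, 8)
assert dequantize(w, torch.tensor(0.5)).shape == (4, 8)  # per-tensor scalar scale
assert dequantize(w, torch.ones(4, 2)).shape == (4, 8)   # block-wise scale: block_size 4 along dim 1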
@@ -570,34 +636,63 @@ class Qwen3EncoderCheckpointLoader(ModelLoader):
                 except ValueError:
                     pass
 
-        # Get hidden size from embed_tokens weight shape
+        # Get vocab size from embed_tokens weight shape
         embed_weight = sd.get("model.embed_tokens.weight")
         if embed_weight is None:
             raise ValueError("Could not find model.embed_tokens.weight in state dict")
-        if embed_weight.ndim != 2:
-            raise ValueError(
-                f"Expected 2D embed_tokens weight tensor, got shape {embed_weight.shape}. "
-                "The model file may be corrupted or incompatible."
-            )
-        hidden_size = embed_weight.shape[1]
+
         vocab_size = embed_weight.shape[0]
+        embed_hidden_size = embed_weight.shape[1]
+
+        # Detect model variant based on embed_tokens hidden size and layer count
+        # FLUX 2 Klein / Z-Image uses Qwen3 configurations from ComfyUI:
+        # Reference: https://github.com/comfyanonymous/ComfyUI/blob/master/comfy/text_encoders/llama.py
+        # - Qwen3-4B: hidden_size=2560, 36 layers, 32 heads, 8 KV heads, intermediate=9728
+        # - Qwen3-8B: hidden_size=4096, 36 layers, 32 heads, 8 KV heads, intermediate=12288
+        if embed_hidden_size == 2560 and layer_count == 36:
+            # Qwen3-4B variant (FLUX 2 Klein / Z-Image)
+            logger.info("Detected Qwen3-4B variant (FLUX 2 Klein / Z-Image)")
+            hidden_size = 2560
+            num_attention_heads = 32
+            num_kv_heads = 8
+            intermediate_size = 9728
+            head_dim = 128
+            max_position_embeddings = 40960
+        elif embed_hidden_size == 4096 and layer_count == 36:
+            # Qwen3-8B variant
+            logger.info("Detected Qwen3-8B variant")
+            hidden_size = 4096
+            num_attention_heads = 32
+            num_kv_heads = 8
+            intermediate_size = 12288
+            head_dim = 128
+            max_position_embeddings = 40960
+        else:
+            # Unknown variant - try to detect from weights
+            logger.warning(
+                f"Unknown Qwen3 variant: embed_hidden_size={embed_hidden_size}, layers={layer_count}. "
+                "Attempting to detect configuration from weights..."
+            )
+            q_proj_weight = sd.get("model.layers.0.self_attn.q_proj.weight")
+            k_proj_weight = sd.get("model.layers.0.self_attn.k_proj.weight")
+            gate_proj_weight = sd.get("model.layers.0.mlp.gate_proj.weight")
 
-        # Detect attention configuration from layer 0 weights
-        q_proj_weight = sd.get("model.layers.0.self_attn.q_proj.weight")
-        k_proj_weight = sd.get("model.layers.0.self_attn.k_proj.weight")
-        gate_proj_weight = sd.get("model.layers.0.mlp.gate_proj.weight")
+            if q_proj_weight is None or k_proj_weight is None or gate_proj_weight is None:
+                raise ValueError("Could not find attention/mlp weights to determine configuration")
 
-        if q_proj_weight is None or k_proj_weight is None or gate_proj_weight is None:
-            raise ValueError("Could not find attention/mlp weights in state dict to determine configuration")
+            hidden_size = embed_hidden_size
+            head_dim = 128
+            num_attention_heads = q_proj_weight.shape[0] // head_dim
+            num_kv_heads = k_proj_weight.shape[0] // head_dim
+            intermediate_size = gate_proj_weight.shape[0]
+            max_position_embeddings = 40960
 
-        # Calculate dimensions from actual weights
-        # Qwen3 uses head_dim separately from hidden_size
-        head_dim = 128  # Standard head dimension for Qwen3 models
-        num_attention_heads = q_proj_weight.shape[0] // head_dim
-        num_kv_heads = k_proj_weight.shape[0] // head_dim
-        intermediate_size = gate_proj_weight.shape[0]
+        logger.info(
+            f"Qwen3 config: hidden_size={hidden_size}, layers={layer_count}, "
+            f"heads={num_attention_heads}, kv_heads={num_kv_heads}, intermediate={intermediate_size}"
+        )
 
-        # Create Qwen3 config - matches the diffusers text_encoder/config.json
+        # Create Qwen3 config
         qwen_config = Qwen3Config(
             vocab_size=vocab_size,
             hidden_size=hidden_size,
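The variant detection above amounts to a lookup keyed on (embed_hidden_size, layer_count), with a weight-shape fallback for anything unrecognized. An illustrative restatement of that branch as a table (standalone sketch, not InvokeAI code):

# Keys are (embed_hidden_size, layer_count); values mirror the constants in the hunk above.
QWEN3_VARIANTS = {
    (2560, 36): {"hidden_size": 2560, "num_attention_heads": 32, "num_kv_heads": 8,
                 "intermediate_size": 9728, "head_dim": 128, "max_position_embeddings": 40960},
    (4096, 36): {"hidden_size": 4096, "num_attention_heads": 32, "num_kv_heads": 8,
                 "intermediate_size": 12288, "head_dim": 128, "max_position_embeddings": 40960},
}

cfg = QWEN3_VARIANTS.get((2560, 36))
assert cfg is not None and cfg["intermediate_size"] == 9728
# An unrecognized (embed_hidden_size, layer_count) pair returns None here, corresponding to
# the else branch above that derives the configuration from the projection weights instead.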
@@ -686,12 +781,27 @@ class Qwen3EncoderGGUFLoader(ModelLoader):
                 return self._load_from_gguf(config)
             case SubModelType.Tokenizer:
                 # For GGUF Qwen3, load tokenizer from HuggingFace
-                return AutoTokenizer.from_pretrained(self.DEFAULT_TOKENIZER_SOURCE)
+                # Try local cache first to support offline usage after initial download
+                return self._load_tokenizer_with_offline_fallback()
 
         raise ValueError(
             f"Only TextEncoder and Tokenizer submodels are supported. Received: {submodel_type.value if submodel_type else 'None'}"
         )
 
+    def _load_tokenizer_with_offline_fallback(self) -> AnyModel:
+        """Load tokenizer with local_files_only fallback for offline support.
+
+        First tries to load from local cache (offline), falling back to network download
+        if the tokenizer hasn't been cached yet. This ensures offline operation after
+        the initial download.
+        """
+        try:
+            # Try loading from local cache first (supports offline usage)
+            return AutoTokenizer.from_pretrained(self.DEFAULT_TOKENIZER_SOURCE, local_files_only=True)
+        except OSError:
+            # Not in cache yet, download from HuggingFace
+            return AutoTokenizer.from_pretrained(self.DEFAULT_TOKENIZER_SOURCE)
+
     def _load_from_gguf(
         self,
         config: AnyModelConfig,
@@ -737,7 +847,7 @@ class Qwen3EncoderGGUFLoader(ModelLoader):
                 except ValueError:
                     pass
 
-        # Get hidden size from embed_tokens weight shape
+        # Get vocab size from embed_tokens weight shape
         embed_weight = sd.get("model.embed_tokens.weight")
         if embed_weight is None:
             raise ValueError("Could not find model.embed_tokens.weight in state dict")
@@ -749,13 +859,23 @@ class Qwen3EncoderGGUFLoader(ModelLoader):
                 f"Expected 2D embed_tokens weight tensor, got shape {embed_shape}. "
                 "The model file may be corrupted or incompatible."
             )
-        hidden_size = embed_shape[1]
         vocab_size = embed_shape[0]
 
-        # Detect attention configuration from layer 0 weights
-        q_proj_weight = sd.get("model.layers.0.self_attn.q_proj.weight")
-        k_proj_weight = sd.get("model.layers.0.self_attn.k_proj.weight")
-        gate_proj_weight = sd.get("model.layers.0.mlp.gate_proj.weight")
+        # Detect attention configuration from layer weights
+        # IMPORTANT: Use layer 1 (not layer 0) because some models like FLUX 2 Klein have a special
+        # first layer with different dimensions (input projection layer) while the rest of the
+        # transformer layers have a different hidden_size. Using a middle layer ensures we get
+        # the representative hidden_size for the bulk of the model.
+        # Fall back to layer 0 if layer 1 doesn't exist.
+        q_proj_weight = sd.get("model.layers.1.self_attn.q_proj.weight")
+        k_proj_weight = sd.get("model.layers.1.self_attn.k_proj.weight")
+        gate_proj_weight = sd.get("model.layers.1.mlp.gate_proj.weight")
+
+        # Fall back to layer 0 if layer 1 doesn't exist (single-layer model edge case)
+        if q_proj_weight is None:
+            q_proj_weight = sd.get("model.layers.0.self_attn.q_proj.weight")
+            k_proj_weight = sd.get("model.layers.0.self_attn.k_proj.weight")
+            gate_proj_weight = sd.get("model.layers.0.mlp.gate_proj.weight")
 
         if q_proj_weight is None or k_proj_weight is None or gate_proj_weight is None:
             raise ValueError("Could not find attention/mlp weights in state dict to determine configuration")
@@ -766,7 +886,14 @@ class Qwen3EncoderGGUFLoader(ModelLoader):
         gate_shape = gate_proj_weight.shape if hasattr(gate_proj_weight, "shape") else gate_proj_weight.tensor_shape
 
         # Calculate dimensions from actual weights
+        # IMPORTANT: Use hidden_size from k_proj input dimension (not q_proj or embed_tokens).
+        # Some models (like FLUX 2 Klein) have unusual architectures where:
+        # - embed_tokens has a larger dimension (e.g., 2560)
+        # - q_proj may have a larger input dimension for query expansion
+        # - k_proj/v_proj have the actual transformer hidden_size (e.g., 1280)
+        # Using k_proj ensures we get the correct internal hidden_size.
         head_dim = 128  # Standard head dimension for Qwen3 models
+        hidden_size = k_shape[1]  # Use k_proj input dim as the hidden_size
         num_attention_heads = q_shape[0] // head_dim
         num_kv_heads = k_shape[0] // head_dim
         intermediate_size = gate_shape[0]
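The head/size arithmetic in this hunk can be checked by hand. A worked example using Qwen3-4B-style projection shapes quoted earlier in the diff (the shapes below are illustrative, not read from a real checkpoint):

head_dim = 128

# [out_features, in_features] for one layer's projections in a Qwen3-4B-style model.
q_shape = (32 * head_dim, 2560)   # q_proj: num_attention_heads * head_dim output rows
k_shape = (8 * head_dim, 2560)    # k_proj: num_kv_heads * head_dim rows, hidden_size columns
gate_shape = (9728, 2560)         # gate_proj: intermediate_size output rows

hidden_size = k_shape[1]                      # 2560 -- k_proj input dim, per the comment above
num_attention_heads = q_shape[0] // head_dim  # 32
num_kv_heads = k_shape[0] // head_dim         # 8
intermediate_size = gate_shape[0]             # 9728

assert (hidden_size, num_attention_heads, num_kv_heads, intermediate_size) == (2560, 32, 8, 9728)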
@@ -690,6 +690,115 @@ flux_fill = StarterModel(
 )
 # endregion
 
+# region FLUX.2 Klein
+flux2_vae = StarterModel(
+    name="FLUX.2 VAE",
+    base=BaseModelType.Flux2,
+    source="black-forest-labs/FLUX.2-klein-4B::vae",
+    description="FLUX.2 VAE (16-channel, same architecture as FLUX.1 VAE). ~335MB",
+    type=ModelType.VAE,
+)
+
+flux2_klein_qwen3_4b_encoder = StarterModel(
+    name="FLUX.2 Klein Qwen3 4B Encoder",
+    base=BaseModelType.Any,
+    source="black-forest-labs/FLUX.2-klein-4B::text_encoder+tokenizer",
+    description="Qwen3 4B text encoder for FLUX.2 Klein 4B (also compatible with Z-Image). ~8GB",
+    type=ModelType.Qwen3Encoder,
+)
+
+flux2_klein_qwen3_8b_encoder = StarterModel(
+    name="FLUX.2 Klein Qwen3 8B Encoder",
+    base=BaseModelType.Any,
+    source="black-forest-labs/FLUX.2-klein-9B::text_encoder+tokenizer",
+    description="Qwen3 8B text encoder for FLUX.2 Klein 9B models. ~16GB",
+    type=ModelType.Qwen3Encoder,
+)
+
+flux2_klein_4b = StarterModel(
+    name="FLUX.2 Klein 4B (Diffusers)",
+    base=BaseModelType.Flux2,
+    source="black-forest-labs/FLUX.2-klein-4B",
+    description="FLUX.2 Klein 4B in Diffusers format - includes transformer, VAE and Qwen3 encoder. ~10GB",
+    type=ModelType.Main,
+)
+
+flux2_klein_4b_single = StarterModel(
+    name="FLUX.2 Klein 4B",
+    base=BaseModelType.Flux2,
+    source="https://huggingface.co/black-forest-labs/FLUX.2-klein-4B/resolve/main/flux-2-klein-4b.safetensors",
+    description="FLUX.2 Klein 4B standalone transformer. Installs with VAE and Qwen3 4B encoder. ~8GB",
+    type=ModelType.Main,
+    dependencies=[flux2_vae, flux2_klein_qwen3_4b_encoder],
+)
+
+flux2_klein_4b_fp8 = StarterModel(
+    name="FLUX.2 Klein 4B (FP8)",
+    base=BaseModelType.Flux2,
+    source="https://huggingface.co/black-forest-labs/FLUX.2-klein-4b-fp8/resolve/main/flux-2-klein-4b-fp8.safetensors",
+    description="FLUX.2 Klein 4B FP8 quantized - smaller and faster. Installs with VAE and Qwen3 4B encoder. ~4GB",
+    type=ModelType.Main,
+    dependencies=[flux2_vae, flux2_klein_qwen3_4b_encoder],
+)
+
+flux2_klein_9b = StarterModel(
+    name="FLUX.2 Klein 9B (Diffusers)",
+    base=BaseModelType.Flux2,
+    source="black-forest-labs/FLUX.2-klein-9B",
+    description="FLUX.2 Klein 9B in Diffusers format - includes transformer, VAE and Qwen3 encoder. ~20GB",
+    type=ModelType.Main,
+)
+
+flux2_klein_9b_fp8 = StarterModel(
+    name="FLUX.2 Klein 9B (FP8)",
+    base=BaseModelType.Flux2,
+    source="https://huggingface.co/black-forest-labs/FLUX.2-klein-9b-fp8/resolve/main/flux-2-klein-9b-fp8.safetensors",
+    description="FLUX.2 Klein 9B FP8 quantized - more efficient than full precision. Installs with VAE and Qwen3 8B encoder. ~9.5GB",
+    type=ModelType.Main,
+    dependencies=[flux2_vae, flux2_klein_qwen3_8b_encoder],
+)
+
+flux2_klein_4b_gguf_q4 = StarterModel(
+    name="FLUX.2 Klein 4B (GGUF Q4)",
+    base=BaseModelType.Flux2,
+    source="https://huggingface.co/unsloth/FLUX.2-klein-4B-GGUF/resolve/main/flux-2-klein-4b-Q4_K_M.gguf",
+    description="FLUX.2 Klein 4B GGUF Q4_K_M quantized - runs on 6-8GB VRAM. Installs with VAE and Qwen3 4B encoder. ~2.6GB",
+    type=ModelType.Main,
+    format=ModelFormat.GGUFQuantized,
+    dependencies=[flux2_vae, flux2_klein_qwen3_4b_encoder],
+)
+
+flux2_klein_4b_gguf_q8 = StarterModel(
+    name="FLUX.2 Klein 4B (GGUF Q8)",
+    base=BaseModelType.Flux2,
+    source="https://huggingface.co/unsloth/FLUX.2-klein-4B-GGUF/resolve/main/flux-2-klein-4b-Q8_0.gguf",
+    description="FLUX.2 Klein 4B GGUF Q8_0 quantized - higher quality than Q4. Installs with VAE and Qwen3 4B encoder. ~4.3GB",
+    type=ModelType.Main,
+    format=ModelFormat.GGUFQuantized,
+    dependencies=[flux2_vae, flux2_klein_qwen3_4b_encoder],
+)
+
+flux2_klein_9b_gguf_q4 = StarterModel(
+    name="FLUX.2 Klein 9B (GGUF Q4)",
+    base=BaseModelType.Flux2,
+    source="https://huggingface.co/unsloth/FLUX.2-klein-9B-GGUF/resolve/main/flux-2-klein-9b-Q4_K_M.gguf",
+    description="FLUX.2 Klein 9B GGUF Q4_K_M quantized - runs on 12GB+ VRAM. Installs with VAE and Qwen3 8B encoder. ~5.8GB",
+    type=ModelType.Main,
+    format=ModelFormat.GGUFQuantized,
+    dependencies=[flux2_vae, flux2_klein_qwen3_8b_encoder],
+)
+
+flux2_klein_9b_gguf_q8 = StarterModel(
+    name="FLUX.2 Klein 9B (GGUF Q8)",
+    base=BaseModelType.Flux2,
+    source="https://huggingface.co/unsloth/FLUX.2-klein-9B-GGUF/resolve/main/flux-2-klein-9b-Q8_0.gguf",
+    description="FLUX.2 Klein 9B GGUF Q8_0 quantized - higher quality than Q4. Installs with VAE and Qwen3 8B encoder. ~10GB",
+    type=ModelType.Main,
+    format=ModelFormat.GGUFQuantized,
+    dependencies=[flux2_vae, flux2_klein_qwen3_8b_encoder],
+)
+# endregion
+
 
 # region Z-Image
 z_image_qwen3_encoder = StarterModel(
@@ -720,20 +829,20 @@ z_image_turbo_quantized = StarterModel(
     name="Z-Image Turbo (quantized)",
     base=BaseModelType.ZImage,
     source="https://huggingface.co/leejet/Z-Image-Turbo-GGUF/resolve/main/z_image_turbo-Q4_K.gguf",
-    description="Z-Image Turbo quantized to GGUF Q4_K format. Requires separate Qwen3 text encoder. ~4GB",
+    description="Z-Image Turbo quantized to GGUF Q4_K format. Requires standalone Qwen3 text encoder and Flux VAE. ~4GB",
     type=ModelType.Main,
     format=ModelFormat.GGUFQuantized,
-    dependencies=[z_image_qwen3_encoder_quantized],
+    dependencies=[z_image_qwen3_encoder_quantized, flux_vae],
 )
 
 z_image_turbo_q8 = StarterModel(
     name="Z-Image Turbo (Q8)",
     base=BaseModelType.ZImage,
     source="https://huggingface.co/leejet/Z-Image-Turbo-GGUF/resolve/main/z_image_turbo-Q8_0.gguf",
-    description="Z-Image Turbo quantized to GGUF Q8_0 format. Higher quality, larger size. Requires separate Qwen3 text encoder. ~6.6GB",
+    description="Z-Image Turbo quantized to GGUF Q8_0 format. Higher quality, larger size. Requires standalone Qwen3 text encoder and Flux VAE. ~6.6GB",
     type=ModelType.Main,
     format=ModelFormat.GGUFQuantized,
-    dependencies=[z_image_qwen3_encoder_quantized],
+    dependencies=[z_image_qwen3_encoder_quantized, flux_vae],
 )
 
 z_image_controlnet_union = StarterModel(
@@ -826,6 +935,18 @@ STARTER_MODELS: list[StarterModel] = [
     flux_redux,
     llava_onevision,
     flux_fill,
+    flux2_vae,
+    flux2_klein_4b,
+    flux2_klein_4b_single,
+    flux2_klein_4b_fp8,
+    flux2_klein_9b,
+    flux2_klein_9b_fp8,
+    flux2_klein_4b_gguf_q4,
+    flux2_klein_4b_gguf_q8,
+    flux2_klein_9b_gguf_q4,
+    flux2_klein_9b_gguf_q8,
+    flux2_klein_qwen3_4b_encoder,
+    flux2_klein_qwen3_8b_encoder,
     cogview4,
     flux_krea,
     flux_krea_quantized,
@@ -890,10 +1011,26 @@ flux_bundle: list[StarterModel] = [
     flux_krea_quantized,
 ]
 
+zimage_bundle: list[StarterModel] = [
+    z_image_turbo_quantized,
+    z_image_qwen3_encoder_quantized,
+    z_image_controlnet_union,
+    z_image_controlnet_tile,
+    flux_vae,
+]
+
+flux2_klein_bundle: list[StarterModel] = [
+    flux2_klein_4b_gguf_q4,
+    flux2_vae,
+    flux2_klein_qwen3_4b_encoder,
+]
+
 STARTER_BUNDLES: dict[str, StarterModelBundle] = {
     BaseModelType.StableDiffusion1: StarterModelBundle(name="Stable Diffusion 1.5", models=sd1_bundle),
     BaseModelType.StableDiffusionXL: StarterModelBundle(name="SDXL", models=sdxl_bundle),
     BaseModelType.Flux: StarterModelBundle(name="FLUX.1 dev", models=flux_bundle),
+    BaseModelType.Flux2: StarterModelBundle(name="FLUX.2 Klein", models=flux2_klein_bundle),
+    BaseModelType.ZImage: StarterModelBundle(name="Z-Image Turbo", models=zimage_bundle),
 }
 
 assert len(STARTER_MODELS) == len({m.source for m in STARTER_MODELS}), "Duplicate starter models"
@@ -46,6 +46,8 @@ class BaseModelType(str, Enum):
     """Indicates the model is associated with the Stable Diffusion XL Refiner model architecture."""
     Flux = "flux"
     """Indicates the model is associated with FLUX.1 model architecture, including FLUX Dev, Schnell and Fill."""
+    Flux2 = "flux2"
+    """Indicates the model is associated with FLUX.2 model architecture, including FLUX2 Klein."""
     CogView4 = "cogview4"
     """Indicates the model is associated with CogView 4 model architecture."""
     ZImage = "z-image"
@@ -111,11 +113,36 @@ class ModelVariantType(str, Enum):
 
 
 class FluxVariantType(str, Enum):
+    """FLUX.1 model variants."""
+
     Schnell = "schnell"
     Dev = "dev"
     DevFill = "dev_fill"
 
 
+class Flux2VariantType(str, Enum):
+    """FLUX.2 model variants."""
+
+    Klein4B = "klein_4b"
+    """Flux2 Klein 4B variant using Qwen3 4B text encoder."""
+
+    Klein9B = "klein_9b"
+    """Flux2 Klein 9B variant using Qwen3 8B text encoder (distilled)."""
+
+    Klein9BBase = "klein_9b_base"
+    """Flux2 Klein 9B Base variant - undistilled foundation model using Qwen3 8B text encoder."""
+
+
+class Qwen3VariantType(str, Enum):
+    """Qwen3 text encoder variants based on model size."""
+
+    Qwen3_4B = "qwen3_4b"
+    """Qwen3 4B text encoder (hidden_size=2560). Used by FLUX.2 Klein 4B and Z-Image."""
+
+    Qwen3_8B = "qwen3_8b"
+    """Qwen3 8B text encoder (hidden_size=4096). Used by FLUX.2 Klein 9B."""
+
+
 class ModelFormat(str, Enum):
     """Storage format of model."""
 
@@ -174,7 +201,7 @@ class FluxLoRAFormat(str, Enum):
     XLabs = "flux.xlabs"
 
 
-AnyVariant: TypeAlias = Union[ModelVariantType, ClipVariantType, FluxVariantType]
-variant_type_adapter = TypeAdapter[ModelVariantType | ClipVariantType | FluxVariantType](
-    ModelVariantType | ClipVariantType | FluxVariantType
-)
+AnyVariant: TypeAlias = Union[ModelVariantType, ClipVariantType, FluxVariantType, Flux2VariantType, Qwen3VariantType]
+variant_type_adapter = TypeAdapter[
+    ModelVariantType | ClipVariantType | FluxVariantType | Flux2VariantType | Qwen3VariantType
+](ModelVariantType | ClipVariantType | FluxVariantType | Flux2VariantType | Qwen3VariantType)
@@ -60,7 +60,7 @@ def filter_files(
 
     # Start by filtering on model file extensions, discarding images, docs, etc
    for file in files:
-        if file.name.endswith((".json", ".txt")):
+        if file.name.endswith((".json", ".txt", ".jinja")):  # .jinja for chat templates
            paths.append(file)
        elif file.name.endswith(
            (
@@ -116,7 +116,8 @@ def _filter_by_variant(files: List[Path], variant: ModelRepoVariant) -> Set[Path
 
         # Note: '.model' was added to support:
         # https://huggingface.co/black-forest-labs/FLUX.1-schnell/blob/768d12a373ed5cc9ef9a9dea7504dc09fcc14842/tokenizer_2/spiece.model
-        elif path.suffix in [".json", ".txt", ".model"]:
+        # Note: '.jinja' was added to support chat templates for FLUX.2 Klein models
+        elif path.suffix in [".json", ".txt", ".model", ".jinja"]:
             result.add(path)
 
         elif variant in [
@@ -140,16 +140,50 @@ def _get_lora_layer_values(layer_dict: dict[str, torch.Tensor], alpha: float | N
 
 
 def _group_by_layer(state_dict: Dict[str, torch.Tensor]) -> dict[str, dict[str, torch.Tensor]]:
-    """Groups the keys in the state dict by layer."""
+    """Groups the keys in the state dict by layer.
+
+    Z-Image LoRAs have keys like:
+    - diffusion_model.layers.17.attention.to_k.alpha
+    - diffusion_model.layers.17.attention.to_k.dora_scale
+    - diffusion_model.layers.17.attention.to_k.lora_down.weight
+    - diffusion_model.layers.17.attention.to_k.lora_up.weight
+
+    We need to group these by the full layer path (e.g., diffusion_model.layers.17.attention.to_k)
+    and extract the suffix (alpha, dora_scale, lora_down.weight, lora_up.weight).
+    """
     layer_dict: dict[str, dict[str, torch.Tensor]] = {}
+
+    # Known suffixes that indicate the end of a layer name
+    known_suffixes = [
+        ".lora_A.weight",
+        ".lora_B.weight",
+        ".lora_down.weight",
+        ".lora_up.weight",
+        ".dora_scale",
+        ".alpha",
+    ]
+
     for key in state_dict:
         if not isinstance(key, str):
             continue
-        # Split the 'lora_A.weight' or 'lora_B.weight' suffix from the layer name.
-        parts = key.rsplit(".", maxsplit=2)
-        layer_name = parts[0]
-        key_name = ".".join(parts[1:])
+
+        # Try to find a known suffix
+        layer_name = None
+        key_name = None
+        for suffix in known_suffixes:
+            if key.endswith(suffix):
+                layer_name = key[: -len(suffix)]
+                key_name = suffix[1:]  # Remove leading dot
+                break
+
+        if layer_name is None:
+            # Fallback to original logic for unknown formats
+            parts = key.rsplit(".", maxsplit=2)
+            layer_name = parts[0]
+            key_name = ".".join(parts[1:])
+
         if layer_name not in layer_dict:
             layer_dict[layer_name] = {}
         layer_dict[layer_name][key_name] = state_dict[key]
+
     return layer_dict
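The reason the new grouping matches whole suffixes instead of splitting on the last two dots is easiest to see with a single-segment suffix such as ".alpha". A standalone comparison using one of the keys from the docstring above:

key = "diffusion_model.layers.17.attention.to_k.alpha"

# Previous behavior: rsplit(".", maxsplit=2) cuts inside the layer path for ".alpha" keys.
parts = key.rsplit(".", maxsplit=2)
assert parts[0] == "diffusion_model.layers.17.attention"  # truncated layer name
assert ".".join(parts[1:]) == "to_k.alpha"                # suffix absorbs part of the layer path

# New behavior: strip a known suffix so the full layer path is preserved.
suffix = ".alpha"
layer_name, key_name = key[: -len(suffix)], suffix[1:]
assert layer_name == "diffusion_model.layers.17.attention.to_k"
assert key_name == "alpha"

# Two-segment suffixes such as ".lora_down.weight" happened to split correctly with rsplit,
# so the suffix matching mainly changes how ".alpha" and ".dora_scale" keys are grouped.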