PyPI - codon-model - Versions diffs - 0.0.2__tar.gz → 0.0.3a1__tar.gz - Mend

codon-model 0.0.2__tar.gz → 0.0.3a1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (61) hide show

{codon_model-0.0.2/codon_model.egg-info → codon_model-0.0.3a1}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: codon-model
-Version: 0.0.2
+Version: 0.0.3a1
 Summary: Codon model package
 Author: CodonTeam
 Requires-Python: >=3.8

{codon_model-0.0.2 → codon_model-0.0.3a1}/codon/__init__.py RENAMED Viewed

@@ -1,5 +1,5 @@
 from typing import Optional
-__version__ = '0.0.2'
+__version__ = '0.0.3a1'
 __seed__: Optional[int] = None

codon_model-0.0.3a1/codon/kit/train/__init__.py ADDED Viewed

@@ -0,0 +1,6 @@
+from .vision import auto_vision_train, AutoVisionTrainResult
+__all__ = [
+    'auto_vision_train',
+    'AutoVisionTrainResult'
+]

codon_model-0.0.2/codon/kit/auto_vision_train.py → codon_model-0.0.3a1/codon/kit/train/vision.py RENAMED Viewed

@@ -2,19 +2,23 @@ import torch
 import torch.nn as nn
 import numpy as np
+from PIL import Image
 from dataclasses import dataclass
 from typing import Union, Optional, Literal, Callable
-from PIL import Image
-from codon.model.motif.motif_v1 import MotifV1, MotifV1Output
-from codon.model.patch_disc import PatchDiscriminator
+from codon.model import PatchDiscriminator
+from codon.model.motif import (
+    AutoencoderVisionModel,
+    AutoVisionEncoderOutput,
+    AutoVisionDecoderOutput
+)
 from codon.utils.split import split_image, SplitedImage
 @dataclass
-class AutoTrainMotifVisionOutput:
+class AutoVisionTrainResult:
     '''
-    Dataclass to hold the outputs and metrics from a single auto_train step.
+    Dataclass to hold the outputs and metrics from a single auto_vision_train step.
     Attributes:
         loss_g (float): Total generator loss.
@@ -40,8 +44,30 @@ class AutoTrainMotifVisionOutput:
     fake_patches: Optional[torch.Tensor] = None
-def auto_train_motif_vision(
-    model: MotifV1,
+def _patches_to_image(patches: torch.Tensor, grid_shape: tuple) -> torch.Tensor:
+    '''
+    Helper function to reconstruct a full image tensor from a sequence of patches.
+    This is used to supply a padded full image to the generic AutoencoderVisionModel.encode().
+    Args:
+        patches (torch.Tensor): Patches tensor with shape [num_patches_h * num_patches_w, channels, patch_size, patch_size].
+        grid_shape (tuple): Grid shape as (num_patches_h, num_patches_w).
+    Returns:
+        torch.Tensor: Reconstructed full image tensor with shape [1, channels, height, width].
+    '''
+    num_patches_h, num_patches_w = grid_shape
+    channels, patch_size = patches.shape[1], patches.shape[2]
+    patches = patches.view(1, num_patches_h, num_patches_w, channels, patch_size, patch_size)
+    patches = patches.permute(0, 3, 1, 4, 2, 5).contiguous()
+    patches = patches.view(1, channels, num_patches_h * patch_size, num_patches_w * patch_size)
+    return patches
+def auto_vision_train(
+    model: AutoencoderVisionModel,
     discriminator: PatchDiscriminator,
     optimizer_g: torch.optim.Optimizer,
     optimizer_d: torch.optim.Optimizer,
@@ -53,36 +79,36 @@ def auto_train_motif_vision(
     perceptual_weight: float = 1.0,
     adv_weight: float = 0.1,
     quant_weight: float = 1.0,
-    codebook_size: int = 2**18,
-    device: Union[str, torch.device] = 'cpu'
-) -> AutoTrainMotifVisionOutput:
+) -> AutoVisionTrainResult:
     '''
-    Executes a single end-to-end training step for the MotifV1 autoencoder.
+    Executes a single end-to-end training step for an AutoencoderVisionModel.
-    This function handles image splitting, forward passes for both the generator (MotifV1)
-    and the discriminator (PatchDiscriminator), loss calculations (including GAN, LPIPS,
-    L1/MSE, and Quantization), and backpropagation.
+    This function handles image splitting (with necessary padding), forward passes for both
+    the generator (AutoencoderVisionModel) and the discriminator (PatchDiscriminator),
+    loss calculations (including GAN, LPIPS, L1/MSE, and Quantization), and backpropagation.
     Args:
-        model (MotifV1): The MotifV1 autoencoder model.
+        model (AutoencoderVisionModel): The autoencoder vision model.
         discriminator (PatchDiscriminator): The PatchGAN discriminator.
-        optimizer_g (torch.optim.Optimizer): Optimizer for the MotifV1 model.
+        optimizer_g (torch.optim.Optimizer): Optimizer for the autoencoder model.
         optimizer_d (torch.optim.Optimizer): Optimizer for the discriminator.
         image (Union[torch.Tensor, str, Image.Image, np.ndarray]): The input image.
-        patch_size (int): The patch size used by the MotifV1 model. Defaults to 12.
+        patch_size (int): The patch size used by the model. Defaults to 12.
         recon_loss_type (Literal['l1', 'mse']): Type of reconstruction loss. Defaults to 'l1'.
         recon_weight (float): Weight for the reconstruction loss. Defaults to 1.0.
         perceptual_loss_fn (Callable, optional): Initialized LPIPS or other perceptual loss function. Defaults to None.
         perceptual_weight (float): Weight for the perceptual loss. Defaults to 1.0.
         adv_weight (float): Weight for the generator's adversarial GAN loss. Defaults to 0.1.
         quant_weight (float): Weight for the lookup-free quantization loss. Defaults to 1.0.
-        codebook_size (int): The total capacity of the codebook. Defaults to 2^18 = 262144.
-        device (Union[str, torch.device]): Device to perform computations on. Defaults to 'cpu'.
     Returns:
-        AutoTrainMotifVisionOutput: Dataclass containing all the calculated losses and metrics.
+        AutoVisionTrainResult: Dataclass containing all the calculated losses and metrics.
     '''
-    # 1. Process and split the input image
+    # Fallback mechanisms to get device and codebook_size if they aren't explicitly properties
+    device = getattr(model, 'device', next(model.parameters()).device)
+    codebook_size = getattr(model, 'codebook_size', 2**18)
+    # 1. Process and split the input image with padding to handle arbitrary sizes
     splited: SplitedImage = split_image(
         image=image,
         patch_size=patch_size,
@@ -102,10 +128,16 @@ def auto_train_motif_vision(
     else:
         recon_criterion = mse_criterion
-    # Forward pass through MotifV1 once, reusing outputs for both discriminator and generator
-    motif_out: MotifV1Output = model(real_patches, grid_shape)
-    fake_patches = motif_out.reconstructed_image
+    # 2. Forward pass through generator (AutoencoderVisionModel)
+    # Reconstruct padded full image to feed into generic encode method
+    padded_full_image = _patches_to_image(real_patches, grid_shape).to(device)
+    encoder_out: AutoVisionEncoderOutput = model.encode(padded_full_image)
+    decoder_out: AutoVisionDecoderOutput = model.decode(encoder_out)
+    fake_patches = decoder_out.reconstructed
+    # 3. Discriminator Training
     optimizer_d.zero_grad()
     # Forward discriminator on real patches
@@ -121,51 +153,61 @@ def auto_train_motif_vision(
     loss_d.backward()
     optimizer_d.step()
+    # 4. Generator Training
     optimizer_g.zero_grad()
-    # 2.1 Reconstruction Loss (L1 or MSE)
+    # 4.1 Reconstruction Loss (L1 or MSE)
     loss_recon = recon_criterion(fake_patches, real_patches)
-    # 2.2 Perceptual Loss (LPIPS)
+    # 4.2 Perceptual Loss (LPIPS)
     loss_perceptual_val = torch.tensor(0.0, device=device)
     if perceptual_loss_fn is not None:
-        # LPIPS expects input in range [-1, 1], Motif uses [0, 1]
+        # Expected image range handling: LPIPS usually expects [-1, 1], models might output [0, 1]
         p_real = real_patches * 2.0 - 1.0
         p_fake = fake_patches * 2.0 - 1.0
         loss_perceptual_val = perceptual_loss_fn(p_real, p_fake).mean()
-    # 2.3 Quantization Loss
-    loss_quant = motif_out.quantization_loss
+    # 4.3 Quantization Loss
+    # Fallback to 0.0 if the encoder output does not provide a quantization loss (e.g., standard AE)
+    loss_quant_val = torch.tensor(0.0, device=device)
+    if encoder_out.loss is not None:
+        loss_quant_val = encoder_out.loss
-    # 2.4 Generator Adversarial Loss
+    # 4.4 Generator Adversarial Loss
     d_out_fake_g = discriminator(fake_patches)
     loss_adv = mse_criterion(d_out_fake_g, torch.ones_like(d_out_fake_g))
-    # 2.5 Total Generator Loss
+    # 4.5 Total Generator Loss
     loss_g = (
         recon_weight * loss_recon +
         perceptual_weight * loss_perceptual_val +
-        quant_weight * loss_quant +
+        quant_weight * loss_quant_val +
         adv_weight * loss_adv
     )
     loss_g.backward()
     optimizer_g.step()
-    # Calculate codebook utilization
-    indices = motif_out.indices
-    unique_indices = torch.unique(indices)
-    usage_rate = unique_indices.numel() / codebook_size
-    return AutoTrainMotifVisionOutput(
+    # Calculate codebook utilization if applicable
+    usage_rate = 0.0
+    if encoder_out.indices is not None:
+        indices = encoder_out.indices
+        unique_indices = torch.unique(indices)
+        usage_rate = unique_indices.numel() / codebook_size
+    perplexity_val = 0.0
+    if encoder_out.perplexity is not None:
+        perplexity_val = encoder_out.perplexity.item()
+    return AutoVisionTrainResult(
         loss_g=loss_g.item(),
         loss_d=loss_d.item(),
         loss_recon=loss_recon.item(),
         loss_perceptual=loss_perceptual_val.item(),
-        loss_quant=loss_quant.item(),
+        loss_quant=loss_quant_val.item(),
         loss_adv=loss_adv.item(),
         codebook_usage_rate=float(usage_rate),
-        perplexity=motif_out.perplexity.item(),
+        perplexity=float(perplexity_val),
         real_patches=real_patches,
         fake_patches=fake_patches
     )

codon_model-0.0.3a1/codon/model/__init__.py ADDED Viewed

@@ -0,0 +1,9 @@
+from .resnet import ResNet
+from .patch_disc import PatchDiscriminator
+from .tcn import TemporalConvNet
+__all__ = [
+    'ResNet',
+    'PatchDiscriminator',
+    'TemporalConvNet'
+]

codon_model-0.0.3a1/codon/model/motif/__init__.py ADDED Viewed

@@ -0,0 +1,20 @@
+from .base import (
+    CausalLanguageModel,
+    CausalLanguageModelOutput,
+    AutoencoderVisionModel,
+    AutoVisionEncoderOutput,
+    AutoVisionDecoderOutput
+)
+from .motif_a1 import MotifA1
+from .motif_v1 import MotifV1Encoder, MotifV1Decoder, MotifV1
+__all__ = [
+    'CausalLanguageModel',
+    'CausalLanguageModelOutput',
+    'AutoencoderVisionModel',
+    'AutoVisionEncoderOutput',
+    'AutoVisionDecoderOutput',
+    'MotifA1',
+    'MotifV1Encoder', 'MotifV1Decoder', 'MotifV1',
+]

codon_model-0.0.3a1/codon/model/motif/base.py ADDED Viewed

@@ -0,0 +1,231 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from typing import Callable, Any, Iterator, Union, Optional, List, Tuple
+from dataclasses import dataclass
+from codon.base import BasicModel
+@dataclass
+class AutoVisionEncoderOutput:
+    '''
+    Output of autoencoder vision model encoder.
+    Attributes:
+        z_q (torch.Tensor): Quantized latent tensor.
+        loss (torch.Tensor): Quantization loss.
+        indices (torch.Tensor): Quantized indices.
+        grid_shape (tuple): Grid shape as (num_patches_h, num_patches_w).
+        entropy (torch.Tensor): Average bit-wise entropy from codebook.
+        perplexity (torch.Tensor): Perplexity calculated as 2^entropy.
+        hidden_states (torch.Tensor): Hidden states before quantization.
+    '''
+    z_q: torch.Tensor
+    loss: torch.Tensor = None
+    indices: torch.Tensor = None
+    grid_shape: tuple = None
+    entropy: torch.Tensor = None
+    perplexity: torch.Tensor = None
+    hidden_states: torch.Tensor = None
+@dataclass
+class AutoVisionDecoderOutput:
+    '''
+    Output of autoencoder vision model decoder.
+    Attributes:
+        reconstructed (torch.Tensor): Reconstructed output tensor.
+        grid_shape (tuple): Grid shape as (num_patches_h, num_patches_w).
+        hidden_states (torch.Tensor): Hidden states after attention.
+    '''
+    reconstructed: torch.Tensor
+    grid_shape: tuple = None
+    hidden_states: torch.Tensor = None
+@dataclass
+class CausalLanguageModelOutput:
+    '''
+    Output of causal language model.
+    Attributes:
+        logits (torch.Tensor): Prediction logits.
+        past_key_values (list, optional): List of past key value states.
+        aux_loss (torch.Tensor, optional): Auxiliary loss.
+        attentions (list, optional): List of attention weights.
+        hidden_states (tuple, optional): Tuple of hidden states.
+    '''
+    logits: torch.Tensor
+    past_key_values: Optional[List[Tuple[torch.Tensor, torch.Tensor]]] = None
+    aux_loss: Optional[torch.Tensor] = None
+    attentions: Optional[List[torch.Tensor]] = None
+    hidden_states: Optional[Tuple[torch.Tensor]] = None
+class CausalLanguageModel(BasicModel):
+    '''
+    Base class for causal language models with text generation capabilities.
+    Attributes:
+        gradient_checkpointing (bool): Whether gradient checkpointing is enabled.
+    '''
+    def generate(
+        self,
+        input_ids: torch.Tensor,
+        max_new_tokens: int = 100,
+        temperature: float = 1.0,
+        top_k: int = None,
+        eos_token_id: int = None
+    ) -> torch.Tensor:
+        '''
+        Generate text tokens autoregressively.
+        Args:
+            input_ids (torch.Tensor): Input token IDs with shape [batch, seq_len].
+            max_new_tokens (int): Maximum number of new tokens to generate. Defaults to 100.
+            temperature (float): Sampling temperature. Higher values increase randomness.
+                                 Defaults to 1.0.
+            top_k (int, optional): If set, sample only from top k tokens. Defaults to None.
+            eos_token_id (int, optional): End-of-sequence token ID. If None, generation
+                                          stops after max_new_tokens. Defaults to None.
+        Returns:
+            torch.Tensor: Generated token IDs with shape [batch, seq_len + num_generated].
+        '''
+        self.eval()
+        with torch.no_grad():
+            batch_size, seq_len = input_ids.shape
+            generated = input_ids.clone()
+            past_key_values = None
+            for _ in range(max_new_tokens):
+                if seq_len > 1:
+                    outputs = self.forward(
+                        input_ids=generated,
+                        past_key_values=past_key_values,
+                        use_cache=True
+                    )
+                    past_key_values = outputs.past_key_values
+                    logits = outputs.logits[:, -1, :]
+                else:
+                    outputs = self.forward(input_ids=generated)
+                    logits = outputs.logits[:, -1, :]
+                logits = logits / temperature
+                if top_k is not None:
+                    top_k_vals = torch.topk(logits, top_k).values[:, -1]
+                    logits = torch.where(logits < top_k_vals.unsqueeze(1), torch.full_like(logits, float('-inf')), logits)
+                probs = F.softmax(logits, dim=-1)
+                next_token = torch.multinomial(probs, num_samples=1)
+                generated = torch.cat([generated, next_token], dim=1)
+                if eos_token_id is not None and (next_token == eos_token_id).all():
+                    break
+            return generated
+    def compute_perplexity(self, logits: torch.Tensor, targets: torch.Tensor) -> torch.Tensor:
+        '''
+        Compute perplexity from logits and target tokens.
+        Args:
+            logits (torch.Tensor): Model output logits with shape [batch, seq_len, vocab_size].
+            targets (torch.Tensor): Target token IDs with shape [batch, seq_len].
+        Returns:
+            torch.Tensor: Perplexity value (lower is better).
+        '''
+        batch_size, seq_len, vocab_size = logits.shape
+        logits_flat = logits.reshape(batch_size * seq_len, vocab_size)
+        targets_flat = targets.reshape(batch_size * seq_len)
+        loss = F.cross_entropy(logits_flat, targets_flat, reduction='mean')
+        perplexity = torch.exp(loss)
+        return perplexity
+class AutoencoderVisionModel(BasicModel):
+    '''
+    Base class for autoencoder vision models with encoding/decoding capabilities.
+    Attributes:
+        gradient_checkpointing (bool): Whether gradient checkpointing is enabled.
+    '''
+    def __init__(self):
+        super().__init__()
+        self.codebook_size: int = 0
+    @staticmethod
+    def compute_psnr(img1: torch.Tensor, img2: torch.Tensor, max_value: float = 1.0) -> torch.Tensor:
+        '''
+        Compute Peak Signal-to-Noise Ratio between two images.
+        Args:
+            img1 (torch.Tensor): Reference image tensor.
+            img2 (torch.Tensor): Comparison image tensor.
+            max_value (float): Maximum possible pixel value. Defaults to 1.0.
+        Returns:
+            torch.Tensor: PSNR value in dB (higher is better).
+        '''
+        mse = torch.mean((img1 - img2) ** 2)
+        psnr = 10 * torch.log10(max_value ** 2 / mse)
+        return psnr
+    def encode(self, x: torch.Tensor) -> AutoVisionEncoderOutput:
+        '''
+        Encode an image to latent representation.
+        Args:
+            x (torch.Tensor): Input image tensor with shape [batch, channels, height, width].
+        Returns:
+            AutoVisionEncoderOutput: Output containing latent representation and grid_shape.
+        '''
+        return self._encode(x)
+    def decode(self, encoder_output: AutoVisionEncoderOutput) -> AutoVisionDecoderOutput:
+        '''
+        Decode a latent representation to an image.
+        Args:
+            encoder_output (AutoVisionEncoderOutput): Output from encode method containing
+                                                      latent representation and grid_shape.
+        Returns:
+            AutoVisionDecoderOutput: Output containing reconstructed image and grid_shape.
+        '''
+        return self._decode(encoder_output)
+    def _encode(self, x: torch.Tensor) -> AutoVisionEncoderOutput:
+        '''
+        Internal encoding method to be implemented by subclasses.
+        Args:
+            x (torch.Tensor): Input image tensor.
+        Returns:
+            AutoVisionEncoderOutput: Output containing latent representation and grid_shape.
+        '''
+        raise NotImplementedError('Subclasses must implement _encode method')
+    def _decode(self, encoder_output: AutoVisionEncoderOutput) -> AutoVisionDecoderOutput:
+        '''
+        Internal decoding method to be implemented by subclasses.
+        Args:
+            encoder_output (AutoVisionEncoderOutput): Output from encode method.
+        Returns:
+            AutoVisionDecoderOutput: Output containing reconstructed image.
+        '''
+        raise NotImplementedError('Subclasses must implement _decode method')

{codon_model-0.0.2 → codon_model-0.0.3a1}/codon/model/motif/motif_a1.py RENAMED Viewed

@@ -3,18 +3,13 @@ from codon.base import *
 from codon.block.transformer import TransformerMoEDecoder
 from codon.block.embedding   import RotaryEmbedding
+from .base import CausalLanguageModel, CausalLanguageModelOutput
 from typing import Optional, List, Tuple
 from dataclasses import dataclass
-@dataclass
-class MotifA1Output:
-    logits: torch.Tensor
-    past_key_values: Optional[List[Tuple[torch.Tensor, torch.Tensor]]] = None
-    aux_loss: Optional[torch.Tensor] = None
-    attentions: Optional[List[torch.Tensor]] = None
-class MotifA1(BasicModel):
+class MotifA1(CausalLanguageModel):
     def __init__(
         self,
         vocab_size: int = 32000,
@@ -75,7 +70,7 @@ class MotifA1(BasicModel):
         past_key_values: Optional[List[Tuple[torch.Tensor, torch.Tensor]]] = None,
         use_cache: bool = False,
         output_attentions: bool = False
-    ) -> MotifA1Output:
+    ) -> CausalLanguageModelOutput:
         x = self.token_emb(input_ids)
         x = self.dropout(x)
@@ -113,7 +108,7 @@ class MotifA1(BasicModel):
         x = self.norm(x)
         logits = self.proj_out(x)
-        return MotifA1Output(
+        return CausalLanguageModelOutput(
             logits=logits,
             past_key_values=new_kv_cache,
             aux_loss=aux_loss,

codon-model 0.0.2__tar.gz → 0.0.3a1__tar.gz

codon-model 0.0.2__tar.gz → 0.0.3a1__tar.gz