npcpy-1.2.34-py3-none-any.whl → npcpy-1.2.35-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- npcpy/data/audio.py +35 -1
- npcpy/data/load.py +149 -7
- npcpy/data/video.py +72 -0
- npcpy/ft/diff.py +332 -71
- npcpy/gen/image_gen.py +120 -23
- npcpy/gen/ocr.py +187 -0
- npcpy/memory/command_history.py +231 -40
- npcpy/npc_compiler.py +14 -5
- npcpy/serve.py +1206 -547
- {npcpy-1.2.34.dist-info → npcpy-1.2.35.dist-info}/METADATA +1 -1
- {npcpy-1.2.34.dist-info → npcpy-1.2.35.dist-info}/RECORD +14 -13
- {npcpy-1.2.34.dist-info → npcpy-1.2.35.dist-info}/WHEEL +0 -0
- {npcpy-1.2.34.dist-info → npcpy-1.2.35.dist-info}/licenses/LICENSE +0 -0
- {npcpy-1.2.34.dist-info → npcpy-1.2.35.dist-info}/top_level.txt +0 -0
npcpy/ft/diff.py
CHANGED
@@ -1,11 +1,11 @@
-# finetuning diffuser models
 try:
     import torch
     import torch.nn as nn
     import torch.nn.functional as F
     from torch.utils.data import DataLoader, Dataset as TorchDataset
     from transformers import CLIPTextModel, CLIPTokenizer
-except:
+    TORCH_AVAILABLE = True
+except ImportError:
     torch = None
     nn = None
     F = None
@@ -13,9 +13,11 @@ except:
     TorchDataset = None
     CLIPTextModel = None
     CLIPTokenizer = None
+    TORCH_AVAILABLE = False
+
 import math
 from dataclasses import dataclass, field
-from typing import List, Optional
+from typing import List, Optional
 import numpy as np
 from PIL import Image
 import os
@@ -34,77 +36,336 @@ class DiffusionConfig:
     num_epochs: int = 100
     batch_size: int = 4
     learning_rate: float = 1e-5
-    checkpoint_frequency: int =
-
-    use_clip: bool =
-    num_channels: int =
+    checkpoint_frequency: int = 10
+    output_model_path: str = "diffusion_model"
+    use_clip: bool = False
+    num_channels: int = 3


-
-
-
-
-
-
-    def forward(self, time):
-        device = time.device
-        half_dim = self.dim // 2
-        embeddings = math.log(10000) / (half_dim - 1)
-        embeddings = torch.exp(
-            torch.arange(half_dim, device=device) * -embeddings
-        )
-        embeddings = time[:, None] * embeddings[None, :]
-        embeddings = torch.cat(
-            (embeddings.sin(), embeddings.cos()),
-            dim=-1
-        )
-        return embeddings
+if TORCH_AVAILABLE:
+    class SinusoidalPositionEmbeddings(nn.Module):
+        def __init__(self, dim):
+            super().__init__()
+            self.dim = dim

+        def forward(self, time):
+            device = time.device
+            half_dim = self.dim // 2
+            embeddings = math.log(10000) / (half_dim - 1)
+            embeddings = torch.exp(
+                torch.arange(half_dim, device=device) * -embeddings
+            )
+            embeddings = time[:, None] * embeddings[None, :]
+            embeddings = torch.cat(
+                (embeddings.sin(), embeddings.cos()),
+                dim=-1
+            )
+            return embeddings

-class SimpleUNet(nn.Module):
-
-
-
-
-
-
-
-
-
-
-
-
-
-            nn.
-
-
-
-
-
-            nn.
-
-
-
-
-
-
-
-
-
-
-
-            nn.
+    class SimpleUNet(nn.Module):
+        def __init__(self, image_size=128, channels=256,
+                     time_emb_dim=128, num_channels=3):
+            super().__init__()
+            self.image_size = image_size
+
+            self.time_mlp = nn.Sequential(
+                SinusoidalPositionEmbeddings(time_emb_dim),
+                nn.Linear(time_emb_dim, time_emb_dim * 4),
+                nn.GELU(),
+                nn.Linear(time_emb_dim * 4, channels),
+            )
+
+            self.conv_in = nn.Conv2d(num_channels, channels, 3, padding=1)
+
+            self.down1 = nn.Sequential(
+                nn.Conv2d(channels, channels * 2, 4, 2, 1),
+                nn.GroupNorm(8, channels * 2),
+                nn.GELU(),
+            )
+
+            self.down2 = nn.Sequential(
+                nn.Conv2d(channels * 2, channels * 4, 4, 2, 1),
+                nn.GroupNorm(8, channels * 4),
+                nn.GELU(),
+            )
+
+            self.mid = nn.Sequential(
+                nn.Conv2d(channels * 4, channels * 4, 3, 1, 1),
+                nn.GroupNorm(8, channels * 4),
+                nn.GELU(),
+            )
+
+            self.up1 = nn.Sequential(
+                nn.ConvTranspose2d(channels * 4, channels * 2, 4, 2, 1),
+                nn.GroupNorm(8, channels * 2),
+                nn.GELU(),
+            )
+
+            self.up2 = nn.Sequential(
+                nn.ConvTranspose2d(channels * 4, channels, 4, 2, 1),
+                nn.GroupNorm(8, channels),
+                nn.GELU(),
+            )
+
+            self.conv_out = nn.Conv2d(channels * 2, num_channels, 3, padding=1)
+
+        def forward(self, x, t):
+            t_emb = self.time_mlp(t)
+
+            x = self.conv_in(x)
+            h1 = x + t_emb[:, :, None, None]
+
+            h2 = self.down1(h1)
+            h3 = self.down2(h2)
+
+            h3 = self.mid(h3)
+
+            h = self.up1(h3)
+            h = torch.cat([h, h2], dim=1)
+            h = self.up2(h)
+            h = torch.cat([h, h1], dim=1)
+
+            return self.conv_out(h)
+
+    class ImageDataset(TorchDataset):
+        def __init__(self, image_paths, captions, image_size=128):
+            self.image_paths = image_paths
+            self.captions = captions if captions else [''] * len(image_paths)
+            self.image_size = image_size
+
+        def __len__(self):
+            return len(self.image_paths)
+
+        def __getitem__(self, idx):
+            img_path = self.image_paths[idx]
+            img = Image.open(img_path).convert('RGB')
+            img = img.resize((self.image_size, self.image_size))
+            img = np.array(img).astype(np.float32) / 255.0
+            img = (img - 0.5) * 2.0
+            img = torch.from_numpy(img).permute(2, 0, 1)
+            caption = self.captions[idx] if idx < len(self.captions) else ''
+            return img, caption
+
+    class DiffusionTrainer:
+        def __init__(self, config):
+            self.config = config
+            self.device = torch.device(
+                'cuda' if torch.cuda.is_available() else 'cpu'
+            )
+
+            self.model = SimpleUNet(
+                image_size=config.image_size,
+                channels=config.channels,
+                time_emb_dim=config.time_emb_dim,
+                num_channels=config.num_channels
+            ).to(self.device)
+
+            self.betas = torch.linspace(
+                config.beta_start,
+                config.beta_end,
+                config.timesteps
+            ).to(self.device)
+            self.alphas = 1.0 - self.betas
+            self.alphas_cumprod = torch.cumprod(self.alphas, dim=0)
+            self.sqrt_alphas_cumprod = torch.sqrt(self.alphas_cumprod)
+            self.sqrt_one_minus_alphas_cumprod = torch.sqrt(
+                1.0 - self.alphas_cumprod
+            )
+
+        def add_noise(self, x, t):
+            sqrt_alpha = self.sqrt_alphas_cumprod[t][:, None, None, None]
+            sqrt_one_minus = self.sqrt_one_minus_alphas_cumprod[t][
+                :, None, None, None
+            ]
+            noise = torch.randn_like(x)
+            return sqrt_alpha * x + sqrt_one_minus * noise, noise
+
+        def train(self, dataloader):
+            optimizer = torch.optim.AdamW(
+                self.model.parameters(),
+                lr=self.config.learning_rate
+            )
+
+            os.makedirs(self.config.output_model_path, exist_ok=True)
+            checkpoint_dir = os.path.join(
+                self.config.output_model_path,
+                'checkpoints'
+            )
+            os.makedirs(checkpoint_dir, exist_ok=True)
+
+            global_step = 0
+
+            for epoch in range(self.config.num_epochs):
+                self.model.train()
+                epoch_loss = 0.0
+
+                pbar = tqdm(dataloader, desc=f'Epoch {epoch+1}')
+                for batch_idx, (images, captions) in enumerate(pbar):
+                    images = images.to(self.device)
+                    batch_size = images.shape[0]
+
+                    t = torch.randint(
+                        0,
+                        self.config.timesteps,
+                        (batch_size,),
+                        device=self.device
+                    ).long()
+
+                    noisy_images, noise = self.add_noise(images, t)
+
+                    predicted_noise = self.model(noisy_images, t)
+
+                    loss = F.mse_loss(predicted_noise, noise)
+
+                    optimizer.zero_grad()
+                    loss.backward()
+                    optimizer.step()
+
+                    epoch_loss += loss.item()
+                    global_step += 1
+
+                    pbar.set_postfix({'loss': loss.item()})
+
+                    if global_step % self.config.checkpoint_frequency == 0:
+                        ckpt_path = os.path.join(
+                            checkpoint_dir,
+                            f'checkpoint-epoch{epoch+1}-step{global_step}.pt'
+                        )
+                        torch.save({
+                            'epoch': epoch,
+                            'step': global_step,
+                            'model_state_dict': self.model.state_dict(),
+                            'optimizer_state_dict': optimizer.state_dict(),
+                            'loss': loss.item(),
+                        }, ckpt_path)
+
+                avg_loss = epoch_loss / len(dataloader)
+                print(f'Epoch {epoch+1} avg loss: {avg_loss:.6f}')
+
+            final_path = os.path.join(
+                self.config.output_model_path,
+                'model_final.pt'
+            )
+            torch.save({
+                'model_state_dict': self.model.state_dict(),
+                'config': self.config,
+            }, final_path)
+
+            return self.config.output_model_path
+
+        @torch.no_grad()
+        def sample(self, num_samples=1):
+            self.model.eval()
+
+            x = torch.randn(
+                num_samples,
+                self.config.num_channels,
+                self.config.image_size,
+                self.config.image_size,
+                device=self.device
+            )
+
+            for t in reversed(range(self.config.timesteps)):
+                t_batch = torch.full(
+                    (num_samples,),
+                    t,
+                    device=self.device,
+                    dtype=torch.long
+                )
+
+                predicted_noise = self.model(x, t_batch)
+
+                alpha = self.alphas[t]
+                alpha_cumprod = self.alphas_cumprod[t]
+                beta = self.betas[t]
+
+                if t > 0:
+                    noise = torch.randn_like(x)
+                else:
+                    noise = torch.zeros_like(x)
+
+                x = (1 / torch.sqrt(alpha)) * (
+                    x - (beta / torch.sqrt(1 - alpha_cumprod)) * predicted_noise
+                ) + torch.sqrt(beta) * noise
+
+            x = (x + 1) / 2
+            x = torch.clamp(x, 0, 1)
+
+            return x
+
+else:
+    SinusoidalPositionEmbeddings = None
+    SimpleUNet = None
+    ImageDataset = None
+    DiffusionTrainer = None
+
+
+def train_diffusion(image_paths, captions=None, config=None,
+                    resume_from=None):
+    if not TORCH_AVAILABLE:
+        raise ImportError(
+            "PyTorch not available. Install: pip install torch torchvision"
         )
-
-
-
-
-
+
+    if config is None:
+        config = DiffusionConfig()
+
+    if captions is None:
+        captions = [''] * len(image_paths)
+
+    dataset = ImageDataset(image_paths, captions, config.image_size)
+    dataloader = DataLoader(
+        dataset,
+        batch_size=config.batch_size,
+        shuffle=True,
+        num_workers=0
+    )
+
+    trainer = DiffusionTrainer(config)
+
+    if resume_from and os.path.exists(resume_from):
+        checkpoint = torch.load(resume_from, map_location=trainer.device)
+        trainer.model.load_state_dict(checkpoint['model_state_dict'])
+        print(f'Resumed from {resume_from}')
+
+    output_path = trainer.train(dataloader)
+
+    gc.collect()
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()
+
+    return output_path
+
+
+def generate_image(model_path, prompt=None, num_samples=1, image_size=128):
+    if not TORCH_AVAILABLE:
+        raise ImportError(
+            "PyTorch not available. Install: pip install torch torchvision"
        )
-
-
-
-
-
-
+
+    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+
+    # Fix: Load with weights_only=False for your custom checkpoint
+    checkpoint = torch.load(model_path, map_location=device, weights_only=False)
+
+    if 'config' in checkpoint:
+        config = checkpoint['config']
+    else:
+        config = DiffusionConfig(image_size=image_size)
+
+    trainer = DiffusionTrainer(config)
+    trainer.model.load_state_dict(checkpoint['model_state_dict'])
+
+    samples = trainer.sample(num_samples)
+
+    images = []
+    for i in range(num_samples):
+        img_tensor = samples[i].cpu()
+        img_np = img_tensor.permute(1, 2, 0).numpy()
+        img_np = (img_np * 255).astype(np.uint8)
+        img = Image.fromarray(img_np)
+        images.append(img)
+
+    if num_samples == 1:
+        return images[0]
+    return images
npcpy/gen/image_gen.py
CHANGED
@@ -21,29 +21,86 @@ def generate_image_diffusers(
     """Generate an image using the Stable Diffusion API with memory optimization."""
     import torch
     import gc
-
+    import os
+    from diffusers import DiffusionPipeline, StableDiffusionPipeline
+
     try:
         torch_dtype = torch.float16 if device != "cpu" and torch.cuda.is_available() else torch.float32

-        if
-            from
-
-            pipe = DiffusionPipeline.from_pretrained(
-                model,
-                torch_dtype=torch_dtype,
-                use_safetensors=True,
-                variant="fp16" if torch_dtype == torch.float16 else None,
-            )
-        else:
-            from diffusers import StableDiffusionPipeline
+        if os.path.isdir(model):
+            print(f"🌋 Loading fine-tuned Diffusers model from local path: {model}")

-
-
-
-
-
+            checkpoint_path = os.path.join(model, 'model_final.pt')
+            if os.path.exists(checkpoint_path):
+                print(f"🌋 Found model_final.pt at {checkpoint_path}.")
+
+                # Load checkpoint to inspect it
+                checkpoint = torch.load(checkpoint_path, map_location=device, weights_only=False)
+
+                # Check if this is a custom SimpleUNet model (from your training code)
+                # vs a Stable Diffusion UNet2DConditionModel
+                if 'config' in checkpoint and hasattr(checkpoint['config'], 'image_size'):
+                    print(f"🌋 Detected custom SimpleUNet model, using custom generation")
+                    # Use your custom generate_image function from npcpy.ft.diff
+                    from npcpy.ft.diff import generate_image as custom_generate_image
+
+                    # Your custom model ignores prompts and generates based on training data
+                    image = custom_generate_image(
+                        model_path=checkpoint_path,
+                        prompt=prompt,
+                        num_samples=1,
+                        image_size=height # Use the requested height
+                    )
+                    return image
+
+                else:
+                    # This is a Stable Diffusion checkpoint
+                    print(f"🌋 Detected Stable Diffusion UNet checkpoint")
+                    base_model_id = "runwayml/stable-diffusion-v1-5"
+                    print(f"🌋 Loading base pipeline: {base_model_id}")
+                    pipe = StableDiffusionPipeline.from_pretrained(
+                        base_model_id,
+                        torch_dtype=torch_dtype,
+                        use_safetensors=True,
+                        variant="fp16" if torch_dtype == torch.float16 else None,
+                    )
+
+                    print(f"🌋 Loading custom UNet weights from {checkpoint_path}")
+
+                    # Extract the actual model state dict
+                    if isinstance(checkpoint, dict) and 'model_state_dict' in checkpoint:
+                        unet_state_dict = checkpoint['model_state_dict']
+                        print(f"🌋 Extracted model_state_dict from checkpoint")
+                    else:
+                        unet_state_dict = checkpoint
+                        print(f"🌋 Using checkpoint directly as state_dict")
+
+                    # Load the state dict into the UNet
+                    pipe.unet.load_state_dict(unet_state_dict)
+                    pipe = pipe.to(device)
+                    print(f"🌋 Successfully loaded fine-tuned UNet weights")
+
+            else:
+                raise OSError(f"Error: Fine-tuned model directory {model} does not contain 'model_final.pt'")
+
+        else:
+            print(f"🌋 Loading standard Diffusers model: {model}")
+            if 'Qwen' in model:
+                pipe = DiffusionPipeline.from_pretrained(
+                    model,
+                    torch_dtype=torch_dtype,
+                    use_safetensors=True,
+                    variant="fp16" if torch_dtype == torch.float16 else None,
+                )
+            else:
+                pipe = StableDiffusionPipeline.from_pretrained(
+                    model,
+                    torch_dtype=torch_dtype,
+                    use_safetensors=True,
+                    variant="fp16" if torch_dtype == torch.float16 else None,
+                )

+        # Common pipeline setup for Stable Diffusion models
         if hasattr(pipe, 'enable_attention_slicing'):
             pipe.enable_attention_slicing()

@@ -85,7 +142,6 @@ def generate_image_diffusers(
             raise MemoryError(f"Insufficient memory for image generation with model {model}. Try a smaller model or reduce image size.")
         else:
             raise e
-
 import os
 import base64
 import io
@@ -294,6 +350,8 @@ def gemini_image_gen(

     else:
         raise ValueError(f"Unsupported Gemini image model or API usage for new generation: '{model}'")
+# In npcpy/gen/image_gen.py, find the generate_image function and replace it with this:
+
 def generate_image(
     prompt: str,
     model: str ,
@@ -305,6 +363,7 @@ def generate_image(
     api_url: Optional[str] = None,
     attachments: Union[List[Union[str, bytes, Image.Image]], None] = None,
     save_path: Optional[str] = None,
+    custom_model_path: Optional[str] = None, # <--- NEW: Accept custom_model_path
 ):
     """
     Unified function to generate or edit images using various providers.
@@ -320,13 +379,15 @@ def generate_image(
         api_url (str): API URL for the provider.
         attachments (list): List of images for editing. Can be file paths, bytes, or PIL Images.
         save_path (str): Path to save the generated image.
+        custom_model_path (str): Path to a locally fine-tuned Diffusers model. <--- NEW

     Returns:
         List[PIL.Image.Image]: A list of generated PIL Image objects.
     """
     from urllib.request import urlopen
+    import os # Ensure os is imported for path checks

-    if model is None:
+    if model is None and custom_model_path is None: # Only set default if no model or custom path is provided
         if provider == "openai":
             model = "dall-e-2"
         elif provider == "diffusers":
@@ -336,12 +397,22 @@ def generate_image(

     all_generated_pil_images = []

+    # <--- CRITICAL FIX: Handle custom_model_path for Diffusers here
     if provider == "diffusers":
+        # If a custom_model_path is provided and exists, use it instead of a generic model name
+        if custom_model_path and os.path.isdir(custom_model_path):
+            print(f"🌋 Using custom Diffusers model from path: {custom_model_path}")
+            model_to_use = custom_model_path
+        else:
+            # Otherwise, use the standard model name (e.g., "runwayml/stable-diffusion-v1-5")
+            model_to_use = model
+            print(f"🌋 Using standard Diffusers model: {model_to_use}")
+
         for _ in range(n_images):
             try:
                 image = generate_image_diffusers(
                     prompt=prompt,
-                    model=
+                    model=model_to_use, # <--- Pass the resolved model_to_use
                     height=height,
                     width=width
                 )
@@ -373,15 +444,42 @@ def generate_image(
             all_generated_pil_images.extend(images)

     else:
+        # This is the fallback for other providers or if provider is not explicitly handled
         valid_sizes = ["256x256", "512x512", "1024x1024", "1024x1792", "1792x1024"]
         size = f"{width}x{height}"

         if attachments is not None:
             raise ValueError("Image editing not supported with litellm provider")

+        # The litellm.image_generation function expects the provider as part of the model string
+        # e.g., "huggingface/starcoder" or "openai/dall-e-3"
+        # Since we've already handled "diffusers", "openai", "gemini" above,
+        # this 'else' block implies a generic litellm call.
+        # We need to ensure the model string is correctly formatted for litellm.
+        # However, the error message "LLM Provider NOT provided" suggests litellm
+        # is not even getting the `provider` correctly.
+        # The fix for this is ensuring the `provider` is explicitly passed to litellm.image_generation
+        # which is already happening in `gen_image` in `llm_funcs.py`
+
+        # If we reach here, it means the provider is not 'diffusers', 'openai', or 'gemini',
+        # and litellm is the intended route. We need to pass the provider explicitly.
+        # The original code here was trying to construct `model=f"{provider}/{model}"`
+        # but the error indicates `provider` itself was missing.
+        # The `image_generation` from litellm expects `model` to be `provider/model_name`.
+        # Since the `provider` variable is available, we can construct this.
+
+        # This block is for generic litellm providers (not diffusers, openai, gemini)
+        # The error indicates `provider` itself was not making it to litellm.
+        # This `generate_image` function already receives `provider`.
+        # The issue is likely how `gen_image` in `llm_funcs.py` calls this `generate_image`.
+        # However, if this `else` branch is hit, we ensure litellm gets the provider.
+
+        # Construct the model string for litellm
+        litellm_model_string = f"{provider}/{model}" if provider and model else model
+
         image_response = image_generation(
             prompt=prompt,
-            model=
+            model=litellm_model_string, # <--- Ensure model string includes provider for litellm
             n=n_images,
             size=size,
             api_key=api_key,
@@ -407,7 +505,6 @@ def generate_image(

     return all_generated_pil_images

-
 def edit_image(
     prompt: str,
     image_path: str,