npcpy 1.3.21__py3-none-any.whl → 1.3.23__py3-none-any.whl

This diff compares two publicly released versions of the package as published to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in the public registries.
npcpy/gen/image_gen.py CHANGED
@@ -34,27 +34,21 @@ def generate_image_diffusers(
  if os.path.exists(checkpoint_path):
  print(f"🌋 Found model_final.pt at {checkpoint_path}.")
 
- # Load checkpoint to inspect it
  checkpoint = torch.load(checkpoint_path, map_location=device, weights_only=False)
 
- # Check if this is a custom SimpleUNet model (from your training code)
- # vs a Stable Diffusion UNet2DConditionModel
  if 'config' in checkpoint and hasattr(checkpoint['config'], 'image_size'):
  print(f"🌋 Detected custom SimpleUNet model, using custom generation")
- # Use your custom generate_image function from npcpy.ft.diff
  from npcpy.ft.diff import generate_image as custom_generate_image
 
- # Your custom model ignores prompts and generates based on training data
  image = custom_generate_image(
  model_path=checkpoint_path,
  prompt=prompt,
  num_samples=1,
- image_size=height # Use the requested height
+ image_size=height
  )
  return image
 
  else:
- # This is a Stable Diffusion checkpoint
  print(f"🌋 Detected Stable Diffusion UNet checkpoint")
  base_model_id = "runwayml/stable-diffusion-v1-5"
  print(f"🌋 Loading base pipeline: {base_model_id}")
@@ -67,7 +61,6 @@ def generate_image_diffusers(
 
  print(f"🌋 Loading custom UNet weights from {checkpoint_path}")
 
- # Extract the actual model state dict
  if isinstance(checkpoint, dict) and 'model_state_dict' in checkpoint:
  unet_state_dict = checkpoint['model_state_dict']
  print(f"🌋 Extracted model_state_dict from checkpoint")
@@ -75,7 +68,6 @@ def generate_image_diffusers(
  unet_state_dict = checkpoint
  print(f"🌋 Using checkpoint directly as state_dict")
 
- # Load the state dict into the UNet
  pipe.unet.load_state_dict(unet_state_dict)
  pipe = pipe.to(device)
  print(f"🌋 Successfully loaded fine-tuned UNet weights")
@@ -100,7 +92,6 @@ def generate_image_diffusers(
  variant="fp16" if torch_dtype == torch.float16 else None,
  )
 
- # Common pipeline setup for Stable Diffusion models
  if hasattr(pipe, 'enable_attention_slicing'):
  pipe.enable_attention_slicing()
 
@@ -142,16 +133,7 @@ def generate_image_diffusers(
  raise MemoryError(f"Insufficient memory for image generation with model {model}. Try a smaller model or reduce image size.")
  else:
  raise e
- import os
- import base64
- import io
- from typing import Union, List, Optional
 
- import PIL
- from PIL import Image
-
- import requests
- from urllib.request import urlopen
 
  def openai_image_gen(
  prompt: str,
@@ -184,13 +166,13 @@ def openai_image_gen(
  files_to_close.append(file_handle)
  elif isinstance(attachment, bytes):
  img_byte_arr = io.BytesIO(attachment)
- img_byte_arr.name = 'image.png' # FIX: Add filename hint
+ img_byte_arr.name = 'image.png'
  processed_images.append(img_byte_arr)
  elif isinstance(attachment, Image.Image):
  img_byte_arr = io.BytesIO()
  attachment.save(img_byte_arr, format='PNG')
  img_byte_arr.seek(0)
- img_byte_arr.name = 'image.png' # FIX: Add filename hint
+ img_byte_arr.name = 'image.png'
  processed_images.append(img_byte_arr)
 
  try:
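The two `img_byte_arr.name = 'image.png'` assignments above give in-memory buffers a filename, which the upload machinery uses to attach a filename and infer a file type when the attachment is raw bytes or a PIL image rather than a path on disk. A small sketch of that preparation step in isolation; the helper name is illustrative and not part of npcpy:

```python
import io
from PIL import Image

def to_named_png_buffer(attachment):
    """Wrap bytes or a PIL image in a BytesIO that carries a .png filename hint."""
    if isinstance(attachment, bytes):
        buf = io.BytesIO(attachment)
    elif isinstance(attachment, Image.Image):
        buf = io.BytesIO()
        attachment.save(buf, format="PNG")
        buf.seek(0)
    else:
        raise TypeError("expected bytes or PIL.Image.Image")
    # Without a .name, multipart uploads may lack a filename/MIME hint.
    buf.name = "image.png"
    return buf

buffer = to_named_png_buffer(Image.new("RGB", (64, 64), "white"))
```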
@@ -202,7 +184,6 @@ def openai_image_gen(
  size=size_str,
  )
  finally:
- # This ensures any files we opened are properly closed
  for f in files_to_close:
  f.close()
  else:
@@ -231,7 +212,6 @@ def openai_image_gen(
  return collected_images
 
 
-
  def gemini_image_gen(
  prompt: str,
  model: str = "gemini-2.5-flash",
@@ -305,18 +285,21 @@ def gemini_image_gen(
  response = client.models.generate_content(
  model=model,
  contents=processed_contents,
+ config=types.GenerateContentConfig(
+ response_modalities=["IMAGE", "TEXT"],
+ ),
  )
-
+
  if hasattr(response, 'candidates') and response.candidates:
  for candidate in response.candidates:
  for part in candidate.content.parts:
  if hasattr(part, 'inline_data') and part.inline_data:
  image_data = part.inline_data.data
  collected_images.append(Image.open(BytesIO(image_data)))
-
+
  if not collected_images and hasattr(response, 'text'):
  print(f"Gemini response text: {response.text}")
-
+
  return collected_images
  else:
  if 'imagen' in model:
@@ -335,6 +318,9 @@ def gemini_image_gen(
  response = client.models.generate_content(
  model=model,
  contents=[prompt],
+ config=types.GenerateContentConfig(
+ response_modalities=["IMAGE", "TEXT"],
+ ),
  )
 
  if hasattr(response, 'candidates') and response.candidates:
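Both Gemini branches now pass a `GenerateContentConfig` with `response_modalities=["IMAGE", "TEXT"]`, which is how the google-genai SDK is asked to return image parts alongside text. A minimal sketch outside npcpy, assuming a `GEMINI_API_KEY` in the environment; the model name is taken from the diff's defaults and the prompt is illustrative:

```python
import os
from io import BytesIO

from google import genai
from google.genai import types
from PIL import Image

client = genai.Client(api_key=os.environ["GEMINI_API_KEY"])

response = client.models.generate_content(
    model="gemini-2.5-flash-image-preview",
    contents=["A watercolor volcano at dusk"],
    config=types.GenerateContentConfig(
        response_modalities=["IMAGE", "TEXT"],
    ),
)

# Image bytes arrive as inline_data parts; text parts may accompany them.
for part in response.candidates[0].content.parts:
    if getattr(part, "inline_data", None):
        Image.open(BytesIO(part.inline_data.data)).save("gemini_out.png")
```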
@@ -351,7 +337,86 @@ def gemini_image_gen(
 
  else:
  raise ValueError(f"Unsupported Gemini image model or API usage for new generation: '{model}'")
- # In npcpy/gen/image_gen.py, find the generate_image function and replace it with this:
+
+
+ def ollama_image_gen(
+ prompt: str,
+ model: str = "x/z-image-turbo",
+ height: int = 512,
+ width: int = 512,
+ n_images: int = 1,
+ api_url: Optional[str] = None,
+ seed: Optional[int] = None,
+ negative_prompt: Optional[str] = None,
+ num_steps: Optional[int] = None,
+ ):
+ """Generate images using Ollama's image generation API.
+
+ Works with ollama image gen models like x/z-image-turbo and x/flux2-klein.
+ Uses the /api/generate endpoint with image gen specific options.
+ """
+ import requests
+
+ if api_url is None:
+ api_url = os.environ.get('OLLAMA_API_URL', 'http://localhost:11434')
+
+ endpoint = f"{api_url}/api/generate"
+
+ collected_images = []
+
+ for _ in range(n_images):
+ options = {}
+ if width:
+ options["width"] = width
+ if height:
+ options["height"] = height
+ if seed is not None:
+ options["seed"] = seed
+ if num_steps is not None:
+ options["num_steps"] = num_steps
+
+ payload = {
+ "model": model,
+ "prompt": prompt,
+ "stream": False,
+ }
+ if options:
+ payload["options"] = options
+ if negative_prompt:
+ payload["negative_prompt"] = negative_prompt
+
+ response = requests.post(endpoint, json=payload)
+
+ if not response.ok:
+ try:
+ err = response.json()
+ err_msg = err.get('error', response.text)
+ except Exception:
+ err_msg = response.text
+ raise RuntimeError(
+ f"Ollama image gen failed ({response.status_code}): {err_msg}\n"
+ f"Model: {model} — make sure it's pulled (`ollama pull {model}`)"
+ )
+
+ result = response.json()
+
+ if 'image' in result and result['image']:
+ image_bytes = base64.b64decode(result['image'])
+ image = Image.open(io.BytesIO(image_bytes))
+ collected_images.append(image)
+ elif 'images' in result and result['images']:
+ for img_b64 in result['images']:
+ image_bytes = base64.b64decode(img_b64)
+ image = Image.open(io.BytesIO(image_bytes))
+ collected_images.append(image)
+ else:
+ raise ValueError(
+ f"No images returned from Ollama. Response keys: {list(result.keys())}. "
+ f"Make sure '{model}' is an image generation model (e.g. x/z-image-turbo, x/flux2-klein)."
+ )
+
+ return collected_images
+
 
  def generate_image(
  prompt: str,
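The new ollama_image_gen helper posts to Ollama's /api/generate endpoint and decodes base64 image data from the response. A standalone sketch of the same request shape, assuming a local Ollama server with an image-generation model already pulled; the response-key handling mirrors the defensive logic in the function above:

```python
import base64
import io

import requests
from PIL import Image

payload = {
    "model": "x/z-image-turbo",   # image-gen model named in the diff; must be pulled locally
    "prompt": "a watercolor volcano at dusk",
    "stream": False,
    "options": {"width": 512, "height": 512},
}

resp = requests.post("http://localhost:11434/api/generate", json=payload)
resp.raise_for_status()
data = resp.json()

# Depending on the model/server version, the result may come back under 'image' or 'images'.
b64_images = [data["image"]] if data.get("image") else data.get("images", [])
for i, b64 in enumerate(b64_images):
    Image.open(io.BytesIO(base64.b64decode(b64))).save(f"ollama_{i}.png")
```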
@@ -364,7 +429,7 @@ def generate_image(
  api_url: Optional[str] = None,
  attachments: Union[List[Union[str, bytes, Image.Image]], None] = None,
  save_path: Optional[str] = None,
- custom_model_path: Optional[str] = None, # <--- NEW: Accept custom_model_path,
+ custom_model_path: Optional[str] = None,
 
  ):
  """
@@ -373,7 +438,7 @@ def generate_image(
  Args:
  prompt (str): The prompt for generating/editing the image.
  model (str): The model to use.
- provider (str): The provider to use ('openai', 'diffusers', 'gemini').
+ provider (str): The provider to use ('openai', 'diffusers', 'gemini', 'ollama').
  height (int): The height of the output image.
  width (int): The width of the output image.
  n_images (int): Number of images to generate.
@@ -381,32 +446,31 @@ def generate_image(
  api_url (str): API URL for the provider.
  attachments (list): List of images for editing. Can be file paths, bytes, or PIL Images.
  save_path (str): Path to save the generated image.
- custom_model_path (str): Path to a locally fine-tuned Diffusers model. <--- NEW
+ custom_model_path (str): Path to a locally fine-tuned Diffusers model.
 
  Returns:
  List[PIL.Image.Image]: A list of generated PIL Image objects.
  """
  from urllib.request import urlopen
- import os # Ensure os is imported for path checks
+ import os
 
- if model is None and custom_model_path is None: # Only set default if no model or custom path is provided
+ if model is None and custom_model_path is None:
  if provider == "openai":
  model = "dall-e-2"
  elif provider == "diffusers":
  model = "runwayml/stable-diffusion-v1-5"
  elif provider == "gemini":
  model = "gemini-2.5-flash-image-preview"
+ elif provider == "ollama":
+ model = "x/z-image-turbo"
 
  all_generated_pil_images = []
 
- # <--- CRITICAL FIX: Handle custom_model_path for Diffusers here
  if provider == "diffusers":
- # If a custom_model_path is provided and exists, use it instead of a generic model name
  if custom_model_path and os.path.isdir(custom_model_path):
  print(f"🌋 Using custom Diffusers model from path: {custom_model_path}")
  model_to_use = custom_model_path
  else:
- # Otherwise, use the standard model name (e.g., "runwayml/stable-diffusion-v1-5")
  model_to_use = model
  print(f"🌋 Using standard Diffusers model: {model_to_use}")
 
@@ -414,7 +478,7 @@ def generate_image(
  try:
  image = generate_image_diffusers(
  prompt=prompt,
- model=model_to_use, # <--- Pass the resolved model_to_use
+ model=model_to_use,
  height=height,
  width=width
  )
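generate_image now dispatches on provider ('openai', 'diffusers', 'gemini', 'ollama', or a litellm fallback) and resolves custom_model_path before the diffusers branch runs. A hedged usage sketch based on the signature shown across the preceding hunks; paths and prompts are illustrative:

```python
from npcpy.gen.image_gen import generate_image

# Local Ollama image model (defaults to "x/z-image-turbo" when model is None).
images = generate_image(
    prompt="a watercolor volcano at dusk",
    provider="ollama",
    height=512,
    width=512,
    n_images=1,
)

# A locally fine-tuned Diffusers model directory takes precedence over the model name.
images = generate_image(
    prompt="a watercolor volcano at dusk",
    provider="diffusers",
    custom_model_path="/path/to/finetuned-diffusers-model",  # illustrative path
)

images[0].save("generated.png")
```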
@@ -447,43 +511,29 @@ def generate_image(
  )
  all_generated_pil_images.extend(images)
 
+ elif provider == "ollama":
+ images = ollama_image_gen(
+ prompt=prompt,
+ model=model,
+ height=height,
+ width=width,
+ n_images=n_images,
+ api_url=api_url
+ )
+ all_generated_pil_images.extend(images)
+
  else:
- # This is the fallback for other providers or if provider is not explicitly handled
  valid_sizes = ["256x256", "512x512", "1024x1024", "1024x1792", "1792x1024"]
  size = f"{width}x{height}"
 
  if attachments is not None:
  raise ValueError("Image editing not supported with litellm provider")
 
- # The litellm.image_generation function expects the provider as part of the model string
- # e.g., "huggingface/starcoder" or "openai/dall-e-3"
- # Since we've already handled "diffusers", "openai", "gemini" above,
- # this 'else' block implies a generic litellm call.
- # We need to ensure the model string is correctly formatted for litellm.
- # However, the error message "LLM Provider NOT provided" suggests litellm
- # is not even getting the `provider` correctly.
- # The fix for this is ensuring the `provider` is explicitly passed to litellm.image_generation
- # which is already happening in `gen_image` in `llm_funcs.py`
-
- # If we reach here, it means the provider is not 'diffusers', 'openai', or 'gemini',
- # and litellm is the intended route. We need to pass the provider explicitly.
- # The original code here was trying to construct `model=f"{provider}/{model}"`
- # but the error indicates `provider` itself was missing.
- # The `image_generation` from litellm expects `model` to be `provider/model_name`.
- # Since the `provider` variable is available, we can construct this.
-
- # This block is for generic litellm providers (not diffusers, openai, gemini)
- # The error indicates `provider` itself was not making it to litellm.
- # This `generate_image` function already receives `provider`.
- # The issue is likely how `gen_image` in `llm_funcs.py` calls this `generate_image`.
- # However, if this `else` branch is hit, we ensure litellm gets the provider.
-
- # Construct the model string for litellm
  litellm_model_string = f"{provider}/{model}" if provider and model else model
 
  image_response = image_generation(
  prompt=prompt,
- model=litellm_model_string, # <--- Ensure model string includes provider for litellm
+ model=litellm_model_string,
  n=n_images,
  size=size,
  api_key=api_key,
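For providers not handled explicitly, the fallback builds a litellm-style model string of the form "{provider}/{model}" before calling image_generation. A short sketch of that convention using litellm directly; the provider prefix here is only to show the format (in npcpy the OpenAI provider is handled earlier, so this branch would see other providers):

```python
from litellm import image_generation

provider = "openai"      # illustrative: any image-capable litellm provider
model = "dall-e-3"       # illustrative model name

response = image_generation(
    prompt="a watercolor volcano at dusk",
    model=f"{provider}/{model}",  # litellm expects provider-prefixed model strings
    n=1,
    size="1024x1024",
)
```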
@@ -509,6 +559,7 @@ def generate_image(
 
  return all_generated_pil_images
 
+
  def edit_image(
  prompt: str,
  image_path: str,
npcpy/gen/response.py CHANGED
@@ -830,6 +830,234 @@ def get_llamacpp_response(
  return result
 
 
+ _AIRLLM_MODEL_CACHE = {}
+ _AIRLLM_MLX_PATCHED = False
+
+ def _patch_airllm_mlx_bias():
+ """
+ Monkey-patch airllm's MLX Attention/FeedForward to use bias=True.
+ AirLLM hardcodes bias=False which fails for non-Llama architectures (e.g. Qwen2).
+ Using bias=True is safe: MLX nn.Linear(bias=True) accepts weight-only updates,
+ so Llama models (no bias in weights) still work correctly.
+ """
+ global _AIRLLM_MLX_PATCHED
+ if _AIRLLM_MLX_PATCHED:
+ return
+ try:
+ import airllm.airllm_llama_mlx as mlx_mod
+ import mlx.core as mx
+ from mlx import nn
+
+ class PatchedAttention(nn.Module):
+ def __init__(self, args):
+ super().__init__()
+ self.args = args
+ self.n_heads = args.n_heads
+ self.n_kv_heads = args.n_kv_heads
+ self.repeats = self.n_heads // self.n_kv_heads
+ self.scale = args.head_dim ** -0.5
+ self.wq = nn.Linear(args.dim, args.n_heads * args.head_dim, bias=True)
+ self.wk = nn.Linear(args.dim, args.n_kv_heads * args.head_dim, bias=True)
+ self.wv = nn.Linear(args.dim, args.n_kv_heads * args.head_dim, bias=True)
+ self.wo = nn.Linear(args.n_heads * args.head_dim, args.dim, bias=True)
+ self.rope = nn.RoPE(
+ args.head_dim, traditional=args.rope_traditional, base=args.rope_theta
+ )
+
+ def __call__(self, x, mask=None, cache=None):
+ B, L, D = x.shape
+ queries, keys, values = self.wq(x), self.wk(x), self.wv(x)
+ queries = queries.reshape(B, L, self.n_heads, -1).transpose(0, 2, 1, 3)
+ keys = keys.reshape(B, L, self.n_kv_heads, -1).transpose(0, 2, 1, 3)
+ values = values.reshape(B, L, self.n_kv_heads, -1).transpose(0, 2, 1, 3)
+
+ def repeat(a):
+ a = mx.concatenate([mx.expand_dims(a, 2)] * self.repeats, axis=2)
+ return a.reshape([B, self.n_heads, L, -1])
+ keys, values = map(repeat, (keys, values))
+
+ if cache is not None:
+ key_cache, value_cache = cache
+ queries = self.rope(queries, offset=key_cache.shape[2])
+ keys = self.rope(keys, offset=key_cache.shape[2])
+ keys = mx.concatenate([key_cache, keys], axis=2)
+ values = mx.concatenate([value_cache, values], axis=2)
+ else:
+ queries = self.rope(queries)
+ keys = self.rope(keys)
+
+ scores = (queries * self.scale) @ keys.transpose(0, 1, 3, 2)
+ if mask is not None:
+ scores += mask
+ weights = mx.softmax(scores.astype(mx.float32), axis=-1).astype(scores.dtype)
+ output = (weights @ values).transpose(0, 2, 1, 3).reshape(B, L, -1)
+ return self.wo(output), (keys, values)
+
+ class PatchedFeedForward(nn.Module):
+ def __init__(self, args):
+ super().__init__()
+ self.w1 = nn.Linear(args.dim, args.hidden_dim, bias=True)
+ self.w2 = nn.Linear(args.hidden_dim, args.dim, bias=True)
+ self.w3 = nn.Linear(args.dim, args.hidden_dim, bias=True)
+
+ def __call__(self, x):
+ return self.w2(nn.silu(self.w1(x)) * self.w3(x))
+
+ mlx_mod.Attention = PatchedAttention
+ mlx_mod.FeedForward = PatchedFeedForward
+ _AIRLLM_MLX_PATCHED = True
+ logger.debug("Patched airllm MLX classes for bias support")
+ except Exception as e:
+ logger.warning(f"Failed to patch airllm MLX bias support: {e}")
+
+ def get_airllm_response(
+ prompt: str = None,
+ model: str = None,
+ tools: list = None,
+ tool_map: Dict = None,
+ format: str = None,
+ messages: List[Dict[str, str]] = None,
+ auto_process_tool_calls: bool = False,
+ **kwargs,
+ ) -> Dict[str, Any]:
+ """
+ Generate response using AirLLM for 70B+ model inference.
+ Supports macOS (MLX backend) and Linux (CUDA backend with 4-bit compression).
+ """
+ import platform
+ is_macos = platform.system() == "Darwin"
+
+ result = {
+ "response": None,
+ "messages": messages.copy() if messages else [],
+ "raw_response": None,
+ "tool_calls": [],
+ "tool_results": []
+ }
+
+ try:
+ from airllm import AutoModel
+ except ImportError:
+ result["response"] = ""
+ result["error"] = "airllm not installed. Install with: pip install airllm"
+ return result
+
+ # Patch airllm MLX classes to support models with bias (e.g. Qwen)
+ if is_macos:
+ _patch_airllm_mlx_bias()
+
+ if prompt:
+ if result['messages'] and result['messages'][-1]["role"] == "user":
+ result['messages'][-1]["content"] = prompt
+ else:
+ result['messages'].append({"role": "user", "content": prompt})
+
+ if format == "json":
+ json_instruction = """If you are returning a json object, begin directly with the opening {.
+ Do not include any additional markdown formatting or leading ```json tags in your response."""
+ if result["messages"] and result["messages"][-1]["role"] == "user":
+ result["messages"][-1]["content"] += "\n" + json_instruction
+
+ model_name = model or "meta-llama/Meta-Llama-3.1-70B-Instruct"
+ # 4-bit compression requires CUDA via bitsandbytes; skip on macOS
+ default_compression = None if is_macos else "4bit"
+ compression = kwargs.get("compression", default_compression)
+ max_tokens = kwargs.get("max_tokens", 256)
+ temperature = kwargs.get("temperature", 0.7)
+
+ # Resolve HF token for gated model access
+ hf_token = kwargs.get("hf_token")
+ if not hf_token:
+ hf_token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGING_FACE_HUB_TOKEN")
+ if not hf_token:
+ try:
+ from huggingface_hub import HfFolder
+ hf_token = HfFolder.get_token()
+ except Exception:
+ pass
+
+ # Load or retrieve cached model
+ cache_key = f"{model_name}:{compression}"
+ if cache_key not in _AIRLLM_MODEL_CACHE:
+ load_kwargs = {"pretrained_model_name_or_path": model_name}
+ if compression:
+ load_kwargs["compression"] = compression
+ if hf_token:
+ load_kwargs["hf_token"] = hf_token
+ # Pass through additional airllm kwargs
+ for k in ["delete_original", "max_seq_len", "prefetching"]:
+ if k in kwargs:
+ load_kwargs[k] = kwargs[k]
+ _AIRLLM_MODEL_CACHE[cache_key] = AutoModel.from_pretrained(**load_kwargs)
+
+ air_model = _AIRLLM_MODEL_CACHE[cache_key]
+
+ try:
+ chat_text = air_model.tokenizer.apply_chat_template(
+ result["messages"], tokenize=False, add_generation_prompt=True
+ )
+ except Exception:
+ # Fallback if chat template is not available
+ chat_text = "\n".join(
+ f"{m['role']}: {m['content']}" for m in result["messages"]
+ )
+ chat_text += "\nassistant:"
+
+ try:
+ if is_macos:
+ import mlx.core as mx
+ tokens = air_model.tokenizer(
+ chat_text, return_tensors="np", truncation=True, max_length=2048
+ )
+ output = air_model.generate(
+ mx.array(tokens['input_ids']),
+ max_new_tokens=max_tokens,
+ )
+ # MLX backend returns string directly
+ response_content = output if isinstance(output, str) else str(output)
+ else:
+ tokens = air_model.tokenizer(
+ chat_text, return_tensors="pt", truncation=True, max_length=2048
+ )
+ gen_out = air_model.generate(
+ tokens['input_ids'].cuda(),
+ max_new_tokens=max_tokens,
+ )
+ # CUDA backend returns token IDs, decode them
+ output_ids = gen_out.sequences[0] if hasattr(gen_out, 'sequences') else gen_out[0]
+ response_content = air_model.tokenizer.decode(output_ids, skip_special_tokens=True)
+ # Strip the input prompt from the output
+ input_text = air_model.tokenizer.decode(tokens['input_ids'][0], skip_special_tokens=True)
+ if response_content.startswith(input_text):
+ response_content = response_content[len(input_text):]
+
+ response_content = response_content.strip()
+ # Strip at common stop/special tokens that airllm doesn't handle
+ for stop_tok in ["<|im_end|>", "<|endoftext|>", "<|eot_id|>", "</s>"]:
+ if stop_tok in response_content:
+ response_content = response_content[:response_content.index(stop_tok)].strip()
+ except Exception as e:
+ logger.error(f"AirLLM inference error: {e}")
+ result["error"] = f"AirLLM inference error: {str(e)}"
+ result["response"] = ""
+ return result
+
+ result["response"] = response_content
+ result["raw_response"] = response_content
+ result["messages"].append({"role": "assistant", "content": response_content})
+
+ if format == "json":
+ try:
+ if response_content.startswith("```json"):
+ response_content = response_content.replace("```json", "").replace("```", "").strip()
+ parsed_response = json.loads(response_content)
+ result["response"] = parsed_response
+ except json.JSONDecodeError:
+ result["error"] = f"Invalid JSON response: {response_content}"
+
+ return result
+
+
  def get_litellm_response(
  prompt: str = None,
  model: str = None,
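get_airllm_response keeps loaded AirLLM models in a module-level cache keyed by model name and compression, resolves a Hugging Face token from kwargs, environment variables, or the local huggingface_hub config, and selects the MLX or CUDA path by platform. A hedged usage sketch based on the signature added in this diff; the model name is the function's documented default and the extra kwargs are the ones the code reads:

```python
from npcpy.gen.response import get_airllm_response

result = get_airllm_response(
    prompt="Summarize the benefits of layered model loading in two sentences.",
    model="meta-llama/Meta-Llama-3.1-70B-Instruct",  # default used when model is None
    messages=[],
    max_tokens=128,      # read from kwargs; defaults to 256
    compression="4bit",  # CUDA default; on macOS the default is None and the MLX path is used
)

if result.get("error"):
    print("AirLLM error:", result["error"])
else:
    print(result["response"])

# A second call with the same model and compression reuses the cached AutoModel instance.
```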
@@ -921,6 +1149,17 @@ def get_litellm_response(
  auto_process_tool_calls=auto_process_tool_calls,
  **kwargs
  )
+ elif provider == 'airllm':
+ return get_airllm_response(
+ prompt=prompt,
+ model=model,
+ tools=tools,
+ tool_map=tool_map,
+ format=format,
+ messages=messages,
+ auto_process_tool_calls=auto_process_tool_calls,
+ **kwargs
+ )
  elif provider == 'lmstudio' or (model and '.lmstudio' in str(model)):
  # LM Studio uses OpenAI-compatible API on port 1234
  # Also detect models with .lmstudio in path (e.g., /home/user/.lmstudio/models/...)
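With the routing added above, the AirLLM backend is reachable through the same dispatcher as the other local providers. A minimal sketch, assuming get_litellm_response accepts a provider argument as the surrounding branches imply (the exact call site and parameter names in npcpy may differ):

```python
from npcpy.gen.response import get_litellm_response

result = get_litellm_response(
    prompt="List three uses of on-disk layered inference.",
    model="meta-llama/Meta-Llama-3.1-70B-Instruct",  # illustrative 70B model
    provider="airllm",  # routes to get_airllm_response via the new elif branch
    messages=[],
)
print(result["response"])
```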