ik-llama-cpp-python 0.1.0__cp311-cp311-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
ik_llama_cpp/llama.py ADDED
@@ -0,0 +1,236 @@
1
+ """High-level IkLlama class — drop-in compatible with llama_cpp.Llama."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+ import re
7
+ import struct
8
+ from typing import Any
9
+
10
+ from . import _ctypes_api as C
11
+ from ._internals import IkModel, IkContext, make_batch_range, make_batch_single
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+ # Special token markers that may leak into generated text
16
+ _SPECIAL_TOKEN_RE = re.compile(
17
+ r"<start_of_turn>|<end_of_turn>|<turn\|>|<\|tool_response\|?>|</s>"
18
+ )
19
+
20
+
21
+ def _cpu_has_avx_vnni() -> bool:
22
+ """Detect AVX-VNNI support (CPUID leaf 7, sub-leaf 1, EAX bit 4) via py-cpuinfo flags."""
23
+ try:
24
+ import cpuinfo
25
+ info = cpuinfo.get_cpu_info()
26
+ flags = info.get("flags", [])
27
+ return "avx_vnni" in flags or "avxvnni" in flags
28
+ except ImportError:
29
+ pass
30
+ # Fallback: not detectable, assume absent
31
+ return False
32
+
33
+
34
+ class IkLlama:
35
+ """High-level wrapper for ik_llama.cpp inference.
36
+
37
+ API designed to be compatible with ``llama_cpp.Llama`` so that
38
+ ``litegraph.LlamaCppBackend`` can use it as a drop-in replacement.
39
+
40
+ Usage::
41
+
42
+ llm = IkLlama("model.gguf", n_ctx=4096)
43
+ response = llm.create_chat_completion(
44
+ messages=[{"role": "user", "content": "Hello!"}],
45
+ temperature=0.3,
46
+ max_tokens=256,
47
+ )
48
+ print(response["choices"][0]["message"]["content"])
49
+ """
50
+
51
+ def __init__(
52
+ self,
53
+ model_path: str,
54
+ *,
55
+ n_ctx: int = 4096,
56
+ n_threads: int = 0,
57
+ use_mmap: bool = True,
58
+ use_mlock: bool = False,
59
+ flash_attn: bool = True,
60
+ n_gpu_layers: int = 0,
61
+ verbose: bool = True,
62
+ ):
63
+ self._model = IkModel(
64
+ model_path, use_mmap=use_mmap, use_mlock=use_mlock,
65
+ n_gpu_layers=n_gpu_layers,
66
+ )
67
+
68
+ # Detect non-VNNI CPU — ik_llama.cpp flash attention
69
+ # (iqk_fa_templates.h) triggers GGML_ASSERT(S > 0) on longer
70
+ # prompts without AVX-VNNI, regardless of quant type.
71
+ self._has_vnni = _cpu_has_avx_vnni()
72
+ if flash_attn and not self._has_vnni:
73
+ logger.warning(
74
+ "AVX-VNNI not detected — disabling flash_attn to avoid "
75
+ "ik_llama.cpp flash attention assert failures on longer prompts. "
76
+ "For full ik_llama.cpp performance, use a Zen 4+ or Alder Lake+ CPU."
77
+ )
78
+ flash_attn = False
79
+
80
+ self._context = IkContext(
81
+ self._model, n_ctx=n_ctx, n_threads=n_threads,
82
+ flash_attn=flash_attn,
83
+ )
84
+ self._n_ctx = n_ctx
85
+ self._verbose = verbose
86
+
87
+ @property
88
+ def ctx(self):
89
+ """Raw context pointer — for perf timing access."""
90
+ return self._context.ctx
91
+
92
+ def tokenize(self, text: str, *, add_bos: bool = True, special: bool = False) -> list[int]:
93
+ return self._model.tokenize(text, add_bos=add_bos, special=special)
94
+
95
+ def detokenize(self, tokens: list[int]) -> str:
96
+ return self._model.detokenize(tokens)
97
+
98
+ def generate(
99
+ self,
100
+ tokens: list[int],
101
+ *,
102
+ max_tokens: int = 256,
103
+ temperature: float = 0.0,
104
+ top_k: int = 40,
105
+ top_p: float = 0.95,
106
+ ) -> list[int]:
107
+ """Generate tokens from a prompt token list. Returns generated token ids."""
108
+ self._context.perf_reset()
109
+
110
+ n_ubatch = self._context._n_ubatch
111
+ n_tokens = len(tokens)
112
+
113
+ # Prefill in n_ubatch-sized chunks to avoid compute buffer overflow
114
+ for i in range(0, n_tokens, n_ubatch):
115
+ chunk = tokens[i : i + n_ubatch]
116
+ is_last_chunk = (i + n_ubatch >= n_tokens)
117
+ batch = make_batch_range(chunk, pos_start=i, logits_last=is_last_chunk)
118
+ ret = self._context.decode(batch)
119
+ C.llama_batch_free(batch)
120
+ if ret != 0:
121
+ raise RuntimeError(
122
+ f"llama_decode failed during prefill (chunk {i}..{i+len(chunk)}, "
123
+ f"n_ubatch={n_ubatch}): {ret}"
124
+ )
125
+
126
+ generated: list[int] = []
127
+ pos = len(tokens)
128
+
129
+ for _ in range(max_tokens):
130
+ token_id = self._context.sample(
131
+ -1, temperature=temperature, top_k=top_k, top_p=top_p,
132
+ )
133
+
134
+ # EOG check using the model's own EOG token list
135
+ if C.llama_token_is_eog(self._model.model, token_id):
136
+ break
137
+
138
+ generated.append(token_id)
139
+
140
+ # Decode next token
141
+ batch = make_batch_single(token_id, pos)
142
+ ret = self._context.decode(batch)
143
+ C.llama_batch_free(batch)
144
+ if ret != 0:
145
+ break
146
+ pos += 1
147
+
148
+ return generated
149
+
150
+ def create_chat_completion(
151
+ self,
152
+ messages: list[dict[str, str]],
153
+ *,
154
+ temperature: float = 0.3,
155
+ max_tokens: int = 256,
156
+ top_k: int = 40,
157
+ top_p: float = 0.95,
158
+ ) -> dict[str, Any]:
159
+ """OpenAI-compatible chat completion.
160
+
161
+ Returns a dict matching the ``llama_cpp.Llama.create_chat_completion``
162
+ schema: choices[0].message.content, usage.prompt_tokens, etc.
163
+ """
164
+ prompt = self._apply_chat_template(messages)
165
+ tokens = self.tokenize(prompt, add_bos=False, special=True)
166
+ prompt_tokens = len(tokens)
167
+
168
+ gen_ids = self.generate(
169
+ tokens, max_tokens=max_tokens, temperature=temperature,
170
+ top_k=top_k, top_p=top_p,
171
+ )
172
+
173
+ text = self.detokenize(gen_ids)
174
+ # Strip special-token markers that can leak into the generated text
175
+ text = _SPECIAL_TOKEN_RE.sub("", text).strip()
176
+ completion_tokens = len(gen_ids)
177
+
178
+ return {
179
+ "id": "chatcmpl-ik",
180
+ "object": "chat.completion",
181
+ "choices": [
182
+ {
183
+ "index": 0,
184
+ "message": {"role": "assistant", "content": text},
185
+ "finish_reason": "stop",
186
+ }
187
+ ],
188
+ "usage": {
189
+ "prompt_tokens": prompt_tokens,
190
+ "completion_tokens": completion_tokens,
191
+ "total_tokens": prompt_tokens + completion_tokens,
192
+ },
193
+ }
194
+
195
+ def chat(self, prompt: str, *, temperature: float = 0.3,
196
+ max_tokens: int = 256) -> str:
197
+ """Convenience: single user message -> response text."""
198
+ resp = self.create_chat_completion(
199
+ messages=[{"role": "user", "content": prompt}],
200
+ temperature=temperature, max_tokens=max_tokens,
201
+ )
202
+ return resp["choices"][0]["message"]["content"]
203
+
204
+ def close(self):
205
+ if self._context:
206
+ self._context.close()
207
+ self._context = None
208
+ if self._model:
209
+ self._model.close()
210
+ self._model = None
211
+
212
+ def __del__(self):
213
+ self.close()
214
+
215
+ @staticmethod
216
+ def _apply_chat_template(messages: list[dict[str, str]]) -> str:
217
+ """Apply Gemma-style chat template.
218
+
219
+ Format::
220
+
221
+ <bos><start_of_turn>user
222
+ {content}<end_of_turn>
223
+ <start_of_turn>model
224
+ """
225
+ parts = ["<bos>"]
226
+ for msg in messages:
227
+ role = msg["role"]
228
+ content = msg["content"]
229
+ if role == "system":
230
+ parts.append(f"<start_of_turn>user\n{content}<end_of_turn>\n")
231
+ elif role == "user":
232
+ parts.append(f"<start_of_turn>user\n{content}<end_of_turn>\n")
233
+ elif role == "assistant":
234
+ parts.append(f"<start_of_turn>model\n{content}<end_of_turn>\n")
235
+ parts.append("<start_of_turn>model\n")
236
+ return "".join(parts)
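
A minimal usage sketch for the IkLlama wrapper above, for reference. It relies only on the public surface shown in this diff (the constructor arguments, create_chat_completion, the OpenAI-style return dict, and close); the model path is a placeholder, and the system message is folded into the first user turn because the Gemma template produced by _apply_chat_template has no system role.

    from ik_llama_cpp.llama import IkLlama

    # Placeholder model path; any Gemma-style GGUF is assumed.
    llm = IkLlama("models/gemma-IQ4_KT.gguf", n_ctx=4096, flash_attn=True)

    response = llm.create_chat_completion(
        messages=[
            {"role": "system", "content": "Answer in one sentence."},
            {"role": "user", "content": "What does AVX-VNNI accelerate?"},
        ],
        temperature=0.3,
        max_tokens=128,
    )
    print(response["choices"][0]["message"]["content"])
    print(response["usage"])  # prompt_tokens, completion_tokens, total_tokens

    llm.close()
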
ik_llama_cpp/quantize.py ADDED
@@ -0,0 +1,278 @@
1
+ """Quantize GGUF models using ik_llama.cpp's llama-quantize.
2
+
3
+ Supports IQ4_KT and other ik_llama.cpp-specific quantization formats.
4
+
5
+ Usage:
6
+ # Quantize with imatrix (recommended for IQ quants)
7
+ ik-llama-quantize model-bf16.gguf model-IQ4_KT.gguf IQ4_KT --imatrix model-imatrix.gguf
8
+
9
+ # Quantize without imatrix
10
+ ik-llama-quantize model-bf16.gguf model-IQ4_KT.gguf IQ4_KT
11
+
12
+ # Download bf16 + imatrix from HuggingFace and quantize in one step
13
+ ik-llama-quantize --hf-repo bartowski/google_gemma-4-E2B-it-GGUF --hf-quant IQ4_KT
14
+
15
+ # As a Python module
16
+ python -m ik_llama_cpp.quantize model-bf16.gguf model-IQ4_KT.gguf IQ4_KT
17
+ """
18
+
19
+ from __future__ import annotations
20
+
21
+ import argparse
22
+ import os
23
+ import platform
24
+ import shutil
25
+ import subprocess
26
+ import sys
27
+ from pathlib import Path
28
+
29
+
30
+ # ik_llama.cpp-specific quant types (superset of upstream llama.cpp)
31
+ IK_QUANT_TYPES = [
32
+ "IQ4_KT", "IQ3_KT", "IQ2_KT", "IQ1_KT",
33
+ "IQ4_KS", "IQ4_KSS", "IQ3_KS",
34
+ "Q4_K_M", "Q4_K_S", "Q4_K_L",
35
+ "Q8_0", "Q6_K", "Q5_K_M", "Q3_K_M",
36
+ ]
37
+
38
+
39
+ def find_quantize_bin() -> Path | None:
40
+ """Find the llama-quantize binary bundled with this package or on PATH."""
41
+ # 1. Check inside the installed package (ik_llama_cpp/bin/)
42
+ pkg_bin = Path(__file__).parent / "bin"
43
+ for name in ["llama-quantize.exe", "llama-quantize"]:
44
+ candidate = pkg_bin / name
45
+ if candidate.is_file():
46
+ return candidate
47
+
48
+ # 2. Check PATH
49
+ which = shutil.which("llama-quantize")
50
+ if which:
51
+ return Path(which)
52
+
53
+ # 3. Check common build dirs relative to source tree
54
+ src_root = Path(__file__).resolve().parent.parent
55
+ vendor_src = src_root / "vendor" / "ik_llama.cpp"
56
+ if vendor_src.is_dir():
57
+ for build_dir in ["build", "build/bin", "build/Release",
58
+ "build/bin/Release", "build/examples/quantize",
59
+ "build/examples/quantize/Release"]:
60
+ d = vendor_src / build_dir
61
+ for name in ["llama-quantize.exe", "llama-quantize"]:
62
+ candidate = d / name
63
+ if candidate.is_file():
64
+ return candidate
65
+
66
+ return None
67
+
68
+
69
+ def quantize(
70
+ input_path: str | Path,
71
+ output_path: str | Path,
72
+ quant_type: str = "IQ4_KT",
73
+ imatrix_path: str | Path | None = None,
74
+ ) -> Path:
75
+ """Quantize a GGUF model using ik_llama.cpp's llama-quantize.
76
+
77
+ Args:
78
+ input_path: Path to the source GGUF (bf16 or f16).
79
+ output_path: Path for the quantized output GGUF.
80
+ quant_type: Quantization type (e.g. "IQ4_KT", "Q4_K_M").
81
+ imatrix_path: Optional importance matrix for better quality.
82
+
83
+ Returns:
84
+ Path to the quantized output file.
85
+
86
+ Raises:
87
+ FileNotFoundError: If llama-quantize binary is not found.
88
+ subprocess.CalledProcessError: If quantization fails.
89
+ """
90
+ quantize_bin = find_quantize_bin()
91
+ if quantize_bin is None:
92
+ raise FileNotFoundError(
93
+ "llama-quantize not found. Ensure ik-llama-cpp-python is installed "
94
+ "with the quantize binary, or build it from source:\n"
95
+ " pip install ik-llama-cpp-python # includes llama-quantize\n"
96
+ " # Or build from ik_llama.cpp source:\n"
97
+ " cd vendor/ik_llama.cpp && mkdir build && cd build\n"
98
+ " cmake .. -DLLAMA_BUILD_EXAMPLES=ON && cmake --build . --target llama-quantize"
99
+ )
100
+
101
+ input_path = Path(input_path)
102
+ output_path = Path(output_path)
103
+
104
+ if not input_path.is_file():
105
+ raise FileNotFoundError(f"Input GGUF not found: {input_path}")
106
+
107
+ output_path.parent.mkdir(parents=True, exist_ok=True)
108
+
109
+ cmd = [str(quantize_bin)]
110
+ if imatrix_path is not None:
111
+ imatrix_path = Path(imatrix_path)
112
+ if not imatrix_path.is_file():
113
+ raise FileNotFoundError(f"imatrix file not found: {imatrix_path}")
114
+ cmd.extend(["--imatrix", str(imatrix_path)])
115
+ cmd.extend([str(input_path), str(output_path), quant_type])
116
+
117
+ print(f"Quantizing: {input_path.name} -> {output_path.name} ({quant_type})")
118
+ result = subprocess.run(cmd)
119
+
120
+ # If imatrix failed (format mismatch), retry without it
121
+ if result.returncode != 0 and imatrix_path is not None:
122
+ print("\nWarning: quantization with imatrix failed (likely format mismatch).")
123
+ print("Retrying without imatrix...")
124
+ # Clean up partial output
125
+ if output_path.is_file():
126
+ output_path.unlink()
127
+ cmd_no_imat = [str(quantize_bin), str(input_path), str(output_path), quant_type]
128
+ subprocess.run(cmd_no_imat, check=True)
129
+ elif result.returncode != 0:
130
+ raise subprocess.CalledProcessError(result.returncode, cmd)
131
+
132
+ if not output_path.is_file():
133
+ raise RuntimeError(f"Quantization completed but output not found: {output_path}")
134
+
135
+ size_gb = output_path.stat().st_size / (1024 ** 3)
136
+ print(f"Done! {output_path} ({size_gb:.2f} GB)")
137
+ return output_path
138
+
139
+
140
+ def quantize_from_hf(
141
+ repo_id: str,
142
+ quant_type: str = "IQ4_KT",
143
+ output_dir: str | Path | None = None,
144
+ ) -> Path:
145
+ """Download a bf16 GGUF + imatrix from HuggingFace and quantize.
146
+
147
+ Expects the repo to follow bartowski's naming convention:
148
+ - <prefix>-bf16.gguf
149
+ - <prefix>-imatrix.gguf
150
+
151
+ Args:
152
+ repo_id: HuggingFace repo (e.g. "bartowski/google_gemma-4-E2B-it-GGUF").
153
+ quant_type: Quantization type (default: "IQ4_KT").
154
+ output_dir: Directory for downloaded and output files.
155
+
156
+ Returns:
157
+ Path to the quantized output file.
158
+ """
159
+ from huggingface_hub import hf_hub_download, list_repo_files
160
+
161
+ # Discover bf16 and imatrix files
162
+ files = list_repo_files(repo_id)
163
+ bf16_files = [f for f in files if f.endswith("-bf16.gguf")]
164
+ imatrix_files = [f for f in files if f.endswith("-imatrix.gguf")]
165
+
166
+ if not bf16_files:
167
+ # Fallback: try f16
168
+ bf16_files = [f for f in files if f.endswith("-f16.gguf")]
169
+ if not bf16_files:
170
+ raise FileNotFoundError(
171
+ f"No bf16/f16 source GGUF found in {repo_id}. "
172
+ f"Available: {[f for f in files if f.endswith('.gguf')]}"
173
+ )
174
+
175
+ bf16_name = bf16_files[0]
176
+ prefix = bf16_name.removesuffix("-bf16.gguf").removesuffix("-f16.gguf")
177
+ output_name = f"{prefix}-{quant_type}.gguf"
178
+
179
+ if output_dir is None:
180
+ output_dir = Path("models") / repo_id.split("/")[-1].lower().replace("-gguf", "")
181
+ output_dir = Path(output_dir)
182
+ output_dir.mkdir(parents=True, exist_ok=True)
183
+
184
+ output_path = output_dir / output_name
185
+ if output_path.is_file():
186
+ size_gb = output_path.stat().st_size / (1024 ** 3)
187
+ print(f"Already exists: {output_path} ({size_gb:.2f} GB)")
188
+ return output_path
189
+
190
+ # Download bf16
191
+ bf16_path = output_dir / bf16_name
192
+ if not bf16_path.is_file():
193
+ print(f"Downloading {bf16_name} from {repo_id}...")
194
+ hf_hub_download(repo_id=repo_id, filename=bf16_name, local_dir=str(output_dir))
195
+
196
+ # Download imatrix (optional but recommended for IQ quants)
197
+ imatrix_path = None
198
+ if imatrix_files:
199
+ imat_name = imatrix_files[0]
200
+ imatrix_path = output_dir / imat_name
201
+ if not imatrix_path.is_file():
202
+ print(f"Downloading {imat_name} from {repo_id}...")
203
+ hf_hub_download(repo_id=repo_id, filename=imat_name, local_dir=str(output_dir))
204
+
205
+ # Quantize
206
+ result = quantize(bf16_path, output_path, quant_type, imatrix_path)
207
+
208
+ # Hint about cleanup
209
+ bf16_size_gb = bf16_path.stat().st_size / (1024 ** 3)
210
+ print(f"\nTip: delete {bf16_path.name} to save {bf16_size_gb:.1f} GB")
211
+
212
+ return result
213
+
214
+
215
+ def main():
216
+ parser = argparse.ArgumentParser(
217
+ prog="ik-llama-quantize",
218
+ description="Quantize GGUF models using ik_llama.cpp (supports IQ4_KT and other IK quants)",
219
+ )
220
+ sub = parser.add_subparsers(dest="command")
221
+
222
+ # --- Direct quantize ---
223
+ p_quant = sub.add_parser("quantize", help="Quantize a local GGUF file")
224
+ p_quant.add_argument("input", help="Source GGUF file (bf16 or f16)")
225
+ p_quant.add_argument("output", help="Output GGUF file path")
226
+ p_quant.add_argument("type", nargs="?", default="IQ4_KT",
227
+ help="Quantization type (default: IQ4_KT)")
228
+ p_quant.add_argument("--imatrix", help="Importance matrix file for better quality")
229
+
230
+ # --- Download + quantize from HuggingFace ---
231
+ p_hf = sub.add_parser("from-hf",
232
+ help="Download bf16 from HuggingFace and quantize")
233
+ p_hf.add_argument("repo", help="HuggingFace repo ID (e.g. bartowski/google_gemma-4-E2B-it-GGUF)")
234
+ p_hf.add_argument("--type", default="IQ4_KT",
235
+ help="Quantization type (default: IQ4_KT)")
236
+ p_hf.add_argument("--output-dir", help="Output directory (default: models/<repo-name>)")
237
+
238
+ # --- Check binary ---
239
+ sub.add_parser("check", help="Check if llama-quantize binary is available")
240
+
241
+ # Allow positional-only usage: ik-llama-quantize input output [type] [--imatrix F].
+ # argparse rejects an unknown positional before parse_args() returns, so bare
+ # positional usage is rewritten as the "quantize" subcommand up front instead of
+ # being reconstructed afterwards.
+ if (len(sys.argv) >= 3 and not sys.argv[1].startswith("-")
+ and sys.argv[1] not in ("quantize", "from-hf", "check")):
+ sys.argv.insert(1, "quantize")
+
+ args = parser.parse_args()
+
+ if args.command is None:
+ parser.print_help()
+ sys.exit(1)
261
+
262
+ if args.command == "check":
263
+ b = find_quantize_bin()
264
+ if b:
265
+ print(f"llama-quantize found: {b}")
266
+ else:
267
+ print("llama-quantize not found")
268
+ sys.exit(1)
269
+
270
+ elif args.command == "quantize":
271
+ quantize(args.input, args.output, args.type, args.imatrix)
272
+
273
+ elif args.command == "from-hf":
274
+ quantize_from_hf(args.repo, args.type, args.output_dir)
275
+
276
+
277
+ if __name__ == "__main__":
278
+ main()
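
For completeness, a sketch of driving the quantization helpers above from Python rather than the CLI. The local file names are placeholders and the repo ID follows the example in the module docstring; find_quantize_bin() must be able to locate a llama-quantize binary, and quantize() falls back to a no-imatrix retry if the first pass fails.

    from ik_llama_cpp.quantize import find_quantize_bin, quantize, quantize_from_hf

    # Verify the bundled or PATH-resolved llama-quantize binary is available.
    assert find_quantize_bin() is not None, "llama-quantize not found"

    # Quantize a local bf16 GGUF (imatrix optional but recommended for IQ quants).
    out = quantize(
        "models/model-bf16.gguf",
        "models/model-IQ4_KT.gguf",
        quant_type="IQ4_KT",
        imatrix_path="models/model-imatrix.gguf",
    )
    print(f"{out} ({out.stat().st_size / 1024**3:.2f} GB)")

    # Or download a bf16 GGUF + imatrix from HuggingFace and quantize in one step.
    out = quantize_from_hf("bartowski/google_gemma-4-E2B-it-GGUF", quant_type="IQ4_KT")
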