ik-llama-cpp-python 0.1.0__cp311-cp311-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
ik_llama_cpp/llama.py ADDED
@@ -0,0 +1,236 @@
1
+ """High-level IkLlama class — drop-in compatible with llama_cpp.Llama."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+ import re
7
+ import struct
8
+ from typing import Any
9
+
10
+ from . import _ctypes_api as C
11
+ from ._internals import IkModel, IkContext, make_batch_range, make_batch_single
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+ # Special token markers that may leak into generated text
16
+ _SPECIAL_TOKEN_RE = re.compile(
17
+ r"<start_of_turn>|<end_of_turn>|<turn\|>|<\|tool_response\|?>|</s>"
18
+ )
19
+
20
+
21
+ def _cpu_has_avx_vnni() -> bool:
22
+ """Detect AVX-VNNI support (CPUID leaf 7, sub-leaf 1, EAX bit 4) via py-cpuinfo flags."""
23
+ try:
24
+ import cpuinfo
25
+ info = cpuinfo.get_cpu_info()
26
+ flags = info.get("flags", [])
27
+ return "avx_vnni" in flags or "avxvnni" in flags
28
+ except ImportError:
29
+ pass
30
+ # Fallback: not detectable, assume absent
31
+ return False
32
+
33
+
34
+ class IkLlama:
35
+ """High-level wrapper for ik_llama.cpp inference.
36
+
37
+ API designed to be compatible with ``llama_cpp.Llama`` so that
38
+ ``litegraph.LlamaCppBackend`` can use it as a drop-in replacement.
39
+
40
+ Usage::
41
+
42
+ llm = IkLlama("model.gguf", n_ctx=4096)
43
+ response = llm.create_chat_completion(
44
+ messages=[{"role": "user", "content": "Hello!"}],
45
+ temperature=0.3,
46
+ max_tokens=256,
47
+ )
48
+ print(response["choices"][0]["message"]["content"])
49
+ """
50
+
51
+ def __init__(
52
+ self,
53
+ model_path: str,
54
+ *,
55
+ n_ctx: int = 4096,
56
+ n_threads: int = 0,
57
+ use_mmap: bool = True,
58
+ use_mlock: bool = False,
59
+ flash_attn: bool = True,
60
+ n_gpu_layers: int = 0,
61
+ verbose: bool = True,
62
+ ):
63
+ self._model = IkModel(
64
+ model_path, use_mmap=use_mmap, use_mlock=use_mlock,
65
+ n_gpu_layers=n_gpu_layers,
66
+ )
67
+
68
+ # Detect non-VNNI CPU — ik_llama.cpp flash attention
69
+ # (iqk_fa_templates.h) triggers GGML_ASSERT(S > 0) on longer
70
+ # prompts without AVX-VNNI, regardless of quant type.
71
+ self._has_vnni = _cpu_has_avx_vnni()
72
+ if flash_attn and not self._has_vnni:
73
+ logger.warning(
74
+ "AVX-VNNI not detected — disabling flash_attn to avoid "
75
+ "ik_llama.cpp flash attention assert failures on longer prompts. "
76
+ "For full ik_llama.cpp performance, use a Zen 4+ or Alder Lake+ CPU."
77
+ )
78
+ flash_attn = False
79
+
80
+ self._context = IkContext(
81
+ self._model, n_ctx=n_ctx, n_threads=n_threads,
82
+ flash_attn=flash_attn,
83
+ )
84
+ self._n_ctx = n_ctx
85
+ self._verbose = verbose
86
+
87
+ @property
88
+ def ctx(self):
89
+ """Raw context pointer — for perf timing access."""
90
+ return self._context.ctx
91
+
92
+ def tokenize(self, text: str, *, add_bos: bool = True, special: bool = False) -> list[int]:
93
+ return self._model.tokenize(text, add_bos=add_bos, special=special)
94
+
95
+ def detokenize(self, tokens: list[int]) -> str:
96
+ return self._model.detokenize(tokens)
97
+
98
+ def generate(
99
+ self,
100
+ tokens: list[int],
101
+ *,
102
+ max_tokens: int = 256,
103
+ temperature: float = 0.0,
104
+ top_k: int = 40,
105
+ top_p: float = 0.95,
106
+ ) -> list[int]:
107
+ """Generate tokens from a prompt token list. Returns generated token ids."""
108
+ self._context.perf_reset()
109
+
110
+ n_ubatch = self._context._n_ubatch
111
+ n_tokens = len(tokens)
112
+
113
+ # Prefill in n_ubatch-sized chunks to avoid compute buffer overflow
114
+ for i in range(0, n_tokens, n_ubatch):
115
+ chunk = tokens[i : i + n_ubatch]
116
+ is_last_chunk = (i + n_ubatch >= n_tokens)
117
+ batch = make_batch_range(chunk, pos_start=i, logits_last=is_last_chunk)
118
+ ret = self._context.decode(batch)
119
+ C.llama_batch_free(batch)
120
+ if ret != 0:
121
+ raise RuntimeError(
122
+ f"llama_decode failed during prefill (chunk {i}..{i+len(chunk)}, "
123
+ f"n_ubatch={n_ubatch}): {ret}"
124
+ )
125
+
126
+ generated: list[int] = []
127
+ pos = len(tokens)
128
+
129
+ for _ in range(max_tokens):
130
+ token_id = self._context.sample(
131
+ -1, temperature=temperature, top_k=top_k, top_p=top_p,
132
+ )
133
+
134
+ # EOG check using the model's own EOG token list
135
+ if C.llama_token_is_eog(self._model.model, token_id):
136
+ break
137
+
138
+ generated.append(token_id)
139
+
140
+ # Decode next token
141
+ batch = make_batch_single(token_id, pos)
142
+ ret = self._context.decode(batch)
143
+ C.llama_batch_free(batch)
144
+ if ret != 0:
145
+ break
146
+ pos += 1
147
+
148
+ return generated
149
+
150
+ def create_chat_completion(
151
+ self,
152
+ messages: list[dict[str, str]],
153
+ *,
154
+ temperature: float = 0.3,
155
+ max_tokens: int = 256,
156
+ top_k: int = 40,
157
+ top_p: float = 0.95,
158
+ ) -> dict[str, Any]:
159
+ """OpenAI-compatible chat completion.
160
+
161
+ Returns a dict matching the ``llama_cpp.Llama.create_chat_completion``
162
+ schema: choices[0].message.content, usage.prompt_tokens, etc.
163
+ """
164
+ prompt = self._apply_chat_template(messages)
165
+ tokens = self.tokenize(prompt, add_bos=False, special=True)
166
+ prompt_tokens = len(tokens)
167
+
168
+ gen_ids = self.generate(
169
+ tokens, max_tokens=max_tokens, temperature=temperature,
170
+ top_k=top_k, top_p=top_p,
171
+ )
172
+
173
+ text = self.detokenize(gen_ids)
174
+ # Strip special-token markers that can leak into the generated text
175
+ text = _SPECIAL_TOKEN_RE.sub("", text).strip()
176
+ completion_tokens = len(gen_ids)
177
+
178
+ return {
179
+ "id": "chatcmpl-ik",
180
+ "object": "chat.completion",
181
+ "choices": [
182
+ {
183
+ "index": 0,
184
+ "message": {"role": "assistant", "content": text},
185
+ "finish_reason": "stop",
186
+ }
187
+ ],
188
+ "usage": {
189
+ "prompt_tokens": prompt_tokens,
190
+ "completion_tokens": completion_tokens,
191
+ "total_tokens": prompt_tokens + completion_tokens,
192
+ },
193
+ }
194
+
195
+ def chat(self, prompt: str, *, temperature: float = 0.3,
196
+ max_tokens: int = 256) -> str:
197
+ """Convenience: single user message -> response text."""
198
+ resp = self.create_chat_completion(
199
+ messages=[{"role": "user", "content": prompt}],
200
+ temperature=temperature, max_tokens=max_tokens,
201
+ )
202
+ return resp["choices"][0]["message"]["content"]
203
+
204
+ def close(self):
205
+ if self._context:
206
+ self._context.close()
207
+ self._context = None
208
+ if self._model:
209
+ self._model.close()
210
+ self._model = None
211
+
212
+ def __del__(self):
213
+ self.close()
214
+
215
+ @staticmethod
216
+ def _apply_chat_template(messages: list[dict[str, str]]) -> str:
217
+ """Apply Gemma-style chat template.
218
+
219
+ Format::
220
+
221
+ <bos><start_of_turn>user
222
+ {content}<end_of_turn>
223
+ <start_of_turn>model
224
+ """
225
+ parts = ["<bos>"]
226
+ for msg in messages:
227
+ role = msg["role"]
228
+ content = msg["content"]
229
+ if role == "system":
230
+ parts.append(f"<start_of_turn>user\n{content}<end_of_turn>\n")
231
+ elif role == "user":
232
+ parts.append(f"<start_of_turn>user\n{content}<end_of_turn>\n")
233
+ elif role == "assistant":
234
+ parts.append(f"<start_of_turn>model\n{content}<end_of_turn>\n")
235
+ parts.append("<start_of_turn>model\n")
236
+ return "".join(parts)
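
A minimal usage sketch for the IkLlama wrapper above, for reference. It relies only on the public surface shown in this diff (the constructor arguments, create_chat_completion, the OpenAI-style return dict, and close); the model path is a placeholder, and the system message is folded into the first user turn because the Gemma template produced by _apply_chat_template has no system role.

    from ik_llama_cpp.llama import IkLlama

    # Placeholder model path; any Gemma-style GGUF is assumed.
    llm = IkLlama("models/gemma-IQ4_KT.gguf", n_ctx=4096, flash_attn=True)

    response = llm.create_chat_completion(
        messages=[
            {"role": "system", "content": "Answer in one sentence."},
            {"role": "user", "content": "What does AVX-VNNI accelerate?"},
        ],
        temperature=0.3,
        max_tokens=128,
    )
    print(response["choices"][0]["message"]["content"])
    print(response["usage"])  # prompt_tokens, completion_tokens, total_tokens

    llm.close()
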
ik_llama_cpp/quantize.py ADDED
@@ -0,0 +1,278 @@
1
+ """Quantize GGUF models using ik_llama.cpp's llama-quantize.
2
+
3
+ Supports IQ4_KT and other ik_llama.cpp-specific quantization formats.
4
+
5
+ Usage:
6
+ # Quantize with imatrix (recommended for IQ quants)
7
+ ik-llama-quantize model-bf16.gguf model-IQ4_KT.gguf IQ4_KT --imatrix model-imatrix.gguf
8
+
9
+ # Quantize without imatrix
10
+ ik-llama-quantize model-bf16.gguf model-IQ4_KT.gguf IQ4_KT
11
+
12
+ # Download bf16 + imatrix from HuggingFace and quantize in one step
13
+ ik-llama-quantize --hf-repo bartowski/google_gemma-4-E2B-it-GGUF --hf-quant IQ4_KT
14
+
15
+ # As a Python module
16
+ python -m ik_llama_cpp.quantize model-bf16.gguf model-IQ4_KT.gguf IQ4_KT
17
+ """
18
+
19
+ from __future__ import annotations
20
+
21
+ import argparse
22
+ import os
23
+ import platform
24
+ import shutil
25
+ import subprocess
26
+ import sys
27
+ from pathlib import Path
28
+
29
+
30
+ # ik_llama.cpp-specific quant types (superset of upstream llama.cpp)
31
+ IK_QUANT_TYPES = [
32
+ "IQ4_KT", "IQ3_KT", "IQ2_KT", "IQ1_KT",
33
+ "IQ4_KS", "IQ4_KSS", "IQ3_KS",
34
+ "Q4_K_M", "Q4_K_S", "Q4_K_L",
35
+ "Q8_0", "Q6_K", "Q5_K_M", "Q3_K_M",
36
+ ]
37
+
38
+
39
+ def find_quantize_bin() -> Path | None:
40
+ """Find the llama-quantize binary bundled with this package or on PATH."""
41
+ # 1. Check inside the installed package (ik_llama_cpp/bin/)
42
+ pkg_bin = Path(__file__).parent / "bin"
43
+ for name in ["llama-quantize.exe", "llama-quantize"]:
44
+ candidate = pkg_bin / name
45
+ if candidate.is_file():
46
+ return candidate
47
+
48
+ # 2. Check PATH
49
+ which = shutil.which("llama-quantize")
50
+ if which:
51
+ return Path(which)
52
+
53
+ # 3. Check common build dirs relative to source tree
54
+ src_root = Path(__file__).resolve().parent.parent
55
+ vendor_src = src_root / "vendor" / "ik_llama.cpp"
56
+ if vendor_src.is_dir():
57
+ for build_dir in ["build", "build/bin", "build/Release",
58
+ "build/bin/Release", "build/examples/quantize",
59
+ "build/examples/quantize/Release"]:
60
+ d = vendor_src / build_dir
61
+ for name in ["llama-quantize.exe", "llama-quantize"]:
62
+ candidate = d / name
63
+ if candidate.is_file():
64
+ return candidate
65
+
66
+ return None
67
+
68
+
69
+ def quantize(
70
+ input_path: str | Path,
71
+ output_path: str | Path,
72
+ quant_type: str = "IQ4_KT",
73
+ imatrix_path: str | Path | None = None,
74
+ ) -> Path:
75
+ """Quantize a GGUF model using ik_llama.cpp's llama-quantize.
76
+
77
+ Args:
78
+ input_path: Path to the source GGUF (bf16 or f16).
79
+ output_path: Path for the quantized output GGUF.
80
+ quant_type: Quantization type (e.g. "IQ4_KT", "Q4_K_M").
81
+ imatrix_path: Optional importance matrix for better quality.
82
+
83
+ Returns:
84
+ Path to the quantized output file.
85
+
86
+ Raises:
87
+ FileNotFoundError: If llama-quantize binary is not found.
88
+ subprocess.CalledProcessError: If quantization fails.
89
+ """
90
+ quantize_bin = find_quantize_bin()
91
+ if quantize_bin is None:
92
+ raise FileNotFoundError(
93
+ "llama-quantize not found. Ensure ik-llama-cpp-python is installed "
94
+ "with the quantize binary, or build it from source:\n"
95
+ " pip install ik-llama-cpp-python # includes llama-quantize\n"
96
+ " # Or build from ik_llama.cpp source:\n"
97
+ " cd vendor/ik_llama.cpp && mkdir build && cd build\n"
98
+ " cmake .. -DLLAMA_BUILD_EXAMPLES=ON && cmake --build . --target llama-quantize"
99
+ )
100
+
101
+ input_path = Path(input_path)
102
+ output_path = Path(output_path)
103
+
104
+ if not input_path.is_file():
105
+ raise FileNotFoundError(f"Input GGUF not found: {input_path}")
106
+
107
+ output_path.parent.mkdir(parents=True, exist_ok=True)
108
+
109
+ cmd = [str(quantize_bin)]
110
+ if imatrix_path is not None:
111
+ imatrix_path = Path(imatrix_path)
112
+ if not imatrix_path.is_file():
113
+ raise FileNotFoundError(f"imatrix file not found: {imatrix_path}")
114
+ cmd.extend(["--imatrix", str(imatrix_path)])
115
+ cmd.extend([str(input_path), str(output_path), quant_type])
116
+
117
+ print(f"Quantizing: {input_path.name} -> {output_path.name} ({quant_type})")
118
+ result = subprocess.run(cmd)
119
+
120
+ # If imatrix failed (format mismatch), retry without it
121
+ if result.returncode != 0 and imatrix_path is not None:
122
+ print("\nWarning: quantization with imatrix failed (likely format mismatch).")
123
+ print("Retrying without imatrix...")
124
+ # Clean up partial output
125
+ if output_path.is_file():
126
+ output_path.unlink()
127
+ cmd_no_imat = [str(quantize_bin), str(input_path), str(output_path), quant_type]
128
+ subprocess.run(cmd_no_imat, check=True)
129
+ elif result.returncode != 0:
130
+ raise subprocess.CalledProcessError(result.returncode, cmd)
131
+
132
+ if not output_path.is_file():
133
+ raise RuntimeError(f"Quantization completed but output not found: {output_path}")
134
+
135
+ size_gb = output_path.stat().st_size / (1024 ** 3)
136
+ print(f"Done! {output_path} ({size_gb:.2f} GB)")
137
+ return output_path
138
+
139
+
140
+ def quantize_from_hf(
141
+ repo_id: str,
142
+ quant_type: str = "IQ4_KT",
143
+ output_dir: str | Path | None = None,
144
+ ) -> Path:
145
+ """Download a bf16 GGUF + imatrix from HuggingFace and quantize.
146
+
147
+ Expects the repo to follow bartowski's naming convention:
148
+ - <prefix>-bf16.gguf
149
+ - <prefix>-imatrix.gguf
150
+
151
+ Args:
152
+ repo_id: HuggingFace repo (e.g. "bartowski/google_gemma-4-E2B-it-GGUF").
153
+ quant_type: Quantization type (default: "IQ4_KT").
154
+ output_dir: Directory for downloaded and output files.
155
+
156
+ Returns:
157
+ Path to the quantized output file.
158
+ """
159
+ from huggingface_hub import hf_hub_download, list_repo_files
160
+
161
+ # Discover bf16 and imatrix files
162
+ files = list_repo_files(repo_id)
163
+ bf16_files = [f for f in files if f.endswith("-bf16.gguf")]
164
+ imatrix_files = [f for f in files if f.endswith("-imatrix.gguf")]
165
+
166
+ if not bf16_files:
167
+ # Fallback: try f16
168
+ bf16_files = [f for f in files if f.endswith("-f16.gguf")]
169
+ if not bf16_files:
170
+ raise FileNotFoundError(
171
+ f"No bf16/f16 source GGUF found in {repo_id}. "
172
+ f"Available: {[f for f in files if f.endswith('.gguf')]}"
173
+ )
174
+
175
+ bf16_name = bf16_files[0]
176
+ prefix = bf16_name.removesuffix("-bf16.gguf").removesuffix("-f16.gguf")
177
+ output_name = f"{prefix}-{quant_type}.gguf"
178
+
179
+ if output_dir is None:
180
+ output_dir = Path("models") / repo_id.split("/")[-1].lower().replace("-gguf", "")
181
+ output_dir = Path(output_dir)
182
+ output_dir.mkdir(parents=True, exist_ok=True)
183
+
184
+ output_path = output_dir / output_name
185
+ if output_path.is_file():
186
+ size_gb = output_path.stat().st_size / (1024 ** 3)
187
+ print(f"Already exists: {output_path} ({size_gb:.2f} GB)")
188
+ return output_path
189
+
190
+ # Download bf16
191
+ bf16_path = output_dir / bf16_name
192
+ if not bf16_path.is_file():
193
+ print(f"Downloading {bf16_name} from {repo_id}...")
194
+ hf_hub_download(repo_id=repo_id, filename=bf16_name, local_dir=str(output_dir))
195
+
196
+ # Download imatrix (optional but recommended for IQ quants)
197
+ imatrix_path = None
198
+ if imatrix_files:
199
+ imat_name = imatrix_files[0]
200
+ imatrix_path = output_dir / imat_name
201
+ if not imatrix_path.is_file():
202
+ print(f"Downloading {imat_name} from {repo_id}...")
203
+ hf_hub_download(repo_id=repo_id, filename=imat_name, local_dir=str(output_dir))
204
+
205
+ # Quantize
206
+ result = quantize(bf16_path, output_path, quant_type, imatrix_path)
207
+
208
+ # Hint about cleanup
209
+ bf16_size_gb = bf16_path.stat().st_size / (1024 ** 3)
210
+ print(f"\nTip: delete {bf16_path.name} to save {bf16_size_gb:.1f} GB")
211
+
212
+ return result
213
+
214
+
215
+ def main():
216
+ parser = argparse.ArgumentParser(
217
+ prog="ik-llama-quantize",
218
+ description="Quantize GGUF models using ik_llama.cpp (supports IQ4_KT and other IK quants)",
219
+ )
220
+ sub = parser.add_subparsers(dest="command")
221
+
222
+ # --- Direct quantize ---
223
+ p_quant = sub.add_parser("quantize", help="Quantize a local GGUF file")
224
+ p_quant.add_argument("input", help="Source GGUF file (bf16 or f16)")
225
+ p_quant.add_argument("output", help="Output GGUF file path")
226
+ p_quant.add_argument("type", nargs="?", default="IQ4_KT",
227
+ help="Quantization type (default: IQ4_KT)")
228
+ p_quant.add_argument("--imatrix", help="Importance matrix file for better quality")
229
+
230
+ # --- Download + quantize from HuggingFace ---
231
+ p_hf = sub.add_parser("from-hf",
232
+ help="Download bf16 from HuggingFace and quantize")
233
+ p_hf.add_argument("repo", help="HuggingFace repo ID (e.g. bartowski/google_gemma-4-E2B-it-GGUF)")
234
+ p_hf.add_argument("--type", default="IQ4_KT",
235
+ help="Quantization type (default: IQ4_KT)")
236
+ p_hf.add_argument("--output-dir", help="Output directory (default: models/<repo-name>)")
237
+
238
+ # --- Check binary ---
239
+ sub.add_parser("check", help="Check if llama-quantize binary is available")
240
+
241
+ # Allow positional-only usage: ik-llama-quantize input output [type] [--imatrix F].
+ # argparse rejects an unknown positional before parse_args() returns, so bare
+ # positional usage is rewritten as the "quantize" subcommand up front instead of
+ # being reconstructed afterwards.
+ if (len(sys.argv) >= 3 and not sys.argv[1].startswith("-")
+ and sys.argv[1] not in ("quantize", "from-hf", "check")):
+ sys.argv.insert(1, "quantize")
+
+ args = parser.parse_args()
+
+ if args.command is None:
+ parser.print_help()
+ sys.exit(1)
261
+
262
+ if args.command == "check":
263
+ b = find_quantize_bin()
264
+ if b:
265
+ print(f"llama-quantize found: {b}")
266
+ else:
267
+ print("llama-quantize not found")
268
+ sys.exit(1)
269
+
270
+ elif args.command == "quantize":
271
+ quantize(args.input, args.output, args.type, args.imatrix)
272
+
273
+ elif args.command == "from-hf":
274
+ quantize_from_hf(args.repo, args.type, args.output_dir)
275
+
276
+
277
+ if __name__ == "__main__":
278
+ main()
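
For completeness, a sketch of driving the quantization helpers above from Python rather than the CLI. The local file names are placeholders and the repo ID follows the example in the module docstring; find_quantize_bin() must be able to locate a llama-quantize binary, and quantize() falls back to a no-imatrix retry if the first pass fails.

    from ik_llama_cpp.quantize import find_quantize_bin, quantize, quantize_from_hf

    # Verify the bundled or PATH-resolved llama-quantize binary is available.
    assert find_quantize_bin() is not None, "llama-quantize not found"

    # Quantize a local bf16 GGUF (imatrix optional but recommended for IQ quants).
    out = quantize(
        "models/model-bf16.gguf",
        "models/model-IQ4_KT.gguf",
        quant_type="IQ4_KT",
        imatrix_path="models/model-imatrix.gguf",
    )
    print(f"{out} ({out.stat().st_size / 1024**3:.2f} GB)")

    # Or download a bf16 GGUF + imatrix from HuggingFace and quantize in one step.
    out = quantize_from_hf("bartowski/google_gemma-4-E2B-it-GGUF", quant_type="IQ4_KT")
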