lollms-client 0.14.1__py3-none-any.whl → 0.15.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of lollms-client might be problematic.

@@ -0,0 +1,591 @@
+ # bindings/llamacpp/binding.py
+ import json
+ from lollms_client.lollms_llm_binding import LollmsLLMBinding
+ from lollms_client.lollms_types import MSG_TYPE, ELF_COMPLETION_FORMAT
+ # from lollms_client.lollms_utilities import encode_image # Used for LLaVA if needed to prepare image data.
+
+ from typing import Optional, Callable, List, Union, Dict, Any
+ import os
+ import sys
+ import base64 # For LLaVA image encoding
+
+ from ascii_colors import ASCIIColors, trace_exception
+ import pipmaster as pm
+
+ # Ensure llama-cpp-python is installed.
+ # Installation can be tricky due to C++ compilation.
+ # Users might need to install it with specific flags for their hardware (e.g., CUDA, Metal).
+ pm.ensure_packages(["llama-cpp-python", "pillow", "tiktoken"]) # tiktoken as a very last resort if llama_cpp fails
+ import tiktoken # Fallback tokenizer used by tokenize/detokenize/count_tokens when the GGUF model is not loaded
+
+ try:
+     from llama_cpp import Llama, LlamaGrammar, LogStdErrToPython
+     # For LLaVA (multimodal) support
+     from llama_cpp.llama_chat_format import LlamaChatCompletionRequestMessageImageContentPart, LlamaChatCompletionRequestMessageTextContentPart
+ except ImportError as e:
+     ASCIIColors.error(f"Failed to import llama_cpp: {e}. Please ensure it is installed correctly.")
+     ASCIIColors.error("Try: pip install llama-cpp-python")
+     ASCIIColors.error("For GPU support, you might need to compile it with specific flags, e.g.:")
+     ASCIIColors.error(" CMAKE_ARGS=\"-DLLAMA_CUBLAS=on\" FORCE_CMAKE=1 pip install llama-cpp-python (for NVIDIA)")
+     ASCIIColors.error(" CMAKE_ARGS=\"-DLLAMA_METAL=on\" FORCE_CMAKE=1 pip install llama-cpp-python (for Apple Metal)")
+     Llama = None
+     LlamaGrammar = None
+     LogStdErrToPython = None
+     LlamaChatCompletionRequestMessageImageContentPart = None
+     LlamaChatCompletionRequestMessageTextContentPart = None
+     # It's critical that the script can run even if llama_cpp is not installed,
+     # so LoLLMs can still list it as an available binding and guide the user through installation.
+     # __init__ will raise an error if Llama is None and an attempt is made to use the binding.
+
+
+ BindingName = "PythonLlamaCppBinding"
+
+ class PythonLlamaCppBinding(LollmsLLMBinding):
+     """
+     Llama.cpp binding implementation using the llama-cpp-python library.
+     This binding loads and runs GGUF models locally.
+     """
+
+     DEFAULT_CONFIG = {
+         "n_gpu_layers": 0,
+         "main_gpu": 0,
+         "tensor_split": None,
+         "vocab_only": False,
+         "use_mmap": True,
+         "use_mlock": False,
+         "seed": -1, # -1 for random
+         "n_ctx": 2048,
+         "n_batch": 512,
+         "n_threads": None,
+         "n_threads_batch": None,
+         "rope_scaling_type": None,
+         "rope_freq_base": 0.0,
+         "rope_freq_scale": 0.0,
+         "yarn_ext_factor": -1.0,
+         "yarn_attn_factor": 1.0,
+         "yarn_beta_fast": 32.0,
+         "yarn_beta_slow": 1.0,
+         "yarn_orig_ctx": 0,
+         "logits_all": False,
+         "embedding": False, # Enable for model.embed()
+         "chat_format": "chatml", # Default chat format, LLaVA needs specific e.g. "llava-1-5"
+         "clip_model_path": None, # For LLaVA: path to the mmproj GGUF file
+         "verbose": True,
+         "temperature": 0.7,
+         "top_k": 40,
+         "top_p": 0.9,
+         "repeat_penalty": 1.1,
+         "repeat_last_n": 64,
+         "mirostat_mode": 0,
+         "mirostat_tau": 5.0,
+         "mirostat_eta": 0.1,
+         "grammar_file": None,
+     }
+
+     def __init__(self,
+                  model_path: str,
+                  config: Optional[Dict[str, Any]] = None,
+                  lollms_paths: Optional[Dict[str, str]] = None,
+                  default_completion_format: ELF_COMPLETION_FORMAT = ELF_COMPLETION_FORMAT.Chat,
+                  **kwargs
+                  ):
+
+         super().__init__(binding_name=BindingName)
+
+         if Llama is None: # Check if import failed
+             raise ImportError("Llama-cpp-python library is not available. Please install it.")
+
+         self.model_path = model_path
+         self.default_completion_format = default_completion_format
+         self.lollms_paths = lollms_paths if lollms_paths else {}
+
+         self.llama_config = {**self.DEFAULT_CONFIG, **(config or {}), **kwargs}
+
+         self.model: Optional[Llama] = None
+         self.grammar: Optional[LlamaGrammar] = None
+
+         # Resolve and load grammar if specified
+         self._load_grammar_from_config()
+
+         # Attempt to load the model
+         self.load_model(self.model_path)
+
+     def _load_grammar_from_config(self):
+         grammar_file_path = self.llama_config.get("grammar_file")
+         if grammar_file_path:
+             full_grammar_path = grammar_file_path
+             if self.lollms_paths.get('grammars_path') and not os.path.isabs(grammar_file_path):
+                 full_grammar_path = os.path.join(self.lollms_paths['grammars_path'], grammar_file_path)
+
+             if os.path.exists(full_grammar_path):
+                 try:
+                     self.grammar = LlamaGrammar.from_file(full_grammar_path)
+                     ASCIIColors.info(f"Loaded GBNF grammar from: {full_grammar_path}")
+                 except Exception as e:
+                     ASCIIColors.warning(f"Failed to load GBNF grammar from {full_grammar_path}: {e}")
+             else:
+                 ASCIIColors.warning(f"Grammar file not found: {full_grammar_path}")
+
+     def load_model(self, model_path: str) -> bool:
+         self.model_path = model_path
+         resolved_model_path = self.model_path
+         if not os.path.exists(resolved_model_path):
+             models_base_path = self.lollms_paths.get('personal_models_path', self.lollms_paths.get('models_zoo_path'))
+             if models_base_path:
+                 # Assuming model_path might be relative to a binding-specific folder within models_base_path
+                 # e.g. models_zoo_path/llamacpp/model_name.gguf
+                 # Or it could be directly models_zoo_path/model_name.gguf
+                 potential_path_direct = os.path.join(models_base_path, self.model_path)
+                 potential_path_binding_specific = os.path.join(models_base_path, self.binding_name.lower(), self.model_path)
+
+                 if os.path.exists(potential_path_direct):
+                     resolved_model_path = potential_path_direct
+                 elif os.path.exists(potential_path_binding_specific):
+                     resolved_model_path = potential_path_binding_specific
+                 else:
+                     raise FileNotFoundError(f"Model file '{self.model_path}' not found directly or in model paths: '{potential_path_direct}', '{potential_path_binding_specific}'")
+             else:
+                 raise FileNotFoundError(f"Model file not found: {self.model_path}")
+
+         ASCIIColors.info(f"Attempting to load GGUF model from: {resolved_model_path}")
+         self.model_path = resolved_model_path # Update to resolved path
+
+         llama_constructor_keys = [
+             "n_gpu_layers", "main_gpu", "tensor_split", "vocab_only", "use_mmap", "use_mlock",
+             "seed", "n_ctx", "n_batch", "n_threads", "n_threads_batch",
+             "rope_scaling_type", "rope_freq_base", "rope_freq_scale",
+             "yarn_ext_factor", "yarn_attn_factor", "yarn_beta_fast", "yarn_beta_slow", "yarn_orig_ctx",
+             "logits_all", "embedding", "verbose", "chat_format", "clip_model_path"
+         ]
+         constructor_params = {k: self.llama_config[k] for k in llama_constructor_keys if k in self.llama_config and self.llama_config[k] is not None}
+
+         # Ensure seed is int
+         if "seed" in constructor_params and not isinstance(constructor_params["seed"], int):
+             constructor_params["seed"] = int(self.llama_config.get("seed", self.DEFAULT_CONFIG["seed"]))
+
+         if "n_ctx" in constructor_params: constructor_params["n_ctx"] = int(constructor_params["n_ctx"])
+
+         if "verbose" in constructor_params and isinstance(constructor_params["verbose"], str):
+             constructor_params["verbose"] = constructor_params["verbose"].lower() in ["true", "1", "yes"]
+
+         # Resolve clip_model_path for LLaVA if relative
+         if constructor_params.get("clip_model_path") and self.lollms_paths.get('personal_models_path'):
+             clip_path = constructor_params["clip_model_path"]
+             if not os.path.isabs(clip_path):
+                 # Try resolving relative to where main model was found or standard models path
+                 model_dir = os.path.dirname(self.model_path)
+                 potential_clip_path1 = os.path.join(model_dir, clip_path)
+                 potential_clip_path2 = os.path.join(self.lollms_paths['personal_models_path'], clip_path)
+                 potential_clip_path3 = os.path.join(self.lollms_paths.get('models_zoo_path', ''), clip_path)
+
+                 if os.path.exists(potential_clip_path1):
+                     constructor_params["clip_model_path"] = potential_clip_path1
+                 elif os.path.exists(potential_clip_path2):
+                     constructor_params["clip_model_path"] = potential_clip_path2
+                 elif self.lollms_paths.get('models_zoo_path') and os.path.exists(potential_clip_path3):
+                     constructor_params["clip_model_path"] = potential_clip_path3
+                 else:
+                     ASCIIColors.warning(f"LLaVA clip_model_path '{clip_path}' not found at various potential locations.")
+
+
+         ASCIIColors.info(f"Llama.cpp constructor parameters: {constructor_params}")
+         try:
+             if constructor_params.get("verbose", False) and LogStdErrToPython:
+                 LogStdErrToPython()
+             self.model = Llama(model_path=self.model_path, **constructor_params)
+             ASCIIColors.green("GGUF Model loaded successfully.")
+             self.llama_config["n_ctx"] = self.model.context_params.n_ctx # Update n_ctx from loaded model
+             return True
+         except Exception as e:
+             ASCIIColors.error(f"Failed to load GGUF model {self.model_path}: {e}")
+             trace_exception(e)
+             self.model = None
+             raise RuntimeError(f"Failed to load GGUF model {self.model_path}") from e
+
+     def generate_text(self,
+                       prompt: str,
+                       images: Optional[List[str]] = None,
+                       system_prompt: str = "",
+                       n_predict: Optional[int] = None,
+                       stream: bool = False,
+                       temperature: float = None,
+                       top_k: int = None,
+                       top_p: float = None,
+                       repeat_penalty: float = None,
+                       repeat_last_n: Optional[int] = None,
+                       seed: Optional[int] = None,
+                       streaming_callback: Optional[Callable[[str, int], bool]] = None,
+                       use_chat_format: bool = True,
+                       grammar: Optional[Union[str, LlamaGrammar]] = None,
+                       **generation_kwargs
+                       ) -> Union[str, Dict[str, Any]]:
+
+         if not self.model:
+             return {"status": False, "error": "Llama.cpp model not loaded."}
+
+         gen_params_from_config = {
+             k: self.llama_config.get(k) for k in [
+                 "temperature", "top_k", "top_p", "repeat_penalty",
+                 "mirostat_mode", "mirostat_tau", "mirostat_eta"
+             ]
+         }
+         # repeat_last_n is penalty_last_n for Llama.generate, repeat_penalty_last_n for create_completion/chat_completion
+         _repeat_last_n_cfg = self.llama_config.get("repeat_last_n")
+
+         # Override with call-specific parameters
+         gen_params = {
+             "temperature": temperature if temperature is not None else gen_params_from_config["temperature"],
+             "top_k": top_k if top_k is not None else gen_params_from_config["top_k"],
+             "top_p": top_p if top_p is not None else gen_params_from_config["top_p"],
+             "repeat_penalty": repeat_penalty if repeat_penalty is not None else gen_params_from_config["repeat_penalty"],
+             "mirostat_mode": gen_params_from_config["mirostat_mode"],
+             "mirostat_tau": gen_params_from_config["mirostat_tau"],
+             "mirostat_eta": gen_params_from_config["mirostat_eta"],
+         }
+         _repeat_last_n = repeat_last_n if repeat_last_n is not None else _repeat_last_n_cfg
+         if _repeat_last_n is not None:
+             gen_params["penalty_last_n"] = _repeat_last_n # For Llama.generate (legacy, less used)
+             gen_params["repeat_penalty_last_n"] = _repeat_last_n # For create_completion / create_chat_completion
+
+         if n_predict is not None: gen_params['max_tokens'] = n_predict
+         if seed is not None: gen_params['seed'] = seed
+
+         gen_params = {k: v for k, v in gen_params.items() if v is not None} # Filter None
+         gen_params.update(generation_kwargs) # Add any extra kwargs
+
+         # Handle grammar for this call
+         active_grammar = self.grammar # Model's default grammar
+         if grammar:
+             if isinstance(grammar, LlamaGrammar):
+                 active_grammar = grammar
+             elif isinstance(grammar, str): # Path to grammar file
+                 g_path = grammar
+                 if self.lollms_paths.get('grammars_path') and not os.path.isabs(g_path):
+                     g_path = os.path.join(self.lollms_paths['grammars_path'], g_path)
+                 if os.path.exists(g_path):
+                     try:
+                         active_grammar = LlamaGrammar.from_file(g_path)
+                     except Exception as e_g: ASCIIColors.warning(f"Failed to load dynamic GBNF grammar from {g_path}: {e_g}")
+                 else: ASCIIColors.warning(f"Dynamic grammar file not found: {g_path}")
+         if active_grammar: gen_params["grammar"] = active_grammar
+
+         full_response_text = ""
+         try:
+             if use_chat_format:
+                 messages = []
+                 if system_prompt and system_prompt.strip():
+                     messages.append({"role": "system", "content": system_prompt})
+
+                 user_message_content = prompt
+                 if images and LlamaChatCompletionRequestMessageImageContentPart and LlamaChatCompletionRequestMessageTextContentPart:
+                     # LLaVA format: content can be a list of text and image parts
+                     content_parts = [{"type": "text", "text": prompt}]
+                     for img_path in images:
+                         try:
+                             with open(img_path, "rb") as image_file:
+                                 encoded_string = base64.b64encode(image_file.read()).decode("utf-8")
+                             image_type = os.path.splitext(img_path)[1][1:].lower() or "png"
+                             if image_type == "jpg": image_type = "jpeg"
+                             image_uri = f"data:image/{image_type};base64,{encoded_string}"
+                             content_parts.append({"type": "image_url", "image_url": {"url": image_uri}})
+                         except Exception as e_img:
+                             ASCIIColors.error(f"Failed to process image {img_path} for LLaVA: {e_img}")
+                     user_message_content = content_parts # type: ignore
+
+                 messages.append({"role": "user", "content": user_message_content})
+
+                 response_iter = self.model.create_chat_completion(messages=messages, stream=stream, **gen_params)
+
+                 if stream:
+                     for chunk in response_iter:
+                         delta = chunk.get('choices', [{}])[0].get('delta', {})
+                         chunk_content = delta.get('content', '')
+                         if chunk_content:
+                             full_response_text += chunk_content
+                             if streaming_callback and not streaming_callback(chunk_content, MSG_TYPE.MSG_TYPE_CHUNK):
+                                 break
+                     return full_response_text
+                 else: # Not streaming (response_iter is a single dict)
+                     return response_iter.get('choices', [{}])[0].get('message', {}).get('content', '')
+             else: # Raw completion
+                 full_raw_prompt = f"{system_prompt}\n{prompt}" if system_prompt else prompt
+                 response_iter = self.model.create_completion(prompt=full_raw_prompt, stream=stream, **gen_params)
+                 if stream:
+                     for chunk in response_iter:
+                         chunk_content = chunk.get('choices', [{}])[0].get('text', '')
+                         if chunk_content:
+                             full_response_text += chunk_content
+                             if streaming_callback and not streaming_callback(chunk_content, MSG_TYPE.MSG_TYPE_CHUNK):
+                                 break
+                     return full_response_text
+                 else:
+                     return response_iter.get('choices', [{}])[0].get('text', '')
+
+         except Exception as ex:
+             error_message = f"Llama.cpp generation error: {str(ex)}"
+             trace_exception(ex)
+             return {"status": False, "error": error_message}
+
+     def tokenize(self, text: str) -> List[int]:
+         if not self.model:
+             ASCIIColors.warning("Llama.cpp model not loaded. Tokenization fallback to tiktoken.")
+             return tiktoken.model.encoding_for_model("gpt-3.5-turbo").encode(text)
+         return self.model.tokenize(text.encode("utf-8"), add_bos=False, special=False)
+
+     def detokenize(self, tokens: List[int]) -> str:
+         if not self.model:
+             ASCIIColors.warning("Llama.cpp model not loaded. Detokenization fallback to tiktoken.")
+             return tiktoken.model.encoding_for_model("gpt-3.5-turbo").decode(tokens)
+         try:
+             return self.model.detokenize(tokens).decode("utf-8", errors="ignore")
+         except Exception: # Fallback if detokenize gives non-utf8 bytes
+             return self.model.detokenize(tokens).decode("latin-1", errors="ignore")
+
+     def count_tokens(self, text: str) -> int:
+         if not self.model:
+             ASCIIColors.warning("Llama.cpp model not loaded. Token count fallback to tiktoken.")
+             return len(tiktoken.model.encoding_for_model("gpt-3.5-turbo").encode(text))
+         return len(self.tokenize(text))
+
+     def embed(self, text: str, **kwargs) -> List[float]:
+         if not self.model:
+             raise Exception("Llama.cpp model not loaded.")
+         if not self.llama_config.get("embedding"): # or not self.model.params.embedding:
+             raise Exception("Embedding support was not enabled when loading the model (set 'embedding: true' in config).")
+         try:
+             return self.model.embed(text)
+         except Exception as ex:
+             trace_exception(ex); raise Exception(f"Llama.cpp embedding failed: {str(ex)}") from ex
+
+     def get_model_info(self) -> dict:
+         if not self.model:
+             return {
+                 "name": self.binding_name, "model_path": self.model_path, "loaded": False,
+                 "error": "Model not loaded or failed to load."
+             }
+
+         is_llava_model = "llava" in self.model_path.lower() or \
+                          (self.llama_config.get("chat_format", "").startswith("llava") and \
+                           self.llama_config.get("clip_model_path") is not None)
+
+         return {
+             "name": self.binding_name, "model_path": self.model_path, "loaded": True,
+             "n_ctx": self.model.context_params.n_ctx,
+             "n_gpu_layers": self.llama_config.get("n_gpu_layers"),
+             "seed": self.llama_config.get("seed"),
+             "supports_structured_output": self.grammar is not None or self.llama_config.get("grammar_file") is not None,
+             "supports_vision": is_llava_model and LlamaChatCompletionRequestMessageImageContentPart is not None,
+             "config": self.llama_config
+         }
+
+     def listModels(self) -> List[Dict[str, str]]: # type: ignore
+         # This method is more for server-based bindings. For LlamaCpp, it describes the loaded model.
+         # It could be extended to scan lollms_paths for GGUF files.
+         if self.model:
+             return [{
+                 'model_name': os.path.basename(self.model_path), 'path': self.model_path, 'loaded': True,
+                 'n_ctx': str(self.model.context_params.n_ctx),
+                 'n_gpu_layers': str(self.llama_config.get("n_gpu_layers","N/A")),
+             }]
+         return [{'model_name': os.path.basename(self.model_path) if self.model_path else "Not specified",
+                  'path': self.model_path, 'loaded': False, 'error': "Model not loaded."}]
+
+     def unload_model(self):
+         if self.model:
+             del self.model
+             self.model = None
+             ASCIIColors.info("Llama.cpp model unloaded.")
+             # In Python, explicit memory freeing for C extensions can be tricky.
+             # `del self.model` removes the Python reference. If llama.cpp's Llama class
+             # has a proper __del__ method (it does), it should free its C resources.
+             # Forcing GC might help, but not guaranteed immediate effect.
+             # import gc; gc.collect()
+
+     def __del__(self):
+         self.unload_model()
+
+
+ if __name__ == '__main__':
+     global full_streamed_text
+     ASCIIColors.yellow("Testing PythonLlamaCppBinding...")
+
+     # --- IMPORTANT: Configure model path ---
+     # Replace with the ACTUAL PATH to your GGUF model file.
+     # e.g., gguf_model_path = "C:/Models/Mistral-7B-Instruct-v0.2-Q4_K_M.gguf"
+     # If this path is not found, a dummy GGUF will be created for basic tests.
+     gguf_model_path = "model.gguf" # <<< REPLACE THIS OR ENSURE 'model.gguf' EXISTS
+
+     # --- LLaVA Test Configuration (Optional) ---
+     # To test LLaVA, set this to your LLaVA GGUF model path
+     llava_test_model_path = None # e.g., "path/to/your/llava-v1.6-mistral-7b.Q4_K_M.gguf"
+     # And the corresponding mmproj (clip model) GGUF path
+     llava_test_clip_model_path = None # e.g., "path/to/your/mmproj-mistral7b-f16.gguf"
+     # And set the chat format for LLaVA
+     llava_chat_format = "llava-1-6" # or "llava-1-5" depending on your model
+
+     # Attempt to create a dummy GGUF if specified path doesn't exist (for placeholder testing)
+     is_dummy_model = False
+     if not os.path.exists(gguf_model_path):
+         ASCIIColors.warning(f"Model path '{gguf_model_path}' not found.")
+         ASCIIColors.warning("Creating a tiny dummy GGUF file ('dummy_model.gguf') for placeholder testing.")
+         ASCIIColors.warning("This dummy file WILL NOT WORK for actual inference.")
+         try:
+             with open("dummy_model.gguf", "wb") as f: # Minimal valid GGUF structure
+                 f.write(b"GGUF\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00")
+                 key, val = "tokenizer.ggml.model", "llama"
+                 f.write(len(key).to_bytes(8,'little')+key.encode()+b"\x02\x00\x00\x00\x00\x00\x00\x00"+len(val).to_bytes(8,'little')+val.encode())
+             gguf_model_path = "dummy_model.gguf"
+             is_dummy_model = True
+             ASCIIColors.info(f"Using dummy GGUF: {gguf_model_path}. Real inference tests will fail or be skipped.")
+         except Exception as e_dummy:
+             ASCIIColors.error(f"Could not create dummy GGUF: {e_dummy}. Please set a valid GGUF model path.")
+             sys.exit(1)
+
+     binding_config = {
+         "n_gpu_layers": 0, # 0 for CPU, -1 for all possible layers to GPU, or specific number
+         "n_ctx": 1024, # Short context for testing
+         "seed": 1234,
+         "embedding": True, # Enable embedding generation for the test
+         "verbose": False, # Set to True for detailed llama.cpp logs
+         # "grammar_file": "json.gbnf" # Example for grammar test
+     }
+
+     mock_lollms_paths = { "personal_models_path": ".", "grammars_path": "grammars_test" }
+     if not os.path.exists(mock_lollms_paths["grammars_path"]):
+         os.makedirs(mock_lollms_paths["grammars_path"], exist_ok=True)
+         with open(os.path.join(mock_lollms_paths["grammars_path"], "test_grammar.gbnf"), "w") as f:
+             f.write('root ::= ("hello" | "world")')
+
+     active_binding = None
+     try:
+         ASCIIColors.cyan("\n--- Initializing PythonLlamaCppBinding ---")
+         active_binding = PythonLlamaCppBinding(model_path=gguf_model_path, config=binding_config, lollms_paths=mock_lollms_paths)
+         ASCIIColors.green(f"Binding initialized. Model: {active_binding.model_path}")
+         ASCIIColors.info(f"Model Info: {json.dumps(active_binding.get_model_info(), indent=2)}")
+
+         if is_dummy_model:
+             ASCIIColors.warning("\nRUNNING WITH DUMMY MODEL. MOST FUNCTIONALITY TESTS WILL BE SKIPPED.")
+         else:
+             # --- List Models ---
+             ASCIIColors.cyan("\n--- Listing Models ---")
+             print(json.dumps(active_binding.listModels(), indent=2))
+
+             # --- Tokenize/Detokenize ---
+             ASCIIColors.cyan("\n--- Tokenize/Detokenize ---")
+             sample_text = "Hello, Llama.cpp world! This is a test sentence."
+             tokens = active_binding.tokenize(sample_text)
+             ASCIIColors.green(f"Tokens for '{sample_text}': {tokens[:15]}...")
+             token_count = active_binding.count_tokens(sample_text)
+             ASCIIColors.green(f"Token count: {token_count}")
+             detokenized_text = active_binding.detokenize(tokens)
+             ASCIIColors.green(f"Detokenized text: {detokenized_text}")
+             assert detokenized_text.strip() == sample_text.strip(), "Tokenization/Detokenization mismatch!"
+
+             # --- Text Generation (Non-Streaming, Chat Format) ---
+             ASCIIColors.cyan("\n--- Text Generation (Non-Streaming, Chat) ---")
+             prompt_text = "What is the capital of France?"
+             system_prompt_text = "You are a helpful geography expert."
+             generated_text = active_binding.generate_text(
+                 prompt_text, system_prompt=system_prompt_text, n_predict=30, stream=False, use_chat_format=True
+             )
+             if isinstance(generated_text, str): ASCIIColors.green(f"Generated text: {generated_text}")
+             else: ASCIIColors.error(f"Generation failed: {generated_text}")
+
+             # --- Text Generation (Streaming, Chat Format) ---
+             ASCIIColors.cyan("\n--- Text Generation (Streaming, Chat) ---")
+             full_streamed_text = ""
+             def stream_callback(chunk: str, msg_type: int):
+                 global full_streamed_text; print(f"{ASCIIColors.GREEN}{chunk}{ASCIIColors.RESET}", end="", flush=True)
+                 full_streamed_text += chunk; return True
+
+             result = active_binding.generate_text(
+                 prompt_text, system_prompt=system_prompt_text, n_predict=50, stream=True,
+                 streaming_callback=stream_callback, use_chat_format=True
+             )
+             print("\n--- End of Stream ---")
+             if isinstance(result, str): ASCIIColors.green(f"Full streamed text: {result}")
+             else: ASCIIColors.error(f"Streaming generation failed: {result}")
+
+             # --- Text Generation with Grammar ---
+             ASCIIColors.cyan("\n--- Text Generation with Grammar ---")
+             generated_grammar_text = active_binding.generate_text(
+                 "Output a greeting:", n_predict=5, stream=False, use_chat_format=False, # Grammar often better with raw completion
+                 grammar=os.path.join(mock_lollms_paths["grammars_path"], "test_grammar.gbnf")
+             )
+             if isinstance(generated_grammar_text, str):
+                 ASCIIColors.green(f"Generated text with grammar: '{generated_grammar_text.strip()}'")
+                 assert generated_grammar_text.strip().lower() in ["hello", "world"], "Grammar constraint failed!"
+             else: ASCIIColors.error(f"Grammar generation failed: {generated_grammar_text}")
+
+             # --- Embeddings ---
+             if binding_config.get("embedding"):
+                 ASCIIColors.cyan("\n--- Embeddings ---")
+                 embedding_text = "This is a test for embeddings."
+                 try:
+                     embedding_vector = active_binding.embed(embedding_text)
+                     ASCIIColors.green(f"Embedding for '{embedding_text}' (first 3 dims): {embedding_vector[:3]}...")
+                     ASCIIColors.info(f"Embedding vector dimension: {len(embedding_vector)}")
+                 except Exception as e_emb: ASCIIColors.warning(f"Could not get embedding: {e_emb}")
+             else: ASCIIColors.yellow("\n--- Embeddings Skipped (embedding: false in config) ---")
+
+         # --- LLaVA Test (if configured and real model is LLaVA) ---
+         if not is_dummy_model and llava_test_model_path and os.path.exists(llava_test_model_path) and \
+            llava_test_clip_model_path and os.path.exists(llava_test_clip_model_path) and \
+            active_binding and active_binding.model_path.lower() == llava_test_model_path.lower():
+
+             ASCIIColors.cyan("\n--- LLaVA Vision Test ---")
+             # This assumes the 'active_binding' was ALREADY loaded with the LLaVA model
+             # and its specific config (clip_model_path, chat_format="llava-1-x").
+             # If not, you'd need to unload and reload/reinitialize the binding for LLaVA.
+             if not (active_binding.llama_config.get("chat_format","").startswith("llava") and \
+                     active_binding.llama_config.get("clip_model_path")):
+                 ASCIIColors.warning("Current binding not configured for LLaVA. Skipping LLaVA test.")
+                 ASCIIColors.warning("To test LLaVA, ensure gguf_model_path points to LLaVA model and config includes 'chat_format' and 'clip_model_path'.")
+             else:
+                 dummy_image_path = "dummy_llava_image.png"
+                 try:
+                     from PIL import Image, ImageDraw
+                     img = Image.new('RGB', (200, 80), color = ('cyan'))
+                     d = ImageDraw.Draw(img); d.text((10,20), "LLaVA Test", fill=('black'))
+                     img.save(dummy_image_path)
+                     ASCIIColors.info(f"Created dummy image for LLaVA: {dummy_image_path}")
+
+                     llava_prompt = "What do you see in this image?"
+                     llava_response = active_binding.generate_text(
+                         prompt=llava_prompt, images=[dummy_image_path], n_predict=50, stream=False, use_chat_format=True
+                     )
+                     if isinstance(llava_response, str): ASCIIColors.green(f"LLaVA response: {llava_response}")
+                     else: ASCIIColors.error(f"LLaVA generation failed: {llava_response}")
+                 except ImportError: ASCIIColors.warning("Pillow not found. Cannot create dummy image for LLaVA.")
+                 except Exception as e_llava: ASCIIColors.error(f"LLaVA test error: {e_llava}"); trace_exception(e_llava)
+                 finally:
+                     if os.path.exists(dummy_image_path): os.remove(dummy_image_path)
+         elif not is_dummy_model and llava_test_model_path: # If LLaVA test paths are set but model isn't LLaVA
+             ASCIIColors.yellow(f"LLaVA test paths are set, but current model '{active_binding.model_path if active_binding else 'N/A'}' is not '{llava_test_model_path}'.")
+             ASCIIColors.yellow("Skipping LLaVA-specific test section. To run, set main gguf_model_path to LLaVA model and configure LLaVA params.")
+
+
+     except ImportError as e_imp:
+         ASCIIColors.error(f"Import error: {e_imp}. Llama-cpp-python might not be installed/configured correctly.")
+     except FileNotFoundError as e_fnf:
+         ASCIIColors.error(f"Model file error: {e_fnf}. Ensure GGUF model path is correct.")
+     except RuntimeError as e_rt:
+         ASCIIColors.error(f"Runtime error (often model load failure or llama.cpp issue): {e_rt}")
+         trace_exception(e_rt)
+     except Exception as e_main:
+         ASCIIColors.error(f"An unexpected error occurred: {e_main}")
+         trace_exception(e_main)
+     finally:
+         if active_binding:
+             ASCIIColors.cyan("\n--- Unloading Model ---")
+             active_binding.unload_model()
+             ASCIIColors.green("Model unloaded.")
+
+         if is_dummy_model and os.path.exists("dummy_model.gguf"):
+             os.remove("dummy_model.gguf")
+
+         test_grammar_file = os.path.join(mock_lollms_paths["grammars_path"], "test_grammar.gbnf")
+         if os.path.exists(test_grammar_file): os.remove(test_grammar_file)
+         if os.path.exists(mock_lollms_paths["grammars_path"]) and not os.listdir(mock_lollms_paths["grammars_path"]):
+             os.rmdir(mock_lollms_paths["grammars_path"])
+
+     ASCIIColors.yellow("\nPythonLlamaCppBinding test finished.")