lollms-client 0.15.1__py3-none-any.whl → 0.15.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of lollms-client might be problematic.

@@ -24,211 +24,156 @@ pm.ensure_packages(["requests", "pillow"]) # pillow for dummy image in test
24
24
  if not pm.is_installed("llama-cpp-binaries"):
25
25
  def install_llama_cpp():
26
26
  system = platform.system()
27
+ python_version_simple = f"py{sys.version_info.major}{sys.version_info.minor}" # e.g. py310 for 3.10
28
+
29
+ # Determine CUDA suffix based on common recent versions. Adjust if needed.
30
+ # For simplicity, we'll target a common recent CUDA version.
31
+ # Users with specific needs might need to install manually.
32
+ # As of late 2023/early 2024, cu121 or cu118 are common.
33
+ # The oobabooga binaries often use +cu124 for recent builds. Let's try that.
34
+ cuda_suffix = "+cu124"
35
+
27
36
 
28
37
  if system == "Windows":
29
- url = "https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+cu124-py3-none-win_amd64.whl"
38
+ # llama_cpp_binaries-0.12.0+cu124-py3-none-win_amd64.whl
39
+ url = f"https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0{cuda_suffix}-{python_version_simple}-none-win_amd64.whl"
40
+ fallback_url = "https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+cu124-py3-none-win_amd64.whl" # Generic py3
30
41
  elif system == "Linux":
31
- url = "https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+cu124-py3-none-linux_x86_64.whl"
42
+ # llama_cpp_binaries-0.12.0+cu124-py3-none-linux_x86_64.whl
43
+ url = f"https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0{cuda_suffix}-{python_version_simple}-none-linux_x86_64.whl"
44
+ fallback_url = "https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+cu124-py3-none-linux_x86_64.whl" # Generic py3
32
45
  else:
33
- print(f"Unsupported OS: {system}")
46
+ ASCIIColors.warning(f"Unsupported OS for prebuilt llama-cpp-binaries: {system}. Please install manually.")
34
47
  return
35
- pm.install(url)
48
+
49
+ ASCIIColors.info(f"Attempting to install llama-cpp-binaries from: {url}")
50
+ try:
51
+ pm.install(url)
52
+ except Exception as e:
53
+ ASCIIColors.warning(f"Failed to install specific version from {url}: {e}")
54
+ ASCIIColors.info(f"Attempting fallback URL: {fallback_url}")
55
+ try:
56
+ pm.install(fallback_url)
57
+ except Exception as e_fallback:
58
+ ASCIIColors.error(f"Failed to install from fallback URL {fallback_url}: {e_fallback}")
59
+ ASCIIColors.error("Please try installing llama-cpp-binaries manually, e.g., 'pip install llama-cpp-python[server]' or from a wheel.")
60
+
36
61
  install_llama_cpp()
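# Illustration (not part of the package diff): the concrete URLs the install
# helper above would try on Linux with CPython 3.11, assuming cuda_suffix stays
# "+cu124". The version-specific wheel is attempted first; the generic py3
# wheel is the fallback.
import sys
python_version_simple = f"py{sys.version_info.major}{sys.version_info.minor}"  # e.g. "py311"
url = ("https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/"
       f"llama_cpp_binaries-0.12.0+cu124-{python_version_simple}-none-linux_x86_64.whl")
fallback_url = ("https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/"
                "llama_cpp_binaries-0.12.0+cu124-py3-none-linux_x86_64.whl")
print(url)
print(fallback_url)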
37
62
 
38
63
  try:
39
64
  import llama_cpp_binaries
40
65
  except ImportError:
41
66
  ASCIIColors.error("llama-cpp-binaries package not found. Please install it.")
42
- ASCIIColors.error("You can try: pip install llama-cpp-binaries")
43
- ASCIIColors.error("Or download a wheel from: https://github.com/oobabooga/llama-cpp-binaries/releases")
67
+ ASCIIColors.error("You can try: pip install llama-cpp-python[server] (for server support)")
68
+ ASCIIColors.error("Or download a wheel from: https://github.com/oobabooga/llama-cpp-binaries/releases or https://pypi.org/project/llama-cpp-python/#files")
44
69
  llama_cpp_binaries = None
45
70
 
46
71
 
47
72
  # --- Predefined patterns ---
48
-
49
- # Quantization type strings (derived from ggml.h, llama.cpp, and common usage)
50
- # These are the "core component" strings, without separators like '.', '-', or '_'
51
73
  _QUANT_COMPONENTS_SET: Set[str] = {
52
- # K-quants (most common, often with S/M/L suffix, and now XS/XXS)
53
- "Q2_K", "Q3_K", "Q4_K", "Q5_K", "Q6_K",
54
- "Q2_K_S", "Q3_K_S", "Q4_K_S", "Q5_K_S", # No Q6_K_S usually
55
- "Q3_K_M", "Q4_K_M", "Q5_K_M", # No Q2/Q6_K_M usually
56
- "Q3_K_L", # Only Q3_K_L is common
57
- # Adding XS and XXS variants for K-quants by analogy with IQ types
58
- "Q2_K_XS", "Q3_K_XS", "Q4_K_XS", "Q5_K_XS", "Q6_K_XS",
59
- "Q2_K_XXS", "Q3_K_XXS", "Q4_K_XXS", "Q5_K_XXS", "Q6_K_XXS",
60
-
61
- # Non-K-quant legacy types
62
- "Q4_0", "Q4_1", "Q5_0", "Q5_1", "Q8_0",
63
-
64
- # Floating point types
65
- "F16", "FP16", "F32", "FP32", "BF16",
66
-
67
- # IQ (Innovative Quantization) types
68
- "IQ1_S", "IQ1_M",
69
- "IQ2_XXS", "IQ2_XS", "IQ2_S", "IQ2_M",
70
- "IQ3_XXS", "IQ3_S", "IQ3_M",
71
- "IQ4_NL", "IQ4_XS",
72
-
73
- # Newer IQ K-Quant variants (IQ types using K-quant style super-blocks)
74
- "IQ3_M_K", "IQ3_S_K", # Adding IQ3_S_K as it's plausible
75
- "IQ4_XS_K", "IQ4_NL_K", # Adding IQ4_NL_K as it's plausible
76
-
77
- # Basic integer types (less common in user-facing LLM filenames as primary quantizer)
78
- "I8", "I16", "I32",
79
-
80
- # Special GGUF type names that might appear (from ggml.c `ggml_type_name`)
81
- "ALL_F32", "MOSTLY_F16", "MOSTLY_Q4_0", "MOSTLY_Q4_1", "MOSTLY_Q5_0", "MOSTLY_Q5_1",
82
- "MOSTLY_Q8_0",
83
- "MOSTLY_Q2_K", "MOSTLY_Q3_K_S", "MOSTLY_Q3_K_M", "MOSTLY_Q3_K_L",
74
+ "Q2_K", "Q3_K", "Q4_K", "Q5_K", "Q6_K", "Q2_K_S", "Q3_K_S", "Q4_K_S", "Q5_K_S",
75
+ "Q3_K_M", "Q4_K_M", "Q5_K_M", "Q3_K_L", "Q2_K_XS", "Q3_K_XS", "Q4_K_XS", "Q5_K_XS", "Q6_K_XS",
76
+ "Q2_K_XXS", "Q3_K_XXS", "Q4_K_XXS", "Q5_K_XXS", "Q6_K_XXS", "Q4_0", "Q4_1", "Q5_0", "Q5_1", "Q8_0",
77
+ "F16", "FP16", "F32", "FP32", "BF16", "IQ1_S", "IQ1_M", "IQ2_XXS", "IQ2_XS", "IQ2_S", "IQ2_M",
78
+ "IQ3_XXS", "IQ3_S", "IQ3_M", "IQ4_NL", "IQ4_XS", "IQ3_M_K", "IQ3_S_K", "IQ4_XS_K", "IQ4_NL_K",
79
+ "I8", "I16", "I32", "ALL_F32", "MOSTLY_F16", "MOSTLY_Q4_0", "MOSTLY_Q4_1", "MOSTLY_Q5_0", "MOSTLY_Q5_1",
80
+ "MOSTLY_Q8_0", "MOSTLY_Q2_K", "MOSTLY_Q3_K_S", "MOSTLY_Q3_K_M", "MOSTLY_Q3_K_L",
84
81
  "MOSTLY_Q4_K_S", "MOSTLY_Q4_K_M", "MOSTLY_Q5_K_S", "MOSTLY_Q5_K_M", "MOSTLY_Q6_K",
85
- "MOSTLY_IQ1_S", "MOSTLY_IQ1_M", # Adding these
86
- "MOSTLY_IQ2_XXS", "MOSTLY_IQ2_XS", "MOSTLY_IQ2_S", "MOSTLY_IQ2_M",
87
- "MOSTLY_IQ3_XXS", "MOSTLY_IQ3_S", "MOSTLY_IQ3_M", # Adding IQ3_M, IQ3_S
88
- "MOSTLY_IQ4_NL", "MOSTLY_IQ4_XS"
82
+ "MOSTLY_IQ1_S", "MOSTLY_IQ1_M", "MOSTLY_IQ2_XXS", "MOSTLY_IQ2_XS", "MOSTLY_IQ2_S", "MOSTLY_IQ2_M",
83
+ "MOSTLY_IQ3_XXS", "MOSTLY_IQ3_S", "MOSTLY_IQ3_M", "MOSTLY_IQ4_NL", "MOSTLY_IQ4_XS"
89
84
  }
90
-
91
- # Common descriptive suffixes for model names
92
85
  _MODEL_NAME_SUFFIX_COMPONENTS_SET: Set[str] = {
93
86
  "instruct", "chat", "GGUF", "HF", "ggml", "pytorch", "AWQ", "GPTQ", "EXL2",
94
- "base", "cont", "continue", "ft", # Fine-tuning related
95
- "v0.1", "v0.2", "v1.0", "v1.1", "v1.5", "v1.6", "v2.0", # Common version tags if they are truly suffixes
96
- # Be cautious with general version numbers (e.g., "v1", "v2") or model sizes (e.g., "7b")
97
- # as they are often integral parts of the base name. Only add if they are
98
- # *always* extraneous suffixes in your context.
99
- # The ones above are more specific and often appear as full suffix components.
87
+ "base", "cont", "continue", "ft", "v0.1", "v0.2", "v1.0", "v1.1", "v1.5", "v1.6", "v2.0"
100
88
  }
101
-
102
- # Combine, ensure uniqueness by using sets, then sort by length descending.
103
- # Sorting ensures longer patterns (e.g., "Q4_K_M") are checked before
104
- # shorter sub-patterns (e.g., "Q4_K" or "K_M").
105
89
  _ALL_REMOVABLE_COMPONENTS: List[str] = sorted(
106
- list(_QUANT_COMPONENTS_SET.union(_MODEL_NAME_SUFFIX_COMPONENTS_SET)),
107
- key=len,
108
- reverse=True
90
+ list(_QUANT_COMPONENTS_SET.union(_MODEL_NAME_SUFFIX_COMPONENTS_SET)), key=len, reverse=True
109
91
  )
110
92
 
111
93
  def get_gguf_model_base_name(file_path_or_name: Union[str, Path]) -> str:
112
- """
113
- Extracts a base model name from a GGUF filename or path by removing
114
- the .gguf extension and then iteratively stripping known quantization
115
- patterns and common descriptive suffixes from the end of the name.
116
-
117
- The stripping is case-insensitive and checks for patterns preceded
118
- by '.', '-', or '_'.
119
-
120
- Args:
121
- file_path_or_name: The file path (as a string or Path object)
122
- or just the filename string.
123
-
124
- Returns:
125
- The derived base model name string.
126
- """
127
- if isinstance(file_path_or_name, str):
128
- p = Path(file_path_or_name)
129
- elif isinstance(file_path_or_name, Path):
130
- p = file_path_or_name
131
- else:
132
- raise TypeError(
133
- "Input must be a string or Path object. "
134
- f"Got: {type(file_path_or_name)}"
135
- )
136
-
137
- name_part = p.name # Full filename, e.g., "MyModel-7B-chat.Q4_K_M.gguf"
138
-
139
- # 1. Remove .gguf extension (case-insensitive)
140
- if name_part.lower().endswith(".gguf"):
141
- name_part = name_part[:-5] # Remove last 5 chars: ".gguf"
142
-
143
- # 2. Iteratively strip known components (quantization, common suffixes)
144
- # These components are usually preceded by '.', '-', or '_'
94
+ if isinstance(file_path_or_name, str): p = Path(file_path_or_name)
95
+ elif isinstance(file_path_or_name, Path): p = file_path_or_name
96
+ else: raise TypeError(f"Input must be a string or Path object. Got: {type(file_path_or_name)}")
97
+ name_part = p.stem if p.suffix.lower() == ".gguf" else p.name
98
+ if name_part.lower().endswith(".gguf"): name_part = name_part[:-5]
145
99
  while True:
146
100
  original_name_part_len = len(name_part)
147
101
  stripped_in_this_iteration = False
148
-
149
102
  for component in _ALL_REMOVABLE_COMPONENTS:
150
103
  component_lower = component.lower()
151
- # Check for patterns like ".component", "-component", or "_component"
152
104
  for separator in [".", "-", "_"]:
153
105
  pattern_to_check = f"{separator}{component_lower}"
154
106
  if name_part.lower().endswith(pattern_to_check):
155
- # Remove from the original-case name_part
156
107
  name_part = name_part[:-(len(pattern_to_check))]
157
- stripped_in_this_iteration = True
158
- break # Break from separator loop
159
- if stripped_in_this_iteration:
160
- break # Break from component loop (found a match, restart while loop with shorter name_part)
161
-
162
- # If no component was stripped in a full pass through _ALL_REMOVABLE_COMPONENTS,
163
- # or if name_part became empty, we're done.
164
- if not stripped_in_this_iteration or not name_part:
165
- break
166
-
167
- # 3. Final cleanup: remove trailing separators if any are left after stripping
168
- while name_part and name_part[-1] in ['.', '-', '_']:
169
- name_part = name_part[:-1]
170
-
108
+ stripped_in_this_iteration = True; break
109
+ if stripped_in_this_iteration: break
110
+ if not stripped_in_this_iteration or not name_part: break
111
+ while name_part and name_part[-1] in ['.', '-', '_']: name_part = name_part[:-1]
171
112
  return name_part
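# Usage sketch (not part of the package diff): with get_gguf_model_base_name
# defined as above, trailing quantization tags and known suffixes are stripped
# case-insensitively, while the same tokens elsewhere in the name are kept.
print(get_gguf_model_base_name("Mistral-7B-Instruct-v0.2.Q4_K_M.gguf"))
# -> "Mistral-7B"      (".Q4_K_M", "-v0.2" and "-Instruct" are stripped in turn)
print(get_gguf_model_base_name(Path("/models/llava-v1.5-7b.Q5_K_S.gguf")))
# -> "llava-v1.5-7b"   ("v1.5" is not at the end of the name, so it is kept)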
172
113
 
114
+ # --- Global Server Registry ---
115
+ _active_servers: Dict[tuple, 'LlamaCppServerProcess'] = {}
116
+ _server_ref_counts: Dict[tuple, int] = {}
117
+ _server_registry_lock = threading.Lock()
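# Sketch (not part of the package diff) of the ref-counting pattern these three
# globals implement: servers are cached per (model_path, clip_model_path) key,
# each binding that reuses a server bumps the count, and the last release shuts
# the shared process down. The real logic lives in load_model() and
# _release_server_instance() below; `factory` here is a stand-in for starting
# a LlamaCppServerProcess.
import threading
from typing import Any, Callable, Dict

_demo_servers: Dict[tuple, Any] = {}
_demo_refs: Dict[tuple, int] = {}
_demo_lock = threading.Lock()

def acquire_server(key: tuple, factory: Callable[[], Any]) -> Any:
    with _demo_lock:
        if key not in _demo_servers:
            _demo_servers[key] = factory()   # start a new shared server
            _demo_refs[key] = 0
        _demo_refs[key] += 1
        return _demo_servers[key]

def release_server(key: tuple) -> None:
    with _demo_lock:
        _demo_refs[key] = _demo_refs.get(key, 1) - 1
        if _demo_refs[key] <= 0:
            server = _demo_servers.pop(key, None)
            _demo_refs.pop(key, None)
            if server is not None and hasattr(server, "shutdown"):
                server.shutdown()            # last user stops the shared server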
173
118
 
174
119
  BindingName = "LlamaCppServerBinding"
175
120
  DEFAULT_LLAMACPP_SERVER_HOST = "127.0.0.1"
176
- DEFAULT_LLAMACPP_SERVER_PORT = 9641
177
- # Based on the LlamaServer class provided in the prompt
121
+ # Port is now dynamic, this constant is less critical for direct use but good for reference.
122
+ # DEFAULT_LLAMACPP_SERVER_PORT = 9641
123
+
178
124
  class LlamaCppServerProcess:
179
- def __init__(self, model_path: str|Path, clip_model_path: str = None, server_binary_path: str=None, port: int=None, server_args: Dict[str, Any]={}):
125
+ def __init__(self, model_path: Union[str, Path], clip_model_path: Optional[Union[str, Path]] = None, server_binary_path: Optional[Union[str, Path]]=None, server_args: Dict[str, Any]={}):
180
126
  self.model_path = Path(model_path)
181
- self.clip_model_path = clip_model_path
182
- self.server_binary_path = Path(server_binary_path)
183
- if self.server_binary_path is None:
184
- self.server_binary_path = llama_cpp_binaries.get_binary_path()
185
- self.port = port if port else DEFAULT_LLAMACPP_SERVER_PORT
127
+ self.clip_model_path = Path(clip_model_path) if clip_model_path else None
128
+
129
+ if server_binary_path:
130
+ self.server_binary_path = Path(server_binary_path)
131
+ elif llama_cpp_binaries:
132
+ self.server_binary_path = Path(llama_cpp_binaries.get_binary_path())
133
+ else:
134
+ raise FileNotFoundError("llama_cpp_binaries not found and no server_binary_path provided.")
135
+
136
+ self.port: Optional[int] = None # Set by start() method
186
137
  self.server_args = server_args
187
138
  self.process: Optional[subprocess.Popen] = None
188
139
  self.session = requests.Session()
189
- self.host = DEFAULT_LLAMACPP_SERVER_HOST
190
- self.base_url = f"http://{self.host}:{self.port}"
140
+ self.host = self.server_args.get("host",DEFAULT_LLAMACPP_SERVER_HOST)
141
+ self.base_url: Optional[str] = None # Set by start() method
191
142
  self.is_healthy = False
192
- self._stderr_lines = [] # Store last few stderr lines for debugging
193
- self._stderr_thread = None
143
+ self._stderr_lines: List[str] = []
144
+ self._stderr_thread: Optional[threading.Thread] = None
194
145
 
195
146
  if not self.model_path.exists():
196
147
  raise FileNotFoundError(f"Model file not found: {self.model_path}")
148
+ if self.clip_model_path and not self.clip_model_path.exists():
149
+ ASCIIColors.warning(f"Clip model file '{self.clip_model_path}' not found. Vision features may not work or may use a different auto-detected clip model.")
197
150
  if not self.server_binary_path.exists():
198
151
  raise FileNotFoundError(f"Llama.cpp server binary not found: {self.server_binary_path}")
199
152
 
200
- self._start_server()
201
-
202
153
  def _filter_stderr(self, stderr_pipe):
203
154
  try:
204
155
  for line in iter(stderr_pipe.readline, ''):
205
156
  if line:
206
157
  self._stderr_lines.append(line.strip())
207
- if len(self._stderr_lines) > 50: # Keep last 50 lines
208
- self._stderr_lines.pop(0)
209
- # Simple progress or key info logging
158
+ if len(self._stderr_lines) > 50: self._stderr_lines.pop(0)
210
159
  if "llama_model_loaded" in line or "error" in line.lower() or "failed" in line.lower():
211
- ASCIIColors.debug(f"[LLAMA_SERVER_STDERR] {line.strip()}")
212
- elif "running" in line and "port" in line: # Server startup message
213
- ASCIIColors.info(f"[LLAMA_SERVER_STDERR] {line.strip()}")
214
-
215
- except ValueError: # Pipe closed
216
- pass
217
- except Exception as e:
218
- ASCIIColors.warning(f"Exception in stderr filter thread: {e}")
219
-
220
-
221
- def _start_server(self, is_embedding=False):
160
+ ASCIIColors.debug(f"[LLAMA_SERVER_STDERR:{self.port}] {line.strip()}")
161
+ elif "running on port" in line: # Server startup message
162
+ ASCIIColors.info(f"[LLAMA_SERVER_STDERR:{self.port}] {line.strip()}")
163
+ except ValueError: pass
164
+ except Exception as e: ASCIIColors.warning(f"Exception in stderr filter thread for port {self.port}: {e}")
165
+
166
+ def start(self, port_to_use: int):
167
+ self.port = port_to_use
168
+ self.base_url = f"http://{self.host}:{self.port}"
169
+
222
170
  cmd = [
223
171
  str(self.server_binary_path),
224
172
  "--model", str(self.model_path),
225
173
  "--host", self.host,
226
174
  "--port", str(self.port),
227
- # Add other common defaults or arguments from self.server_args
228
175
  ]
229
176
 
230
- # Common arguments mapping from LlamaCppBinding to server CLI args
231
- # (This needs to be kept in sync with llama.cpp server's CLI)
232
177
  arg_map = {
233
178
  "n_ctx": "--ctx-size", "n_gpu_layers": "--gpu-layers", "main_gpu": "--main-gpu",
234
179
  "tensor_split": "--tensor-split", "use_mmap": (lambda v: ["--no-mmap"] if not v else []),
@@ -236,446 +181,356 @@ class LlamaCppServerProcess:
236
181
  "n_batch": "--batch-size", "n_threads": "--threads", "n_threads_batch": "--threads-batch",
237
182
  "rope_scaling_type": "--rope-scaling", "rope_freq_base": "--rope-freq-base",
238
183
  "rope_freq_scale": "--rope-freq-scale",
239
- "embedding": (lambda v: ["--embedding"] if is_embedding else []), # Server needs to be started with embedding support
184
+ "embedding": (lambda v: ["--embedding"] if v else []),
240
185
  "verbose": (lambda v: ["--verbose"] if v else []),
241
- "chat_template": "--chat-template", # For newer servers if they support jinja chat templates
242
- # Old llama.cpp server used --chatml or specific format flags
186
+ "chat_template": "--chat-template",
187
+ "parallel_slots": "--parallel", # Number of parallel processing slots
243
188
  }
244
189
 
245
- # For LLaVA, specific args are needed
246
- if self.clip_model_path:
190
+ if self.clip_model_path: # This should be the actual path resolved by the binding
247
191
  cmd.extend(["--mmproj", str(self.clip_model_path)])
248
- # The server might automatically detect LLaVA chat format or need a specific flag
249
- # e.g., --chat-template llava-1.5 (if server supports templates)
250
- # For older servers, a specific chat format flag like --chatml with LLaVA prompt structure was used.
251
- # The server from llama-cpp-binaries is usually quite up-to-date.
252
192
 
253
193
  for key, cli_arg in arg_map.items():
254
194
  val = self.server_args.get(key)
255
195
  if val is not None:
256
- if callable(cli_arg): # For args like --no-mmap
257
- cmd.extend(cli_arg(val))
258
- else:
259
- cmd.extend([cli_arg, str(val)])
196
+ if callable(cli_arg): cmd.extend(cli_arg(val))
197
+ else: cmd.extend([cli_arg, str(val)])
260
198
 
261
- # Add any extra CLI flags directly
262
199
  extra_cli_flags = self.server_args.get("extra_cli_flags", [])
263
- if isinstance(extra_cli_flags, str): # If it's a string, split it
264
- extra_cli_flags = extra_cli_flags.split()
200
+ if isinstance(extra_cli_flags, str): extra_cli_flags = extra_cli_flags.split()
265
201
  cmd.extend(extra_cli_flags)
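# Illustration (not part of the package diff): for a hypothetical
#   server_args = {"n_ctx": 8192, "n_gpu_layers": 35, "use_mmap": False,
#                  "embedding": True, "parallel_slots": 4}
# the arg_map loop above extends cmd with:
#   --ctx-size 8192 --gpu-layers 35 --no-mmap --embedding --parallel 4
# Keys whose value is None are skipped entirely.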
266
202
 
267
-
268
- ASCIIColors.info(f"Starting Llama.cpp server with command: {' '.join(cmd)}")
203
+ ASCIIColors.info(f"Starting Llama.cpp server ({' '.join(cmd)})")
269
204
 
270
- # Prevent paths with spaces from breaking the command on some OS, though Popen usually handles this.
271
- # For safety, ensure paths are quoted if necessary, or rely on Popen's list-based command.
272
-
273
205
  env = os.environ.copy()
274
- # On Linux, it might be necessary to set LD_LIBRARY_PATH if server binary has shared lib dependencies in its folder
275
206
  if os.name == 'posix' and self.server_binary_path.parent != Path('.'):
276
207
  lib_path_str = str(self.server_binary_path.parent.resolve())
277
208
  current_ld_path = env.get('LD_LIBRARY_PATH', '')
278
- if current_ld_path:
279
- env['LD_LIBRARY_PATH'] = f"{lib_path_str}:{current_ld_path}"
280
- else:
281
- env['LD_LIBRARY_PATH'] = lib_path_str
209
+ env['LD_LIBRARY_PATH'] = f"{lib_path_str}:{current_ld_path}" if current_ld_path else lib_path_str
282
210
 
283
211
  try:
284
- ASCIIColors.green(f"running server: {' '.join(cmd)}")
285
- self.process = subprocess.Popen(
286
- cmd,
287
- stderr=subprocess.PIPE,
288
- stdout=subprocess.PIPE, # Capture stdout as well for debugging
289
- text=True,
290
- bufsize=1, # Line buffered
291
- env=env
292
- )
212
+ self.process = subprocess.Popen(cmd, stderr=subprocess.PIPE, stdout=subprocess.PIPE, text=True, bufsize=1, env=env)
293
213
  except Exception as e:
294
- ASCIIColors.error(f"Failed to start llama.cpp server process: {e}")
295
- trace_exception(e)
296
- raise
214
+ ASCIIColors.error(f"Failed to start llama.cpp server process on port {self.port}: {e}"); trace_exception(e); raise
297
215
 
298
- # Start stderr/stdout reading threads
299
216
  self._stderr_thread = threading.Thread(target=self._filter_stderr, args=(self.process.stderr,), daemon=True)
300
217
  self._stderr_thread.start()
301
- # self._stdout_thread = threading.Thread(target=self._filter_stderr, args=(self.process.stdout,), daemon=True) # can use same filter
302
- # self._stdout_thread.start()
303
-
304
218
 
305
- # Wait for server to be healthy
306
219
  health_url = f"{self.base_url}/health"
307
- max_wait_time = self.server_args.get("server_startup_timeout", 60) # seconds
220
+ max_wait_time = self.server_args.get("server_startup_timeout", 60)
308
221
  start_time = time.time()
309
222
 
310
223
  while time.time() - start_time < max_wait_time:
311
224
  if self.process.poll() is not None:
312
- exit_code = self.process.poll()
313
- stderr_output = "\n".join(self._stderr_lines[-10:]) # Last 10 lines
314
- raise RuntimeError(f"Llama.cpp server process terminated unexpectedly with exit code {exit_code} during startup. Stderr:\n{stderr_output}")
225
+ stderr_output = "\n".join(self._stderr_lines[-10:])
226
+ raise RuntimeError(f"Llama.cpp server (port {self.port}) terminated unexpectedly (exit code {self.process.poll()}) during startup. Stderr:\n{stderr_output}")
315
227
  try:
316
228
  response = self.session.get(health_url, timeout=2)
317
229
  if response.status_code == 200 and response.json().get("status") == "ok":
318
230
  self.is_healthy = True
319
231
  ASCIIColors.green(f"Llama.cpp server started successfully on port {self.port}.")
320
232
  return
321
- except requests.exceptions.ConnectionError:
322
- time.sleep(1) # Wait and retry
323
- except Exception as e:
324
- ASCIIColors.warning(f"Health check failed: {e}")
325
- time.sleep(1)
233
+ except requests.exceptions.ConnectionError: time.sleep(1)
234
+ except Exception as e: ASCIIColors.warning(f"Health check for port {self.port} failed: {e}"); time.sleep(1)
326
235
 
327
236
  self.is_healthy = False
328
- self.stop() # Ensure process is killed if health check failed
237
+ self.shutdown()
329
238
  stderr_output = "\n".join(self._stderr_lines[-10:])
330
239
  raise TimeoutError(f"Llama.cpp server failed to become healthy on port {self.port} within {max_wait_time}s. Stderr:\n{stderr_output}")
331
240
 
332
- def stop(self):
241
+ def shutdown(self):
333
242
  self.is_healthy = False
334
243
  if self.process:
335
- ASCIIColors.info(f"Stopping Llama.cpp server (PID: {self.process.pid})...")
244
+ ASCIIColors.info(f"Shutting down Llama.cpp server (PID: {self.process.pid} on port {self.port})...")
336
245
  try:
337
- # Try graceful termination first
338
- if os.name == 'nt': # Windows
339
- # Sending CTRL_C_EVENT to the process group might be more effective for console apps
340
- # self.process.send_signal(signal.CTRL_C_EVENT) # Requires creationflags=subprocess.CREATE_NEW_PROCESS_GROUP
341
- self.process.terminate() # For Windows, terminate is often like kill
342
- else: # POSIX
343
- self.process.terminate() # Sends SIGTERM
344
-
345
- self.process.wait(timeout=10) # Wait for graceful shutdown
246
+ if os.name == 'nt': self.process.terminate()
247
+ else: self.process.terminate()
248
+ self.process.wait(timeout=10)
346
249
  except subprocess.TimeoutExpired:
347
- ASCIIColors.warning("Llama.cpp server did not terminate gracefully, killing...")
348
- self.process.kill() # Force kill
349
- try:
350
- self.process.wait(timeout=5)
351
- except subprocess.TimeoutExpired:
352
- ASCIIColors.error("Failed to kill llama.cpp server process.")
353
- except Exception as e:
354
- ASCIIColors.error(f"Error during server stop: {e}")
250
+ ASCIIColors.warning(f"Llama.cpp server (port {self.port}) did not terminate gracefully, killing...")
251
+ self.process.kill()
252
+ try: self.process.wait(timeout=5)
253
+ except subprocess.TimeoutExpired: ASCIIColors.error(f"Failed to kill llama.cpp server process (port {self.port}).")
254
+ except Exception as e: ASCIIColors.error(f"Error during server shutdown (port {self.port}): {e}")
355
255
  finally:
356
256
  self.process = None
357
- if self._stderr_thread and self._stderr_thread.is_alive():
358
- self._stderr_thread.join(timeout=1) # Wait for thread to finish
359
- ASCIIColors.info("Llama.cpp server stopped.")
257
+ if self._stderr_thread and self._stderr_thread.is_alive(): self._stderr_thread.join(timeout=1)
258
+ ASCIIColors.info(f"Llama.cpp server on port {self.port} shut down.")
360
259
 
361
260
 
362
261
  class LlamaCppServerBinding(LollmsLLMBinding):
363
- """
364
- Binding for llama.cpp server using pre-compiled binaries.
365
- Manages a local llama.cpp server subprocess and communicates via HTTP.
366
- """
367
- # Default parameters for the llama.cpp server
368
262
  DEFAULT_SERVER_ARGS = {
369
- "n_gpu_layers": 0,
370
- "n_ctx": 128000,
371
- "n_batch": 512,
372
- "embedding": False, # Enable if embeddings are needed via /embedding or /v1/embeddings
373
- "verbose": False,
374
- "server_startup_timeout": 120, # seconds
375
- # "chat_format": "chatml", # Deprecated in favor of --chat-template, but some old servers might need it
376
- # For LLaVA
377
- # "clip_model_path": None,
378
- # "chat_template": "llava-1.5" # if server supports it. Or specific prompt structure.
263
+ "n_gpu_layers": 0, "n_ctx": 128000, "n_batch": 512,
264
+ "embedding": False, "verbose": False, "server_startup_timeout": 120,
265
+ "parallel_slots": 4, # Default parallel slots for server
379
266
  }
380
267
 
381
- def __init__(self,
382
- model_name: str, # Name of the GGUF file (e.g., "mistral-7b-instruct-v0.2.Q4_K_M.gguf")
383
- models_path: str,
384
- clip_model_name: str = None,
385
- config: Optional[Dict[str, Any]] = None, # Binding specific config from global_config.yaml
386
- default_completion_format: ELF_COMPLETION_FORMAT = ELF_COMPLETION_FORMAT.Chat,
387
- **kwargs # Overrides for server_args
388
- ):
389
-
268
+ def __init__(self, model_name: str, models_path: str, clip_model_name: Optional[str] = None,
269
+ config: Optional[Dict[str, Any]] = None, default_completion_format: ELF_COMPLETION_FORMAT = ELF_COMPLETION_FORMAT.Chat, **kwargs):
390
270
  super().__init__(binding_name=BindingName)
391
-
392
- if llama_cpp_binaries is None:
393
- raise ImportError("llama-cpp-binaries package is required but not found.")
271
+ if llama_cpp_binaries is None: raise ImportError("llama-cpp-binaries package is required but not found.")
394
272
 
395
273
  self.models_path = Path(models_path)
396
- self.model_name = model_name
397
- self.model_path = self.models_path/self.model_name
398
- self.clip_model_path = self.models_path/clip_model_name if clip_model_name else None
399
- self.default_completion_format = default_completion_format
274
+ self.user_provided_model_name = model_name # Store the name/path user gave
400
275
 
401
- self.server_args = {**self.DEFAULT_SERVER_ARGS, **(config or {})}
402
- self.server_args.update(kwargs) # Apply direct kwargs overrides
403
-
276
+ # Initial hint for clip_model_path, resolved fully in load_model
277
+ self.clip_model_path: Optional[Path] = None
278
+ if clip_model_name:
279
+ p_clip = Path(clip_model_name)
280
+ if p_clip.is_absolute() and p_clip.exists():
281
+ self.clip_model_path = p_clip
282
+ elif (self.models_path / clip_model_name).exists(): # Relative to models_path
283
+ self.clip_model_path = self.models_path / clip_model_name
284
+ else:
285
+ ASCIIColors.warning(f"Specified clip_model_name '{clip_model_name}' not found. Will rely on auto-detection if applicable.")
286
+
287
+ self.default_completion_format = default_completion_format
288
+ self.server_args = {**self.DEFAULT_SERVER_ARGS, **(config or {}), **kwargs}
404
289
  self.server_binary_path = self._get_server_binary_path()
405
- self.current_model_path: Optional[Path] = None
290
+
291
+ self.current_model_path: Optional[Path] = None # Actual resolved path of loaded model
406
292
  self.server_process: Optional[LlamaCppServerProcess] = None
407
293
  self.port: Optional[int] = None
294
+ self.server_key: Optional[tuple] = None
408
295
 
409
- # Attempt to load the model (which starts the server)
410
- self.load_model(str(self.model_path))
296
+ if not self.load_model(self.user_provided_model_name):
297
+ ASCIIColors.error(f"Initial model load for '{self.user_provided_model_name}' failed. Binding may not be functional.")
411
298
 
412
299
  def _get_server_binary_path(self) -> Path:
413
- try:
414
- # Check if a custom path is provided in config
415
- custom_path_str = self.server_args.get("llama_server_binary_path")
416
- if custom_path_str:
417
- custom_path = Path(custom_path_str)
418
- if custom_path.exists() and custom_path.is_file():
419
- ASCIIColors.info(f"Using custom llama.cpp server binary path: {custom_path}")
420
- return custom_path
421
- else:
422
- ASCIIColors.warning(f"Custom llama.cpp server binary path '{custom_path_str}' not found or not a file. Falling back.")
423
-
424
- # Default to using llama_cpp_binaries
425
- bin_path_str = llama_cpp_binaries.get_binary_path() # specify "server"
300
+ custom_path_str = self.server_args.get("llama_server_binary_path")
301
+ if custom_path_str:
302
+ custom_path = Path(custom_path_str)
303
+ if custom_path.exists() and custom_path.is_file():
304
+ ASCIIColors.info(f"Using custom llama.cpp server binary: {custom_path}"); return custom_path
305
+ else: ASCIIColors.warning(f"Custom binary '{custom_path_str}' not found. Falling back.")
306
+ if llama_cpp_binaries:
307
+ bin_path_str = llama_cpp_binaries.get_binary_path()
426
308
  if bin_path_str:
427
309
  bin_path = Path(bin_path_str)
428
310
  if bin_path.exists() and bin_path.is_file():
429
- ASCIIColors.info(f"Using llama.cpp server binary from llama-cpp-binaries: {bin_path}")
430
- return bin_path
431
-
432
- raise FileNotFoundError("Could not locate llama.cpp server binary via llama-cpp-binaries or custom path.")
433
-
434
- except Exception as e:
435
- ASCIIColors.error(f"Error getting llama.cpp server binary path: {e}")
436
- trace_exception(e)
437
- # As a last resort, try a common name in system PATH or a known location if Lollms ships one
438
- # For now, rely on llama-cpp-binaries or explicit config.
439
- raise FileNotFoundError(
440
- "Llama.cpp server binary not found. Ensure 'llama-cpp-binaries' is installed "
441
- "or provide 'llama_server_binary_path' in the binding's configuration."
442
- ) from e
443
-
444
- def _resolve_model_path(self, model_path: str) -> Path:
445
- # Search order:
446
- # 1. Absolute path
447
- # 2. Relative to binding-specific models path (e.g., personal_models_path/LlamaCppServerBinding/)
448
- # 3. Relative to personal_models_path
449
- # 4. Relative to models_zoo_path
311
+ ASCIIColors.info(f"Using binary from llama-cpp-binaries: {bin_path}"); return bin_path
312
+ raise FileNotFoundError("Llama.cpp server binary not found. Ensure 'llama-cpp-binaries' or 'llama-cpp-python[server]' is installed or provide 'llama_server_binary_path'.")
313
+
314
+ def _resolve_model_path(self, model_name_or_path: str) -> Path:
315
+ model_p = Path(model_name_or_path)
316
+ if model_p.is_absolute():
317
+ if model_p.exists(): return model_p
318
+ else: raise FileNotFoundError(f"Absolute model path specified but not found: {model_p}")
450
319
 
451
- model_p = Path(model_path)
452
- if model_p.is_absolute() and model_p.exists():
453
- return model_p
454
-
455
- paths_to_check = []
456
- binding_specific_folder_name = self.binding_name # "LlamaCppServerBinding"
457
- paths_to_check.append(self.models_path)
458
-
459
- for p in paths_to_check:
460
- if p.exists() and p.is_file():
461
- ASCIIColors.info(f"Found model at: {p}")
462
- return p
320
+ path_in_models_dir = self.models_path / model_name_or_path
321
+ if path_in_models_dir.exists() and path_in_models_dir.is_file():
322
+ ASCIIColors.info(f"Found model at: {path_in_models_dir}"); return path_in_models_dir
463
323
 
464
- raise FileNotFoundError(f"Model '{model_path}' not found in standard Lollms model paths or as an absolute path.")
324
+ raise FileNotFoundError(f"Model '{model_name_or_path}' not found as absolute path or within '{self.models_path}'.")
465
325
 
466
326
  def _find_available_port(self) -> int:
467
327
  with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
468
- s.bind(('', 0)) # Bind to port 0 to get an OS-assigned available port
469
- return s.getsockname()[1]
470
-
471
- def load_model(self, model_name: str) -> bool:
472
- resolved_path = self._resolve_model_path(model_name)
328
+ s.bind(('', 0)); return s.getsockname()[1]
329
+
330
+ def _release_server_instance(self):
331
+ if self.server_process and self.server_key:
332
+ with _server_registry_lock:
333
+ if self.server_key in _server_ref_counts:
334
+ _server_ref_counts[self.server_key] -= 1
335
+ ASCIIColors.info(f"Decremented ref count for server {self.server_key}. New count: {_server_ref_counts[self.server_key]}")
336
+ if _server_ref_counts[self.server_key] <= 0:
337
+ ASCIIColors.info(f"Ref count for server {self.server_key} is zero. Shutting it down.")
338
+ server_to_stop = _active_servers.pop(self.server_key, None)
339
+ _server_ref_counts.pop(self.server_key, None)
340
+ if server_to_stop:
341
+ try: server_to_stop.shutdown()
342
+ except Exception as e: ASCIIColors.error(f"Error shutting down server {self.server_key}: {e}")
343
+ # else: ASCIIColors.warning(f"Attempted to stop server {self.server_key} but it was not in _active_servers.") # Can be noisy
344
+ else:
345
+ ASCIIColors.warning(f"Server key {self.server_key} not in ref counts during release. Might have been shut down already.")
346
+ _active_servers.pop(self.server_key, None) # Ensure removal
473
347
 
474
- if self.server_process and self.server_process.is_healthy and self.current_model_path == resolved_path:
475
- ASCIIColors.info(f"Model '{model_name}' is already loaded and server is running.")
476
- return True
348
+ self.server_process = None
349
+ self.port = None
350
+ self.server_key = None
477
351
 
478
- if self.server_process:
479
- self.unload_model() # Stop existing server
480
352
 
481
- self.model_name = model_name # Store the name provided by user
482
- self.current_model_path = resolved_path
483
- self.port = self._find_available_port()
353
+ def load_model(self, model_name_or_path: str) -> bool:
354
+ resolved_model_path = self._resolve_model_path(model_name_or_path)
484
355
 
485
- ASCIIColors.info(f"Attempting to start Llama.cpp server for model: {self.current_model_path} on port {self.port}")
486
-
487
- # Prepare server_args specifically for this model load
488
- current_server_args = self.server_args.copy()
489
-
490
- if not self.clip_model_path:
491
- # Try to find a corresponding .mmproj file or allow user to specify in config
492
- # e.g. if model is llava-v1.5-7b.Q4_K_M.gguf, look for llava-v1.5-7b.mmproj or mmproj-modelname.gguf
493
- base_name = get_gguf_model_base_name(self.current_model_path.stem) # etc.
494
-
495
- potential_clip_paths = [
496
- self.current_model_path.parent / f"{base_name}.mmproj",
497
- self.current_model_path.parent / f"mmproj-{base_name}.gguf", # Common pattern
498
- self.current_model_path.with_suffix(".mmproj"),
356
+ # Determine the clip_model_path for this server instance
357
+ # Priority: 1. Explicit `clip_model_path` from init (if exists) 2. Auto-detection
358
+ final_clip_model_path: Optional[Path] = None
359
+ if self.clip_model_path and self.clip_model_path.exists(): # From __init__
360
+ final_clip_model_path = self.clip_model_path
361
+ ASCIIColors.info(f"Using explicitly configured LLaVA clip model: {final_clip_model_path}")
362
+ elif not self.clip_model_path or (self.clip_model_path and not self.clip_model_path.exists()): # if init path was bad or not given
363
+ if self.clip_model_path and not self.clip_model_path.exists():
364
+ ASCIIColors.warning(f"Initial clip model path '{self.clip_model_path}' not found. Attempting auto-detection.")
365
+ base_name = get_gguf_model_base_name(resolved_model_path.stem)
366
+ potential_paths = [
367
+ resolved_model_path.parent / f"{base_name}.mmproj",
368
+ resolved_model_path.parent / f"mmproj-{base_name}.gguf",
369
+ resolved_model_path.with_suffix(".mmproj"),
370
+ self.models_path / f"{base_name}.mmproj", # Check in general models dir too
371
+ self.models_path / f"mmproj-{base_name}.gguf",
499
372
  ]
500
- found_clip_path = None
501
- for p_clip in potential_clip_paths:
373
+ for p_clip in potential_paths:
502
374
  if p_clip.exists():
503
- found_clip_path = str(p_clip)
504
- ASCIIColors.info(f"Auto-detected LLaVA clip model: {found_clip_path}")
375
+ final_clip_model_path = p_clip
376
+ ASCIIColors.info(f"Auto-detected LLaVA clip model: {final_clip_model_path}")
505
377
  break
506
- if found_clip_path:
507
- self.clip_model_path = found_clip_path
508
- # Set a default LLaVA chat template if server supports it, or rely on server auto-detection
509
- #if not current_server_args.get("chat_template") and not current_server_args.get("chat_format"):
510
- # current_server_args["chat_template"] = "llava-1.5" # Common default
511
- else:
512
- ASCIIColors.warning("Vision capabilities will likely not work. Please ensure the .mmproj file is "
513
- "next to the model or specify 'clip_model_path' in binding config.")
378
+
379
+ final_clip_model_path_str = str(final_clip_model_path) if final_clip_model_path else None
380
+
381
+ # Server key based on model and essential server configurations (like clip model)
382
+ # More server_args could be added to the key if they necessitate separate server instances
383
+ # For example, different n_gpu_layers might require a server restart.
384
+ # For now, model and clip model are the main differentiators for distinct servers.
385
+ new_server_key = (str(resolved_model_path), final_clip_model_path_str)
386
+
387
+ with _server_registry_lock:
388
+ # If this binding instance is already using the exact same server, do nothing
389
+ if self.server_process and self.server_key == new_server_key and self.server_process.is_healthy:
390
+ ASCIIColors.info(f"Model '{model_name_or_path}' with clip '{final_clip_model_path_str}' is already loaded and server is healthy on port {self.port}. No change.")
391
+ return True
392
+
393
+ # If this binding was using a *different* server, release it first
394
+ if self.server_process and self.server_key != new_server_key:
395
+ ASCIIColors.info(f"Switching models. Releasing previous server: {self.server_key}")
396
+ self._release_server_instance() # This clears self.server_process, self.port, self.server_key
397
+
398
+ # Check if a suitable server already exists in the global registry
399
+ if new_server_key in _active_servers:
400
+ existing_server = _active_servers[new_server_key]
401
+ if existing_server.is_healthy:
402
+ ASCIIColors.info(f"Reusing existing healthy server for {new_server_key} on port {existing_server.port}.")
403
+ self.server_process = existing_server
404
+ self.port = existing_server.port
405
+ _server_ref_counts[new_server_key] += 1
406
+ self.current_model_path = resolved_model_path
407
+ self.clip_model_path = final_clip_model_path # Update binding's clip path
408
+ self.server_key = new_server_key
409
+ return True
410
+ else: # Found existing but unhealthy server
411
+ ASCIIColors.warning(f"Found unhealthy server for {new_server_key}. Attempting to remove and restart.")
412
+ try: existing_server.shutdown()
413
+ except Exception as e: ASCIIColors.error(f"Error shutting down unhealthy server {new_server_key}: {e}")
414
+ _active_servers.pop(new_server_key, None)
415
+ _server_ref_counts.pop(new_server_key, None)
416
+
417
+ # No suitable server found or existing was unhealthy: start a new one
418
+ ASCIIColors.info(f"Starting new server for {new_server_key}.")
419
+ self.current_model_path = resolved_model_path
420
+ self.clip_model_path = final_clip_model_path # Update binding's clip path for the new server
421
+ self.server_key = new_server_key # Set before potential failure to allow cleanup by _release_server_instance
514
422
 
423
+ new_port_for_server = self._find_available_port()
424
+
425
+ current_server_args_for_new_server = self.server_args.copy()
426
+ # Ensure parallel_slots is set; it's crucial for shared servers
427
+ if "parallel_slots" not in current_server_args_for_new_server or not isinstance(current_server_args_for_new_server["parallel_slots"], int) or current_server_args_for_new_server["parallel_slots"] <=0:
428
+ current_server_args_for_new_server["parallel_slots"] = self.DEFAULT_SERVER_ARGS["parallel_slots"]
429
+
430
+ ASCIIColors.info(f"New Llama.cpp server: model={self.current_model_path}, clip={self.clip_model_path}, port={new_port_for_server}, slots={current_server_args_for_new_server['parallel_slots']}")
431
+
432
+ try:
433
+ new_server = LlamaCppServerProcess(
434
+ model_path=str(self.current_model_path),
435
+ clip_model_path=str(self.clip_model_path) if self.clip_model_path else None,
436
+ server_binary_path=str(self.server_binary_path),
437
+ server_args=current_server_args_for_new_server,
438
+ )
439
+ new_server.start(port_to_use=new_port_for_server) # Actual server start
440
+
441
+ if new_server.is_healthy:
442
+ self.server_process = new_server
443
+ self.port = new_port_for_server
444
+ _active_servers[self.server_key] = new_server
445
+ _server_ref_counts[self.server_key] = 1
446
+ ASCIIColors.green(f"New server {self.server_key} started on port {self.port}.")
447
+ return True
448
+ else: # Should have been caught by new_server.start() raising an error
449
+ ASCIIColors.error(f"New server {self.server_key} failed to become healthy (this state should be rare).")
450
+ self._release_server_instance() # Clean up registry if something went very wrong
451
+ return False
452
+ except Exception as e:
453
+ ASCIIColors.error(f"Failed to load model '{model_name_or_path}' and start server: {e}")
454
+ trace_exception(e)
455
+ self._release_server_instance() # Ensure cleanup if start failed
456
+ return False
515
457
 
516
- try:
517
- self.server_process = LlamaCppServerProcess(
518
- model_path=str(self.current_model_path),
519
- clip_model_path = str(self.clip_model_path),
520
- server_binary_path=str(self.server_binary_path),
521
- port=self.port,
522
- server_args=current_server_args,
523
- )
524
- return self.server_process.is_healthy
525
- except Exception as e:
526
- ASCIIColors.error(f"Failed to load model '{model_name}' and start server: {e}")
527
- trace_exception(e)
528
- self.server_process = None
529
- self.current_model_path = None
530
- return False
531
458
 
532
459
  def unload_model(self):
533
460
  if self.server_process:
534
- self.server_process.stop()
535
- self.server_process = None
461
+ ASCIIColors.info(f"Unloading model for binding. Current server: {self.server_key}, port: {self.port}")
462
+ self._release_server_instance() # Handles ref counting and actual shutdown if needed
463
+ else:
464
+ ASCIIColors.info("Unload_model called, but no server process was active for this binding instance.")
536
465
  self.current_model_path = None
537
- self.port = None
538
- ASCIIColors.info("Llama.cpp server and model unloaded.")
539
-
466
+ self.clip_model_path = None # Also clear the instance's clip path idea
467
+ # self.port and self.server_key are cleared by _release_server_instance
468
+
540
469
  def _get_request_url(self, endpoint: str) -> str:
541
470
  if not self.server_process or not self.server_process.is_healthy:
542
471
  raise ConnectionError("Llama.cpp server is not running or not healthy.")
543
472
  return f"{self.server_process.base_url}{endpoint}"
544
473
 
545
- def _prepare_generation_payload(self,
546
- prompt: str,
547
- system_prompt: str = "",
548
- n_predict: Optional[int] = None,
549
- temperature: float = 0.7,
550
- top_k: int = 40,
551
- top_p: float = 0.9,
552
- repeat_penalty: float = 1.1,
553
- repeat_last_n: Optional[int] = 64, # Server calls this repeat_last_n or penalty_last_n
554
- seed: Optional[int] = None,
555
- stream: bool = False,
556
- use_chat_format: bool = True, # True for /v1/chat/completions, False for /completion
557
- images: Optional[List[str]] = None,
558
- **extra_params # For things like grammar, mirostat, etc from server_args
559
- ) -> Dict:
560
-
561
- # Start with defaults from server_args, then override with call params
474
+ def _prepare_generation_payload(self, prompt: str, system_prompt: str = "", n_predict: Optional[int] = None,
475
+ temperature: float = 0.7, top_k: int = 40, top_p: float = 0.9,
476
+ repeat_penalty: float = 1.1, repeat_last_n: Optional[int] = 64,
477
+ seed: Optional[int] = None, stream: bool = False, use_chat_format: bool = True,
478
+ images: Optional[List[str]] = None, **extra_params) -> Dict:
562
479
  payload_params = {
563
- "temperature": self.server_args.get("temperature", 0.7),
564
- "top_k": self.server_args.get("top_k", 40),
565
- "top_p": self.server_args.get("top_p", 0.9),
566
- "repeat_penalty": self.server_args.get("repeat_penalty", 1.1),
567
- "repeat_last_n": self.server_args.get("repeat_last_n", 64),
568
- "mirostat": self.server_args.get("mirostat_mode", 0), # llama.cpp server uses mirostat (0=disabled, 1=v1, 2=v2)
569
- "mirostat_tau": self.server_args.get("mirostat_tau", 5.0),
570
- "mirostat_eta": self.server_args.get("mirostat_eta", 0.1),
571
- # Add other mappable params from self.server_args like min_p, typical_p, grammar etc.
480
+ "temperature": self.server_args.get("temperature", 0.7), "top_k": self.server_args.get("top_k", 40),
481
+ "top_p": self.server_args.get("top_p", 0.9), "repeat_penalty": self.server_args.get("repeat_penalty", 1.1),
482
+ "repeat_last_n": self.server_args.get("repeat_last_n", 64), "mirostat": self.server_args.get("mirostat_mode", 0),
483
+ "mirostat_tau": self.server_args.get("mirostat_tau", 5.0), "mirostat_eta": self.server_args.get("mirostat_eta", 0.1),
572
484
  }
573
- if "grammar_string" in self.server_args and self.server_args["grammar_string"]: # From config
485
+ if "grammar_string" in self.server_args and self.server_args["grammar_string"]:
574
486
  payload_params["grammar"] = self.server_args["grammar_string"]
575
487
 
576
- # Override with specific call parameters
577
- payload_params.update({
578
- "temperature": temperature, "top_k": top_k, "top_p": top_p,
579
- "repeat_penalty": repeat_penalty, "repeat_last_n": repeat_last_n,
580
- })
581
- if n_predict is not None: payload_params['n_predict'] = n_predict # Server uses n_predict
488
+ payload_params.update({"temperature": temperature, "top_k": top_k, "top_p": top_p, "repeat_penalty": repeat_penalty, "repeat_last_n": repeat_last_n})
489
+ if n_predict is not None: payload_params['n_predict'] = n_predict
582
490
  if seed is not None: payload_params['seed'] = seed
583
-
584
- # Filter None values, as server might not like them
585
491
  payload_params = {k: v for k, v in payload_params.items() if v is not None}
586
- payload_params.update(extra_params) # Add any other specific params for this call
492
+ payload_params.update(extra_params)
587
493
 
588
494
  if use_chat_format and self.default_completion_format == ELF_COMPLETION_FORMAT.Chat:
589
- # Use /v1/chat/completions format
590
495
  messages = []
591
- if system_prompt and system_prompt.strip():
592
- messages.append({"role": "system", "content": system_prompt})
593
-
496
+ if system_prompt and system_prompt.strip(): messages.append({"role": "system", "content": system_prompt})
594
497
  user_content: Union[str, List[Dict[str, Any]]] = prompt
595
- if images and self.clip_model_path: # Check if it's a LLaVA setup
498
+ if images and self.clip_model_path: # Use the binding's current clip_model_path
596
499
  image_parts = []
597
500
  for img_path in images:
598
501
  try:
599
- with open(img_path, "rb") as image_file:
600
- encoded_string = base64.b64encode(image_file.read()).decode("utf-8")
601
- image_type = Path(img_path).suffix[1:].lower() or "png"
602
- if image_type == "jpg": image_type = "jpeg"
603
- # Llama.cpp server expects image data directly for LLaVA with /completion
604
- # For /v1/chat/completions, it expects OpenAI's format for multimodal
605
- image_parts.append({
606
- "type": "image_url",
607
- "image_url": {"url": f"data:image/{image_type};base64,{encoded_string}"}
608
- })
609
- except Exception as ex:
610
- trace_exception(ex)
502
+ with open(img_path, "rb") as image_file: encoded_string = base64.b64encode(image_file.read()).decode("utf-8")
503
+ image_type = Path(img_path).suffix[1:].lower() or "png"; image_type = "jpeg" if image_type == "jpg" else image_type
504
+ image_parts.append({"type": "image_url", "image_url": {"url": f"data:image/{image_type};base64,{encoded_string}"}})
505
+ except Exception as ex: trace_exception(ex)
611
506
  user_content = [{"type": "text", "text": prompt}] + image_parts # type: ignore
612
-
613
507
  messages.append({"role": "user", "content": user_content})
614
-
615
508
  final_payload = {"messages": messages, "stream": stream, **payload_params}
616
- # n_predict is max_tokens for OpenAI API
617
- if 'n_predict' in final_payload:
618
- final_payload['max_tokens'] = final_payload.pop('n_predict')
619
-
509
+ if 'n_predict' in final_payload: final_payload['max_tokens'] = final_payload.pop('n_predict')
620
510
  return final_payload
621
511
  else:
622
- # Use /completion format (legacy or for raw text)
623
- # For LLaVA with /completion, images are typically passed in a special way in the prompt
624
- # or via an 'image_data' field if the server supports it.
625
- # The example class uses tokenized prompt for /completion.
626
- # For simplicity here, we'll send text prompt, server tokenizes.
627
- # Llama.cpp server's /completion often expects 'prompt' as string or tokens.
628
- # If images are involved with /completion, it needs specific handling.
629
- # Example: 'prompt': "USER: <image>\nWhat is this?\nASSISTANT:", 'image_data': [{'data': base64_image, 'id': 10}]
630
-
631
- full_prompt = prompt
632
- if system_prompt and system_prompt.strip():
633
- # Heuristic for instruct models, actual formatting depends on model/template
634
- full_prompt = f"{system_prompt}\n\nUSER: {prompt}\nASSISTANT:"
635
-
512
+ full_prompt = f"{system_prompt}\n\nUSER: {prompt}\nASSISTANT:" if system_prompt and system_prompt.strip() else prompt
636
513
  final_payload = {"prompt": full_prompt, "stream": stream, **payload_params}
637
-
638
- if images and self.server_args.get("clip_model_path"):
514
+ if images and self.clip_model_path: # Use binding's clip_model_path
639
515
  image_data_list = []
640
516
  for i, img_path in enumerate(images):
641
517
  try:
642
- with open(img_path, "rb") as image_file:
643
- encoded_string = base64.b64encode(image_file.read()).decode("utf-8")
644
- image_data_list.append({"data": encoded_string, "id": i + 10}) # ID needs to be > 9 for llama.cpp server
645
- except Exception as e_img:
646
- ASCIIColors.error(f"Could not encode image {img_path} for /completion: {e_img}")
647
- if image_data_list:
648
- final_payload["image_data"] = image_data_list
649
- # The prompt needs to contain placeholder like USER: <image 1>\n<prompt>\nASSISTANT:
650
- # This part is tricky and model-dependent. For now, we assume user's prompt is already formatted.
651
- # Or, the server (if new enough) might handle it with chat_template even for /completion.
652
-
518
+ with open(img_path, "rb") as image_file: encoded_string = base64.b64encode(image_file.read()).decode("utf-8")
519
+ image_data_list.append({"data": encoded_string, "id": i + 10})
520
+ except Exception as e_img: ASCIIColors.error(f"Could not encode image {img_path}: {e_img}")
521
+ if image_data_list: final_payload["image_data"] = image_data_list
653
522
  return final_payload
654
523
 
655
-
656
- def generate_text(self,
657
- prompt: str,
658
- images: Optional[List[str]] = None,
659
- system_prompt: str = "",
660
- n_predict: Optional[int] = None,
661
- stream: bool = False,
662
- temperature: float = None, # Use binding's default if None
663
- top_k: int = None,
664
- top_p: float = None,
665
- repeat_penalty: float = None,
666
- repeat_last_n: Optional[int] = None,
667
- seed: Optional[int] = None,
524
+ def generate_text(self, prompt: str, images: Optional[List[str]] = None, system_prompt: str = "",
525
+ n_predict: Optional[int] = None, stream: bool = False, temperature: float = None,
526
+ top_k: int = None, top_p: float = None, repeat_penalty: float = None,
527
+ repeat_last_n: Optional[int] = None, seed: Optional[int] = None,
668
528
  streaming_callback: Optional[Callable[[str, int], bool]] = None,
669
- use_chat_format_override: Optional[bool] = None, # Allow overriding binding's default format
670
- **generation_kwargs
671
- ) -> Union[str, Dict[str, any]]:
672
-
529
+ use_chat_format_override: Optional[bool] = None, **generation_kwargs) -> Union[str, Dict[str, any]]:
673
530
  if not self.server_process or not self.server_process.is_healthy:
674
531
  return {"status": False, "error": "Llama.cpp server is not running or not healthy."}
675
532
 
676
- _use_chat_format = use_chat_format_override if use_chat_format_override is not None \
677
- else (self.default_completion_format == ELF_COMPLETION_FORMAT.Chat)
678
-
533
+ _use_chat_format = use_chat_format_override if use_chat_format_override is not None else (self.default_completion_format == ELF_COMPLETION_FORMAT.Chat)
679
534
  payload = self._prepare_generation_payload(
680
535
  prompt=prompt, system_prompt=system_prompt, n_predict=n_predict,
681
536
  temperature=temperature if temperature is not None else self.server_args.get("temperature",0.7),
@@ -683,359 +538,331 @@ class LlamaCppServerBinding(LollmsLLMBinding):
683
538
  top_p=top_p if top_p is not None else self.server_args.get("top_p",0.9),
684
539
  repeat_penalty=repeat_penalty if repeat_penalty is not None else self.server_args.get("repeat_penalty",1.1),
685
540
  repeat_last_n=repeat_last_n if repeat_last_n is not None else self.server_args.get("repeat_last_n",64),
686
- seed=seed if seed is not None else self.server_args.get("seed", -1), # Use server's default seed if not provided
687
- stream=stream, use_chat_format=_use_chat_format, images=images,
688
- **generation_kwargs
541
+ seed=seed if seed is not None else self.server_args.get("seed", -1), stream=stream,
542
+ use_chat_format=_use_chat_format, images=images, **generation_kwargs
689
543
  )
690
-
691
544
  endpoint = "/v1/chat/completions" if _use_chat_format else "/completion"
692
545
  request_url = self._get_request_url(endpoint)
693
546
 
694
- # For debugging, print payload (excluding potentially large image data)
695
- debug_payload = {k:v for k,v in payload.items() if k not in ["image_data"]}
696
- if "messages" in debug_payload:
697
- debug_payload["messages"] = [{k:v for k,v in msg.items() if k !="content" or not isinstance(v,list) or not any("image_url" in part for part in v)} for msg in debug_payload["messages"]]
698
- ASCIIColors.debug(f"Request to {request_url} with payload: {json.dumps(debug_payload, indent=2)[:500]}...")
547
+ # Debug payload (simplified)
548
+ # debug_payload = {k:v for k,v in payload.items() if k not in ["image_data","messages"] or (k=="messages" and not any("image_url" in part for item in v for part in (item.get("content") if isinstance(item.get("content"),list) else [])))} # Complex filter for brevity
549
+ # ASCIIColors.debug(f"Request to {request_url} with payload (simplified): {json.dumps(debug_payload, indent=2)[:500]}...")
550
+
699
551
 
700
552
  full_response_text = ""
701
553
  try:
702
554
  response = self.server_process.session.post(request_url, json=payload, stream=stream, timeout=self.server_args.get("generation_timeout", 300))
703
555
  response.raise_for_status()
704
-
705
556
  if stream:
706
557
  for line in response.iter_lines():
707
558
  if not line: continue
708
559
  line_str = line.decode('utf-8').strip()
709
560
  if line_str.startswith('data: '): line_str = line_str[6:]
710
- if line_str == '[DONE]': break # OpenAI stream end
711
-
561
+ if line_str == '[DONE]': break
712
562
  try:
713
563
  chunk_data = json.loads(line_str)
714
- chunk_content = ""
715
- if _use_chat_format: # OpenAI /v1/chat/completions format
716
- delta = chunk_data.get('choices', [{}])[0].get('delta', {})
717
- chunk_content = delta.get('content', '')
718
- else: # /completion format
719
- chunk_content = chunk_data.get('content', '')
720
-
564
+ chunk_content = (chunk_data.get('choices', [{}])[0].get('delta', {}).get('content', '') if _use_chat_format
565
+ else chunk_data.get('content', ''))
721
566
  if chunk_content:
722
567
  full_response_text += chunk_content
723
568
  if streaming_callback and not streaming_callback(chunk_content, MSG_TYPE.MSG_TYPE_CHUNK):
724
- # If callback returns False, we should try to stop generation.
725
- # Llama.cpp server's /completion doesn't have a direct way to stop mid-stream via API.
726
- # Closing the connection might be the only way if server supports it.
727
- ASCIIColors.info("Streaming callback requested stop.")
728
- response.close() # Attempt to signal server by closing connection
729
- break
730
- if chunk_data.get('stop', False) or chunk_data.get('stopped_eos',False) or chunk_data.get('stopped_limit',False): # /completion specific stop flags
731
- break
732
- except json.JSONDecodeError:
733
- ASCIIColors.warning(f"Failed to decode JSON stream chunk: {line_str}")
734
- continue # Or handle error
569
+ ASCIIColors.info("Streaming callback requested stop."); response.close(); break
570
+ if chunk_data.get('stop', False) or chunk_data.get('stopped_eos',False) or chunk_data.get('stopped_limit',False): break
571
+ except json.JSONDecodeError: ASCIIColors.warning(f"Failed to decode JSON stream chunk: {line_str}"); continue
735
572
  return full_response_text
736
- else: # Not streaming
573
+ else:
737
574
  response_data = response.json()
738
- return response_data.get('choices', [{}])[0].get('message', {}).get('content', '')
739
-
575
+ return response_data.get('choices', [{}])[0].get('message', {}).get('content', '') if _use_chat_format \
576
+ else response_data.get('content','') # /completion has 'content' at top level for non-stream
740
577
  except requests.exceptions.RequestException as e:
741
578
  error_message = f"Llama.cpp server request error: {e}"
742
579
  if e.response is not None:
743
- try:
744
- error_details = e.response.json()
745
- error_message += f" - Details: {error_details.get('error', e.response.text)}"
746
- except json.JSONDecodeError:
747
- error_message += f" - Response: {e.response.text[:200]}"
580
+ try: error_details = e.response.json(); error_message += f" - Details: {error_details.get('error', e.response.text)}"
581
+ except json.JSONDecodeError: error_message += f" - Response: {e.response.text[:200]}"
748
582
  ASCIIColors.error(error_message)
749
583
  return {"status": False, "error": error_message, "details": str(e.response.text if e.response else "No response text")}
750
584
  except Exception as ex:
751
- error_message = f"Llama.cpp generation error: {str(ex)}"
752
- trace_exception(ex)
585
+ error_message = f"Llama.cpp generation error: {str(ex)}"; trace_exception(ex)
753
586
  return {"status": False, "error": error_message}
754
587
 
755
588
  def tokenize(self, text: str) -> List[int]:
756
- if not self.server_process or not self.server_process.is_healthy:
757
- raise ConnectionError("Llama.cpp server is not running.")
589
+ if not self.server_process or not self.server_process.is_healthy: raise ConnectionError("Server not running.")
758
590
  try:
759
591
  response = self.server_process.session.post(self._get_request_url("/tokenize"), json={"content": text})
760
- response.raise_for_status()
761
- return response.json().get("tokens", [])
762
- except Exception as e:
763
- ASCIIColors.error(f"Tokenization error: {e}"); trace_exception(e)
764
- return [] # Or raise
592
+ response.raise_for_status(); return response.json().get("tokens", [])
593
+ except Exception as e: ASCIIColors.error(f"Tokenization error: {e}"); trace_exception(e); return []
765
594
 
766
595
  def detokenize(self, tokens: List[int]) -> str:
767
- if not self.server_process or not self.server_process.is_healthy:
768
- raise ConnectionError("Llama.cpp server is not running.")
596
+ if not self.server_process or not self.server_process.is_healthy: raise ConnectionError("Server not running.")
769
597
  try:
770
598
  response = self.server_process.session.post(self._get_request_url("/detokenize"), json={"tokens": tokens})
771
- response.raise_for_status()
772
- return response.json().get("content", "")
773
- except Exception as e:
774
- ASCIIColors.error(f"Detokenization error: {e}"); trace_exception(e)
775
- return "" # Or raise
599
+ response.raise_for_status(); return response.json().get("content", "")
600
+ except Exception as e: ASCIIColors.error(f"Detokenization error: {e}"); trace_exception(e); return ""
776
601
 
777
- def count_tokens(self, text: str) -> int:
778
- return len(self.tokenize(text))
602
+ def count_tokens(self, text: str) -> int: return len(self.tokenize(text))
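
The tokenize/detokenize helpers above are thin wrappers over two server endpoints, so they can also be exercised directly. A minimal sketch, assuming a server already running on 127.0.0.1:8080; the "content" and "tokens" payload keys match the requests made above.

# Hedged sketch: call /tokenize and /detokenize directly (server address assumed).
import requests

base = "http://127.0.0.1:8080"
tokens = requests.post(f"{base}/tokenize", json={"content": "Hello, Llama.cpp server world!"}).json().get("tokens", [])
text_back = requests.post(f"{base}/detokenize", json={"tokens": tokens}).json().get("content", "")
print(len(tokens), repr(text_back))   # token count and the round-tripped text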
779
603
 
780
604
  def embed(self, text: str, **kwargs) -> List[float]:
781
- if not self.server_process or not self.server_process.is_healthy:
782
- raise Exception("Llama.cpp server is not running.")
783
- if not self.server_args.get("embedding"):
784
- raise Exception("Embedding support was not enabled in server_args (set 'embedding: true').")
785
-
605
+ if not self.server_process or not self.server_process.is_healthy: raise Exception("Server not running.")
606
+ if not self.server_args.get("embedding"): raise Exception("Embedding not enabled in server_args.")
786
607
  try:
787
- # llama.cpp server has /embedding endpoint (non-OpenAI) and /v1/embeddings (OpenAI-compatible)
788
- # Let's try /v1/embeddings first for compatibility
789
- payload = {"input": text}
790
- if "model" in kwargs: payload["model"] = kwargs["model"] # Can specify model if server handles multiple embedding models (unlikely for llama.cpp server)
791
-
792
- request_url = self._get_request_url("/v1/embeddings")
608
+ payload = {"input": text}; request_url = self._get_request_url("/v1/embeddings")
793
609
  response = self.server_process.session.post(request_url, json=payload)
794
-
795
- if response.status_code == 404: # Fallback to /embedding if /v1/embeddings not found
796
- ASCIIColors.debug("Trying /embedding endpoint as /v1/embeddings was not found.")
610
+ if response.status_code == 404: # Fall back to /embedding if /v1/embeddings is unavailable
797
611
  request_url = self._get_request_url("/embedding")
798
- response = self.server_process.session.post(request_url, json={"content": text}) # /embedding uses "content"
799
-
800
- response.raise_for_status()
801
- data = response.json()
802
-
803
- if "data" in data and isinstance(data["data"], list) and "embedding" in data["data"][0]: # /v1/embeddings format
804
- return data["data"][0]["embedding"]
805
- elif "embedding" in data and isinstance(data["embedding"], list): # /embedding format
806
- return data["embedding"]
807
- else:
808
- raise ValueError(f"Unexpected embedding response format: {data}")
809
-
612
+ response = self.server_process.session.post(request_url, json={"content": text})
613
+ response.raise_for_status(); data = response.json()
614
+ if "data" in data and isinstance(data["data"], list) and "embedding" in data["data"][0]: return data["data"][0]["embedding"]
615
+ elif "embedding" in data and isinstance(data["embedding"], list): return data["embedding"]
616
+ else: raise ValueError(f"Unexpected embedding response: {data}")
810
617
  except requests.exceptions.RequestException as e:
811
- err_msg = f"Llama.cpp server embedding request error: {e}"
618
+ err_msg = f"Embedding request error: {e}";
812
619
  if e.response: err_msg += f" - {e.response.text[:200]}"
813
620
  raise Exception(err_msg) from e
814
- except Exception as ex:
815
- trace_exception(ex); raise Exception(f"Llama.cpp embedding failed: {str(ex)}") from ex
621
+ except Exception as ex: trace_exception(ex); raise Exception(f"Embedding failed: {str(ex)}") from ex
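
The 404 fallback above can be reproduced standalone. A minimal sketch, assuming the server was started with embeddings enabled and listens on a known port; the two response shapes checked here are the same ones handled by the method above.

# Hedged sketch: try /v1/embeddings first, fall back to /embedding on 404.
import requests

def embed_text(text: str, base: str = "http://127.0.0.1:8080") -> list:
    resp = requests.post(f"{base}/v1/embeddings", json={"input": text})
    if resp.status_code == 404:                          # server only exposes /embedding
        resp = requests.post(f"{base}/embedding", json={"content": text})
    resp.raise_for_status()
    data = resp.json()
    if isinstance(data.get("data"), list) and data["data"] and "embedding" in data["data"][0]:
        return data["data"][0]["embedding"]              # OpenAI-compatible shape
    if isinstance(data.get("embedding"), list):
        return data["embedding"]                         # native /embedding shape
    raise ValueError(f"Unexpected embedding response: {data}")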
816
622
 
817
623
  def get_model_info(self) -> dict:
818
624
  info = {
819
625
  "name": self.binding_name,
820
- "model_name": self.model_name, # User-provided name
626
+ "user_provided_model_name": self.user_provided_model_name,
821
627
  "model_path": str(self.current_model_path) if self.current_model_path else "Not loaded",
628
+ "clip_model_path": str(self.clip_model_path) if self.clip_model_path else "N/A",
822
629
  "loaded": self.server_process is not None and self.server_process.is_healthy,
823
- "server_args": self.server_args,
824
- "port": self.port if self.port else "N/A"
630
+ "server_args": self.server_args, "port": self.port if self.port else "N/A",
631
+ "server_key": str(self.server_key) if self.server_key else "N/A",
825
632
  }
826
- if info["loaded"]:
827
- # Try to get more info from server's /props or /v1/models
633
+ if info["loaded"] and self.server_process:
828
634
  try:
829
- props_url = self._get_request_url("/props") # llama.cpp specific
830
- props_resp = self.server_process.session.get(props_url, timeout=5).json()
635
+ props_resp = self.server_process.session.get(self._get_request_url("/props"), timeout=5).json()
831
636
  info.update({
832
- "server_n_ctx": props_resp.get("default_generation_settings",{}).get("n_ctx"), # Example path
637
+ "server_n_ctx": props_resp.get("default_generation_settings",{}).get("n_ctx"),
833
638
  "server_chat_format": props_resp.get("chat_format"),
834
- "server_clip_model": props_resp.get("mmproj"),
639
+ "server_clip_model_from_props": props_resp.get("mmproj"), # Server's view of clip model
835
640
  })
836
- except Exception: pass # Ignore if /props fails or data missing
837
-
838
- is_llava = ("llava" in self.model_name.lower() or "bakllava" in self.model_name.lower()) or \
839
- (self.server_args.get("clip_model_path") is not None) or \
840
- (info.get("server_clip_model") is not None)
641
+ except Exception: pass
841
642
 
643
+ is_llava = self.clip_model_path is not None or \
644
+ (info.get("server_clip_model_from_props") is not None) or \
645
+ ("llava" in self.current_model_path.name.lower() if self.current_model_path else False)
842
646
  info["supports_vision"] = is_llava
843
647
  info["supports_structured_output"] = self.server_args.get("grammar_string") is not None
844
648
  return info
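
The /props call used in get_model_info is llama.cpp-specific and its keys vary between server versions; the paths below are taken from the code above and should be treated as assumptions for other builds. A minimal sketch:

# Hedged sketch: read server properties the same way get_model_info does (address assumed).
import requests

props = requests.get("http://127.0.0.1:8080/props", timeout=5).json()
n_ctx = props.get("default_generation_settings", {}).get("n_ctx")
chat_format = props.get("chat_format")
mmproj = props.get("mmproj")    # set when a multimodal projector (CLIP) is loaded
print(n_ctx, chat_format, mmproj)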
845
649
 
846
650
  def listModels(self) -> List[Dict[str, str]]:
847
- # This binding manages one GGUF model at a time by starting a server for it.
848
- # To "list models", we could scan the Lollms model directories for .gguf files.
849
651
  models_found = []
850
- gguf_pattern = "*.gguf"
851
-
852
- search_paths = []
853
- binding_specific_folder_name = self.binding_name
854
-
855
- search_paths.append(self.models_path)
856
-
857
652
  unique_models = set()
858
- for spath in search_paths:
859
- if spath.exists() and spath.is_dir():
860
- for model_file in spath.rglob(gguf_pattern): # rglob for recursive
861
- if model_file.is_file() and model_file.name not in unique_models:
862
- models_found.append({
863
- 'model_name': model_file.name,
864
- # Path relative to one of the main model roots for display/selection
865
- 'path_hint': str(model_file.relative_to(spath.parent) if model_file.is_relative_to(spath.parent) else model_file),
866
- 'size_gb': f"{model_file.stat().st_size / (1024**3):.2f} GB"
867
- })
868
- unique_models.add(model_file.name)
653
+ if self.models_path.exists() and self.models_path.is_dir():
654
+ for model_file in self.models_path.rglob("*.gguf"):
655
+ if model_file.is_file() and model_file.name not in unique_models:
656
+ models_found.append({
657
+ 'model_name': model_file.name,
658
+ 'path_hint': str(model_file.relative_to(self.models_path.parent) if model_file.is_relative_to(self.models_path.parent) else model_file),
659
+ 'size_gb': f"{model_file.stat().st_size / (1024**3):.2f} GB"
660
+ })
661
+ unique_models.add(model_file.name)
869
662
  return models_found
870
663
 
871
664
  def __del__(self):
872
- self.unload_model() # Ensure server is stopped when binding is deleted
665
+ self.unload_model()
873
666
 
874
667
 
875
668
  if __name__ == '__main__':
876
- global full_streamed_text
669
+ global full_streamed_text # Define for the callback
670
+ full_streamed_text = ""
877
671
  ASCIIColors.yellow("Testing LlamaCppServerBinding...")
878
672
 
879
673
  # --- Configuration ---
880
- # This should be the NAME of your GGUF model file. The binding will search for it.
881
- # e.g., "Mistral-7B-Instruct-v0.2-Q4_K_M.gguf"
882
- # Ensure this model is placed in one of the Lollms model directories.
883
- # For testing, you can put a small GGUF model in the same directory as this script
884
- # and set personal_models_path to "."
885
-
886
- # Adjust current_directory if your models are elsewhere for testing
887
- current_directory = Path(__file__).parent
888
- models_path = "E:\lollms\models\gguf\Mistral-Nemo-Instruct-2407-GGUF" #replace with your own model path
889
- model_name = "Mistral-Nemo-Instruct-2407-Q2_K.gguf"
890
-
891
- # Binding config (passed to server_args)
674
+ # This should be the NAME of your GGUF model file.
675
+ # Ensure this model is placed in your models_path directory.
676
+ # Example: models_path = "E:\\lollms\\models\\gguf" (Windows)
677
+ # model_name = "Mistral-Nemo-Instruct-2407-Q2_K.gguf"
678
+
679
+ # For CI/local testing without specific paths, you might download a tiny model
680
+ # or require user to set environment variables for these.
681
+ # For this example, replace with your actual paths/model.
682
+ try:
683
+ models_path_str = os.environ.get("LOLLMS_MODELS_PATH", str(Path(__file__).parent / "test_models"))
684
+ model_name_str = os.environ.get("LOLLMS_TEST_MODEL_GGUF", "tinyllama-1.1b-chat-v1.0.Q2_K.gguf") # A small model
685
+ llava_model_name_str = os.environ.get("LOLLMS_TEST_LLAVA_MODEL_GGUF", "llava-v1.5-7b.Q2_K.gguf") # Placeholder
686
+ llava_clip_name_str = os.environ.get("LOLLMS_TEST_LLAVA_CLIP", "mmproj-model2-q4_0.gguf") # Placeholder
687
+
688
+ models_path = Path(models_path_str)
689
+ models_path.mkdir(parents=True, exist_ok=True) # Ensure test_models dir exists
690
+
691
+ # Verify the model exists, or skip tests gracefully
692
+ test_model_path = models_path / model_name_str
693
+ if not test_model_path.exists():
694
+ ASCIIColors.warning(f"Test model {test_model_path} not found. Please place a GGUF model there or set LOLLMS_TEST_MODEL_GGUF and LOLLMS_MODELS_PATH env vars.")
695
+ ASCIIColors.warning("Some tests will be skipped.")
696
+ # sys.exit(1) # Or allow to continue with skips
697
+ primary_model_available = False
698
+ else:
699
+ primary_model_available = True
700
+
701
+ except Exception as e:
702
+ ASCIIColors.error(f"Error setting up test paths: {e}"); trace_exception(e)
703
+ sys.exit(1)
704
+
892
705
  binding_config = {
893
- "n_gpu_layers": 0, # Set to -1 or a number for GPU offload
894
- "n_ctx": 512, # Short context for testing
895
- "embedding": True, # Enable for embedding tests
896
- "verbose": False, # llama.cpp server verbose logs
897
- # "extra_cli_flags": ["--cont-batching"] # Example of extra flags
898
- "server_startup_timeout": 180 # Give more time for server to start, esp. with large models
706
+ "n_gpu_layers": 0, "n_ctx": 512, "embedding": True,
707
+ "verbose": False, "server_startup_timeout": 180, "parallel_slots": 2,
899
708
  }
900
709
 
901
- active_binding = None
710
+ active_binding1: Optional[LlamaCppServerBinding] = None
711
+ active_binding2: Optional[LlamaCppServerBinding] = None
712
+ active_binding_llava: Optional[LlamaCppServerBinding] = None
713
+
902
714
  try:
903
- ASCIIColors.cyan("\n--- Initializing LlamaCppServerBinding ---")
904
- active_binding = LlamaCppServerBinding(
905
- model_name=model_name,
906
- models_path=models_path,
907
- config=binding_config
908
- )
909
- if not active_binding.server_process or not active_binding.server_process.is_healthy:
910
- raise RuntimeError("Server process failed to start or become healthy.")
715
+ if primary_model_available:
716
+ ASCIIColors.cyan("\n--- Initializing First LlamaCppServerBinding Instance ---")
717
+ active_binding1 = LlamaCppServerBinding(
718
+ model_name=model_name_str, models_path=str(models_path), config=binding_config
719
+ )
720
+ if not active_binding1.server_process or not active_binding1.server_process.is_healthy:
721
+ raise RuntimeError("Server for binding1 failed to start or become healthy.")
722
+ ASCIIColors.green(f"Binding1 initialized. Server for '{active_binding1.current_model_path.name}' running on port {active_binding1.port}.")
723
+ ASCIIColors.info(f"Binding1 Model Info: {json.dumps(active_binding1.get_model_info(), indent=2)}")
724
+
725
+ ASCIIColors.cyan("\n--- Initializing Second LlamaCppServerBinding Instance (Same Model) ---")
726
+ active_binding2 = LlamaCppServerBinding(
727
+ model_name=model_name_str, models_path=str(models_path), config=binding_config # Same model and config
728
+ )
729
+ if not active_binding2.server_process or not active_binding2.server_process.is_healthy:
730
+ raise RuntimeError("Server for binding2 failed to start or become healthy (should reuse).")
731
+ ASCIIColors.green(f"Binding2 initialized. Server for '{active_binding2.current_model_path.name}' running on port {active_binding2.port}.")
732
+ ASCIIColors.info(f"Binding2 Model Info: {json.dumps(active_binding2.get_model_info(), indent=2)}")
911
733
 
912
- ASCIIColors.green(f"Binding initialized. Server for '{active_binding.model_name}' running on port {active_binding.port}.")
913
- ASCIIColors.info(f"Model Info: {json.dumps(active_binding.get_model_info(), indent=2)}")
734
+ if active_binding1.port != active_binding2.port:
735
+ ASCIIColors.error("ERROR: Bindings for the same model are using different ports! Server sharing failed.")
736
+ else:
737
+ ASCIIColors.green("SUCCESS: Both bindings use the same server port. Server sharing appears to work.")
738
+
739
+ # --- List Models (scans configured directories) ---
740
+ ASCIIColors.cyan("\n--- Listing Models (from search paths, using binding1) ---")
741
+ listed_models = active_binding1.listModels()
742
+ if listed_models: ASCIIColors.green(f"Found {len(listed_models)} GGUF files. First 5: {listed_models[:5]}")
743
+ else: ASCIIColors.warning("No GGUF models found in search paths.")
744
+
745
+ # --- Tokenize/Detokenize ---
746
+ ASCIIColors.cyan("\n--- Tokenize/Detokenize (using binding1) ---")
747
+ sample_text = "Hello, Llama.cpp server world!"
748
+ tokens = active_binding1.tokenize(sample_text)
749
+ ASCIIColors.green(f"Tokens for '{sample_text}': {tokens[:10]}...")
750
+ if tokens:
751
+ detokenized_text = active_binding1.detokenize(tokens)
752
+ ASCIIColors.green(f"Detokenized text: {detokenized_text}")
753
+ else: ASCIIColors.warning("Tokenization returned empty list.")
754
+
755
+ # --- Text Generation (Non-Streaming, Chat API, binding1) ---
756
+ ASCIIColors.cyan("\n--- Text Generation (Non-Streaming, Chat API, binding1) ---")
757
+ prompt_text = "What is the capital of Germany?"
758
+ generated_text = active_binding1.generate_text(prompt_text, system_prompt="Concise expert.", n_predict=20, stream=False, use_chat_format_override=True)
759
+ if isinstance(generated_text, str): ASCIIColors.green(f"Generated text (binding1): {generated_text}")
760
+ else: ASCIIColors.error(f"Generation failed (binding1): {generated_text}")
761
+
762
+ # --- Text Generation (Streaming, Completion API, binding2) ---
763
+ ASCIIColors.cyan("\n--- Text Generation (Streaming, Completion API, binding2) ---")
764
+ full_streamed_text = "" # Reset global
765
+ def stream_callback(chunk: str, msg_type: int): global full_streamed_text; ASCIIColors.green(f"{chunk}", end="", flush=True); full_streamed_text += chunk; return True
766
+
767
+ result_b2 = active_binding2.generate_text(prompt_text, system_prompt="Concise expert.", n_predict=30, stream=True, streaming_callback=stream_callback, use_chat_format_override=False)
768
+ print("\n--- End of Stream (binding2) ---")
769
+ if isinstance(result_b2, str): ASCIIColors.green(f"Full streamed text (binding2): {result_b2}")
770
+ else: ASCIIColors.error(f"Streaming generation failed (binding2): {result_b2}")
771
+
772
+ # --- Embeddings (binding1) ---
773
+ if binding_config.get("embedding"):
774
+ ASCIIColors.cyan("\n--- Embeddings (binding1) ---")
775
+ try:
776
+ embedding_vector = active_binding1.embed("Test embedding.")
777
+ ASCIIColors.green(f"Embedding (first 3 dims): {embedding_vector[:3]}... Dim: {len(embedding_vector)}")
778
+ except Exception as e_emb: ASCIIColors.warning(f"Could not get embedding: {e_emb}")
779
+ else: ASCIIColors.yellow("\n--- Embeddings Skipped (embedding: false) ---")
780
+
781
+ else: # primary_model_available is False
782
+ ASCIIColors.warning("Primary test model not available. Skipping most tests.")
914
783
 
915
-
916
- # --- List Models (scans configured directories) ---
917
- ASCIIColors.cyan("\n--- Listing Models (from search paths) ---")
918
- listed_models = active_binding.listModels()
919
- if listed_models:
920
- ASCIIColors.green(f"Found {len(listed_models)} GGUF files. First 5:")
921
- for m in listed_models[:5]: print(m)
922
- else: ASCIIColors.warning("No GGUF models found in search paths.")
923
-
924
- # --- Tokenize/Detokenize ---
925
- ASCIIColors.cyan("\n--- Tokenize/Detokenize ---")
926
- sample_text = "Hello, Llama.cpp server world!"
927
- tokens = active_binding.tokenize(sample_text)
928
- ASCIIColors.green(f"Tokens for '{sample_text}': {tokens[:10]}...")
929
- token_count = active_binding.count_tokens(sample_text)
930
- ASCIIColors.green(f"Token count: {token_count}")
931
- if tokens: # Only detokenize if tokenization worked
932
- detokenized_text = active_binding.detokenize(tokens)
933
- ASCIIColors.green(f"Detokenized text: {detokenized_text}")
934
- # Note: exact match might depend on BOS/EOS handling by server's tokenizer
935
- # assert detokenized_text.strip() == sample_text.strip(), "Tokenization/Detokenization mismatch!"
936
- else: ASCIIColors.warning("Tokenization returned empty list, skipping detokenization.")
937
-
938
- # --- Text Generation (Non-Streaming, Chat Format using /v1/chat/completions) ---
939
- ASCIIColors.cyan("\n--- Text Generation (Non-Streaming, Chat API) ---")
940
- prompt_text = "What is the capital of Germany?"
941
- system_prompt_text = "You are a concise geography expert."
942
- generated_text = active_binding.generate_text(
943
- prompt_text, system_prompt=system_prompt_text, n_predict=20, stream=False,
944
- use_chat_format_override=True # Force /v1/chat/completions
945
- )
946
- if isinstance(generated_text, str): ASCIIColors.green(f"Generated text: {generated_text}")
947
- else: ASCIIColors.error(f"Generation failed: {generated_text}")
948
-
949
- # --- Text Generation (Streaming, /completion API) ---
950
- ASCIIColors.cyan("\n--- Text Generation (Streaming, Completion API) ---")
951
- full_streamed_text = ""
952
- def stream_callback(chunk: str, msg_type: int):
953
- global full_streamed_text; ASCIIColors.green(f"{chunk}", end="", flush=True)
954
- full_streamed_text += chunk; return True
955
-
956
- result = active_binding.generate_text(
957
- prompt_text, system_prompt=system_prompt_text, n_predict=30, stream=True,
958
- streaming_callback=stream_callback, use_chat_format_override=False # Force /completion
959
- )
960
- print("\n--- End of Stream ---")
961
- if isinstance(result, str): ASCIIColors.green(f"Full streamed text: {result}")
962
- else: ASCIIColors.error(f"Streaming generation failed: {result}")
963
-
964
- # --- Embeddings ---
965
- if binding_config.get("embedding"):
966
- ASCIIColors.cyan("\n--- Embeddings ---")
967
- embedding_text = "Test sentence for server-based embeddings."
968
- try:
969
- embedding_vector = active_binding.embed(embedding_text)
970
- ASCIIColors.green(f"Embedding for '{embedding_text}' (first 3 dims): {embedding_vector[:3]}...")
971
- ASCIIColors.info(f"Embedding vector dimension: {len(embedding_vector)}")
972
- except Exception as e_emb: ASCIIColors.warning(f"Could not get embedding: {e_emb}")
973
- else: ASCIIColors.yellow("\n--- Embeddings Skipped (embedding: false in config) ---")
974
784
 
975
785
  # --- LLaVA Test (Conceptual - requires a LLaVA model and mmproj) ---
976
- # To test LLaVA:
977
- models_path = "E:\drumber" #replace with your own model path
978
- model_name = "llava-v1.6-mistral-7b.Q3_K_XS.gguf"
979
- model_path = Path(models_path)/model_name
980
- ASCIIColors.cyan("\n--- LLaVA Vision Test ---")
981
- dummy_image_path = Path("E:\\drumber\\drumber.png")
982
- try:
983
- from PIL import Image, ImageDraw
984
- img = Image.new('RGB', (150, 70), color = ('magenta'))
985
- d = ImageDraw.Draw(img); d.text((10,10), "Server LLaVA", fill=('white'))
986
- img.save(dummy_image_path)
987
- ASCIIColors.info(f"Created dummy image for LLaVA: {dummy_image_path}")
988
-
989
- llava_prompt = "Describe this image."
990
- # For /v1/chat/completions with LLaVA, images are passed in messages.
991
- # For /completion with LLaVA, prompt needs <image> placeholder and image_data field.
992
- llava_response = active_binding.generate_text(
993
- prompt=llava_prompt, images=[str(dummy_image_path)], n_predict=40, stream=False,
994
- use_chat_format_override=True # Use /v1/chat/completions for easier multimodal
995
- )
996
- if isinstance(llava_response, str): ASCIIColors.green(f"LLaVA response: {llava_response}")
997
- else: ASCIIColors.error(f"LLaVA generation failed: {llava_response}")
998
- except ImportError: ASCIIColors.warning("Pillow not found. Cannot create dummy image for LLaVA.")
999
- except Exception as e_llava: ASCIIColors.error(f"LLaVA test error: {e_llava}"); trace_exception(e_llava)
1000
- finally:
1001
- if dummy_image_path.exists(): dummy_image_path.unlink()
1002
-
1003
- # --- Test changing model ---
1004
- # This part is conceptual. You'd need another GGUF model file for a real test.
1005
- # For now, we'll just call load_model with the same model to test the logic.
1006
-
1007
- ASCIIColors.cyan("\n--- Testing Model Change (reloading same model) ---")
1008
- reload_success = active_binding.load_model(str(model_path))
1009
- if reload_success and active_binding.server_process and active_binding.server_process.is_healthy:
1010
- ASCIIColors.green(f"Model reloaded/re-confirmed successfully. Server on port {active_binding.port}.")
1011
- # Quick generation test after reload
1012
- reloaded_gen = active_binding.generate_text("Ping", n_predict=5, stream=False)
1013
- if isinstance(reloaded_gen, str): ASCIIColors.green(f"Post-reload ping response: {reloaded_gen.strip()}")
1014
- else: ASCIIColors.error(f"Post-reload generation failed: {reloaded_gen}")
1015
- else:
1016
- ASCIIColors.error("Failed to reload model or server not healthy after reload attempt.")
786
+ ASCIIColors.cyan("\n--- LLaVA Vision Test (if model available) ---")
787
+ llava_model_path = models_path / llava_model_name_str
788
+ llava_clip_path_actual = models_path / llava_clip_name_str # Assuming clip is in models_path too
1017
789
 
790
+ if llava_model_path.exists() and llava_clip_path_actual.exists():
791
+ dummy_image_path = models_path / "dummy_llava_image.png"
792
+ try:
793
+ from PIL import Image, ImageDraw
794
+ img = Image.new('RGB', (150, 70), color = ('magenta')); d = ImageDraw.Draw(img); d.text((10,10), "LLaVA Test", fill=('white')); img.save(dummy_image_path)
795
+ ASCIIColors.info(f"Created dummy image for LLaVA: {dummy_image_path}")
1018
796
 
1019
- except ImportError as e_imp:
1020
- ASCIIColors.error(f"Import error: {e_imp}. Ensure llama-cpp-binaries is installed.")
1021
- except FileNotFoundError as e_fnf:
1022
- ASCIIColors.error(f"File not found error: {e_fnf}. Check model or server binary paths.")
1023
- except ConnectionError as e_conn:
1024
- ASCIIColors.error(f"Connection error (server might have failed to start or is unresponsive): {e_conn}")
1025
- except RuntimeError as e_rt:
1026
- ASCIIColors.error(f"Runtime error (often server process issue): {e_rt}")
1027
- if active_binding and active_binding.server_process:
1028
- ASCIIColors.error("Last stderr lines from server:")
1029
- for line in active_binding.server_process._stderr_lines[-20:]: print(line) # Print last 20
1030
- except Exception as e_main:
1031
- ASCIIColors.error(f"An unexpected error occurred: {e_main}")
1032
- trace_exception(e_main)
1033
- finally:
1034
- if active_binding:
1035
- ASCIIColors.cyan("\n--- Unloading Model and Stopping Server ---")
1036
- active_binding.unload_model()
1037
- ASCIIColors.green("Server stopped and model unloaded.")
797
+ llava_binding_config = binding_config.copy()
798
+ # LLaVA might need a specific chat template if the server does not auto-detect it well.
799
+ # llava_binding_config["chat_template"] = "llava-1.5"
1038
800
 
801
+ active_binding_llava = LlamaCppServerBinding(
802
+ model_name=str(llava_model_path), # Pass full path for clarity in test
803
+ models_path=str(models_path),
804
+ clip_model_name=str(llava_clip_path_actual), # Pass full path for clip
805
+ config=llava_binding_config
806
+ )
807
+ if not active_binding_llava.server_process or not active_binding_llava.server_process.is_healthy:
808
+ raise RuntimeError("LLaVA server failed to start or become healthy.")
809
+ ASCIIColors.green(f"LLaVA Binding initialized. Server for '{active_binding_llava.current_model_path.name}' running on port {active_binding_llava.port}.")
810
+ ASCIIColors.info(f"LLaVA Binding Model Info: {json.dumps(active_binding_llava.get_model_info(), indent=2)}")
811
+
812
+
813
+ llava_prompt = "Describe this image."
814
+ llava_response = active_binding_llava.generate_text(
815
+ prompt=llava_prompt, images=[str(dummy_image_path)], n_predict=40, stream=False, use_chat_format_override=True
816
+ )
817
+ if isinstance(llava_response, str): ASCIIColors.green(f"LLaVA response: {llava_response}")
818
+ else: ASCIIColors.error(f"LLaVA generation failed: {llava_response}")
819
+
820
+ except ImportError: ASCIIColors.warning("Pillow not found. Cannot create dummy image for LLaVA.")
821
+ except Exception as e_llava: ASCIIColors.error(f"LLaVA test error: {e_llava}"); trace_exception(e_llava)
822
+ finally:
823
+ if dummy_image_path.exists(): dummy_image_path.unlink()
824
+ else:
825
+ ASCIIColors.warning(f"LLaVA model '{llava_model_path.name}' or clip model '{llava_clip_path_actual.name}' not found in '{models_path}'. Skipping LLaVA test.")
826
+
827
+ if primary_model_available and active_binding1:
828
+ # --- Test changing model (using binding1 to load a different or the same model) ---
829
+ ASCIIColors.cyan("\n--- Testing Model Change (binding1 reloads its model) ---")
830
+ # For a real change, use a different model name if available. Here, we reload the same model.
831
+ reload_success = active_binding1.load_model(model_name_str) # Reload original model
832
+ if reload_success and active_binding1.server_process and active_binding1.server_process.is_healthy:
833
+ ASCIIColors.green(f"Model reloaded/re-confirmed successfully by binding1. Server on port {active_binding1.port}.")
834
+ reloaded_gen = active_binding1.generate_text("Ping", n_predict=5, stream=False)
835
+ if isinstance(reloaded_gen, str): ASCIIColors.green(f"Post-reload ping (binding1): {reloaded_gen.strip()}")
836
+ else: ASCIIColors.error(f"Post-reload generation failed (binding1): {reloaded_gen}")
837
+ else:
838
+ ASCIIColors.error("Failed to reload model or server not healthy after reload attempt by binding1.")
1039
839
 
840
+ except ImportError as e_imp: ASCIIColors.error(f"Import error: {e_imp}.")
841
+ except FileNotFoundError as e_fnf: ASCIIColors.error(f"File not found error: {e_fnf}.")
842
+ except ConnectionError as e_conn: ASCIIColors.error(f"Connection error: {e_conn}")
843
+ except RuntimeError as e_rt:
844
+ ASCIIColors.error(f"Runtime error: {e_rt}")
845
+ if active_binding1 and active_binding1.server_process: ASCIIColors.error(f"Binding1 stderr:\n{active_binding1.server_process._stderr_lines[-20:]}")
846
+ if active_binding2 and active_binding2.server_process: ASCIIColors.error(f"Binding2 stderr:\n{active_binding2.server_process._stderr_lines[-20:]}")
847
+ if active_binding_llava and active_binding_llava.server_process: ASCIIColors.error(f"LLaVA Binding stderr:\n{active_binding_llava.server_process._stderr_lines[-20:]}")
848
+ except Exception as e_main: ASCIIColors.error(f"An unexpected error occurred: {e_main}"); trace_exception(e_main)
849
+ finally:
850
+ ASCIIColors.cyan("\n--- Unloading Models and Stopping Servers ---")
851
+ if active_binding1: active_binding1.unload_model(); ASCIIColors.info("Binding1 unloaded.")
852
+ if active_binding2: active_binding2.unload_model(); ASCIIColors.info("Binding2 unloaded.")
853
+ if active_binding_llava: active_binding_llava.unload_model(); ASCIIColors.info("LLaVA Binding unloaded.")
854
+
855
+ # Check if any servers remain (should be none if all bindings unloaded)
856
+ with _server_registry_lock:
857
+ if _active_servers:
858
+ ASCIIColors.warning(f"Warning: {_active_servers.keys()} servers still in registry after all known bindings unloaded.")
859
+ for key, server_proc in list(_active_servers.items()): # list() for safe iteration if modifying
860
+ ASCIIColors.info(f"Force shutting down stray server: {key}")
861
+ try: server_proc.shutdown()
862
+ except Exception as e_shutdown: ASCIIColors.error(f"Error shutting down stray server {key}: {e_shutdown}")
863
+ _active_servers.pop(key,None)
864
+ _server_ref_counts.pop(key,None)
865
+ else:
866
+ ASCIIColors.green("All servers shut down correctly.")
1040
867
 
1041
868
  ASCIIColors.yellow("\nLlamaCppServerBinding test finished.")