sentienceapi 0.92.2__py3-none-any.whl → 0.98.0__py3-none-any.whl

This diff shows the contents of publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only and reflects the changes between the two versions.

Potentially problematic release: this version of sentienceapi has been flagged as possibly problematic.
Files changed (64)
  1. sentience/__init__.py +107 -2
  2. sentience/_extension_loader.py +156 -1
  3. sentience/action_executor.py +2 -0
  4. sentience/actions.py +354 -9
  5. sentience/agent.py +4 -0
  6. sentience/agent_runtime.py +840 -0
  7. sentience/asserts/__init__.py +70 -0
  8. sentience/asserts/expect.py +621 -0
  9. sentience/asserts/query.py +383 -0
  10. sentience/async_api.py +8 -1
  11. sentience/backends/__init__.py +137 -0
  12. sentience/backends/actions.py +372 -0
  13. sentience/backends/browser_use_adapter.py +241 -0
  14. sentience/backends/cdp_backend.py +393 -0
  15. sentience/backends/exceptions.py +211 -0
  16. sentience/backends/playwright_backend.py +194 -0
  17. sentience/backends/protocol.py +216 -0
  18. sentience/backends/sentience_context.py +469 -0
  19. sentience/backends/snapshot.py +483 -0
  20. sentience/browser.py +230 -74
  21. sentience/canonicalization.py +207 -0
  22. sentience/cloud_tracing.py +65 -24
  23. sentience/constants.py +6 -0
  24. sentience/cursor_policy.py +142 -0
  25. sentience/extension/content.js +35 -0
  26. sentience/extension/injected_api.js +310 -15
  27. sentience/extension/manifest.json +1 -1
  28. sentience/extension/pkg/sentience_core.d.ts +22 -22
  29. sentience/extension/pkg/sentience_core.js +192 -144
  30. sentience/extension/pkg/sentience_core_bg.wasm +0 -0
  31. sentience/extension/release.json +29 -29
  32. sentience/failure_artifacts.py +241 -0
  33. sentience/integrations/__init__.py +6 -0
  34. sentience/integrations/langchain/__init__.py +12 -0
  35. sentience/integrations/langchain/context.py +18 -0
  36. sentience/integrations/langchain/core.py +326 -0
  37. sentience/integrations/langchain/tools.py +180 -0
  38. sentience/integrations/models.py +46 -0
  39. sentience/integrations/pydanticai/__init__.py +15 -0
  40. sentience/integrations/pydanticai/deps.py +20 -0
  41. sentience/integrations/pydanticai/toolset.py +468 -0
  42. sentience/llm_provider.py +695 -18
  43. sentience/models.py +536 -3
  44. sentience/ordinal.py +280 -0
  45. sentience/query.py +66 -4
  46. sentience/schemas/trace_v1.json +27 -1
  47. sentience/snapshot.py +384 -93
  48. sentience/snapshot_diff.py +39 -54
  49. sentience/text_search.py +1 -0
  50. sentience/trace_event_builder.py +20 -1
  51. sentience/trace_indexing/indexer.py +3 -49
  52. sentience/tracer_factory.py +1 -3
  53. sentience/verification.py +618 -0
  54. sentience/visual_agent.py +3 -1
  55. {sentienceapi-0.92.2.dist-info → sentienceapi-0.98.0.dist-info}/METADATA +198 -40
  56. sentienceapi-0.98.0.dist-info/RECORD +92 -0
  57. sentience/utils.py +0 -296
  58. sentienceapi-0.92.2.dist-info/RECORD +0 -65
  59. {sentienceapi-0.92.2.dist-info → sentienceapi-0.98.0.dist-info}/WHEEL +0 -0
  60. {sentienceapi-0.92.2.dist-info → sentienceapi-0.98.0.dist-info}/entry_points.txt +0 -0
  61. {sentienceapi-0.92.2.dist-info → sentienceapi-0.98.0.dist-info}/licenses/LICENSE +0 -0
  62. {sentienceapi-0.92.2.dist-info → sentienceapi-0.98.0.dist-info}/licenses/LICENSE-APACHE +0 -0
  63. {sentienceapi-0.92.2.dist-info → sentienceapi-0.98.0.dist-info}/licenses/LICENSE-MIT +0 -0
  64. {sentienceapi-0.92.2.dist-info → sentienceapi-0.98.0.dist-info}/top_level.txt +0 -0
sentience/llm_provider.py CHANGED
@@ -1,5 +1,3 @@
- from typing import Optional
-
  """
  LLM Provider abstraction layer for Sentience SDK
  Enables "Bring Your Own Brain" (BYOB) pattern - plug in any LLM provider
@@ -7,6 +5,7 @@ Enables "Bring Your Own Brain" (BYOB) pattern - plug in any LLM provider

  from abc import ABC, abstractmethod
  from dataclasses import dataclass
+ from typing import Any

  from .llm_provider_utils import get_api_key_from_env, handle_provider_error, require_package
  from .llm_response_builder import LLMResponseBuilder
@@ -81,6 +80,48 @@ class LLMProvider(ABC):
          """
          pass

+     def supports_vision(self) -> bool:
+         """
+         Whether this provider supports image input for vision tasks.
+
+         Override in subclasses that support vision-capable models.
+
+         Returns:
+             True if provider supports vision, False otherwise
+         """
+         return False
+
+     def generate_with_image(
+         self,
+         system_prompt: str,
+         user_prompt: str,
+         image_base64: str,
+         **kwargs,
+     ) -> LLMResponse:
+         """
+         Generate a response with image input (for vision-capable models).
+
+         This method is used for vision fallback in assertions and visual agents.
+         Override in subclasses that support vision-capable models.
+
+         Args:
+             system_prompt: System instruction/context
+             user_prompt: User query/request
+             image_base64: Base64-encoded image (PNG or JPEG)
+             **kwargs: Provider-specific parameters (temperature, max_tokens, etc.)
+
+         Returns:
+             LLMResponse with content and token usage
+
+         Raises:
+             NotImplementedError: If provider doesn't support vision
+         """
+         raise NotImplementedError(
+             f"{type(self).__name__} does not support vision. "
+             "Use a vision-capable provider like OpenAIProvider with GPT-4o "
+             "or AnthropicProvider with Claude 3."
+         )
+

  class OpenAIProvider(LLMProvider):
      """
@@ -187,6 +228,92 @@ class OpenAIProvider(LLMProvider):
          model_lower = self._model_name.lower()
          return any(x in model_lower for x in ["gpt-4", "gpt-3.5"])

+     def supports_vision(self) -> bool:
+         """GPT-4o, GPT-4-turbo, and GPT-4-vision support vision."""
+         model_lower = self._model_name.lower()
+         return any(x in model_lower for x in ["gpt-4o", "gpt-4-turbo", "gpt-4-vision"])
+
+     def generate_with_image(
+         self,
+         system_prompt: str,
+         user_prompt: str,
+         image_base64: str,
+         temperature: float = 0.0,
+         max_tokens: int | None = None,
+         **kwargs,
+     ) -> LLMResponse:
+         """
+         Generate response with image input using OpenAI Vision API.
+
+         Args:
+             system_prompt: System instruction
+             user_prompt: User query
+             image_base64: Base64-encoded image (PNG or JPEG)
+             temperature: Sampling temperature (0.0 = deterministic)
+             max_tokens: Maximum tokens to generate
+             **kwargs: Additional OpenAI API parameters
+
+         Returns:
+             LLMResponse object
+
+         Raises:
+             NotImplementedError: If model doesn't support vision
+         """
+         if not self.supports_vision():
+             raise NotImplementedError(
+                 f"Model {self._model_name} does not support vision. "
+                 "Use gpt-4o, gpt-4-turbo, or gpt-4-vision-preview."
+             )
+
+         messages = []
+         if system_prompt:
+             messages.append({"role": "system", "content": system_prompt})
+
+         # Vision message format with image_url
+         messages.append(
+             {
+                 "role": "user",
+                 "content": [
+                     {"type": "text", "text": user_prompt},
+                     {
+                         "type": "image_url",
+                         "image_url": {"url": f"data:image/png;base64,{image_base64}"},
+                     },
+                 ],
+             }
+         )
+
+         # Build API parameters
+         api_params = {
+             "model": self._model_name,
+             "messages": messages,
+             "temperature": temperature,
+         }
+
+         if max_tokens:
+             api_params["max_tokens"] = max_tokens
+
+         # Merge additional parameters
+         api_params.update(kwargs)
+
+         # Call OpenAI API
+         try:
+             response = self.client.chat.completions.create(**api_params)
+         except Exception as e:
+             handle_provider_error(e, "OpenAI", "generate response with image")
+
+         choice = response.choices[0]
+         usage = response.usage
+
+         return LLMResponseBuilder.from_openai_format(
+             content=choice.message.content,
+             prompt_tokens=usage.prompt_tokens if usage else None,
+             completion_tokens=usage.completion_tokens if usage else None,
+             total_tokens=usage.total_tokens if usage else None,
+             model_name=response.model,
+             finish_reason=choice.finish_reason,
+         )
+

      @property
      def model_name(self) -> str:
          return self._model_name
@@ -277,6 +404,94 @@ class AnthropicProvider(LLMProvider):
          """Anthropic doesn't have native JSON mode (requires prompt engineering)"""
          return False

+     def supports_vision(self) -> bool:
+         """Claude 3 models (Opus, Sonnet, Haiku) all support vision."""
+         model_lower = self._model_name.lower()
+         return any(x in model_lower for x in ["claude-3", "claude-3.5"])
+
+     def generate_with_image(
+         self,
+         system_prompt: str,
+         user_prompt: str,
+         image_base64: str,
+         temperature: float = 0.0,
+         max_tokens: int = 1024,
+         **kwargs,
+     ) -> LLMResponse:
+         """
+         Generate response with image input using Anthropic Vision API.
+
+         Args:
+             system_prompt: System instruction
+             user_prompt: User query
+             image_base64: Base64-encoded image (PNG or JPEG)
+             temperature: Sampling temperature
+             max_tokens: Maximum tokens to generate (required by Anthropic)
+             **kwargs: Additional Anthropic API parameters
+
+         Returns:
+             LLMResponse object
+
+         Raises:
+             NotImplementedError: If model doesn't support vision
+         """
+         if not self.supports_vision():
+             raise NotImplementedError(
+                 f"Model {self._model_name} does not support vision. "
+                 "Use Claude 3 models (claude-3-opus, claude-3-sonnet, claude-3-haiku)."
+             )
+
+         # Anthropic vision message format
+         messages = [
+             {
+                 "role": "user",
+                 "content": [
+                     {
+                         "type": "image",
+                         "source": {
+                             "type": "base64",
+                             "media_type": "image/png",
+                             "data": image_base64,
+                         },
+                     },
+                     {
+                         "type": "text",
+                         "text": user_prompt,
+                     },
+                 ],
+             }
+         ]
+
+         # Build API parameters
+         api_params = {
+             "model": self._model_name,
+             "max_tokens": max_tokens,
+             "temperature": temperature,
+             "messages": messages,
+         }
+
+         if system_prompt:
+             api_params["system"] = system_prompt
+
+         # Merge additional parameters
+         api_params.update(kwargs)
+
+         # Call Anthropic API
+         try:
+             response = self.client.messages.create(**api_params)
+         except Exception as e:
+             handle_provider_error(e, "Anthropic", "generate response with image")
+
+         content = response.content[0].text if response.content else ""
+
+         return LLMResponseBuilder.from_anthropic_format(
+             content=content,
+             input_tokens=response.usage.input_tokens if hasattr(response, "usage") else None,
+             output_tokens=response.usage.output_tokens if hasattr(response, "usage") else None,
+             model_name=response.model,
+             stop_reason=response.stop_reason,
+         )
+

      @property
      def model_name(self) -> str:
          return self._model_name
@@ -527,15 +742,22 @@ class LocalLLMProvider(LLMProvider):
          """
          super().__init__(model_name)  # Initialize base class with model name

-         # Import required packages with consistent error handling
+         # Import required packages with consistent error handling.
+         # These are optional dependencies, so keep them out of module import-time.
          try:
-             import torch
-             from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
-         except ImportError:
+             import torch  # type: ignore[import-not-found]
+             from transformers import (  # type: ignore[import-not-found]
+                 AutoModelForCausalLM,
+                 AutoTokenizer,
+                 BitsAndBytesConfig,
+             )
+         except ImportError as exc:
              raise ImportError(
                  "transformers and torch required for local LLM. "
                  "Install with: pip install transformers torch"
-             )
+             ) from exc
+
+         self._torch = torch

          # Load tokenizer
          self.tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
@@ -556,21 +778,44 @@ class LocalLLMProvider(LLMProvider):
          elif load_in_8bit:
              quantization_config = BitsAndBytesConfig(load_in_8bit=True)

+         device = (device or "auto").strip().lower()
+
          # Determine torch dtype
          if torch_dtype == "auto":
-             dtype = torch.float16 if device != "cpu" else torch.float32
+             dtype = torch.float16 if device not in {"cpu"} else torch.float32
          else:
              dtype = getattr(torch, torch_dtype)

-         # Load model
-         self.model = AutoModelForCausalLM.from_pretrained(
-             model_name,
-             quantization_config=quantization_config,
-             torch_dtype=dtype if quantization_config is None else None,
-             device_map=device,
-             trust_remote_code=True,
-             low_cpu_mem_usage=True,
-         )
+         # device_map is a Transformers concept (not a literal "cpu/mps/cuda" device string).
+         # - "auto" enables Accelerate device mapping.
+         # - Otherwise, we load normally and then move the model to the requested device.
+         device_map: str | None = "auto" if device == "auto" else None
+
+         def _load(*, device_map_override: str | None) -> Any:
+             return AutoModelForCausalLM.from_pretrained(
+                 model_name,
+                 quantization_config=quantization_config,
+                 torch_dtype=dtype if quantization_config is None else None,
+                 device_map=device_map_override,
+                 trust_remote_code=True,
+                 low_cpu_mem_usage=True,
+             )
+
+         try:
+             self.model = _load(device_map_override=device_map)
+         except KeyError as e:
+             # Some envs / accelerate versions can crash on auto mapping (e.g. KeyError: 'cpu').
+             # Keep demo ergonomics: default stays "auto", but we gracefully fall back.
+             if device == "auto" and ("cpu" in str(e).lower()):
+                 device = "cpu"
+                 dtype = torch.float32
+                 self.model = _load(device_map_override=None)
+             else:
+                 raise
+
+         # If we didn't use device_map, move model explicitly (only safe for non-quantized loads).
+         if device_map is None and quantization_config is None and device in {"cpu", "cuda", "mps"}:
+             self.model = self.model.to(device)
          self.model.eval()

      def generate(
@@ -596,7 +841,7 @@ class LocalLLMProvider(LLMProvider):
          Returns:
              LLMResponse object
          """
-         import torch
+         torch = self._torch

          # Auto-determine sampling based on temperature
          do_sample = temperature > 0
@@ -657,3 +902,435 @@ class LocalLLMProvider(LLMProvider):
      @property
      def model_name(self) -> str:
          return self._model_name
+
+
+ class LocalVisionLLMProvider(LLMProvider):
+     """
+     Local vision-language LLM provider using HuggingFace Transformers.
+
+     Intended for models like:
+     - Qwen/Qwen3-VL-8B-Instruct
+
+     Notes on Mac (MPS) + quantization:
+     - Transformers BitsAndBytes (4-bit/8-bit) typically requires CUDA and does NOT work on MPS.
+     - If you want quantized local vision on Apple Silicon, you may prefer MLX-based stacks
+       (e.g., mlx-vlm) or llama.cpp/gguf pipelines.
+     """
+
+     def __init__(
+         self,
+         model_name: str = "Qwen/Qwen3-VL-8B-Instruct",
+         device: str = "auto",
+         torch_dtype: str = "auto",
+         load_in_4bit: bool = False,
+         load_in_8bit: bool = False,
+         trust_remote_code: bool = True,
+     ):
+         super().__init__(model_name)
+
+         # Import required packages with consistent error handling
+         try:
+             import torch  # type: ignore[import-not-found]
+             from transformers import AutoProcessor  # type: ignore[import-not-found]
+         except ImportError as exc:
+             raise ImportError(
+                 "transformers and torch are required for LocalVisionLLMProvider. "
+                 "Install with: pip install transformers torch"
+             ) from exc
+
+         self._torch = torch
+
+         # Resolve device
+         if device == "auto":
+             if (
+                 getattr(torch.backends, "mps", None) is not None
+                 and torch.backends.mps.is_available()
+             ):
+                 device = "mps"
+             elif torch.cuda.is_available():
+                 device = "cuda"
+             else:
+                 device = "cpu"
+
+         if device == "mps" and (load_in_4bit or load_in_8bit):
+             raise ValueError(
+                 "Quantized (4-bit/8-bit) Transformers loading is typically not supported on Apple MPS. "
+                 "Set load_in_4bit/load_in_8bit to False for MPS, or use a different local runtime "
+                 "(e.g., MLX/llama.cpp) for quantized vision models."
+             )
+
+         # Determine torch dtype
+         if torch_dtype == "auto":
+             dtype = torch.float16 if device in ("cuda", "mps") else torch.float32
+         else:
+             dtype = getattr(torch, torch_dtype)
+
+         # Load processor
+         self.processor = AutoProcessor.from_pretrained(
+             model_name, trust_remote_code=trust_remote_code
+         )
+
+         # Load model (prefer vision2seq; fall back with guidance)
+         try:
+             import importlib
+
+             transformers = importlib.import_module("transformers")
+             AutoModelForVision2Seq = getattr(transformers, "AutoModelForVision2Seq", None)
+             if AutoModelForVision2Seq is None:
+                 raise AttributeError("transformers.AutoModelForVision2Seq is not available")
+
+             self.model = AutoModelForVision2Seq.from_pretrained(
+                 model_name,
+                 torch_dtype=dtype,
+                 trust_remote_code=trust_remote_code,
+                 low_cpu_mem_usage=True,
+             )
+         except Exception as exc:
+             # Some transformers versions/models don't expose AutoModelForVision2Seq.
+             # We fail loudly with a helpful message rather than silently doing text-only.
+             raise ImportError(
+                 "Failed to load a vision-capable Transformers model. "
+                 "Try upgrading transformers (vision models often require newer versions), "
+                 "or use a model class supported by your installed transformers build."
+             ) from exc
+
+         # Move to device
+         self.device = device
+         self.model.to(device)
+
+         self.model.eval()
+
+     def supports_json_mode(self) -> bool:
+         return False
+
+     def supports_vision(self) -> bool:
+         return True
+
+     @property
+     def model_name(self) -> str:
+         return self._model_name
+
+     def generate(
+         self,
+         system_prompt: str,
+         user_prompt: str,
+         max_new_tokens: int = 512,
+         temperature: float = 0.1,
+         top_p: float = 0.9,
+         **kwargs,
+     ) -> LLMResponse:
+         """
+         Text-only generation (no image). Provided for interface completeness.
+         """
+         torch = self._torch
+
+         messages = []
+         if system_prompt:
+             messages.append({"role": "system", "content": system_prompt})
+         messages.append({"role": "user", "content": user_prompt})
+
+         if hasattr(self.processor, "apply_chat_template"):
+             prompt = self.processor.apply_chat_template(
+                 messages, tokenize=False, add_generation_prompt=True
+             )
+         else:
+             prompt = (system_prompt + "\n\n" if system_prompt else "") + user_prompt
+
+         inputs = self.processor(text=[prompt], return_tensors="pt")
+         inputs = {
+             k: (v.to(self.model.device) if hasattr(v, "to") else v) for k, v in inputs.items()
+         }
+
+         do_sample = temperature > 0
+         with torch.no_grad():
+             outputs = self.model.generate(
+                 **inputs,
+                 max_new_tokens=max_new_tokens,
+                 do_sample=do_sample,
+                 temperature=temperature if do_sample else 1.0,
+                 top_p=top_p,
+                 **kwargs,
+             )
+
+         # Decode
+         input_len = inputs["input_ids"].shape[1] if "input_ids" in inputs else 0
+         generated = outputs[0][input_len:]
+         if hasattr(self.processor, "batch_decode"):
+             text = self.processor.batch_decode([generated], skip_special_tokens=True)[0].strip()
+         else:
+             text = str(generated)
+
+         return LLMResponseBuilder.from_local_format(
+             content=text,
+             prompt_tokens=int(input_len) if input_len else None,
+             completion_tokens=int(generated.shape[0]) if hasattr(generated, "shape") else None,
+             model_name=self._model_name,
+         )
+
+     def generate_with_image(
+         self,
+         system_prompt: str,
+         user_prompt: str,
+         image_base64: str,
+         max_new_tokens: int = 256,
+         temperature: float = 0.0,
+         top_p: float = 0.9,
+         **kwargs,
+     ) -> LLMResponse:
+         """
+         Vision generation using an image + prompt.
+
+         This is used by vision fallback in assertions and by visual agents.
+         """
+         torch = self._torch
+
+         # Lazy import PIL to avoid adding a hard dependency for text-only users.
+         try:
+             from PIL import Image  # type: ignore[import-not-found]
+         except ImportError as exc:
+             raise ImportError(
+                 "Pillow is required for LocalVisionLLMProvider image input. Install with: pip install pillow"
+             ) from exc
+
+         import base64
+         import io
+
+         img_bytes = base64.b64decode(image_base64)
+         image = Image.open(io.BytesIO(img_bytes)).convert("RGB")
+
+         # Prefer processor chat template if available (needed by many VL models).
+         messages = []
+         if system_prompt:
+             messages.append({"role": "system", "content": system_prompt})
+         messages.append(
+             {
+                 "role": "user",
+                 "content": [
+                     {"type": "image", "image": image},
+                     {"type": "text", "text": user_prompt},
+                 ],
+             }
+         )
+
+         if hasattr(self.processor, "apply_chat_template"):
+             prompt = self.processor.apply_chat_template(
+                 messages, tokenize=False, add_generation_prompt=True
+             )
+         else:
+             raise NotImplementedError(
+                 "This local vision model/processor does not expose apply_chat_template(). "
+                 "Install/upgrade to a Transformers version that supports your model's chat template."
+             )
+
+         inputs = self.processor(text=[prompt], images=[image], return_tensors="pt")
+         inputs = {
+             k: (v.to(self.model.device) if hasattr(v, "to") else v) for k, v in inputs.items()
+         }
+
+         do_sample = temperature > 0
+         with torch.no_grad():
+             outputs = self.model.generate(
+                 **inputs,
+                 max_new_tokens=max_new_tokens,
+                 do_sample=do_sample,
+                 temperature=temperature if do_sample else 1.0,
+                 top_p=top_p,
+                 **kwargs,
+             )
+
+         input_len = inputs["input_ids"].shape[1] if "input_ids" in inputs else 0
+         generated = outputs[0][input_len:]
+
+         if hasattr(self.processor, "batch_decode"):
+             text = self.processor.batch_decode([generated], skip_special_tokens=True)[0].strip()
+         elif hasattr(self.processor, "tokenizer") and hasattr(self.processor.tokenizer, "decode"):
+             text = self.processor.tokenizer.decode(generated, skip_special_tokens=True).strip()
+         else:
+             text = ""
+
+         return LLMResponseBuilder.from_local_format(
+             content=text,
+             prompt_tokens=int(input_len) if input_len else None,
+             completion_tokens=int(generated.shape[0]) if hasattr(generated, "shape") else None,
+             model_name=self._model_name,
+         )
+
+
+ class MLXVLMProvider(LLMProvider):
+     """
+     Local vision-language provider using MLX-VLM (Apple Silicon optimized).
+
+     Recommended for running *quantized* vision models on Mac (M1/M2/M3/M4), e.g.:
+     - mlx-community/Qwen3-VL-8B-Instruct-3bit
+
+     Optional dependencies:
+     - mlx-vlm
+     - pillow
+
+     Notes:
+     - MLX-VLM APIs can vary across versions; this provider tries a couple common call shapes.
+     - For best results, use an MLX-converted model repo under `mlx-community/`.
+     """
+
+     def __init__(
+         self,
+         model: str = "mlx-community/Qwen3-VL-8B-Instruct-3bit",
+         *,
+         default_max_tokens: int = 256,
+         default_temperature: float = 0.0,
+         **kwargs,
+     ):
+         super().__init__(model)
+         self._default_max_tokens = default_max_tokens
+         self._default_temperature = default_temperature
+         self._default_kwargs = dict(kwargs)
+
+         # Lazy imports to keep base SDK light.
+         try:
+             import importlib
+
+             self._mlx_vlm = importlib.import_module("mlx_vlm")
+         except ImportError as exc:
+             raise ImportError(
+                 "mlx-vlm is required for MLXVLMProvider. Install with: pip install mlx-vlm"
+             ) from exc
+
+         try:
+             from PIL import Image  # type: ignore[import-not-found]
+
+             self._PIL_Image = Image
+         except ImportError as exc:
+             raise ImportError(
+                 "Pillow is required for MLXVLMProvider. Install with: pip install pillow"
+             ) from exc
+
+         # Some mlx_vlm versions expose load(model_id) -> (model, processor)
+         self._model = None
+         self._processor = None
+         load_fn = getattr(self._mlx_vlm, "load", None)
+         if callable(load_fn):
+             try:
+                 loaded = load_fn(model)
+                 if isinstance(loaded, tuple) and len(loaded) >= 2:
+                     self._model, self._processor = loaded[0], loaded[1]
+             except Exception:
+                 # Keep it lazy; we'll try loading on demand during generate_with_image().
+                 self._model, self._processor = None, None
+
+     def supports_json_mode(self) -> bool:
+         return False
+
+     def supports_vision(self) -> bool:
+         return True
+
+     @property
+     def model_name(self) -> str:
+         return self._model_name
+
+     def generate(self, system_prompt: str, user_prompt: str, **kwargs) -> LLMResponse:
+         """
+         Text-only generation is not a primary MLX-VLM use-case. We attempt it if the installed
+         mlx_vlm exposes a compatible `generate()` signature; otherwise, raise a clear error.
+         """
+         generate_fn = getattr(self._mlx_vlm, "generate", None)
+         if not callable(generate_fn):
+             raise NotImplementedError("mlx_vlm.generate is not available in your mlx-vlm install.")
+
+         prompt = (system_prompt + "\n\n" if system_prompt else "") + user_prompt
+         max_tokens = kwargs.pop("max_tokens", self._default_max_tokens)
+         temperature = kwargs.pop("temperature", self._default_temperature)
+         merged_kwargs = {**self._default_kwargs, **kwargs}
+
+         try:
+             out = generate_fn(
+                 self._model_name,
+                 prompt=prompt,
+                 max_tokens=max_tokens,
+                 temperature=temperature,
+                 **merged_kwargs,
+             )
+         except TypeError as exc:
+             if self._model is None or self._processor is None:
+                 raise NotImplementedError(
+                     "Text-only generation is not supported by this mlx-vlm version without a loaded model."
+                 ) from exc
+             out = generate_fn(
+                 self._model,
+                 self._processor,
+                 prompt,
+                 max_tokens=max_tokens,
+                 temperature=temperature,
+                 **merged_kwargs,
+             )
+
+         text = getattr(out, "text", None) or getattr(out, "output", None) or str(out)
+         return LLMResponseBuilder.from_local_format(
+             content=str(text).strip(),
+             prompt_tokens=None,
+             completion_tokens=None,
+             model_name=self._model_name,
+         )
+
+     def generate_with_image(
+         self,
+         system_prompt: str,
+         user_prompt: str,
+         image_base64: str,
+         **kwargs,
+     ) -> LLMResponse:
+         import base64
+         import io
+
+         generate_fn = getattr(self._mlx_vlm, "generate", None)
+         if not callable(generate_fn):
+             raise NotImplementedError("mlx_vlm.generate is not available in your mlx-vlm install.")
+
+         img_bytes = base64.b64decode(image_base64)
+         image = self._PIL_Image.open(io.BytesIO(img_bytes)).convert("RGB")
+
+         prompt = (system_prompt + "\n\n" if system_prompt else "") + user_prompt
+         max_tokens = kwargs.pop("max_tokens", self._default_max_tokens)
+         temperature = kwargs.pop("temperature", self._default_temperature)
+         merged_kwargs = {**self._default_kwargs, **kwargs}
+
+         # Try a couple common MLX-VLM call shapes.
+         try:
+             # 1) generate(model_id, image=..., prompt=...)
+             out = generate_fn(
+                 self._model_name,
+                 image=image,
+                 prompt=prompt,
+                 max_tokens=max_tokens,
+                 temperature=temperature,
+                 **merged_kwargs,
+             )
+         except TypeError as exc:
+             # 2) generate(model, processor, prompt, image, ...)
+             if self._model is None or self._processor is None:
+                 load_fn = getattr(self._mlx_vlm, "load", None)
+                 if callable(load_fn):
+                     loaded = load_fn(self._model_name)
+                     if isinstance(loaded, tuple) and len(loaded) >= 2:
+                         self._model, self._processor = loaded[0], loaded[1]
+             if self._model is None or self._processor is None:
+                 raise NotImplementedError(
+                     "Unable to call mlx_vlm.generate with your installed mlx-vlm version. "
+                     "Please upgrade mlx-vlm or use LocalVisionLLMProvider (Transformers backend)."
+                 ) from exc
+             out = generate_fn(
+                 self._model,
+                 self._processor,
+                 prompt,
+                 image,
+                 max_tokens=max_tokens,
+                 temperature=temperature,
+                 **merged_kwargs,
+             )
+
+         text = getattr(out, "text", None) or getattr(out, "output", None) or str(out)
+         return LLMResponseBuilder.from_local_format(
+             content=str(text).strip(),
+             prompt_tokens=None,
+             completion_tokens=None,
+             model_name=self._model_name,
+         )
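
For orientation, here is a minimal usage sketch of the vision API this diff introduces. Only supports_vision() and generate_with_image(system_prompt, user_prompt, image_base64, **kwargs) returning an LLMResponse are taken from the diff above; the import path, the OpenAIProvider(...) constructor arguments, the response.content attribute, and the screenshot file are illustrative assumptions, not guaranteed by this release.

import base64

from sentience.llm_provider import OpenAIProvider  # assumed import path

# Assumed constructor argument; check the package docs for the real signature.
provider = OpenAIProvider(model_name="gpt-4o")

if provider.supports_vision():
    # generate_with_image() expects a base64-encoded PNG or JPEG.
    with open("screenshot.png", "rb") as f:  # hypothetical screenshot file
        image_base64 = base64.b64encode(f.read()).decode("ascii")

    response = provider.generate_with_image(
        system_prompt="You are a UI verification assistant.",
        user_prompt="Is the 'Checkout' button visible in this screenshot?",
        image_base64=image_base64,
        temperature=0.0,
    )
    print(response.content)  # assumed LLMResponse field name

The same call shape should apply to the new local providers added in this diff (for example MLXVLMProvider("mlx-community/Qwen3-VL-8B-Instruct-3bit") on Apple Silicon, or LocalVisionLLMProvider()), since both override generate_with_image() with the same first three arguments.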