sentienceapi-0.90.17-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of sentienceapi might be problematic.

Files changed (50)
  1. sentience/__init__.py +153 -0
  2. sentience/_extension_loader.py +40 -0
  3. sentience/actions.py +837 -0
  4. sentience/agent.py +1246 -0
  5. sentience/agent_config.py +43 -0
  6. sentience/async_api.py +101 -0
  7. sentience/base_agent.py +194 -0
  8. sentience/browser.py +1037 -0
  9. sentience/cli.py +130 -0
  10. sentience/cloud_tracing.py +382 -0
  11. sentience/conversational_agent.py +509 -0
  12. sentience/expect.py +188 -0
  13. sentience/extension/background.js +233 -0
  14. sentience/extension/content.js +298 -0
  15. sentience/extension/injected_api.js +1473 -0
  16. sentience/extension/manifest.json +36 -0
  17. sentience/extension/pkg/sentience_core.d.ts +51 -0
  18. sentience/extension/pkg/sentience_core.js +529 -0
  19. sentience/extension/pkg/sentience_core_bg.wasm +0 -0
  20. sentience/extension/pkg/sentience_core_bg.wasm.d.ts +10 -0
  21. sentience/extension/release.json +115 -0
  22. sentience/extension/test-content.js +4 -0
  23. sentience/formatting.py +59 -0
  24. sentience/generator.py +202 -0
  25. sentience/inspector.py +365 -0
  26. sentience/llm_provider.py +637 -0
  27. sentience/models.py +412 -0
  28. sentience/overlay.py +222 -0
  29. sentience/query.py +303 -0
  30. sentience/read.py +185 -0
  31. sentience/recorder.py +589 -0
  32. sentience/schemas/trace_v1.json +216 -0
  33. sentience/screenshot.py +100 -0
  34. sentience/snapshot.py +516 -0
  35. sentience/text_search.py +290 -0
  36. sentience/trace_indexing/__init__.py +27 -0
  37. sentience/trace_indexing/index_schema.py +111 -0
  38. sentience/trace_indexing/indexer.py +357 -0
  39. sentience/tracer_factory.py +211 -0
  40. sentience/tracing.py +285 -0
  41. sentience/utils.py +296 -0
  42. sentience/wait.py +137 -0
  43. sentienceapi-0.90.17.dist-info/METADATA +917 -0
  44. sentienceapi-0.90.17.dist-info/RECORD +50 -0
  45. sentienceapi-0.90.17.dist-info/WHEEL +5 -0
  46. sentienceapi-0.90.17.dist-info/entry_points.txt +2 -0
  47. sentienceapi-0.90.17.dist-info/licenses/LICENSE +24 -0
  48. sentienceapi-0.90.17.dist-info/licenses/LICENSE-APACHE +201 -0
  49. sentienceapi-0.90.17.dist-info/licenses/LICENSE-MIT +21 -0
  50. sentienceapi-0.90.17.dist-info/top_level.txt +1 -0
@@ -0,0 +1,637 @@
+"""
+LLM Provider abstraction layer for Sentience SDK
+Enables "Bring Your Own Brain" (BYOB) pattern - plug in any LLM provider
+"""
+
+from abc import ABC, abstractmethod
+from dataclasses import dataclass
+
+
+@dataclass
+class LLMResponse:
+    """Standardized LLM response across all providers"""
+
+    content: str
+    prompt_tokens: int | None = None
+    completion_tokens: int | None = None
+    total_tokens: int | None = None
+    model_name: str | None = None
+    finish_reason: str | None = None
+
+
+class LLMProvider(ABC):
+    """
+    Abstract base class for LLM providers.
+
+    Implement this interface to add support for any LLM:
+    - OpenAI (GPT-4, GPT-3.5)
+    - Anthropic (Claude)
+    - Local models (Ollama, LlamaCpp)
+    - Azure OpenAI
+    - Any other completion API
+    """
+
+    @abstractmethod
+    def generate(self, system_prompt: str, user_prompt: str, **kwargs) -> LLMResponse:
+        """
+        Generate a response from the LLM
+
+        Args:
+            system_prompt: System instruction/context
+            user_prompt: User query/request
+            **kwargs: Provider-specific parameters (temperature, max_tokens, etc.)
+
+        Returns:
+            LLMResponse with content and token usage
+        """
+        pass
+
+    @abstractmethod
+    def supports_json_mode(self) -> bool:
+        """
+        Whether this provider supports structured JSON output
+
+        Returns:
+            True if provider has native JSON mode, False otherwise
+        """
+        pass
+
+    @property
+    @abstractmethod
+    def model_name(self) -> str:
+        """
+        Model identifier (e.g., "gpt-4o", "claude-3-sonnet")
+
+        Returns:
+            Model name string
+        """
+        pass
+
+
+class OpenAIProvider(LLMProvider):
+    """
+    OpenAI provider implementation (GPT-4, GPT-4o, GPT-3.5-turbo, etc.)
+
+    Example:
+        >>> from sentience.llm_provider import OpenAIProvider
+        >>> llm = OpenAIProvider(api_key="sk-...", model="gpt-4o")
+        >>> response = llm.generate("You are a helpful assistant", "Hello!")
+        >>> print(response.content)
+    """
+
+    def __init__(
+        self,
+        api_key: str | None = None,
+        model: str = "gpt-4o",
+        base_url: str | None = None,
+        organization: str | None = None,
+    ):
+        """
+        Initialize OpenAI provider
+
+        Args:
+            api_key: OpenAI API key (or set OPENAI_API_KEY env var)
+            model: Model name (gpt-4o, gpt-4-turbo, gpt-3.5-turbo, etc.)
+            base_url: Custom API base URL (for compatible APIs)
+            organization: OpenAI organization ID
+        """
+        try:
+            from openai import OpenAI
+        except ImportError:
+            raise ImportError("OpenAI package not installed. Install with: pip install openai")
+
+        self.client = OpenAI(api_key=api_key, base_url=base_url, organization=organization)
+        self._model_name = model
+
+    def generate(
+        self,
+        system_prompt: str,
+        user_prompt: str,
+        temperature: float = 0.0,
+        max_tokens: int | None = None,
+        json_mode: bool = False,
+        **kwargs,
+    ) -> LLMResponse:
+        """
+        Generate response using OpenAI API
+
+        Args:
+            system_prompt: System instruction
+            user_prompt: User query
+            temperature: Sampling temperature (0.0 = deterministic, 1.0 = creative)
+            max_tokens: Maximum tokens to generate
+            json_mode: Enable JSON response format (requires model support)
+            **kwargs: Additional OpenAI API parameters
+
+        Returns:
+            LLMResponse object
+        """
+        messages = []
+        if system_prompt:
+            messages.append({"role": "system", "content": system_prompt})
+        messages.append({"role": "user", "content": user_prompt})
+
+        # Build API parameters
+        api_params = {
+            "model": self._model_name,
+            "messages": messages,
+            "temperature": temperature,
+        }
+
+        if max_tokens:
+            api_params["max_tokens"] = max_tokens
+
+        if json_mode and self.supports_json_mode():
+            api_params["response_format"] = {"type": "json_object"}
+
+        # Merge additional parameters
+        api_params.update(kwargs)
+
+        # Call OpenAI API
+        response = self.client.chat.completions.create(**api_params)
+
+        choice = response.choices[0]
+        usage = response.usage
+
+        return LLMResponse(
+            content=choice.message.content,
+            prompt_tokens=usage.prompt_tokens if usage else None,
+            completion_tokens=usage.completion_tokens if usage else None,
+            total_tokens=usage.total_tokens if usage else None,
+            model_name=response.model,
+            finish_reason=choice.finish_reason,
+        )
+
+    def supports_json_mode(self) -> bool:
+        """OpenAI models support JSON mode (GPT-4, GPT-3.5-turbo)"""
+        model_lower = self._model_name.lower()
+        return any(x in model_lower for x in ["gpt-4", "gpt-3.5"])
+
+    @property
+    def model_name(self) -> str:
+        return self._model_name
+
+
+class AnthropicProvider(LLMProvider):
+    """
+    Anthropic provider implementation (Claude 3 Opus, Sonnet, Haiku, etc.)
+
+    Example:
+        >>> from sentience.llm_provider import AnthropicProvider
+        >>> llm = AnthropicProvider(api_key="sk-ant-...", model="claude-3-sonnet-20240229")
+        >>> response = llm.generate("You are a helpful assistant", "Hello!")
+        >>> print(response.content)
+    """
+
+    def __init__(self, api_key: str | None = None, model: str = "claude-3-5-sonnet-20241022"):
+        """
+        Initialize Anthropic provider
+
+        Args:
+            api_key: Anthropic API key (or set ANTHROPIC_API_KEY env var)
+            model: Model name (claude-3-opus, claude-3-sonnet, claude-3-haiku, etc.)
+        """
+        try:
+            from anthropic import Anthropic
+        except ImportError:
+            raise ImportError(
+                "Anthropic package not installed. Install with: pip install anthropic"
+            )
+
+        self.client = Anthropic(api_key=api_key)
+        self._model_name = model
+
+    def generate(
+        self,
+        system_prompt: str,
+        user_prompt: str,
+        temperature: float = 0.0,
+        max_tokens: int = 1024,
+        **kwargs,
+    ) -> LLMResponse:
+        """
+        Generate response using Anthropic API
+
+        Args:
+            system_prompt: System instruction
+            user_prompt: User query
+            temperature: Sampling temperature
+            max_tokens: Maximum tokens to generate (required by Anthropic)
+            **kwargs: Additional Anthropic API parameters
+
+        Returns:
+            LLMResponse object
+        """
+        # Build API parameters
+        api_params = {
+            "model": self._model_name,
+            "max_tokens": max_tokens,
+            "temperature": temperature,
+            "messages": [{"role": "user", "content": user_prompt}],
+        }
+
+        if system_prompt:
+            api_params["system"] = system_prompt
+
+        # Merge additional parameters
+        api_params.update(kwargs)
+
+        # Call Anthropic API
+        response = self.client.messages.create(**api_params)
+
+        content = response.content[0].text if response.content else ""
+
+        return LLMResponse(
+            content=content,
+            prompt_tokens=response.usage.input_tokens if hasattr(response, "usage") else None,
+            completion_tokens=response.usage.output_tokens if hasattr(response, "usage") else None,
+            total_tokens=(
+                (response.usage.input_tokens + response.usage.output_tokens)
+                if hasattr(response, "usage")
+                else None
+            ),
+            model_name=response.model,
+            finish_reason=response.stop_reason,
+        )
+
+    def supports_json_mode(self) -> bool:
+        """Anthropic doesn't have native JSON mode (requires prompt engineering)"""
+        return False
+
+    @property
+    def model_name(self) -> str:
+        return self._model_name
+
+
+class GLMProvider(LLMProvider):
+    """
+    Zhipu AI GLM provider implementation (GLM-4, GLM-4-Plus, etc.)
+
+    Requirements:
+        pip install zhipuai
+
+    Example:
+        >>> from sentience.llm_provider import GLMProvider
+        >>> llm = GLMProvider(api_key="your-api-key", model="glm-4-plus")
+        >>> response = llm.generate("You are a helpful assistant", "Hello!")
+        >>> print(response.content)
+    """
+
+    def __init__(self, api_key: str | None = None, model: str = "glm-4-plus"):
+        """
+        Initialize GLM provider
+
+        Args:
+            api_key: Zhipu AI API key (or set GLM_API_KEY env var)
+            model: Model name (glm-4-plus, glm-4, glm-4-air, glm-4-flash, etc.)
+        """
+        try:
+            from zhipuai import ZhipuAI
+        except ImportError:
+            raise ImportError("ZhipuAI package not installed. Install with: pip install zhipuai")
+
+        self.client = ZhipuAI(api_key=api_key)
+        self._model_name = model
+
+    def generate(
+        self,
+        system_prompt: str,
+        user_prompt: str,
+        temperature: float = 0.0,
+        max_tokens: int | None = None,
+        **kwargs,
+    ) -> LLMResponse:
+        """
+        Generate response using GLM API
+
+        Args:
+            system_prompt: System instruction
+            user_prompt: User query
+            temperature: Sampling temperature (0.0 = deterministic, 1.0 = creative)
+            max_tokens: Maximum tokens to generate
+            **kwargs: Additional GLM API parameters
+
+        Returns:
+            LLMResponse object
+        """
+        messages = []
+        if system_prompt:
+            messages.append({"role": "system", "content": system_prompt})
+        messages.append({"role": "user", "content": user_prompt})
+
+        # Build API parameters
+        api_params = {
+            "model": self._model_name,
+            "messages": messages,
+            "temperature": temperature,
+        }
+
+        if max_tokens:
+            api_params["max_tokens"] = max_tokens
+
+        # Merge additional parameters
+        api_params.update(kwargs)
+
+        # Call GLM API
+        response = self.client.chat.completions.create(**api_params)
+
+        choice = response.choices[0]
+        usage = response.usage
+
+        return LLMResponse(
+            content=choice.message.content,
+            prompt_tokens=usage.prompt_tokens if usage else None,
+            completion_tokens=usage.completion_tokens if usage else None,
+            total_tokens=usage.total_tokens if usage else None,
+            model_name=response.model,
+            finish_reason=choice.finish_reason,
+        )
+
+    def supports_json_mode(self) -> bool:
+        """GLM-4 models support JSON mode"""
+        return "glm-4" in self._model_name.lower()
+
+    @property
+    def model_name(self) -> str:
+        return self._model_name
+
+
+class GeminiProvider(LLMProvider):
+    """
+    Google Gemini provider implementation (Gemini 2.0, Gemini 1.5 Pro, etc.)
+
+    Requirements:
+        pip install google-generativeai
+
+    Example:
+        >>> from sentience.llm_provider import GeminiProvider
+        >>> llm = GeminiProvider(api_key="your-api-key", model="gemini-2.0-flash-exp")
+        >>> response = llm.generate("You are a helpful assistant", "Hello!")
+        >>> print(response.content)
+    """
+
+    def __init__(self, api_key: str | None = None, model: str = "gemini-2.0-flash-exp"):
+        """
+        Initialize Gemini provider
+
+        Args:
+            api_key: Google API key (or set GEMINI_API_KEY or GOOGLE_API_KEY env var)
+            model: Model name (gemini-2.0-flash-exp, gemini-1.5-pro, gemini-1.5-flash, etc.)
+        """
+        try:
+            import google.generativeai as genai
+        except ImportError:
+            raise ImportError(
+                "Google Generative AI package not installed. Install with: pip install google-generativeai"
+            )
+
+        # Configure API key
+        if api_key:
+            genai.configure(api_key=api_key)
+        else:
+            import os
+
+            api_key = os.getenv("GEMINI_API_KEY") or os.getenv("GOOGLE_API_KEY")
+            if api_key:
+                genai.configure(api_key=api_key)
+
+        self.genai = genai
+        self._model_name = model
+        self.model = genai.GenerativeModel(model)
+
+    def generate(
+        self,
+        system_prompt: str,
+        user_prompt: str,
+        temperature: float = 0.0,
+        max_tokens: int | None = None,
+        **kwargs,
+    ) -> LLMResponse:
+        """
+        Generate response using Gemini API
+
+        Args:
+            system_prompt: System instruction
+            user_prompt: User query
+            temperature: Sampling temperature (0.0 = deterministic, 2.0 = very creative)
+            max_tokens: Maximum tokens to generate
+            **kwargs: Additional Gemini API parameters
+
+        Returns:
+            LLMResponse object
+        """
+        # Combine system and user prompts (Gemini doesn't have separate system role in all versions)
+        full_prompt = f"{system_prompt}\n\n{user_prompt}" if system_prompt else user_prompt
+
+        # Build generation config
+        generation_config = {
+            "temperature": temperature,
+        }
+
+        if max_tokens:
+            generation_config["max_output_tokens"] = max_tokens
+
+        # Merge additional parameters
+        generation_config.update(kwargs)
+
+        # Call Gemini API
+        response = self.model.generate_content(full_prompt, generation_config=generation_config)
+
+        # Extract content
+        content = response.text if response.text else ""
+
+        # Token usage (if available)
+        prompt_tokens = None
+        completion_tokens = None
+        total_tokens = None
+
+        if hasattr(response, "usage_metadata") and response.usage_metadata:
+            prompt_tokens = response.usage_metadata.prompt_token_count
+            completion_tokens = response.usage_metadata.candidates_token_count
+            total_tokens = response.usage_metadata.total_token_count
+
+        return LLMResponse(
+            content=content,
+            prompt_tokens=prompt_tokens,
+            completion_tokens=completion_tokens,
+            total_tokens=total_tokens,
+            model_name=self._model_name,
+            finish_reason=None,  # Gemini uses different finish reason format
+        )
+
+    def supports_json_mode(self) -> bool:
+        """Gemini 1.5+ models support JSON mode via response_mime_type"""
+        model_lower = self._model_name.lower()
+        return any(x in model_lower for x in ["gemini-1.5", "gemini-2.0"])
+
+    @property
+    def model_name(self) -> str:
+        return self._model_name
+
+
+class LocalLLMProvider(LLMProvider):
+    """
+    Local LLM provider using HuggingFace Transformers
+    Supports Qwen, Llama, Gemma, Phi, and other instruction-tuned models
+
+    Example:
+        >>> from sentience.llm_provider import LocalLLMProvider
+        >>> llm = LocalLLMProvider(model_name="Qwen/Qwen2.5-3B-Instruct")
+        >>> response = llm.generate("You are helpful", "Hello!")
+    """
+
+    def __init__(
+        self,
+        model_name: str = "Qwen/Qwen2.5-3B-Instruct",
+        device: str = "auto",
+        load_in_4bit: bool = False,
+        load_in_8bit: bool = False,
+        torch_dtype: str = "auto",
+    ):
+        """
+        Initialize local LLM using HuggingFace Transformers
+
+        Args:
+            model_name: HuggingFace model identifier
+                Popular options:
+                - "Qwen/Qwen2.5-3B-Instruct" (recommended, 3B params)
+                - "meta-llama/Llama-3.2-3B-Instruct" (3B params)
+                - "google/gemma-2-2b-it" (2B params)
+                - "microsoft/Phi-3-mini-4k-instruct" (3.8B params)
+            device: Device to run on ("cpu", "cuda", "mps", "auto")
+            load_in_4bit: Use 4-bit quantization (saves 75% memory)
+            load_in_8bit: Use 8-bit quantization (saves 50% memory)
+            torch_dtype: Data type ("auto", "float16", "bfloat16", "float32")
+        """
+        try:
+            import torch
+            from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
+        except ImportError:
+            raise ImportError(
+                "transformers and torch required for local LLM. "
+                "Install with: pip install transformers torch"
+            )
+
+        self._model_name = model_name
+
+        # Load tokenizer
+        self.tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+
+        # Set padding token if not present
+        if self.tokenizer.pad_token is None:
+            self.tokenizer.pad_token = self.tokenizer.eos_token
+
+        # Configure quantization
+        quantization_config = None
+        if load_in_4bit:
+            quantization_config = BitsAndBytesConfig(
+                load_in_4bit=True,
+                bnb_4bit_compute_dtype=torch.float16,
+                bnb_4bit_use_double_quant=True,
+                bnb_4bit_quant_type="nf4",
+            )
+        elif load_in_8bit:
+            quantization_config = BitsAndBytesConfig(load_in_8bit=True)
+
+        # Determine torch dtype
+        if torch_dtype == "auto":
+            dtype = torch.float16 if device != "cpu" else torch.float32
+        else:
+            dtype = getattr(torch, torch_dtype)
+
+        # Load model
+        self.model = AutoModelForCausalLM.from_pretrained(
+            model_name,
+            quantization_config=quantization_config,
+            torch_dtype=dtype if quantization_config is None else None,
+            device_map=device,
+            trust_remote_code=True,
+            low_cpu_mem_usage=True,
+        )
+        self.model.eval()
+
+    def generate(
+        self,
+        system_prompt: str,
+        user_prompt: str,
+        max_new_tokens: int = 512,
+        temperature: float = 0.1,
+        top_p: float = 0.9,
+        **kwargs,
+    ) -> LLMResponse:
+        """
+        Generate response using local model
+
+        Args:
+            system_prompt: System instruction
+            user_prompt: User query
+            max_new_tokens: Maximum tokens to generate
+            temperature: Sampling temperature (0 = greedy, higher = more random)
+            top_p: Nucleus sampling parameter
+            **kwargs: Additional generation parameters
+
+        Returns:
+            LLMResponse object
+        """
+        import torch
+
+        # Auto-determine sampling based on temperature
+        do_sample = temperature > 0
+
+        # Format prompt using model's chat template
+        messages = []
+        if system_prompt:
+            messages.append({"role": "system", "content": system_prompt})
+        messages.append({"role": "user", "content": user_prompt})
+
+        # Use model's native chat template if available
+        if hasattr(self.tokenizer, "apply_chat_template"):
+            formatted_prompt = self.tokenizer.apply_chat_template(
+                messages, tokenize=False, add_generation_prompt=True
+            )
+        else:
+            # Fallback formatting
+            formatted_prompt = ""
+            if system_prompt:
+                formatted_prompt += f"System: {system_prompt}\n\n"
+            formatted_prompt += f"User: {user_prompt}\n\nAssistant:"
+
+        # Tokenize
+        inputs = self.tokenizer(formatted_prompt, return_tensors="pt", truncation=True).to(
+            self.model.device
+        )
+
+        input_length = inputs["input_ids"].shape[1]
+
+        # Generate
+        with torch.no_grad():
+            outputs = self.model.generate(
+                **inputs,
+                max_new_tokens=max_new_tokens,
+                temperature=temperature if do_sample else 1.0,
+                top_p=top_p,
+                do_sample=do_sample,
+                pad_token_id=self.tokenizer.pad_token_id,
+                eos_token_id=self.tokenizer.eos_token_id,
+                **kwargs,
+            )
+
+        # Decode only the new tokens
+        generated_tokens = outputs[0][input_length:]
+        response_text = self.tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()
+
+        return LLMResponse(
+            content=response_text,
+            prompt_tokens=input_length,
+            completion_tokens=len(generated_tokens),
+            total_tokens=input_length + len(generated_tokens),
+            model_name=self._model_name,
+        )
+
+    def supports_json_mode(self) -> bool:
+        """Local models typically need prompt engineering for JSON"""
+        return False
+
+    @property
+    def model_name(self) -> str:
+        return self._model_name
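
Note on usage: the LLMProvider ABC near the top of this file is the extension point for the "Bring Your Own Brain" pattern described in the module docstring. Any object that implements generate(), supports_json_mode(), and the model_name property can stand in for the built-in providers. The sketch below is only an illustration against that interface; CallableProvider and the echo lambda are hypothetical names, not part of the package.

from sentience.llm_provider import LLMProvider, LLMResponse


class CallableProvider(LLMProvider):
    """Hypothetical adapter: wraps any (system, user) -> str callable as a provider."""

    def __init__(self, fn, name: str = "custom-callable"):
        self._fn = fn
        self._name = name

    def generate(self, system_prompt: str, user_prompt: str, **kwargs) -> LLMResponse:
        # Delegate to the wrapped callable; token counts are unknown here, so leave them None.
        text = self._fn(system_prompt, user_prompt)
        return LLMResponse(content=text, model_name=self._name, finish_reason="stop")

    def supports_json_mode(self) -> bool:
        # No native JSON mode; callers would have to request JSON in the prompt itself.
        return False

    @property
    def model_name(self) -> str:
        return self._name


# Same calling convention as the built-in providers above.
llm = CallableProvider(lambda system, user: f"[echo] {user}")
print(llm.generate("You are a helpful assistant", "Hello!").content)

Because every provider in this file returns the same LLMResponse dataclass, swapping OpenAIProvider for AnthropicProvider, GLMProvider, GeminiProvider, or LocalLLMProvider only changes the constructor call, not the code that reads response.content or the token counts.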