hindsight-api 0.2.1__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hindsight_api/admin/__init__.py +1 -0
- hindsight_api/admin/cli.py +252 -0
- hindsight_api/alembic/versions/f1a2b3c4d5e6_add_memory_links_composite_index.py +44 -0
- hindsight_api/alembic/versions/g2a3b4c5d6e7_add_tags_column.py +48 -0
- hindsight_api/api/http.py +282 -20
- hindsight_api/api/mcp.py +47 -52
- hindsight_api/config.py +238 -6
- hindsight_api/engine/cross_encoder.py +599 -86
- hindsight_api/engine/db_budget.py +284 -0
- hindsight_api/engine/db_utils.py +11 -0
- hindsight_api/engine/embeddings.py +453 -26
- hindsight_api/engine/entity_resolver.py +8 -5
- hindsight_api/engine/interface.py +8 -4
- hindsight_api/engine/llm_wrapper.py +241 -27
- hindsight_api/engine/memory_engine.py +609 -122
- hindsight_api/engine/query_analyzer.py +4 -3
- hindsight_api/engine/response_models.py +38 -0
- hindsight_api/engine/retain/fact_extraction.py +388 -192
- hindsight_api/engine/retain/fact_storage.py +34 -8
- hindsight_api/engine/retain/link_utils.py +24 -16
- hindsight_api/engine/retain/orchestrator.py +52 -17
- hindsight_api/engine/retain/types.py +9 -0
- hindsight_api/engine/search/graph_retrieval.py +42 -13
- hindsight_api/engine/search/link_expansion_retrieval.py +256 -0
- hindsight_api/engine/search/mpfp_retrieval.py +362 -117
- hindsight_api/engine/search/reranking.py +2 -2
- hindsight_api/engine/search/retrieval.py +847 -200
- hindsight_api/engine/search/tags.py +172 -0
- hindsight_api/engine/search/think_utils.py +1 -1
- hindsight_api/engine/search/trace.py +12 -0
- hindsight_api/engine/search/tracer.py +24 -1
- hindsight_api/engine/search/types.py +21 -0
- hindsight_api/engine/task_backend.py +109 -18
- hindsight_api/engine/utils.py +1 -1
- hindsight_api/extensions/context.py +10 -1
- hindsight_api/main.py +56 -4
- hindsight_api/metrics.py +433 -48
- hindsight_api/migrations.py +141 -1
- hindsight_api/models.py +3 -1
- hindsight_api/pg0.py +53 -0
- hindsight_api/server.py +39 -2
- {hindsight_api-0.2.1.dist-info → hindsight_api-0.3.0.dist-info}/METADATA +5 -1
- hindsight_api-0.3.0.dist-info/RECORD +82 -0
- {hindsight_api-0.2.1.dist-info → hindsight_api-0.3.0.dist-info}/entry_points.txt +1 -0
- hindsight_api-0.2.1.dist-info/RECORD +0 -75
- {hindsight_api-0.2.1.dist-info → hindsight_api-0.3.0.dist-info}/WHEEL +0 -0
hindsight_api/engine/llm_wrapper.py

@@ -19,9 +19,12 @@ from openai import APIConnectionError, APIStatusError, AsyncOpenAI, LengthFinish
 from ..config import (
     DEFAULT_LLM_MAX_CONCURRENT,
     DEFAULT_LLM_TIMEOUT,
+    ENV_LLM_GROQ_SERVICE_TIER,
     ENV_LLM_MAX_CONCURRENT,
     ENV_LLM_TIMEOUT,
 )
+from ..metrics import get_metrics_collector
+from .response_models import TokenUsage

 # Seed applied to every Groq request for deterministic behavior.
 DEFAULT_LLM_SEED = 4242
@@ -63,6 +66,7 @@ class LLMProvider:
         base_url: str,
         model: str,
         reasoning_effort: str = "low",
+        groq_service_tier: str | None = None,
     ):
         """
         Initialize LLM provider.
@@ -73,18 +77,25 @@ class LLMProvider:
             base_url: Base URL for the API.
             model: Model name.
             reasoning_effort: Reasoning effort level for supported providers.
+            groq_service_tier: Groq service tier ("on_demand", "flex", "auto"). Default: None (uses Groq's default).
         """
         self.provider = provider.lower()
         self.api_key = api_key
         self.base_url = base_url
         self.model = model
         self.reasoning_effort = reasoning_effort
+        # Default to 'auto' for best performance, users can override to 'on_demand' for free tier
+        self.groq_service_tier = groq_service_tier or os.getenv(ENV_LLM_GROQ_SERVICE_TIER, "auto")

         # Validate provider
-        valid_providers = ["openai", "groq", "ollama", "gemini", "anthropic", "lmstudio"]
+        valid_providers = ["openai", "groq", "ollama", "gemini", "anthropic", "lmstudio", "mock"]
         if self.provider not in valid_providers:
             raise ValueError(f"Invalid LLM provider: {self.provider}. Must be one of: {', '.join(valid_providers)}")

+        # Mock provider tracking (for testing)
+        self._mock_calls: list[dict] = []
+        self._mock_response: Any = None
+
         # Set default base URLs
         if not self.base_url:
             if self.provider == "groq":
@@ -94,8 +105,8 @@ class LLMProvider:
             elif self.provider == "lmstudio":
                 self.base_url = "http://localhost:1234/v1"

-        # Validate API key (not needed for ollama or lmstudio)
-        if self.provider not in ("ollama", "lmstudio") and not self.api_key:
+        # Validate API key (not needed for ollama, lmstudio, or mock)
+        if self.provider not in ("ollama", "lmstudio", "mock") and not self.api_key:
             raise ValueError(f"API key not found for {self.provider}")

         # Get timeout config (set HINDSIGHT_API_LLM_TIMEOUT for local LLMs that need longer timeouts)
@@ -106,7 +117,10 @@ class LLMProvider:
         self._gemini_client = None
         self._anthropic_client = None

-        if self.provider == "gemini":
+        if self.provider == "mock":
+            # Mock provider - no client needed
+            pass
+        elif self.provider == "gemini":
             self._gemini_client = genai.Client(api_key=self.api_key)
         elif self.provider == "anthropic":
             from anthropic import AsyncAnthropic
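A minimal sketch of how the new `groq_service_tier` knob could be exercised, using only the constructor keywords visible in the hunks above. The model name and API key are illustrative placeholders, and the concrete environment variable behind `ENV_LLM_GROQ_SERVICE_TIER` is defined in `hindsight_api/config.py` and not shown in this diff.

```python
# Sketch only: keyword names are taken from the constructor hunk above;
# the key and model values are placeholders, not real credentials.
from hindsight_api.engine.llm_wrapper import LLMProvider

provider = LLMProvider(
    provider="groq",
    api_key="gsk_example_key",            # placeholder
    base_url="",                          # empty -> provider default is filled in
    model="llama-3.3-70b-versatile",      # illustrative model name
    groq_service_tier="on_demand",        # "flex"/"auto" require a paid Groq plan
)

# Omitting groq_service_tier falls back to the ENV_LLM_GROQ_SERVICE_TIER
# environment variable, then to "auto".
```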
@@ -169,6 +183,7 @@ class LLMProvider:
         max_backoff: float = 60.0,
         skip_validation: bool = False,
         strict_schema: bool = False,
+        return_usage: bool = False,
     ) -> Any:
         """
         Make an LLM API call with retry logic.
@@ -184,21 +199,43 @@ class LLMProvider:
             max_backoff: Maximum backoff time in seconds.
             skip_validation: Return raw JSON without Pydantic validation.
             strict_schema: Use strict JSON schema enforcement (OpenAI only). Guarantees all required fields.
+            return_usage: If True, return tuple (result, TokenUsage) instead of just result.

         Returns:
-            Parsed response if response_format is provided, otherwise text content.
+            If return_usage=False: Parsed response if response_format is provided, otherwise text content.
+            If return_usage=True: Tuple of (result, TokenUsage) with token counts from the LLM call.

         Raises:
             OutputTooLongError: If output exceeds token limits.
             Exception: Re-raises API errors after retries exhausted.
         """
+        queue_start_time = time.time()
         async with _global_llm_semaphore:
             start_time = time.time()
+            semaphore_wait_time = start_time - queue_start_time
+
+            # Handle Mock provider (for testing)
+            if self.provider == "mock":
+                return await self._call_mock(
+                    messages,
+                    response_format,
+                    scope,
+                    return_usage,
+                )

             # Handle Gemini provider separately
             if self.provider == "gemini":
                 return await self._call_gemini(
-                    messages,
+                    messages,
+                    response_format,
+                    max_retries,
+                    initial_backoff,
+                    max_backoff,
+                    skip_validation,
+                    start_time,
+                    scope,
+                    return_usage,
+                    semaphore_wait_time,
                 )

             # Handle Anthropic provider separately
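A sketch of the `return_usage` contract described in the docstring above. The public method name is not visible in this hunk, so `call` below is a hypothetical placeholder; the tuple unpacking and the `TokenUsage` fields follow the diff.

```python
# Sketch: "call" is a placeholder name for the retry-wrapped entry point whose
# signature is extended above; TokenUsage comes from .response_models.
async def summarize_with_usage(provider) -> None:
    result, usage = await provider.call(  # hypothetical method name
        messages=[{"role": "user", "content": "Summarize the last session."}],
        return_usage=True,
    )
    # usage carries input_tokens / output_tokens / total_tokens for the call.
    print(result, usage.input_tokens, usage.output_tokens, usage.total_tokens)

    # Without return_usage, the same call returns just the parsed/text result.
    result_only = await provider.call(
        messages=[{"role": "user", "content": "Summarize the last session."}],
    )
    print(result_only)
```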
@@ -212,6 +249,9 @@ class LLMProvider:
                     max_backoff,
                     skip_validation,
                     start_time,
+                    scope,
+                    return_usage,
+                    semaphore_wait_time,
                 )

             # Handle Ollama with native API for structured output (better schema enforcement)
@@ -226,6 +266,9 @@ class LLMProvider:
                     max_backoff,
                     skip_validation,
                     start_time,
+                    scope,
+                    return_usage,
+                    semaphore_wait_time,
                 )

             call_params = {
@@ -263,11 +306,15 @@ class LLMProvider:
             # Provider-specific parameters
             if self.provider == "groq":
                 call_params["seed"] = DEFAULT_LLM_SEED
-                extra_body = {
-                    #
+                extra_body: dict[str, Any] = {}
+                # Add service_tier if configured (requires paid plan for flex/auto)
+                if self.groq_service_tier:
+                    extra_body["service_tier"] = self.groq_service_tier
+                # Add reasoning parameters for reasoning models
                 if is_reasoning_model:
                     extra_body["include_reasoning"] = False
-
+                if extra_body:
+                    call_params["extra_body"] = extra_body

             last_exception = None

@@ -370,21 +417,46 @@ class LLMProvider:
                     response = await self._client.chat.completions.create(**call_params)
                     result = response.choices[0].message.content

-                    #
+                    # Record token usage metrics
                     duration = time.time() - start_time
                     usage = response.usage
-                    if
-
+                    input_tokens = usage.prompt_tokens or 0 if usage else 0
+                    output_tokens = usage.completion_tokens or 0 if usage else 0
+                    total_tokens = usage.total_tokens or 0 if usage else 0
+
+                    # Record LLM metrics
+                    metrics = get_metrics_collector()
+                    metrics.record_llm_call(
+                        provider=self.provider,
+                        model=self.model,
+                        scope=scope,
+                        duration=duration,
+                        input_tokens=input_tokens,
+                        output_tokens=output_tokens,
+                        success=True,
+                    )
+
+                    # Log slow calls
+                    if duration > 10.0 and usage:
+                        ratio = max(1, output_tokens) / max(1, input_tokens)
                         cached_tokens = 0
                         if hasattr(usage, "prompt_tokens_details") and usage.prompt_tokens_details:
                             cached_tokens = getattr(usage.prompt_tokens_details, "cached_tokens", 0) or 0
                         cache_info = f", cached_tokens={cached_tokens}" if cached_tokens > 0 else ""
+                        wait_info = f", wait={semaphore_wait_time:.3f}s" if semaphore_wait_time > 0.1 else ""
                         logger.info(
-                            f"slow llm call: model={self.provider}/{self.model}, "
-                            f"input_tokens={
-                            f"total_tokens={
+                            f"slow llm call: scope={scope}, model={self.provider}/{self.model}, "
+                            f"input_tokens={input_tokens}, output_tokens={output_tokens}, "
+                            f"total_tokens={total_tokens}{cache_info}, time={duration:.3f}s{wait_info}, ratio out/in={ratio:.2f}"
                         )

+                    if return_usage:
+                        token_usage = TokenUsage(
+                            input_tokens=input_tokens,
+                            output_tokens=output_tokens,
+                            total_tokens=total_tokens,
+                        )
+                        return result, token_usage
                     return result

                 except LengthFinishReasonError as e:
@@ -443,6 +515,9 @@ class LLMProvider:
         max_backoff: float,
         skip_validation: bool,
         start_time: float,
+        scope: str = "memory",
+        return_usage: bool = False,
+        semaphore_wait_time: float = 0.0,
     ) -> Any:
         """Handle Anthropic-specific API calls."""
         from anthropic import APIConnectionError, APIStatusError, RateLimitError
@@ -515,17 +590,40 @@ class LLMProvider:
                     else:
                         result = content

-                    #
+                    # Record metrics and log slow calls
                     duration = time.time() - start_time
+                    input_tokens = response.usage.input_tokens or 0 if response.usage else 0
+                    output_tokens = response.usage.output_tokens or 0 if response.usage else 0
+                    total_tokens = input_tokens + output_tokens
+
+                    # Record LLM metrics
+                    metrics = get_metrics_collector()
+                    metrics.record_llm_call(
+                        provider=self.provider,
+                        model=self.model,
+                        scope=scope,
+                        duration=duration,
+                        input_tokens=input_tokens,
+                        output_tokens=output_tokens,
+                        success=True,
+                    )
+
+                    # Log slow calls
                     if duration > 10.0:
-
-                        output_tokens = response.usage.output_tokens
+                        wait_info = f", wait={semaphore_wait_time:.3f}s" if semaphore_wait_time > 0.1 else ""
                         logger.info(
-                            f"slow llm call: model={self.provider}/{self.model}, "
+                            f"slow llm call: scope={scope}, model={self.provider}/{self.model}, "
                             f"input_tokens={input_tokens}, output_tokens={output_tokens}, "
-                            f"time={duration:.3f}s"
+                            f"time={duration:.3f}s{wait_info}"
                         )

+                    if return_usage:
+                        token_usage = TokenUsage(
+                            input_tokens=input_tokens,
+                            output_tokens=output_tokens,
+                            total_tokens=total_tokens,
+                        )
+                        return result, token_usage
                     return result

                 except json.JSONDecodeError as e:
@@ -580,6 +678,9 @@ class LLMProvider:
         max_backoff: float,
         skip_validation: bool,
         start_time: float,
+        scope: str = "memory",
+        return_usage: bool = False,
+        semaphore_wait_time: float = 0.0,
     ) -> Any:
         """
         Call Ollama using native API with JSON schema enforcement.
@@ -654,11 +755,39 @@ class LLMProvider:
                     else:
                         raise

+                # Extract token usage from Ollama response
+                # Ollama returns prompt_eval_count (input) and eval_count (output)
+                duration = time.time() - start_time
+                input_tokens = result.get("prompt_eval_count", 0) or 0
+                output_tokens = result.get("eval_count", 0) or 0
+                total_tokens = input_tokens + output_tokens
+
+                # Record LLM metrics
+                metrics = get_metrics_collector()
+                metrics.record_llm_call(
+                    provider=self.provider,
+                    model=self.model,
+                    scope=scope,
+                    duration=duration,
+                    input_tokens=input_tokens,
+                    output_tokens=output_tokens,
+                    success=True,
+                )
+
                 # Validate against Pydantic model or return raw JSON
                 if skip_validation:
-
+                    validated_result = json_data
                 else:
-
+                    validated_result = response_format.model_validate(json_data)
+
+                if return_usage:
+                    token_usage = TokenUsage(
+                        input_tokens=input_tokens,
+                        output_tokens=output_tokens,
+                        total_tokens=total_tokens,
+                    )
+                    return validated_result, token_usage
+                return validated_result

             except httpx.HTTPStatusError as e:
                 last_exception = e
@@ -701,6 +830,9 @@ class LLMProvider:
         max_backoff: float,
         skip_validation: bool,
         start_time: float,
+        scope: str = "memory",
+        return_usage: bool = False,
+        semaphore_wait_time: float = 0.0,
     ) -> Any:
         """Handle Gemini-specific API calls."""
         # Convert OpenAI-style messages to Gemini format
@@ -777,16 +909,43 @@ class LLMProvider:
                 else:
                     result = content

-                #
+                # Record metrics and log slow calls
                 duration = time.time() - start_time
-
+                input_tokens = 0
+                output_tokens = 0
+                if hasattr(response, "usage_metadata") and response.usage_metadata:
                     usage = response.usage_metadata
+                    input_tokens = usage.prompt_token_count or 0
+                    output_tokens = usage.candidates_token_count or 0
+
+                # Record LLM metrics
+                metrics = get_metrics_collector()
+                metrics.record_llm_call(
+                    provider=self.provider,
+                    model=self.model,
+                    scope=scope,
+                    duration=duration,
+                    input_tokens=input_tokens,
+                    output_tokens=output_tokens,
+                    success=True,
+                )
+
+                # Log slow calls
+                if duration > 10.0 and input_tokens > 0:
+                    wait_info = f", wait={semaphore_wait_time:.3f}s" if semaphore_wait_time > 0.1 else ""
                     logger.info(
-                        f"slow llm call: model={self.provider}/{self.model}, "
-                        f"input_tokens={
-                        f"time={duration:.3f}s"
+                        f"slow llm call: scope={scope}, model={self.provider}/{self.model}, "
+                        f"input_tokens={input_tokens}, output_tokens={output_tokens}, "
+                        f"time={duration:.3f}s{wait_info}"
                     )

+                if return_usage:
+                    token_usage = TokenUsage(
+                        input_tokens=input_tokens,
+                        output_tokens=output_tokens,
+                        total_tokens=input_tokens + output_tokens,
+                    )
+                    return result, token_usage
                 return result

             except json.JSONDecodeError as e:
@@ -828,6 +987,61 @@ class LLMProvider:
             raise last_exception
         raise RuntimeError("Gemini call failed after all retries")

+    async def _call_mock(
+        self,
+        messages: list[dict[str, str]],
+        response_format: Any | None,
+        scope: str,
+        return_usage: bool,
+    ) -> Any:
+        """
+        Handle mock provider calls for testing.
+
+        Records the call and returns a configurable mock response.
+        """
+        # Record the call for test verification
+        call_record = {
+            "provider": self.provider,
+            "model": self.model,
+            "messages": messages,
+            "response_format": response_format.__name__
+            if response_format and hasattr(response_format, "__name__")
+            else str(response_format),
+            "scope": scope,
+        }
+        self._mock_calls.append(call_record)
+        logger.debug(f"Mock LLM call recorded: scope={scope}, model={self.model}")
+
+        # Return mock response
+        if self._mock_response is not None:
+            result = self._mock_response
+        elif response_format is not None:
+            # Try to create a minimal valid instance of the response format
+            try:
+                # For Pydantic models, try to create with minimal valid data
+                result = {"mock": True}
+            except Exception:
+                result = {"mock": True}
+        else:
+            result = "mock response"
+
+        if return_usage:
+            token_usage = TokenUsage(input_tokens=10, output_tokens=5, total_tokens=15)
+            return result, token_usage
+        return result
+
+    def set_mock_response(self, response: Any) -> None:
+        """Set the response to return from mock calls."""
+        self._mock_response = response
+
+    def get_mock_calls(self) -> list[dict]:
+        """Get the list of recorded mock calls."""
+        return self._mock_calls
+
+    def clear_mock_calls(self) -> None:
+        """Clear the recorded mock calls."""
+        self._mock_calls = []
+
     @classmethod
     def for_memory(cls) -> "LLMProvider":
         """Create provider for memory operations from environment variables."""
|