hindsight-api 0.2.1__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. hindsight_api/admin/__init__.py +1 -0
  2. hindsight_api/admin/cli.py +252 -0
  3. hindsight_api/alembic/versions/f1a2b3c4d5e6_add_memory_links_composite_index.py +44 -0
  4. hindsight_api/alembic/versions/g2a3b4c5d6e7_add_tags_column.py +48 -0
  5. hindsight_api/api/http.py +282 -20
  6. hindsight_api/api/mcp.py +47 -52
  7. hindsight_api/config.py +238 -6
  8. hindsight_api/engine/cross_encoder.py +599 -86
  9. hindsight_api/engine/db_budget.py +284 -0
  10. hindsight_api/engine/db_utils.py +11 -0
  11. hindsight_api/engine/embeddings.py +453 -26
  12. hindsight_api/engine/entity_resolver.py +8 -5
  13. hindsight_api/engine/interface.py +8 -4
  14. hindsight_api/engine/llm_wrapper.py +241 -27
  15. hindsight_api/engine/memory_engine.py +609 -122
  16. hindsight_api/engine/query_analyzer.py +4 -3
  17. hindsight_api/engine/response_models.py +38 -0
  18. hindsight_api/engine/retain/fact_extraction.py +388 -192
  19. hindsight_api/engine/retain/fact_storage.py +34 -8
  20. hindsight_api/engine/retain/link_utils.py +24 -16
  21. hindsight_api/engine/retain/orchestrator.py +52 -17
  22. hindsight_api/engine/retain/types.py +9 -0
  23. hindsight_api/engine/search/graph_retrieval.py +42 -13
  24. hindsight_api/engine/search/link_expansion_retrieval.py +256 -0
  25. hindsight_api/engine/search/mpfp_retrieval.py +362 -117
  26. hindsight_api/engine/search/reranking.py +2 -2
  27. hindsight_api/engine/search/retrieval.py +847 -200
  28. hindsight_api/engine/search/tags.py +172 -0
  29. hindsight_api/engine/search/think_utils.py +1 -1
  30. hindsight_api/engine/search/trace.py +12 -0
  31. hindsight_api/engine/search/tracer.py +24 -1
  32. hindsight_api/engine/search/types.py +21 -0
  33. hindsight_api/engine/task_backend.py +109 -18
  34. hindsight_api/engine/utils.py +1 -1
  35. hindsight_api/extensions/context.py +10 -1
  36. hindsight_api/main.py +56 -4
  37. hindsight_api/metrics.py +433 -48
  38. hindsight_api/migrations.py +141 -1
  39. hindsight_api/models.py +3 -1
  40. hindsight_api/pg0.py +53 -0
  41. hindsight_api/server.py +39 -2
  42. {hindsight_api-0.2.1.dist-info → hindsight_api-0.3.0.dist-info}/METADATA +5 -1
  43. hindsight_api-0.3.0.dist-info/RECORD +82 -0
  44. {hindsight_api-0.2.1.dist-info → hindsight_api-0.3.0.dist-info}/entry_points.txt +1 -0
  45. hindsight_api-0.2.1.dist-info/RECORD +0 -75
  46. {hindsight_api-0.2.1.dist-info → hindsight_api-0.3.0.dist-info}/WHEEL +0 -0
hindsight_api/engine/llm_wrapper.py
@@ -19,9 +19,12 @@ from openai import APIConnectionError, APIStatusError, AsyncOpenAI, LengthFinish
 from ..config import (
     DEFAULT_LLM_MAX_CONCURRENT,
     DEFAULT_LLM_TIMEOUT,
+    ENV_LLM_GROQ_SERVICE_TIER,
     ENV_LLM_MAX_CONCURRENT,
     ENV_LLM_TIMEOUT,
 )
+from ..metrics import get_metrics_collector
+from .response_models import TokenUsage

 # Seed applied to every Groq request for deterministic behavior.
 DEFAULT_LLM_SEED = 4242
@@ -63,6 +66,7 @@ class LLMProvider:
         base_url: str,
         model: str,
         reasoning_effort: str = "low",
+        groq_service_tier: str | None = None,
     ):
         """
         Initialize LLM provider.
@@ -73,18 +77,25 @@ class LLMProvider:
             base_url: Base URL for the API.
             model: Model name.
             reasoning_effort: Reasoning effort level for supported providers.
+            groq_service_tier: Groq service tier ("on_demand", "flex", "auto"). Default: None (uses Groq's default).
         """
         self.provider = provider.lower()
         self.api_key = api_key
         self.base_url = base_url
         self.model = model
         self.reasoning_effort = reasoning_effort
+        # Default to 'auto' for best performance, users can override to 'on_demand' for free tier
+        self.groq_service_tier = groq_service_tier or os.getenv(ENV_LLM_GROQ_SERVICE_TIER, "auto")

         # Validate provider
-        valid_providers = ["openai", "groq", "ollama", "gemini", "anthropic", "lmstudio"]
+        valid_providers = ["openai", "groq", "ollama", "gemini", "anthropic", "lmstudio", "mock"]
         if self.provider not in valid_providers:
             raise ValueError(f"Invalid LLM provider: {self.provider}. Must be one of: {', '.join(valid_providers)}")

+        # Mock provider tracking (for testing)
+        self._mock_calls: list[dict] = []
+        self._mock_response: Any = None
+
         # Set default base URLs
         if not self.base_url:
             if self.provider == "groq":
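In practice the new parameter is passed straight through the constructor. A minimal construction sketch based only on the signature shown above; the API key and model name are placeholders, and keyword usage is assumed:

    # Hypothetical usage sketch; only the parameter names come from the diff above.
    provider = LLMProvider(
        provider="groq",
        api_key="gsk_placeholder",
        base_url="",                      # empty -> falls back to the built-in Groq base URL
        model="example-model",            # placeholder model name
        reasoning_effort="low",
        groq_service_tier="on_demand",    # pass None to defer to the env var, then "auto"
    )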
@@ -94,8 +105,8 @@ class LLMProvider:
             elif self.provider == "lmstudio":
                 self.base_url = "http://localhost:1234/v1"

-        # Validate API key (not needed for ollama or lmstudio)
-        if self.provider not in ("ollama", "lmstudio") and not self.api_key:
+        # Validate API key (not needed for ollama, lmstudio, or mock)
+        if self.provider not in ("ollama", "lmstudio", "mock") and not self.api_key:
             raise ValueError(f"API key not found for {self.provider}")

         # Get timeout config (set HINDSIGHT_API_LLM_TIMEOUT for local LLMs that need longer timeouts)
@@ -106,7 +117,10 @@ class LLMProvider:
         self._gemini_client = None
         self._anthropic_client = None

-        if self.provider == "gemini":
+        if self.provider == "mock":
+            # Mock provider - no client needed
+            pass
+        elif self.provider == "gemini":
             self._gemini_client = genai.Client(api_key=self.api_key)
         elif self.provider == "anthropic":
             from anthropic import AsyncAnthropic
@@ -169,6 +183,7 @@ class LLMProvider:
         max_backoff: float = 60.0,
         skip_validation: bool = False,
         strict_schema: bool = False,
+        return_usage: bool = False,
     ) -> Any:
         """
         Make an LLM API call with retry logic.
@@ -184,21 +199,43 @@ class LLMProvider:
             max_backoff: Maximum backoff time in seconds.
             skip_validation: Return raw JSON without Pydantic validation.
             strict_schema: Use strict JSON schema enforcement (OpenAI only). Guarantees all required fields.
+            return_usage: If True, return tuple (result, TokenUsage) instead of just result.

         Returns:
-            Parsed response if response_format is provided, otherwise text content.
+            If return_usage=False: Parsed response if response_format is provided, otherwise text content.
+            If return_usage=True: Tuple of (result, TokenUsage) with token counts from the LLM call.

         Raises:
             OutputTooLongError: If output exceeds token limits.
             Exception: Re-raises API errors after retries exhausted.
         """
+        queue_start_time = time.time()
         async with _global_llm_semaphore:
             start_time = time.time()
+            semaphore_wait_time = start_time - queue_start_time
+
+            # Handle Mock provider (for testing)
+            if self.provider == "mock":
+                return await self._call_mock(
+                    messages,
+                    response_format,
+                    scope,
+                    return_usage,
+                )

             # Handle Gemini provider separately
             if self.provider == "gemini":
                 return await self._call_gemini(
-                    messages, response_format, max_retries, initial_backoff, max_backoff, skip_validation, start_time
+                    messages,
+                    response_format,
+                    max_retries,
+                    initial_backoff,
+                    max_backoff,
+                    skip_validation,
+                    start_time,
+                    scope,
+                    return_usage,
+                    semaphore_wait_time,
                 )

             # Handle Anthropic provider separately
@@ -212,6 +249,9 @@ class LLMProvider:
                    max_backoff,
                    skip_validation,
                    start_time,
+                    scope,
+                    return_usage,
+                    semaphore_wait_time,
                 )

             # Handle Ollama with native API for structured output (better schema enforcement)
@@ -226,6 +266,9 @@ class LLMProvider:
                    max_backoff,
                    skip_validation,
                    start_time,
+                    scope,
+                    return_usage,
+                    semaphore_wait_time,
                 )

             call_params = {
@@ -263,11 +306,15 @@ class LLMProvider:
             # Provider-specific parameters
             if self.provider == "groq":
                 call_params["seed"] = DEFAULT_LLM_SEED
-                extra_body = {"service_tier": "auto"}
-                # Only add reasoning parameters for reasoning models
+                extra_body: dict[str, Any] = {}
+                # Add service_tier if configured (requires paid plan for flex/auto)
+                if self.groq_service_tier:
+                    extra_body["service_tier"] = self.groq_service_tier
+                # Add reasoning parameters for reasoning models
                 if is_reasoning_model:
                     extra_body["include_reasoning"] = False
-                call_params["extra_body"] = extra_body
+                if extra_body:
+                    call_params["extra_body"] = extra_body

             last_exception = None

@@ -370,21 +417,46 @@ class LLMProvider:
                    response = await self._client.chat.completions.create(**call_params)
                     result = response.choices[0].message.content

-                    # Log slow calls
+                    # Record token usage metrics
                     duration = time.time() - start_time
                     usage = response.usage
-                    if duration > 10.0:
-                        ratio = max(1, usage.completion_tokens) / usage.prompt_tokens
+                    input_tokens = usage.prompt_tokens or 0 if usage else 0
+                    output_tokens = usage.completion_tokens or 0 if usage else 0
+                    total_tokens = usage.total_tokens or 0 if usage else 0
+
+                    # Record LLM metrics
+                    metrics = get_metrics_collector()
+                    metrics.record_llm_call(
+                        provider=self.provider,
+                        model=self.model,
+                        scope=scope,
+                        duration=duration,
+                        input_tokens=input_tokens,
+                        output_tokens=output_tokens,
+                        success=True,
+                    )
+
+                    # Log slow calls
+                    if duration > 10.0 and usage:
+                        ratio = max(1, output_tokens) / max(1, input_tokens)
                         cached_tokens = 0
                         if hasattr(usage, "prompt_tokens_details") and usage.prompt_tokens_details:
                             cached_tokens = getattr(usage.prompt_tokens_details, "cached_tokens", 0) or 0
                         cache_info = f", cached_tokens={cached_tokens}" if cached_tokens > 0 else ""
+                        wait_info = f", wait={semaphore_wait_time:.3f}s" if semaphore_wait_time > 0.1 else ""
                         logger.info(
-                            f"slow llm call: model={self.provider}/{self.model}, "
-                            f"input_tokens={usage.prompt_tokens}, output_tokens={usage.completion_tokens}, "
-                            f"total_tokens={usage.total_tokens}{cache_info}, time={duration:.3f}s, ratio out/in={ratio:.2f}"
+                            f"slow llm call: scope={scope}, model={self.provider}/{self.model}, "
+                            f"input_tokens={input_tokens}, output_tokens={output_tokens}, "
+                            f"total_tokens={total_tokens}{cache_info}, time={duration:.3f}s{wait_info}, ratio out/in={ratio:.2f}"
                         )

+                    if return_usage:
+                        token_usage = TokenUsage(
+                            input_tokens=input_tokens,
+                            output_tokens=output_tokens,
+                            total_tokens=total_tokens,
+                        )
+                        return result, token_usage
                     return result

                 except LengthFinishReasonError as e:
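TokenUsage is imported at the top of this file from .response_models and is only ever constructed with the three token counts. A plausible sketch of its shape, inferred from that usage rather than taken from response_models.py (which is not shown in this excerpt):

    # Inferred sketch of TokenUsage; the authoritative definition lives in
    # hindsight_api/engine/response_models.py, not reproduced in this diff.
    from pydantic import BaseModel

    class TokenUsage(BaseModel):
        input_tokens: int = 0
        output_tokens: int = 0
        total_tokens: int = 0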
@@ -443,6 +515,9 @@ class LLMProvider:
         max_backoff: float,
         skip_validation: bool,
         start_time: float,
+        scope: str = "memory",
+        return_usage: bool = False,
+        semaphore_wait_time: float = 0.0,
     ) -> Any:
         """Handle Anthropic-specific API calls."""
         from anthropic import APIConnectionError, APIStatusError, RateLimitError
@@ -515,17 +590,40 @@ class LLMProvider:
                else:
                     result = content

-                # Log slow calls
+                # Record metrics and log slow calls
                 duration = time.time() - start_time
+                input_tokens = response.usage.input_tokens or 0 if response.usage else 0
+                output_tokens = response.usage.output_tokens or 0 if response.usage else 0
+                total_tokens = input_tokens + output_tokens
+
+                # Record LLM metrics
+                metrics = get_metrics_collector()
+                metrics.record_llm_call(
+                    provider=self.provider,
+                    model=self.model,
+                    scope=scope,
+                    duration=duration,
+                    input_tokens=input_tokens,
+                    output_tokens=output_tokens,
+                    success=True,
+                )
+
+                # Log slow calls
                 if duration > 10.0:
-                    input_tokens = response.usage.input_tokens
-                    output_tokens = response.usage.output_tokens
+                    wait_info = f", wait={semaphore_wait_time:.3f}s" if semaphore_wait_time > 0.1 else ""
                     logger.info(
-                        f"slow llm call: model={self.provider}/{self.model}, "
+                        f"slow llm call: scope={scope}, model={self.provider}/{self.model}, "
                         f"input_tokens={input_tokens}, output_tokens={output_tokens}, "
-                        f"time={duration:.3f}s"
+                        f"time={duration:.3f}s{wait_info}"
                     )

+                if return_usage:
+                    token_usage = TokenUsage(
+                        input_tokens=input_tokens,
+                        output_tokens=output_tokens,
+                        total_tokens=total_tokens,
+                    )
+                    return result, token_usage
                 return result

             except json.JSONDecodeError as e:
@@ -580,6 +678,9 @@ class LLMProvider:
         max_backoff: float,
         skip_validation: bool,
         start_time: float,
+        scope: str = "memory",
+        return_usage: bool = False,
+        semaphore_wait_time: float = 0.0,
     ) -> Any:
         """
         Call Ollama using native API with JSON schema enforcement.
@@ -654,11 +755,39 @@ class LLMProvider:
                    else:
                         raise

+                # Extract token usage from Ollama response
+                # Ollama returns prompt_eval_count (input) and eval_count (output)
+                duration = time.time() - start_time
+                input_tokens = result.get("prompt_eval_count", 0) or 0
+                output_tokens = result.get("eval_count", 0) or 0
+                total_tokens = input_tokens + output_tokens
+
+                # Record LLM metrics
+                metrics = get_metrics_collector()
+                metrics.record_llm_call(
+                    provider=self.provider,
+                    model=self.model,
+                    scope=scope,
+                    duration=duration,
+                    input_tokens=input_tokens,
+                    output_tokens=output_tokens,
+                    success=True,
+                )
+
                 # Validate against Pydantic model or return raw JSON
                 if skip_validation:
-                    return json_data
+                    validated_result = json_data
                 else:
-                    return response_format.model_validate(json_data)
+                    validated_result = response_format.model_validate(json_data)
+
+                if return_usage:
+                    token_usage = TokenUsage(
+                        input_tokens=input_tokens,
+                        output_tokens=output_tokens,
+                        total_tokens=total_tokens,
+                    )
+                    return validated_result, token_usage
+                return validated_result

             except httpx.HTTPStatusError as e:
                 last_exception = e
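For reference, the Ollama native chat endpoint reports token counts in prompt_eval_count and eval_count, which is what the extraction above reads. An abridged, illustrative response shape (values invented):

    # Abridged illustration of the Ollama /api/chat JSON this code reads; values are made up.
    result = {
        "message": {"role": "assistant", "content": "{\"answer\": 42}"},
        "prompt_eval_count": 512,   # input tokens
        "eval_count": 128,          # output tokens
        "done": True,
    }
    input_tokens = result.get("prompt_eval_count", 0) or 0   # -> 512
    output_tokens = result.get("eval_count", 0) or 0         # -> 128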
@@ -701,6 +830,9 @@ class LLMProvider:
         max_backoff: float,
         skip_validation: bool,
         start_time: float,
+        scope: str = "memory",
+        return_usage: bool = False,
+        semaphore_wait_time: float = 0.0,
     ) -> Any:
         """Handle Gemini-specific API calls."""
         # Convert OpenAI-style messages to Gemini format
@@ -777,16 +909,43 @@ class LLMProvider:
                else:
                     result = content

-                # Log slow calls
+                # Record metrics and log slow calls
                 duration = time.time() - start_time
-                if duration > 10.0 and hasattr(response, "usage_metadata") and response.usage_metadata:
+                input_tokens = 0
+                output_tokens = 0
+                if hasattr(response, "usage_metadata") and response.usage_metadata:
                     usage = response.usage_metadata
+                    input_tokens = usage.prompt_token_count or 0
+                    output_tokens = usage.candidates_token_count or 0
+
+                # Record LLM metrics
+                metrics = get_metrics_collector()
+                metrics.record_llm_call(
+                    provider=self.provider,
+                    model=self.model,
+                    scope=scope,
+                    duration=duration,
+                    input_tokens=input_tokens,
+                    output_tokens=output_tokens,
+                    success=True,
+                )
+
+                # Log slow calls
+                if duration > 10.0 and input_tokens > 0:
+                    wait_info = f", wait={semaphore_wait_time:.3f}s" if semaphore_wait_time > 0.1 else ""
                     logger.info(
-                        f"slow llm call: model={self.provider}/{self.model}, "
-                        f"input_tokens={usage.prompt_token_count}, output_tokens={usage.candidates_token_count}, "
-                        f"time={duration:.3f}s"
+                        f"slow llm call: scope={scope}, model={self.provider}/{self.model}, "
+                        f"input_tokens={input_tokens}, output_tokens={output_tokens}, "
+                        f"time={duration:.3f}s{wait_info}"
                     )

+                if return_usage:
+                    token_usage = TokenUsage(
+                        input_tokens=input_tokens,
+                        output_tokens=output_tokens,
+                        total_tokens=input_tokens + output_tokens,
+                    )
+                    return result, token_usage
                 return result

             except json.JSONDecodeError as e:
@@ -828,6 +987,61 @@ class LLMProvider:
            raise last_exception
         raise RuntimeError("Gemini call failed after all retries")

+    async def _call_mock(
+        self,
+        messages: list[dict[str, str]],
+        response_format: Any | None,
+        scope: str,
+        return_usage: bool,
+    ) -> Any:
+        """
+        Handle mock provider calls for testing.
+
+        Records the call and returns a configurable mock response.
+        """
+        # Record the call for test verification
+        call_record = {
+            "provider": self.provider,
+            "model": self.model,
+            "messages": messages,
+            "response_format": response_format.__name__
+            if response_format and hasattr(response_format, "__name__")
+            else str(response_format),
+            "scope": scope,
+        }
+        self._mock_calls.append(call_record)
+        logger.debug(f"Mock LLM call recorded: scope={scope}, model={self.model}")
+
+        # Return mock response
+        if self._mock_response is not None:
+            result = self._mock_response
+        elif response_format is not None:
+            # Try to create a minimal valid instance of the response format
+            try:
+                # For Pydantic models, try to create with minimal valid data
+                result = {"mock": True}
+            except Exception:
+                result = {"mock": True}
+        else:
+            result = "mock response"
+
+        if return_usage:
+            token_usage = TokenUsage(input_tokens=10, output_tokens=5, total_tokens=15)
+            return result, token_usage
+        return result
+
+    def set_mock_response(self, response: Any) -> None:
+        """Set the response to return from mock calls."""
+        self._mock_response = response
+
+    def get_mock_calls(self) -> list[dict]:
+        """Get the list of recorded mock calls."""
+        return self._mock_calls
+
+    def clear_mock_calls(self) -> None:
+        """Clear the recorded mock calls."""
+        self._mock_calls = []
+
     @classmethod
     def for_memory(cls) -> "LLMProvider":
         """Create provider for memory operations from environment variables."""