hindsight-api 0.1.4__py3-none-any.whl → 0.1.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63)
  1. hindsight_api/__init__.py +10 -9
  2. hindsight_api/alembic/env.py +5 -8
  3. hindsight_api/alembic/versions/5a366d414dce_initial_schema.py +266 -180
  4. hindsight_api/alembic/versions/b7c4d8e9f1a2_add_chunks_table.py +32 -32
  5. hindsight_api/alembic/versions/c8e5f2a3b4d1_add_retain_params_to_documents.py +11 -11
  6. hindsight_api/alembic/versions/d9f6a3b4c5e2_rename_bank_to_interactions.py +7 -12
  7. hindsight_api/alembic/versions/e0a1b2c3d4e5_disposition_to_3_traits.py +23 -15
  8. hindsight_api/alembic/versions/rename_personality_to_disposition.py +30 -21
  9. hindsight_api/api/__init__.py +10 -10
  10. hindsight_api/api/http.py +575 -593
  11. hindsight_api/api/mcp.py +31 -33
  12. hindsight_api/banner.py +13 -6
  13. hindsight_api/config.py +17 -12
  14. hindsight_api/engine/__init__.py +9 -9
  15. hindsight_api/engine/cross_encoder.py +23 -27
  16. hindsight_api/engine/db_utils.py +5 -4
  17. hindsight_api/engine/embeddings.py +22 -21
  18. hindsight_api/engine/entity_resolver.py +81 -75
  19. hindsight_api/engine/llm_wrapper.py +74 -88
  20. hindsight_api/engine/memory_engine.py +663 -673
  21. hindsight_api/engine/query_analyzer.py +100 -97
  22. hindsight_api/engine/response_models.py +105 -106
  23. hindsight_api/engine/retain/__init__.py +9 -16
  24. hindsight_api/engine/retain/bank_utils.py +34 -58
  25. hindsight_api/engine/retain/chunk_storage.py +4 -12
  26. hindsight_api/engine/retain/deduplication.py +9 -28
  27. hindsight_api/engine/retain/embedding_processing.py +4 -11
  28. hindsight_api/engine/retain/embedding_utils.py +3 -4
  29. hindsight_api/engine/retain/entity_processing.py +7 -17
  30. hindsight_api/engine/retain/fact_extraction.py +155 -165
  31. hindsight_api/engine/retain/fact_storage.py +11 -23
  32. hindsight_api/engine/retain/link_creation.py +11 -39
  33. hindsight_api/engine/retain/link_utils.py +166 -95
  34. hindsight_api/engine/retain/observation_regeneration.py +39 -52
  35. hindsight_api/engine/retain/orchestrator.py +72 -62
  36. hindsight_api/engine/retain/types.py +49 -43
  37. hindsight_api/engine/search/__init__.py +15 -1
  38. hindsight_api/engine/search/fusion.py +6 -15
  39. hindsight_api/engine/search/graph_retrieval.py +234 -0
  40. hindsight_api/engine/search/mpfp_retrieval.py +438 -0
  41. hindsight_api/engine/search/observation_utils.py +9 -16
  42. hindsight_api/engine/search/reranking.py +4 -7
  43. hindsight_api/engine/search/retrieval.py +388 -193
  44. hindsight_api/engine/search/scoring.py +5 -7
  45. hindsight_api/engine/search/temporal_extraction.py +8 -11
  46. hindsight_api/engine/search/think_utils.py +115 -39
  47. hindsight_api/engine/search/trace.py +68 -38
  48. hindsight_api/engine/search/tracer.py +49 -35
  49. hindsight_api/engine/search/types.py +22 -16
  50. hindsight_api/engine/task_backend.py +21 -26
  51. hindsight_api/engine/utils.py +25 -10
  52. hindsight_api/main.py +21 -40
  53. hindsight_api/mcp_local.py +190 -0
  54. hindsight_api/metrics.py +44 -30
  55. hindsight_api/migrations.py +10 -8
  56. hindsight_api/models.py +60 -72
  57. hindsight_api/pg0.py +64 -337
  58. hindsight_api/server.py +3 -6
  59. {hindsight_api-0.1.4.dist-info → hindsight_api-0.1.6.dist-info}/METADATA +6 -5
  60. hindsight_api-0.1.6.dist-info/RECORD +64 -0
  61. {hindsight_api-0.1.4.dist-info → hindsight_api-0.1.6.dist-info}/entry_points.txt +1 -0
  62. hindsight_api-0.1.4.dist-info/RECORD +0 -61
  63. {hindsight_api-0.1.4.dist-info → hindsight_api-0.1.6.dist-info}/WHEEL +0 -0
@@ -6,9 +6,9 @@ API response models should be kept separate and convert from these core models t
  API stability even if internal models change.
  """

- from typing import Optional, List, Dict, Any
- from pydantic import BaseModel, Field, ConfigDict
+ from typing import Any

+ from pydantic import BaseModel, ConfigDict, Field

  # Valid fact types for recall operations (excludes 'observation' which is internal)
  VALID_RECALL_FACT_TYPES = frozenset(["world", "experience", "opinion"])
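The typing import shrinks to `Any` because this release moves the models to builtin generics (PEP 585) and `X | None` unions (PEP 604), which require Python 3.10+. A minimal sketch of the before/after annotation styles (the field names here are illustrative, not from the package):

```python
from typing import Any, Dict, List, Optional

from pydantic import BaseModel


class OldStyle(BaseModel):
    # 0.1.4-era annotations via the typing module
    entities: Optional[List[str]] = None
    metadata: Optional[Dict[str, str]] = None


class NewStyle(BaseModel):
    # 0.1.6-era annotations: builtin generics plus union syntax
    entities: list[str] | None = None
    metadata: dict[str, str] | None = None
    trace: dict[str, Any] | None = None


# Both validate identically; only the spelling of the types changed.
assert OldStyle(entities=["Alice"]).entities == NewStyle(entities=["Alice"]).entities
```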
@@ -23,17 +23,12 @@ class DispositionTraits(BaseModel):
      - literalism: 1=flexible interpretation, 5=literal interpretation (how strictly to interpret information)
      - empathy: 1=detached, 5=empathetic (how much to consider emotional context)
      """
+
      skepticism: int = Field(ge=1, le=5, description="How skeptical vs trusting (1=trusting, 5=skeptical)")
      literalism: int = Field(ge=1, le=5, description="How literally to interpret information (1=flexible, 5=literal)")
      empathy: int = Field(ge=1, le=5, description="How much to consider emotional context (1=detached, 5=empathetic)")

-     model_config = ConfigDict(json_schema_extra={
-         "example": {
-             "skepticism": 3,
-             "literalism": 3,
-             "empathy": 3
-         }
-     })
+     model_config = ConfigDict(json_schema_extra={"example": {"skepticism": 3, "literalism": 3, "empathy": 3}})


  class MemoryFact(BaseModel):
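The `ge=1`/`le=5` bounds on each trait are enforced at validation time. A minimal reconstruction (only the fields and constraints visible in this hunk) shows the behavior:

```python
from pydantic import BaseModel, Field, ValidationError


class DispositionTraits(BaseModel):
    # Each trait is an integer on a 1-5 scale, as in the diff above.
    skepticism: int = Field(ge=1, le=5)
    literalism: int = Field(ge=1, le=5)
    empathy: int = Field(ge=1, le=5)


print(DispositionTraits(skepticism=3, literalism=3, empathy=3))  # the schema example

try:
    DispositionTraits(skepticism=0, literalism=3, empathy=3)  # below the ge=1 bound
except ValidationError as exc:
    print(exc.errors()[0]["type"])  # greater_than_equal
```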
@@ -43,38 +38,44 @@ class MemoryFact(BaseModel):
      This represents a unit of information stored in the memory system,
      including both the content and metadata.
      """
-     model_config = ConfigDict(json_schema_extra={
-         "example": {
-             "id": "123e4567-e89b-12d3-a456-426614174000",
-             "text": "Alice works at Google on the AI team",
-             "fact_type": "world",
-             "entities": ["Alice", "Google"],
-             "context": "work info",
-             "occurred_start": "2024-01-15T10:30:00Z",
-             "occurred_end": "2024-01-15T10:30:00Z",
-             "mentioned_at": "2024-01-15T10:30:00Z",
-             "document_id": "session_abc123",
-             "metadata": {"source": "slack"},
-             "chunk_id": "bank123_session_abc123_0",
-             "activation": 0.95
+
+     model_config = ConfigDict(
+         json_schema_extra={
+             "example": {
+                 "id": "123e4567-e89b-12d3-a456-426614174000",
+                 "text": "Alice works at Google on the AI team",
+                 "fact_type": "world",
+                 "entities": ["Alice", "Google"],
+                 "context": "work info",
+                 "occurred_start": "2024-01-15T10:30:00Z",
+                 "occurred_end": "2024-01-15T10:30:00Z",
+                 "mentioned_at": "2024-01-15T10:30:00Z",
+                 "document_id": "session_abc123",
+                 "metadata": {"source": "slack"},
+                 "chunk_id": "bank123_session_abc123_0",
+                 "activation": 0.95,
+             }
          }
-     })
+     )

      id: str = Field(description="Unique identifier for the memory fact")
      text: str = Field(description="The actual text content of the memory")
      fact_type: str = Field(description="Type of fact: 'world', 'experience', 'opinion', or 'observation'")
-     entities: Optional[List[str]] = Field(None, description="Entity names mentioned in this fact")
-     context: Optional[str] = Field(None, description="Additional context for the memory")
-     occurred_start: Optional[str] = Field(None, description="ISO format date when the event started occurring")
-     occurred_end: Optional[str] = Field(None, description="ISO format date when the event ended occurring")
-     mentioned_at: Optional[str] = Field(None, description="ISO format date when the fact was mentioned/learned")
-     document_id: Optional[str] = Field(None, description="ID of the document this memory belongs to")
-     metadata: Optional[Dict[str, str]] = Field(None, description="User-defined metadata")
-     chunk_id: Optional[str] = Field(None, description="ID of the chunk this fact was extracted from (format: bank_id_document_id_chunk_index)")
+     entities: list[str] | None = Field(None, description="Entity names mentioned in this fact")
+     context: str | None = Field(None, description="Additional context for the memory")
+     occurred_start: str | None = Field(None, description="ISO format date when the event started occurring")
+     occurred_end: str | None = Field(None, description="ISO format date when the event ended occurring")
+     mentioned_at: str | None = Field(None, description="ISO format date when the fact was mentioned/learned")
+     document_id: str | None = Field(None, description="ID of the document this memory belongs to")
+     metadata: dict[str, str] | None = Field(None, description="User-defined metadata")
+     chunk_id: str | None = Field(
+         None, description="ID of the chunk this fact was extracted from (format: bank_id_document_id_chunk_index)"
+     )


  class ChunkInfo(BaseModel):
      """Information about a chunk."""
+
      chunk_text: str = Field(description="The raw chunk text")
      chunk_index: int = Field(description="Index of the chunk within the document")
      truncated: bool = Field(default=False, description="Whether the chunk was truncated due to token limits")
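Because every `X | None` field carries a `None` default, only `id`, `text`, and `fact_type` are required to construct a fact. A trimmed reconstruction (a subset of the fields shown above):

```python
from pydantic import BaseModel, Field


class MemoryFact(BaseModel):
    # Required fields; the Optional-turned-`| None` fields all default to None.
    id: str = Field(description="Unique identifier for the memory fact")
    text: str = Field(description="The actual text content of the memory")
    fact_type: str = Field(description="'world', 'experience', 'opinion', or 'observation'")
    entities: list[str] | None = Field(None, description="Entity names mentioned in this fact")
    chunk_id: str | None = Field(None, description="Format: bank_id_document_id_chunk_index")


fact = MemoryFact(
    id="123e4567-e89b-12d3-a456-426614174000",
    text="Alice works at Google on the AI team",
    fact_type="world",
)
print(fact.entities, fact.chunk_id)  # None None
```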
@@ -87,35 +88,33 @@ class RecallResult(BaseModel):
      Contains a list of matching memory facts and optional trace information
      for debugging and transparency.
      """
-     model_config = ConfigDict(json_schema_extra={
-         "example": {
-             "results": [
-                 {
-                     "id": "123e4567-e89b-12d3-a456-426614174000",
-                     "text": "Alice works at Google on the AI team",
-                     "fact_type": "world",
-                     "context": "work info",
-                     "occurred_start": "2024-01-15T10:30:00Z",
-                     "occurred_end": "2024-01-15T10:30:00Z",
-                     "activation": 0.95
-                 }
-             ],
-             "trace": {
-                 "query": "What did Alice say about machine learning?",
-                 "num_results": 1
+
+     model_config = ConfigDict(
+         json_schema_extra={
+             "example": {
+                 "results": [
+                     {
+                         "id": "123e4567-e89b-12d3-a456-426614174000",
+                         "text": "Alice works at Google on the AI team",
+                         "fact_type": "world",
+                         "context": "work info",
+                         "occurred_start": "2024-01-15T10:30:00Z",
+                         "occurred_end": "2024-01-15T10:30:00Z",
+                         "activation": 0.95,
+                     }
+                 ],
+                 "trace": {"query": "What did Alice say about machine learning?", "num_results": 1},
              }
          }
-     })
+     )

-     results: List[MemoryFact] = Field(description="List of memory facts matching the query")
-     trace: Optional[Dict[str, Any]] = Field(None, description="Trace information for debugging")
-     entities: Optional[Dict[str, "EntityState"]] = Field(
-         None,
-         description="Entity states for entities mentioned in results (keyed by canonical name)"
+     results: list[MemoryFact] = Field(description="List of memory facts matching the query")
+     trace: dict[str, Any] | None = Field(None, description="Trace information for debugging")
+     entities: dict[str, "EntityState"] | None = Field(
+         None, description="Entity states for entities mentioned in results (keyed by canonical name)"
      )
-     chunks: Optional[Dict[str, ChunkInfo]] = Field(
-         None,
-         description="Chunks for facts, keyed by '{document_id}_{chunk_index}'"
+     chunks: dict[str, ChunkInfo] | None = Field(
+         None, description="Chunks for facts, keyed by '{document_id}_{chunk_index}'"
      )

@@ -126,37 +125,35 @@ class ReflectResult(BaseModel):
      Contains the formulated answer, the facts it was based on (organized by type),
      and any new opinions that were formed during the reflection process.
      """
-     model_config = ConfigDict(json_schema_extra={
-         "example": {
-             "text": "Based on my knowledge, machine learning is being actively used in healthcare...",
-             "based_on": {
-                 "world": [
-                     {
-                         "id": "123e4567-e89b-12d3-a456-426614174000",
-                         "text": "Machine learning is used in medical diagnosis",
-                         "fact_type": "world",
-                         "context": "healthcare",
-                         "occurred_start": "2024-01-15T10:30:00Z",
-                         "occurred_end": "2024-01-15T10:30:00Z"
-                     }
-                 ],
-                 "experience": [],
-                 "opinion": []
-             },
-             "new_opinions": [
-                 "Machine learning has great potential in healthcare"
-             ]
+
+     model_config = ConfigDict(
+         json_schema_extra={
+             "example": {
+                 "text": "Based on my knowledge, machine learning is being actively used in healthcare...",
+                 "based_on": {
+                     "world": [
+                         {
+                             "id": "123e4567-e89b-12d3-a456-426614174000",
+                             "text": "Machine learning is used in medical diagnosis",
+                             "fact_type": "world",
+                             "context": "healthcare",
+                             "occurred_start": "2024-01-15T10:30:00Z",
+                             "occurred_end": "2024-01-15T10:30:00Z",
+                         }
+                     ],
+                     "experience": [],
+                     "opinion": [],
+                 },
+                 "new_opinions": ["Machine learning has great potential in healthcare"],
+             }
          }
-     })
+     )

      text: str = Field(description="The formulated answer text")
-     based_on: Dict[str, List[MemoryFact]] = Field(
+     based_on: dict[str, list[MemoryFact]] = Field(
          description="Facts used to formulate the answer, organized by type (world, experience, opinion)"
      )
-     new_opinions: List[str] = Field(
-         default_factory=list,
-         description="List of newly formed opinions during reflection"
-     )
+     new_opinions: list[str] = Field(default_factory=list, description="List of newly formed opinions during reflection")


  class Opinion(BaseModel):
@@ -166,12 +163,12 @@ class Opinion(BaseModel):
      Opinions represent the bank's formed perspectives on topics,
      with a confidence level indicating strength of belief.
      """
-     model_config = ConfigDict(json_schema_extra={
-         "example": {
-             "text": "Machine learning has great potential in healthcare",
-             "confidence": 0.85
+
+     model_config = ConfigDict(
+         json_schema_extra={
+             "example": {"text": "Machine learning has great potential in healthcare", "confidence": 0.85}
          }
-     })
+     )

      text: str = Field(description="The opinion text")
      confidence: float = Field(description="Confidence score between 0.0 and 1.0")
@@ -184,15 +181,15 @@ class EntityObservation(BaseModel):
      Observations are objective facts synthesized from multiple memory facts
      about an entity, without personality influence.
      """
-     model_config = ConfigDict(json_schema_extra={
-         "example": {
-             "text": "John is detail-oriented and works at Google",
-             "mentioned_at": "2024-01-15T10:30:00Z"
+
+     model_config = ConfigDict(
+         json_schema_extra={
+             "example": {"text": "John is detail-oriented and works at Google", "mentioned_at": "2024-01-15T10:30:00Z"}
          }
-     })
+     )

      text: str = Field(description="The observation text")
-     mentioned_at: Optional[str] = Field(None, description="ISO format date when this observation was created")
+     mentioned_at: str | None = Field(None, description="ISO format date when this observation was created")


  class EntityState(BaseModel):
@@ -201,20 +198,22 @@

      Contains observations synthesized from facts about the entity.
      """
-     model_config = ConfigDict(json_schema_extra={
-         "example": {
-             "entity_id": "123e4567-e89b-12d3-a456-426614174000",
-             "canonical_name": "John",
-             "observations": [
-                 {"text": "John is detail-oriented", "mentioned_at": "2024-01-15T10:30:00Z"},
-                 {"text": "John works at Google on the AI team", "mentioned_at": "2024-01-14T09:00:00Z"}
-             ]
+
+     model_config = ConfigDict(
+         json_schema_extra={
+             "example": {
+                 "entity_id": "123e4567-e89b-12d3-a456-426614174000",
+                 "canonical_name": "John",
+                 "observations": [
+                     {"text": "John is detail-oriented", "mentioned_at": "2024-01-15T10:30:00Z"},
+                     {"text": "John works at Google on the AI team", "mentioned_at": "2024-01-14T09:00:00Z"},
+                 ],
+             }
          }
-     })
+     )

      entity_id: str = Field(description="Unique identifier for the entity")
      canonical_name: str = Field(description="Canonical name of the entity")
-     observations: List[EntityObservation] = Field(
-         default_factory=list,
-         description="List of observations about this entity"
+     observations: list[EntityObservation] = Field(
+         default_factory=list, description="List of observations about this entity"
      )
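A minimal reconstruction (fields from this diff only) showing how the nested `EntityObservation` list validates from plain dicts:

```python
from pydantic import BaseModel, Field


class EntityObservation(BaseModel):
    text: str
    mentioned_at: str | None = None


class EntityState(BaseModel):
    entity_id: str
    canonical_name: str
    observations: list[EntityObservation] = Field(default_factory=list)


# Pydantic coerces the plain dicts into EntityObservation instances.
state = EntityState(
    entity_id="123e4567-e89b-12d3-a456-426614174000",
    canonical_name="John",
    observations=[{"text": "John is detail-oriented", "mentioned_at": "2024-01-15T10:30:00Z"}],
)
print(type(state.observations[0]))  # <class '__main__.EntityObservation'>
```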
@@ -12,23 +12,16 @@ This package contains modular components for the retain operation:
  - fact_storage: Handle fact insertion into database
  """

- from .types import (
-     RetainContent,
-     ExtractedFact,
-     ProcessedFact,
-     ChunkMetadata,
-     EntityRef,
-     CausalRelation,
-     RetainBatch
+ from . import (
+     chunk_storage,
+     deduplication,
+     embedding_processing,
+     entity_processing,
+     fact_extraction,
+     fact_storage,
+     link_creation,
  )
-
- from . import fact_extraction
- from . import embedding_processing
- from . import deduplication
- from . import entity_processing
- from . import link_creation
- from . import chunk_storage
- from . import fact_storage
+ from .types import CausalRelation, ChunkMetadata, EntityRef, ExtractedFact, ProcessedFact, RetainBatch, RetainContent

  __all__ = [
      # Types
@@ -5,8 +5,10 @@ bank profile utilities for disposition and background management.
  import json
  import logging
  import re
- from typing import Dict, Optional, TypedDict
+ from typing import TypedDict
+
  from pydantic import BaseModel, Field
+
  from ..db_utils import acquire_with_retry
  from ..response_models import DispositionTraits

@@ -21,6 +23,7 @@ DEFAULT_DISPOSITION = {

  class BankProfile(TypedDict):
      """Type for bank profile data."""
+
      name: str
      disposition: DispositionTraits
      background: str
@@ -28,6 +31,7 @@ class BankProfile(TypedDict):

  class BackgroundMergeResponse(BaseModel):
      """LLM response for background merge with disposition inference."""
+
      background: str = Field(description="Merged background in first person perspective")
      disposition: DispositionTraits = Field(description="Inferred disposition traits (skepticism, literalism, empathy)")

@@ -51,7 +55,7 @@ async def get_bank_profile(pool, bank_id: str) -> BankProfile:
              SELECT name, disposition, background
              FROM banks WHERE bank_id = $1
              """,
-             bank_id
+             bank_id,
          )

          if row:
@@ -61,9 +65,7 @@
                  disposition_data = json.loads(disposition_data)

              return BankProfile(
-                 name=row["name"],
-                 disposition=DispositionTraits(**disposition_data),
-                 background=row["background"]
+                 name=row["name"], disposition=DispositionTraits(**disposition_data), background=row["background"]
              )

      # Bank doesn't exist, create with defaults
@@ -76,21 +78,13 @@ async def get_bank_profile(pool, bank_id: str) -> BankProfile:
          bank_id,
          bank_id, # Default name is the bank_id
          json.dumps(DEFAULT_DISPOSITION),
-         ""
+         "",
      )

-     return BankProfile(
-         name=bank_id,
-         disposition=DispositionTraits(**DEFAULT_DISPOSITION),
-         background=""
-     )
+     return BankProfile(name=bank_id, disposition=DispositionTraits(**DEFAULT_DISPOSITION), background="")


- async def update_bank_disposition(
-     pool,
-     bank_id: str,
-     disposition: Dict[str, int]
- ) -> None:
+ async def update_bank_disposition(pool, bank_id: str, disposition: dict[str, int]) -> None:
      """
      Update bank disposition traits.

@@ -111,17 +105,11 @@ async def update_bank_disposition(
              WHERE bank_id = $1
              """,
              bank_id,
-             json.dumps(disposition)
+             json.dumps(disposition),
          )


- async def merge_bank_background(
-     pool,
-     llm_config,
-     bank_id: str,
-     new_info: str,
-     update_disposition: bool = True
- ) -> dict:
+ async def merge_bank_background(pool, llm_config, bank_id: str, new_info: str, update_disposition: bool = True) -> dict:
      """
      Merge new background information with existing background using LLM.
      Normalizes to first person ("I") and resolves conflicts.
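For orientation, the collapsed one-line signature is called with a connection pool and an LLM config that the engine wires up elsewhere; neither type is shown in this diff. A hypothetical call site:

```python
from hindsight_api.engine.retain.bank_utils import merge_bank_background


async def refresh_profile(pool, llm_config) -> None:
    # Hypothetical usage; `pool` and `llm_config` come from the engine's own setup.
    result = await merge_bank_background(
        pool,
        llm_config,
        bank_id="bank123",
        new_info="I now work on the AI team at Google.",
        update_disposition=True,  # also re-infer the three disposition traits
    )
    print(result["background"])       # merged, first-person background
    print(result.get("disposition"))  # present only when disposition was updated
```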
@@ -142,12 +130,7 @@
      current_background = profile["background"]

      # Use LLM to merge backgrounds and optionally infer disposition
-     result = await _llm_merge_background(
-         llm_config,
-         current_background,
-         new_info,
-         infer_disposition=update_disposition
-     )
+     result = await _llm_merge_background(llm_config, current_background, new_info, infer_disposition=update_disposition)

      merged_background = result["background"]
      inferred_disposition = result.get("disposition")
@@ -166,7 +149,7 @@
              """,
              bank_id,
              merged_background,
-             json.dumps(inferred_disposition)
+             json.dumps(inferred_disposition),
          )
      else:
          # Update only background
@@ -178,7 +161,7 @@
              WHERE bank_id = $1
              """,
              bank_id,
-             merged_background
+             merged_background,
          )

      response = {"background": merged_background}
@@ -188,12 +171,7 @@
      return response


- async def _llm_merge_background(
-     llm_config,
-     current: str,
-     new_info: str,
-     infer_disposition: bool = False
- ) -> dict:
+ async def _llm_merge_background(llm_config, current: str, new_info: str, infer_disposition: bool = False) -> dict:
      """
      Use LLM to intelligently merge background information.
      Optionally infer Big Five disposition traits from the merged background.
@@ -273,25 +251,19 @@ Merged background:"""
                  response_format=BackgroundMergeResponse,
                  scope="bank_background",
                  temperature=0.3,
-                 max_completion_tokens=8192
+                 max_completion_tokens=8192,
              )
              logger.info(f"Successfully got structured response: background={parsed.background[:100]}")

              # Convert Pydantic model to dict format
-             return {
-                 "background": parsed.background,
-                 "disposition": parsed.disposition.model_dump()
-             }
+             return {"background": parsed.background, "disposition": parsed.disposition.model_dump()}
          except Exception as e:
              logger.warning(f"Structured output failed, falling back to manual parsing: {e}")
              # Fall through to manual parsing below

      # Manual parsing fallback or non-disposition merge
      content = await llm_config.call(
-         messages=messages,
-         scope="bank_background",
-         temperature=0.3,
-         max_completion_tokens=8192
+         messages=messages, scope="bank_background", temperature=0.3, max_completion_tokens=8192
      )

      logger.info(f"LLM response for background merge (first 500 chars): {content[:500]}")
@@ -310,7 +282,7 @@
      # Method 2: Extract from markdown code blocks
      if result is None:
          # Remove markdown code blocks
-         code_block_match = re.search(r'```(?:json)?\s*(\{.*?\})\s*```', content, re.DOTALL)
+         code_block_match = re.search(r"```(?:json)?\s*(\{.*?\})\s*```", content, re.DOTALL)
          if code_block_match:
              try:
                  result = json.loads(code_block_match.group(1))
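Only the quote style changes here, but the pattern itself is worth seeing in action: it pulls a JSON object out of a fenced code block in the LLM reply. A self-contained sketch using the same regex:

```python
import json
import re

# A typical LLM reply that wraps its JSON in a fenced block.
content = 'Sure, here it is:\n```json\n{"background": "I work at Google.", "disposition": {"skepticism": 3}}\n```'

match = re.search(r"```(?:json)?\s*(\{.*?\})\s*```", content, re.DOTALL)
if match:
    result = json.loads(match.group(1))
    print(result["background"])  # I work at Google.
```

The lazy `(\{.*?\})` still captures the whole object, nested braces included, because the match must end at the closing fence.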
@@ -321,7 +293,9 @@ Merged background:"""
      # Method 3: Find nested JSON structure
      if result is None:
          # Look for JSON object with nested structure
-         json_match = re.search(r'\{[^{}]*"background"[^{}]*"disposition"[^{}]*\{[^{}]*\}[^{}]*\}', content, re.DOTALL)
+         json_match = re.search(
+             r'\{[^{}]*"background"[^{}]*"disposition"[^{}]*\{[^{}]*\}[^{}]*\}', content, re.DOTALL
+         )
          if json_match:
              try:
                  result = json.loads(json_match.group())
@@ -335,7 +309,7 @@
          # Fallback: use new_info as background with default disposition
          return {
              "background": new_info if new_info else current if current else "",
-             "disposition": DEFAULT_DISPOSITION.copy()
+             "disposition": DEFAULT_DISPOSITION.copy(),
          }

      # Validate disposition values
@@ -401,13 +375,15 @@ async def list_banks(pool) -> list:
          if isinstance(disposition_data, str):
              disposition_data = json.loads(disposition_data)

-         result.append({
-             "bank_id": row["bank_id"],
-             "name": row["name"],
-             "disposition": disposition_data,
-             "background": row["background"],
-             "created_at": row["created_at"].isoformat() if row["created_at"] else None,
-             "updated_at": row["updated_at"].isoformat() if row["updated_at"] else None,
-         })
+         result.append(
+             {
+                 "bank_id": row["bank_id"],
+                 "name": row["name"],
+                 "disposition": disposition_data,
+                 "background": row["background"],
+                 "created_at": row["created_at"].isoformat() if row["created_at"] else None,
+                 "updated_at": row["updated_at"].isoformat() if row["updated_at"] else None,
+             }
+         )

      return result
@@ -3,20 +3,15 @@ Chunk storage for retain pipeline.

  Handles storage of document chunks in the database.
  """
+
  import logging
- from typing import List, Dict, Optional

  from .types import ChunkMetadata

  logger = logging.getLogger(__name__)


- async def store_chunks_batch(
-     conn,
-     bank_id: str,
-     document_id: str,
-     chunks: List[ChunkMetadata]
- ) -> Dict[int, str]:
+ async def store_chunks_batch(conn, bank_id: str, document_id: str, chunks: list[ChunkMetadata]) -> dict[int, str]:
      """
      Store document chunks in the database.

@@ -55,16 +50,13 @@
          [document_id] * len(chunk_texts),
          [bank_id] * len(chunk_texts),
          chunk_texts,
-         chunk_indices
+         chunk_indices,
      )

      return chunk_id_map


- def map_facts_to_chunks(
-     facts_chunk_indices: List[int],
-     chunk_id_map: Dict[int, str]
- ) -> List[Optional[str]]:
+ def map_facts_to_chunks(facts_chunk_indices: list[int], chunk_id_map: dict[int, str]) -> list[str | None]:
      """
      Map fact chunk indices to chunk IDs.

@@ -3,22 +3,17 @@ Deduplication logic for retain pipeline.

  Checks for duplicate facts using semantic similarity and temporal proximity.
  """
+
  import logging
- from datetime import datetime
- from typing import List
  from collections import defaultdict
+ from datetime import UTC

  from .types import ProcessedFact

  logger = logging.getLogger(__name__)


- async def check_duplicates_batch(
-     conn,
-     bank_id: str,
-     facts: List[ProcessedFact],
-     duplicate_checker_fn
- ) -> List[bool]:
+ async def check_duplicates_batch(conn, bank_id: str, facts: list[ProcessedFact], duplicate_checker_fn) -> list[bool]:
      """
      Check which facts are duplicates using batched time-window queries.

@@ -47,16 +42,12 @@ async def check_duplicates_batch(

          # Defensive: if both are None (shouldn't happen), use now()
          if fact_date is None:
-             from datetime import datetime, timezone
-             fact_date = datetime.now(timezone.utc)
+             from datetime import datetime
+
+             fact_date = datetime.now(UTC)

          # Round to 12-hour bucket to group similar times
-         bucket_key = fact_date.replace(
-             hour=(fact_date.hour // 12) * 12,
-             minute=0,
-             second=0,
-             microsecond=0
-         )
+         bucket_key = fact_date.replace(hour=(fact_date.hour // 12) * 12, minute=0, second=0, microsecond=0)
          time_buckets[bucket_key].append((idx, fact))

      # Process each bucket in batch
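Two things happen in this hunk: the code switches to `datetime.UTC` (the alias for `timezone.utc` added in Python 3.11), and the 12-hour bucketing collapses to one line. The rounding is easy to check in isolation:

```python
from collections import defaultdict
from datetime import UTC, datetime

time_buckets = defaultdict(list)

fact_date = datetime(2024, 1, 15, 17, 42, 9, tzinfo=UTC)

# Same expression as the diff: floor the hour to 0 or 12, zero the rest.
bucket_key = fact_date.replace(hour=(fact_date.hour // 12) * 12, minute=0, second=0, microsecond=0)
print(bucket_key)  # 2024-01-15 12:00:00+00:00

time_buckets[bucket_key].append((0, "fact placeholder"))
```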
@@ -68,14 +59,7 @@
          embeddings = [item[1].embedding for item in bucket_items]

          # Check duplicates for this time bucket
-         dup_flags = await duplicate_checker_fn(
-             conn,
-             bank_id,
-             texts,
-             embeddings,
-             bucket_date,
-             time_window_hours=24
-         )
+         dup_flags = await duplicate_checker_fn(conn, bank_id, texts, embeddings, bucket_date, time_window_hours=24)

          # Map results back to original indices
          for idx, is_dup in zip(indices, dup_flags):
@@ -84,10 +68,7 @@
      return all_is_duplicate


- def filter_duplicates(
-     facts: List[ProcessedFact],
-     is_duplicate_flags: List[bool]
- ) -> List[ProcessedFact]:
+ def filter_duplicates(facts: list[ProcessedFact], is_duplicate_flags: list[bool]) -> list[ProcessedFact]:
      """
      Filter out duplicate facts based on duplicate flags.

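The `filter_duplicates` body is also outside this hunk; the signature implies a parallel-list filter, which would look like this (a sketch, with a stand-in for `ProcessedFact`):

```python
from dataclasses import dataclass


@dataclass
class ProcessedFact:  # stand-in; the real type lives in hindsight_api/engine/retain/types.py
    text: str


def filter_duplicates(facts: list[ProcessedFact], is_duplicate_flags: list[bool]) -> list[ProcessedFact]:
    # Keep only facts whose parallel flag is False.
    return [fact for fact, is_dup in zip(facts, is_duplicate_flags) if not is_dup]


facts = [ProcessedFact("a"), ProcessedFact("b"), ProcessedFact("c")]
print(filter_duplicates(facts, [False, True, False]))  # a and c survive
```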