msg-summarizer 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,76 @@
1
+ """Resolve iMessage handles to real names via the macOS AddressBook (Contacts).
2
+
3
+ Reads the AddressBook SQLite stores directly (under ~/Library/Application
4
+ Support/AddressBook), which existing Full Disk Access already covers — no extra
5
+ permission prompt. Fails soft: returns {} if Contacts can't be read, so the
6
+ rest of the app still works with raw handles.
7
+ """
8
+
9
+ import glob
10
+ import pathlib
11
+ import re
12
+ import sqlite3
13
+
14
+ _AB_DIR = pathlib.Path.home() / "Library" / "Application Support" / "AddressBook"
15
+
16
+
17
def normalize_handle(handle: str) -> str:
    """Turn a raw handle into the key used to match it against contacts.

    Handles containing "@" are treated as emails: trimmed and lowercased.
    Anything else has its non-digit characters removed; when at least 10
    digits remain only the last 10 are kept, otherwise the digits are returned
    as-is. Input with no digits at all falls back to the trimmed, lowercased
    text. A falsy *handle* (``None``/``""``) yields ``""``.
    """
    cleaned = (handle or "").strip().lower()
    if "@" in cleaned:
        return cleaned

    only_digits = re.sub(r"\D", "", cleaned)
    if len(only_digits) >= 10:
        # Keep the trailing 10 digits so a country-code prefix doesn't matter.
        return only_digits[-10:]
    if only_digits:
        return only_digits
    return cleaned
24
+
25
+
26
def _addressbook_paths() -> list[str]:
    """Return the AddressBook database files to read, in a deterministic order.

    Per-source stores (``Sources/<id>/AddressBook-v22.abcddb``) come first,
    sorted by path, followed by the top-level store when it exists.

    The order matters to callers that derive identifiers from a path's position
    in this list, so it must not depend on filesystem enumeration order.
    Never raises for an unreadable directory: unreadable locations simply
    contribute no paths.
    """
    # Escape the fixed prefix so glob metacharacters in the home directory
    # (e.g. "[") are matched literally; only the "*" component is a wildcard.
    sources_root = glob.escape(str(_AB_DIR / "Sources"))
    pattern = str(pathlib.Path(sources_root) / "*" / "AddressBook-v22.abcddb")
    paths = sorted(glob.glob(pattern))

    top = _AB_DIR / "AddressBook-v22.abcddb"
    try:
        has_top = top.exists()
    except OSError:
        # Path.exists() can propagate permission errors; treat as "not readable".
        has_top = False
    if has_top:
        paths.append(str(top))
    return paths
32
+
33
+
34
+ def _record_name(first, last, org) -> str:
35
+ parts = [p for p in (first, last) if p]
36
+ if parts:
37
+ return " ".join(parts)
38
+ return org or ""
39
+
40
+
41
def _harvest(conn: sqlite3.Connection, mapping: dict, source: str) -> None:
    """Add the phone numbers and email addresses found via *conn* to *mapping*.

    For each phone/email row joined to its owning record, stores
    ``normalized handle -> ("ab:<source>:<record pk>", display name)``.
    Existing keys in *mapping* are never overwritten (first writer wins).
    Rows with an empty value or without any usable name are skipped, and a
    table whose query fails is skipped entirely.
    """
    value_tables = (
        ("ZABCDPHONENUMBER", "ZFULLNUMBER"),
        ("ZABCDEMAILADDRESS", "ZADDRESS"),
    )
    for table, col in value_tables:
        sql = (
            f"SELECT x.{col}, r.Z_PK, r.ZFIRSTNAME, r.ZLASTNAME, r.ZORGANIZATION "
            f"FROM {table} x JOIN ZABCDRECORD r ON x.ZOWNER = r.Z_PK"
        )
        try:
            records = conn.execute(sql).fetchall()
        except sqlite3.Error:
            # Schema differs in this source; move on to the next table.
            continue

        for value, pk, first, last, org in records:
            display = _record_name(first, last, org) if value else ""
            if display:
                mapping.setdefault(
                    normalize_handle(value), (f"ab:{source}:{pk}", display)
                )
58
+
59
+
60
def load_contacts() -> dict[str, tuple[str, str]]:
    """Build a ``normalized handle -> (contact_key, display_name)`` lookup.

    Every database returned by ``_addressbook_paths()`` is opened read-only and
    harvested in turn. The position of a database in that list is passed to
    ``_harvest`` as the source id, and the contact_key is derived from the
    source id plus the record's primary key, so a phone and an email on the
    same card share one key. Databases that cannot be opened are skipped,
    which leaves the result empty when none are readable.
    """
    handle_map: dict[str, tuple[str, str]] = {}
    for index, db_file in enumerate(_addressbook_paths()):
        uri = f"file:{db_file}?mode=ro"
        try:
            db = sqlite3.connect(uri, uri=True)
        except sqlite3.OperationalError:
            continue
        try:
            _harvest(db, handle_map, source=str(index))
        finally:
            db.close()
    return handle_map
msg_summarizer/db.py ADDED
@@ -0,0 +1,401 @@
1
+ import datetime
2
+ import pathlib
3
+ import re
4
+ import sqlite3
5
+
6
+ from typedstream.stream import TypedStreamReader
7
+
8
+ from .contacts import load_contacts, normalize_handle
9
+ from .models import Chat, Conversation, Message, Person
10
+
11
+ DB_PATH = pathlib.Path.home() / "Library" / "Messages" / "chat.db"
12
+
13
+ # Seconds between Unix epoch (1970-01-01) and Apple Core Data epoch (2001-01-01)
14
+ _APPLE_EPOCH_OFFSET = 978307200
15
+
16
+ # Base query; date-bound clauses and ORDER BY are appended by _fetch().
17
+ # Messages on macOS Ventura+ usually leave `text` NULL and store the body in
18
+ # `attributedBody`, so we select both and resolve in _rows_to_messages().
19
+ _MESSAGES_BASE = """
20
+ SELECT
21
+ m.ROWID AS message_rowid,
22
+ m.date AS apple_date,
23
+ h.id AS handle_id,
24
+ m.is_from_me AS is_from_me,
25
+ m.text AS text,
26
+ m.attributedBody AS attributed_body
27
+ FROM message m
28
+ LEFT JOIN handle h ON m.handle_id = h.ROWID
29
+ JOIN chat_message_join cmj ON m.ROWID = cmj.message_id
30
+ JOIN chat c ON cmj.chat_id = c.ROWID
31
+ WHERE
32
+ (
33
+ LOWER(h.id) LIKE :contact_pattern
34
+ OR LOWER(c.display_name) LIKE :contact_pattern
35
+ OR LOWER(c.chat_identifier) LIKE :contact_pattern
36
+ )
37
+ AND (
38
+ (m.text IS NOT NULL AND m.text != '')
39
+ OR m.attributedBody IS NOT NULL
40
+ )
41
+ """
42
+
43
+
44
def _apple_ts(dt: datetime.datetime) -> int:
    """Express *dt* as integer nanoseconds since the Apple (2001-01-01) epoch.

    Uses ``dt.timestamp()``, so a naive datetime is interpreted the way
    ``datetime.timestamp()`` interprets it (as local time).
    """
    seconds_since_apple_epoch = dt.timestamp() - _APPLE_EPOCH_OFFSET
    return int(seconds_since_apple_epoch * 1_000_000_000)
47
+
48
+
49
def _from_apple_ts(apple_ts: int) -> datetime.datetime:
    """Convert Apple-epoch nanoseconds into a timezone-aware UTC datetime."""
    utc = datetime.timezone.utc
    unix_seconds = (apple_ts / 1_000_000_000) + _APPLE_EPOCH_OFFSET
    return datetime.datetime.fromtimestamp(unix_seconds, tz=utc)
52
+
53
+
54
+ def _connect(db_path: pathlib.Path) -> sqlite3.Connection:
55
+ """Open *db_path* read-only, or raise a helpful PermissionError."""
56
+ try:
57
+ return sqlite3.connect(f"file:{db_path}?mode=ro", uri=True)
58
+ except sqlite3.OperationalError as exc:
59
+ raise PermissionError(
60
+ f"Cannot open {db_path}.\n"
61
+ "Grant Full Disk Access to your terminal app in:\n"
62
+ " System Settings > Privacy & Security > Full Disk Access"
63
+ ) from exc
64
+
65
+
66
+ def _decode_attributed_body(blob: bytes | None) -> str | None:
67
+ """Extract the message text from a Messages `attributedBody` blob.
68
+
69
+ macOS Ventura+ leaves the `text` column NULL and stores the body as a
70
+ serialized NSAttributedString (Apple's "typedstream" format). The message
71
+ text is the first string payload in the stream, ahead of the attribute-run
72
+ metadata (e.g. ``__kIMMessagePartAttributeName``).
73
+ """
74
+ if not blob:
75
+ return None
76
+ try:
77
+ for event in TypedStreamReader.from_data(blob):
78
+ if isinstance(event, bytes):
79
+ return event.decode("utf-8", errors="replace")
80
+ if isinstance(event, str):
81
+ return event
82
+ except Exception:
83
+ return None
84
+ return None
85
+
86
+
87
def _rows_to_messages(rows: list[tuple]) -> list[Message]:
    """Convert raw query rows into ``Message`` objects.

    Each row is ``(rowid, apple_date, handle_id, is_from_me, text,
    attributed_body)``. The body comes from ``text`` when present, otherwise
    from decoding ``attributed_body``; rows whose body is missing or only
    whitespace are dropped. The sender is ``"me"`` for outgoing messages, else
    the handle id, else ``"them"``.
    """
    result: list[Message] = []
    for rowid, apple_date, handle_id, is_from_me, text, attributed_body in rows:
        body = text or _decode_attributed_body(attributed_body)
        body = body.strip() if body else ""
        if not body:
            continue
        sender = "me" if is_from_me else (handle_id or "them")
        message = Message(
            timestamp=_from_apple_ts(apple_date),
            sender=sender,
            text=body,
            is_from_me=bool(is_from_me),
            rowid=rowid,
        )
        result.append(message)
    return result
103
+
104
+
105
+ _LIST_CONTACTS_QUERY = """
106
+ SELECT
107
+ h.id AS handle_id,
108
+ COALESCE(c.display_name, '') AS display_name,
109
+ COUNT(DISTINCT m.ROWID) AS msg_count
110
+ FROM handle h
111
+ JOIN chat_handle_join chj ON h.ROWID = chj.handle_id
112
+ JOIN chat c ON chj.chat_id = c.ROWID
113
+ LEFT JOIN chat_message_join cmj ON c.ROWID = cmj.chat_id
114
+ LEFT JOIN message m ON cmj.message_id = m.ROWID
115
+ GROUP BY h.id, c.display_name
116
+ ORDER BY msg_count DESC
117
+ LIMIT :limit
118
+ """
119
+
120
+
121
def list_contacts(
    db_path: pathlib.Path = DB_PATH,
    limit: int = 50,
) -> list[tuple[str, str, int]]:
    """Return (handle_id, display_name, msg_count) for contacts in the DB.

    At most *limit* rows are returned, as ordered by ``_LIST_CONTACTS_QUERY``.

    Raises:
        PermissionError: if the database cannot be opened (from ``_connect``).
    """
    conn = _connect(db_path)
    try:
        rows = conn.execute(_LIST_CONTACTS_QUERY, {"limit": limit}).fetchall()
    finally:
        # sqlite3's `with conn:` only scopes a transaction; close explicitly
        # so the file handle is not leaked.
        conn.close()
    return [tuple(row) for row in rows]
131
+
132
+
133
def fetch_messages(
    contact: str,
    db_path: pathlib.Path = DB_PATH,
    start: datetime.datetime | None = None,
    end: datetime.datetime | None = None,
) -> Conversation:
    """Fetch messages matching *contact*, optionally bounded by *start*/*end* (UTC).

    *contact* is matched case-insensitively as a substring of the handle id,
    the chat display name, or the chat identifier. With no bounds, returns the
    full message history. Results are ordered oldest first.

    Raises:
        PermissionError: if the database cannot be opened (from ``_connect``).
    """
    query = _MESSAGES_BASE
    params: dict[str, object] = {"contact_pattern": f"%{contact.lower()}%"}
    if start is not None:
        query += " AND m.date >= :start_ts\n"
        params["start_ts"] = _apple_ts(start)
    if end is not None:
        query += " AND m.date <= :end_ts\n"
        params["end_ts"] = _apple_ts(end)
    query += "ORDER BY m.date ASC"

    conn = _connect(db_path)
    try:
        rows = conn.execute(query, params).fetchall()
    finally:
        # `with conn:` would only manage a transaction, not close the handle.
        conn.close()

    return Conversation(contact=contact, messages=_rows_to_messages(rows))
158
+
159
+
160
+ # --- Chat- and person-oriented model -------------------------------------
161
+
162
+ _CHATS_QUERY = """
163
+ SELECT
164
+ c.ROWID AS chat_id,
165
+ c.chat_identifier AS identifier,
166
+ COALESCE(c.display_name, '') AS display_name,
167
+ COUNT(DISTINCT cmj.message_id) AS msg_count,
168
+ MAX(m.date) AS last_date
169
+ FROM chat c
170
+ LEFT JOIN chat_message_join cmj ON c.ROWID = cmj.chat_id
171
+ LEFT JOIN message m ON cmj.message_id = m.ROWID
172
+ GROUP BY c.ROWID
173
+ """
174
+
175
+ _CHAT_PARTICIPANTS_QUERY = """
176
+ SELECT chj.chat_id, h.id
177
+ FROM chat_handle_join chj
178
+ JOIN handle h ON chj.handle_id = h.ROWID
179
+ """
180
+
181
+ _EPOCH_MIN = datetime.datetime.min.replace(tzinfo=datetime.timezone.utc)
182
+
183
+ # How many resolved names to show before collapsing to "& N more" in a group name.
184
+ _GROUP_NAME_MAX = 4
185
+
186
+
187
def _load_raw_chats(db_path: pathlib.Path) -> tuple[list[tuple], dict[int, list[str]]]:
    """Load raw chat rows + a chat_id -> [handle ids] map. Shared by list_chats/people.

    Returns:
        ``(chat_rows, participants)`` where *chat_rows* are the rows of
        ``_CHATS_QUERY`` and *participants* groups the handle ids from
        ``_CHAT_PARTICIPANTS_QUERY`` by chat id, in query order.

    Raises:
        PermissionError: if the database cannot be opened (from ``_connect``).
    """
    conn = _connect(db_path)
    try:
        chat_rows = conn.execute(_CHATS_QUERY).fetchall()
        part_rows = conn.execute(_CHAT_PARTICIPANTS_QUERY).fetchall()
    finally:
        # Close explicitly: sqlite3's connection context manager does not.
        conn.close()

    participants: dict[int, list[str]] = {}
    for chat_id, handle_id in part_rows:
        participants.setdefault(chat_id, []).append(handle_id)
    return chat_rows, participants
197
+
198
+
199
def _build_people(
    chat_rows: list[tuple],
    participants: dict[int, list[str]],
    contacts: dict[str, tuple[str, str]],
) -> list[Person]:
    """Derive Person objects from raw chats, merging handles via Contacts.

    Each participant handle is normalized and looked up in *contacts*; a hit
    supplies the person's key and name, a miss falls back to the normalized
    handle as key and the raw handle as name. Every chat is credited to each
    distinct person in it exactly once (its id goes to ``group_chat_ids`` when
    the chat has more than one handle, else ``dm_chat_ids``), adding the chat's
    message count and advancing ``last_date``.

    Returns:
        People sorted by most recent activity first; people without any dated
        chat sort last.
    """
    people: dict[str, Person] = {}
    for chat_id, _identifier, _display_name, msg_count, last_date in chat_rows:
        handles = participants.get(chat_id, [])
        is_group = len(handles) > 1
        last = _from_apple_ts(last_date) if last_date else None
        # Two handles in one chat can resolve to the same person (e.g. their
        # phone and email); credit the chat to that person only once.
        credited: set[str] = set()
        for handle_id in handles:
            norm = normalize_handle(handle_id)
            key, name = contacts.get(norm) or (norm, handle_id)
            person = people.get(key)
            if person is None:
                person = Person(key=key, name=name)
                people[key] = person
            if handle_id not in person.handles:
                person.handles.append(handle_id)
            if key in credited:
                continue
            credited.add(key)
            if is_group:
                person.group_chat_ids.append(chat_id)
            else:
                person.dm_chat_ids.append(chat_id)
            person.message_count += msg_count or 0
            if last and (person.last_date is None or last > person.last_date):
                person.last_date = last
    return sorted(
        people.values(), key=lambda p: p.last_date or _EPOCH_MIN, reverse=True
    )
230
+
231
+
232
+ def _format_group_name(names: list[str]) -> str:
233
+ if not names:
234
+ return "Group"
235
+ if len(names) <= _GROUP_NAME_MAX:
236
+ return ", ".join(names)
237
+ return ", ".join(names[:_GROUP_NAME_MAX]) + f" & {len(names) - _GROUP_NAME_MAX} more"
238
+
239
+
240
def _build_chats(
    chat_rows: list[tuple],
    participants: dict[int, list[str]],
    handle_to_person: dict[str, Person],
) -> list[Chat]:
    """Turn raw chat rows into ``Chat`` objects, newest activity first.

    A chat counts as a group when it has more than one participant handle.
    The name is chosen in this order: the chat's own display name; for groups,
    the sorted participant names (or raw handles when unresolved) formatted by
    ``_format_group_name``; for a 1:1, the matched person's name or the raw
    handle; and finally the chat identifier or ``"Unknown"`` when the chat has
    no participants.
    """
    result: list[Chat] = []
    for chat_id, identifier, display_name, msg_count, last_date in chat_rows:
        members = participants.get(chat_id, [])
        is_group = len(members) > 1

        if display_name:
            name = display_name
        elif is_group:
            labels = [
                handle_to_person[h].name if h in handle_to_person else h
                for h in members
            ]
            labels.sort()
            name = _format_group_name(labels)
        elif members:
            only = members[0]
            person = handle_to_person.get(only)
            name = person.name if person else only
        else:
            name = identifier or "Unknown"

        when = _from_apple_ts(last_date) if last_date else None
        result.append(
            Chat(
                chat_id=chat_id,
                identifier=identifier or "",
                display_name=display_name or "",
                name=name,
                is_group=is_group,
                participants=members,
                message_count=msg_count or 0,
                last_date=when,
            )
        )
    result.sort(key=lambda chat: chat.last_date or _EPOCH_MIN, reverse=True)
    return result
275
+
276
+
277
def list_chats(db_path: pathlib.Path = DB_PATH) -> list[Chat]:
    """List every chat thread (1:1 and group), most recently active first.

    Chat names are resolved through Contacts: a 1:1 takes the matched
    contact's name, and a group without its own display name takes the
    alphabetized list of its participants' names.
    """
    chat_rows, participants = _load_raw_chats(db_path)
    people = _build_people(chat_rows, participants, load_contacts())
    # Reverse index: raw handle -> the Person it was merged into.
    handle_to_person = {}
    for person in people:
        for handle in person.handles:
            handle_to_person[handle] = person
    return _build_chats(chat_rows, participants, handle_to_person)
288
+
289
+
290
def _fetch_by_chats(
    chat_ids: list[int],
    label: str,
    db_path: pathlib.Path,
    start: datetime.datetime | None,
    end: datetime.datetime | None,
) -> Conversation:
    """Fetch the messages of the given chats as one Conversation labelled *label*.

    Messages are ordered oldest first and optionally bounded (inclusive) by
    *start*/*end*. An empty *chat_ids* returns an empty Conversation without
    opening the database.

    Raises:
        PermissionError: if the database cannot be opened (from ``_connect``).
    """
    if not chat_ids:
        return Conversation(contact=label, messages=[])

    # Only "?" markers are interpolated into the SQL; the ids are bound values.
    placeholders = ",".join("?" for _ in chat_ids)
    query = f"""
    SELECT m.ROWID, m.date, h.id, m.is_from_me, m.text, m.attributedBody
    FROM message m
    LEFT JOIN handle h ON m.handle_id = h.ROWID
    JOIN chat_message_join cmj ON m.ROWID = cmj.message_id
    WHERE cmj.chat_id IN ({placeholders})
    AND ((m.text IS NOT NULL AND m.text != '') OR m.attributedBody IS NOT NULL)
    """
    params: list[object] = list(chat_ids)
    if start is not None:
        query += " AND m.date >= ?"
        params.append(_apple_ts(start))
    if end is not None:
        query += " AND m.date <= ?"
        params.append(_apple_ts(end))
    query += " ORDER BY m.date ASC"

    conn = _connect(db_path)
    try:
        rows = conn.execute(query, params).fetchall()
    finally:
        # `with conn:` would leave the connection open; close it explicitly.
        conn.close()
    return Conversation(contact=label, messages=_rows_to_messages(rows))
322
+
323
+
324
def fetch_chat_messages(
    chat_id: int,
    label: str | None = None,
    db_path: pathlib.Path = DB_PATH,
    start: datetime.datetime | None = None,
    end: datetime.datetime | None = None,
) -> Conversation:
    """Fetch one chat thread's messages, optionally bounded by *start*/*end*.

    The conversation is labelled with *label*, or with the chat id rendered as
    a string when no (non-empty) label is given.
    """
    title = label if label else str(chat_id)
    return _fetch_by_chats([chat_id], title, db_path, start, end)
333
+
334
+
335
def fetch_person_messages(
    person: Person,
    db_path: pathlib.Path = DB_PATH,
    start: datetime.datetime | None = None,
    end: datetime.datetime | None = None,
) -> Conversation:
    """Fetch *person*'s direct (1:1) messages across all of their DM chats.

    Group chats the person belongs to are not included; the conversation is
    labelled with the person's name.
    """
    dm_ids = person.dm_chat_ids
    return _fetch_by_chats(dm_ids, person.name, db_path, start, end)
343
+
344
+
345
def list_people(db_path: pathlib.Path = DB_PATH) -> list[Person]:
    """List the people found in the chat database.

    Handles are merged into one Person via Contacts and handle normalization.
    Each Person tracks 1:1 DM chats and group chats separately, so a UI can
    headline the DMs and link to the groups.
    """
    chat_rows, participants = _load_raw_chats(db_path)
    contacts = load_contacts()
    return _build_people(chat_rows, participants, contacts)
353
+
354
+
355
def list_conversations(db_path: pathlib.Path = DB_PATH) -> list[dict]:
    """Build the unified Chats list, newest first.

    Yields one row per person who has at least one DM (merged across their
    handles) plus one row per group chat. Each row is a click target:

    * person DM: {"kind": "person", "key": ..., "name", ...}
    * group:     {"kind": "chat", "chat_id": ..., "name", ...}

    ``last_date`` is an ISO 8601 string (or ``None``); rows without a date
    sort last.
    """
    chat_rows, participants = _load_raw_chats(db_path)
    people = _build_people(chat_rows, participants, load_contacts())
    handle_to_person = {h: p for p in people for h in p.handles}
    chats = _build_chats(chat_rows, participants, handle_to_person)

    person_rows = [
        {
            "kind": "person",
            "key": person.key,
            "name": person.name,
            "is_group": False,
            "message_count": person.message_count,
            "last_date": person.last_date.isoformat() if person.last_date else None,
            "participant_count": 1,
            "handle_count": len(person.handles),
        }
        for person in people
        if person.dm_chat_ids
    ]
    group_rows = [
        {
            "kind": "chat",
            "chat_id": chat.chat_id,
            "name": chat.name,
            "is_group": True,
            "message_count": chat.message_count,
            "last_date": chat.last_date.isoformat() if chat.last_date else None,
            "participant_count": len(chat.participants),
            "handle_count": len(chat.participants),
        }
        for chat in chats
        if chat.is_group
    ]

    rows: list[dict] = person_rows + group_rows
    rows.sort(key=lambda row: row["last_date"] or "", reverse=True)
    return rows
@@ -0,0 +1,152 @@
1
+ """On-device embedding for semantic search over message windows.
2
+
3
+ Uses BAAI/bge-small-en-v1.5 via fastembed (ONNX runtime, no PyTorch).
4
+ The model file is downloaded once on first use and cached locally.
5
+ """
6
+
7
+ from dataclasses import dataclass
8
+ from datetime import datetime
9
+
10
+ import numpy as np
11
+
12
+ from .models import Message
13
+
14
+ _MODEL_NAME = "BAAI/bge-small-en-v1.5"
15
+ _DIM = 384
16
+ _WINDOW_SIZE = 5
17
+
18
+ _model = None
19
+
20
+
21
def _load_model():
    """Return the shared embedding model, creating it on the first call.

    The model is kept in the module-level ``_model`` so later calls reuse it.
    """
    global _model
    if _model is not None:
        return _model

    # Imported here rather than at module load because the import is heavy.
    from fastembed import TextEmbedding

    _model = TextEmbedding(model_name=_MODEL_NAME)
    return _model
28
+
29
+
30
def embedding_dim() -> int:
    """Return the embedding vector length this module works with (``_DIM``)."""
    dim = _DIM
    return dim
32
+
33
+
34
@dataclass
class Window:
    """A run of consecutive messages bundled into one unit for embedding."""

    # rowids of the first and last message in the window
    first_rowid: int
    last_rowid: int
    # timestamps of the first and last message in the window
    first_ts: datetime
    last_ts: datetime
    # the messages rendered as "sender: text" lines joined by newlines
    text: str
41
+
42
+
43
def window_messages(messages: list[Message], n: int = _WINDOW_SIZE) -> list[Window]:
    """Group *messages* (in chronological order) into windows of up to n messages each.

    Each window is one semantic unit — short context bundles tend to embed better
    than single messages like "lol" or "ok". Windows do not overlap; the last
    one may hold fewer than *n* messages. Each message is rendered as
    ``"<sender>: <text>"``, with ``"me"`` as the sender for outgoing messages.

    Raises:
        ValueError: if *n* is smaller than 1.
    """
    if n < 1:
        # Without this, n == 0 fails with an opaque range() error and a
        # negative n silently produces no windows at all.
        raise ValueError(f"window size n must be >= 1, got {n}")

    out: list[Window] = []
    for start in range(0, len(messages), n):
        # Never empty: start is always a valid index into messages.
        chunk = messages[start : start + n]
        first, last = chunk[0], chunk[-1]
        text = "\n".join(
            f"{'me' if m.is_from_me else m.sender}: {m.text}" for m in chunk
        )
        out.append(
            Window(
                first_rowid=first.rowid,
                last_rowid=last.rowid,
                first_ts=first.timestamp,
                last_ts=last.timestamp,
                text=text,
            )
        )
    return out
67
+
68
+
69
def embed_documents(texts: list[str], batch_size: int = 64) -> np.ndarray:
    """Embed document texts into an ``N x D`` float32 matrix.

    An empty *texts* returns a ``(0, _DIM)`` matrix without loading the model.
    NOTE(review): downstream code assumes the rows are L2-normalized; that is
    left to the model and is not enforced here.
    """
    if not texts:
        return np.zeros((0, _DIM), dtype=np.float32)
    embeddings = list(_load_model().embed(texts, batch_size=batch_size))
    rows = [np.asarray(vec, dtype=np.float32) for vec in embeddings]
    return np.stack(rows)
76
+
77
+
78
def embed_query(text: str) -> np.ndarray:
    """Embed a single search query as a 1-D float32 vector.

    Goes through the model's ``query_embed`` rather than ``embed``, which
    applies the BGE query-side prefix expected by this model.
    NOTE(review): normalization is left to the model, not enforced here.
    """
    embeddings = _load_model().query_embed([text])
    first = next(embeddings)
    return np.asarray(first, dtype=np.float32)
83
+
84
+
85
def cosine_topk(
    query_vec: np.ndarray, doc_matrix: np.ndarray, k: int = 20
) -> list[tuple[int, float]]:
    """Top-K cosine similarities (vectors assumed L2-normalized).

    Args:
        query_vec: 1-D query vector of length D.
        doc_matrix: N x D matrix, one document vector per row.
        k: maximum number of hits; clamped to N.

    Returns:
        ``(row_index, score)`` pairs sorted by descending score. Empty when
        *doc_matrix* has no elements or *k* is not positive.
    """
    if k <= 0 or doc_matrix.size == 0:
        # A non-positive k would otherwise turn into a negative partition
        # index / slice bound and return the wrong rows.
        return []
    scores = doc_matrix @ query_vec  # (N,)
    k = min(k, len(scores))
    # Partition first so only the k best candidates need a full sort.
    top = np.argpartition(-scores, k - 1)[:k]
    top = top[np.argsort(-scores[top])]
    return [(int(i), float(scores[i])) for i in top]
96
+
97
+
98
+ # Per-target in-memory cache: (target_kind, target_id) -> (meta, matrix, last_window_id)
99
+ _index_cache: dict[tuple[str, str], tuple[list[dict], np.ndarray, int]] = {}
100
+
101
+
102
def get_index_matrix(target_kind: str, target_id: str):
    """Return ``(meta, matrix)`` for a target, served from memory when current.

    The cached entry is reused only while its recorded last window id equals
    the one reported by the store; otherwise the windows are reloaded, the raw
    vector bytes are reshaped into a float32 matrix with ``_DIM`` columns, and
    the cache entry is replaced.
    """
    from .store import get_index_progress, load_windows  # avoid cycle at import

    cache_key = (target_kind, target_id)
    _, last_window_id = get_index_progress(target_kind, target_id)

    entry = _index_cache.get(cache_key)
    if entry is not None:
        cached_meta, cached_matrix, cached_window_id = entry
        if cached_window_id == last_window_id:
            return cached_meta, cached_matrix

    meta, vec_bytes = load_windows(target_kind, target_id)
    flat = np.frombuffer(vec_bytes, dtype=np.float32)
    # frombuffer gives a view onto vec_bytes; copy to get an independent array.
    matrix = flat.reshape(-1, _DIM).copy()
    _index_cache[cache_key] = (meta, matrix, last_window_id)
    return meta, matrix
115
+
116
+
117
def invalidate_index(target_kind: str, target_id: str) -> None:
    """Drop the in-memory index for a target; a no-op when nothing is cached."""
    cache_key = (target_kind, target_id)
    _index_cache.pop(cache_key, None)
119
+
120
+
121
def search_index(
    target_kind: str,
    target_id: str,
    query: str,
    k: int = 20,
    start_ts: str | None = None,
    end_ts: str | None = None,
) -> list[dict]:
    """Return top-K window hits for *query* in *target*'s index.

    If *start_ts*/*end_ts* (ISO 8601 UTC strings) are given, only windows whose
    time range overlaps [start_ts, end_ts] are eligible — so callers can scope
    retrieval to the same date range the user picked elsewhere in the UI.
    The bounds are compared to each window's ``first_ts``/``last_ts`` as plain
    strings, so both sides must use the same timestamp format.

    Returns:
        Copies of the matching windows' metadata dicts, each with an added
        ``"score"``, best match first. Empty when *k* is not positive, the
        index is empty, or no window falls inside the requested range.
    """
    if k <= 0:
        # Nothing requested; also keeps a negative k away from the
        # partition index / slice below and skips the model call.
        return []
    meta, matrix = get_index_matrix(target_kind, target_id)
    if len(meta) == 0:
        return []

    # `scores` is a fresh array, so masking it never touches the cached matrix.
    scores = matrix @ embed_query(query)
    if start_ts is not None or end_ts is not None:
        for i, m in enumerate(meta):
            ends_too_early = start_ts is not None and m["last_ts"] < start_ts
            starts_too_late = end_ts is not None and m["first_ts"] > end_ts
            if ends_too_early or starts_too_late:
                scores[i] = -np.inf

    eligible = int(np.sum(scores > -1e30))
    if eligible == 0:
        return []
    k = min(k, eligible)
    top = np.argpartition(-scores, k - 1)[:k]
    top = top[np.argsort(-scores[top])]
    return [{**meta[i], "score": float(scores[i])} for i in top]