msg-summarizer 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- msg_summarizer/__init__.py +0 -0
- msg_summarizer/advisor.py +125 -0
- msg_summarizer/backfill.py +112 -0
- msg_summarizer/cli.py +394 -0
- msg_summarizer/config.py +57 -0
- msg_summarizer/contacts.py +76 -0
- msg_summarizer/db.py +401 -0
- msg_summarizer/embeddings.py +152 -0
- msg_summarizer/indexer.py +104 -0
- msg_summarizer/models.py +44 -0
- msg_summarizer/stats.py +166 -0
- msg_summarizer/store.py +327 -0
- msg_summarizer/summarizer.py +531 -0
- msg_summarizer/web/__init__.py +0 -0
- msg_summarizer/web/app.py +838 -0
- msg_summarizer/web/static/assets/index-8M1XSupb.css +1 -0
- msg_summarizer/web/static/assets/index-BhdDljAs.js +43 -0
- msg_summarizer/web/static/index.html +13 -0
- msg_summarizer-0.1.0.dist-info/METADATA +178 -0
- msg_summarizer-0.1.0.dist-info/RECORD +23 -0
- msg_summarizer-0.1.0.dist-info/WHEEL +4 -0
- msg_summarizer-0.1.0.dist-info/entry_points.txt +2 -0
- msg_summarizer-0.1.0.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
"""Resolve iMessage handles to real names via the macOS AddressBook (Contacts).
|
|
2
|
+
|
|
3
|
+
Reads the AddressBook SQLite stores directly (under ~/Library/Application
|
|
4
|
+
Support/AddressBook), which existing Full Disk Access already covers — no extra
|
|
5
|
+
permission prompt. Fails soft: returns {} if Contacts can't be read, so the
|
|
6
|
+
rest of the app still works with raw handles.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import glob
|
|
10
|
+
import pathlib
|
|
11
|
+
import re
|
|
12
|
+
import sqlite3
|
|
13
|
+
|
|
14
|
+
_AB_DIR = pathlib.Path.home() / "Library" / "Application Support" / "AddressBook"
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def normalize_handle(handle: str) -> str:
    """Turn a handle into a canonical lookup key.

    Anything containing "@" is treated as an email and returned stripped and
    lowercased. Otherwise all non-digits are removed: with ten or more digits
    left only the trailing ten are kept, with fewer all of them are kept, and
    with none the stripped/lowercased text itself is returned. A falsy
    *handle* yields "".
    """
    cleaned = (handle or "").strip().lower()
    if "@" in cleaned:
        return cleaned
    only_digits = re.sub(r"\D", "", cleaned)
    if len(only_digits) >= 10:
        return only_digits[-10:]
    if only_digits:
        return only_digits
    return cleaned
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def _addressbook_paths() -> list[str]:
    """Return the paths of every AddressBook SQLite store under ``_AB_DIR``.

    Per-source stores (``Sources/*/AddressBook-v22.abcddb``) come first, in
    sorted order, followed by the top-level store when it exists.

    ``glob.glob`` makes no promise about result order, and ``load_contacts``
    embeds each path's position in the contact key, so the per-source paths
    are sorted to keep those keys reproducible from one run to the next.
    """
    pattern = str(_AB_DIR / "Sources" / "*" / "AddressBook-v22.abcddb")
    paths = sorted(glob.glob(pattern))
    top = _AB_DIR / "AddressBook-v22.abcddb"
    if top.exists():
        paths.append(str(top))
    return paths
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def _record_name(first, last, org) -> str:
|
|
35
|
+
parts = [p for p in (first, last) if p]
|
|
36
|
+
if parts:
|
|
37
|
+
return " ".join(parts)
|
|
38
|
+
return org or ""
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def _harvest(conn: sqlite3.Connection, mapping: dict, source: str) -> None:
    """Add one store's phone numbers and email addresses to *mapping*.

    For every phone/email row joined to a record that has a usable name,
    stores ``normalize_handle(value) -> ("ab:{source}:{pk}", name)``. Keys
    already present in *mapping* are left alone, so the first row to claim a
    handle wins. A table that cannot be queried is skipped.
    """
    lookups = (("ZABCDPHONENUMBER", "ZFULLNUMBER"), ("ZABCDEMAILADDRESS", "ZADDRESS"))
    for table_name, value_col in lookups:
        # Table and column names are the fixed literals above, never user input.
        sql = (
            f"SELECT x.{value_col}, r.Z_PK, r.ZFIRSTNAME, r.ZLASTNAME, r.ZORGANIZATION "
            f"FROM {table_name} x JOIN ZABCDRECORD r ON x.ZOWNER = r.Z_PK"
        )
        try:
            records = conn.execute(sql).fetchall()
        except sqlite3.Error:
            # Schema variation in this source; move on to the next table.
            continue
        for raw_value, record_pk, first, last, org in records:
            # Rows with no value, or no resolvable name, contribute nothing.
            display = _record_name(first, last, org) if raw_value else ""
            if display:
                mapping.setdefault(
                    normalize_handle(raw_value),
                    (f"ab:{source}:{record_pk}", display),
                )
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def load_contacts() -> dict[str, tuple[str, str]]:
    """Map normalized handle -> (contact_key, display_name).

    The contact_key is built per contact record (store index plus record
    primary key), so a person's phone and email on the same card collapse to
    the same key (and thus the same merged Person).

    Stores that cannot be opened are skipped; if none can be read the result
    is an empty dict.
    """
    from urllib.parse import quote  # only needed to build the SQLite URI

    mapping: dict[str, tuple[str, str]] = {}
    for i, path in enumerate(_addressbook_paths()):
        # Percent-encode the path so "?", "#" or "%" inside it is not parsed
        # as URI syntax; SQLite decodes %HH escapes in URI filenames.
        uri = f"file:{quote(path)}?mode=ro"
        try:
            conn = sqlite3.connect(uri, uri=True)
        except sqlite3.OperationalError:
            continue
        try:
            _harvest(conn, mapping, source=str(i))
        finally:
            conn.close()
    return mapping
|
msg_summarizer/db.py
ADDED
|
@@ -0,0 +1,401 @@
|
|
|
1
|
+
import datetime
|
|
2
|
+
import pathlib
|
|
3
|
+
import re
|
|
4
|
+
import sqlite3
|
|
5
|
+
|
|
6
|
+
from typedstream.stream import TypedStreamReader
|
|
7
|
+
|
|
8
|
+
from .contacts import load_contacts, normalize_handle
|
|
9
|
+
from .models import Chat, Conversation, Message, Person
|
|
10
|
+
|
|
11
|
+
DB_PATH = pathlib.Path.home() / "Library" / "Messages" / "chat.db"
|
|
12
|
+
|
|
13
|
+
# Seconds between Unix epoch (1970-01-01) and Apple Core Data epoch (2001-01-01)
|
|
14
|
+
_APPLE_EPOCH_OFFSET = 978307200
|
|
15
|
+
|
|
16
|
+
# Base query; date-bound clauses and ORDER BY are appended by _fetch().
|
|
17
|
+
# Messages on macOS Ventura+ usually leave `text` NULL and store the body in
|
|
18
|
+
# `attributedBody`, so we select both and resolve in _rows_to_messages().
|
|
19
|
+
_MESSAGES_BASE = """
|
|
20
|
+
SELECT
|
|
21
|
+
m.ROWID AS message_rowid,
|
|
22
|
+
m.date AS apple_date,
|
|
23
|
+
h.id AS handle_id,
|
|
24
|
+
m.is_from_me AS is_from_me,
|
|
25
|
+
m.text AS text,
|
|
26
|
+
m.attributedBody AS attributed_body
|
|
27
|
+
FROM message m
|
|
28
|
+
LEFT JOIN handle h ON m.handle_id = h.ROWID
|
|
29
|
+
JOIN chat_message_join cmj ON m.ROWID = cmj.message_id
|
|
30
|
+
JOIN chat c ON cmj.chat_id = c.ROWID
|
|
31
|
+
WHERE
|
|
32
|
+
(
|
|
33
|
+
LOWER(h.id) LIKE :contact_pattern
|
|
34
|
+
OR LOWER(c.display_name) LIKE :contact_pattern
|
|
35
|
+
OR LOWER(c.chat_identifier) LIKE :contact_pattern
|
|
36
|
+
)
|
|
37
|
+
AND (
|
|
38
|
+
(m.text IS NOT NULL AND m.text != '')
|
|
39
|
+
OR m.attributedBody IS NOT NULL
|
|
40
|
+
)
|
|
41
|
+
"""
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def _apple_ts(dt: datetime.datetime) -> int:
    """Convert *dt* to nanoseconds since the Apple epoch (2001-01-01).

    Uses ``dt.timestamp()``, which treats a naive datetime as local time.
    """
    seconds_since_apple_epoch = dt.timestamp() - _APPLE_EPOCH_OFFSET
    return int(seconds_since_apple_epoch * 1_000_000_000)
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def _from_apple_ts(apple_ts: int) -> datetime.datetime:
    """Convert an Apple nanosecond timestamp to a UTC-aware datetime."""
    unix_seconds = _APPLE_EPOCH_OFFSET + apple_ts / 1_000_000_000
    return datetime.datetime.fromtimestamp(unix_seconds, tz=datetime.timezone.utc)
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def _connect(db_path: pathlib.Path) -> sqlite3.Connection:
|
|
55
|
+
"""Open *db_path* read-only, or raise a helpful PermissionError."""
|
|
56
|
+
try:
|
|
57
|
+
return sqlite3.connect(f"file:{db_path}?mode=ro", uri=True)
|
|
58
|
+
except sqlite3.OperationalError as exc:
|
|
59
|
+
raise PermissionError(
|
|
60
|
+
f"Cannot open {db_path}.\n"
|
|
61
|
+
"Grant Full Disk Access to your terminal app in:\n"
|
|
62
|
+
" System Settings > Privacy & Security > Full Disk Access"
|
|
63
|
+
) from exc
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def _decode_attributed_body(blob: bytes | None) -> str | None:
|
|
67
|
+
"""Extract the message text from a Messages `attributedBody` blob.
|
|
68
|
+
|
|
69
|
+
macOS Ventura+ leaves the `text` column NULL and stores the body as a
|
|
70
|
+
serialized NSAttributedString (Apple's "typedstream" format). The message
|
|
71
|
+
text is the first string payload in the stream, ahead of the attribute-run
|
|
72
|
+
metadata (e.g. ``__kIMMessagePartAttributeName``).
|
|
73
|
+
"""
|
|
74
|
+
if not blob:
|
|
75
|
+
return None
|
|
76
|
+
try:
|
|
77
|
+
for event in TypedStreamReader.from_data(blob):
|
|
78
|
+
if isinstance(event, bytes):
|
|
79
|
+
return event.decode("utf-8", errors="replace")
|
|
80
|
+
if isinstance(event, str):
|
|
81
|
+
return event
|
|
82
|
+
except Exception:
|
|
83
|
+
return None
|
|
84
|
+
return None
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def _rows_to_messages(rows: list[tuple]) -> list[Message]:
    """Turn raw query rows into ``Message`` objects, dropping empty bodies.

    Each row is ``(rowid, apple_date, handle_id, is_from_me, text,
    attributed_body)``. The body is ``text`` when non-empty, otherwise the
    decoded ``attributed_body``; rows whose body is missing or whitespace-only
    are skipped. The sender is "me" for outgoing messages, else the handle id
    (or "them" when there is none).
    """
    result: list[Message] = []
    for rowid, apple_date, handle_id, is_from_me, text, attributed_body in rows:
        body = text or _decode_attributed_body(attributed_body)
        body = body.strip() if body else ""
        if not body:
            continue
        sender = "me" if is_from_me else (handle_id or "them")
        message = Message(
            timestamp=_from_apple_ts(apple_date),
            sender=sender,
            text=body,
            is_from_me=bool(is_from_me),
            rowid=rowid,
        )
        result.append(message)
    return result
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
_LIST_CONTACTS_QUERY = """
|
|
106
|
+
SELECT
|
|
107
|
+
h.id AS handle_id,
|
|
108
|
+
COALESCE(c.display_name, '') AS display_name,
|
|
109
|
+
COUNT(DISTINCT m.ROWID) AS msg_count
|
|
110
|
+
FROM handle h
|
|
111
|
+
JOIN chat_handle_join chj ON h.ROWID = chj.handle_id
|
|
112
|
+
JOIN chat c ON chj.chat_id = c.ROWID
|
|
113
|
+
LEFT JOIN chat_message_join cmj ON c.ROWID = cmj.chat_id
|
|
114
|
+
LEFT JOIN message m ON cmj.message_id = m.ROWID
|
|
115
|
+
GROUP BY h.id, c.display_name
|
|
116
|
+
ORDER BY msg_count DESC
|
|
117
|
+
LIMIT :limit
|
|
118
|
+
"""
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
def list_contacts(
    db_path: pathlib.Path = DB_PATH,
    limit: int = 50,
) -> list[tuple[str, str, int]]:
    """Return (handle_id, display_name, msg_count) for contacts in the DB.

    Runs ``_LIST_CONTACTS_QUERY`` with *limit* bound to its ``:limit``
    parameter and returns the rows as plain tuples.

    Raises:
        PermissionError: propagated from ``_connect`` when the database
            cannot be opened.
    """
    conn = _connect(db_path)
    try:
        rows = conn.execute(_LIST_CONTACTS_QUERY, {"limit": limit}).fetchall()
    finally:
        # A sqlite3 connection used as a context manager only commits or
        # rolls back; it never closes. Close explicitly to release the file.
        conn.close()

    return [(handle_id, display_name, msg_count) for handle_id, display_name, msg_count in rows]
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
def fetch_messages(
    contact: str,
    db_path: pathlib.Path = DB_PATH,
    start: datetime.datetime | None = None,
    end: datetime.datetime | None = None,
) -> Conversation:
    """Fetch messages matching *contact*, optionally bounded by *start*/*end* (UTC).

    *contact* is lowercased and used as a ``%...%`` LIKE pattern by
    ``_MESSAGES_BASE``. Bounds are inclusive and converted with ``_apple_ts``.
    With no bounds, returns the full message history, oldest first.

    Raises:
        PermissionError: propagated from ``_connect`` when the database
            cannot be opened.
    """
    query = _MESSAGES_BASE
    params: dict[str, object] = {"contact_pattern": f"%{contact.lower()}%"}
    if start is not None:
        query += "    AND m.date >= :start_ts\n"
        params["start_ts"] = _apple_ts(start)
    if end is not None:
        query += "    AND m.date <= :end_ts\n"
        params["end_ts"] = _apple_ts(end)
    query += "ORDER BY m.date ASC"

    conn = _connect(db_path)
    try:
        rows = conn.execute(query, params).fetchall()
    finally:
        # A sqlite3 connection used as a context manager only commits or
        # rolls back; it never closes. Close explicitly to release the file.
        conn.close()

    return Conversation(contact=contact, messages=_rows_to_messages(rows))
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
# --- Chat- and person-oriented model -------------------------------------
|
|
161
|
+
|
|
162
|
+
_CHATS_QUERY = """
|
|
163
|
+
SELECT
|
|
164
|
+
c.ROWID AS chat_id,
|
|
165
|
+
c.chat_identifier AS identifier,
|
|
166
|
+
COALESCE(c.display_name, '') AS display_name,
|
|
167
|
+
COUNT(DISTINCT cmj.message_id) AS msg_count,
|
|
168
|
+
MAX(m.date) AS last_date
|
|
169
|
+
FROM chat c
|
|
170
|
+
LEFT JOIN chat_message_join cmj ON c.ROWID = cmj.chat_id
|
|
171
|
+
LEFT JOIN message m ON cmj.message_id = m.ROWID
|
|
172
|
+
GROUP BY c.ROWID
|
|
173
|
+
"""
|
|
174
|
+
|
|
175
|
+
_CHAT_PARTICIPANTS_QUERY = """
|
|
176
|
+
SELECT chj.chat_id, h.id
|
|
177
|
+
FROM chat_handle_join chj
|
|
178
|
+
JOIN handle h ON chj.handle_id = h.ROWID
|
|
179
|
+
"""
|
|
180
|
+
|
|
181
|
+
_EPOCH_MIN = datetime.datetime.min.replace(tzinfo=datetime.timezone.utc)
|
|
182
|
+
|
|
183
|
+
# How many resolved names to show before collapsing to "& N more" in a group name.
|
|
184
|
+
_GROUP_NAME_MAX = 4
|
|
185
|
+
|
|
186
|
+
|
|
187
|
+
def _load_raw_chats(db_path: pathlib.Path) -> tuple[list[tuple], dict[int, list[str]]]:
    """Load raw chat rows + a chat_id -> [handle ids] map. Shared by list_chats/people.

    Returns:
        ``(chat_rows, participants)`` where ``chat_rows`` is the result of
        ``_CHATS_QUERY`` and ``participants`` groups the handle ids from
        ``_CHAT_PARTICIPANTS_QUERY`` by chat id, in query order.

    Raises:
        PermissionError: propagated from ``_connect`` when the database
            cannot be opened.
    """
    conn = _connect(db_path)
    try:
        chat_rows = conn.execute(_CHATS_QUERY).fetchall()
        part_rows = conn.execute(_CHAT_PARTICIPANTS_QUERY).fetchall()
    finally:
        # A sqlite3 connection used as a context manager only commits or
        # rolls back; it never closes. Close explicitly to release the file.
        conn.close()

    participants: dict[int, list[str]] = {}
    for chat_id, handle_id in part_rows:
        participants.setdefault(chat_id, []).append(handle_id)
    return chat_rows, participants
|
|
197
|
+
|
|
198
|
+
|
|
199
|
+
def _build_people(
    chat_rows: list[tuple],
    participants: dict[int, list[str]],
    contacts: dict[str, tuple[str, str]],
) -> list[Person]:
    """Derive Person objects from raw chats, merging handles via Contacts.

    Every participant handle is normalized and looked up in *contacts*. A hit
    supplies the person's key and name; a miss uses the normalized handle as
    the key and the raw handle as the name. A chat with more than one
    participant counts as a group chat, otherwise as a DM.

    NOTE(review): ``message_count`` and ``last_date`` accumulate over every
    chat the person appears in, group chats included; confirm that is intended.

    Returns the people sorted by most recent activity first.
    """
    by_key: dict[str, Person] = {}
    for chat_id, _identifier, _display_name, msg_count, last_date in chat_rows:
        handles = participants.get(chat_id, [])
        group_chat = len(handles) > 1
        chat_last = _from_apple_ts(last_date) if last_date else None
        chat_count = msg_count or 0
        for handle in handles:
            normalized = normalize_handle(handle)
            match = contacts.get(normalized)
            if match:
                key, name = match
            else:
                key, name = normalized, handle

            try:
                person = by_key[key]
            except KeyError:
                person = by_key[key] = Person(key=key, name=name)

            if handle not in person.handles:
                person.handles.append(handle)

            bucket = person.group_chat_ids if group_chat else person.dm_chat_ids
            bucket.append(chat_id)

            person.message_count += chat_count
            if chat_last and (person.last_date is None or chat_last > person.last_date):
                person.last_date = chat_last

    # People with no recorded activity sort as if dated at _EPOCH_MIN.
    return sorted(by_key.values(), key=lambda p: p.last_date or _EPOCH_MIN, reverse=True)
|
|
230
|
+
|
|
231
|
+
|
|
232
|
+
def _format_group_name(names: list[str]) -> str:
|
|
233
|
+
if not names:
|
|
234
|
+
return "Group"
|
|
235
|
+
if len(names) <= _GROUP_NAME_MAX:
|
|
236
|
+
return ", ".join(names)
|
|
237
|
+
return ", ".join(names[:_GROUP_NAME_MAX]) + f" & {len(names) - _GROUP_NAME_MAX} more"
|
|
238
|
+
|
|
239
|
+
|
|
240
|
+
def _build_chats(
    chat_rows: list[tuple],
    participants: dict[int, list[str]],
    handle_to_person: dict[str, Person],
) -> list[Chat]:
    """Build Chat objects from raw chat rows, resolving a name for each.

    Name precedence: the chat's own display name; for a group, the sorted
    participant names passed through ``_format_group_name``; for a 1:1, the
    matched person's name or the raw handle; otherwise the chat identifier,
    or "Unknown". Chats are returned newest activity first.
    """
    built: list[Chat] = []
    for chat_id, identifier, display_name, msg_count, last_date in chat_rows:
        members = participants.get(chat_id, [])
        group = len(members) > 1

        if display_name:
            title = display_name
        elif group:
            labels = [
                handle_to_person[h].name if h in handle_to_person else h for h in members
            ]
            labels.sort()
            title = _format_group_name(labels)
        elif members:
            owner = handle_to_person.get(members[0])
            title = owner.name if owner else members[0]
        else:
            title = identifier or "Unknown"

        chat = Chat(
            chat_id=chat_id,
            identifier=identifier or "",
            display_name=display_name or "",
            name=title,
            is_group=group,
            participants=members,
            message_count=msg_count or 0,
            last_date=_from_apple_ts(last_date) if last_date else None,
        )
        built.append(chat)

    # Chats with no messages sort as if dated at _EPOCH_MIN.
    built.sort(key=lambda c: c.last_date or _EPOCH_MIN, reverse=True)
    return built
|
|
275
|
+
|
|
276
|
+
|
|
277
|
+
def list_chats(db_path: pathlib.Path = DB_PATH) -> list[Chat]:
    """Return every chat thread (1:1 and group), newest activity first.

    Names are resolved by ``_build_chats``: 1:1s use the matched Contact's
    name; unnamed groups use the alphabetized list of participant names.
    """
    chat_rows, participants = _load_raw_chats(db_path)
    people = _build_people(chat_rows, participants, load_contacts())

    # Reverse index: every raw handle points at the person who owns it.
    handle_to_person = {}
    for person in people:
        for handle in person.handles:
            handle_to_person[handle] = person

    return _build_chats(chat_rows, participants, handle_to_person)
|
|
288
|
+
|
|
289
|
+
|
|
290
|
+
def _fetch_by_chats(
    chat_ids: list[int],
    label: str,
    db_path: pathlib.Path,
    start: datetime.datetime | None,
    end: datetime.datetime | None,
) -> Conversation:
    """Fetch the messages of the given chats as one Conversation labelled *label*.

    Only messages with non-empty ``text`` or a non-NULL ``attributedBody`` are
    selected. *start* / *end* are optional inclusive bounds, converted with
    ``_apple_ts``. Results are ordered oldest first. An empty *chat_ids*
    returns an empty Conversation without touching the database.

    Raises:
        PermissionError: propagated from ``_connect`` when the database
            cannot be opened.
    """
    if not chat_ids:
        return Conversation(contact=label, messages=[])

    # One "?" per chat id; the ids themselves are always bound as parameters.
    placeholders = ",".join("?" for _ in chat_ids)
    query = f"""
        SELECT m.ROWID, m.date, h.id, m.is_from_me, m.text, m.attributedBody
        FROM message m
        LEFT JOIN handle h ON m.handle_id = h.ROWID
        JOIN chat_message_join cmj ON m.ROWID = cmj.message_id
        WHERE cmj.chat_id IN ({placeholders})
          AND ((m.text IS NOT NULL AND m.text != '') OR m.attributedBody IS NOT NULL)
    """
    params: list[object] = list(chat_ids)
    if start is not None:
        query += " AND m.date >= ?"
        params.append(_apple_ts(start))
    if end is not None:
        query += " AND m.date <= ?"
        params.append(_apple_ts(end))
    query += " ORDER BY m.date ASC"

    conn = _connect(db_path)
    try:
        rows = conn.execute(query, params).fetchall()
    finally:
        # A sqlite3 connection used as a context manager only commits or
        # rolls back; it never closes. Close explicitly to release the file.
        conn.close()
    return Conversation(contact=label, messages=_rows_to_messages(rows))
|
|
322
|
+
|
|
323
|
+
|
|
324
|
+
def fetch_chat_messages(
    chat_id: int,
    label: str | None = None,
    db_path: pathlib.Path = DB_PATH,
    start: datetime.datetime | None = None,
    end: datetime.datetime | None = None,
) -> Conversation:
    """Fetch one chat thread's messages.

    The conversation is labelled with *label*, or with the chat id as a
    string when *label* is empty or None.
    """
    title = label if label else str(chat_id)
    return _fetch_by_chats([chat_id], title, db_path, start, end)
|
|
333
|
+
|
|
334
|
+
|
|
335
|
+
def fetch_person_messages(
    person: Person,
    db_path: pathlib.Path = DB_PATH,
    start: datetime.datetime | None = None,
    end: datetime.datetime | None = None,
) -> Conversation:
    """Fetch *person*'s 1:1 messages, merged across all of their DM chats.

    The conversation is labelled with the person's name; group chats are not
    included.
    """
    return _fetch_by_chats(
        chat_ids=person.dm_chat_ids,
        label=person.name,
        db_path=db_path,
        start=start,
        end=end,
    )
|
|
343
|
+
|
|
344
|
+
|
|
345
|
+
def list_people(db_path: pathlib.Path = DB_PATH) -> list[Person]:
    """Derive people from chats, merging handles via Contacts and normalized ids.

    DM chats and group chats are recorded separately on each Person
    (``dm_chat_ids`` / ``group_chat_ids``) so the UI can link to either.
    """
    chat_rows, participants = _load_raw_chats(db_path)
    contacts = load_contacts()
    return _build_people(chat_rows, participants, contacts)
|
|
353
|
+
|
|
354
|
+
|
|
355
|
+
def list_conversations(db_path: pathlib.Path = DB_PATH) -> list[dict]:
    """Unified Chats list, sorted newest first.

    Contains one row per person who has at least one DM (merged across their
    handles) plus one row per group chat. Each row is shaped as a click target:

      * person DM: {"kind": "person", "key": ..., "name", ...}
      * group:     {"kind": "chat", "chat_id": ..., "name", ...}

    ``last_date`` is an ISO 8601 string or None; rows without a date sort last.
    """
    chat_rows, participants = _load_raw_chats(db_path)
    contacts = load_contacts()
    people = _build_people(chat_rows, participants, contacts)
    handle_to_person = {h: p for p in people for h in p.handles}
    chats = _build_chats(chat_rows, participants, handle_to_person)

    def _iso(moment):
        # Serialize a datetime for the row payload; missing dates stay None.
        return moment.isoformat() if moment else None

    person_rows = [
        {
            "kind": "person",
            "key": person.key,
            "name": person.name,
            "is_group": False,
            "message_count": person.message_count,
            "last_date": _iso(person.last_date),
            "participant_count": 1,
            "handle_count": len(person.handles),
        }
        for person in people
        if person.dm_chat_ids
    ]
    group_rows = [
        {
            "kind": "chat",
            "chat_id": chat.chat_id,
            "name": chat.name,
            "is_group": True,
            "message_count": chat.message_count,
            "last_date": _iso(chat.last_date),
            "participant_count": len(chat.participants),
            "handle_count": len(chat.participants),
        }
        for chat in chats
        if chat.is_group
    ]
    return sorted(
        person_rows + group_rows,
        key=lambda row: row["last_date"] or "",
        reverse=True,
    )
|
|
@@ -0,0 +1,152 @@
|
|
|
1
|
+
"""On-device embedding for semantic search over message windows.
|
|
2
|
+
|
|
3
|
+
Uses BAAI/bge-small-en-v1.5 via fastembed (ONNX runtime, no PyTorch).
|
|
4
|
+
The model file is downloaded once on first use and cached locally.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from dataclasses import dataclass
|
|
8
|
+
from datetime import datetime
|
|
9
|
+
|
|
10
|
+
import numpy as np
|
|
11
|
+
|
|
12
|
+
from .models import Message
|
|
13
|
+
|
|
14
|
+
_MODEL_NAME = "BAAI/bge-small-en-v1.5"
|
|
15
|
+
_DIM = 384
|
|
16
|
+
_WINDOW_SIZE = 5
|
|
17
|
+
|
|
18
|
+
_model = None
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def _load_model():
    """Return the shared embedding model, creating it on first call.

    The instance is kept in the module-level ``_model`` so later calls reuse
    it instead of constructing the model again.
    """
    global _model
    if _model is not None:
        return _model
    from fastembed import TextEmbedding  # heavy import; defer until needed

    _model = TextEmbedding(model_name=_MODEL_NAME)
    return _model
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def embedding_dim() -> int:
    """Return the embedding vector length used by this module (``_DIM``)."""
    return _DIM
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
@dataclass
class Window:
    """A run of consecutive messages bundled into one unit for embedding."""

    # rowid of the first and last message in the window.
    first_rowid: int
    last_rowid: int
    # Timestamps of the first and last message in the window.
    first_ts: datetime
    last_ts: datetime
    # The window's messages rendered as "sender: text" lines joined by newlines.
    text: str
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def window_messages(messages: list[Message], n: int = _WINDOW_SIZE) -> list[Window]:
    """Split *messages* (already in chronological order) into windows of up to n.

    Windows are consecutive and non-overlapping; the last one may be shorter.
    Each window's text is one "sender: text" line per message, with "me" as
    the sender for outgoing messages. Short context bundles tend to embed
    better than single messages like "lol" or "ok".
    """
    windows: list[Window] = []
    for start in range(0, len(messages), n):
        # start is always below len(messages), so the slice is never empty.
        batch = messages[start : start + n]
        first, last = batch[0], batch[-1]
        lines = [f"{'me' if m.is_from_me else m.sender}: {m.text}" for m in batch]
        windows.append(
            Window(
                first_rowid=first.rowid,
                last_rowid=last.rowid,
                first_ts=first.timestamp,
                last_ts=last.timestamp,
                text="\n".join(lines),
            )
        )
    return windows
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def embed_documents(texts: list[str], batch_size: int = 64) -> np.ndarray:
    """Embed document texts into an N x D float32 matrix.

    An empty *texts* returns a (0, ``_DIM``) matrix without loading the model.
    NOTE(review): callers treat the rows as L2-normalized; that depends on the
    model's output, so confirm it against fastembed.
    """
    if not texts:
        return np.zeros((0, _DIM), dtype=np.float32)
    embeddings = _load_model().embed(texts, batch_size=batch_size)
    rows = [np.asarray(vec, dtype=np.float32) for vec in embeddings]
    return np.stack(rows)
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def embed_query(text: str) -> np.ndarray:
    """Embed a single search query as a 1-D float32 vector.

    NOTE(review): callers treat the vector as L2-normalized; that depends on
    the model's output, so confirm it against fastembed.
    """
    # query_embed applies the BGE query-side prefix expected by this model.
    embeddings = _load_model().query_embed([text])
    first_vec = next(embeddings)
    return np.asarray(first_vec, dtype=np.float32)
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def cosine_topk(
    query_vec: np.ndarray, doc_matrix: np.ndarray, k: int = 20
) -> list[tuple[int, float]]:
    """Top-K cosine similarities (vectors assumed L2-normalized).

    Args:
        query_vec: 1-D query vector of length D.
        doc_matrix: N x D matrix with one document vector per row.
        k: maximum number of hits; capped at N.

    Returns:
        ``(row_index, score)`` pairs, best score first. Empty when *k* is not
        positive or *doc_matrix* has no elements.
    """
    # A non-positive k would turn into a negative partition index and slice
    # bound below, returning the wrong number of hits instead of none.
    if k <= 0 or doc_matrix.size == 0:
        return []
    scores = doc_matrix @ query_vec  # (N,)
    k = min(k, len(scores))
    # argpartition selects the k best in O(N); only those k are then sorted.
    top = np.argpartition(-scores, k - 1)[:k]
    top = top[np.argsort(-scores[top])]
    return [(int(i), float(scores[i])) for i in top]
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
# Per-target in-memory cache: (target_kind, target_id) -> (meta, matrix, last_window_id)
|
|
99
|
+
_index_cache: dict[tuple[str, str], tuple[list[dict], np.ndarray, int]] = {}
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
def get_index_matrix(target_kind: str, target_id: str):
    """Return (meta, matrix) for the target, using the in-memory cache when fresh.

    The cached entry is reused only while its stored last-window id equals the
    one reported by ``get_index_progress``; otherwise the windows are reloaded
    from the store and the cache entry is replaced.
    """
    from .store import get_index_progress, load_windows  # avoid cycle at import

    cache_key = (target_kind, target_id)
    _, last_window_id = get_index_progress(target_kind, target_id)

    entry = _index_cache.get(cache_key)
    if entry is not None:
        cached_meta, cached_matrix, cached_window_id = entry
        if cached_window_id == last_window_id:
            return cached_meta, cached_matrix

    meta, vec_bytes = load_windows(target_kind, target_id)
    flat = np.frombuffer(vec_bytes, dtype=np.float32)
    # frombuffer returns a read-only view of the bytes; copy() yields an
    # independent, writable array.
    matrix = flat.reshape(-1, _DIM).copy()
    _index_cache[cache_key] = (meta, matrix, last_window_id)
    return meta, matrix
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
def invalidate_index(target_kind: str, target_id: str) -> None:
    """Drop the target's entry from the in-memory index cache, if present."""
    cache_key = (target_kind, target_id)
    _index_cache.pop(cache_key, None)
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
def search_index(
    target_kind: str,
    target_id: str,
    query: str,
    k: int = 20,
    start_ts: str | None = None,
    end_ts: str | None = None,
) -> list[dict]:
    """Return top-K window hits for *query* in *target*'s index.

    If *start_ts*/*end_ts* (ISO 8601 UTC strings) are given, only windows whose
    time range overlaps [start_ts, end_ts] are eligible, so callers can scope
    retrieval to the same date range the user picked elsewhere in the UI.
    Timestamps are compared as strings, so the bounds must use the same
    format as the stored ``first_ts`` / ``last_ts`` values.

    Returns:
        Each hit is the window's metadata dict plus a ``"score"`` key, best
        score first. Empty when *k* is not positive, the index is empty, or no
        window falls in the requested range.
    """
    # A non-positive k would turn into a negative partition index and slice
    # bound below, returning the wrong number of hits instead of none.
    if k <= 0:
        return []
    meta, matrix = get_index_matrix(target_kind, target_id)
    if len(meta) == 0:
        return []
    q_vec = embed_query(query)
    scores = matrix @ q_vec
    if start_ts is not None or end_ts is not None:
        for i, m in enumerate(meta):
            # Window ends before the range starts, or starts after it ends.
            if start_ts is not None and m["last_ts"] < start_ts:
                scores[i] = -np.inf
            if end_ts is not None and m["first_ts"] > end_ts:
                scores[i] = -np.inf
    # Count the windows that were not masked out with -inf above.
    eligible = int(np.sum(scores > -1e30))
    if eligible == 0:
        return []
    k = min(k, eligible)
    top = np.argpartition(-scores, k - 1)[:k]
    top = top[np.argsort(-scores[top])]
    return [{**meta[i], "score": float(scores[i])} for i in top]
|