superlocalmemory 3.4.1 → 3.4.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/pyproject.toml +11 -2
- package/scripts/postinstall.js +26 -7
- package/src/superlocalmemory/cli/commands.py +42 -60
- package/src/superlocalmemory/cli/daemon.py +107 -47
- package/src/superlocalmemory/cli/main.py +10 -0
- package/src/superlocalmemory/cli/setup_wizard.py +137 -9
- package/src/superlocalmemory/core/config.py +28 -0
- package/src/superlocalmemory/core/consolidation_engine.py +38 -1
- package/src/superlocalmemory/core/engine.py +9 -0
- package/src/superlocalmemory/core/health_monitor.py +313 -0
- package/src/superlocalmemory/core/reranker_worker.py +19 -5
- package/src/superlocalmemory/ingestion/__init__.py +13 -0
- package/src/superlocalmemory/ingestion/adapter_manager.py +234 -0
- package/src/superlocalmemory/ingestion/base_adapter.py +177 -0
- package/src/superlocalmemory/ingestion/calendar_adapter.py +340 -0
- package/src/superlocalmemory/ingestion/credentials.py +118 -0
- package/src/superlocalmemory/ingestion/gmail_adapter.py +369 -0
- package/src/superlocalmemory/ingestion/parsers.py +100 -0
- package/src/superlocalmemory/ingestion/transcript_adapter.py +156 -0
- package/src/superlocalmemory/learning/consolidation_worker.py +47 -1
- package/src/superlocalmemory/learning/entity_compiler.py +377 -0
- package/src/superlocalmemory/mesh/__init__.py +12 -0
- package/src/superlocalmemory/mesh/broker.py +344 -0
- package/src/superlocalmemory/retrieval/entity_channel.py +12 -6
- package/src/superlocalmemory/server/api.py +6 -7
- package/src/superlocalmemory/server/routes/entity.py +95 -0
- package/src/superlocalmemory/server/routes/ingest.py +110 -0
- package/src/superlocalmemory/server/routes/mesh.py +186 -0
- package/src/superlocalmemory/server/unified_daemon.py +691 -0
- package/src/superlocalmemory/storage/schema_v343.py +229 -0
- package/src/superlocalmemory.egg-info/PKG-INFO +0 -597
- package/src/superlocalmemory.egg-info/SOURCES.txt +0 -287
- package/src/superlocalmemory.egg-info/dependency_links.txt +0 -1
- package/src/superlocalmemory.egg-info/entry_points.txt +0 -2
- package/src/superlocalmemory.egg-info/requires.txt +0 -47
- package/src/superlocalmemory.egg-info/top_level.txt +0 -1
|
@@ -0,0 +1,369 @@
|
|
|
1
|
+
# Copyright (c) 2026 Varun Pratap Bhardwaj / Qualixar
|
|
2
|
+
# Licensed under the Elastic License 2.0 - see LICENSE file
|
|
3
|
+
# Part of SuperLocalMemory V3 | https://qualixar.com | https://varunpratap.com
|
|
4
|
+
|
|
5
|
+
"""Gmail ingestion adapter — 3 tiers of Gmail access.
|
|
6
|
+
|
|
7
|
+
Tier 1: File import (.mbox from Google Takeout) — zero setup
|
|
8
|
+
Tier 1.5: IMAP polling — no GCP, just email/password
|
|
9
|
+
Tier 2: Gmail API with OAuth polling — needs GCP OAuth client, no Pub/Sub
|
|
10
|
+
Tier 3: Gmail API with Pub/Sub push — full GCP (future)
|
|
11
|
+
|
|
12
|
+
OPT-IN only. Enabled via: slm adapters enable gmail
|
|
13
|
+
|
|
14
|
+
Part of Qualixar | Author: Varun Pratap Bhardwaj
|
|
15
|
+
License: Elastic-2.0
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
from __future__ import annotations
|
|
19
|
+
|
|
20
|
+
import json
|
|
21
|
+
import logging
|
|
22
|
+
import sys
|
|
23
|
+
from pathlib import Path
|
|
24
|
+
|
|
25
|
+
from superlocalmemory.ingestion.base_adapter import BaseAdapter, AdapterConfig, IngestItem
|
|
26
|
+
|
|
27
|
+
logger = logging.getLogger("superlocalmemory.ingestion.gmail")
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class GmailAdapter(BaseAdapter):
    """Gmail ingestion with automatic tier detection.

    Tier selection (``tier="auto"``) is resolved by :meth:`_detect_tier`:

    * ``mbox``  — one-shot import of a Google Takeout ``.mbox`` file
    * ``imap``  — poll INBOX over IMAP using stored email/password
    * ``oauth`` — poll the Gmail REST API using stored OAuth credentials
    """

    source_type = "gmail"

    def __init__(self, config: AdapterConfig | None = None, tier: str = "auto"):
        super().__init__(config)
        self._tier = tier
        self._mbox_path: str | None = None
        self._mbox_processed = False  # mbox tier runs exactly once
        self._history_id: str | None = None  # Gmail API incremental-sync cursor
        self._poll_interval = 300  # 5 min for API polling

    def run(self) -> None:
        """Detect tier and run."""
        self._detect_tier()
        logger.info("Gmail adapter starting (tier=%s)", self._tier)
        super().run()

    def fetch_items(self) -> list[IngestItem]:
        """Fetch items based on active tier."""
        if self._tier == "mbox":
            return self._fetch_mbox()
        elif self._tier == "imap":
            return self._fetch_imap()
        elif self._tier == "oauth":
            return self._fetch_oauth()
        return []

    def wait_for_next_cycle(self) -> None:
        """Tier 1 (mbox): run once then stop. Others: poll interval."""
        if self._tier == "mbox" and self._mbox_processed:
            logger.info("MBOX import complete, adapter stopping")
            self.stop()
            return
        self._stop_event.wait(self._poll_interval)

    # -- Tier detection --

    def _detect_tier(self) -> None:
        """Auto-detect the best available tier.

        Precedence: explicit tier in ~/.superlocalmemory/adapters.json,
        then stored OAuth credentials, then any .mbox file dropped into
        ~/.superlocalmemory/import/.  Falls back to "mbox" (which yields
        nothing when no file exists) so the adapter stays inert rather
        than crashing.
        """
        if self._tier != "auto":
            return

        adapters_path = Path.home() / ".superlocalmemory" / "adapters.json"
        cfg = {}
        if adapters_path.exists():
            cfg = json.loads(adapters_path.read_text()).get("gmail", {})

        # Check for explicit tier
        if cfg.get("tier") == "mbox" or cfg.get("mbox_path"):
            self._tier = "mbox"
            self._mbox_path = cfg.get("mbox_path", "")
            return

        if cfg.get("tier") == "imap":
            self._tier = "imap"
            return

        # Check for OAuth credentials
        from superlocalmemory.ingestion.credentials import has_credential
        if has_credential("gmail", "refresh_token"):
            self._tier = "oauth"
            return

        # Default: look for mbox file
        mbox_dir = Path.home() / ".superlocalmemory" / "import"
        mbox_files = list(mbox_dir.glob("*.mbox")) if mbox_dir.exists() else []
        if mbox_files:
            self._tier = "mbox"
            self._mbox_path = str(mbox_files[0])
            return

        logger.warning("No Gmail credentials or MBOX file found. "
                       "Place .mbox in ~/.superlocalmemory/import/ or run setup.")
        self._tier = "mbox"  # Will return empty if no file

    # -- Tier 1: MBOX file import --

    def _fetch_mbox(self) -> list[IngestItem]:
        """Parse .mbox file from Google Takeout.  One-shot: sets
        ``_mbox_processed`` so subsequent cycles return nothing."""
        if self._mbox_processed or not self._mbox_path:
            return []

        path = Path(self._mbox_path)
        if not path.exists():
            logger.warning("MBOX file not found: %s", path)
            self._mbox_processed = True
            return []

        import mailbox
        items = []
        mbox = mailbox.mbox(str(path))
        total = len(mbox)
        logger.info("Parsing MBOX: %d messages", total)

        for i, message in enumerate(mbox):
            if self._stop_event.is_set():
                break

            try:
                msg_id = message.get("Message-ID", f"mbox-{i}")
                subject = message.get("Subject", "(no subject)")
                from_addr = message.get("From", "unknown")
                date = message.get("Date", "")

                # Extract plain text body (truncated to 3000 chars)
                body = self._extract_email_body(message)[:3000]

                content = f"Email: {subject}\nFrom: {from_addr}\nDate: {date}\n\n{body}"

                items.append(IngestItem(
                    content=content,
                    dedup_key=str(msg_id).strip("<>"),
                    metadata={
                        "subject": subject,
                        "from": from_addr,
                        "date": date,
                        "source": "mbox_import",
                    },
                ))

                # Progress logging
                if (i + 1) % 100 == 0:
                    logger.info("MBOX progress: %d/%d messages", i + 1, total)

            except Exception as exc:
                logger.debug("Failed to parse message %d: %s", i, exc)

        self._mbox_processed = True
        logger.info("MBOX import: %d messages extracted", len(items))
        return items

    # -- Tier 1.5: IMAP polling --

    def _fetch_imap(self) -> list[IngestItem]:
        """Poll via IMAP. Requires email + password credentials."""
        try:
            import imaplib
            from superlocalmemory.ingestion.credentials import load_credential

            host = load_credential("gmail", "imap_host") or "imap.gmail.com"
            email = load_credential("gmail", "email")
            password = load_credential("gmail", "password")

            if not email or not password:
                logger.warning("IMAP credentials not found. Run: slm adapters enable gmail --setup")
                return []

            conn = imaplib.IMAP4_SSL(host)
            try:
                conn.login(email, password)
                conn.select("INBOX")

                # Fetch last 20 unseen messages
                _, msg_nums = conn.search(None, "UNSEEN")
                items = []

                for num in msg_nums[0].split()[-20:]:
                    if self._stop_event.is_set():
                        break
                    try:
                        _, data = conn.fetch(num, "(RFC822)")
                        import email as email_lib
                        msg = email_lib.message_from_bytes(data[0][1])
                        msg_id = msg.get("Message-ID", f"imap-{num.decode()}")
                        subject = msg.get("Subject", "(no subject)")
                        from_addr = msg.get("From", "unknown")

                        body = self._extract_email_body(msg)[:3000]
                        content = f"Email: {subject}\nFrom: {from_addr}\n\n{body}"

                        items.append(IngestItem(
                            content=content,
                            dedup_key=str(msg_id).strip("<>"),
                            metadata={"subject": subject, "from": from_addr, "source": "imap"},
                        ))
                    except Exception as exc:
                        logger.debug("IMAP fetch error: %s", exc)

                return items
            finally:
                # BUGFIX: always release the IMAP connection.  Previously
                # logout() was only reached on full success, so any failure
                # in login/select/search leaked the SSL connection on every
                # poll cycle.
                try:
                    conn.logout()
                except Exception:
                    pass

        except Exception as exc:
            logger.warning("IMAP polling failed: %s", exc)
            return []

    # -- Tier 2: OAuth API polling --

    def _fetch_oauth(self) -> list[IngestItem]:
        """Poll Gmail API with OAuth. Requires google-api-python-client."""
        try:
            from superlocalmemory.ingestion.credentials import load_credential

            refresh_token = load_credential("gmail", "refresh_token")
            client_id = load_credential("gmail", "client_id")
            client_secret = load_credential("gmail", "client_secret")

            if not all([refresh_token, client_id, client_secret]):
                logger.warning("Gmail OAuth credentials incomplete. Run setup.")
                return []

            # Build credentials
            from google.oauth2.credentials import Credentials
            from googleapiclient.discovery import build

            creds = Credentials(
                token=None,
                refresh_token=refresh_token,
                client_id=client_id,
                client_secret=client_secret,
                token_uri="https://oauth2.googleapis.com/token",
            )

            service = build("gmail", "v1", credentials=creds)

            # Get history since last sync (incremental), else bootstrap
            # with the most recent 20 messages.
            if self._history_id:
                results = service.users().history().list(
                    userId="me",
                    startHistoryId=self._history_id,
                    historyTypes=["messageAdded"],
                ).execute()
                history = results.get("history", [])
                msg_ids = []
                for h in history:
                    for added in h.get("messagesAdded", []):
                        msg_ids.append(added["message"]["id"])
            else:
                # Initial: get last 20 messages
                results = service.users().messages().list(
                    userId="me", maxResults=20,
                ).execute()
                msg_ids = [m["id"] for m in results.get("messages", [])]

            # Update history ID for next cycle
            profile = service.users().getProfile(userId="me").execute()
            self._history_id = profile.get("historyId")

            items = []
            for msg_id in msg_ids:
                if self._stop_event.is_set():
                    break
                try:
                    msg = service.users().messages().get(
                        userId="me", id=msg_id, format="full",
                    ).execute()
                    headers = {h["name"]: h["value"] for h in msg.get("payload", {}).get("headers", [])}
                    subject = headers.get("Subject", "(no subject)")
                    from_addr = headers.get("From", "unknown")
                    date = headers.get("Date", "")

                    # Extract body from payload
                    body = self._extract_gmail_body(msg.get("payload", {}))
                    body = body[:3000] if body else ""
                    content = f"Email: {subject}\nFrom: {from_addr}\nDate: {date}\n\n{body}"

                    items.append(IngestItem(
                        content=content,
                        dedup_key=msg_id,
                        metadata={"subject": subject, "from": from_addr, "date": date, "source": "oauth"},
                    ))
                except Exception as exc:
                    logger.debug("Gmail API fetch error for %s: %s", msg_id, exc)

            return items

        except ImportError:
            logger.warning("Gmail OAuth requires: pip install 'superlocalmemory[ingestion]'")
            return []
        except Exception as exc:
            logger.warning("Gmail OAuth polling failed: %s", exc)
            return []

    @staticmethod
    def _extract_email_body(message) -> str:
        """Extract the first non-empty text/plain body from an
        ``email.message.Message`` (shared by the mbox and IMAP tiers)."""
        if message.is_multipart():
            for part in message.walk():
                if part.get_content_type() == "text/plain":
                    payload = part.get_payload(decode=True)
                    if payload:
                        return payload.decode("utf-8", errors="replace")
            return ""
        payload = message.get_payload(decode=True)
        if payload:
            return payload.decode("utf-8", errors="replace")
        return ""

    @staticmethod
    def _extract_gmail_body(payload: dict) -> str:
        """Extract plain text body from a Gmail API message payload,
        recursing into nested multipart parts."""
        import base64

        if payload.get("mimeType") == "text/plain":
            data = payload.get("body", {}).get("data", "")
            if data:
                return base64.urlsafe_b64decode(data).decode("utf-8", errors="replace")

        for part in payload.get("parts", []):
            if part.get("mimeType") == "text/plain":
                data = part.get("body", {}).get("data", "")
                if data:
                    return base64.urlsafe_b64decode(data).decode("utf-8", errors="replace")
            # Recurse into nested parts
            if "parts" in part:
                result = GmailAdapter._extract_gmail_body(part)
                if result:
                    return result

        return ""
|
|
352
|
+
|
|
353
|
+
|
|
354
|
+
# ---------------------------------------------------------------------------
|
|
355
|
+
# CLI entry point
|
|
356
|
+
# ---------------------------------------------------------------------------
|
|
357
|
+
|
|
358
|
+
if __name__ == "__main__":
    import logging as _logging
    _logging.basicConfig(level=_logging.INFO, format="%(asctime)s %(message)s")

    # Resolve the configured tier from adapters.json, defaulting to
    # auto-detection when no config exists.
    config_file = Path.home() / ".superlocalmemory" / "adapters.json"
    selected_tier = "auto"
    if config_file.exists():
        gmail_cfg = json.loads(config_file.read_text()).get("gmail", {})
        selected_tier = gmail_cfg.get("tier", "auto")

    GmailAdapter(tier=selected_tier).run()
|
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
# Copyright (c) 2026 Varun Pratap Bhardwaj / Qualixar
|
|
2
|
+
# Licensed under the Elastic License 2.0 - see LICENSE file
|
|
3
|
+
# Part of SuperLocalMemory V3 | https://qualixar.com | https://varunpratap.com
|
|
4
|
+
|
|
5
|
+
"""Parsers for ingestion file formats: SRT, VTT, MBOX, ICS."""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import hashlib
|
|
10
|
+
import re
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
from typing import NamedTuple
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class Utterance(NamedTuple):
    """A single transcript utterance: who said what, and when."""

    speaker: str  # speaker name, or "unknown" when no "Name:" prefix was found
    text: str  # utterance text with any speaker prefix stripped
    timestamp: str  # raw timestamp line from the source file ("" if absent)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def parse_srt(filepath: str | Path) -> list[Utterance]:
    """Parse SubRip (.srt) file into utterances.

    Each SRT block is: sequence number, timestamp line, then one or more
    text lines.  Blocks with fewer than three lines are skipped.  A leading
    "Speaker: " prefix on the text is split into the speaker field.
    """
    content = Path(filepath).read_text(encoding="utf-8", errors="replace")
    blocks = re.split(r"\n\n+", content.strip())
    utterances = []
    for block in blocks:
        lines = block.strip().split("\n")
        if len(lines) < 3:
            continue
        # Line 1: sequence number, Line 2: timestamps, Line 3+: text.
        # (len(lines) >= 3 is guaranteed by the guard above, so the old
        # `if len(lines) > 1` conditional was dead code — removed.)
        timestamp = lines[1].strip()
        text = " ".join(lines[2:]).strip()
        if text:
            # Try to extract speaker from "Speaker: text" pattern
            speaker, content_text = _extract_speaker(text)
            utterances.append(Utterance(speaker=speaker, text=content_text, timestamp=timestamp))
    return utterances
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def parse_vtt(filepath: str | Path) -> list[Utterance]:
    """Parse WebVTT (.vtt) file into utterances."""
    raw = Path(filepath).read_text(encoding="utf-8", errors="replace")
    # Strip the leading WEBVTT header block before splitting into cues.
    raw = re.sub(r"^WEBVTT.*?\n\n", "", raw, flags=re.DOTALL)
    utterances = []
    for cue in re.split(r"\n\n+", raw.strip()):
        timestamp = ""
        pieces = []
        for line in cue.strip().split("\n"):
            stripped = line.strip()
            if "-->" in line:
                timestamp = stripped
            elif stripped and not stripped.isdigit():
                # Turn <v Speaker> voice tags into "Speaker: " prefixes,
                # then drop any remaining markup tags.
                cleaned = re.sub(r"<v\s+([^>]+)>", r"\1: ", line)
                cleaned = re.sub(r"<[^>]+>", "", cleaned).strip()
                if cleaned:
                    pieces.append(cleaned)
        text = " ".join(pieces)
        if text:
            speaker, body = _extract_speaker(text)
            utterances.append(Utterance(speaker=speaker, text=body, timestamp=timestamp))
    return utterances
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def parse_transcript_file(filepath: str | Path) -> tuple[str, list[str]]:
    """Parse any transcript file (.srt, .vtt, .txt).

    Returns (combined_text, list_of_speakers).  Combined text is capped at
    5000 characters; speakers are sorted so repeated runs on the same file
    produce identical output.
    """
    path = Path(filepath)
    suffix = path.suffix.lower()

    if suffix == ".srt":
        utterances = parse_srt(path)
    elif suffix == ".vtt":
        utterances = parse_vtt(path)
    else:
        # Plain text — treat entire file as one utterance
        text = path.read_text(encoding="utf-8", errors="replace")
        return text[:5000], []

    # BUGFIX: set iteration order is arbitrary, so `list({...})` returned
    # speakers in a nondeterministic order across runs.  Sort for stable
    # downstream metadata and dedup keys.
    speakers = sorted({u.speaker for u in utterances if u.speaker != "unknown"})
    combined = "\n".join(f"[{u.speaker}] {u.text}" for u in utterances)
    return combined[:5000], speakers
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def content_hash(filepath: str | Path) -> str:
    """Return the first 32 hex chars of the SHA-256 of *filepath*'s bytes.

    Hashes content rather than path, so the same file copied elsewhere
    dedups identically.
    """
    digest = hashlib.sha256(Path(filepath).read_bytes())
    return digest.hexdigest()[:32]
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def _extract_speaker(text: str) -> tuple[str, str]:
|
|
96
|
+
"""Extract speaker from 'Speaker: text' or 'Speaker Name: text' pattern."""
|
|
97
|
+
match = re.match(r"^([A-Z][a-zA-Z\s]{0,30}):\s*(.+)", text)
|
|
98
|
+
if match:
|
|
99
|
+
return match.group(1).strip(), match.group(2).strip()
|
|
100
|
+
return "unknown", text
|
|
@@ -0,0 +1,156 @@
|
|
|
1
|
+
# Copyright (c) 2026 Varun Pratap Bhardwaj / Qualixar
|
|
2
|
+
# Licensed under the Elastic License 2.0 - see LICENSE file
|
|
3
|
+
# Part of SuperLocalMemory V3 | https://qualixar.com | https://varunpratap.com
|
|
4
|
+
|
|
5
|
+
"""Transcript ingestion adapter — watches for .srt/.vtt/.txt files.
|
|
6
|
+
|
|
7
|
+
Uses watchdog (cross-platform file watcher) to detect new transcript files.
|
|
8
|
+
Parses them, extracts speaker diarization, propagates entities, and POSTs
|
|
9
|
+
to the daemon's /ingest endpoint.
|
|
10
|
+
|
|
11
|
+
OPT-IN only. Enabled via: slm adapters enable transcript
|
|
12
|
+
|
|
13
|
+
Part of Qualixar | Author: Varun Pratap Bhardwaj
|
|
14
|
+
License: Elastic-2.0
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from __future__ import annotations
|
|
18
|
+
|
|
19
|
+
import json
|
|
20
|
+
import logging
|
|
21
|
+
import sys
|
|
22
|
+
import time
|
|
23
|
+
from pathlib import Path
|
|
24
|
+
|
|
25
|
+
from superlocalmemory.ingestion.base_adapter import BaseAdapter, AdapterConfig, IngestItem
|
|
26
|
+
|
|
27
|
+
logger = logging.getLogger("superlocalmemory.ingestion.transcript")
|
|
28
|
+
|
|
29
|
+
_WATCH_EXTENSIONS = {".srt", ".vtt", ".txt"}
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class TranscriptAdapter(BaseAdapter):
    """Watches a directory for transcript files and ingests them.

    New ``.srt``/``.vtt``/``.txt`` files are detected by a watchdog
    observer (plus an initial directory scan), parsed for speaker
    diarization, and turned into IngestItems — one per transcript plus
    one timeline entry per detected speaker.
    """

    source_type = "transcript"

    def __init__(self, watch_dir: str | Path, config: AdapterConfig | None = None):
        super().__init__(config)
        self._watch_dir = Path(watch_dir)
        # Appended by the watchdog observer thread; drained by fetch_items().
        self._pending_files: list[Path] = []
        self._observer = None

    def run(self) -> None:
        """Start file watcher then enter the base adapter loop."""
        if not self._watch_dir.exists():
            logger.error("Watch directory does not exist: %s", self._watch_dir)
            return

        # Start watchdog observer
        try:
            from watchdog.observers import Observer
            from watchdog.events import FileSystemEventHandler

            class _Handler(FileSystemEventHandler):
                def __init__(self, adapter: TranscriptAdapter):
                    self._adapter = adapter

                def on_created(self, event):
                    if event.is_directory:
                        return
                    path = Path(event.src_path)
                    if path.suffix.lower() in _WATCH_EXTENSIONS:
                        self._adapter._pending_files.append(path)

            self._observer = Observer()
            self._observer.schedule(_Handler(self), str(self._watch_dir), recursive=False)
            self._observer.start()
            logger.info("Watching for transcripts in: %s", self._watch_dir)
        except ImportError:
            logger.warning("watchdog not installed — polling mode only")

        # Also scan for existing files on first run
        for path in self._watch_dir.iterdir():
            if path.suffix.lower() in _WATCH_EXTENSIONS and path.is_file():
                self._pending_files.append(path)

        super().run()

        # Cleanup
        if self._observer:
            self._observer.stop()
            self._observer.join()

    def fetch_items(self) -> list[IngestItem]:
        """Return pending transcript files as IngestItems."""
        if not self._pending_files:
            return []

        # Hoisted out of the per-file loop: import once per batch.
        from superlocalmemory.ingestion.parsers import (
            parse_transcript_file, content_hash,
        )

        # BUGFIX: drain one element at a time.  The previous
        # `list(...)` + `.clear()` pair could silently drop a path appended
        # by the watchdog observer thread between the two calls.
        batch: list[Path] = []
        while self._pending_files:
            batch.append(self._pending_files.pop(0))

        items = []
        for filepath in batch:
            try:
                combined_text, speakers = parse_transcript_file(filepath)
                dedup = content_hash(filepath)

                # Main transcript ingestion
                items.append(IngestItem(
                    content=f"Meeting transcript ({filepath.name}):\n{combined_text}",
                    dedup_key=dedup,
                    metadata={
                        "filename": filepath.name,
                        "speakers": speakers,
                        "source": "file_watcher",
                    },
                ))

                # Entity propagation: each speaker gets a timeline entry
                for speaker in speakers:
                    items.append(IngestItem(
                        content=f"{speaker} participated in meeting: {filepath.stem}. "
                                f"Transcript file: {filepath.name}",
                        dedup_key=f"speaker-{speaker}-{dedup}",
                        metadata={
                            "entity_name": speaker,
                            "meeting_file": filepath.name,
                            "source": "entity_propagation",
                        },
                    ))

            except Exception as exc:
                logger.warning("Failed to parse %s: %s", filepath, exc)

        return items

    def wait_for_next_cycle(self) -> None:
        """Wait 30s for new files (watchdog handles detection)."""
        self._stop_event.wait(30)
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
# ---------------------------------------------------------------------------
|
|
136
|
+
# CLI entry point: python -m superlocalmemory.ingestion.transcript_adapter
|
|
137
|
+
# ---------------------------------------------------------------------------
|
|
138
|
+
|
|
139
|
+
if __name__ == "__main__":
    import logging as _logging
    _logging.basicConfig(level=_logging.INFO, format="%(asctime)s %(message)s")

    # Resolve the watch directory from the adapters config file.
    config_file = Path.home() / ".superlocalmemory" / "adapters.json"
    configured_dir = ""
    if config_file.exists():
        adapters_cfg = json.loads(config_file.read_text())
        configured_dir = adapters_cfg.get("transcript", {}).get("watch_dir", "")

    if not configured_dir:
        print("No watch_dir configured. Set it in ~/.superlocalmemory/adapters.json")
        print(' {"transcript": {"enabled": true, "watch_dir": "/path/to/transcripts"}}')
        sys.exit(1)

    TranscriptAdapter(watch_dir=configured_dir).run()
|
|
@@ -73,7 +73,38 @@ class ConsolidationWorker:
|
|
|
73
73
|
except Exception as exc:
|
|
74
74
|
logger.debug("Pattern generation failed: %s", exc)
|
|
75
75
|
|
|
76
|
-
# 4.
|
|
76
|
+
# 4. Recompute graph intelligence (v3.4.2: wired into learning pipeline)
|
|
77
|
+
try:
|
|
78
|
+
from superlocalmemory.core.graph_analyzer import GraphAnalyzer
|
|
79
|
+
conn_ga = sqlite3.connect(self._memory_db, timeout=10)
|
|
80
|
+
conn_ga.execute("PRAGMA busy_timeout=5000")
|
|
81
|
+
conn_ga.row_factory = sqlite3.Row
|
|
82
|
+
|
|
83
|
+
class _DBProxy:
|
|
84
|
+
"""Minimal DB proxy for GraphAnalyzer compatibility."""
|
|
85
|
+
def __init__(self, connection: sqlite3.Connection) -> None:
|
|
86
|
+
self._conn = connection
|
|
87
|
+
def execute(self, sql: str, params: tuple = ()) -> list:
|
|
88
|
+
cursor = self._conn.execute(sql, params)
|
|
89
|
+
if sql.strip().upper().startswith(("INSERT", "UPDATE", "DELETE", "ALTER", "CREATE")):
|
|
90
|
+
self._conn.commit()
|
|
91
|
+
return []
|
|
92
|
+
return cursor.fetchall()
|
|
93
|
+
|
|
94
|
+
ga = GraphAnalyzer(_DBProxy(conn_ga))
|
|
95
|
+
if not dry_run:
|
|
96
|
+
ga_result = ga.compute_and_store(profile_id)
|
|
97
|
+
stats["graph_nodes"] = ga_result.get("node_count", 0)
|
|
98
|
+
stats["graph_communities"] = ga_result.get("community_count", 0)
|
|
99
|
+
logger.info(
|
|
100
|
+
"Graph analysis: %d nodes, %d communities",
|
|
101
|
+
stats["graph_nodes"], stats["graph_communities"],
|
|
102
|
+
)
|
|
103
|
+
conn_ga.close()
|
|
104
|
+
except Exception as exc:
|
|
105
|
+
logger.debug("Graph analysis failed: %s", exc)
|
|
106
|
+
|
|
107
|
+
# 5. Check if ranker should retrain
|
|
77
108
|
try:
|
|
78
109
|
from superlocalmemory.learning.feedback import FeedbackCollector
|
|
79
110
|
collector = FeedbackCollector(Path(self._learning_db))
|
|
@@ -88,6 +119,21 @@ class ConsolidationWorker:
|
|
|
88
119
|
except Exception as exc:
|
|
89
120
|
logger.debug("Retrain check failed: %s", exc)
|
|
90
121
|
|
|
122
|
+
# 6. Entity compilation (v3.4.3: compiled truth per entity)
|
|
123
|
+
if not dry_run:
|
|
124
|
+
try:
|
|
125
|
+
from superlocalmemory.learning.entity_compiler import EntityCompiler
|
|
126
|
+
from superlocalmemory.core.config import SLMConfig
|
|
127
|
+
config = SLMConfig.load()
|
|
128
|
+
compiler = EntityCompiler(self._memory_db, config)
|
|
129
|
+
ec_result = compiler.compile_all(profile_id)
|
|
130
|
+
stats["entities_compiled"] = ec_result.get("compiled", 0)
|
|
131
|
+
if ec_result["compiled"] > 0:
|
|
132
|
+
logger.info("Entity compilation: %d entities compiled",
|
|
133
|
+
ec_result["compiled"])
|
|
134
|
+
except Exception as exc:
|
|
135
|
+
logger.debug("Entity compilation failed: %s", exc)
|
|
136
|
+
|
|
91
137
|
return stats
|
|
92
138
|
|
|
93
139
|
def _deduplicate(self, profile_id: str, dry_run: bool) -> int:
|