nexo-brain 3.1.8 → 3.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,412 @@
1
"""Shared transcript helpers for Deep Sleep and public MCP fallback tools."""

from __future__ import annotations
3
+
4
+ import json
5
+ import os
6
+ import re
7
+ import unicodedata
8
+ from datetime import datetime, timedelta
9
+ from pathlib import Path
10
+
11
# Sessions with fewer user messages than this are rejected by the extractors.
MIN_USER_MESSAGES = 3

# Look-back window (hours) applied when a caller does not supply one.
DEFAULT_TRANSCRIPT_HOURS = 24

# Largest look-back window (hours) a caller may request: 30 days.
MAX_TRANSCRIPT_HOURS = 24 * 30
14
+
15
+ _SENSITIVE_PATTERNS = re.compile(
16
+ r'(?:'
17
+ r'sk-ant-[A-Za-z0-9_-]+'
18
+ r'|shpat_[A-Fa-f0-9]+'
19
+ r'|shpss_[A-Fa-f0-9]+'
20
+ r'|sk-[A-Za-z0-9]{20,}'
21
+ r'|ghp_[A-Za-z0-9]{36,}'
22
+ r'|gho_[A-Za-z0-9]{36,}'
23
+ r'|AIza[A-Za-z0-9_-]{35}'
24
+ r'|ya29\.[A-Za-z0-9_-]+'
25
+ r'|xox[bpsa]-[A-Za-z0-9-]+'
26
+ r'|EAAG[A-Za-z0-9]+'
27
+ r'|[Pp]assword\s*[:=]\s*\S+'
28
+ r'|[Ss]ecret\s*[:=]\s*\S+'
29
+ r'|[Tt]oken\s*[:=]\s*\S+'
30
+ r'|[Aa]pi[_-]?[Kk]ey\s*[:=]\s*\S+'
31
+ r')'
32
+ )
33
+
34
+
35
def _redact_sensitive(text: str) -> str:
    """Return *text* with every ``_SENSITIVE_PATTERNS`` match replaced by ``[REDACTED]``."""
    placeholder = "[REDACTED]"
    return re.sub(_SENSITIVE_PATTERNS, placeholder, text)
37
+
38
+
39
+ def _normalize_text(text: str | None) -> str:
40
+ if not text:
41
+ return ""
42
+ normalized = unicodedata.normalize("NFKD", str(text))
43
+ ascii_text = normalized.encode("ascii", "ignore").decode("ascii")
44
+ return ascii_text.lower()
45
+
46
+
47
def _tokenize(text: str | None) -> set[str]:
    """Split *text* into the set of distinct search tokens.

    The text is first passed through ``_normalize_text``. A token starts
    with a letter or digit and may continue with letters, digits, ``.``,
    ``_``, ``:`` or ``-``; tokens shorter than three characters are dropped.
    """
    tokens: set[str] = set()
    for found in re.finditer(r"[a-z0-9][a-z0-9._:-]{1,}", _normalize_text(text)):
        word = found.group(0)
        if len(word) >= 3:
            tokens.add(word)
    return tokens
54
+
55
+
56
def _score_text_match(query_tokens: set[str], haystack: str) -> float:
    """Score how well *haystack* covers *query_tokens*.

    The score is the number of shared tokens divided by the size of the
    smaller of the two token sets. It is ``0.0`` when either side has no
    tokens or when nothing overlaps.
    """
    if query_tokens:
        candidate_tokens = _tokenize(haystack)
        shared = query_tokens & candidate_tokens
        if shared:
            denominator = max(1, min(len(query_tokens), len(candidate_tokens)))
            return len(shared) / denominator
    return 0.0
67
+
68
+
69
+ def _truncate(text: str | None, limit: int = 240) -> str:
70
+ if not text:
71
+ return ""
72
+ clean = str(text).strip()
73
+ return clean if len(clean) <= limit else clean[: limit - 3] + "..."
74
+
75
+
76
+ def _session_identifier(client: str, session_file: str) -> str:
77
+ return f"{client}:{session_file}"
78
+
79
+
80
+ def _claude_root() -> Path:
81
+ return Path.home() / ".claude" / "projects"
82
+
83
+
84
+ def _codex_roots() -> list[Path]:
85
+ return [
86
+ Path.home() / ".codex" / "sessions",
87
+ Path.home() / ".codex" / "archived_sessions",
88
+ ]
89
+
90
+
91
def clamp_transcript_hours(hours: int | float | str | None) -> int:
    """Coerce *hours* to a whole number within ``1..MAX_TRANSCRIPT_HOURS``.

    A falsy value, or one that cannot be converted through ``float`` and
    then ``int``, falls back to ``DEFAULT_TRANSCRIPT_HOURS`` before the
    bounds are applied.
    """
    try:
        requested = int(float(hours or DEFAULT_TRANSCRIPT_HOURS))
    except Exception:
        # Best effort: any unusable input means "use the default window".
        requested = DEFAULT_TRANSCRIPT_HOURS
    capped = requested if requested <= MAX_TRANSCRIPT_HOURS else MAX_TRANSCRIPT_HOURS
    return capped if capped >= 1 else 1
97
+
98
+
99
def find_claude_session_files() -> list[Path]:
    """Return every ``*.jsonl`` file under the Claude projects root, sorted by path.

    The list is empty when the root directory does not exist.
    """
    root = _claude_root()
    return sorted(root.rglob("*.jsonl")) if root.exists() else []
104
+
105
+
106
def find_codex_session_files() -> list[Path]:
    """Return Codex ``*.jsonl`` transcripts, de-duplicated by file name.

    Roots are visited in the order given by ``_codex_roots`` and each root
    is walked in sorted path order; when the same file name turns up more
    than once, only the first occurrence is kept.
    """
    first_by_name: dict[str, Path] = {}
    for root in _codex_roots():
        if root.exists():
            for candidate in sorted(root.rglob("*.jsonl")):
                # setdefault keeps the earliest path seen for a given name.
                first_by_name.setdefault(candidate.name, candidate)
    return list(first_by_name.values())
119
+
120
+
121
def extract_claude_session(jsonl_path: Path) -> dict | None:
    """Parse one Claude Code JSONL transcript into a redacted session summary.

    Each non-empty line is decoded as JSON; lines that are not valid JSON
    or are not JSON objects are skipped. From ``type == "user"`` records
    the string content is kept unless it starts with ``<system-reminder>``.
    From ``type in ("message", "assistant")`` records the ``text`` blocks
    are joined into one assistant message and every ``tool_use`` block is
    recorded with its tool name, input keys and ``file_path`` (or the
    first 100 characters of ``command``).

    All stored text is passed through ``_redact_sensitive`` *before* it is
    truncated, so a secret that straddles the length cut-off cannot slip
    past the length-bound patterns.

    Args:
        jsonl_path: Path of the transcript file.

    Returns:
        A summary dict, or ``None`` when the file cannot be processed or
        holds fewer than ``MIN_USER_MESSAGES`` user messages. The
        ``index`` of each message is its 1-based line number in the file.
    """
    messages: list[dict] = []
    tool_uses: list[dict] = []
    user_msg_count = 0

    try:
        # JSON text is UTF-8 (RFC 8259); relying on the platform default
        # encoding breaks on non-UTF-8 locales. Undecodable bytes are
        # replaced so that one bad byte does not discard the whole session.
        with open(jsonl_path, "r", encoding="utf-8", errors="replace") as f:
            for line_no, line in enumerate(f, 1):
                line = line.strip()
                if not line:
                    continue
                try:
                    payload = json.loads(line)
                except json.JSONDecodeError:
                    continue
                if not isinstance(payload, dict):
                    # A bare list/number/string line carries no record.
                    continue

                msg_type = payload.get("type")
                message = payload.get("message")
                if not isinstance(message, dict):
                    message = {}

                if msg_type == "user":
                    content = message.get("content", "")
                    if not isinstance(content, str) or not content.strip():
                        continue
                    if content.startswith("<system-reminder>"):
                        continue
                    messages.append(
                        {
                            "role": "user",
                            "index": line_no,
                            "text": _redact_sensitive(content)[:5000],
                            "uuid": payload.get("uuid", ""),
                        }
                    )
                    user_msg_count += 1
                elif msg_type in ("message", "assistant"):
                    content_blocks = message.get("content", [])
                    if not isinstance(content_blocks, list):
                        content_blocks = []
                    text_parts = []
                    for block in content_blocks:
                        if not isinstance(block, dict):
                            continue
                        block_type = block.get("type")
                        if block_type == "text":
                            text_parts.append(str(block.get("text") or ""))
                        elif block_type == "tool_use":
                            tool_input = block.get("input", {})
                            if isinstance(tool_input, dict):
                                input_keys = list(tool_input.keys())
                                file_path = tool_input.get("file_path", "")
                                if file_path:
                                    # str() guards against a non-string value.
                                    file_ref = _redact_sensitive(str(file_path))
                                else:
                                    command = str(tool_input.get("command", ""))
                                    file_ref = _redact_sensitive(command)[:100]
                            else:
                                input_keys = []
                                file_ref = _redact_sensitive("")
                            tool_uses.append(
                                {
                                    "tool": block.get("name", ""),
                                    "input_keys": input_keys,
                                    "file": file_ref,
                                }
                            )
                    combined = "\n".join(part for part in text_parts if part).strip()
                    if combined:
                        messages.append(
                            {
                                "role": "assistant",
                                "index": line_no,
                                "text": _redact_sensitive(combined)[:5000],
                            }
                        )
    except Exception:
        # Deliberate best effort: an unreadable or unexpected transcript is
        # reported as "no session" rather than aborting the caller's scan.
        return None

    if user_msg_count < MIN_USER_MESSAGES:
        return None

    return {
        "client": "claude_code",
        "session_file": _session_identifier("claude_code", jsonl_path.name),
        "display_name": jsonl_path.name,
        "session_path": str(jsonl_path),
        "message_count": len(messages),
        "user_message_count": user_msg_count,
        "tool_use_count": len(tool_uses),
        "messages": messages,
        "tool_uses": tool_uses,
        "source": "claude_projects",
    }
201
+
202
+
203
def extract_codex_session(jsonl_path: Path) -> dict | None:
    """Parse one Codex JSONL transcript into a redacted session summary.

    Each non-empty line is decoded as JSON; lines that are not valid JSON,
    are not JSON objects, or whose ``payload`` is not an object are
    skipped. Three record kinds are used:

    * ``session_meta`` -- its payload supplies ``source``, ``cwd``,
      ``originator`` and ``id`` for the result.
    * ``event_msg`` with payload type ``user_message`` -- a user message,
      unless empty or starting with ``<environment_context>``.
    * ``response_item`` -- assistant ``message`` payloads contribute their
      ``output_text`` blocks; ``function_call`` payloads are recorded as
      tool uses with the first 100 characters of their arguments.

    All stored text is passed through ``_redact_sensitive`` *before* it is
    truncated, so a secret that straddles the length cut-off cannot slip
    past the length-bound patterns.

    Args:
        jsonl_path: Path of the transcript file.

    Returns:
        A summary dict, or ``None`` when the file cannot be processed or
        holds fewer than ``MIN_USER_MESSAGES`` user messages. The
        ``index`` of each message is its 1-based line number in the file.
    """
    messages: list[dict] = []
    tool_uses: list[dict] = []
    user_msg_count = 0
    session_meta: dict = {}

    try:
        # JSON text is UTF-8 (RFC 8259); relying on the platform default
        # encoding breaks on non-UTF-8 locales. Undecodable bytes are
        # replaced so that one bad byte does not discard the whole session.
        with open(jsonl_path, "r", encoding="utf-8", errors="replace") as f:
            for line_no, line in enumerate(f, 1):
                line = line.strip()
                if not line:
                    continue
                try:
                    payload = json.loads(line)
                except json.JSONDecodeError:
                    continue
                if not isinstance(payload, dict):
                    # A bare list/number/string line carries no record.
                    continue

                item_type = payload.get("type")
                data = payload.get("payload", {})
                if not isinstance(data, dict):
                    # Every record kind handled below needs an object payload.
                    continue

                if item_type == "session_meta":
                    session_meta = data
                    continue

                if item_type == "event_msg" and data.get("type") == "user_message":
                    content = str(data.get("message", "") or "").strip()
                    if not content or content.startswith("<environment_context>"):
                        continue
                    messages.append(
                        {
                            "role": "user",
                            "index": line_no,
                            "text": _redact_sensitive(content)[:5000],
                        }
                    )
                    user_msg_count += 1
                    continue

                if item_type == "response_item":
                    response_type = data.get("type")
                    if response_type == "message" and data.get("role") == "assistant":
                        blocks = data.get("content", []) or []
                        if not isinstance(blocks, list):
                            blocks = []
                        text_parts = [
                            # ``or ""`` keeps a null text from becoming "None".
                            str(block.get("text") or "")
                            for block in blocks
                            if isinstance(block, dict) and block.get("type") == "output_text"
                        ]
                        combined = "\n".join(part for part in text_parts if part).strip()
                        if combined:
                            messages.append(
                                {
                                    "role": "assistant",
                                    "index": line_no,
                                    "text": _redact_sensitive(combined)[:5000],
                                }
                            )
                    elif response_type == "function_call":
                        arguments = str(data.get("arguments", ""))
                        tool_uses.append(
                            {
                                "tool": data.get("name", ""),
                                "input_keys": [],
                                "file": _redact_sensitive(arguments)[:100],
                            }
                        )
    except Exception:
        # Deliberate best effort: an unreadable or unexpected transcript is
        # reported as "no session" rather than aborting the caller's scan.
        return None

    if user_msg_count < MIN_USER_MESSAGES:
        return None

    return {
        "client": "codex",
        "session_file": _session_identifier("codex", jsonl_path.name),
        "display_name": jsonl_path.name,
        "session_path": str(jsonl_path),
        "message_count": len(messages),
        "user_message_count": user_msg_count,
        "tool_use_count": len(tool_uses),
        "messages": messages,
        "tool_uses": tool_uses,
        "source": session_meta.get("source", "codex"),
        "cwd": session_meta.get("cwd", ""),
        "originator": session_meta.get("originator", ""),
        "session_uid": session_meta.get("id", ""),
    }
287
+
288
+
289
def _to_local_naive(moment: datetime) -> datetime:
    """Return *moment* as a naive local-time datetime; naive input is returned as is."""
    if moment.utcoffset() is None:
        return moment
    return moment.astimezone().replace(tzinfo=None)


def collect_transcripts_since(since_iso: str, until_iso: str = "") -> list[dict]:
    """Extract every transcript whose mtime falls in ``(since, until]``.

    Args:
        since_iso: ISO-8601 lower bound (exclusive).
        until_iso: ISO-8601 upper bound (inclusive); the current time when
            empty.

    Returns:
        Session dicts from the Claude and Codex extractors, each with a
        ``modified`` ISO timestamp added, sorted oldest first. Files that
        cannot be stat'ed, or that the extractors reject, are left out.

    Raises:
        ValueError: If either bound is not a valid ISO-8601 string.
    """
    # File mtimes are compared as naive local datetimes, so a bound that
    # carries a UTC offset is converted first; comparing an aware value
    # with a naive one would raise TypeError.
    since_dt = _to_local_naive(datetime.fromisoformat(since_iso))
    until_dt = _to_local_naive(datetime.fromisoformat(until_iso)) if until_iso else datetime.now()

    transcript_files: list[tuple[str, Path]] = [
        ("claude_code", path) for path in find_claude_session_files()
    ] + [
        ("codex", path) for path in find_codex_session_files()
    ]

    sessions = []
    for client, session_file in transcript_files:
        try:
            mtime = datetime.fromtimestamp(session_file.stat().st_mtime)
        except (OSError, OverflowError, ValueError):
            # Unreadable file or an mtime the platform cannot represent.
            continue
        if not (since_dt < mtime <= until_dt):
            continue
        if client == "codex":
            session = extract_codex_session(session_file)
        else:
            session = extract_claude_session(session_file)
        if session:
            session["modified"] = mtime.isoformat()
            sessions.append(session)
    sessions.sort(key=lambda row: row["modified"])
    return sessions
311
+
312
+
313
def list_recent_transcripts(hours: int = DEFAULT_TRANSCRIPT_HOURS, client: str = "", limit: int = 10) -> list[dict]:
    """Return the most recently modified transcripts, newest first.

    Args:
        hours: Look-back window; clamped by ``clamp_transcript_hours``.
        client: When non-empty, keep only sessions whose ``client`` equals it.
        limit: Maximum number of sessions; a falsy value means 10 and at
            least one slot is always allowed.
    """
    window_hours = clamp_transcript_hours(hours)
    window_start = datetime.now() - timedelta(hours=window_hours)
    recent = collect_transcripts_since(window_start.isoformat())
    if client:
        recent = [session for session in recent if session.get("client") == client]
    newest_first = sorted(recent, key=lambda row: row.get("modified", ""), reverse=True)
    return newest_first[: max(1, int(limit or 10))]
324
+
325
+
326
def search_transcripts(query: str, *, hours: int = DEFAULT_TRANSCRIPT_HOURS, client: str = "", limit: int = 10) -> list[dict]:
    """Rank recent transcripts by token overlap with *query*.

    Up to 200 recent sessions are considered. Each message and the
    session's metadata (display name, session file, source, cwd) are
    scored with ``_score_text_match``; a session's relevance is its best
    score. Sessions with no positive score are dropped. The final
    ``_score`` adds up to 0.35 for recency within the look-back window,
    and ``matched_messages`` holds the three best-scoring message snippets.

    When *query* yields no tokens, the recent sessions are returned
    unranked, limited to *limit*.
    """
    candidates = list_recent_transcripts(hours=hours, client=client, limit=200)
    wanted = _tokenize(query)
    if not wanted:
        return candidates[: max(1, int(limit or 10))]

    window_seconds = clamp_transcript_hours(hours) * 3600
    now_ts = datetime.now().timestamp()
    ranked: list[dict] = []

    for session in candidates:
        hits = []
        relevance = 0.0
        for message in session.get("messages") or []:
            body = str(message.get("text", "") or "")
            message_score = _score_text_match(wanted, body)
            if message_score > 0:
                relevance = max(relevance, message_score)
                hits.append(
                    {
                        "role": message.get("role", ""),
                        "index": message.get("index", 0),
                        "snippet": _truncate(body, 220),
                        "score": round(message_score, 4),
                    }
                )

        # Metadata can match even when no individual message does.
        meta_text = " ".join(
            str(session.get(field, "") or "")
            for field in ("display_name", "session_file", "source", "cwd")
        )
        relevance = max(relevance, _score_text_match(wanted, meta_text))
        if relevance <= 0:
            continue

        try:
            modified_ts = datetime.fromisoformat(session.get("modified", "")).timestamp()
        except Exception:
            # A missing or unparsable timestamp is treated as "just now".
            modified_ts = now_ts
        age_fraction = (now_ts - modified_ts) / max(1, window_seconds)
        recency = max(0.0, 1.0 - age_fraction)

        session["_score"] = round(relevance + recency * 0.35, 4)
        hits.sort(key=lambda row: row["score"], reverse=True)
        session["matched_messages"] = hits[:3]
        ranked.append(session)

    ranked.sort(key=lambda row: (row.get("_score", 0), row.get("modified", "")), reverse=True)
    return ranked[: max(1, int(limit or 10))]
376
+
377
+
378
def load_transcript(session_ref: str = "", transcript_path: str = "", client: str = "") -> dict | None:
    """Load a single transcript selected by reference and/or file path.

    Args:
        session_ref: Matched exactly against the session's ``session_file``
            identifier, its ``display_name``, its ``session_uid`` or the
            file's path string.
        transcript_path: When given, only the discovered file that resolves
            to the same location is considered.
        client: When non-empty, only files discovered for that client are
            considered.

    Returns:
        The extracted session with a ``modified`` ISO timestamp (``""``
        when the file cannot be stat'ed), or ``None`` when nothing
        matches. With neither *session_ref* nor *transcript_path* set,
        the first discovered file that the extractor accepts is returned.
    """
    ref = str(session_ref or "").strip()
    path_ref = str(transcript_path or "").strip()

    # Resolve the requested path once instead of once per candidate file.
    wanted_path = None
    if path_ref:
        try:
            wanted_path = Path(path_ref).expanduser().resolve()
        except (OSError, RuntimeError, ValueError):
            # An unresolvable path cannot equal any discovered file.
            return None

    transcript_files: list[tuple[str, Path]] = [
        ("claude_code", path) for path in find_claude_session_files()
    ] + [
        ("codex", path) for path in find_codex_session_files()
    ]
    for detected_client, path in transcript_files:
        if client and detected_client != client:
            continue
        if wanted_path is not None:
            try:
                if wanted_path != path.resolve():
                    continue
            except (OSError, RuntimeError, ValueError):
                continue
        if detected_client == "codex":
            session = extract_codex_session(path)
        else:
            session = extract_claude_session(path)
        if not session:
            continue
        if ref and ref not in {
            str(session.get("session_file", "")),
            str(session.get("display_name", "")),
            str(session.get("session_uid", "")),
            str(path),
        }:
            continue
        try:
            session["modified"] = datetime.fromtimestamp(path.stat().st_mtime).isoformat()
        except OSError:
            session["modified"] = ""
        return session
    return None