parallelclaw 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62) hide show
  1. package/CHANGELOG.md +204 -0
  2. package/HELP.md +600 -0
  3. package/LICENSE +21 -0
  4. package/MULTI_MACHINE.md +152 -0
  5. package/README.md +417 -0
  6. package/README.ru.md +740 -0
  7. package/SYNC.md +844 -0
  8. package/bot/README.md +173 -0
  9. package/bot/config.js +66 -0
  10. package/bot/inbox.js +153 -0
  11. package/bot/index.js +294 -0
  12. package/bot/nexara.js +61 -0
  13. package/bot/poll.js +304 -0
  14. package/bot/search.js +155 -0
  15. package/bot/telegram.js +96 -0
  16. package/ingest.js +2712 -0
  17. package/lib/cli/index.js +1987 -0
  18. package/lib/config.js +220 -0
  19. package/lib/db-init.js +158 -0
  20. package/lib/hook/install.js +268 -0
  21. package/lib/import-telegram.js +158 -0
  22. package/lib/ingest-file.js +779 -0
  23. package/lib/notify-click-action.js +281 -0
  24. package/lib/openclaw-channel.js +643 -0
  25. package/lib/parse-cursor.js +172 -0
  26. package/lib/parse-obsidian.js +256 -0
  27. package/lib/parse-telegram-html.js +384 -0
  28. package/lib/parse.js +175 -0
  29. package/lib/render-markdown.js +0 -0
  30. package/lib/store-doc/canonicalize.js +116 -0
  31. package/lib/store-doc/detect.js +209 -0
  32. package/lib/store-doc/extract-title.js +162 -0
  33. package/lib/sync/auth.js +80 -0
  34. package/lib/sync/cert.js +144 -0
  35. package/lib/sync/cli.js +906 -0
  36. package/lib/sync/client.js +138 -0
  37. package/lib/sync/config.js +130 -0
  38. package/lib/sync/pair.js +145 -0
  39. package/lib/sync/pull.js +158 -0
  40. package/lib/sync/push.js +305 -0
  41. package/lib/sync/replicate.js +335 -0
  42. package/lib/sync/server.js +224 -0
  43. package/lib/sync/service.js +726 -0
  44. package/lib/tasks.js +215 -0
  45. package/lib/telegram-decisions.js +165 -0
  46. package/lib/telegram-discovery.js +373 -0
  47. package/lib/telegram-notify.js +272 -0
  48. package/lib/telegram-pending.js +200 -0
  49. package/lib/web/index.js +265 -0
  50. package/lib/web/routes/conversation.js +193 -0
  51. package/lib/web/routes/conversations.js +180 -0
  52. package/lib/web/routes/dashboard.js +175 -0
  53. package/lib/web/routes/pending.js +277 -0
  54. package/lib/web/routes/settings.js +226 -0
  55. package/lib/web/static/style.css +393 -0
  56. package/lib/web/templates.js +234 -0
  57. package/package.json +84 -0
  58. package/server.js +3816 -0
  59. package/skills/install-memex/README.md +109 -0
  60. package/skills/install-memex/SKILL.md +342 -0
  61. package/skills/install-memex/examples.md +294 -0
  62. package/skills/install-memex-claw/SKILL.md +423 -0
@@ -0,0 +1,384 @@
1
+ /**
2
+ * Telegram Desktop HTML export → Telegram-JSON-shape converter.
3
+ *
4
+ * Telegram Desktop offers two export formats:
5
+ * - "Machine-readable JSON" — what memex's importTelegram expects
6
+ * - "Human-readable HTML" — what many users pick by default
7
+ *
8
+ * Users frequently export as HTML by accident (often the default in the
9
+ * Telegram UI), then memex's inbox watcher silently ignores the dropped
10
+ * directory. This module makes HTML work: parse → emit the same shape
11
+ * importTelegram already understands.
12
+ *
13
+ * Telegram's HTML export is reasonably stable:
14
+ *
15
+ * ChatExport_<chat-title>_<date>/
16
+ * ├── messages.html (or messages.htm — chunked: messages2, messages3, …)
17
+ * ├── photos/
18
+ * ├── files/
19
+ * ├── stickers/
20
+ * └── voice_messages/
21
+ *
22
+ * Each messages*.html has structure:
23
+ *
24
+ * <div class="message default clearfix" id="message12345">
25
+ * <div class="body">
26
+ * <div class="from_name"> ↳ Sender Name </div> (may be absent on "joined" messages)
27
+ * <div class="text"> message text </div>
28
+ * <div class="pull_right date details" title="2024-01-01 14:23:45 UTC+03:00">14:23</div>
29
+ * </div>
30
+ * </div>
31
+ *
32
+ * Joined message = same sender as previous, has class "joined", no from_name.
33
+ * Service message = class "service" (joined chat, name change, …) — we skip these.
34
+ * Forwarded = "forwarded body" wrapping the message body.
35
+ * Reply = "reply_to details" sibling.
36
+ *
37
+ * We use regex-based parsing (no DOM dependency) because Telegram's class
38
+ * names are stable and we control which fields we care about. If Telegram
39
+ * radically changes the schema, parser breaks loudly (returns 0 messages
40
+ * + clear log) rather than silently corrupting.
41
+ */
42
+
43
+ import { readFileSync, existsSync, readdirSync, statSync } from 'node:fs';
44
+ import { join, basename, dirname } from 'node:path';
45
+
46
/**
 * Classify a filesystem path as a Telegram Desktop HTML export (or not).
 *
 * Two layouts are recognised:
 *   - a directory containing one or more `messages*.html` / `messages*.htm`
 *     chunks (the usual `ChatExport_xxx/` folder), and
 *   - a single `messages*.html` file on its own.
 *
 * In both cases the first candidate file must also contain Telegram-looking
 * markup before the path is accepted.
 *
 * @param {string} path - Directory or file to inspect.
 * @returns {{ type: 'dir' | 'file' | null, htmlFiles: string[] }}
 *   `type === null` (with an empty `htmlFiles`) means "not a Telegram HTML
 *   export". For `'dir'`, `htmlFiles` is ordered by chunk number.
 */
export function detectTelegramHtml(path) {
  const notTelegram = () => ({ type: null, htmlFiles: [] });

  if (!existsSync(path)) return notTelegram();
  const info = statSync(path);

  if (info.isDirectory()) {
    let names;
    try {
      names = readdirSync(path);
    } catch (_) {
      // Unreadable directory: treat it as "not an export" rather than failing.
      return notTelegram();
    }

    const htmlFiles = [];
    for (const name of names) {
      if (/^messages\d*\.html?$/i.test(name)) htmlFiles.push(join(path, name));
    }
    if (htmlFiles.length === 0) return notTelegram();

    // The marker probe runs on the first candidate in directory-listing order,
    // i.e. before the chunks are sorted.
    if (!looksLikeTelegram(safeReadHead(htmlFiles[0]))) return notTelegram();

    // messages.html < messages2.html < messages3.html …
    htmlFiles.sort(numericChunkSort);
    return { type: 'dir', htmlFiles };
  }

  // Bare-file layout: the file itself must be named like a messages chunk.
  const isCandidateFile =
    info.isFile() &&
    /\.html?$/i.test(path) &&
    /messages\d*\.html?$/i.test(basename(path));
  if (isCandidateFile && looksLikeTelegram(safeReadHead(path))) {
    return { type: 'file', htmlFiles: [path] };
  }

  return notTelegram();
}
83
+
84
/**
 * Read the beginning of a text file without ever throwing.
 *
 * The file is decoded as UTF-8 and truncated to the first `bytes` characters
 * of the decoded string.
 *
 * @param {string} file - Path of the file to read.
 * @param {number} [bytes=8192] - Maximum length of the returned string.
 * @returns {string} The head of the file, or '' when it cannot be read.
 */
function safeReadHead(file, bytes = 8192) {
  let head = '';
  try {
    const whole = readFileSync(file, 'utf-8');
    head = whole.slice(0, bytes);
  } catch (_) {
    head = '';
  }
  return head;
}
91
+
92
/**
 * Heuristic check that a chunk of HTML comes from a Telegram Desktop export.
 *
 * Accepts the text when it contains either page-level wrapper class, or when
 * it contains both a `from_name` and a `text` element.
 *
 * @param {string} head - Leading portion of an HTML file.
 * @returns {boolean} true when Telegram-specific markers are present.
 */
function looksLikeTelegram(head) {
  const pageMarkers = [/class="page_wrap"/, /class="page_body chat_page"/];
  if (pageMarkers.some((marker) => marker.test(head))) return true;
  return /class="from_name"/.test(head) && /class="text"/.test(head);
}
98
+
99
/**
 * Comparator ordering export chunks by their numeric suffix.
 *
 * `messages.html` (no suffix) and any name that does not match the pattern
 * count as chunk 0, so the un-numbered file sorts first.
 *
 * @param {string} a - First file path.
 * @param {string} b - Second file path.
 * @returns {number} Negative, zero or positive, as expected by Array#sort.
 */
function numericChunkSort(a, b) {
  const chunkNumber = (file) => {
    const hit = basename(file).match(/messages(\d*)\.html?/i);
    const digits = (hit ? hit[1] : '0') || '0';
    return parseInt(digits, 10);
  };
  return chunkNumber(a) - chunkNumber(b);
}
104
+
105
/**
 * Strip HTML tags and decode common entities → plain text.
 *
 * Conservative: `<br>` becomes a newline, `</p>` a blank line and `</div>`
 * a newline; every other tag is dropped. Runs of 3+ newlines are collapsed
 * to a single blank line and the result is trimmed.
 *
 * Entities are decoded in a SINGLE pass so that already-escaped text is not
 * decoded twice (`&amp;lt;` must become the literal text `&lt;`, not `<`).
 * Numeric references outside the Unicode range are left untouched instead of
 * throwing.
 *
 * @param {*} html - HTML fragment; falsy input yields ''.
 * @returns {string} Plain-text rendering of the fragment.
 */
function htmlToText(html) {
  if (!html) return '';

  const NAMED_ENTITIES = { nbsp: ' ', amp: '&', lt: '<', gt: '>', quot: '"', apos: "'" };

  const text = String(html)
    // Convert breaks to newlines BEFORE stripping tags.
    .replace(/<br\s*\/?>/gi, '\n')
    .replace(/<\/p>/gi, '\n\n')
    .replace(/<\/div>/gi, '\n')
    // Drop all remaining tags.
    .replace(/<[^>]+>/g, '')
    // Decode named, hex and decimal references in one pass.
    .replace(
      /&(?:#[xX]([0-9a-fA-F]+)|#(\d+)|(nbsp|amp|lt|gt|quot|apos));/g,
      (whole, hex, dec, name) => {
        if (name !== undefined) return NAMED_ENTITIES[name];
        const codePoint = hex !== undefined ? parseInt(hex, 16) : parseInt(dec, 10);
        // String.fromCodePoint throws RangeError above U+10FFFF; keep the
        // raw reference rather than aborting the whole import.
        return codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : whole;
      },
    );

  return text.replace(/\n{3,}/g, '\n\n').trim();
}
133
+
134
/**
 * Convert the `title` attribute of a Telegram date element into a timestamp.
 *
 * Accepted layouts (the `UTC±HH:MM` suffix is optional in both):
 *   • "2024-01-01 14:23:45 UTC+03:00"  — year first, dash separated
 *   • "01.01.2024 14:23:45 UTC+03:00"  — day first; ".", "/" or "-" separated
 *
 * Note that slash-separated dates are read day-first (DD/MM/YYYY) like the
 * dotted form. When no offset is present the time is taken as UTC.
 *
 * @param {string} title - Raw title attribute.
 * @returns {{ tsUnix: number, isoString: string } | null}
 *   `tsUnix` is seconds since the epoch; `isoString` is the wall-clock time
 *   as written, without any offset suffix. null when the text is unparseable.
 */
function parseTelegramDate(title) {
  if (!title) return null;

  let year;
  let month;
  let day;
  let clock;

  const yearFirst = title.match(/^(\d{4})-(\d{2})-(\d{2})\s+(\d{2}):(\d{2}):(\d{2})(?:\s+UTC([+-])(\d{2}):(\d{2}))?$/);
  if (yearFirst) {
    [, year, month, day, ...clock] = yearFirst;
  } else {
    const dayFirst = title.match(/^(\d{2})[.\/-](\d{2})[.\/-](\d{4})\s+(\d{2}):(\d{2}):(\d{2})(?:\s+UTC([+-])(\d{2}):(\d{2}))?$/);
    if (!dayFirst) return null;
    [, day, month, year, ...clock] = dayFirst;
  }

  const [hour, minute, second, sign, offsetHours, offsetMinutes] = clock;

  // Build an ISO 8601 string carrying the explicit offset (or Z if absent).
  const zone = sign ? `${sign}${offsetHours}:${offsetMinutes}` : 'Z';
  const iso = `${year}-${month}-${day}T${hour}:${minute}:${second}${zone}`;

  const millis = new Date(iso).getTime();
  if (Number.isNaN(millis)) return null;

  return {
    tsUnix: Math.floor(millis / 1000),
    isoString: iso.replace(/[+-]\d{2}:\d{2}$/, '').replace('Z', ''),
  };
}
165
+
166
/**
 * Parse a single message div (raw HTML segment) into the message shape that
 * importTelegram expects.
 *
 * Sender resolution:
 *   - The sender is the first `from_name` element that appears BEFORE the
 *     `forwarded body` wrapper (if any). The `from_name` inside the wrapper
 *     names the original author and is reported as `forwarded_from` instead.
 *   - A "joined" message without its own sender inherits `lastSender`.
 *   - Otherwise the sender is 'Unknown'.
 *
 * @param {string} messageHtml - HTML of one message block.
 * @param {string | null} lastSender - Sender of the previous message, used
 *   for "joined" continuation messages.
 * @returns {object | null} `{ id, type, date, date_unixtime, from, from_id,
 *   text, forwarded_from? }`, or null for service messages, blocks without a
 *   numeric message id, and messages with neither text nor recognised media.
 */
function parseMessageDiv(messageHtml, lastSender) {
  // Skip service messages outright.
  if (/class="message service\b/.test(messageHtml)) return null;

  // Message id from the outer div: id="message12345".
  const idMatch = messageHtml.match(/id="message(\d+)"/);
  if (!idMatch) return null;
  const msgId = idMatch[1];

  const isJoined = /class="message [^"]*joined/.test(messageHtml);

  // Forwarded marker, and the position where the forwarded wrapper starts.
  const forwardedAt = messageHtml.search(/class="forwarded body"/);
  const isForwarded = forwardedAt !== -1;
  let forwardedFrom = null;
  if (isForwarded) {
    const fwdM = messageHtml.match(/class="forwarded[^"]*"[\s\S]*?<div class="from_name"[^>]*>\s*([\s\S]*?)\s*<\/div>/);
    if (fwdM) {
      forwardedFrom = htmlToText(fwdM[1]).replace(/^Forwarded from:?\s*/i, '').trim();
    }
  }

  // Sender (from_name). Only look in front of the forwarded wrapper so the
  // original author's name is never mistaken for the sender, while a real
  // sender name on a forwarded message is still picked up.
  const senderScope = isForwarded ? messageHtml.slice(0, forwardedAt) : messageHtml;
  const fromM = senderScope.match(/<div class="from_name"[^>]*>\s*([\s\S]*?)\s*<\/div>/);
  let fromName = fromM ? htmlToText(fromM[1]).trim() : null;
  // If joined, inherit lastSender; otherwise fall back to a placeholder.
  if (!fromName && isJoined && lastSender) fromName = lastSender;
  if (!fromName) fromName = 'Unknown';

  // Date — title attribute on `.date.details`.
  const dateM = messageHtml.match(/class="[^"]*\bdate details[^"]*"\s+title="([^"]+)"/);
  const date = dateM ? parseTelegramDate(dateM[1]) : null;

  // Main text — the LAST `<div class="text">…</div>` in the block (the actual
  // message body, after any quoted/forwarded preamble).
  let text = '';
  const textMatches = [...messageHtml.matchAll(/<div class="text"[^>]*>([\s\S]*?)<\/div>(?=\s*(?:<div class="(?!text)|<\/div>|<a class="|$))/g)];
  if (textMatches.length > 0) {
    text = htmlToText(textMatches[textMatches.length - 1][1]);
  }

  // Reply marker — kept as a prefix so it stays searchable.
  const replyM = messageHtml.match(/class="reply_to details"[^>]*>([\s\S]*?)<\/div>/);
  if (replyM) {
    const replyTxt = htmlToText(replyM[1]).replace(/^In reply to\s+/i, '').trim();
    if (replyTxt) text = `↩ Reply: ${replyTxt}\n\n${text}`;
  }

  // Media-only messages: emit a placeholder so the row isn't lost. The class
  // attribute may list several classes, hence the word-boundary patterns.
  if (!text) {
    if (/class="[^"]*\bphoto_wrap\b/.test(messageHtml)) text = '[photo]';
    else if (/class="[^"]*\bmedia_voice_message\b/.test(messageHtml)) text = '[voice message]';
    else if (/class="[^"]*\bmedia_video_file\b/.test(messageHtml)) text = '[video]';
    else if (/class="[^"]*\bmedia_audio_file\b/.test(messageHtml)) text = '[audio]';
    else if (/class="[^"]*\bmedia_file\b/.test(messageHtml)) text = '[file]';
    else if (/class="[^"]*\bsticker\b/.test(messageHtml)) text = '[sticker]';
    else return null; // Truly empty — skip
  }

  // `date` and `date_unixtime` are always emitted; epoch placeholders are
  // used when the date could not be parsed.
  const tsUnix = date ? date.tsUnix : 0;

  return {
    id: parseInt(msgId, 10),
    type: 'message',
    date: (date && date.isoString) || '1970-01-01T00:00:00',
    date_unixtime: tsUnix > 0 ? String(tsUnix) : '0',
    from: fromName,
    // fromName is never empty at this point ('Unknown' at worst).
    from_id: `user_html_${slugify(fromName)}`,
    text: text,
    ...(forwardedFrom ? { forwarded_from: forwardedFrom } : {}),
  };
}
250
+
251
/**
 * Turn a display name into a short identifier fragment.
 *
 * Letters and digits of ANY script are kept (lower-cased); every other run of
 * characters becomes a single underscore. Keeping non-Latin letters matters
 * because the slug feeds the synthetic sender id: with an ASCII-only filter
 * every Cyrillic/CJK/etc. name would collapse to the same 'anon' value.
 *
 * @param {*} s - Value to slugify (stringified first).
 * @returns {string} At most 40 code points; 'anon' when nothing usable remains.
 */
function slugify(s) {
  const slug = String(s)
    .toLowerCase()
    .replace(/[^\p{L}\p{M}\p{N}]+/gu, '_')
    .replace(/^_+|_+$/g, '');
  // Truncate by code point so a surrogate pair is never cut in half.
  return [...slug].slice(0, 40).join('') || 'anon';
}
254
+
255
/**
 * Work out a human-readable chat title for an export.
 *
 * Sources are tried in this order:
 *   1. The bold text inside the `page_header` element.
 *   2. The `<title>` tag, minus a trailing "— Chat Export…" / "- Telegram…"
 *      suffix, unless what remains is only known boilerplate.
 *   3. The name of the folder containing `fallbackPath`, when that folder
 *      name starts with "ChatExport".
 *   4. The literal "Telegram chat".
 *
 * @param {string} htmlContent - Full text of a messages*.html file.
 * @param {string} [fallbackPath] - Path of that file, used for step 3.
 * @returns {string} A non-empty title.
 */
function extractChatTitle(htmlContent, fallbackPath) {
  // 1. Page header.
  const headerHit = htmlContent.match(/<div class="page_header"[\s\S]*?<div class="text bold"[^>]*>\s*([\s\S]*?)\s*<\/div>/);
  const fromHeader = headerHit ? htmlToText(headerHit[1]).trim() : '';
  if (fromHeader) return fromHeader;

  // 2. <title> tag, skipping boilerplate values.
  const titleHit = htmlContent.match(/<title>\s*([^<]+?)\s*<\/title>/i);
  if (titleHit) {
    const cleaned = titleHit[1]
      .trim()
      .replace(/\s*[—-]\s*(Chat Export|Telegram).*$/i, '')
      .trim();
    const isBoilerplate = /^(Telegram|Exported Data|Экспорт(ированные)? данные|Эспортированные данные)$/i.test(cleaned);
    if (cleaned && !isBoilerplate) return cleaned;
  }

  // 3. Name of the enclosing ChatExport_xxx folder.
  if (fallbackPath) {
    const folder = basename(dirname(fallbackPath));
    if (folder && folder.startsWith('ChatExport')) {
      const pretty = folder.replace(/^ChatExport_?/, '').replace(/_/g, ' ').trim();
      return pretty || 'Telegram chat';
    }
  }

  // 4. Last resort.
  return 'Telegram chat';
}
287
+
288
/**
 * Main entrypoint: read a Telegram HTML export and return an object shaped
 * like a Telegram JSON export, ready for importTelegram().
 *
 * Resulting shape:
 *   {
 *     personal_information: { user_id: "" },
 *     chats: { list: [{ id, name, type, messages: [...] }] },
 *     _source: { format, original_path, chunks, messages_total }
 *   }
 *
 * `type` is "personal_chat" when at most two distinct named senders appear
 * and "private_group" otherwise. Chunk files that cannot be read are skipped.
 *
 * @param {string} path - Export directory or a single messages*.html file.
 * @param {object} [opts] - Accepted for interface compatibility; not read here.
 * @returns {object | null} The export object, or null when the path is not a
 *   Telegram HTML export or no message could be parsed from it.
 */
export function parseTelegramHtmlExport(path, opts = {}) {
  const { type, htmlFiles } = detectTelegramHtml(path);
  if (!type || htmlFiles.length === 0) return null;

  const messages = [];
  let chatTitle = null;
  let lastSender = null;

  for (const htmlPath of htmlFiles) {
    let content;
    try {
      content = readFileSync(htmlPath, 'utf-8');
    } catch (_) {
      continue;
    }

    if (!chatTitle) chatTitle = extractChatTitle(content, htmlPath);

    // A message block runs from its opening `<div class="message …"` up to
    // the next such opening, the page footer, or the end of the body.
    const blocks = content.matchAll(/<div class="message [^"]*"[\s\S]*?(?=<div class="message [^"]*"|<div class="page_footer"|<\/body>)/g);
    for (const [blockHtml] of blocks) {
      const msg = parseMessageDiv(blockHtml, lastSender);
      if (!msg) continue;
      messages.push(msg);
      // Remember the sender for following "joined" continuation messages.
      if (msg.from && msg.from !== 'Unknown') lastSender = msg.from;
    }
  }

  if (messages.length === 0) return null;

  const name = chatTitle || 'Telegram chat';

  // Synthetic chat id derived from title + first message timestamp, so that
  // re-importing the same export yields the same id.
  const chatId = stableChatId(name, messages[0]?.date_unixtime || '0');

  // Sender diversity decides the chat type; stop counting once a third
  // distinct sender has been seen.
  const senders = new Set();
  messages.some((m) => {
    if (m.from && m.from !== 'Unknown') senders.add(m.from);
    return senders.size > 2;
  });
  const chatType = senders.size > 2 ? 'private_group' : 'personal_chat';

  const chat = { id: chatId, name, type: chatType, messages };
  return {
    personal_information: { user_id: '' },
    chats: { list: [chat] },
    _source: {
      format: 'telegram-html',
      original_path: path,
      chunks: htmlFiles.length,
      messages_total: messages.length,
    },
  };
}
375
+
376
/**
 * Derive a deterministic, non-negative numeric id from a chat title and the
 * timestamp of its first message.
 *
 * Uses a 32-bit rolling hash (multiply by 31, add the next UTF-16 code unit)
 * over the string `title + ':' + firstTs`, then takes the absolute value.
 *
 * @param {string} title - Chat title.
 * @param {string} firstTs - Unix timestamp of the first message, as text.
 * @returns {number} Integer in the range 0 … 2^31.
 */
function stableChatId(title, firstTs) {
  const key = title + ':' + firstTs;
  let acc = 0;
  for (let i = 0; i < key.length; i += 1) {
    // Math.imul keeps the multiplication in 32-bit space; `| 0` re-wraps
    // after the addition.
    acc = (Math.imul(acc, 31) + key.charCodeAt(i)) | 0;
  }
  return Math.abs(acc);
}
package/lib/parse.js ADDED
@@ -0,0 +1,175 @@
1
+ /**
2
+ * Shared dialogue-only parser for Claude Code / Cowork JSONL.
3
+ *
4
+ * Used by both the MCP server (server.js, importing inbox files) and the
5
+ * ingest daemon (ingest.js, reading deltas from raw source files).
6
+ */
7
+
8
/** Skip these top-level event types — they're not dialogue. */
export const CLAUDE_CODE_SKIP_TYPES = new Set(['queue-operation', 'ai-title', 'summary']);

/** Auto-generated user messages produced by /compact, /resume, and
 *  continuation flows. They're real messages (we keep them in the
 *  index), but they're never useful as conversation titles. */
export const CONTINUATION_PREFIXES = [
  'This session is being continued',
  'Continue from where you left off',
  'Please continue from where you left off',
];

/**
 * Tell whether a message text is auto-generated boilerplate.
 *
 * A text counts as boilerplate when it starts with one of
 * CONTINUATION_PREFIXES, or when it starts with "<" (XML/tag-wrapped
 * artefacts such as uploaded_files, system-reminder, command-name…).
 *
 * @param {*} text - Candidate message text.
 * @returns {boolean} true for boilerplate; false otherwise, including for
 *   non-string input (which would otherwise throw on `startsWith`).
 */
export function isContinuationBoilerplate(text) {
  if (typeof text !== 'string') return false;
  return text.startsWith('<') || CONTINUATION_PREFIXES.some((prefix) => text.startsWith(prefix));
}
26
+
27
/**
 * Reduce one Claude Code / Cowork JSONL record to a plain dialogue message.
 *
 * Two record layouts are understood:
 *   - flat:   {"role":"user","content":"...","timestamp":"..."}
 *             (`text` is used when `content` is undefined)
 *   - nested: {"type":"user","message":{"role":"user","content":...},...}
 *
 * The record is skipped (null) when:
 *   - it is not an object,
 *   - its `type` is listed in CLAUDE_CODE_SKIP_TYPES,
 *   - it carries an `attachment` but no `message`,
 *   - it has no string `role`, or
 *   - no non-blank text remains after filtering.
 *
 * From array content only plain strings and `{type:"text", text:"..."}`
 * blocks are kept (joined with newlines); every other block type is dropped.
 *
 * A user record flagged `isCompactSummary` or `isVisibleInTranscriptOnly` is
 * re-tagged with role 'summary' so callers can treat it differently from
 * ordinary user turns.
 *
 * @param {*} obj - Parsed JSONL record.
 * @returns {{ role: string, text: string, id: *, timestamp: *, uuid: *,
 *   parentUuid: * } | null}
 */
export function extractMessageFromRecord(obj) {
  if (!obj || typeof obj !== 'object') return null;
  if (CLAUDE_CODE_SKIP_TYPES.has(obj.type)) return null;
  if (obj.attachment && !obj.message) return null;

  // Pick the envelope that actually carries role/content.
  const envelope = obj.message;
  const hasEnvelope = Boolean(envelope) && typeof envelope === 'object';

  const declaredRole = hasEnvelope ? envelope.role : obj.role;
  if (!declaredRole || typeof declaredRole !== 'string') return null;

  let rawContent;
  if (hasEnvelope) rawContent = envelope.content;
  else rawContent = obj.content !== undefined ? obj.content : obj.text;

  const text = collectDialogueText(rawContent);
  if (text.trim() === '') return null;

  const isSyntheticSummary =
    declaredRole === 'user' &&
    (obj.isCompactSummary === true || obj.isVisibleInTranscriptOnly === true);

  return {
    role: isSyntheticSummary ? 'summary' : declaredRole,
    text,
    id: (hasEnvelope && envelope.id) || obj.id || null,
    timestamp: obj.timestamp || (hasEnvelope && envelope.timestamp) || null,
    uuid: obj.uuid || null,
    parentUuid: obj.parentUuid || null,
  };
}

/** Flatten record content (string or block array) into dialogue-only text. */
function collectDialogueText(content) {
  if (typeof content === 'string') return content;
  if (!Array.isArray(content)) return '';
  return content
    .flatMap((block) => {
      if (typeof block === 'string') return [block];
      const isTextBlock =
        Boolean(block) &&
        typeof block === 'object' &&
        block.type === 'text' &&
        typeof block.text === 'string';
      return isTextBlock ? [block.text] : [];
    })
    .join('\n');
}
119
+
120
/**
 * Recognise a compaction-boundary record.
 *
 * Two spellings are accepted:
 *   - on-disk:  {type:"system", subtype:"compact_boundary", compactMetadata:{…}}
 *   - snapshot: {type:"compact-boundary", metadata:{…}}
 *
 * The logical parent is taken from the record itself when present, otherwise
 * from the metadata object.
 *
 * @param {*} obj - Parsed JSONL record.
 * @returns {{ timestamp: *, uuid: *, parentUuid: *, logicalParentUuid: *,
 *   metadata: object, id: *, raw: object } | null}
 *   null when the record is not a boundary. `raw` is the input record itself;
 *   `metadata` defaults to `{}` when the record carries none.
 */
export function extractCompactBoundary(obj) {
  if (!obj || typeof obj !== 'object') return null;

  const isOnDiskBoundary = obj.type === 'system' && obj.subtype === 'compact_boundary';
  const isSnapshotBoundary = obj.type === 'compact-boundary';
  if (!isOnDiskBoundary && !isSnapshotBoundary) return null;

  const metadata = (isOnDiskBoundary ? obj.compactMetadata : obj.metadata) || {};
  const logicalParentUuid = obj.logicalParentUuid || metadata.logicalParentUuid || null;

  return {
    timestamp: obj.timestamp || null,
    uuid: obj.uuid || null,
    parentUuid: obj.parentUuid || null,
    logicalParentUuid,
    metadata,
    id: obj.id || null,
    raw: obj,
  };
}
163
+
164
/**
 * Read the title out of an `ai-title` record.
 *
 * @param {*} obj - Parsed JSONL record.
 * @returns {string | null} The trimmed `aiTitle`, or null when the record is
 *   not of type 'ai-title' or its title is missing, non-string or blank.
 */
export function extractAiTitle(obj) {
  if (!obj || obj.type !== 'ai-title') return null;
  if (typeof obj.aiTitle !== 'string') return null;
  const title = obj.aiTitle.trim();
  return title ? title : null;
}
Binary file
@@ -0,0 +1,116 @@
1
+ /**
2
+ * URL canonicalization for stable deduplication of stored web documents.
3
+ *
4
+ * Goal: two URLs that point to "the same document" should map to the same
5
+ * canonical form, so memex_store_document gives them the same conversation_id
6
+ * via sha256(canonical).
7
+ *
8
+ * What we normalize:
9
+ * - Lowercase scheme + host
10
+ * - Strip known tracking params (utm_*, fbclid, gclid, ref, mc_*, _ga, …)
11
+ * - Drop the fragment (#anchor) — same document
12
+ * - Normalize trailing slash on pathname
13
+ *
14
+ * What we DON'T normalize:
15
+ * - Path case (some servers are case-sensitive)
16
+ * - Non-tracking query params (?q= search, ?id= permalinks — meaningful)
17
+ * - Port (rare in public URLs)
18
+ *
19
+ * If the input isn't a valid URL, we return the input unchanged. Callers
20
+ * should still hash the result for deduplication.
21
+ */
22
+
23
// Well-known tracking-param families. Case-insensitive prefix match.
const TRACKING_PREFIXES = [
  'utm_', // Google Analytics
  'mc_', // Mailchimp
];

// Exact tracking-param names. Entries MUST be lowercase: isTrackingParam
// lowercases the incoming name before looking it up here.
const TRACKING_EXACT = new Set([
  'fbclid', // Facebook
  'gclid', // Google ads
  'dclid', // Google DoubleClick
  'gbraid', // Google
  'wbraid', // Google
  'yclid', // Yandex
  'msclkid', // Microsoft ads
  'twclid', // Twitter
  'igshid', // Instagram
  'ref', // generic referrer
  'ref_source',
  'ref_url',
  'referrer',
  'source', // common referrer flag (NOT always tracking but very often)
  '_ga', // Google Analytics
  '_gl', // Google Analytics linker
  'hsctatracking', // seen in URLs as "hsCtaTracking"; stored lowercase so the lookup matches
  'hsenc',
  'hsmi',
  'mkt_tok',
  'pk_campaign',
  'pk_source',
  'pk_medium',
  'pk_keyword',
  'pk_content',
  'vero_id',
  'vero_conv',
]);

/**
 * Decide whether a query-parameter name is a known tracking parameter.
 *
 * The comparison is case-insensitive: the name is lowercased, then checked
 * against the exact-name set and the prefix list.
 *
 * @param {string} name - Query-parameter name.
 * @returns {boolean} true when the parameter should be stripped.
 */
function isTrackingParam(name) {
  const lower = name.toLowerCase();
  return TRACKING_EXACT.has(lower) || TRACKING_PREFIXES.some((prefix) => lower.startsWith(prefix));
}
66
+
67
/**
 * Reduce a URL to a canonical form suitable for hashing / deduplication.
 *
 * Steps applied to a parseable URL: lowercase scheme and host, remove the
 * fragment, drop query parameters that isTrackingParam() flags (the others
 * keep their order), and strip trailing slashes from non-root paths.
 *
 * @param {string} rawUrl
 * @returns {string} The canonical URL. Non-string or blank input is returned
 *   unchanged; a string that does not parse as a URL is returned trimmed.
 */
export function canonicalize(rawUrl) {
  if (typeof rawUrl !== 'string') return rawUrl;
  const trimmed = rawUrl.trim();
  if (!trimmed) return rawUrl;

  let url;
  try {
    url = new URL(trimmed);
  } catch (_) {
    return trimmed;
  }

  // The URL parser already lowercases these; kept explicit on purpose.
  url.protocol = url.protocol.toLowerCase();
  url.hostname = url.hostname.toLowerCase();

  // Fragments address a position inside the same document.
  url.hash = '';

  // Rebuild the query string from the non-tracking parameters only.
  const kept = [...url.searchParams].filter(([key]) => !isTrackingParam(key));
  url.search = new URLSearchParams(kept).toString();

  // /foo and /foo/ are treated as the same document; the root "/" stays.
  const { pathname } = url;
  if (pathname.length > 1 && pathname.endsWith('/')) {
    url.pathname = pathname.replace(/\/+$/, '');
  }

  return url.toString();
}
103
+
104
/**
 * Pull a display domain (e.g. "perplexity.ai") out of a URL for metadata.
 *
 * The host is lowercased and a leading "www." is removed.
 *
 * @param {*} rawUrl - URL text.
 * @returns {string | null} The domain, or null for non-string or unparseable
 *   input.
 */
export function extractDomain(rawUrl) {
  if (typeof rawUrl !== 'string') return null;

  let host;
  try {
    host = new URL(rawUrl).hostname;
  } catch (_) {
    return null;
  }
  return host.toLowerCase().replace(/^www\./, '');
}