parallelclaw 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +204 -0
- package/HELP.md +600 -0
- package/LICENSE +21 -0
- package/MULTI_MACHINE.md +152 -0
- package/README.md +417 -0
- package/README.ru.md +740 -0
- package/SYNC.md +844 -0
- package/bot/README.md +173 -0
- package/bot/config.js +66 -0
- package/bot/inbox.js +153 -0
- package/bot/index.js +294 -0
- package/bot/nexara.js +61 -0
- package/bot/poll.js +304 -0
- package/bot/search.js +155 -0
- package/bot/telegram.js +96 -0
- package/ingest.js +2712 -0
- package/lib/cli/index.js +1987 -0
- package/lib/config.js +220 -0
- package/lib/db-init.js +158 -0
- package/lib/hook/install.js +268 -0
- package/lib/import-telegram.js +158 -0
- package/lib/ingest-file.js +779 -0
- package/lib/notify-click-action.js +281 -0
- package/lib/openclaw-channel.js +643 -0
- package/lib/parse-cursor.js +172 -0
- package/lib/parse-obsidian.js +256 -0
- package/lib/parse-telegram-html.js +384 -0
- package/lib/parse.js +175 -0
- package/lib/render-markdown.js +0 -0
- package/lib/store-doc/canonicalize.js +116 -0
- package/lib/store-doc/detect.js +209 -0
- package/lib/store-doc/extract-title.js +162 -0
- package/lib/sync/auth.js +80 -0
- package/lib/sync/cert.js +144 -0
- package/lib/sync/cli.js +906 -0
- package/lib/sync/client.js +138 -0
- package/lib/sync/config.js +130 -0
- package/lib/sync/pair.js +145 -0
- package/lib/sync/pull.js +158 -0
- package/lib/sync/push.js +305 -0
- package/lib/sync/replicate.js +335 -0
- package/lib/sync/server.js +224 -0
- package/lib/sync/service.js +726 -0
- package/lib/tasks.js +215 -0
- package/lib/telegram-decisions.js +165 -0
- package/lib/telegram-discovery.js +373 -0
- package/lib/telegram-notify.js +272 -0
- package/lib/telegram-pending.js +200 -0
- package/lib/web/index.js +265 -0
- package/lib/web/routes/conversation.js +193 -0
- package/lib/web/routes/conversations.js +180 -0
- package/lib/web/routes/dashboard.js +175 -0
- package/lib/web/routes/pending.js +277 -0
- package/lib/web/routes/settings.js +226 -0
- package/lib/web/static/style.css +393 -0
- package/lib/web/templates.js +234 -0
- package/package.json +84 -0
- package/server.js +3816 -0
- package/skills/install-memex/README.md +109 -0
- package/skills/install-memex/SKILL.md +342 -0
- package/skills/install-memex/examples.md +294 -0
- package/skills/install-memex-claw/SKILL.md +423 -0
|
@@ -0,0 +1,384 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Telegram Desktop HTML export → Telegram-JSON-shape converter.
|
|
3
|
+
*
|
|
4
|
+
* Telegram Desktop offers two export formats:
|
|
5
|
+
* - "Machine-readable JSON" — what memex's importTelegram expects
|
|
6
|
+
* - "Human-readable HTML" — what many users pick by default
|
|
7
|
+
*
|
|
8
|
+
* Users frequently export as HTML by accident (often the default in the
|
|
9
|
+
* Telegram UI), then memex's inbox watcher silently ignores the dropped
|
|
10
|
+
* directory. This module makes HTML work: parse → emit the same shape
|
|
11
|
+
* importTelegram already understands.
|
|
12
|
+
*
|
|
13
|
+
* Telegram's HTML export is reasonably stable:
|
|
14
|
+
*
|
|
15
|
+
* ChatExport_<chat-title>_<date>/
|
|
16
|
+
* ├── messages.html (or messages.htm — chunked: messages2, messages3, …)
|
|
17
|
+
* ├── photos/
|
|
18
|
+
* ├── files/
|
|
19
|
+
* ├── stickers/
|
|
20
|
+
* └── voice_messages/
|
|
21
|
+
*
|
|
22
|
+
* Each messages*.html has structure:
|
|
23
|
+
*
|
|
24
|
+
* <div class="message default clearfix" id="message12345">
|
|
25
|
+
* <div class="body">
|
|
26
|
+
* <div class="from_name"> ↳ Sender Name </div> (may be absent on "joined" messages)
|
|
27
|
+
* <div class="text"> message text </div>
|
|
28
|
+
* <div class="pull_right date details" title="2024-01-01 14:23:45 UTC+03:00">14:23</div>
|
|
29
|
+
* </div>
|
|
30
|
+
* </div>
|
|
31
|
+
*
|
|
32
|
+
* Joined message = same sender as previous, has class "joined", no from_name.
|
|
33
|
+
* Service message = class "service" (joined chat, name change, …) — we skip these.
|
|
34
|
+
* Forwarded = "forwarded body" wrapping the message body.
|
|
35
|
+
* Reply = "reply_to details" sibling.
|
|
36
|
+
*
|
|
37
|
+
* We use regex-based parsing (no DOM dependency) because Telegram's class
|
|
38
|
+
* names are stable and we control which fields we care about. If Telegram
|
|
39
|
+
* radically changes the schema, parser breaks loudly (returns 0 messages
|
|
40
|
+
* + clear log) rather than silently corrupting.
|
|
41
|
+
*/
|
|
42
|
+
|
|
43
|
+
import { closeSync, existsSync, openSync, readFileSync, readSync, readdirSync, statSync } from 'node:fs';
import { basename, dirname, join } from 'node:path';
|
|
45
|
+
|
|
46
|
+
/**
 * Detect whether `path` is a Telegram Desktop HTML export.
 *
 * Accepts either a directory containing messages*.html chunks (the usual
 * ChatExport_xxx/ folder) or a single messages*.html file.
 *
 * @param {string} path - directory or file to inspect
 * @returns {{ type: 'dir' | 'file' | null, htmlFiles: string[] }}
 *   `type` is null (and `htmlFiles` empty) when the path is missing,
 *   unreadable, or does not look like a Telegram HTML export. For a
 *   directory, `htmlFiles` is sorted by chunk number.
 */
export function detectTelegramHtml(path) {
  const notTelegram = () => ({ type: null, htmlFiles: [] });

  if (!existsSync(path)) return notTelegram();

  // The path can disappear or be unreadable between existsSync and statSync;
  // report "not an export" instead of throwing at the caller.
  let stats;
  try {
    stats = statSync(path);
  } catch (_) {
    return notTelegram();
  }

  // Directory case: look for messages*.html inside
  if (stats.isDirectory()) {
    let entries;
    try {
      entries = readdirSync(path);
    } catch (_) {
      return notTelegram();
    }

    const htmlFiles = entries
      .filter((f) => /^messages\d*\.html?$/i.test(f))
      .map((f) => join(path, f));
    if (htmlFiles.length === 0) return notTelegram();

    // Sort chunks first (messages.html < messages2.html < messages3.html …)
    // so the file we sniff is always the first chunk rather than whatever
    // readdirSync happened to list first.
    htmlFiles.sort(numericChunkSort);
    if (!looksLikeTelegram(safeReadHead(htmlFiles[0]))) return notTelegram();
    return { type: 'dir', htmlFiles };
  }

  // Single file case: must be messages*.html
  const isMessagesFile =
    stats.isFile() &&
    /\.html?$/i.test(path) &&
    /messages\d*\.html?$/i.test(basename(path));
  if (isMessagesFile && looksLikeTelegram(safeReadHead(path))) {
    return { type: 'file', htmlFiles: [path] };
  }

  return notTelegram();
}
|
|
83
|
+
|
|
84
|
+
/**
 * Read the beginning of a file as UTF-8 text without loading the whole file.
 *
 * Only the first `bytes` bytes are read from disk, so sniffing a large export
 * chunk stays cheap. A multi-byte character cut at the boundary decodes to a
 * replacement character, which is harmless for marker detection.
 *
 * @param {string} file - path of the file to read
 * @param {number} [bytes=8192] - maximum number of bytes to read
 * @returns {string} decoded head of the file, or '' if it cannot be read
 */
function safeReadHead(file, bytes = 8192) {
  let fd;
  try {
    fd = openSync(file, 'r');
    const buffer = Buffer.alloc(bytes);
    const bytesRead = readSync(fd, buffer, 0, bytes, 0);
    return buffer.toString('utf-8', 0, bytesRead);
  } catch (_) {
    // Best-effort probe: an unreadable file simply has no recognisable head.
    return '';
  } finally {
    if (fd !== undefined) {
      try {
        closeSync(fd);
      } catch (_) {
        // Nothing useful to do if closing the descriptor fails.
      }
    }
  }
}
|
|
91
|
+
|
|
92
|
+
/**
 * Heuristic check that a chunk of HTML came from a Telegram Desktop export.
 *
 * @param {string} head - leading portion of an HTML file
 * @returns {boolean} true when the page wrapper, the chat page body, or both
 *   the sender-name and text classes are present
 */
function looksLikeTelegram(head) {
  const has = (marker) => marker.test(head);

  if (has(/class="page_wrap"/)) return true;
  if (has(/class="page_body chat_page"/)) return true;
  // Neither page-level marker: require both per-message markers together.
  return has(/class="from_name"/) && has(/class="text"/);
}
|
|
98
|
+
|
|
99
|
+
/**
 * Comparator ordering export chunks by their numeric suffix:
 * messages.html (no suffix, treated as 0) < messages2.html < messages10.html.
 *
 * @param {string} a - first file path
 * @param {string} b - second file path
 * @returns {number} negative, zero, or positive, as Array.prototype.sort expects
 */
function numericChunkSort(a, b) {
  const chunkNumber = (file) => {
    const found = basename(file).match(/messages(\d*)\.html?/i);
    // No match, or an empty suffix, both count as chunk 0.
    const digits = found ? found[1] : '0';
    return parseInt(digits || '0', 10);
  };

  return chunkNumber(a) - chunkNumber(b);
}
|
|
104
|
+
|
|
105
|
+
/**
 * Strip HTML tags and decode common entities → plain text.
 *
 * Conservative: <br> becomes a newline, a closing paragraph tag becomes a
 * blank line, a closing div becomes a newline; every other tag is dropped.
 *
 * Entities are decoded in ONE pass over the text so that an escaped
 * ampersand followed by "lt;" stays a literal entity string instead of being
 * decoded twice into an angle bracket. Supported: the named entities amp, lt,
 * gt, quot, apos and nbsp (nbsp becomes a plain space), plus decimal and
 * hexadecimal numeric references. Unknown names and out-of-range code points
 * are left untouched rather than throwing.
 *
 * @param {string} html - HTML fragment (falsy input yields '')
 * @returns {string} trimmed plain text with runs of 3+ newlines collapsed to 2
 */
function htmlToText(html) {
  if (!html) return '';

  // Keyed by bare entity name; the leading ampersand and trailing semicolon
  // are matched by the regex below.
  const namedEntities = new Map([
    ['nbsp', ' '],
    ['amp', '&'],
    ['lt', '<'],
    ['gt', '>'],
    ['quot', '"'],
    ['apos', "'"],
  ]);

  const decodeEntity = (whole, hex, dec, name) => {
    if (name !== undefined) {
      const replacement = namedEntities.get(name.toLowerCase());
      return replacement === undefined ? whole : replacement;
    }
    const codePoint = hex !== undefined ? parseInt(hex, 16) : parseInt(dec, 10);
    // String.fromCodePoint throws a RangeError above U+10FFFF.
    const valid = Number.isInteger(codePoint) && codePoint >= 0 && codePoint <= 0x10ffff;
    return valid ? String.fromCodePoint(codePoint) : whole;
  };

  return String(html)
    // Convert breaks to newlines BEFORE stripping tags
    .replace(/<br\s*\/?>/gi, '\n')
    .replace(/<\/p>/gi, '\n\n')
    .replace(/<\/div>/gi, '\n')
    // Drop all remaining tags
    .replace(/<[^>]+>/g, '')
    // Decode entities after tag stripping so decoded angle brackets survive
    .replace(/&(?:#x([0-9a-f]+)|#(\d+)|([a-z]+));/gi, decodeEntity)
    // Collapse 3+ newlines, trim
    .replace(/\n{3,}/g, '\n\n')
    .trim();
}
|
|
133
|
+
|
|
134
|
+
/**
 * Parse the `title` attribute of a Telegram date element.
 *
 * Recognised layouts (the UTC offset suffix is optional in both):
 *   • "2024-01-01 14:23:45 UTC+03:00"  — year first, dash-separated
 *   • "01.01.2024 14:23:45 UTC+03:00"  — day first; ".", "/" or "-" separators
 * Slash- or dash-separated day-first strings are always read as DD/MM/YYYY.
 *
 * @param {string} title - raw title attribute value
 * @returns {{ tsUnix: number, isoString: string } | null}
 *   `tsUnix` is seconds since the epoch (offset applied, UTC when absent);
 *   `isoString` is the local wall-clock time "YYYY-MM-DDTHH:MM:SS" without
 *   any offset. Null when the string is empty, unrecognised, or not a valid date.
 */
function parseTelegramDate(title) {
  if (!title) return null;

  // ISO: YYYY-MM-DD HH:MM:SS [UTC±HH:MM]
  const isoHit = title.match(/^(\d{4})-(\d{2})-(\d{2})\s+(\d{2}):(\d{2}):(\d{2})(?:\s+UTC([+-])(\d{2}):(\d{2}))?$/);
  // European: DD.MM.YYYY HH:MM:SS [UTC±HH:MM] (also supports "/" or "-" as separator)
  const dayFirstHit = isoHit
    ? null
    : title.match(/^(\d{2})[.\/-](\d{2})[.\/-](\d{4})\s+(\d{2}):(\d{2}):(\d{2})(?:\s+UTC([+-])(\d{2}):(\d{2}))?$/);

  const hit = isoHit || dayFirstHit;
  if (!hit) return null;

  // Groups 1-3 are the date parts (order depends on the layout); 4-9 are shared.
  const [year, month, day] = isoHit ? [hit[1], hit[2], hit[3]] : [hit[3], hit[2], hit[1]];
  const [hours, minutes, seconds, sign, offsetHours, offsetMinutes] = hit.slice(4);

  // Construct an ISO 8601 string with the explicit offset (or UTC if absent)
  const zone = sign ? `${sign}${offsetHours}:${offsetMinutes}` : 'Z';
  const iso = `${year}-${month}-${day}T${hours}:${minutes}:${seconds}${zone}`;

  const millis = new Date(iso).getTime();
  if (Number.isNaN(millis)) return null;

  return {
    tsUnix: Math.floor(millis / 1000),
    isoString: iso.replace(/[+-]\d{2}:\d{2}$/, '').replace('Z', ''),
  };
}
|
|
165
|
+
|
|
166
|
+
/**
 * Return a bracketed placeholder describing the media attached to a message,
 * or '' when none of the known media classes are present.
 * Word-boundary regexes are used because class attributes such as
 * "photo_wrap clearfix pull_left" would not match a strict class="photo_wrap".
 */
function mediaPlaceholder(messageHtml) {
  if (/class="[^"]*\bphoto_wrap\b/.test(messageHtml)) return '[photo]';
  if (/class="[^"]*\bmedia_voice_message\b/.test(messageHtml)) return '[voice message]';
  if (/class="[^"]*\bmedia_video_file\b/.test(messageHtml)) return '[video]';
  if (/class="[^"]*\bmedia_audio_file\b/.test(messageHtml)) return '[audio]';
  if (/class="[^"]*\bmedia_file\b/.test(messageHtml)) return '[file]';
  if (/class="[^"]*\bsticker\b/.test(messageHtml)) return '[sticker]';
  return '';
}

/**
 * Parse a single message div (raw HTML segment) into the message shape
 * importTelegram expects.
 *
 * @param {string} messageHtml - HTML from one message's opening div up to the next
 * @param {string|null} lastSender - sender of the previous message, inherited
 *   by "joined" continuation messages that carry no sender name of their own
 * @returns {{ id: number, type: 'message', date: string, date_unixtime: string,
 *   from: string, from_id: string, text: string, forwarded_from?: string } | null}
 *   null for service messages, segments without a message id, and messages
 *   with no text, no reply marker and no recognised media.
 */
function parseMessageDiv(messageHtml, lastSender) {
  // Skip service messages outright
  if (/class="message service\b/.test(messageHtml)) return null;

  // Extract message id from outer div: id="message12345"
  const idMatch = messageHtml.match(/id="message(\d+)"/);
  if (!idMatch) return null;
  const msgId = idMatch[1];

  const isJoined = /class="message [^"]*joined/.test(messageHtml);

  // Forwarded marker
  const forwardedAt = messageHtml.search(/class="forwarded body"/);
  const isForwarded = forwardedAt !== -1;
  let forwardedFrom = null;
  if (isForwarded) {
    const fwdM = messageHtml.match(/class="forwarded[^"]*"[\s\S]*?<div class="from_name"[^>]*>\s*([\s\S]*?)\s*<\/div>/);
    if (fwdM) {
      // NOTE(review): the forwarded from_name appears to embed the original
      // send date in a "date details" span — confirm against a real export.
      // Removing it is a no-op when the span is absent.
      const nameOnly = fwdM[1].replace(/<span class="[^"]*\bdate details[^"]*"[\s\S]*?<\/span>/g, '');
      forwardedFrom = htmlToText(nameOnly).replace(/^Forwarded from:?\s*/i, '').trim();
    }
  }

  // Sender (from_name) — absent on joined messages. For forwarded messages
  // only look BEFORE the forwarded body: a from_name inside it names the
  // original author, not the person who sent this message.
  const senderScope = isForwarded ? messageHtml.slice(0, forwardedAt) : messageHtml;
  const fromM = senderScope.match(/<div class="from_name"[^>]*>\s*([\s\S]*?)\s*<\/div>/);
  let fromName = fromM ? htmlToText(fromM[1]).trim() : null;
  // If joined, inherit lastSender; otherwise use parsed or fallback
  if (!fromName && isJoined && lastSender) fromName = lastSender;
  if (!fromName) fromName = 'Unknown';

  // Date — title attribute on `.date.details` (first occurrence)
  const dateM = messageHtml.match(/class="[^"]*\bdate details[^"]*"\s+title="([^"]+)"/);
  const date = dateM ? parseTelegramDate(dateM[1]) : null;

  // Main text — last `<div class="text">…</div>` inside body (forwards may have one earlier)
  let text = '';
  const textMatches = [...messageHtml.matchAll(/<div class="text"[^>]*>([\s\S]*?)<\/div>(?=\s*(?:<div class="(?!text)|<\/div>|<a class="|$))/g)];
  if (textMatches.length > 0) {
    text = htmlToText(textMatches[textMatches.length - 1][1]);
  }

  // Photo / media — if no text, note the media presence so the row isn't lost.
  // Done BEFORE the reply prefix: otherwise a text-less media reply would
  // already look non-empty and lose its media marker.
  if (!text) text = mediaPlaceholder(messageHtml);

  // Reply marker — include as prefix so it's searchable but not lost
  const replyM = messageHtml.match(/class="reply_to details"[^>]*>([\s\S]*?)<\/div>/);
  if (replyM) {
    const replyTxt = htmlToText(replyM[1]).replace(/^In reply to\s+/i, '').trim();
    if (replyTxt) text = `↩ Reply: ${replyTxt}\n\n${text}`;
  }

  // Truly empty — skip
  if (!text) return null;

  // Build the message object in the shape importTelegram expects
  // (date and date_unixtime are required by the importer)
  const tsUnix = date ? date.tsUnix : 0;

  return {
    id: parseInt(msgId, 10),
    type: 'message',
    date: (date && date.isoString) || '1970-01-01T00:00:00',
    date_unixtime: tsUnix > 0 ? String(tsUnix) : '0',
    from: fromName,
    from_id: `user_html_${slugify(fromName)}`,
    text,
    ...(forwardedFrom ? { forwarded_from: forwardedFrom } : {}),
  };
}
|
|
250
|
+
|
|
251
|
+
/**
 * Turn a display name into a short identifier fragment for synthetic ids.
 *
 * Letters, combining marks and digits of ANY script are kept (lower-cased);
 * every other run of characters becomes a single "_". The previous
 * ASCII-only filter reduced every non-Latin name (e.g. Cyrillic) to the same
 * 'anon' slug, so unrelated senders shared one id. Output for pure-ASCII
 * names is unchanged.
 *
 * @param {*} s - value to slugify (coerced with String())
 * @returns {string} at most 40 code points, or 'anon' when nothing is left
 */
function slugify(s) {
  const slug = String(s)
    .toLowerCase()
    .replace(/[^\p{L}\p{M}\p{N}]+/gu, '_')
    .replace(/^_+|_+$/g, '');
  // Truncate by code point so a surrogate pair is never cut in half.
  return [...slug].slice(0, 40).join('') || 'anon';
}
|
|
254
|
+
|
|
255
|
+
/**
 * Work out the chat title for an export chunk.
 *
 * Resolution order:
 *   1. the bold text inside the page header (the chat name as Telegram shows it);
 *   2. the <title> tag, minus a trailing "— Chat Export" / "- Telegram" suffix,
 *      unless what remains is locale boilerplate ("Telegram", "Exported Data", …);
 *   3. the parent folder name when it starts with "ChatExport"
 *      (prefix removed, underscores turned into spaces);
 *   4. the literal "Telegram chat".
 *
 * @param {string} htmlContent - full HTML of the chunk
 * @param {string} [fallbackPath] - path of the chunk file, used for step 3
 * @returns {string} never empty
 */
function extractChatTitle(htmlContent, fallbackPath) {
  const DEFAULT_TITLE = 'Telegram chat';

  // 1. Page header
  const header = htmlContent.match(/<div class="page_header"[\s\S]*?<div class="text bold"[^>]*>\s*([\s\S]*?)\s*<\/div>/);
  const headerText = header ? htmlToText(header[1]).trim() : '';
  if (headerText) return headerText;

  // 2. <title> tag
  const titleTag = htmlContent.match(/<title>\s*([^<]+?)\s*<\/title>/i);
  if (titleTag) {
    const cleaned = titleTag[1]
      .trim()
      .replace(/\s*[—-]\s*(Chat Export|Telegram).*$/i, '')
      .trim();
    // Telegram itself uses these as the page <title>; they say nothing about the chat.
    const isBoilerplate = /^(Telegram|Exported Data|Экспорт(ированные)? данные|Эспортированные данные)$/i.test(cleaned);
    if (cleaned && !isBoilerplate) return cleaned;
  }

  // 3. Parent ChatExport_xxx folder
  if (!fallbackPath) return DEFAULT_TITLE;
  const folder = basename(dirname(fallbackPath));
  if (!folder || !folder.startsWith('ChatExport')) return DEFAULT_TITLE;
  return folder.replace(/^ChatExport_?/, '').replace(/_/g, ' ').trim() || DEFAULT_TITLE;
}
|
|
287
|
+
|
|
288
|
+
/**
 * Main entrypoint: convert a Telegram HTML export into an object shaped like
 * a Telegram JSON export, ready for importTelegram().
 *
 * @param {string} path - export directory or single messages*.html file
 * @param {object} [opts] - accepted for interface compatibility; not read here
 * @returns {object|null} null when the path is not a Telegram HTML export or
 *   no message could be parsed; otherwise
 *   {
 *     personal_information: { user_id: "" },
 *     chats: { list: [{ id, name, type, messages: [...] }] },
 *     _source: { format, original_path, chunks, messages_total }
 *   }
 *   `id` is a numeric hash of the title and first message timestamp; `type`
 *   is "private_group" when more than two distinct senders appear, otherwise
 *   "personal_chat".
 */
export function parseTelegramHtmlExport(path, opts = {}) {
  const { type, htmlFiles } = detectTelegramHtml(path);
  if (!type || htmlFiles.length === 0) return null;

  const messages = [];
  let title = null;
  let previousSender = null;

  for (const file of htmlFiles) {
    let html;
    try {
      html = readFileSync(file, 'utf-8');
    } catch (_) {
      // An unreadable chunk is skipped; the remaining chunks are still imported.
      continue;
    }

    // The first readable chunk decides the title.
    title = title || extractChatTitle(html, file);

    // A message block runs from its opening `<div class="message …"` up to
    // the next message, the page footer, or the end of the body.
    const blocks = html.matchAll(/<div class="message [^"]*"[\s\S]*?(?=<div class="message [^"]*"|<div class="page_footer"|<\/body>)/g);
    for (const [block] of blocks) {
      const parsed = parseMessageDiv(block, previousSender);
      if (!parsed) continue;
      messages.push(parsed);
      // Remember the sender so "joined" continuation messages can inherit it.
      if (parsed.from && parsed.from !== 'Unknown') previousSender = parsed.from;
    }
  }

  if (messages.length === 0) return null;

  const name = title || 'Telegram chat';
  // Synthetic id derived from title + first message timestamp, so re-importing
  // the same export yields the same chat id.
  const chatId = stableChatId(name, messages[0]?.date_unixtime || '0');

  // Up to two distinct named senders → personal chat; three or more → group.
  // HTML alone cannot tell a private group from a public supergroup, so every
  // group is reported as `private_group`.
  const senders = new Set();
  const isGroup = messages.some(({ from }) => {
    if (from && from !== 'Unknown') senders.add(from);
    return senders.size > 2;
  });

  const chat = {
    id: chatId,
    name,
    type: isGroup ? 'private_group' : 'personal_chat',
    messages,
  };

  return {
    personal_information: { user_id: '' },
    chats: { list: [chat] },
    _source: {
      format: 'telegram-html',
      original_path: path,
      chunks: htmlFiles.length,
      messages_total: messages.length,
    },
  };
}
|
|
375
|
+
|
|
376
|
+
/**
 * Derive a non-negative numeric id from a chat title and a timestamp.
 *
 * Runs the classic "hash * 31 + char code" string hash, wrapped to a signed
 * 32-bit integer at every step, over "<title>:<firstTs>" and returns its
 * absolute value. The same inputs always give the same id.
 *
 * @param {string} title - chat title
 * @param {string} firstTs - timestamp of the first message
 * @returns {number} integer in the range 0 … 2^31
 */
function stableChatId(title, firstTs) {
  const key = title + ':' + firstTs;
  let acc = 0;
  // split('') walks UTF-16 code units, one charCodeAt value per step.
  for (const unit of key.split('')) {
    acc = (Math.imul(acc, 31) + unit.charCodeAt(0)) | 0;
  }
  return Math.abs(acc);
}
|
package/lib/parse.js
ADDED
|
@@ -0,0 +1,175 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Shared dialogue-only parser for Claude Code / Cowork JSONL.
|
|
3
|
+
*
|
|
4
|
+
* Used by both the MCP server (server.js, importing inbox files) and the
|
|
5
|
+
* ingest daemon (ingest.js, reading deltas from raw source files).
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
/** Top-level event types that are skipped because they are not dialogue. */
export const CLAUDE_CODE_SKIP_TYPES = new Set(['queue-operation', 'ai-title', 'summary']);

/** Prefixes of auto-generated user messages produced by /compact, /resume,
 *  and continuation flows. They are real messages (kept in the index), but
 *  they are never useful as conversation titles. */
export const CONTINUATION_PREFIXES = [
  'This session is being continued',
  'Continue from where you left off',
  'Please continue from where you left off',
];

/**
 * Tell whether a message text is auto-generated boilerplate rather than
 * something a person typed.
 *
 * @param {string} text - message text
 * @returns {boolean} true when the text starts with one of
 *   CONTINUATION_PREFIXES or with "<" (XML/tag-wrapped artefacts such as
 *   uploaded_files, system-reminder, command-name…). Non-string input is
 *   reported as false instead of throwing.
 */
export function isContinuationBoilerplate(text) {
  if (typeof text !== 'string') return false;
  if (CONTINUATION_PREFIXES.some((prefix) => text.startsWith(prefix))) return true;
  return text.startsWith('<');
}
|
|
26
|
+
|
|
27
|
+
/** Extract a clean dialogue message from one parsed JSONL record.
 *
 *  Two record shapes are understood:
 *    1. Flat:   {"role":"user","content":"...","timestamp":"..."}
 *               (`text` is used when `content` is undefined)
 *    2. Nested: {"type":"user","message":{"role":"user","content":"..."},...}
 *               where `content` may be a string or an array of blocks.
 *
 *  Skipped (null is returned):
 *    - non-object input
 *    - records whose `type` is in CLAUDE_CODE_SKIP_TYPES
 *    - records with an `attachment` but no `message`
 *    - records without a string role
 *    - records whose text is empty after dropping non-text blocks
 *
 *  From array content only plain strings and {type:"text", text:"..."} blocks
 *  are kept (joined with newlines); tool_use, tool_result, thinking, image
 *  and any other block type are dropped.
 *
 *  A user record flagged isCompactSummary or isVisibleInTranscriptOnly is
 *  returned with role 'summary' so callers can treat it differently from
 *  ordinary turns.
 *
 *  @param {object} obj - one parsed JSONL record
 *  @returns {{ role: string, text: string, id: *, timestamp: *, uuid: *,
 *    parentUuid: * } | null} missing id/timestamp/uuid/parentUuid are null
 */
export function extractMessageFromRecord(obj) {
  if (!obj || typeof obj !== 'object') return null;

  // Non-dialogue event types and attachment-only bookkeeping records.
  if (CLAUDE_CODE_SKIP_TYPES.has(obj.type)) return null;
  if (obj.attachment && !obj.message) return null;

  // Nested shape wins whenever `message` is an object.
  const nested = obj.message;
  const hasNested = Boolean(nested) && typeof nested === 'object';

  const declaredRole = hasNested ? nested.role : obj.role;
  if (!declaredRole || typeof declaredRole !== 'string') return null;

  let rawContent;
  if (hasNested) rawContent = nested.content;
  else rawContent = obj.content !== undefined ? obj.content : obj.text;

  // Reduce the content to dialogue-only text.
  let text = '';
  if (typeof rawContent === 'string') {
    text = rawContent;
  } else if (Array.isArray(rawContent)) {
    text = rawContent
      .flatMap((block) => {
        if (typeof block === 'string') return [block];
        const isTextBlock =
          block && typeof block === 'object' && block.type === 'text' && typeof block.text === 'string';
        return isTextBlock ? [block.text] : [];
      })
      .join('\n');
  }
  if (text.trim() === '') return null;

  // Synthetic /compact summaries arrive as user records; re-tag them.
  const isSummary =
    declaredRole === 'user' &&
    (obj.isCompactSummary === true || obj.isVisibleInTranscriptOnly === true);

  return {
    role: isSummary ? 'summary' : declaredRole,
    text,
    id: (hasNested ? nested.id : undefined) || obj.id || null,
    timestamp: obj.timestamp || (hasNested ? nested.timestamp : undefined) || null,
    uuid: obj.uuid || null,
    parentUuid: obj.parentUuid || null,
  };
}
|
|
119
|
+
|
|
120
|
+
/** Recognise a compaction boundary record.
 *
 *  Two shapes count as a boundary:
 *    1. {type:"system", subtype:"compact_boundary", compactMetadata:{...}}
 *       — metadata is read from `compactMetadata`
 *    2. {type:"compact-boundary", metadata:{...}}
 *       — metadata is read from `metadata`
 *
 *  @param {object} obj - one parsed JSONL record
 *  @returns {{ timestamp: *, uuid: *, parentUuid: *, logicalParentUuid: *,
 *    metadata: object, id: *, raw: object } | null}
 *    null when the record is not a boundary. Missing scalar fields are null,
 *    missing metadata is {}, `logicalParentUuid` falls back to the value
 *    inside metadata, and `raw` is the input record itself.
 */
export function extractCompactBoundary(obj) {
  if (!obj || typeof obj !== 'object') return null;

  const isOnDiskBoundary = obj.type === 'system' && obj.subtype === 'compact_boundary';
  const isInboxBoundary = obj.type === 'compact-boundary';
  if (!isOnDiskBoundary && !isInboxBoundary) return null;

  const metadata = (isOnDiskBoundary ? obj.compactMetadata : obj.metadata) || {};
  const { timestamp, uuid, parentUuid, logicalParentUuid, id } = obj;

  return {
    timestamp: timestamp || null,
    uuid: uuid || null,
    parentUuid: parentUuid || null,
    logicalParentUuid: logicalParentUuid || metadata.logicalParentUuid || null,
    metadata,
    id: id || null,
    raw: obj,
  };
}
|
|
163
|
+
|
|
164
|
+
/** Return the trimmed title carried by an ai-title record.
 *
 *  @param {object} obj - one parsed JSONL record
 *  @returns {string|null} the trimmed `aiTitle`, or null when the record is
 *    not of type "ai-title" or its title is missing, non-string, or blank
 */
export function extractAiTitle(obj) {
  if (!obj || obj.type !== 'ai-title') return null;
  if (typeof obj.aiTitle !== 'string') return null;

  const title = obj.aiTitle.trim();
  return title ? title : null;
}
|
|
Binary file
|
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* URL canonicalization for stable deduplication of stored web documents.
|
|
3
|
+
*
|
|
4
|
+
* Goal: two URLs that point to "the same document" should map to the same
|
|
5
|
+
* canonical form, so memex_store_document gives them the same conversation_id
|
|
6
|
+
* via sha256(canonical).
|
|
7
|
+
*
|
|
8
|
+
* What we normalize:
|
|
9
|
+
* - Lowercase scheme + host
|
|
10
|
+
* - Strip known tracking params (utm_*, fbclid, gclid, ref, mc_*, _ga, …)
|
|
11
|
+
* - Drop the fragment (#anchor) — same document
|
|
12
|
+
* - Normalize trailing slash on pathname
|
|
13
|
+
*
|
|
14
|
+
* What we DON'T normalize:
|
|
15
|
+
* - Path case (some servers are case-sensitive)
|
|
16
|
+
* - Non-tracking query params (?q= search, ?id= permalinks — meaningful)
|
|
17
|
+
* - Port (rare in public URLs)
|
|
18
|
+
*
|
|
19
|
+
* If the input isn't a valid URL, we return the input unchanged. Callers
|
|
20
|
+
* should still hash the result for deduplication.
|
|
21
|
+
*/
|
|
22
|
+
|
|
23
|
+
// Well-known tracking-param families. Case-insensitive prefix match.
const TRACKING_PREFIXES = [
  'utm_',        // Google Analytics
  'mc_',         // Mailchimp
];

// Exact tracking-param names. Lookups are done on the lower-cased parameter
// name, so every entry is lower-cased when the set is built; a mixed-case
// entry such as 'hsCtaTracking' would otherwise never match.
const TRACKING_EXACT = new Set(
  [
    'fbclid',      // Facebook
    'gclid',       // Google ads
    'dclid',       // Google DoubleClick
    'gbraid',      // Google
    'wbraid',      // Google
    'yclid',       // Yandex
    'msclkid',     // Microsoft ads
    'twclid',      // Twitter
    'igshid',      // Instagram
    'ref',         // generic referrer
    'ref_source',
    'ref_url',
    'referrer',
    'source',      // common referrer flag (NOT always tracking but very often)
    '_ga',         // Google Analytics
    '_gl',         // Google Analytics linker
    'hsCtaTracking',
    'hsenc',
    'hsmi',
    'mkt_tok',
    'pk_campaign',
    'pk_source',
    'pk_medium',
    'pk_keyword',
    'pk_content',
    'vero_id',
    'vero_conv',
  ].map((name) => name.toLowerCase()),
);

/**
 * Decide whether a query-parameter name is a known tracking parameter.
 *
 * @param {string} name - parameter name as it appears in the URL
 * @returns {boolean} true when the name, compared case-insensitively, is in
 *   TRACKING_EXACT or starts with one of TRACKING_PREFIXES
 */
function isTrackingParam(name) {
  const lower = name.toLowerCase();
  if (TRACKING_EXACT.has(lower)) return true;
  return TRACKING_PREFIXES.some((prefix) => lower.startsWith(prefix));
}
|
|
66
|
+
|
|
67
|
+
/**
 * Reduce a URL to a canonical form for deduplication.
 *
 * Steps, in order: trim and parse; lower-case scheme and host; drop the
 * fragment; rebuild the query string without tracking parameters (remaining
 * parameters keep their order); remove trailing slashes from non-root paths.
 *
 * @param {string} rawUrl
 * @returns {string} canonicalized URL. Non-string or blank input is returned
 *   unchanged; a string that does not parse as a URL is returned trimmed.
 */
export function canonicalize(rawUrl) {
  if (typeof rawUrl !== 'string' || !rawUrl.trim()) return rawUrl;

  const trimmed = rawUrl.trim();
  let url;
  try {
    url = new URL(trimmed);
  } catch (_) {
    return trimmed;
  }

  // The URL parser already lower-cases these; kept explicit on purpose.
  url.protocol = url.protocol.toLowerCase();
  url.hostname = url.hostname.toLowerCase();

  // Same document regardless of #anchor.
  url.hash = '';

  // Keep only the parameters that are not tracking noise.
  const kept = [...url.searchParams].filter(([key]) => !isTrackingParam(key));
  url.search = new URLSearchParams(kept).toString();

  // /foo and /foo/ are treated as the same document; the root "/" is left alone.
  const { pathname } = url;
  if (pathname.length > 1 && pathname.endsWith('/')) {
    url.pathname = pathname.replace(/\/+$/, '');
  }

  return url.toString();
}
|
|
103
|
+
|
|
104
|
+
/**
 * Best-effort host extraction for metadata (e.g. "perplexity.ai").
 *
 * @param {string} rawUrl
 * @returns {string|null} lower-cased hostname without a leading "www.",
 *   or null when the input is not a string or does not parse as a URL
 */
export function extractDomain(rawUrl) {
  if (typeof rawUrl !== 'string') return null;

  let host;
  try {
    host = new URL(rawUrl).hostname;
  } catch (_) {
    return null;
  }
  return host.toLowerCase().replace(/^www\./, '');
}
|