ctb 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,585 @@
1
+ /**
2
+ * Document handler for Claude Telegram Bot.
3
+ *
4
+ * Supports PDFs and text files with media group buffering.
5
+ * PDF extraction uses pdftotext CLI (install via: brew install poppler)
6
+ */
7
+
8
+ import { unlinkSync } from "node:fs";
9
+ import type { Context } from "grammy";
10
+ import { ALLOWED_USERS, TEMP_DIR } from "../config";
11
+ import { isAuthorized, rateLimiter } from "../security";
12
+ import { session } from "../session";
13
+ import { auditLog, auditLogRateLimit, startTypingIndicator } from "../utils";
14
+ import { createMediaGroupBuffer, handleProcessingError } from "./media-group";
15
+ import { createStatusCallback, StreamingState } from "./streaming";
16
+
17
+ /**
18
+ * Safely delete a temp file, ignoring errors.
19
+ */
20
+ function cleanupTempFile(filePath: string): void {
21
+ try {
22
+ unlinkSync(filePath);
23
+ } catch {
24
+ // Ignore cleanup errors
25
+ }
26
+ }
27
+
28
+ /**
29
+ * Cleanup multiple temp files.
30
+ */
31
+ function cleanupTempFiles(filePaths: string[]): void {
32
+ for (const path of filePaths) {
33
+ cleanupTempFile(path);
34
+ }
35
+ }
36
+
37
+ // Supported text file extensions
38
+ const TEXT_EXTENSIONS = [
39
+ ".md",
40
+ ".txt",
41
+ ".json",
42
+ ".yaml",
43
+ ".yml",
44
+ ".csv",
45
+ ".xml",
46
+ ".html",
47
+ ".css",
48
+ ".js",
49
+ ".ts",
50
+ ".py",
51
+ ".sh",
52
+ ".env",
53
+ ".log",
54
+ ".cfg",
55
+ ".ini",
56
+ ".toml",
57
+ ];
58
+
59
+ // Supported archive extensions
60
+ const ARCHIVE_EXTENSIONS = [".zip", ".tar", ".tar.gz", ".tgz"];
61
+
62
+ // Max file size (10MB)
63
+ const MAX_FILE_SIZE = 10 * 1024 * 1024;
64
+
65
+ // Max extracted archive size (100MB) - prevents decompression bombs
66
+ const MAX_EXTRACTED_SIZE = 100 * 1024 * 1024;
67
+
68
+ // Max content from archive (50K chars total)
69
+ const MAX_ARCHIVE_CONTENT = 50000;
70
+
71
// Media-group buffer dedicated to documents: Telegram delivers album items as
// separate messages sharing a media_group_id, and this buffer batches them
// before processing. The emoji/labels are presumably used in the buffer's
// user-facing progress messages — confirm in ./media-group.
const documentBuffer = createMediaGroupBuffer({
  emoji: "📄",
  itemLabel: "document",
  itemLabelPlural: "documents",
});
77
+
78
+ /**
79
+ * Download a document and return the local path.
80
+ */
81
+ async function downloadDocument(ctx: Context): Promise<string> {
82
+ const doc = ctx.message?.document;
83
+ if (!doc) {
84
+ throw new Error("No document in message");
85
+ }
86
+
87
+ const file = await ctx.getFile();
88
+ const fileName = doc.file_name || `doc_${Date.now()}`;
89
+
90
+ // Sanitize filename
91
+ const safeName = fileName.replace(/[^a-zA-Z0-9._-]/g, "_");
92
+ const docPath = `${TEMP_DIR}/${safeName}`;
93
+
94
+ // Download
95
+ const response = await fetch(
96
+ `https://api.telegram.org/file/bot${ctx.api.token}/${file.file_path}`,
97
+ );
98
+ const buffer = await response.arrayBuffer();
99
+ await Bun.write(docPath, buffer);
100
+
101
+ return docPath;
102
+ }
103
+
104
+ /**
105
+ * Extract text from a document.
106
+ */
107
+ async function extractText(
108
+ filePath: string,
109
+ mimeType?: string,
110
+ ): Promise<string> {
111
+ const fileName = filePath.split("/").pop() || "";
112
+ const extension = `.${(fileName.split(".").pop() || "").toLowerCase()}`;
113
+
114
+ // PDF extraction using pdftotext CLI (install: brew install poppler)
115
+ if (mimeType === "application/pdf" || extension === ".pdf") {
116
+ try {
117
+ const result = await Bun.$`pdftotext -layout ${filePath} -`.quiet();
118
+ return result.text();
119
+ } catch (error) {
120
+ console.error("PDF parsing failed:", error);
121
+ return "[PDF parsing failed - ensure pdftotext is installed: brew install poppler]";
122
+ }
123
+ }
124
+
125
+ // Text files
126
+ if (TEXT_EXTENSIONS.includes(extension) || mimeType?.startsWith("text/")) {
127
+ const text = await Bun.file(filePath).text();
128
+ // Limit to 100K chars
129
+ return text.slice(0, 100000);
130
+ }
131
+
132
+ throw new Error(`Unsupported file type: ${extension || mimeType}`);
133
+ }
134
+
135
+ /**
136
+ * Check if a file extension is an archive.
137
+ */
138
+ function isArchive(fileName: string): boolean {
139
+ const lower = fileName.toLowerCase();
140
+ return ARCHIVE_EXTENSIONS.some((ext) => lower.endsWith(ext));
141
+ }
142
+
143
+ /**
144
+ * Get archive extension from filename.
145
+ */
146
+ function getArchiveExtension(fileName: string): string {
147
+ const lower = fileName.toLowerCase();
148
+ if (lower.endsWith(".tar.gz")) return ".tar.gz";
149
+ if (lower.endsWith(".tgz")) return ".tgz";
150
+ if (lower.endsWith(".tar")) return ".tar";
151
+ if (lower.endsWith(".zip")) return ".zip";
152
+ return "";
153
+ }
154
+
155
+ /**
156
+ * Calculate total size of files in a directory.
157
+ */
158
+ async function getDirectorySize(dir: string): Promise<number> {
159
+ let totalSize = 0;
160
+ const entries = await Array.fromAsync(
161
+ new Bun.Glob("**/*").scan({ cwd: dir, dot: true }),
162
+ );
163
+
164
+ for (const entry of entries) {
165
+ try {
166
+ const file = Bun.file(`${dir}/${entry}`);
167
+ totalSize += file.size;
168
+ // Early exit if already over limit
169
+ if (totalSize > MAX_EXTRACTED_SIZE) {
170
+ return totalSize;
171
+ }
172
+ } catch {
173
+ // Skip files we can't access
174
+ }
175
+ }
176
+
177
+ return totalSize;
178
+ }
179
+
180
+ /**
181
+ * Extract an archive to a temp directory.
182
+ * Validates extracted size to prevent decompression bombs.
183
+ */
184
+ async function extractArchive(
185
+ archivePath: string,
186
+ fileName: string,
187
+ ): Promise<string> {
188
+ const ext = getArchiveExtension(fileName);
189
+ const extractDir = `${TEMP_DIR}/archive_${Date.now()}`;
190
+ await Bun.$`mkdir -p ${extractDir}`;
191
+
192
+ try {
193
+ if (ext === ".zip") {
194
+ await Bun.$`unzip -q -o ${archivePath} -d ${extractDir}`.quiet();
195
+ } else if (ext === ".tar" || ext === ".tar.gz" || ext === ".tgz") {
196
+ await Bun.$`tar -xf ${archivePath} -C ${extractDir}`.quiet();
197
+ } else {
198
+ throw new Error(`Unknown archive type: ${ext}`);
199
+ }
200
+
201
+ // Check extracted size to prevent decompression bombs
202
+ const extractedSize = await getDirectorySize(extractDir);
203
+ if (extractedSize > MAX_EXTRACTED_SIZE) {
204
+ // Clean up and throw
205
+ await Bun.$`rm -rf ${extractDir}`.quiet();
206
+ throw new Error(
207
+ `Archive too large when extracted (${Math.round(extractedSize / 1024 / 1024)}MB > ${Math.round(MAX_EXTRACTED_SIZE / 1024 / 1024)}MB limit)`,
208
+ );
209
+ }
210
+
211
+ return extractDir;
212
+ } catch (error) {
213
+ // Clean up on any error
214
+ try {
215
+ await Bun.$`rm -rf ${extractDir}`.quiet();
216
+ } catch {
217
+ // Ignore cleanup errors
218
+ }
219
+ throw error;
220
+ }
221
+ }
222
+
223
+ /**
224
+ * Build a file tree from a directory.
225
+ */
226
+ async function buildFileTree(dir: string): Promise<string[]> {
227
+ const entries = await Array.fromAsync(
228
+ new Bun.Glob("**/*").scan({ cwd: dir, dot: false }),
229
+ );
230
+ entries.sort();
231
+ return entries.slice(0, 100); // Limit to 100 files
232
+ }
233
+
234
+ /**
235
+ * Extract text content from archive files.
236
+ */
237
+ async function extractArchiveContent(extractDir: string): Promise<{
238
+ tree: string[];
239
+ contents: Array<{ name: string; content: string }>;
240
+ }> {
241
+ const tree = await buildFileTree(extractDir);
242
+ const contents: Array<{ name: string; content: string }> = [];
243
+ let totalSize = 0;
244
+
245
+ for (const relativePath of tree) {
246
+ const fullPath = `${extractDir}/${relativePath}`;
247
+ const stat = await Bun.file(fullPath).exists();
248
+ if (!stat) continue;
249
+
250
+ // Check if it's a directory
251
+ const fileInfo = Bun.file(fullPath);
252
+ const size = fileInfo.size;
253
+ if (size === 0) continue;
254
+
255
+ const ext = `.${(relativePath.split(".").pop() || "").toLowerCase()}`;
256
+ if (!TEXT_EXTENSIONS.includes(ext)) continue;
257
+
258
+ // Skip large files
259
+ if (size > 100000) continue;
260
+
261
+ try {
262
+ const text = await fileInfo.text();
263
+ const truncated = text.slice(0, 10000); // 10K per file max
264
+ if (totalSize + truncated.length > MAX_ARCHIVE_CONTENT) break;
265
+ contents.push({ name: relativePath, content: truncated });
266
+ totalSize += truncated.length;
267
+ } catch {
268
+ // Skip binary or unreadable files
269
+ }
270
+ }
271
+
272
+ return { tree, contents };
273
+ }
274
+
275
+ /**
276
+ * Process an archive file.
277
+ */
278
+ async function processArchive(
279
+ ctx: Context,
280
+ archivePath: string,
281
+ fileName: string,
282
+ caption: string | undefined,
283
+ userId: number,
284
+ username: string,
285
+ chatId: number,
286
+ ): Promise<void> {
287
+ const stopProcessing = session.startProcessing();
288
+ const typing = startTypingIndicator(ctx);
289
+
290
+ // Show extraction progress
291
+ const statusMsg = await ctx.reply(`📦 Extracting <b>${fileName}</b>...`, {
292
+ parse_mode: "HTML",
293
+ });
294
+
295
+ try {
296
+ // Extract archive
297
+ console.log(`Extracting archive: ${fileName}`);
298
+ const extractDir = await extractArchive(archivePath, fileName);
299
+ const { tree, contents } = await extractArchiveContent(extractDir);
300
+ console.log(`Extracted: ${tree.length} files, ${contents.length} readable`);
301
+
302
+ // Update status
303
+ await ctx.api.editMessageText(
304
+ statusMsg.chat.id,
305
+ statusMsg.message_id,
306
+ `📦 Extracted <b>${fileName}</b>: ${tree.length} files, ${contents.length} readable`,
307
+ { parse_mode: "HTML" },
308
+ );
309
+
310
+ // Build prompt
311
+ const treeStr = tree.length > 0 ? tree.join("\n") : "(empty)";
312
+ const contentsStr =
313
+ contents.length > 0
314
+ ? contents.map((c) => `--- ${c.name} ---\n${c.content}`).join("\n\n")
315
+ : "(no readable text files)";
316
+
317
+ const prompt = caption
318
+ ? `Archive: ${fileName}\n\nFile tree (${tree.length} files):\n${treeStr}\n\nExtracted contents:\n${contentsStr}\n\n---\n\n${caption}`
319
+ : `Please analyze this archive (${fileName}):\n\nFile tree (${tree.length} files):\n${treeStr}\n\nExtracted contents:\n${contentsStr}`;
320
+
321
+ // Create streaming state
322
+ const state = new StreamingState();
323
+ const statusCallback = createStatusCallback(ctx, state);
324
+
325
+ const response = await session.sendMessageStreaming(
326
+ prompt,
327
+ username,
328
+ userId,
329
+ statusCallback,
330
+ chatId,
331
+ ctx,
332
+ );
333
+
334
+ await auditLog(
335
+ userId,
336
+ username,
337
+ "ARCHIVE",
338
+ `[${fileName}] ${caption || ""}`,
339
+ response,
340
+ );
341
+
342
+ // Cleanup
343
+ await Bun.$`rm -rf ${extractDir}`.quiet();
344
+
345
+ // Delete status message
346
+ try {
347
+ await ctx.api.deleteMessage(statusMsg.chat.id, statusMsg.message_id);
348
+ } catch {
349
+ // Ignore deletion errors
350
+ }
351
+ } catch (error) {
352
+ console.error("Archive processing error:", error);
353
+ // Delete status message on error
354
+ try {
355
+ await ctx.api.deleteMessage(statusMsg.chat.id, statusMsg.message_id);
356
+ } catch {
357
+ // Ignore
358
+ }
359
+ await ctx.reply(
360
+ `❌ Failed to process archive: ${String(error).slice(0, 100)}`,
361
+ );
362
+ } finally {
363
+ stopProcessing();
364
+ typing.stop();
365
+ // Clean up archive file
366
+ cleanupTempFile(archivePath);
367
+ }
368
+ }
369
+
370
+ /**
371
+ * Process documents with Claude.
372
+ */
373
+ async function processDocuments(
374
+ ctx: Context,
375
+ documents: Array<{ path: string; name: string; content: string }>,
376
+ caption: string | undefined,
377
+ userId: number,
378
+ username: string,
379
+ chatId: number,
380
+ ): Promise<void> {
381
+ // Mark processing started
382
+ const stopProcessing = session.startProcessing();
383
+
384
+ // Build prompt
385
+ let prompt: string;
386
+ if (documents.length === 1 && documents[0]) {
387
+ const doc = documents[0];
388
+ prompt = caption
389
+ ? `Document: ${doc.name}\n\nContent:\n${doc.content}\n\n---\n\n${caption}`
390
+ : `Please analyze this document (${doc.name}):\n\n${doc.content}`;
391
+ } else {
392
+ const docList = documents
393
+ .map((d, i) => `--- Document ${i + 1}: ${d.name} ---\n${d.content}`)
394
+ .join("\n\n");
395
+ prompt = caption
396
+ ? `${documents.length} Documents:\n\n${docList}\n\n---\n\n${caption}`
397
+ : `Please analyze these ${documents.length} documents:\n\n${docList}`;
398
+ }
399
+
400
+ // Start typing
401
+ const typing = startTypingIndicator(ctx);
402
+
403
+ // Create streaming state
404
+ const state = new StreamingState();
405
+ const statusCallback = createStatusCallback(ctx, state);
406
+
407
+ try {
408
+ const response = await session.sendMessageStreaming(
409
+ prompt,
410
+ username,
411
+ userId,
412
+ statusCallback,
413
+ chatId,
414
+ ctx,
415
+ );
416
+
417
+ await auditLog(
418
+ userId,
419
+ username,
420
+ "DOCUMENT",
421
+ `[${documents.length} docs] ${caption || ""}`,
422
+ response,
423
+ );
424
+ } catch (error) {
425
+ await handleProcessingError(ctx, error, state.toolMessages);
426
+ } finally {
427
+ stopProcessing();
428
+ typing.stop();
429
+ // Clean up temp files
430
+ cleanupTempFiles(documents.map((d) => d.path));
431
+ }
432
+ }
433
+
434
+ /**
435
+ * Process document paths by extracting text and calling processDocuments.
436
+ */
437
+ async function processDocumentPaths(
438
+ ctx: Context,
439
+ paths: string[],
440
+ caption: string | undefined,
441
+ userId: number,
442
+ username: string,
443
+ chatId: number,
444
+ ): Promise<void> {
445
+ // Extract text from all documents
446
+ const documents: Array<{ path: string; name: string; content: string }> = [];
447
+
448
+ for (const path of paths) {
449
+ try {
450
+ const name = path.split("/").pop() || "document";
451
+ const content = await extractText(path);
452
+ documents.push({ path, name, content });
453
+ } catch (error) {
454
+ console.error(`Failed to extract ${path}:`, error);
455
+ }
456
+ }
457
+
458
+ if (documents.length === 0) {
459
+ await ctx.reply("❌ Failed to extract any documents.");
460
+ return;
461
+ }
462
+
463
+ await processDocuments(ctx, documents, caption, userId, username, chatId);
464
+ }
465
+
466
+ /**
467
+ * Handle incoming document messages.
468
+ */
469
+ export async function handleDocument(ctx: Context): Promise<void> {
470
+ const userId = ctx.from?.id;
471
+ const username = ctx.from?.username || "unknown";
472
+ const chatId = ctx.chat?.id;
473
+ const doc = ctx.message?.document;
474
+ const mediaGroupId = ctx.message?.media_group_id;
475
+
476
+ if (!userId || !chatId || !doc) {
477
+ return;
478
+ }
479
+
480
+ // 1. Authorization check
481
+ if (!isAuthorized(userId, ALLOWED_USERS)) {
482
+ await ctx.reply("Unauthorized. Contact the bot owner for access.");
483
+ return;
484
+ }
485
+
486
+ // 2. Check file size
487
+ if (doc.file_size && doc.file_size > MAX_FILE_SIZE) {
488
+ await ctx.reply("❌ File too large. Maximum size is 10MB.");
489
+ return;
490
+ }
491
+
492
+ // 3. Check file type
493
+ const fileName = doc.file_name || "";
494
+ const extension = `.${(fileName.split(".").pop() || "").toLowerCase()}`;
495
+ const isPdf = doc.mime_type === "application/pdf" || extension === ".pdf";
496
+ const isText =
497
+ TEXT_EXTENSIONS.includes(extension) || doc.mime_type?.startsWith("text/");
498
+ const isArchiveFile = isArchive(fileName);
499
+
500
+ if (!isPdf && !isText && !isArchiveFile) {
501
+ await ctx.reply(
502
+ `❌ Unsupported file type: ${extension || doc.mime_type}\n\n` +
503
+ `Supported: PDF, archives (${ARCHIVE_EXTENSIONS.join(
504
+ ", ",
505
+ )}), ${TEXT_EXTENSIONS.join(", ")}`,
506
+ );
507
+ return;
508
+ }
509
+
510
+ // 4. Download document
511
+ let docPath: string;
512
+ try {
513
+ docPath = await downloadDocument(ctx);
514
+ } catch (error) {
515
+ console.error("Failed to download document:", error);
516
+ await ctx.reply("❌ Failed to download document.");
517
+ return;
518
+ }
519
+
520
+ // 5. Archive files - process separately (no media group support)
521
+ if (isArchiveFile) {
522
+ console.log(`Received archive: ${fileName} from @${username}`);
523
+ const [allowed, retryAfter] = rateLimiter.check(userId);
524
+ if (!allowed && retryAfter !== undefined) {
525
+ await auditLogRateLimit(userId, username, retryAfter);
526
+ await ctx.reply(
527
+ `⏳ Rate limited. Please wait ${retryAfter.toFixed(1)} seconds.`,
528
+ );
529
+ return;
530
+ }
531
+
532
+ await processArchive(
533
+ ctx,
534
+ docPath,
535
+ fileName,
536
+ ctx.message?.caption,
537
+ userId,
538
+ username,
539
+ chatId,
540
+ );
541
+ return;
542
+ }
543
+
544
+ // 6. Single document - process immediately
545
+ if (!mediaGroupId) {
546
+ console.log(`Received document: ${fileName} from @${username}`);
547
+ // Rate limit
548
+ const [allowed, retryAfter] = rateLimiter.check(userId);
549
+ if (!allowed && retryAfter !== undefined) {
550
+ await auditLogRateLimit(userId, username, retryAfter);
551
+ await ctx.reply(
552
+ `⏳ Rate limited. Please wait ${retryAfter.toFixed(1)} seconds.`,
553
+ );
554
+ return;
555
+ }
556
+
557
+ try {
558
+ const content = await extractText(docPath, doc.mime_type);
559
+ await processDocuments(
560
+ ctx,
561
+ [{ path: docPath, name: fileName, content }],
562
+ ctx.message?.caption,
563
+ userId,
564
+ username,
565
+ chatId,
566
+ );
567
+ } catch (error) {
568
+ console.error("Failed to extract document:", error);
569
+ await ctx.reply(
570
+ `❌ Failed to process document: ${String(error).slice(0, 100)}`,
571
+ );
572
+ }
573
+ return;
574
+ }
575
+
576
+ // 7. Media group - buffer with timeout
577
+ await documentBuffer.addToGroup(
578
+ mediaGroupId,
579
+ docPath,
580
+ ctx,
581
+ userId,
582
+ username,
583
+ processDocumentPaths,
584
+ );
585
+ }
@@ -0,0 +1,21 @@
1
/**
 * Handler exports for Claude Telegram Bot.
 *
 * Barrel module: re-exports every update/command handler plus the streaming
 * helpers so callers can import them all from one path.
 */

// Inline-keyboard callback handler
export { handleCallback } from "./callback";
// Slash-command handlers
export {
  handleBookmarks,
  handleCd,
  handleNew,
  handleRestart,
  handleResume,
  handleRetry,
  handleStart,
  handleStatus,
  handleStop,
} from "./commands";
// Media and text handlers
export { handleDocument } from "./document";
export { handlePhoto } from "./photo";
// Streaming helpers shared by the handlers above
export { createStatusCallback, StreamingState } from "./streaming";
export { handleText } from "./text";
export { handleVoice } from "./voice";