multis 0.1.0

package/src/index.js ADDED
@@ -0,0 +1,71 @@
+ const { loadConfig, ensureMultisDir } = require('./config');
+ const { logAudit } = require('./governance/audit');
+ const { createMessageRouter } = require('./bot/handlers');
+ const { TelegramPlatform } = require('./platforms/telegram');
+ const { BeeperPlatform } = require('./platforms/beeper');
+
+ async function main() {
+   ensureMultisDir();
+   const config = loadConfig();
+
+   console.log('multis v0.1.0');
+   console.log(`Pairing code: ${config.pairing_code}`);
+   console.log(`Paired users: ${config.allowed_users.length}`);
+   console.log(`LLM provider: ${config.llm.provider}`);
+
+   const handler = createMessageRouter(config);
+   const platforms = [];
+
+   // Telegram — enabled by default (backward compat)
+   if (config.platforms?.telegram?.enabled !== false) {
+     try {
+       const telegram = new TelegramPlatform(config);
+       telegram.onMessage(handler);
+       platforms.push(telegram);
+     } catch (err) {
+       console.error(`Telegram: ${err.message}`);
+     }
+   }
+
+   // Beeper — opt-in
+   if (config.platforms?.beeper?.enabled) {
+     try {
+       const beeper = new BeeperPlatform(config);
+       beeper.onMessage(handler);
+       platforms.push(beeper);
+     } catch (err) {
+       console.error(`Beeper: ${err.message}`);
+     }
+   }
+
+   if (platforms.length === 0) {
+     console.error('No platforms configured. Set up at least one platform.');
+     process.exit(1);
+   }
+
+   logAudit({ action: 'bot_start', platforms: platforms.map(p => p.name), paired_users: config.allowed_users.length });
+
+   for (const p of platforms) {
+     await p.start();
+   }
+
+   console.log(`Running on: ${platforms.map(p => p.name).join(', ')}`);
+
+   // Graceful shutdown
+   const shutdown = async (signal) => {
+     console.log(`\nShutting down (${signal})...`);
+     logAudit({ action: 'bot_stop', reason: signal });
+     for (const p of platforms) {
+       await p.stop();
+     }
+     process.exit(0);
+   };
+
+   process.once('SIGINT', () => shutdown('SIGINT'));
+   process.once('SIGTERM', () => shutdown('SIGTERM'));
+ }
+
+ main().catch(err => {
+   console.error('Fatal:', err.message);
+   process.exit(1);
+ });
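
For orientation, main() only reads a handful of config fields. Below is a minimal sketch of the shape loadConfig() would need to return; the key names are taken from the reads above, but the values and any additional keys (tokens, file location) are assumptions, since src/config.js is not part of this diff.

// Hypothetical config shape — field names from main()'s reads; values illustrative.
const exampleConfig = {
  pairing_code: 'ABC123',
  allowed_users: [123456789],
  llm: { provider: 'anthropic' },
  platforms: {
    telegram: { enabled: true },  // on unless explicitly set to false
    beeper: { enabled: false }    // opt-in
  }
};
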
@@ -0,0 +1,68 @@
+ const crypto = require('crypto');
+
+ /**
+  * DocChunk - represents a document section/chunk.
+  * Ported from aurora_core.chunks.DocChunk (Python dataclass).
+  */
+ class DocChunk {
+   constructor({
+     chunkId = null,
+     filePath,
+     pageStart = 0,
+     pageEnd = 0,
+     elementType = 'paragraph', // toc_entry, section, paragraph, table
+     name = '',
+     content = '',
+     parentChunkId = null,
+     sectionPath = [], // breadcrumb array: ["Chapter 1", "Section 1.2"]
+     sectionLevel = 0, // heading depth 1-5, 0 = body
+     documentType = 'unknown', // pdf, docx, md, txt
+     metadata = {},
+     createdAt = null,
+     updatedAt = null
+   }) {
+     this.chunkId = chunkId || DocChunk.generateId(filePath, name, content);
+     this.filePath = filePath;
+     this.pageStart = pageStart;
+     this.pageEnd = pageEnd;
+     this.elementType = elementType;
+     this.name = name;
+     this.content = content;
+     this.parentChunkId = parentChunkId;
+     this.sectionPath = sectionPath;
+     this.sectionLevel = sectionLevel;
+     this.documentType = documentType;
+     this.metadata = metadata;
+     this.createdAt = createdAt || new Date().toISOString();
+     this.updatedAt = updatedAt || new Date().toISOString();
+   }
+
+   static generateId(filePath, name, content) {
+     const hash = crypto.createHash('sha256')
+       .update(`${filePath}:${name}:${content.slice(0, 200)}`)
+       .digest('hex')
+       .slice(0, 16);
+     return `doc:${hash}`;
+   }
+
+   toJSON() {
+     return {
+       chunk_id: this.chunkId,
+       file_path: this.filePath,
+       page_start: this.pageStart,
+       page_end: this.pageEnd,
+       element_type: this.elementType,
+       name: this.name,
+       content: this.content,
+       parent_chunk_id: this.parentChunkId,
+       section_path: this.sectionPath,
+       section_level: this.sectionLevel,
+       document_type: this.documentType,
+       metadata: this.metadata,
+       created_at: this.createdAt,
+       updated_at: this.updatedAt
+     };
+   }
+ }
+
+ module.exports = { DocChunk };
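
Usage sketch: omitting chunkId lets generateId derive a stable, content-addressed id from the path, the name, and the first 200 characters of content, and toJSON() emits the snake_case record shape shown above. The file path below is illustrative.

const { DocChunk } = require('./chunk');

const chunk = new DocChunk({
  filePath: '/docs/guide.md', // illustrative
  elementType: 'section',
  name: 'Installation',
  content: 'Run npm install multis to get started.',
  sectionPath: ['Guide', 'Installation'],
  sectionLevel: 2,
  documentType: 'md'
});

console.log(chunk.chunkId);               // "doc:" + first 16 hex chars of a sha256
console.log(chunk.toJSON().section_path); // [ 'Guide', 'Installation' ]
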
@@ -0,0 +1,87 @@
+ const { DocChunk } = require('./chunk');
+
+ /**
+  * DocumentChunker - section-aware chunk splitting with overlap.
+  * Ported from aurora_context_doc.chunker.DocumentChunker (Python).
+  */
+ class DocumentChunker {
+   constructor({ maxChunkSize = 2000, overlap = 200 } = {}) {
+     this.maxChunkSize = maxChunkSize;
+     this.overlap = overlap;
+   }
+
+   /**
+    * Split a large chunk into smaller overlapping pieces at sentence boundaries.
+    * @param {DocChunk} chunk
+    * @returns {DocChunk[]}
+    */
+   splitLarge(chunk) {
+     if (chunk.content.length <= this.maxChunkSize) {
+       return [chunk];
+     }
+
+     const chunks = [];
+     const content = chunk.content;
+     let start = 0;
+     let partNum = 0;
+
+     while (start < content.length) {
+       let end = Math.min(start + this.maxChunkSize, content.length);
+
+       // Try to break at sentence boundary
+       if (end < content.length) {
+         for (const marker of ['. ', '! ', '? ', '\n\n', '\n']) {
+           const lastBreak = content.lastIndexOf(marker, end);
+           if (lastBreak > start) {
+             end = lastBreak + marker.length;
+             break;
+           }
+         }
+       }
+
+       const sliceContent = content.slice(start, end).trim();
+       if (sliceContent) {
+         chunks.push(new DocChunk({
+           chunkId: `${chunk.chunkId}-p${partNum}`,
+           filePath: chunk.filePath,
+           pageStart: chunk.pageStart,
+           pageEnd: chunk.pageEnd,
+           elementType: chunk.elementType,
+           name: `${chunk.name} (part ${partNum + 1})`,
+           content: sliceContent,
+           parentChunkId: chunk.parentChunkId,
+           sectionPath: chunk.sectionPath,
+           sectionLevel: chunk.sectionLevel,
+           documentType: chunk.documentType,
+           metadata: chunk.metadata,
+           createdAt: chunk.createdAt,
+           updatedAt: chunk.updatedAt
+         }));
+       }
+
+       if (end >= content.length) break;
+
+       // Move forward with overlap, ensure progress
+       const nextStart = end - this.overlap;
+       start = nextStart <= start ? start + 1 : nextStart;
+       partNum++;
+     }
+
+     return chunks;
+   }
+
+   /**
+    * Process an array of chunks: split large ones.
+    * @param {DocChunk[]} chunks
+    * @returns {DocChunk[]}
+    */
+   process(chunks) {
+     const result = [];
+     for (const chunk of chunks) {
+       result.push(...this.splitLarge(chunk));
+     }
+     return result;
+   }
+ }
+
+ module.exports = { DocumentChunker };
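
To make the split behaviour concrete: each part ends at the latest sentence or line break at or before maxChunkSize, and the next part restarts overlap characters earlier, so neighbouring parts share context. A sketch with deliberately tiny limits (values chosen only to make the split visible; the path is illustrative):

const { DocumentChunker } = require('./chunker');
const { DocChunk } = require('./chunk');

const chunker = new DocumentChunker({ maxChunkSize: 50, overlap: 10 });

const big = new DocChunk({
  filePath: '/tmp/example.txt', // illustrative
  name: 'Body',
  content: 'First sentence here. Second sentence follows. Third one closes it out.',
  documentType: 'txt'
});

const parts = chunker.process([big]);
// Part ids derive from the parent: "<parentId>-p0", "<parentId>-p1", ...
for (const p of parts) console.log(p.chunkId, p.name, JSON.stringify(p.content));
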
@@ -0,0 +1,150 @@
+ const fs = require('fs');
+ const path = require('path');
+ const { getParser } = require('./parsers');
+ const { DocumentChunker } = require('./chunker');
+ const { DocumentStore } = require('./store');
+ const { logAudit } = require('../governance/audit');
+
+ /**
+  * DocumentIndexer - orchestrates parsing, chunking, and storage.
+  * Ported from aurora_context_doc.indexer.DocumentIndexer (Python).
+  */
+ class DocumentIndexer {
+   constructor(store = null) {
+     this.store = store || new DocumentStore();
+     this.chunker = new DocumentChunker();
+   }
+
+   /**
+    * Index a single file: parse → chunk → store
+    * @param {string} filePath - Path to document
+    * @returns {Promise<number>} - Number of chunks created
+    */
+   async indexFile(filePath) {
+     const resolved = path.resolve(filePath);
+
+     if (!fs.existsSync(resolved)) {
+       throw new Error(`File not found: ${filePath}`);
+     }
+
+     const parser = getParser(resolved);
+     if (!parser) {
+       const ext = path.extname(resolved);
+       throw new Error(`Unsupported file type: ${ext}. Supported: .pdf, .docx, .md, .txt`);
+     }
+
+     // Delete existing chunks for this file (re-index)
+     this.store.deleteByFile(resolved);
+
+     // Parse
+     const rawChunks = await parser(resolved);
+     if (!rawChunks || rawChunks.length === 0) {
+       return 0;
+     }
+
+     // Chunk (split large sections)
+     const processed = this.chunker.process(rawChunks);
+
+     // Store
+     this.store.saveChunks(processed);
+
+     logAudit({
+       action: 'index_file',
+       file: resolved,
+       raw_chunks: rawChunks.length,
+       stored_chunks: processed.length
+     });
+
+     return processed.length;
+   }
+
+   /**
+    * Index a buffer (e.g. from Telegram file upload)
+    * @param {Buffer} buffer - File contents
+    * @param {string} filename - Original filename
+    * @returns {Promise<number>} - Number of chunks created
+    */
+   async indexBuffer(buffer, filename) {
+     // Write to temp file, index it, then clean up
+     const tmpDir = path.join(require('../config').MULTIS_DIR, 'tmp');
+     if (!fs.existsSync(tmpDir)) {
+       fs.mkdirSync(tmpDir, { recursive: true });
+     }
+
+     // basename() guards against path traversal in an attacker-supplied filename
+     const tmpPath = path.join(tmpDir, path.basename(filename));
+     fs.writeFileSync(tmpPath, buffer);
+
+     try {
+       const count = await this.indexFile(tmpPath);
+       return count;
+     } finally {
+       // Clean up temp file
+       if (fs.existsSync(tmpPath)) {
+         fs.unlinkSync(tmpPath);
+       }
+     }
+   }
+
+   /**
+    * Index all supported files in a directory
+    * @param {string} dirPath - Directory path
+    * @param {boolean} recursive - Recurse into subdirectories
+    * @returns {Promise<{files: number, chunks: number}>}
+    */
+   async indexDirectory(dirPath, recursive = true) {
+     const resolved = path.resolve(dirPath);
+
+     if (!fs.existsSync(resolved)) {
+       throw new Error(`Directory not found: ${dirPath}`);
+     }
+
+     const supportedExts = ['.pdf', '.docx', '.md', '.txt'];
+     let totalFiles = 0;
+     let totalChunks = 0;
+
+     const entries = fs.readdirSync(resolved, { withFileTypes: true });
+
+     for (const entry of entries) {
+       const fullPath = path.join(resolved, entry.name);
+
+       if (entry.isDirectory() && recursive) {
+         const sub = await this.indexDirectory(fullPath, true);
+         totalFiles += sub.files;
+         totalChunks += sub.chunks;
+       } else if (entry.isFile() && supportedExts.includes(path.extname(entry.name).toLowerCase())) {
+         try {
+           const count = await this.indexFile(fullPath);
+           totalFiles++;
+           totalChunks += count;
+         } catch (err) {
+           logAudit({ action: 'index_error', file: fullPath, error: err.message });
+         }
+       }
+     }
+
+     return { files: totalFiles, chunks: totalChunks };
+   }
+
+   /**
+    * Search indexed documents
+    * @param {string} query - Search query
+    * @param {number} limit - Max results
+    * @returns {Array} - Matching chunks
+    */
+   search(query, limit = 5) {
+     return this.store.search(query, limit);
+   }
+
+   /**
+    * Get indexing stats
+    */
+   getStats() {
+     return this.store.getStats();
+   }
+
+   close() {
+     this.store.close();
+   }
+ }
+
+ module.exports = { DocumentIndexer };
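
End to end, the indexer is the only entry point a caller needs: point it at a file or directory, then query. A sketch of the expected call pattern; the directory path is illustrative, and the search result shape comes from DocumentStore in store.js, which this diff does not include:

const { DocumentIndexer } = require('./indexer');

async function demo() {
  const indexer = new DocumentIndexer(); // default DocumentStore

  // Re-indexing a file first deletes its old chunks, so this is idempotent.
  const { files, chunks } = await indexer.indexDirectory('./docs'); // illustrative path
  console.log(`Indexed ${files} files into ${chunks} chunks`);

  const hits = indexer.search('pairing code', 3);
  console.log(hits);

  indexer.close();
}

demo().catch(console.error);
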
@@ -0,0 +1,299 @@
+ const fs = require('fs');
+ const path = require('path');
+ const { DocChunk } = require('./chunk');
+
+ /**
+  * PDF Parser - uses pdf-parse to extract text, creates page-level chunks.
+  * Ported from aurora_context_doc.parser.pdf (PyMuPDF).
+  * pdf-parse is simpler than PyMuPDF — no TOC extraction, no font detection,
+  * and no clean per-page text, so we parse once and approximate page
+  * boundaries from the full text and the page count.
+  */
+ async function parsePDF(filePath) {
+   const pdfParse = require('pdf-parse');
+   const buffer = fs.readFileSync(filePath);
+   const absPath = path.resolve(filePath);
+   const chunks = [];
+
+   const pdf = await pdfParse(buffer);
+   const fullText = pdf.text;
+   const numPages = pdf.numpages;
+
+   if (numPages <= 1 || fullText.length < 500) {
+     // Single chunk for small docs
+     if (fullText.trim()) {
+       chunks.push(new DocChunk({
+         filePath: absPath,
+         pageStart: 1,
+         pageEnd: numPages,
+         elementType: 'section',
+         name: path.basename(filePath),
+         content: fullText.trim(),
+         sectionPath: [path.basename(filePath)],
+         sectionLevel: 1,
+         documentType: 'pdf'
+       }));
+     }
+   } else {
+     // Split full text roughly by page count
+     // This is approximate — pdf-parse doesn't give clean page breaks
+     const avgChars = Math.ceil(fullText.length / numPages);
+     for (let i = 0; i < numPages; i++) {
+       const start = i * avgChars;
+       const end = Math.min((i + 1) * avgChars, fullText.length);
+       const pageText = fullText.slice(start, end).trim();
+
+       if (!pageText) continue;
+
+       chunks.push(new DocChunk({
+         filePath: absPath,
+         pageStart: i + 1,
+         pageEnd: i + 1,
+         elementType: 'paragraph',
+         name: `Page ${i + 1}`,
+         content: pageText,
+         sectionPath: [path.basename(filePath), `Page ${i + 1}`],
+         sectionLevel: 1,
+         documentType: 'pdf'
+       }));
+     }
+   }
+
+   return chunks;
+ }
+
+ /**
+  * DOCX Parser - uses mammoth to extract HTML, then parses headings for hierarchy.
+  * Ported from aurora_context_doc.parser.docx (python-docx).
+  * mammoth converts to HTML — we parse headings from that.
+  */
+ async function parseDOCX(filePath) {
+   const mammoth = require('mammoth');
+   const absPath = path.resolve(filePath);
+
+   const result = await mammoth.convertToHtml({ path: filePath });
+   const html = result.value;
+
+   // Parse HTML for headings and content blocks
+   const chunks = [];
+   const sectionStack = []; // track current heading hierarchy
+
+   // Split HTML by heading tags
+   const parts = html.split(/(<h[1-6][^>]*>.*?<\/h[1-6]>)/gi);
+
+   let currentContent = '';
+   let currentName = path.basename(filePath);
+   let currentLevel = 0;
+
+   for (const part of parts) {
+     const headingMatch = part.match(/<h([1-6])[^>]*>(.*?)<\/h[1-6]>/i);
+
+     if (headingMatch) {
+       // Save previous section if it has content
+       if (currentContent.trim()) {
+         const cleanContent = stripHtml(currentContent);
+         if (cleanContent) {
+           chunks.push(new DocChunk({
+             filePath: absPath,
+             elementType: currentLevel > 0 ? 'section' : 'paragraph',
+             name: currentName,
+             content: cleanContent,
+             sectionPath: sectionStack.map(s => s.name).concat(currentLevel > 0 ? [] : [currentName]),
+             sectionLevel: currentLevel,
+             documentType: 'docx'
+           }));
+         }
+       }
+
+       // Start new section
+       const level = parseInt(headingMatch[1], 10);
+       const title = stripHtml(headingMatch[2]);
+
+       // Update section stack
+       while (sectionStack.length > 0 && sectionStack[sectionStack.length - 1].level >= level) {
+         sectionStack.pop();
+       }
+       sectionStack.push({ level, name: title });
+
+       currentName = title;
+       currentLevel = level;
+       currentContent = '';
+     } else {
+       currentContent += part;
+     }
+   }
+
+   // Save last section
+   if (currentContent.trim()) {
+     const cleanContent = stripHtml(currentContent);
+     if (cleanContent) {
+       chunks.push(new DocChunk({
+         filePath: absPath,
+         elementType: currentLevel > 0 ? 'section' : 'paragraph',
+         name: currentName,
+         content: cleanContent,
+         sectionPath: sectionStack.map(s => s.name),
+         sectionLevel: currentLevel,
+         documentType: 'docx'
+       }));
+     }
+   }
+
+   // If no headings found, treat as single chunk
+   if (chunks.length === 0 && html.trim()) {
+     const cleanContent = stripHtml(html);
+     if (cleanContent) {
+       chunks.push(new DocChunk({
+         filePath: absPath,
+         elementType: 'paragraph',
+         name: path.basename(filePath),
+         content: cleanContent,
+         sectionPath: [path.basename(filePath)],
+         sectionLevel: 0,
+         documentType: 'docx'
+       }));
+     }
+   }
+
+   return chunks;
+ }
+
+ /**
+  * Markdown Parser - native, splits by headings.
+  * No external dependency needed.
+  */
+ function parseMD(filePath) {
+   const absPath = path.resolve(filePath);
+   const content = fs.readFileSync(filePath, 'utf8');
+   const lines = content.split('\n');
+   const chunks = [];
+   const sectionStack = [];
+
+   let currentContent = '';
+   let currentName = path.basename(filePath);
+   let currentLevel = 0;
+
+   for (const line of lines) {
+     const headingMatch = line.match(/^(#{1,6})\s+(.+)/);
+
+     if (headingMatch) {
+       // Save previous section
+       if (currentContent.trim()) {
+         chunks.push(new DocChunk({
+           filePath: absPath,
+           elementType: currentLevel > 0 ? 'section' : 'paragraph',
+           name: currentName,
+           content: currentContent.trim(),
+           sectionPath: sectionStack.map(s => s.name),
+           sectionLevel: currentLevel,
+           documentType: 'md'
+         }));
+       }
+
+       const level = headingMatch[1].length;
+       const title = headingMatch[2].trim();
+
+       while (sectionStack.length > 0 && sectionStack[sectionStack.length - 1].level >= level) {
+         sectionStack.pop();
+       }
+       sectionStack.push({ level, name: title });
+
+       currentName = title;
+       currentLevel = level;
+       currentContent = '';
+     } else {
+       currentContent += line + '\n';
+     }
+   }
+
+   // Save last section
+   if (currentContent.trim()) {
+     chunks.push(new DocChunk({
+       filePath: absPath,
+       elementType: currentLevel > 0 ? 'section' : 'paragraph',
+       name: currentName,
+       content: currentContent.trim(),
+       sectionPath: sectionStack.map(s => s.name),
+       sectionLevel: currentLevel,
+       documentType: 'md'
+     }));
+   }
+
+   if (chunks.length === 0 && content.trim()) {
+     chunks.push(new DocChunk({
+       filePath: absPath,
+       elementType: 'paragraph',
+       name: path.basename(filePath),
+       content: content.trim(),
+       sectionPath: [path.basename(filePath)],
+       sectionLevel: 0,
+       documentType: 'md'
+     }));
+   }
+
+   return chunks;
+ }
+
+ /**
+  * Plain text parser - single chunk per file
+  */
+ function parseTXT(filePath) {
+   const absPath = path.resolve(filePath);
+   const content = fs.readFileSync(filePath, 'utf8');
+
+   if (!content.trim()) return [];
+
+   return [new DocChunk({
+     filePath: absPath,
+     elementType: 'paragraph',
+     name: path.basename(filePath),
+     content: content.trim(),
+     sectionPath: [path.basename(filePath)],
+     sectionLevel: 0,
+     documentType: 'txt'
+   })];
+ }
+
+ /**
+  * Get the appropriate parser for a file extension
+  */
+ function getParser(filePath) {
+   const ext = path.extname(filePath).toLowerCase().slice(1);
+   switch (ext) {
+     case 'pdf': return parsePDF;
+     case 'docx': return parseDOCX;
+     case 'md': return parseMD;
+     case 'txt': return parseTXT;
+     default: return null;
+   }
+ }
+
+ /** Strip HTML tags and decode entities */
+ function stripHtml(html) {
+   return html
+     .replace(/<[^>]+>/g, '')
+     .replace(/&lt;/g, '<')
+     .replace(/&gt;/g, '>')
+     .replace(/&quot;/g, '"')
+     .replace(/&#39;/g, "'")
+     .replace(/&nbsp;/g, ' ')
+     .replace(/&amp;/g, '&') // decode &amp; last so "&amp;lt;" doesn't double-decode to "<"
+     .replace(/\s+/g, ' ')
+     .trim();
+ }
+
+ module.exports = { parsePDF, parseDOCX, parseMD, parseTXT, getParser };
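
Taken together, getParser is the dispatch point the indexer relies on, and every parser returns the same DocChunk[] shape regardless of format. A usage sketch (the path is illustrative; note that parsePDF and parseDOCX require the optional pdf-parse and mammoth packages, since they are loaded lazily at call time):

const { getParser } = require('./parsers');

async function parseOne(filePath) {
  const parser = getParser(filePath);
  if (!parser) throw new Error(`No parser for ${filePath}`);

  // parseMD/parseTXT are synchronous, parsePDF/parseDOCX are async;
  // awaiting the result works uniformly for both.
  const chunks = await parser(filePath);
  for (const c of chunks) {
    console.log(c.sectionLevel, c.sectionPath.join(' > '), `(${c.content.length} chars)`);
  }
  return chunks;
}

parseOne('./README.md').catch(console.error); // illustrative path
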