multis 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.env.example +19 -0
- package/CLAUDE.md +66 -0
- package/README.md +98 -0
- package/package.json +32 -0
- package/skills/capture.md +60 -0
- package/skills/files.md +38 -0
- package/skills/shell.md +53 -0
- package/skills/weather.md +32 -0
- package/src/bot/handlers.js +712 -0
- package/src/bot/telegram.js +51 -0
- package/src/cli/setup-beeper.js +239 -0
- package/src/config.js +157 -0
- package/src/governance/audit.js +95 -0
- package/src/governance/validate.js +99 -0
- package/src/index.js +71 -0
- package/src/indexer/chunk.js +68 -0
- package/src/indexer/chunker.js +87 -0
- package/src/indexer/index.js +150 -0
- package/src/indexer/parsers.js +299 -0
- package/src/indexer/store.js +256 -0
- package/src/llm/anthropic.js +106 -0
- package/src/llm/base.js +38 -0
- package/src/llm/client.js +34 -0
- package/src/llm/ollama.js +148 -0
- package/src/llm/openai.js +107 -0
- package/src/llm/prompts.js +71 -0
- package/src/memory/capture.js +85 -0
- package/src/memory/manager.js +123 -0
- package/src/platforms/base.js +38 -0
- package/src/platforms/beeper.js +238 -0
- package/src/platforms/message.js +61 -0
- package/src/platforms/telegram.js +95 -0
- package/src/skills/executor.js +125 -0
package/src/index.js
ADDED
@@ -0,0 +1,71 @@
const { loadConfig, ensureMultisDir } = require('./config');
const { logAudit } = require('./governance/audit');
const { createMessageRouter } = require('./bot/handlers');
const { TelegramPlatform } = require('./platforms/telegram');
const { BeeperPlatform } = require('./platforms/beeper');

async function main() {
  ensureMultisDir();
  const config = loadConfig();

  console.log('multis v0.1.0');
  console.log(`Pairing code: ${config.pairing_code}`);
  console.log(`Paired users: ${config.allowed_users.length}`);
  console.log(`LLM provider: ${config.llm.provider}`);

  const handler = createMessageRouter(config);
  const platforms = [];

  // Telegram — enabled by default (backward compat)
  if (config.platforms?.telegram?.enabled !== false) {
    try {
      const telegram = new TelegramPlatform(config);
      telegram.onMessage(handler);
      platforms.push(telegram);
    } catch (err) {
      console.error(`Telegram: ${err.message}`);
    }
  }

  // Beeper — opt-in
  if (config.platforms?.beeper?.enabled) {
    try {
      const beeper = new BeeperPlatform(config);
      beeper.onMessage(handler);
      platforms.push(beeper);
    } catch (err) {
      console.error(`Beeper: ${err.message}`);
    }
  }

  if (platforms.length === 0) {
    console.error('No platforms configured. Set up at least one platform.');
    process.exit(1);
  }

  logAudit({ action: 'bot_start', platforms: platforms.map(p => p.name), paired_users: config.allowed_users.length });

  for (const p of platforms) {
    await p.start();
  }

  console.log(`Running on: ${platforms.map(p => p.name).join(', ')}`);

  // Graceful shutdown
  const shutdown = async (signal) => {
    console.log(`\nShutting down (${signal})...`);
    logAudit({ action: 'bot_stop', reason: signal });
    for (const p of platforms) {
      await p.stop();
    }
    process.exit(0);
  };

  process.once('SIGINT', () => shutdown('SIGINT'));
  process.once('SIGTERM', () => shutdown('SIGTERM'));
}

main().catch(err => {
  console.error('Fatal:', err.message);
  process.exit(1);
});
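For orientation, a minimal sketch of the config object main() reads above. Only keys referenced in this file are shown; the values are made up here, and loadConfig() in src/config.js is the real source of this object.

// Illustrative only: the shape main() expects, not actual package defaults.
const exampleConfig = {
  pairing_code: 'ABC123',            // printed at startup (hypothetical value)
  allowed_users: [],                 // paired user ids; the count is logged
  llm: { provider: 'ollama' },       // provider name printed at startup (hypothetical value)
  platforms: {
    telegram: { enabled: true },     // Telegram runs unless explicitly set to false
    beeper: { enabled: false }       // Beeper is opt-in
  }
};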
package/src/indexer/chunk.js
ADDED
@@ -0,0 +1,68 @@
const crypto = require('crypto');

/**
 * DocChunk - represents a document section/chunk.
 * Ported from aurora_core.chunks.DocChunk (Python dataclass).
 */
class DocChunk {
  constructor({
    chunkId = null,
    filePath,
    pageStart = 0,
    pageEnd = 0,
    elementType = 'paragraph', // toc_entry, section, paragraph, table
    name = '',
    content = '',
    parentChunkId = null,
    sectionPath = [], // breadcrumb array: ["Chapter 1", "Section 1.2"]
    sectionLevel = 0, // heading depth 1-5, 0 = body
    documentType = 'unknown', // pdf, docx, md, txt
    metadata = {},
    createdAt = null,
    updatedAt = null
  }) {
    this.chunkId = chunkId || DocChunk.generateId(filePath, name, content);
    this.filePath = filePath;
    this.pageStart = pageStart;
    this.pageEnd = pageEnd;
    this.elementType = elementType;
    this.name = name;
    this.content = content;
    this.parentChunkId = parentChunkId;
    this.sectionPath = sectionPath;
    this.sectionLevel = sectionLevel;
    this.documentType = documentType;
    this.metadata = metadata;
    this.createdAt = createdAt || new Date().toISOString();
    this.updatedAt = updatedAt || new Date().toISOString();
  }

  static generateId(filePath, name, content) {
    const hash = crypto.createHash('sha256')
      .update(`${filePath}:${name}:${content.slice(0, 200)}`)
      .digest('hex')
      .slice(0, 16);
    return `doc:${hash}`;
  }

  toJSON() {
    return {
      chunk_id: this.chunkId,
      file_path: this.filePath,
      page_start: this.pageStart,
      page_end: this.pageEnd,
      element_type: this.elementType,
      name: this.name,
      content: this.content,
      parent_chunk_id: this.parentChunkId,
      section_path: this.sectionPath,
      section_level: this.sectionLevel,
      document_type: this.documentType,
      metadata: this.metadata,
      created_at: this.createdAt,
      updated_at: this.updatedAt
    };
  }
}

module.exports = { DocChunk };
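A minimal usage sketch for the DocChunk class above, assuming the call site sits at the package root; the file path and content are hypothetical.

const { DocChunk } = require('./src/indexer/chunk');

const chunk = new DocChunk({
  filePath: '/docs/guide.md',                  // hypothetical file
  elementType: 'section',
  name: 'Installation',
  content: 'Run npm install multis to get started.',
  sectionPath: ['Guide', 'Installation'],
  sectionLevel: 2,
  documentType: 'md'
});

console.log(chunk.chunkId);    // "doc:" + 16 hex chars from generateId()
console.log(chunk.toJSON());   // snake_case keys ready for storage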
package/src/indexer/chunker.js
ADDED
@@ -0,0 +1,87 @@
const { DocChunk } = require('./chunk');

/**
 * DocumentChunker - section-aware chunk splitting with overlap.
 * Ported from aurora_context_doc.chunker.DocumentChunker (Python).
 */
class DocumentChunker {
  constructor({ maxChunkSize = 2000, overlap = 200 } = {}) {
    this.maxChunkSize = maxChunkSize;
    this.overlap = overlap;
  }

  /**
   * Split a large chunk into smaller overlapping pieces at sentence boundaries.
   * @param {DocChunk} chunk
   * @returns {DocChunk[]}
   */
  splitLarge(chunk) {
    if (chunk.content.length <= this.maxChunkSize) {
      return [chunk];
    }

    const chunks = [];
    const content = chunk.content;
    let start = 0;
    let partNum = 0;

    while (start < content.length) {
      let end = Math.min(start + this.maxChunkSize, content.length);

      // Try to break at sentence boundary
      if (end < content.length) {
        for (const marker of ['. ', '! ', '? ', '\n\n', '\n']) {
          const lastBreak = content.lastIndexOf(marker, end);
          if (lastBreak > start) {
            end = lastBreak + marker.length;
            break;
          }
        }
      }

      const sliceContent = content.slice(start, end).trim();
      if (sliceContent) {
        chunks.push(new DocChunk({
          chunkId: `${chunk.chunkId}-p${partNum}`,
          filePath: chunk.filePath,
          pageStart: chunk.pageStart,
          pageEnd: chunk.pageEnd,
          elementType: chunk.elementType,
          name: `${chunk.name} (part ${partNum + 1})`,
          content: sliceContent,
          parentChunkId: chunk.parentChunkId,
          sectionPath: chunk.sectionPath,
          sectionLevel: chunk.sectionLevel,
          documentType: chunk.documentType,
          metadata: chunk.metadata,
          createdAt: chunk.createdAt,
          updatedAt: chunk.updatedAt
        }));
      }

      if (end >= content.length) break;

      // Move forward with overlap, ensure progress
      const nextStart = end - this.overlap;
      start = nextStart <= start ? start + 1 : nextStart;
      partNum++;
    }

    return chunks;
  }

  /**
   * Process an array of chunks: split large ones.
   * @param {DocChunk[]} chunks
   * @returns {DocChunk[]}
   */
  process(chunks) {
    const result = [];
    for (const chunk of chunks) {
      result.push(...this.splitLarge(chunk));
    }
    return result;
  }
}

module.exports = { DocumentChunker };
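A short sketch of splitLarge in action, with a deliberately small maxChunkSize so the overlap behaviour is visible; the file path and content are illustrative.

const { DocChunk } = require('./src/indexer/chunk');
const { DocumentChunker } = require('./src/indexer/chunker');

const chunker = new DocumentChunker({ maxChunkSize: 80, overlap: 20 });
const big = new DocChunk({
  filePath: '/docs/long.txt',                // hypothetical file
  name: 'Long section',
  content: 'First sentence. '.repeat(20),    // ~320 chars, forces a split
  documentType: 'txt'
});

const parts = chunker.process([big]);
// Each piece keeps the parent metadata and gets an id like "<id>-p0", "<id>-p1", ...
console.log(parts.map(p => [p.chunkId, p.content.length]));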
package/src/indexer/index.js
ADDED
@@ -0,0 +1,150 @@
const fs = require('fs');
const path = require('path');
const { getParser } = require('./parsers');
const { DocumentChunker } = require('./chunker');
const { DocumentStore } = require('./store');
const { logAudit } = require('../governance/audit');

/**
 * DocumentIndexer - orchestrates parsing, chunking, and storage.
 * Ported from aurora_context_doc.indexer.DocumentIndexer (Python).
 */
class DocumentIndexer {
  constructor(store = null) {
    this.store = store || new DocumentStore();
    this.chunker = new DocumentChunker();
  }

  /**
   * Index a single file: parse → chunk → store
   * @param {string} filePath - Path to document
   * @returns {Promise<number>} - Number of chunks created
   */
  async indexFile(filePath) {
    const resolved = path.resolve(filePath);

    if (!fs.existsSync(resolved)) {
      throw new Error(`File not found: ${filePath}`);
    }

    const parser = getParser(resolved);
    if (!parser) {
      const ext = path.extname(resolved);
      throw new Error(`Unsupported file type: ${ext}. Supported: .pdf, .docx, .md, .txt`);
    }

    // Delete existing chunks for this file (re-index)
    this.store.deleteByFile(resolved);

    // Parse
    const rawChunks = await parser(resolved);
    if (!rawChunks || rawChunks.length === 0) {
      return 0;
    }

    // Chunk (split large sections)
    const processed = this.chunker.process(rawChunks);

    // Store
    this.store.saveChunks(processed);

    logAudit({
      action: 'index_file',
      file: resolved,
      raw_chunks: rawChunks.length,
      stored_chunks: processed.length
    });

    return processed.length;
  }

  /**
   * Index a buffer (e.g. from Telegram file upload)
   * @param {Buffer} buffer - File contents
   * @param {string} filename - Original filename
   * @returns {Promise<number>} - Number of chunks created
   */
  async indexBuffer(buffer, filename) {
    // Write to temp file, index it, then clean up
    const tmpDir = path.join(require('../config').MULTIS_DIR, 'tmp');
    if (!fs.existsSync(tmpDir)) {
      fs.mkdirSync(tmpDir, { recursive: true });
    }

    const tmpPath = path.join(tmpDir, filename);
    fs.writeFileSync(tmpPath, buffer);

    try {
      const count = await this.indexFile(tmpPath);
      return count;
    } finally {
      // Clean up temp file
      if (fs.existsSync(tmpPath)) {
        fs.unlinkSync(tmpPath);
      }
    }
  }

  /**
   * Index all supported files in a directory
   * @param {string} dirPath - Directory path
   * @param {boolean} recursive - Recurse into subdirectories
   * @returns {Promise<{files: number, chunks: number}>}
   */
  async indexDirectory(dirPath, recursive = true) {
    const resolved = path.resolve(dirPath);

    if (!fs.existsSync(resolved)) {
      throw new Error(`Directory not found: ${dirPath}`);
    }

    const supportedExts = ['.pdf', '.docx', '.md', '.txt'];
    let totalFiles = 0;
    let totalChunks = 0;

    const entries = fs.readdirSync(resolved, { withFileTypes: true });

    for (const entry of entries) {
      const fullPath = path.join(resolved, entry.name);

      if (entry.isDirectory() && recursive) {
        const sub = await this.indexDirectory(fullPath, true);
        totalFiles += sub.files;
        totalChunks += sub.chunks;
      } else if (entry.isFile() && supportedExts.includes(path.extname(entry.name).toLowerCase())) {
        try {
          const count = await this.indexFile(fullPath);
          totalFiles++;
          totalChunks += count;
        } catch (err) {
          logAudit({ action: 'index_error', file: fullPath, error: err.message });
        }
      }
    }

    return { files: totalFiles, chunks: totalChunks };
  }

  /**
   * Search indexed documents
   * @param {string} query - Search query
   * @param {number} limit - Max results
   * @returns {Array} - Matching chunks
   */
  search(query, limit = 5) {
    return this.store.search(query, limit);
  }

  /**
   * Get indexing stats
   */
  getStats() {
    return this.store.getStats();
  }

  close() {
    this.store.close();
  }
}

module.exports = { DocumentIndexer };
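A usage sketch for the indexer above; the directory path is hypothetical, and the DocumentStore persistence details (src/indexer/store.js) are not shown here.

const { DocumentIndexer } = require('./src/indexer');

(async () => {
  const indexer = new DocumentIndexer();

  // Recursively index every supported file under ./notes
  const { files, chunks } = await indexer.indexDirectory('./notes', true);
  console.log(`Indexed ${files} files into ${chunks} chunks`);

  // Keyword search against the stored chunks
  const hits = indexer.search('backup schedule', 3);
  console.log(hits);

  indexer.close();
})();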
package/src/indexer/parsers.js
ADDED
@@ -0,0 +1,299 @@
const fs = require('fs');
const path = require('path');
const { DocChunk } = require('./chunk');

/**
 * PDF Parser - uses pdf-parse to extract text, creates page-level chunks.
 * Ported from aurora_context_doc.parser.pdf (PyMuPDF).
 * pdf-parse is simpler than PyMuPDF — no TOC extraction, no font detection.
 * We get page-level text and chunk from there.
 */
async function parsePDF(filePath) {
  const pdfParse = require('pdf-parse');
  const buffer = fs.readFileSync(filePath);

  const data = await pdfParse(buffer, {
    // Custom page renderer to get per-page text
    pagerender: function (pageData) {
      return pageData.getTextContent().then(function (textContent) {
        return textContent.items.map(item => item.str).join(' ');
      });
    }
  });

  // pdf-parse doesn't give per-page text easily with custom renderer,
  // but data.text has full text. Split by form feeds if available.
  // Fallback: use numpages and split evenly, or treat as single doc.
  const chunks = [];
  const absPath = path.resolve(filePath);

  // Try to split by page using the raw text
  // pdf-parse joins pages - we re-parse with page tracking
  const pdf = await pdfParse(buffer);
  const fullText = pdf.text;
  const numPages = pdf.numpages;

  if (numPages <= 1 || fullText.length < 500) {
    // Single chunk for small docs
    if (fullText.trim()) {
      chunks.push(new DocChunk({
        filePath: absPath,
        pageStart: 1,
        pageEnd: numPages,
        elementType: 'section',
        name: path.basename(filePath),
        content: fullText.trim(),
        sectionPath: [path.basename(filePath)],
        sectionLevel: 1,
        documentType: 'pdf'
      }));
    }
  } else {
    // Split full text roughly by page count
    // This is approximate — pdf-parse doesn't give clean page breaks
    const avgChars = Math.ceil(fullText.length / numPages);
    for (let i = 0; i < numPages; i++) {
      const start = i * avgChars;
      const end = Math.min((i + 1) * avgChars, fullText.length);
      const pageText = fullText.slice(start, end).trim();

      if (!pageText) continue;

      chunks.push(new DocChunk({
        filePath: absPath,
        pageStart: i + 1,
        pageEnd: i + 1,
        elementType: 'paragraph',
        name: `Page ${i + 1}`,
        content: pageText,
        sectionPath: [path.basename(filePath), `Page ${i + 1}`],
        sectionLevel: 1,
        documentType: 'pdf'
      }));
    }
  }

  return chunks;
}

/**
 * DOCX Parser - uses mammoth to extract HTML, then parses headings for hierarchy.
 * Ported from aurora_context_doc.parser.docx (python-docx).
 * mammoth converts to HTML — we parse headings from that.
 */
async function parseDOCX(filePath) {
  const mammoth = require('mammoth');
  const absPath = path.resolve(filePath);

  const result = await mammoth.convertToHtml({ path: filePath });
  const html = result.value;

  // Parse HTML for headings and content blocks
  const chunks = [];
  const sectionStack = []; // track current heading hierarchy

  // Split HTML by heading tags
  const parts = html.split(/(<h[1-6][^>]*>.*?<\/h[1-6]>)/gi);

  let currentContent = '';
  let currentName = path.basename(filePath);
  let currentLevel = 0;

  for (const part of parts) {
    const headingMatch = part.match(/<h([1-6])[^>]*>(.*?)<\/h[1-6]>/i);

    if (headingMatch) {
      // Save previous section if it has content
      if (currentContent.trim()) {
        const cleanContent = stripHtml(currentContent);
        if (cleanContent) {
          chunks.push(new DocChunk({
            filePath: absPath,
            elementType: currentLevel > 0 ? 'section' : 'paragraph',
            name: currentName,
            content: cleanContent,
            sectionPath: sectionStack.map(s => s.name).concat(currentLevel > 0 ? [] : [currentName]),
            sectionLevel: currentLevel,
            documentType: 'docx'
          }));
        }
      }

      // Start new section
      const level = parseInt(headingMatch[1]);
      const title = stripHtml(headingMatch[2]);

      // Update section stack
      while (sectionStack.length > 0 && sectionStack[sectionStack.length - 1].level >= level) {
        sectionStack.pop();
      }
      sectionStack.push({ level, name: title });

      currentName = title;
      currentLevel = level;
      currentContent = '';
    } else {
      currentContent += part;
    }
  }

  // Save last section
  if (currentContent.trim()) {
    const cleanContent = stripHtml(currentContent);
    if (cleanContent) {
      chunks.push(new DocChunk({
        filePath: absPath,
        elementType: currentLevel > 0 ? 'section' : 'paragraph',
        name: currentName,
        content: cleanContent,
        sectionPath: sectionStack.map(s => s.name),
        sectionLevel: currentLevel,
        documentType: 'docx'
      }));
    }
  }

  // If no headings found, treat as single chunk
  if (chunks.length === 0 && html.trim()) {
    const cleanContent = stripHtml(html);
    if (cleanContent) {
      chunks.push(new DocChunk({
        filePath: absPath,
        elementType: 'paragraph',
        name: path.basename(filePath),
        content: cleanContent,
        sectionPath: [path.basename(filePath)],
        sectionLevel: 0,
        documentType: 'docx'
      }));
    }
  }

  return chunks;
}

/**
 * Markdown Parser - native, splits by headings.
 * No external dependency needed.
 */
function parseMD(filePath) {
  const absPath = path.resolve(filePath);
  const content = fs.readFileSync(filePath, 'utf8');
  const lines = content.split('\n');
  const chunks = [];
  const sectionStack = [];

  let currentContent = '';
  let currentName = path.basename(filePath);
  let currentLevel = 0;

  for (const line of lines) {
    const headingMatch = line.match(/^(#{1,6})\s+(.+)/);

    if (headingMatch) {
      // Save previous section
      if (currentContent.trim()) {
        chunks.push(new DocChunk({
          filePath: absPath,
          elementType: currentLevel > 0 ? 'section' : 'paragraph',
          name: currentName,
          content: currentContent.trim(),
          sectionPath: sectionStack.map(s => s.name),
          sectionLevel: currentLevel,
          documentType: 'md'
        }));
      }

      const level = headingMatch[1].length;
      const title = headingMatch[2].trim();

      while (sectionStack.length > 0 && sectionStack[sectionStack.length - 1].level >= level) {
        sectionStack.pop();
      }
      sectionStack.push({ level, name: title });

      currentName = title;
      currentLevel = level;
      currentContent = '';
    } else {
      currentContent += line + '\n';
    }
  }

  // Save last section
  if (currentContent.trim()) {
    chunks.push(new DocChunk({
      filePath: absPath,
      elementType: currentLevel > 0 ? 'section' : 'paragraph',
      name: currentName,
      content: currentContent.trim(),
      sectionPath: sectionStack.map(s => s.name),
      sectionLevel: currentLevel,
      documentType: 'md'
    }));
  }

  if (chunks.length === 0 && content.trim()) {
    chunks.push(new DocChunk({
      filePath: absPath,
      elementType: 'paragraph',
      name: path.basename(filePath),
      content: content.trim(),
      sectionPath: [path.basename(filePath)],
      sectionLevel: 0,
      documentType: 'md'
    }));
  }

  return chunks;
}

/**
 * Plain text parser - single chunk per file
 */
function parseTXT(filePath) {
  const absPath = path.resolve(filePath);
  const content = fs.readFileSync(filePath, 'utf8');

  if (!content.trim()) return [];

  return [new DocChunk({
    filePath: absPath,
    elementType: 'paragraph',
    name: path.basename(filePath),
    content: content.trim(),
    sectionPath: [path.basename(filePath)],
    sectionLevel: 0,
    documentType: 'txt'
  })];
}

/**
 * Get the appropriate parser for a file extension
 */
function getParser(filePath) {
  const ext = path.extname(filePath).toLowerCase().slice(1);
  switch (ext) {
    case 'pdf': return parsePDF;
    case 'docx': return parseDOCX;
    case 'md': return parseMD;
    case 'txt': return parseTXT;
    default: return null;
  }
}

/** Strip HTML tags and decode entities */
function stripHtml(html) {
  return html
    .replace(/<[^>]+>/g, '')
    .replace(/&amp;/g, '&')
    .replace(/&lt;/g, '<')
    .replace(/&gt;/g, '>')
    .replace(/&quot;/g, '"')
    .replace(/&#39;/g, "'")
    .replace(/&nbsp;/g, ' ')
    .replace(/\s+/g, ' ')
    .trim();
}

module.exports = { parsePDF, parseDOCX, parseMD, parseTXT, getParser };
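A sketch of how getParser above is meant to be used; the markdown path is hypothetical, and parsePDF/parseDOCX additionally need the pdf-parse and mammoth dependencies at runtime.

const { getParser } = require('./src/indexer/parsers');

(async () => {
  const parse = getParser('./README.md');      // resolves to parseMD for .md files
  if (!parse) throw new Error('Unsupported file type');

  const chunks = await parse('./README.md');   // one DocChunk per heading section
  for (const c of chunks) {
    console.log(c.sectionPath.join(' > '), '-', c.content.length, 'chars');
  }
})();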