opc-agent 4.1.0 → 4.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.github/ISSUE_TEMPLATE/bug_report.md +20 -20
- package/.github/ISSUE_TEMPLATE/feature_request.md +14 -14
- package/.github/PULL_REQUEST_TEMPLATE.md +13 -13
- package/CHANGELOG.md +48 -48
- package/CONTRIBUTING.md +36 -36
- package/README.zh-CN.md +497 -497
- package/USABILITY-ISSUES.md +73 -0
- package/dist/channels/web.js +8 -2
- package/dist/channels/wechat.js +6 -6
- package/dist/cli.js +200 -85
- package/dist/core/runtime.js +37 -15
- package/dist/deploy/index.js +56 -56
- package/dist/doctor.d.ts +1 -0
- package/dist/doctor.js +105 -10
- package/dist/memory/deepbrain.d.ts +1 -1
- package/dist/memory/deepbrain.js +95 -4
- package/dist/scheduler/cron-engine.js +3 -36
- package/dist/studio/server.js +30 -1
- package/dist/studio-ui/index.html +230 -10
- package/dist/ui/components.js +105 -105
- package/examples/README.md +22 -22
- package/examples/basic-agent.ts +90 -90
- package/examples/brain-integration.ts +71 -71
- package/examples/multi-channel.ts +74 -74
- package/fix-sidebar.mjs +188 -188
- package/install.ps1 +154 -154
- package/install.sh +164 -164
- package/package.json +1 -1
- package/scripts/install.ps1 +31 -31
- package/scripts/install.sh +40 -40
- package/serve-studio.js +13 -13
- package/serve-test.js +25 -25
- package/src/channels/dingtalk.ts +46 -46
- package/src/channels/email.ts +351 -351
- package/src/channels/feishu.ts +349 -349
- package/src/channels/googlechat.ts +42 -42
- package/src/channels/imessage.ts +31 -31
- package/src/channels/irc.ts +82 -82
- package/src/channels/line.ts +32 -32
- package/src/channels/matrix.ts +33 -33
- package/src/channels/mattermost.ts +57 -57
- package/src/channels/msteams.ts +32 -32
- package/src/channels/nostr.ts +32 -32
- package/src/channels/qq.ts +33 -33
- package/src/channels/signal.ts +32 -32
- package/src/channels/sms.ts +33 -33
- package/src/channels/telegram.ts +616 -616
- package/src/channels/twitch.ts +65 -65
- package/src/channels/voice-call.ts +100 -100
- package/src/channels/web.ts +8 -2
- package/src/channels/websocket.ts +399 -399
- package/src/channels/wechat.ts +329 -329
- package/src/channels/whatsapp.ts +32 -32
- package/src/cli/chat.ts +99 -99
- package/src/cli/setup.ts +314 -314
- package/src/cli.ts +195 -92
- package/src/core/agent.ts +476 -476
- package/src/core/api-server.ts +277 -277
- package/src/core/audio.ts +98 -98
- package/src/core/collaboration.ts +275 -275
- package/src/core/context-discovery.ts +85 -85
- package/src/core/context-refs.ts +140 -140
- package/src/core/gateway.ts +106 -106
- package/src/core/heartbeat.ts +51 -51
- package/src/core/hooks.ts +105 -105
- package/src/core/ide-bridge.ts +133 -133
- package/src/core/node-network.ts +86 -86
- package/src/core/profiles.ts +122 -122
- package/src/core/runtime.ts +25 -0
- package/src/core/scheduler.ts +187 -187
- package/src/core/session-manager.ts +137 -137
- package/src/core/subagent.ts +98 -98
- package/src/core/vision.ts +180 -180
- package/src/core/workflow-graph.ts +365 -365
- package/src/daemon.ts +96 -96
- package/src/deploy/index.ts +255 -255
- package/src/doctor.ts +98 -11
- package/src/eval/index.ts +211 -211
- package/src/eval/suites/basic.json +16 -16
- package/src/eval/suites/memory.json +12 -12
- package/src/eval/suites/safety.json +14 -14
- package/src/hub/brain-seed.ts +54 -54
- package/src/hub/client.ts +60 -60
- package/src/mcp/servers/calculator-mcp.ts +65 -65
- package/src/mcp/servers/crypto-mcp.ts +73 -73
- package/src/mcp/servers/database-mcp.ts +72 -72
- package/src/mcp/servers/datetime-mcp.ts +69 -69
- package/src/mcp/servers/filesystem.ts +66 -66
- package/src/mcp/servers/github-mcp.ts +58 -58
- package/src/mcp/servers/index.ts +63 -63
- package/src/mcp/servers/json-mcp.ts +102 -102
- package/src/mcp/servers/memory-mcp.ts +56 -56
- package/src/mcp/servers/regex-mcp.ts +53 -53
- package/src/mcp/servers/web-mcp.ts +49 -49
- package/src/memory/context-compressor.ts +189 -189
- package/src/memory/deepbrain.ts +99 -5
- package/src/memory/seed-loader.ts +212 -212
- package/src/memory/user-profiler.ts +215 -215
- package/src/plugins/content-filter.ts +23 -23
- package/src/plugins/logger.ts +18 -18
- package/src/plugins/rate-limiter.ts +38 -38
- package/src/protocols/a2a/client.ts +132 -132
- package/src/protocols/a2a/index.ts +8 -8
- package/src/protocols/a2a/server.ts +333 -333
- package/src/protocols/a2a/types.ts +88 -88
- package/src/protocols/a2a/utils.ts +50 -50
- package/src/protocols/agui/client.ts +83 -83
- package/src/protocols/agui/index.ts +4 -4
- package/src/protocols/agui/server.ts +218 -218
- package/src/protocols/agui/types.ts +153 -153
- package/src/protocols/index.ts +2 -2
- package/src/protocols/mcp/agent-tools.ts +134 -134
- package/src/protocols/mcp/index.ts +8 -8
- package/src/protocols/mcp/server.ts +262 -262
- package/src/protocols/mcp/types.ts +69 -69
- package/src/providers/index.ts +632 -632
- package/src/publish/index.ts +376 -376
- package/src/scheduler/cron-engine.ts +191 -191
- package/src/scheduler/index.ts +2 -2
- package/src/schema/oad.ts +217 -217
- package/src/security/approval.ts +131 -131
- package/src/security/approvals.ts +143 -143
- package/src/security/elevated.ts +105 -105
- package/src/security/guardrails.ts +248 -248
- package/src/security/index.ts +9 -9
- package/src/security/keys.ts +87 -87
- package/src/security/secrets.ts +129 -129
- package/src/skills/builtin/index.ts +408 -408
- package/src/skills/marketplace.ts +113 -113
- package/src/skills/types.ts +42 -42
- package/src/studio/server.ts +31 -1
- package/src/studio/templates-data.ts +178 -178
- package/src/studio-ui/index.html +230 -10
- package/src/telemetry/index.ts +324 -324
- package/src/tools/builtin/browser.ts +299 -299
- package/src/tools/builtin/datetime.ts +41 -41
- package/src/tools/builtin/file.ts +107 -107
- package/src/tools/builtin/home-assistant.ts +116 -116
- package/src/tools/builtin/rl-tools.ts +243 -243
- package/src/tools/builtin/shell.ts +43 -43
- package/src/tools/builtin/vision.ts +64 -64
- package/src/tools/builtin/web-search.ts +126 -126
- package/src/tools/builtin/web.ts +35 -35
- package/src/tools/document-processor.ts +213 -213
- package/src/tools/image-generator.ts +150 -150
- package/src/tools/integrations/calendar.ts +73 -73
- package/src/tools/integrations/code-exec.ts +39 -39
- package/src/tools/integrations/csv-analyzer.ts +92 -92
- package/src/tools/integrations/database.ts +44 -44
- package/src/tools/integrations/email-send.ts +76 -76
- package/src/tools/integrations/git-tool.ts +42 -42
- package/src/tools/integrations/github-tool.ts +76 -76
- package/src/tools/integrations/image-gen.ts +56 -56
- package/src/tools/integrations/index.ts +92 -92
- package/src/tools/integrations/jira.ts +83 -83
- package/src/tools/integrations/notion.ts +71 -71
- package/src/tools/integrations/npm-tool.ts +48 -48
- package/src/tools/integrations/pdf-reader.ts +58 -58
- package/src/tools/integrations/slack.ts +65 -65
- package/src/tools/integrations/summarizer.ts +49 -49
- package/src/tools/integrations/translator.ts +48 -48
- package/src/tools/integrations/trello.ts +60 -60
- package/src/tools/integrations/vector-search.ts +42 -42
- package/src/tools/integrations/web-scraper.ts +47 -47
- package/src/tools/integrations/web-search.ts +58 -58
- package/src/tools/integrations/webhook.ts +38 -38
- package/src/tools/mcp-client.ts +131 -131
- package/src/tools/web-scraper.ts +179 -179
- package/src/tools/web-search.ts +180 -180
- package/src/ui/components.ts +127 -127
- package/srv-out.txt +1 -1
- package/templates/ecommerce-assistant/README.md +45 -45
- package/templates/ecommerce-assistant/oad.yaml +47 -47
- package/templates/tech-support/README.md +43 -43
- package/templates/tech-support/oad.yaml +45 -45
- package/test-agent/Dockerfile +9 -9
- package/test-agent/README.md +50 -50
- package/test-agent/agent.yaml +23 -23
- package/test-agent/docker-compose.yml +11 -11
- package/test-agent/oad.yaml +31 -31
- package/test-agent/package-lock.json +1492 -1492
- package/test-agent/package.json +17 -17
- package/test-agent/src/index.ts +24 -24
- package/test-agent/src/skills/echo.ts +15 -15
- package/test-agent/tsconfig.json +24 -24
- package/test-full.js +43 -43
- package/test-sidebar.js +22 -22
- package/test-studio3.js +75 -75
- package/test-studio4.js +41 -41
- package/tests/a2a-protocol.test.ts +285 -285
- package/tests/agui-protocol.test.ts +246 -246
- package/tests/api-server.test.ts +148 -148
- package/tests/approvals.test.ts +89 -89
- package/tests/audio.test.ts +40 -40
- package/tests/brain-seed-extended.test.ts +490 -490
- package/tests/brain-seed.test.ts +239 -239
- package/tests/browser.test.ts +179 -179
- package/tests/channels/discord.test.ts +79 -79
- package/tests/channels/email.test.ts +148 -148
- package/tests/channels/feishu.test.ts +123 -123
- package/tests/channels/telegram.test.ts +129 -129
- package/tests/channels/websocket.test.ts +53 -53
- package/tests/channels/wechat.test.ts +170 -170
- package/tests/channels-extra.test.ts +45 -45
- package/tests/chat-cli.test.ts +160 -160
- package/tests/cli.test.ts +46 -46
- package/tests/context-compressor.test.ts +172 -172
- package/tests/context-refs.test.ts +121 -121
- package/tests/cron-engine.test.ts +101 -101
- package/tests/daemon.test.ts +135 -135
- package/tests/deepbrain-wire.test.ts +234 -234
- package/tests/deploy-and-dag.test.ts +196 -196
- package/tests/doctor.test.ts +38 -38
- package/tests/document-processor.test.ts +69 -69
- package/tests/e2e-nocode.test.ts +442 -442
- package/tests/elevated.test.ts +69 -69
- package/tests/eval.test.ts +173 -173
- package/tests/gateway.test.ts +63 -63
- package/tests/guardrails.test.ts +177 -177
- package/tests/home-assistant.test.ts +40 -40
- package/tests/hooks.test.ts +79 -79
- package/tests/ide-bridge.test.ts +38 -38
- package/tests/image-generator.test.ts +84 -84
- package/tests/init-role.test.ts +124 -124
- package/tests/integrations.test.ts +249 -249
- package/tests/mcp-client.test.ts +92 -92
- package/tests/mcp-server.test.ts +178 -178
- package/tests/mcp-servers.test.ts +260 -260
- package/tests/node-network.test.ts +74 -74
- package/tests/plugin-a2a-enhanced.test.ts +230 -230
- package/tests/profiles.test.ts +61 -61
- package/tests/publish.test.ts +231 -231
- package/tests/rl-tools.test.ts +93 -93
- package/tests/sandbox-manager.test.ts +46 -46
- package/tests/scheduler.test.ts +200 -200
- package/tests/secrets.test.ts +107 -107
- package/tests/security-enhanced.test.ts +233 -233
- package/tests/settings-api.test.ts +148 -148
- package/tests/setup.test.ts +73 -73
- package/tests/subagent.test.ts +193 -193
- package/tests/telegram-discord.test.ts +60 -60
- package/tests/telemetry.test.ts +186 -186
- package/tests/user-profiler.test.ts +169 -169
- package/tests/v090-features.test.ts +254 -254
- package/tests/vision.test.ts +61 -61
- package/tests/voice-call.test.ts +47 -47
- package/tests/voice-enhanced.test.ts +169 -169
- package/tests/voice-interaction.test.ts +38 -38
- package/tests/web-search.test.ts +155 -155
- package/tests/workflow-graph.test.ts +279 -279
- package/tutorial/customer-service-agent/README.md +612 -612
- package/tutorial/customer-service-agent/SOUL.md +26 -26
- package/tutorial/customer-service-agent/agent.yaml +63 -63
- package/tutorial/customer-service-agent/package.json +19 -19
- package/tutorial/customer-service-agent/src/index.ts +69 -69
- package/tutorial/customer-service-agent/src/skills/faq.ts +27 -27
- package/tutorial/customer-service-agent/src/skills/ticket.ts +22 -22
- package/tutorial/customer-service-agent/tsconfig.json +14 -14
|
@@ -1,213 +1,213 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Document Processor - Parse and chunk documents for knowledge learning
|
|
3
|
-
* Supports: PDF, TXT, MD, DOCX, CSV, JSON
|
|
4
|
-
*/
|
|
5
|
-
|
|
6
|
-
export interface DocumentChunk {
|
|
7
|
-
title: string;
|
|
8
|
-
content: string;
|
|
9
|
-
metadata: {
|
|
10
|
-
source: string;
|
|
11
|
-
format: string;
|
|
12
|
-
chunkIndex: number;
|
|
13
|
-
totalChunks?: number;
|
|
14
|
-
page?: number;
|
|
15
|
-
};
|
|
16
|
-
}
|
|
17
|
-
|
|
18
|
-
export interface ProcessedDocument {
|
|
19
|
-
id: string;
|
|
20
|
-
filename: string;
|
|
21
|
-
format: string;
|
|
22
|
-
size: number;
|
|
23
|
-
chunks: DocumentChunk[];
|
|
24
|
-
processedAt: string;
|
|
25
|
-
}
|
|
26
|
-
|
|
27
|
-
const MAX_FILE_SIZE = 50 * 1024 * 1024; // 50MB
|
|
28
|
-
const CHUNK_TARGET_CHARS = 2000; // ~500 tokens
|
|
29
|
-
const CHUNK_MAX_CHARS = 4000; // ~1000 tokens
|
|
30
|
-
|
|
31
|
-
export class DocumentProcessor {
|
|
32
|
-
/**
|
|
33
|
-
* Process a file buffer into chunks
|
|
34
|
-
*/
|
|
35
|
-
async process(buffer: Buffer, filename: string): Promise<ProcessedDocument> {
|
|
36
|
-
if (buffer.length > MAX_FILE_SIZE) {
|
|
37
|
-
throw new Error(`File too large: ${(buffer.length / 1024 / 1024).toFixed(1)}MB (max 50MB)`);
|
|
38
|
-
}
|
|
39
|
-
|
|
40
|
-
const ext = filename.split('.').pop()?.toLowerCase() || '';
|
|
41
|
-
let rawText: string;
|
|
42
|
-
|
|
43
|
-
switch (ext) {
|
|
44
|
-
case 'pdf':
|
|
45
|
-
rawText = await this.parsePDF(buffer);
|
|
46
|
-
break;
|
|
47
|
-
case 'docx':
|
|
48
|
-
rawText = await this.parseDOCX(buffer);
|
|
49
|
-
break;
|
|
50
|
-
case 'csv':
|
|
51
|
-
rawText = this.parseCSV(buffer.toString('utf-8'));
|
|
52
|
-
break;
|
|
53
|
-
case 'json':
|
|
54
|
-
rawText = this.parseJSON(buffer.toString('utf-8'));
|
|
55
|
-
break;
|
|
56
|
-
case 'txt':
|
|
57
|
-
case 'md':
|
|
58
|
-
case 'markdown':
|
|
59
|
-
rawText = buffer.toString('utf-8');
|
|
60
|
-
break;
|
|
61
|
-
default:
|
|
62
|
-
// Try as plain text
|
|
63
|
-
rawText = buffer.toString('utf-8');
|
|
64
|
-
}
|
|
65
|
-
|
|
66
|
-
const chunks = this.chunkText(rawText, filename, ext);
|
|
67
|
-
|
|
68
|
-
return {
|
|
69
|
-
id: `doc-${Date.now()}-${Math.random().toString(36).slice(2, 8)}`,
|
|
70
|
-
filename,
|
|
71
|
-
format: ext,
|
|
72
|
-
size: buffer.length,
|
|
73
|
-
chunks,
|
|
74
|
-
processedAt: new Date().toISOString(),
|
|
75
|
-
};
|
|
76
|
-
}
|
|
77
|
-
|
|
78
|
-
private async parsePDF(buffer: Buffer): Promise<string> {
|
|
79
|
-
try {
|
|
80
|
-
const pdfParse = require('pdf-parse');
|
|
81
|
-
const data = await pdfParse(buffer);
|
|
82
|
-
return data.text || '';
|
|
83
|
-
} catch (e: any) {
|
|
84
|
-
throw new Error(`PDF parse failed: ${e.message}`);
|
|
85
|
-
}
|
|
86
|
-
}
|
|
87
|
-
|
|
88
|
-
private async parseDOCX(buffer: Buffer): Promise<string> {
|
|
89
|
-
try {
|
|
90
|
-
const mammoth = require('mammoth');
|
|
91
|
-
const result = await mammoth.extractRawText({ buffer });
|
|
92
|
-
return result.value || '';
|
|
93
|
-
} catch (e: any) {
|
|
94
|
-
throw new Error(`DOCX parse failed: ${e.message}`);
|
|
95
|
-
}
|
|
96
|
-
}
|
|
97
|
-
|
|
98
|
-
private parseCSV(text: string): string {
|
|
99
|
-
const lines = text.split('\n').filter(l => l.trim());
|
|
100
|
-
if (lines.length === 0) return '';
|
|
101
|
-
|
|
102
|
-
const headers = lines[0].split(',').map(h => h.trim().replace(/^"|"$/g, ''));
|
|
103
|
-
const rows = lines.slice(1);
|
|
104
|
-
|
|
105
|
-
// Convert CSV to readable text
|
|
106
|
-
return rows.map((row, i) => {
|
|
107
|
-
const values = this.parseCSVLine(row);
|
|
108
|
-
const pairs = headers.map((h, j) => `${h}: ${values[j] || ''}`);
|
|
109
|
-
return `Record ${i + 1}:\n${pairs.join('\n')}`;
|
|
110
|
-
}).join('\n\n');
|
|
111
|
-
}
|
|
112
|
-
|
|
113
|
-
private parseCSVLine(line: string): string[] {
|
|
114
|
-
const result: string[] = [];
|
|
115
|
-
let current = '';
|
|
116
|
-
let inQuotes = false;
|
|
117
|
-
for (const ch of line) {
|
|
118
|
-
if (ch === '"') { inQuotes = !inQuotes; }
|
|
119
|
-
else if (ch === ',' && !inQuotes) { result.push(current.trim()); current = ''; }
|
|
120
|
-
else { current += ch; }
|
|
121
|
-
}
|
|
122
|
-
result.push(current.trim());
|
|
123
|
-
return result;
|
|
124
|
-
}
|
|
125
|
-
|
|
126
|
-
private parseJSON(text: string): string {
|
|
127
|
-
try {
|
|
128
|
-
const data = JSON.parse(text);
|
|
129
|
-
if (Array.isArray(data)) {
|
|
130
|
-
return data.map((item, i) => `Item ${i + 1}:\n${JSON.stringify(item, null, 2)}`).join('\n\n');
|
|
131
|
-
}
|
|
132
|
-
return JSON.stringify(data, null, 2);
|
|
133
|
-
} catch {
|
|
134
|
-
return text;
|
|
135
|
-
}
|
|
136
|
-
}
|
|
137
|
-
|
|
138
|
-
/**
|
|
139
|
-
* Smart chunking: split by headings/paragraphs, respecting size limits
|
|
140
|
-
*/
|
|
141
|
-
private chunkText(text: string, filename: string, format: string): DocumentChunk[] {
|
|
142
|
-
if (!text.trim()) return [];
|
|
143
|
-
|
|
144
|
-
// Split by markdown headings or double newlines
|
|
145
|
-
const sections = text.split(/\n(?=#{1,3}\s)|(?:\n\s*\n)/).filter(s => s.trim());
|
|
146
|
-
const chunks: DocumentChunk[] = [];
|
|
147
|
-
let currentChunk = '';
|
|
148
|
-
let currentTitle = filename;
|
|
149
|
-
|
|
150
|
-
for (const section of sections) {
|
|
151
|
-
const headingMatch = section.match(/^(#{1,3})\s+(.+)/);
|
|
152
|
-
if (headingMatch) {
|
|
153
|
-
currentTitle = headingMatch[2].trim();
|
|
154
|
-
}
|
|
155
|
-
|
|
156
|
-
if (currentChunk.length + section.length > CHUNK_MAX_CHARS && currentChunk.length > 0) {
|
|
157
|
-
chunks.push({
|
|
158
|
-
title: currentTitle,
|
|
159
|
-
content: currentChunk.trim(),
|
|
160
|
-
metadata: { source: filename, format, chunkIndex: chunks.length },
|
|
161
|
-
});
|
|
162
|
-
currentChunk = '';
|
|
163
|
-
}
|
|
164
|
-
|
|
165
|
-
currentChunk += section + '\n\n';
|
|
166
|
-
|
|
167
|
-
if (currentChunk.length >= CHUNK_TARGET_CHARS) {
|
|
168
|
-
chunks.push({
|
|
169
|
-
title: currentTitle,
|
|
170
|
-
content: currentChunk.trim(),
|
|
171
|
-
metadata: { source: filename, format, chunkIndex: chunks.length },
|
|
172
|
-
});
|
|
173
|
-
currentChunk = '';
|
|
174
|
-
}
|
|
175
|
-
}
|
|
176
|
-
|
|
177
|
-
if (currentChunk.trim()) {
|
|
178
|
-
chunks.push({
|
|
179
|
-
title: currentTitle,
|
|
180
|
-
content: currentChunk.trim(),
|
|
181
|
-
metadata: { source: filename, format, chunkIndex: chunks.length },
|
|
182
|
-
});
|
|
183
|
-
}
|
|
184
|
-
|
|
185
|
-
// If we got no chunks from section splitting (e.g. dense text), force-split
|
|
186
|
-
if (chunks.length === 0 && text.trim()) {
|
|
187
|
-
const words = text.split(/\s+/);
|
|
188
|
-
let buf = '';
|
|
189
|
-
for (const w of words) {
|
|
190
|
-
if (buf.length + w.length + 1 > CHUNK_MAX_CHARS && buf) {
|
|
191
|
-
chunks.push({
|
|
192
|
-
title: filename,
|
|
193
|
-
content: buf.trim(),
|
|
194
|
-
metadata: { source: filename, format, chunkIndex: chunks.length },
|
|
195
|
-
});
|
|
196
|
-
buf = '';
|
|
197
|
-
}
|
|
198
|
-
buf += w + ' ';
|
|
199
|
-
}
|
|
200
|
-
if (buf.trim()) {
|
|
201
|
-
chunks.push({
|
|
202
|
-
title: filename,
|
|
203
|
-
content: buf.trim(),
|
|
204
|
-
metadata: { source: filename, format, chunkIndex: chunks.length },
|
|
205
|
-
});
|
|
206
|
-
}
|
|
207
|
-
}
|
|
208
|
-
|
|
209
|
-
// Set totalChunks
|
|
210
|
-
for (const c of chunks) c.metadata.totalChunks = chunks.length;
|
|
211
|
-
return chunks;
|
|
212
|
-
}
|
|
213
|
-
}
|
|
1
|
+
/**
|
|
2
|
+
* Document Processor - Parse and chunk documents for knowledge learning
|
|
3
|
+
* Supports: PDF, TXT, MD, DOCX, CSV, JSON
|
|
4
|
+
*/
|
|
5
|
+
|
|
6
|
+
export interface DocumentChunk {
|
|
7
|
+
title: string;
|
|
8
|
+
content: string;
|
|
9
|
+
metadata: {
|
|
10
|
+
source: string;
|
|
11
|
+
format: string;
|
|
12
|
+
chunkIndex: number;
|
|
13
|
+
totalChunks?: number;
|
|
14
|
+
page?: number;
|
|
15
|
+
};
|
|
16
|
+
}
|
|
17
|
+
|
|
18
|
+
export interface ProcessedDocument {
|
|
19
|
+
id: string;
|
|
20
|
+
filename: string;
|
|
21
|
+
format: string;
|
|
22
|
+
size: number;
|
|
23
|
+
chunks: DocumentChunk[];
|
|
24
|
+
processedAt: string;
|
|
25
|
+
}
|
|
26
|
+
|
|
27
|
+
const MAX_FILE_SIZE = 50 * 1024 * 1024; // 50MB
|
|
28
|
+
const CHUNK_TARGET_CHARS = 2000; // ~500 tokens
|
|
29
|
+
const CHUNK_MAX_CHARS = 4000; // ~1000 tokens
|
|
30
|
+
|
|
31
|
+
export class DocumentProcessor {
|
|
32
|
+
/**
|
|
33
|
+
* Process a file buffer into chunks
|
|
34
|
+
*/
|
|
35
|
+
async process(buffer: Buffer, filename: string): Promise<ProcessedDocument> {
|
|
36
|
+
if (buffer.length > MAX_FILE_SIZE) {
|
|
37
|
+
throw new Error(`File too large: ${(buffer.length / 1024 / 1024).toFixed(1)}MB (max 50MB)`);
|
|
38
|
+
}
|
|
39
|
+
|
|
40
|
+
const ext = filename.split('.').pop()?.toLowerCase() || '';
|
|
41
|
+
let rawText: string;
|
|
42
|
+
|
|
43
|
+
switch (ext) {
|
|
44
|
+
case 'pdf':
|
|
45
|
+
rawText = await this.parsePDF(buffer);
|
|
46
|
+
break;
|
|
47
|
+
case 'docx':
|
|
48
|
+
rawText = await this.parseDOCX(buffer);
|
|
49
|
+
break;
|
|
50
|
+
case 'csv':
|
|
51
|
+
rawText = this.parseCSV(buffer.toString('utf-8'));
|
|
52
|
+
break;
|
|
53
|
+
case 'json':
|
|
54
|
+
rawText = this.parseJSON(buffer.toString('utf-8'));
|
|
55
|
+
break;
|
|
56
|
+
case 'txt':
|
|
57
|
+
case 'md':
|
|
58
|
+
case 'markdown':
|
|
59
|
+
rawText = buffer.toString('utf-8');
|
|
60
|
+
break;
|
|
61
|
+
default:
|
|
62
|
+
// Try as plain text
|
|
63
|
+
rawText = buffer.toString('utf-8');
|
|
64
|
+
}
|
|
65
|
+
|
|
66
|
+
const chunks = this.chunkText(rawText, filename, ext);
|
|
67
|
+
|
|
68
|
+
return {
|
|
69
|
+
id: `doc-${Date.now()}-${Math.random().toString(36).slice(2, 8)}`,
|
|
70
|
+
filename,
|
|
71
|
+
format: ext,
|
|
72
|
+
size: buffer.length,
|
|
73
|
+
chunks,
|
|
74
|
+
processedAt: new Date().toISOString(),
|
|
75
|
+
};
|
|
76
|
+
}
|
|
77
|
+
|
|
78
|
+
private async parsePDF(buffer: Buffer): Promise<string> {
|
|
79
|
+
try {
|
|
80
|
+
const pdfParse = require('pdf-parse');
|
|
81
|
+
const data = await pdfParse(buffer);
|
|
82
|
+
return data.text || '';
|
|
83
|
+
} catch (e: any) {
|
|
84
|
+
throw new Error(`PDF parse failed: ${e.message}`);
|
|
85
|
+
}
|
|
86
|
+
}
|
|
87
|
+
|
|
88
|
+
private async parseDOCX(buffer: Buffer): Promise<string> {
|
|
89
|
+
try {
|
|
90
|
+
const mammoth = require('mammoth');
|
|
91
|
+
const result = await mammoth.extractRawText({ buffer });
|
|
92
|
+
return result.value || '';
|
|
93
|
+
} catch (e: any) {
|
|
94
|
+
throw new Error(`DOCX parse failed: ${e.message}`);
|
|
95
|
+
}
|
|
96
|
+
}
|
|
97
|
+
|
|
98
|
+
private parseCSV(text: string): string {
|
|
99
|
+
const lines = text.split('\n').filter(l => l.trim());
|
|
100
|
+
if (lines.length === 0) return '';
|
|
101
|
+
|
|
102
|
+
const headers = lines[0].split(',').map(h => h.trim().replace(/^"|"$/g, ''));
|
|
103
|
+
const rows = lines.slice(1);
|
|
104
|
+
|
|
105
|
+
// Convert CSV to readable text
|
|
106
|
+
return rows.map((row, i) => {
|
|
107
|
+
const values = this.parseCSVLine(row);
|
|
108
|
+
const pairs = headers.map((h, j) => `${h}: ${values[j] || ''}`);
|
|
109
|
+
return `Record ${i + 1}:\n${pairs.join('\n')}`;
|
|
110
|
+
}).join('\n\n');
|
|
111
|
+
}
|
|
112
|
+
|
|
113
|
+
private parseCSVLine(line: string): string[] {
|
|
114
|
+
const result: string[] = [];
|
|
115
|
+
let current = '';
|
|
116
|
+
let inQuotes = false;
|
|
117
|
+
for (const ch of line) {
|
|
118
|
+
if (ch === '"') { inQuotes = !inQuotes; }
|
|
119
|
+
else if (ch === ',' && !inQuotes) { result.push(current.trim()); current = ''; }
|
|
120
|
+
else { current += ch; }
|
|
121
|
+
}
|
|
122
|
+
result.push(current.trim());
|
|
123
|
+
return result;
|
|
124
|
+
}
|
|
125
|
+
|
|
126
|
+
private parseJSON(text: string): string {
|
|
127
|
+
try {
|
|
128
|
+
const data = JSON.parse(text);
|
|
129
|
+
if (Array.isArray(data)) {
|
|
130
|
+
return data.map((item, i) => `Item ${i + 1}:\n${JSON.stringify(item, null, 2)}`).join('\n\n');
|
|
131
|
+
}
|
|
132
|
+
return JSON.stringify(data, null, 2);
|
|
133
|
+
} catch {
|
|
134
|
+
return text;
|
|
135
|
+
}
|
|
136
|
+
}
|
|
137
|
+
|
|
138
|
+
/**
|
|
139
|
+
* Smart chunking: split by headings/paragraphs, respecting size limits
|
|
140
|
+
*/
|
|
141
|
+
private chunkText(text: string, filename: string, format: string): DocumentChunk[] {
|
|
142
|
+
if (!text.trim()) return [];
|
|
143
|
+
|
|
144
|
+
// Split by markdown headings or double newlines
|
|
145
|
+
const sections = text.split(/\n(?=#{1,3}\s)|(?:\n\s*\n)/).filter(s => s.trim());
|
|
146
|
+
const chunks: DocumentChunk[] = [];
|
|
147
|
+
let currentChunk = '';
|
|
148
|
+
let currentTitle = filename;
|
|
149
|
+
|
|
150
|
+
for (const section of sections) {
|
|
151
|
+
const headingMatch = section.match(/^(#{1,3})\s+(.+)/);
|
|
152
|
+
if (headingMatch) {
|
|
153
|
+
currentTitle = headingMatch[2].trim();
|
|
154
|
+
}
|
|
155
|
+
|
|
156
|
+
if (currentChunk.length + section.length > CHUNK_MAX_CHARS && currentChunk.length > 0) {
|
|
157
|
+
chunks.push({
|
|
158
|
+
title: currentTitle,
|
|
159
|
+
content: currentChunk.trim(),
|
|
160
|
+
metadata: { source: filename, format, chunkIndex: chunks.length },
|
|
161
|
+
});
|
|
162
|
+
currentChunk = '';
|
|
163
|
+
}
|
|
164
|
+
|
|
165
|
+
currentChunk += section + '\n\n';
|
|
166
|
+
|
|
167
|
+
if (currentChunk.length >= CHUNK_TARGET_CHARS) {
|
|
168
|
+
chunks.push({
|
|
169
|
+
title: currentTitle,
|
|
170
|
+
content: currentChunk.trim(),
|
|
171
|
+
metadata: { source: filename, format, chunkIndex: chunks.length },
|
|
172
|
+
});
|
|
173
|
+
currentChunk = '';
|
|
174
|
+
}
|
|
175
|
+
}
|
|
176
|
+
|
|
177
|
+
if (currentChunk.trim()) {
|
|
178
|
+
chunks.push({
|
|
179
|
+
title: currentTitle,
|
|
180
|
+
content: currentChunk.trim(),
|
|
181
|
+
metadata: { source: filename, format, chunkIndex: chunks.length },
|
|
182
|
+
});
|
|
183
|
+
}
|
|
184
|
+
|
|
185
|
+
// If we got no chunks from section splitting (e.g. dense text), force-split
|
|
186
|
+
if (chunks.length === 0 && text.trim()) {
|
|
187
|
+
const words = text.split(/\s+/);
|
|
188
|
+
let buf = '';
|
|
189
|
+
for (const w of words) {
|
|
190
|
+
if (buf.length + w.length + 1 > CHUNK_MAX_CHARS && buf) {
|
|
191
|
+
chunks.push({
|
|
192
|
+
title: filename,
|
|
193
|
+
content: buf.trim(),
|
|
194
|
+
metadata: { source: filename, format, chunkIndex: chunks.length },
|
|
195
|
+
});
|
|
196
|
+
buf = '';
|
|
197
|
+
}
|
|
198
|
+
buf += w + ' ';
|
|
199
|
+
}
|
|
200
|
+
if (buf.trim()) {
|
|
201
|
+
chunks.push({
|
|
202
|
+
title: filename,
|
|
203
|
+
content: buf.trim(),
|
|
204
|
+
metadata: { source: filename, format, chunkIndex: chunks.length },
|
|
205
|
+
});
|
|
206
|
+
}
|
|
207
|
+
}
|
|
208
|
+
|
|
209
|
+
// Set totalChunks
|
|
210
|
+
for (const c of chunks) c.metadata.totalChunks = chunks.length;
|
|
211
|
+
return chunks;
|
|
212
|
+
}
|
|
213
|
+
}
|