morpheus-cli 0.9.5 → 0.9.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +63 -43
- package/dist/channels/discord.js +3 -6
- package/dist/channels/telegram.js +3 -6
- package/dist/cli/commands/restart.js +15 -0
- package/dist/cli/commands/start.js +16 -0
- package/dist/config/manager.js +61 -0
- package/dist/config/paths.js +1 -0
- package/dist/config/schemas.js +11 -3
- package/dist/http/api.js +3 -0
- package/dist/http/routers/link.js +239 -0
- package/dist/http/routers/skills.js +1 -8
- package/dist/runtime/apoc.js +1 -1
- package/dist/runtime/audit/repository.js +1 -1
- package/dist/runtime/link-chunker.js +214 -0
- package/dist/runtime/link-repository.js +301 -0
- package/dist/runtime/link-search.js +298 -0
- package/dist/runtime/link-worker.js +284 -0
- package/dist/runtime/link.js +295 -0
- package/dist/runtime/memory/sati/service.js +1 -1
- package/dist/runtime/neo.js +1 -1
- package/dist/runtime/oracle.js +81 -44
- package/dist/runtime/scaffold.js +4 -17
- package/dist/runtime/skills/__tests__/loader.test.js +7 -10
- package/dist/runtime/skills/__tests__/registry.test.js +2 -18
- package/dist/runtime/skills/__tests__/tool.test.js +55 -224
- package/dist/runtime/skills/index.js +1 -2
- package/dist/runtime/skills/loader.js +0 -2
- package/dist/runtime/skills/registry.js +8 -20
- package/dist/runtime/skills/schema.js +0 -4
- package/dist/runtime/skills/tool.js +42 -209
- package/dist/runtime/smiths/delegator.js +1 -1
- package/dist/runtime/smiths/registry.js +1 -1
- package/dist/runtime/tasks/worker.js +12 -44
- package/dist/runtime/trinity.js +1 -1
- package/dist/types/config.js +14 -0
- package/dist/ui/assets/AuditDashboard-93LCGHG1.js +1 -0
- package/dist/ui/assets/{Chat-BNtutgja.js → Chat-CK5sNcQ1.js} +8 -8
- package/dist/ui/assets/{Chronos-3C8RPZcl.js → Chronos-m2h--GEe.js} +1 -1
- package/dist/ui/assets/{ConfirmationModal-ZQPBeJ2Z.js → ConfirmationModal-Dd5pUJme.js} +1 -1
- package/dist/ui/assets/{Dashboard-CqkHzr2F.js → Dashboard-ODwl7d-a.js} +1 -1
- package/dist/ui/assets/{DeleteConfirmationModal-CioxFWn_.js → DeleteConfirmationModal-CCcojDmr.js} +1 -1
- package/dist/ui/assets/Documents-dWnSoxFO.js +7 -0
- package/dist/ui/assets/{Logs-DBVanS0O.js → Logs-Dc9Z2LBj.js} +1 -1
- package/dist/ui/assets/{MCPManager-vXfL3P2U.js → MCPManager-CMkb8vMn.js} +1 -1
- package/dist/ui/assets/{ModelPricing-DyfdunLT.js → ModelPricing-DtHPPbEQ.js} +1 -1
- package/dist/ui/assets/{Notifications-VL-vep6d.js → Notifications-BPvo-DWP.js} +1 -1
- package/dist/ui/assets/{Pagination-oTGieBLM.js → Pagination-BHZKk42X.js} +1 -1
- package/dist/ui/assets/{SatiMemories-jaadkW0U.js → SatiMemories-BUPu1Lxr.js} +1 -1
- package/dist/ui/assets/SessionAudit-CFKF4DA8.js +9 -0
- package/dist/ui/assets/Settings-C4JrXfsR.js +47 -0
- package/dist/ui/assets/{Skills-DE3zziXL.js → Skills-BUlvJgJ4.js} +1 -1
- package/dist/ui/assets/{Smiths-pmogN1mU.js → Smiths-CDtJdY0I.js} +1 -1
- package/dist/ui/assets/{Tasks-Bs8s34Jc.js → Tasks-DK_cOsNK.js} +1 -1
- package/dist/ui/assets/{TrinityDatabases-D7uihcdp.js → TrinityDatabases-X07by-19.js} +1 -1
- package/dist/ui/assets/{UsageStats-B9gePLZ0.js → UsageStats-dYcgckLq.js} +1 -1
- package/dist/ui/assets/{WebhookManager-B2L3rCLM.js → WebhookManager-DDw5eX2R.js} +1 -1
- package/dist/ui/assets/{audit-Cggeu9mM.js → audit-DZ5WLUEm.js} +1 -1
- package/dist/ui/assets/{chronos-D3-sWhfU.js → chronos-B_HI4mlq.js} +1 -1
- package/dist/ui/assets/{config-CBqRUPgn.js → config-B-YxlVrc.js} +1 -1
- package/dist/ui/assets/index-DVjwJ8jT.css +1 -0
- package/dist/ui/assets/{index-zKplfrXZ.js → index-DfJwcKqG.js} +5 -5
- package/dist/ui/assets/{mcp-uL1R9hyA.js → mcp-k-_pwbqA.js} +1 -1
- package/dist/ui/assets/{skills-jmw8yTJs.js → skills-xMXangks.js} +1 -1
- package/dist/ui/assets/{stats-HOms6GnM.js → stats-C4QZIv5O.js} +1 -1
- package/dist/ui/assets/{vendor-icons-DMd9RGvJ.js → vendor-icons-NHF9HNeN.js} +1 -1
- package/dist/ui/index.html +3 -3
- package/dist/ui/sw.js +1 -1
- package/package.json +3 -1
- package/dist/runtime/__tests__/keymaker.test.js +0 -148
- package/dist/runtime/keymaker.js +0 -157
- package/dist/ui/assets/AuditDashboard-DliJ1CX0.js +0 -1
- package/dist/ui/assets/SessionAudit-BsXrWlwz.js +0 -9
- package/dist/ui/assets/Settings-B4eezRcg.js +0 -47
- package/dist/ui/assets/index-D4fzIKy1.css +0 -1
|
@@ -0,0 +1,239 @@
|
|
|
1
|
+
import { Router } from 'express';
|
|
2
|
+
import multer from 'multer';
|
|
3
|
+
import path from 'path';
|
|
4
|
+
import fs from 'fs-extra';
|
|
5
|
+
import { homedir } from 'os';
|
|
6
|
+
import { LinkRepository } from '../../runtime/link-repository.js';
|
|
7
|
+
import { LinkWorker } from '../../runtime/link-worker.js';
|
|
8
|
+
import { ConfigManager } from '../../config/manager.js';
|
|
9
|
+
// All uploaded documents land in ~/.morpheus/docs; the link worker scans
// this directory to index new files.
const DOCS_PATH = path.join(homedir(), '.morpheus', 'docs');
// Configure multer for file uploads
const storage = multer.diskStorage({
    destination: async (req, file, cb) => {
        // Create the docs directory on first use so uploads never fail
        // on a fresh install.
        await fs.ensureDir(DOCS_PATH);
        cb(null, DOCS_PATH);
    },
    filename: (req, file, cb) => {
        // Multer decodes originalname as Latin1 per HTTP spec.
        // Re-encode to get the raw bytes and decode as UTF-8.
        const fixedName = Buffer.from(file.originalname, 'latin1').toString('utf-8');
        cb(null, fixedName);
    },
});
// NOTE(review): this module-level `upload` instance appears unused — the
// upload route below builds its own multer instance so the size limit can
// come from config. Candidate for removal; confirm nothing else references it.
const upload = multer({
    storage,
    limits: {
        fileSize: 50 * 1024 * 1024, // 50MB default, will check config
    },
    fileFilter: (req, file, cb) => {
        // Decode the filename the same way as storage.filename before
        // checking the extension.
        const name = Buffer.from(file.originalname, 'latin1').toString('utf-8');
        const ext = path.extname(name).toLowerCase();
        const allowed = ['.pdf', '.txt', '.md', '.docx'];
        if (allowed.includes(ext)) {
            cb(null, true);
        }
        else {
            cb(new Error(`Unsupported file type: ${ext}. Allowed: ${allowed.join(', ')}`));
        }
    },
});
|
|
40
|
+
/**
|
|
41
|
+
* Create the Link router for document management.
|
|
42
|
+
*/
|
|
43
|
+
/**
 * Create the Link router for document management.
 *
 * Endpoints (mounted under /api/link):
 *   GET    /documents              - list documents (+ aggregate stats)
 *   GET    /documents/:id          - fetch one document with its chunks
 *   POST   /documents/upload       - upload a file and trigger indexing
 *   DELETE /documents/:id          - delete a document (and its file on disk)
 *   POST   /documents/:id/reindex  - force re-parse/re-index of a document
 *   GET    /config                 - read Link configuration
 *   POST   /config                 - partial-update Link configuration
 *   POST   /worker/scan            - trigger a manual scan
 *   GET    /worker/status          - worker status + repository stats
 */
export function createLinkRouter() {
    const router = Router();
    const repository = LinkRepository.getInstance();
    const worker = LinkWorker.getInstance();
    // GET /api/link/documents - List all documents
    router.get('/documents', (req, res) => {
        try {
            const status = req.query.status;
            const documents = repository.listDocuments(status);
            const stats = repository.getStats();
            res.json({
                documents,
                stats,
            });
        }
        catch (err) {
            res.status(500).json({ error: err.message });
        }
    });
    // GET /api/link/documents/:id - Get single document
    router.get('/documents/:id', (req, res) => {
        try {
            const document = repository.getDocument(req.params.id);
            if (!document) {
                return res.status(404).json({ error: 'Document not found' });
            }
            // Also fetch chunks
            const chunks = repository.getChunksByDocument(req.params.id);
            res.json({ document, chunks });
        }
        catch (err) {
            res.status(500).json({ error: err.message });
        }
    });
    // POST /api/link/documents/upload - Upload a new document
    router.post('/documents/upload', async (req, res) => {
        try {
            const config = ConfigManager.getInstance().getLinkConfig();
            const maxSizeMB = config.max_file_size_mb;
            // Build a per-request multer instance so the size limit always
            // reflects the current configuration.
            const uploadWithConfig = multer({
                storage,
                limits: { fileSize: maxSizeMB * 1024 * 1024 },
                fileFilter: (req, file, cb) => {
                    const name = Buffer.from(file.originalname, 'latin1').toString('utf-8');
                    const ext = path.extname(name).toLowerCase();
                    const allowed = ['.pdf', '.txt', '.md', '.docx'];
                    if (allowed.includes(ext)) {
                        cb(null, true);
                    }
                    else {
                        cb(new Error(`Unsupported file type: ${ext}`));
                    }
                },
            });
            // Adapt multer's callback API to a promise. A rejected upload
            // (unsupported type, file too large) is a client error, so it is
            // reported as 400 here instead of falling through to the generic
            // 500 handler below.
            try {
                await new Promise((resolve, reject) => {
                    uploadWithConfig.single('file')(req, res, (err) => {
                        if (err)
                            reject(err);
                        else
                            resolve();
                    });
                });
            }
            catch (err) {
                return res.status(400).json({ error: err.message });
            }
            if (!req.file) {
                return res.status(400).json({ error: 'No file uploaded' });
            }
            // Trigger immediate scan
            const result = await worker.tick();
            res.json({
                message: 'File uploaded successfully',
                filename: Buffer.from(req.file.originalname, 'latin1').toString('utf-8'),
                path: req.file.path,
                indexed: result.indexed,
            });
        }
        catch (err) {
            res.status(500).json({ error: err.message });
        }
    });
    // DELETE /api/link/documents/:id - Delete a document
    router.delete('/documents/:id', async (req, res) => {
        try {
            const document = repository.getDocument(req.params.id);
            if (!document) {
                return res.status(404).json({ error: 'Document not found' });
            }
            // Delete from repository (CASCADE removes chunks and embeddings)
            const deleted = repository.deleteDocument(req.params.id);
            // Also delete file from disk
            try {
                await fs.unlink(document.file_path);
            }
            catch {
                // File may not exist, ignore
            }
            res.json({ message: 'Document deleted', deleted });
        }
        catch (err) {
            res.status(500).json({ error: err.message });
        }
    });
    // POST /api/link/documents/:id/reindex - Force reindex a document
    router.post('/documents/:id/reindex', async (req, res) => {
        try {
            const document = repository.getDocument(req.params.id);
            if (!document) {
                return res.status(404).json({ error: 'Document not found' });
            }
            // Check if file still exists
            const exists = await fs.pathExists(document.file_path);
            if (!exists) {
                return res.status(400).json({ error: 'Document file no longer exists' });
            }
            // Reset status to pending and trigger processing
            repository.updateDocumentStatus(req.params.id, 'pending');
            // Process the document
            const result = await worker.processDocument(document.file_path);
            res.json({
                message: 'Document reindexed',
                result,
            });
        }
        catch (err) {
            res.status(500).json({ error: err.message });
        }
    });
    // GET /api/link/config - Get Link configuration
    router.get('/config', (req, res) => {
        try {
            const config = ConfigManager.getInstance().getLinkConfig();
            res.json(config);
        }
        catch (err) {
            res.status(500).json({ error: err.message });
        }
    });
    // POST /api/link/config - Update Link configuration (partial update)
    router.post('/config', async (req, res) => {
        try {
            const configManager = ConfigManager.getInstance();
            const currentConfig = configManager.get();
            const currentLinkConfig = configManager.getLinkConfig();
            const updates = req.body;
            // Merge updates with current config (ensuring all required fields are present)
            const newLinkConfig = {
                ...currentLinkConfig,
                ...updates,
            };
            // Save to zaion.yaml
            await configManager.save({
                ...currentConfig,
                link: newLinkConfig,
            });
            // Update worker interval if changed
            if (updates.scan_interval_ms) {
                worker.updateInterval(updates.scan_interval_ms);
            }
            res.json({
                message: 'Configuration updated',
                config: configManager.getLinkConfig(),
            });
        }
        catch (err) {
            res.status(500).json({ error: err.message });
        }
    });
    // POST /api/link/worker/scan - Trigger manual scan
    router.post('/worker/scan', async (req, res) => {
        try {
            const result = await worker.tick();
            res.json({
                message: 'Scan completed',
                ...result,
            });
        }
        catch (err) {
            res.status(500).json({ error: err.message });
        }
    });
    // GET /api/link/worker/status - Get worker status
    router.get('/worker/status', (req, res) => {
        try {
            const config = ConfigManager.getInstance().getLinkConfig();
            const stats = repository.getStats();
            res.json({
                running: true, // Worker is always running when daemon is up
                scan_interval_ms: config.scan_interval_ms,
                ...stats,
            });
        }
        catch (err) {
            res.status(500).json({ error: err.message });
        }
    });
    return router;
}
|
|
@@ -4,7 +4,7 @@ import extract from 'extract-zip';
|
|
|
4
4
|
import fs from 'fs-extra';
|
|
5
5
|
import path from 'path';
|
|
6
6
|
import os from 'os';
|
|
7
|
-
import { SkillRegistry
|
|
7
|
+
import { SkillRegistry } from '../../runtime/skills/index.js';
|
|
8
8
|
import { DisplayManager } from '../../runtime/display.js';
|
|
9
9
|
import { PATHS } from '../../config/paths.js';
|
|
10
10
|
import { SkillMetadataSchema } from '../../runtime/skills/schema.js';
|
|
@@ -119,8 +119,6 @@ export function createSkillsRouter() {
|
|
|
119
119
|
try {
|
|
120
120
|
const registry = SkillRegistry.getInstance();
|
|
121
121
|
const result = await registry.reload();
|
|
122
|
-
// Update skill_delegate tool description with new skills
|
|
123
|
-
updateSkillDelegateDescription();
|
|
124
122
|
display.log(`Skills reloaded: ${result.skills.length} loaded, ${result.errors.length} errors`, {
|
|
125
123
|
source: 'SkillsAPI',
|
|
126
124
|
});
|
|
@@ -207,7 +205,6 @@ export function createSkillsRouter() {
|
|
|
207
205
|
// Reload skills
|
|
208
206
|
const registry = SkillRegistry.getInstance();
|
|
209
207
|
await registry.reload();
|
|
210
|
-
updateSkillDelegateDescription();
|
|
211
208
|
display.log(`Skill "${metadata.name}" uploaded successfully`, { source: 'SkillsAPI' });
|
|
212
209
|
res.json({
|
|
213
210
|
success: true,
|
|
@@ -258,8 +255,6 @@ export function createSkillsRouter() {
|
|
|
258
255
|
if (!success) {
|
|
259
256
|
return res.status(404).json({ error: `Skill "${name}" not found` });
|
|
260
257
|
}
|
|
261
|
-
// Update skill_delegate tool description
|
|
262
|
-
updateSkillDelegateDescription();
|
|
263
258
|
display.log(`Skill "${name}" enabled`, { source: 'SkillsAPI' });
|
|
264
259
|
res.json({ success: true, name, enabled: true });
|
|
265
260
|
}
|
|
@@ -277,8 +272,6 @@ export function createSkillsRouter() {
|
|
|
277
272
|
if (!success) {
|
|
278
273
|
return res.status(404).json({ error: `Skill "${name}" not found` });
|
|
279
274
|
}
|
|
280
|
-
// Update skill_delegate tool description
|
|
281
|
-
updateSkillDelegateDescription();
|
|
282
275
|
display.log(`Skill "${name}" disabled`, { source: 'SkillsAPI' });
|
|
283
276
|
res.json({ success: true, name, enabled: false });
|
|
284
277
|
}
|
package/dist/runtime/apoc.js
CHANGED
|
@@ -259,7 +259,7 @@ ${context ? `CONTEXT FROM ORACLE:\n${context}` : ""}
|
|
|
259
259
|
try {
|
|
260
260
|
const inputCount = messages.length;
|
|
261
261
|
const startMs = Date.now();
|
|
262
|
-
const response = await this.agent.invoke({ messages }, { recursionLimit:
|
|
262
|
+
const response = await this.agent.invoke({ messages }, { recursionLimit: 10 });
|
|
263
263
|
const durationMs = Date.now() - startMs;
|
|
264
264
|
const apocConfig = this.config.apoc || this.config.llm;
|
|
265
265
|
const lastMessage = response.messages[response.messages.length - 1];
|
|
@@ -188,7 +188,7 @@ export class AuditRepository {
|
|
|
188
188
|
SUM(CASE WHEN ae.event_type = 'llm_call' THEN 1 ELSE 0 END) as llmCallCount,
|
|
189
189
|
SUM(CASE WHEN ae.event_type = 'tool_call' THEN 1 ELSE 0 END) as toolCallCount,
|
|
190
190
|
SUM(CASE WHEN ae.event_type = 'mcp_tool' THEN 1 ELSE 0 END) as mcpToolCount,
|
|
191
|
-
SUM(CASE WHEN ae.event_type = '
|
|
191
|
+
SUM(CASE WHEN ae.event_type = 'skill_loaded' THEN 1 ELSE 0 END) as skillCount,
|
|
192
192
|
SUM(CASE WHEN ae.event_type = 'memory_recovery' THEN 1 ELSE 0 END) as memoryRecoveryCount,
|
|
193
193
|
SUM(CASE WHEN ae.event_type = 'memory_persist' THEN 1 ELSE 0 END) as memoryPersistCount,
|
|
194
194
|
SUM(CASE WHEN ae.event_type = 'chronos_job' THEN 1 ELSE 0 END) as chronosJobCount,
|
|
@@ -0,0 +1,214 @@
|
|
|
1
|
+
import { createHash } from 'crypto';
|
|
2
|
+
import fs from 'fs-extra';
|
|
3
|
+
import path from 'path';
|
|
4
|
+
import { PDFParse } from 'pdf-parse';
|
|
5
|
+
import mammoth from 'mammoth';
|
|
6
|
+
// ─── Hashing ─────────────────────────────────────────────────────────────────
/**
 * Calculate SHA-256 hash of file content.
 *
 * @param content - string or Buffer to hash
 * @returns lowercase hex digest
 */
export function hashDocument(content) {
    const hasher = createHash('sha256');
    hasher.update(content);
    return hasher.digest('hex');
}
|
|
13
|
+
/**
 * Calculate SHA-256 hash of a file by path.
 *
 * @param filePath - absolute or relative path to the file
 * @returns lowercase hex digest of the raw file bytes
 */
export async function hashFile(filePath) {
    const bytes = await fs.readFile(filePath);
    return hashDocument(bytes);
}
|
|
20
|
+
/**
 * Split text into chunks respecting sentence boundaries.
 *
 * Paragraphs (separated by blank lines) are packed into chunks of roughly
 * `chunkSize` characters; a paragraph longer than `chunkSize` is split
 * further on sentence boundaries via splitBySentences().
 *
 * @param text - The text to chunk
 * @param chunkSize - Target size in characters (default: 500)
 * @param minChunkSize - Minimum chunk size to avoid tiny chunks (default: 100)
 * @returns Array of { content, position, char_start, char_end } where
 *          char_start/char_end are approximate offsets into `text`
 *          (trimming may drop a few characters at the edges).
 */
export function chunkText(text, chunkSize = 500, minChunkSize = 100) {
    const chunks = [];
    let position = 0;
    // Approximate offset of the current paragraph within `text`.
    let charPos = 0;
    // Split by paragraphs first
    const paragraphs = text.split(/\n\n+/);
    let currentChunk = '';
    let chunkStart = 0;
    for (const paragraph of paragraphs) {
        // If adding this paragraph exceeds chunk size, flush the current chunk
        if (currentChunk.length + paragraph.length + 2 > chunkSize && currentChunk.length >= minChunkSize) {
            chunks.push({
                content: currentChunk.trim(),
                position: position++,
                char_start: chunkStart,
                char_end: chunkStart + currentChunk.length,
            });
            currentChunk = paragraph;
            chunkStart = charPos;
        }
        else if (paragraph.length > chunkSize) {
            // Paragraph is too long on its own: flush what we have, then
            // split the paragraph by sentences.
            if (currentChunk.length > 0) {
                chunks.push({
                    content: currentChunk.trim(),
                    position: position++,
                    char_start: chunkStart,
                    char_end: chunkStart + currentChunk.length,
                });
                currentChunk = '';
            }
            const sentences = splitBySentences(paragraph);
            let sentenceChunk = '';
            let sentenceStart = charPos;
            // Running offset of the next sentence. Unlike the previous
            // indexOf-based lookup this stays correct when the same sentence
            // appears more than once in a paragraph.
            let sentenceOffset = charPos;
            for (const sentence of sentences) {
                if (sentenceChunk.length + sentence.length + 1 > chunkSize && sentenceChunk.length >= minChunkSize) {
                    chunks.push({
                        content: sentenceChunk.trim(),
                        position: position++,
                        char_start: sentenceStart,
                        char_end: sentenceStart + sentenceChunk.length,
                    });
                    sentenceChunk = sentence;
                    sentenceStart = sentenceOffset;
                }
                else {
                    if (!sentenceChunk) {
                        sentenceStart = sentenceOffset;
                    }
                    sentenceChunk += (sentenceChunk ? ' ' : '') + sentence;
                }
                sentenceOffset += sentence.length + 1; // +1 for the joining space
            }
            if (sentenceChunk.trim()) {
                // Carry the tail over so it can merge with following paragraphs.
                currentChunk = sentenceChunk;
                chunkStart = sentenceStart;
            }
        }
        else {
            // Record where a freshly started chunk begins BEFORE appending.
            // (The original checked after appending, so the condition could
            // never fire and char_start went stale.)
            if (!currentChunk) {
                chunkStart = charPos;
            }
            // Add paragraph to current chunk
            currentChunk += (currentChunk ? '\n\n' : '') + paragraph;
        }
        charPos += paragraph.length + 2; // +2 for paragraph separator
    }
    // Don't forget the last chunk
    if (currentChunk.trim()) {
        chunks.push({
            content: currentChunk.trim(),
            position,
            char_start: chunkStart,
            char_end: chunkStart + currentChunk.length,
        });
    }
    return chunks;
}
|
|
102
|
+
/**
 * Split text by sentences using common sentence delimiters.
 *
 * A sentence is a run of non-terminator characters followed by one or more
 * of . ! ? and then whitespace or end-of-input. Text with no terminators is
 * returned as a single sentence.
 */
function splitBySentences(text) {
    const matched = text.match(/[^.!?]*[.!?]+(?:\s+|$)/g);
    const pieces = matched ?? [text];
    const result = [];
    for (const piece of pieces) {
        const trimmed = piece.trim();
        if (trimmed) {
            result.push(trimmed);
        }
    }
    return result;
}
|
|
110
|
+
/**
 * Parse PDF file and extract text.
 *
 * @param filePath - path to the .pdf file
 * @returns { text, metadata: { pageCount, wordCount } }
 */
export async function parsePDF(filePath) {
    const raw = await fs.readFile(filePath);
    const parser = new PDFParse({ data: raw });
    const textResult = await parser.getText();
    const text = textResult.text || '';
    const infoResult = await parser.getInfo();
    const wordCount = text.split(/\s+/).filter(Boolean).length;
    return {
        text,
        metadata: {
            // NOTE(review): `total` is assumed to be the page count — confirm
            // against the pdf-parse getInfo() result shape.
            pageCount: infoResult.total,
            wordCount,
        },
    };
}
|
|
127
|
+
/**
 * Parse DOCX file and extract text.
 *
 * @param filePath - path to the .docx file
 * @returns { text, metadata: { wordCount } }
 */
export async function parseDOCX(filePath) {
    const { value: text } = await mammoth.extractRawText({ path: filePath });
    const words = text.split(/\s+/).filter(Boolean);
    return {
        text,
        metadata: { wordCount: words.length },
    };
}
|
|
140
|
+
/**
 * Parse plain text file.
 *
 * @param filePath - path to the .txt file
 * @returns { text, metadata: { wordCount } }
 */
export async function parseTXT(filePath) {
    const text = await fs.readFile(filePath, 'utf-8');
    const words = text.split(/\s+/).filter(Boolean);
    return { text, metadata: { wordCount: words.length } };
}
|
|
152
|
+
/**
 * Parse Markdown file (treated as plain text for chunking).
 *
 * Markdown needs no special handling here, so this delegates to the
 * plain-text parser.
 */
export async function parseMD(filePath) {
    const parsed = await parseTXT(filePath);
    return parsed;
}
|
|
158
|
+
// ─── Supported Formats ───────────────────────────────────────────────────────
const SUPPORTED_EXTENSIONS = ['.pdf', '.txt', '.md', '.docx'];
/**
 * Check if a file extension is supported.
 *
 * Comparison is case-insensitive (".PDF" counts as supported).
 */
export function isSupportedFormat(filePath) {
    return SUPPORTED_EXTENSIONS.includes(path.extname(filePath).toLowerCase());
}
|
|
167
|
+
/**
 * Get the content type based on file extension.
 *
 * Unknown extensions map to the generic 'application/octet-stream'.
 */
export function getContentType(filePath) {
    switch (path.extname(filePath).toLowerCase()) {
        case '.pdf':
            return 'application/pdf';
        case '.txt':
            return 'text/plain';
        case '.md':
            return 'text/markdown';
        case '.docx':
            return 'application/vnd.openxmlformats-officedocument.wordprocessingml.document';
        default:
            return 'application/octet-stream';
    }
}
|
|
180
|
+
/**
 * Parse a document based on its file extension.
 *
 * @throws Error for unsupported file formats
 */
export async function parseDocument(filePath) {
    const ext = path.extname(filePath).toLowerCase();
    if (ext === '.pdf') {
        return parsePDF(filePath);
    }
    if (ext === '.docx') {
        return parseDOCX(filePath);
    }
    if (ext === '.txt') {
        return parseTXT(filePath);
    }
    if (ext === '.md') {
        return parseMD(filePath);
    }
    throw new Error(`Unsupported file format: ${ext}`);
}
|
|
198
|
+
/**
 * Process a document: parse, chunk, and return chunks with metadata.
 *
 * @param filePath - path to the document on disk
 * @param chunkSize - target chunk size in characters (default: 500)
 * @returns { text, chunks, hash, metadata }
 */
export async function processDocument(filePath, chunkSize = 500) {
    // Parsing and hashing are independent reads of the same file, so run
    // them in parallel. The hash covers the raw file bytes and must match
    // hashFile() used by callers for change detection.
    const [parsed, hash] = await Promise.all([
        parseDocument(filePath),
        hashFile(filePath),
    ]);
    // Chunk text
    const chunks = chunkText(parsed.text, chunkSize);
    return {
        text: parsed.text,
        chunks,
        hash,
        metadata: parsed.metadata,
    };
}
|