morpheus-cli 0.9.4 → 0.9.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76)
  1. package/README.md +63 -43
  2. package/dist/channels/discord.js +3 -6
  3. package/dist/channels/telegram.js +3 -6
  4. package/dist/cli/commands/restart.js +15 -0
  5. package/dist/cli/commands/start.js +16 -0
  6. package/dist/config/manager.js +61 -0
  7. package/dist/config/paths.js +1 -0
  8. package/dist/config/schemas.js +11 -3
  9. package/dist/http/api.js +3 -0
  10. package/dist/http/routers/link.js +239 -0
  11. package/dist/http/routers/skills.js +1 -8
  12. package/dist/http/routers/smiths.js +14 -4
  13. package/dist/runtime/apoc.js +1 -1
  14. package/dist/runtime/audit/repository.js +1 -1
  15. package/dist/runtime/link-chunker.js +214 -0
  16. package/dist/runtime/link-repository.js +301 -0
  17. package/dist/runtime/link-search.js +298 -0
  18. package/dist/runtime/link-worker.js +284 -0
  19. package/dist/runtime/link.js +295 -0
  20. package/dist/runtime/memory/sati/service.js +1 -1
  21. package/dist/runtime/neo.js +1 -1
  22. package/dist/runtime/oracle.js +81 -44
  23. package/dist/runtime/scaffold.js +4 -17
  24. package/dist/runtime/skills/__tests__/loader.test.js +7 -10
  25. package/dist/runtime/skills/__tests__/registry.test.js +2 -18
  26. package/dist/runtime/skills/__tests__/tool.test.js +55 -224
  27. package/dist/runtime/skills/index.js +1 -2
  28. package/dist/runtime/skills/loader.js +0 -2
  29. package/dist/runtime/skills/registry.js +8 -20
  30. package/dist/runtime/skills/schema.js +0 -4
  31. package/dist/runtime/skills/tool.js +42 -209
  32. package/dist/runtime/smiths/delegator.js +1 -1
  33. package/dist/runtime/smiths/registry.js +1 -1
  34. package/dist/runtime/tasks/worker.js +12 -44
  35. package/dist/runtime/trinity.js +1 -1
  36. package/dist/types/config.js +14 -0
  37. package/dist/ui/assets/AuditDashboard-93LCGHG1.js +1 -0
  38. package/dist/ui/assets/{Chat-5AeRYuRj.js → Chat-CK5sNcQ1.js} +8 -8
  39. package/dist/ui/assets/{Chronos-BrKldYVw.js → Chronos-m2h--GEe.js} +1 -1
  40. package/dist/ui/assets/{ConfirmationModal-DsbS3XkJ.js → ConfirmationModal-Dd5pUJme.js} +1 -1
  41. package/dist/ui/assets/{Dashboard-DvrTXLdo.js → Dashboard-ODwl7d-a.js} +1 -1
  42. package/dist/ui/assets/{DeleteConfirmationModal-BfSjv04R.js → DeleteConfirmationModal-CCcojDmr.js} +1 -1
  43. package/dist/ui/assets/Documents-dWnSoxFO.js +7 -0
  44. package/dist/ui/assets/{Logs-B0ZYWs5x.js → Logs-Dc9Z2LBj.js} +1 -1
  45. package/dist/ui/assets/{MCPManager-BwHGTeNs.js → MCPManager-CMkb8vMn.js} +1 -1
  46. package/dist/ui/assets/{ModelPricing-CYhGRQr8.js → ModelPricing-DtHPPbEQ.js} +1 -1
  47. package/dist/ui/assets/{Notifications-BYMAtVMq.js → Notifications-BPvo-DWP.js} +1 -1
  48. package/dist/ui/assets/{Pagination-oTGieBLM.js → Pagination-BHZKk42X.js} +1 -1
  49. package/dist/ui/assets/{SatiMemories-I1vsYtP2.js → SatiMemories-BUPu1Lxr.js} +1 -1
  50. package/dist/ui/assets/SessionAudit-CFKF4DA8.js +9 -0
  51. package/dist/ui/assets/Settings-C4JrXfsR.js +47 -0
  52. package/dist/ui/assets/{Skills-lGU3I5DO.js → Skills-BUlvJgJ4.js} +1 -1
  53. package/dist/ui/assets/Smiths-CDtJdY0I.js +1 -0
  54. package/dist/ui/assets/{Tasks-Bz92GPWK.js → Tasks-DK_cOsNK.js} +1 -1
  55. package/dist/ui/assets/{TrinityDatabases-BUY-3j7Q.js → TrinityDatabases-X07by-19.js} +1 -1
  56. package/dist/ui/assets/{UsageStats-Dr5eSgJc.js → UsageStats-dYcgckLq.js} +1 -1
  57. package/dist/ui/assets/{WebhookManager-DIASAC-1.js → WebhookManager-DDw5eX2R.js} +1 -1
  58. package/dist/ui/assets/{audit-CcAEDbZh.js → audit-DZ5WLUEm.js} +1 -1
  59. package/dist/ui/assets/{chronos-2Z9E96_1.js → chronos-B_HI4mlq.js} +1 -1
  60. package/dist/ui/assets/{config-DdfK4DX6.js → config-B-YxlVrc.js} +1 -1
  61. package/dist/ui/assets/index-DVjwJ8jT.css +1 -0
  62. package/dist/ui/assets/{index-Dpd1Mkgp.js → index-DfJwcKqG.js} +5 -5
  63. package/dist/ui/assets/{mcp-BWMt8aY7.js → mcp-k-_pwbqA.js} +1 -1
  64. package/dist/ui/assets/{skills-D7JjK7JH.js → skills-xMXangks.js} +1 -1
  65. package/dist/ui/assets/{stats-DoIhtLot.js → stats-C4QZIv5O.js} +1 -1
  66. package/dist/ui/assets/{vendor-icons-DMd9RGvJ.js → vendor-icons-NHF9HNeN.js} +1 -1
  67. package/dist/ui/index.html +3 -3
  68. package/dist/ui/sw.js +1 -1
  69. package/package.json +3 -1
  70. package/dist/runtime/__tests__/keymaker.test.js +0 -148
  71. package/dist/runtime/keymaker.js +0 -157
  72. package/dist/ui/assets/AuditDashboard-C1f6Hbdw.js +0 -1
  73. package/dist/ui/assets/SessionAudit-BCecQWde.js +0 -9
  74. package/dist/ui/assets/Settings-Cu4D-7tb.js +0 -47
  75. package/dist/ui/assets/Smiths-DnEH3nID.js +0 -1
  76. package/dist/ui/assets/index-D4fzIKy1.css +0 -1
@@ -0,0 +1,239 @@
1
+ import { Router } from 'express';
2
+ import multer from 'multer';
3
+ import path from 'path';
4
+ import fs from 'fs-extra';
5
+ import { homedir } from 'os';
6
+ import { LinkRepository } from '../../runtime/link-repository.js';
7
+ import { LinkWorker } from '../../runtime/link-worker.js';
8
+ import { ConfigManager } from '../../config/manager.js';
9
+ const DOCS_PATH = path.join(homedir(), '.morpheus', 'docs');
10
// Configure multer for file uploads.
const storage = multer.diskStorage({
    // Ensure the docs directory exists before multer writes into it.
    // Multer does not await this function, so a failure must be reported
    // through the callback; otherwise the rejection is unhandled and the
    // callback never fires, leaving the request hanging.
    destination: async (req, file, cb) => {
        try {
            await fs.ensureDir(DOCS_PATH);
        }
        catch (err) {
            cb(err);
            return;
        }
        cb(null, DOCS_PATH);
    },
    filename: (req, file, cb) => {
        // Multer decodes originalname as Latin1 per HTTP spec.
        // Re-encode to get the raw bytes and decode as UTF-8.
        const fixedName = Buffer.from(file.originalname, 'latin1').toString('utf-8');
        // Defense in depth: keep only the final path component so a
        // client-supplied name cannot carry directory segments into DOCS_PATH.
        cb(null, path.basename(fixedName));
    },
});
// Default uploader with a fixed 50MB cap. createLinkRouter builds its own
// multer instance from the configured limit for the upload route.
const upload = multer({
    storage,
    limits: {
        fileSize: 50 * 1024 * 1024, // 50MB default, will check config
    },
    // Accept only the document extensions the Link pipeline can parse.
    fileFilter: (req, file, cb) => {
        const name = Buffer.from(file.originalname, 'latin1').toString('utf-8');
        const ext = path.extname(name).toLowerCase();
        const allowed = ['.pdf', '.txt', '.md', '.docx'];
        if (allowed.includes(ext)) {
            cb(null, true);
        }
        else {
            cb(new Error(`Unsupported file type: ${ext}. Allowed: ${allowed.join(', ')}`));
        }
    },
});
40
/**
 * Create the Link router for document management.
 *
 * Registers the document, configuration and worker endpoints on a fresh
 * Express router. Failures are reported as JSON `{ error }` bodies. Upload
 * failures caused by the request itself (unsupported extension, multer limit
 * violations) are answered with a 4xx status; everything else falls back to 500.
 *
 * @returns The configured Express router.
 */
export function createLinkRouter() {
    const router = Router();
    const repository = LinkRepository.getInstance();
    const worker = LinkWorker.getInstance();
    const allowedExtensions = ['.pdf', '.txt', '.md', '.docx'];
    // Multer decodes originalname as Latin1; re-encode to raw bytes and decode as UTF-8.
    const decodeOriginalName = (file) => Buffer.from(file.originalname, 'latin1').toString('utf-8');
    // Pick the HTTP status for a failed upload request.
    const uploadErrorStatus = (err) => {
        if (!err) {
            return 500;
        }
        // Multer reports limit violations with LIMIT_* codes.
        if (err.code === 'LIMIT_FILE_SIZE') {
            return 413;
        }
        const isMulterLimit = typeof err.code === 'string' && err.code.startsWith('LIMIT_');
        if (isMulterLimit || err.unsupportedFileType === true) {
            return 400;
        }
        return 500;
    };
    // GET /api/link/documents - List all documents
    router.get('/documents', (req, res) => {
        try {
            const status = req.query.status;
            const documents = repository.listDocuments(status);
            const stats = repository.getStats();
            res.json({
                documents,
                stats,
            });
        }
        catch (err) {
            res.status(500).json({ error: err.message });
        }
    });
    // GET /api/link/documents/:id - Get single document
    router.get('/documents/:id', (req, res) => {
        try {
            const document = repository.getDocument(req.params.id);
            if (!document) {
                return res.status(404).json({ error: 'Document not found' });
            }
            // Also fetch chunks
            const chunks = repository.getChunksByDocument(req.params.id);
            res.json({ document, chunks });
        }
        catch (err) {
            res.status(500).json({ error: err.message });
        }
    });
    // POST /api/link/documents/upload - Upload a new document
    router.post('/documents/upload', async (req, res) => {
        try {
            const config = ConfigManager.getInstance().getLinkConfig();
            const maxSizeMB = config.max_file_size_mb;
            // Configure multer with config max size
            const uploadWithConfig = multer({
                storage,
                limits: { fileSize: maxSizeMB * 1024 * 1024 },
                fileFilter: (uploadReq, file, cb) => {
                    const ext = path.extname(decodeOriginalName(file)).toLowerCase();
                    if (allowedExtensions.includes(ext)) {
                        cb(null, true);
                        return;
                    }
                    // Tag the rejection so the catch below can answer 400 instead of 500.
                    const rejection = new Error(`Unsupported file type: ${ext}`);
                    rejection.unsupportedFileType = true;
                    cb(rejection);
                },
            });
            // Handle upload: adapt multer's callback-style middleware to a promise
            await new Promise((resolve, reject) => {
                uploadWithConfig.single('file')(req, res, (err) => {
                    if (err) {
                        reject(err);
                    }
                    else {
                        resolve();
                    }
                });
            });
            if (!req.file) {
                return res.status(400).json({ error: 'No file uploaded' });
            }
            // Trigger immediate scan
            const result = await worker.tick();
            res.json({
                message: 'File uploaded successfully',
                filename: decodeOriginalName(req.file),
                path: req.file.path,
                indexed: result.indexed,
            });
        }
        catch (err) {
            res.status(uploadErrorStatus(err)).json({ error: err.message });
        }
    });
    // DELETE /api/link/documents/:id - Delete a document
    router.delete('/documents/:id', async (req, res) => {
        try {
            const document = repository.getDocument(req.params.id);
            if (!document) {
                return res.status(404).json({ error: 'Document not found' });
            }
            // Delete from repository (CASCADE removes chunks and embeddings)
            const deleted = repository.deleteDocument(req.params.id);
            // Also delete file from disk
            try {
                await fs.unlink(document.file_path);
            }
            catch {
                // File may not exist, ignore (best-effort cleanup)
            }
            res.json({ message: 'Document deleted', deleted });
        }
        catch (err) {
            res.status(500).json({ error: err.message });
        }
    });
    // POST /api/link/documents/:id/reindex - Force reindex a document
    router.post('/documents/:id/reindex', async (req, res) => {
        try {
            const document = repository.getDocument(req.params.id);
            if (!document) {
                return res.status(404).json({ error: 'Document not found' });
            }
            // Check if file still exists
            const exists = await fs.pathExists(document.file_path);
            if (!exists) {
                return res.status(400).json({ error: 'Document file no longer exists' });
            }
            // Reset status to pending and trigger processing
            repository.updateDocumentStatus(req.params.id, 'pending');
            // Process the document
            const result = await worker.processDocument(document.file_path);
            res.json({
                message: 'Document reindexed',
                result,
            });
        }
        catch (err) {
            res.status(500).json({ error: err.message });
        }
    });
    // GET /api/link/config - Get Link configuration
    router.get('/config', (req, res) => {
        try {
            const config = ConfigManager.getInstance().getLinkConfig();
            res.json(config);
        }
        catch (err) {
            res.status(500).json({ error: err.message });
        }
    });
    // POST /api/link/config - Update Link configuration (partial update)
    router.post('/config', async (req, res) => {
        try {
            // A missing body is treated as "no changes"; anything that is not a
            // plain object is rejected, because spreading an array or string
            // would write index keys into the saved configuration.
            const updates = req.body === undefined || req.body === null ? {} : req.body;
            if (typeof updates !== 'object' || Array.isArray(updates)) {
                return res.status(400).json({ error: 'Request body must be a JSON object' });
            }
            const configManager = ConfigManager.getInstance();
            const currentConfig = configManager.get();
            const currentLinkConfig = configManager.getLinkConfig();
            // Merge updates with current config (ensuring all required fields are present)
            const newLinkConfig = {
                ...currentLinkConfig,
                ...updates,
            };
            // Save to zaion.yaml
            await configManager.save({
                ...currentConfig,
                link: newLinkConfig,
            });
            // Update worker interval if changed
            if (updates.scan_interval_ms) {
                worker.updateInterval(updates.scan_interval_ms);
            }
            res.json({
                message: 'Configuration updated',
                config: configManager.getLinkConfig(),
            });
        }
        catch (err) {
            res.status(500).json({ error: err.message });
        }
    });
    // POST /api/link/worker/scan - Trigger manual scan
    router.post('/worker/scan', async (req, res) => {
        try {
            const result = await worker.tick();
            res.json({
                message: 'Scan completed',
                ...result,
            });
        }
        catch (err) {
            res.status(500).json({ error: err.message });
        }
    });
    // GET /api/link/worker/status - Get worker status
    router.get('/worker/status', (req, res) => {
        try {
            const config = ConfigManager.getInstance().getLinkConfig();
            const stats = repository.getStats();
            res.json({
                running: true, // Worker is always running when daemon is up
                scan_interval_ms: config.scan_interval_ms,
                ...stats,
            });
        }
        catch (err) {
            res.status(500).json({ error: err.message });
        }
    });
    return router;
}
@@ -4,7 +4,7 @@ import extract from 'extract-zip';
4
4
  import fs from 'fs-extra';
5
5
  import path from 'path';
6
6
  import os from 'os';
7
- import { SkillRegistry, updateSkillDelegateDescription } from '../../runtime/skills/index.js';
7
+ import { SkillRegistry } from '../../runtime/skills/index.js';
8
8
  import { DisplayManager } from '../../runtime/display.js';
9
9
  import { PATHS } from '../../config/paths.js';
10
10
  import { SkillMetadataSchema } from '../../runtime/skills/schema.js';
@@ -119,8 +119,6 @@ export function createSkillsRouter() {
119
119
  try {
120
120
  const registry = SkillRegistry.getInstance();
121
121
  const result = await registry.reload();
122
- // Update skill_delegate tool description with new skills
123
- updateSkillDelegateDescription();
124
122
  display.log(`Skills reloaded: ${result.skills.length} loaded, ${result.errors.length} errors`, {
125
123
  source: 'SkillsAPI',
126
124
  });
@@ -207,7 +205,6 @@ export function createSkillsRouter() {
207
205
  // Reload skills
208
206
  const registry = SkillRegistry.getInstance();
209
207
  await registry.reload();
210
- updateSkillDelegateDescription();
211
208
  display.log(`Skill "${metadata.name}" uploaded successfully`, { source: 'SkillsAPI' });
212
209
  res.json({
213
210
  success: true,
@@ -258,8 +255,6 @@ export function createSkillsRouter() {
258
255
  if (!success) {
259
256
  return res.status(404).json({ error: `Skill "${name}" not found` });
260
257
  }
261
- // Update skill_delegate tool description
262
- updateSkillDelegateDescription();
263
258
  display.log(`Skill "${name}" enabled`, { source: 'SkillsAPI' });
264
259
  res.json({ success: true, name, enabled: true });
265
260
  }
@@ -277,8 +272,6 @@ export function createSkillsRouter() {
277
272
  if (!success) {
278
273
  return res.status(404).json({ error: `Skill "${name}" not found` });
279
274
  }
280
- // Update skill_delegate tool description
281
- updateSkillDelegateDescription();
282
275
  display.log(`Skill "${name}" disabled`, { source: 'SkillsAPI' });
283
276
  res.json({ success: true, name, enabled: false });
284
277
  }
@@ -172,13 +172,23 @@ export function createSmithsRouter() {
172
172
  /**
173
173
  * DELETE /api/smiths/:name — Remove a Smith
174
174
  */
175
- router.delete('/:name', (req, res) => {
175
+ router.delete('/:name', async (req, res) => {
176
176
  try {
177
- const removed = registry.unregister(req.params.name);
177
+ const smithName = req.params.name;
178
+ const removed = registry.unregister(smithName);
178
179
  if (!removed) {
179
- return res.status(404).json({ error: `Smith '${req.params.name}' not found` });
180
+ return res.status(404).json({ error: `Smith '${smithName}' not found` });
180
181
  }
181
- res.json({ status: 'removed', name: req.params.name });
182
+ // Persist removal to zaion.yaml
183
+ const configManager = ConfigManager.getInstance();
184
+ const currentConfig = configManager.get();
185
+ const smithsConfig = configManager.getSmithsConfig();
186
+ const updatedEntries = smithsConfig.entries.filter(e => e.name !== smithName);
187
+ await configManager.save({
188
+ ...currentConfig,
189
+ smiths: { ...smithsConfig, entries: updatedEntries },
190
+ });
191
+ res.json({ status: 'removed', name: smithName });
182
192
  }
183
193
  catch (err) {
184
194
  res.status(500).json({ error: err.message });
@@ -259,7 +259,7 @@ ${context ? `CONTEXT FROM ORACLE:\n${context}` : ""}
259
259
  try {
260
260
  const inputCount = messages.length;
261
261
  const startMs = Date.now();
262
- const response = await this.agent.invoke({ messages }, { recursionLimit: 50 });
262
+ const response = await this.agent.invoke({ messages }, { recursionLimit: 10 });
263
263
  const durationMs = Date.now() - startMs;
264
264
  const apocConfig = this.config.apoc || this.config.llm;
265
265
  const lastMessage = response.messages[response.messages.length - 1];
@@ -188,7 +188,7 @@ export class AuditRepository {
188
188
  SUM(CASE WHEN ae.event_type = 'llm_call' THEN 1 ELSE 0 END) as llmCallCount,
189
189
  SUM(CASE WHEN ae.event_type = 'tool_call' THEN 1 ELSE 0 END) as toolCallCount,
190
190
  SUM(CASE WHEN ae.event_type = 'mcp_tool' THEN 1 ELSE 0 END) as mcpToolCount,
191
- SUM(CASE WHEN ae.event_type = 'skill_executed' THEN 1 ELSE 0 END) as skillCount,
191
+ SUM(CASE WHEN ae.event_type = 'skill_loaded' THEN 1 ELSE 0 END) as skillCount,
192
192
  SUM(CASE WHEN ae.event_type = 'memory_recovery' THEN 1 ELSE 0 END) as memoryRecoveryCount,
193
193
  SUM(CASE WHEN ae.event_type = 'memory_persist' THEN 1 ELSE 0 END) as memoryPersistCount,
194
194
  SUM(CASE WHEN ae.event_type = 'chronos_job' THEN 1 ELSE 0 END) as chronosJobCount,
@@ -0,0 +1,214 @@
1
+ import { createHash } from 'crypto';
2
+ import fs from 'fs-extra';
3
+ import path from 'path';
4
+ import { PDFParse } from 'pdf-parse';
5
+ import mammoth from 'mammoth';
6
+ // ─── Hashing ─────────────────────────────────────────────────────────────────
7
/**
 * Compute the SHA-256 digest of the given content.
 *
 * @param content - Data to hash; passed straight to `Hash.update`.
 * @returns The digest as a lowercase hex string.
 */
export function hashDocument(content) {
    const hasher = createHash('sha256');
    hasher.update(content);
    return hasher.digest('hex');
}
13
/**
 * Compute the SHA-256 digest of a file's contents.
 *
 * @param filePath - Path of the file to read.
 * @returns Promise resolving to the hex digest produced by `hashDocument`.
 */
export async function hashFile(filePath) {
    const bytes = await fs.readFile(filePath);
    const digest = hashDocument(bytes);
    return digest;
}
20
/**
 * Split text into chunks respecting paragraph and sentence boundaries.
 *
 * Paragraphs (separated by two or more newlines) are packed together, joined
 * by a blank line, until adding the next one would exceed `chunkSize`. A
 * paragraph longer than `chunkSize` is split into sentences, which are packed
 * the same way, joined by a single space. A pending chunk shorter than
 * `minChunkSize` keeps growing even when that pushes it past `chunkSize`.
 * A single sentence longer than `chunkSize` is not split further.
 *
 * `char_start` / `char_end` are offsets into `text` of the first character of
 * the chunk's first piece and the end of its last piece.
 *
 * @param text - The text to chunk
 * @param chunkSize - Target size in characters (default: 500)
 * @param minChunkSize - Minimum chunk size to avoid tiny chunks (default: 100)
 * @returns Array of `{ content, position, char_start, char_end }`, with
 *   `position` counting up from 0 in emission order.
 */
export function chunkText(text, chunkSize = 500, minChunkSize = 100) {
    const chunks = [];
    let currentChunk = '';
    let chunkStart = 0;
    let chunkEnd = 0;
    // Emit the pending chunk (when it has visible content) and reset the buffer.
    const flush = () => {
        const content = currentChunk.trim();
        if (content) {
            chunks.push({
                content,
                position: chunks.length,
                char_start: chunkStart,
                char_end: chunkEnd,
            });
        }
        currentChunk = '';
    };
    // Add a piece found at offset `start` of the source text to the pending chunk.
    const append = (piece, start, joiner) => {
        if (currentChunk) {
            currentChunk += joiner + piece;
        }
        else {
            currentChunk = piece;
            chunkStart = start;
        }
        chunkEnd = start + piece.length;
    };
    // True when the pending chunk is big enough to stand alone and the piece would not fit.
    const wouldOverflow = (piece, joinerLength) => currentChunk.length > 0 &&
        currentChunk.length >= minChunkSize &&
        currentChunk.length + piece.length + joinerLength > chunkSize;
    for (const { text: paragraph, start } of locateParagraphs(text)) {
        if (!paragraph.trim()) {
            continue;
        }
        if (wouldOverflow(paragraph, 2)) {
            flush();
        }
        if (paragraph.length <= chunkSize) {
            append(paragraph, start, '\n\n');
            continue;
        }
        // Paragraph is too long: close whatever is pending and pack it sentence by sentence.
        flush();
        let cursor = 0;
        for (const sentence of splitBySentences(paragraph)) {
            // Sentences come back in order, so searching from the previous
            // sentence's end locates this exact occurrence even when the same
            // sentence text appears earlier in the paragraph.
            const offset = paragraph.indexOf(sentence, cursor);
            cursor = offset + sentence.length;
            if (wouldOverflow(sentence, 1)) {
                flush();
            }
            append(sentence, start + offset, ' ');
        }
    }
    // Don't forget the last chunk
    flush();
    return chunks;
}
/**
 * Split text into paragraphs on runs of two or more newlines, keeping the
 * offset at which each paragraph starts in the original text.
 */
function locateParagraphs(text) {
    const paragraphs = [];
    const separator = /\n\n+/g;
    let start = 0;
    let boundary = separator.exec(text);
    while (boundary !== null) {
        paragraphs.push({ text: text.slice(start, boundary.index), start });
        start = boundary.index + boundary[0].length;
        boundary = separator.exec(text);
    }
    paragraphs.push({ text: text.slice(start), start });
    return paragraphs;
}
/**
 * Split text by sentences using common sentence delimiters.
 *
 * A sentence ends at a run of `.`, `!` or `?` that is followed by whitespace
 * or the end of the string, so punctuation inside a token (e.g. "3.14") does
 * not split. Text after the last delimiter is returned as a final sentence
 * instead of being dropped. Each sentence is trimmed; empty ones are omitted.
 */
function splitBySentences(text) {
    const sentences = [];
    const terminator = /[.!?]+(?=\s|$)/g;
    let start = 0;
    let match = terminator.exec(text);
    while (match !== null) {
        const end = match.index + match[0].length;
        const sentence = text.slice(start, end).trim();
        if (sentence) {
            sentences.push(sentence);
        }
        start = end;
        match = terminator.exec(text);
    }
    const tail = text.slice(start).trim();
    if (tail) {
        sentences.push(tail);
    }
    return sentences;
}
110
/**
 * Parse PDF file and extract text.
 *
 * @param filePath - Path of the PDF file to read.
 * @returns Promise resolving to `{ text, metadata: { pageCount, wordCount } }`,
 *   where `text` is the extracted text (empty string when none is returned),
 *   `pageCount` is the `total` reported by the parser's info result and
 *   `wordCount` is the number of whitespace-separated tokens in `text`.
 */
export async function parsePDF(filePath) {
    const dataBuffer = await fs.readFile(filePath);
    const parser = new PDFParse({ data: dataBuffer });
    try {
        const textResult = await parser.getText();
        const text = textResult.text || '';
        const infoResult = await parser.getInfo();
        return {
            text,
            metadata: {
                pageCount: infoResult.total,
                wordCount: text.split(/\s+/).filter(Boolean).length,
            },
        };
    }
    finally {
        // Release the parser's resources whether or not extraction succeeded.
        await parser.destroy();
    }
}
127
/**
 * Extract the raw text of a DOCX file via mammoth.
 *
 * @param filePath - Path of the DOCX file to read.
 * @returns Promise resolving to `{ text, metadata: { wordCount } }`, where
 *   `wordCount` is the number of whitespace-separated tokens in `text`.
 */
export async function parseDOCX(filePath) {
    const { value: text } = await mammoth.extractRawText({ path: filePath });
    const words = text.split(/\s+/).filter(Boolean);
    const metadata = { wordCount: words.length };
    return { text, metadata };
}
140
/**
 * Read a plain text file as UTF-8.
 *
 * @param filePath - Path of the text file to read.
 * @returns Promise resolving to `{ text, metadata: { wordCount } }`, where
 *   `wordCount` is the number of whitespace-separated tokens in `text`.
 */
export async function parseTXT(filePath) {
    const text = await fs.readFile(filePath, 'utf-8');
    const words = text.split(/\s+/).filter(Boolean);
    const metadata = { wordCount: words.length };
    return { text, metadata };
}
152
/**
 * Parse a Markdown file. Markdown is handled exactly like plain text: the
 * call is delegated to `parseTXT` and its result is returned unchanged.
 *
 * @param filePath - Path of the Markdown file to read.
 */
export async function parseMD(filePath) {
    const parsed = await parseTXT(filePath);
    return parsed;
}
158
+ // ─── Supported Formats ───────────────────────────────────────────────────────
159
// Lowercase extensions (including the leading dot) accepted by isSupportedFormat.
const SUPPORTED_EXTENSIONS = ['.pdf', '.txt', '.md', '.docx'];
/**
 * Tell whether a path ends in one of the supported extensions.
 * The comparison is case-insensitive.
 *
 * @param filePath - File name or path to inspect.
 * @returns `true` when the extension is listed in SUPPORTED_EXTENSIONS.
 */
export function isSupportedFormat(filePath) {
    const extension = path.extname(filePath).toLowerCase();
    return SUPPORTED_EXTENSIONS.some((known) => known === extension);
}
167
/**
 * Map a file's extension (case-insensitive) to its MIME content type.
 *
 * @param filePath - File name or path to inspect.
 * @returns The content type for .pdf, .txt, .md and .docx files, or
 *   'application/octet-stream' for any other extension.
 */
export function getContentType(filePath) {
    switch (path.extname(filePath).toLowerCase()) {
        case '.pdf':
            return 'application/pdf';
        case '.txt':
            return 'text/plain';
        case '.md':
            return 'text/markdown';
        case '.docx':
            return 'application/vnd.openxmlformats-officedocument.wordprocessingml.document';
        default:
            return 'application/octet-stream';
    }
}
180
/**
 * Parse a document with the parser matching its file extension
 * (case-insensitive): .pdf, .docx, .txt or .md.
 *
 * @param filePath - Path of the document to parse.
 * @returns Promise resolving to the selected parser's result.
 * @throws Error (as a rejection) when the extension has no parser.
 */
export async function parseDocument(filePath) {
    const ext = path.extname(filePath).toLowerCase();
    const parsers = new Map([
        ['.pdf', parsePDF],
        ['.docx', parseDOCX],
        ['.txt', parseTXT],
        ['.md', parseMD],
    ]);
    const parse = parsers.get(ext);
    if (!parse) {
        throw new Error(`Unsupported file format: ${ext}`);
    }
    return parse(filePath);
}
198
/**
 * Run the full pipeline for one document: parse it, hash the file, and chunk
 * the extracted text.
 *
 * @param filePath - Path of the document to process.
 * @param chunkSize - Target chunk size passed to `chunkText` (default: 500)
 * @returns Promise resolving to `{ text, chunks, hash, metadata }`.
 */
export async function processDocument(filePath, chunkSize = 500) {
    const { text, metadata } = await parseDocument(filePath);
    // The hash comes from the raw file bytes (must match hashFile used by the caller).
    const hash = await hashFile(filePath);
    return {
        text,
        chunks: chunkText(text, chunkSize),
        hash,
        metadata,
    };
}