wikimem 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +37 -0
- package/LICENSE +21 -0
- package/README.md +398 -0
- package/dist/cli/commands/duplicates.d.ts +3 -0
- package/dist/cli/commands/duplicates.d.ts.map +1 -0
- package/dist/cli/commands/duplicates.js +38 -0
- package/dist/cli/commands/duplicates.js.map +1 -0
- package/dist/cli/commands/improve.d.ts +3 -0
- package/dist/cli/commands/improve.d.ts.map +1 -0
- package/dist/cli/commands/improve.js +69 -0
- package/dist/cli/commands/improve.js.map +1 -0
- package/dist/cli/commands/ingest.d.ts +3 -0
- package/dist/cli/commands/ingest.d.ts.map +1 -0
- package/dist/cli/commands/ingest.js +181 -0
- package/dist/cli/commands/ingest.js.map +1 -0
- package/dist/cli/commands/init.d.ts +3 -0
- package/dist/cli/commands/init.d.ts.map +1 -0
- package/dist/cli/commands/init.js +91 -0
- package/dist/cli/commands/init.js.map +1 -0
- package/dist/cli/commands/lint.d.ts +3 -0
- package/dist/cli/commands/lint.d.ts.map +1 -0
- package/dist/cli/commands/lint.js +49 -0
- package/dist/cli/commands/lint.js.map +1 -0
- package/dist/cli/commands/query.d.ts +3 -0
- package/dist/cli/commands/query.d.ts.map +1 -0
- package/dist/cli/commands/query.js +51 -0
- package/dist/cli/commands/query.js.map +1 -0
- package/dist/cli/commands/scrape.d.ts +3 -0
- package/dist/cli/commands/scrape.d.ts.map +1 -0
- package/dist/cli/commands/scrape.js +47 -0
- package/dist/cli/commands/scrape.js.map +1 -0
- package/dist/cli/commands/serve.d.ts +3 -0
- package/dist/cli/commands/serve.d.ts.map +1 -0
- package/dist/cli/commands/serve.js +24 -0
- package/dist/cli/commands/serve.js.map +1 -0
- package/dist/cli/commands/status.d.ts +3 -0
- package/dist/cli/commands/status.d.ts.map +1 -0
- package/dist/cli/commands/status.js +30 -0
- package/dist/cli/commands/status.js.map +1 -0
- package/dist/cli/commands/watch.d.ts +3 -0
- package/dist/cli/commands/watch.d.ts.map +1 -0
- package/dist/cli/commands/watch.js +29 -0
- package/dist/cli/commands/watch.js.map +1 -0
- package/dist/cli/index.d.ts +3 -0
- package/dist/cli/index.d.ts.map +1 -0
- package/dist/cli/index.js +30 -0
- package/dist/cli/index.js.map +1 -0
- package/dist/core/config.d.ts +47 -0
- package/dist/core/config.d.ts.map +1 -0
- package/dist/core/config.js +11 -0
- package/dist/core/config.js.map +1 -0
- package/dist/core/improve.d.ts +19 -0
- package/dist/core/improve.d.ts.map +1 -0
- package/dist/core/improve.js +175 -0
- package/dist/core/improve.js.map +1 -0
- package/dist/core/index-manager.d.ts +9 -0
- package/dist/core/index-manager.d.ts.map +1 -0
- package/dist/core/index-manager.js +30 -0
- package/dist/core/index-manager.js.map +1 -0
- package/dist/core/ingest.d.ts +46 -0
- package/dist/core/ingest.d.ts.map +1 -0
- package/dist/core/ingest.js +366 -0
- package/dist/core/ingest.js.map +1 -0
- package/dist/core/lint.d.ts +19 -0
- package/dist/core/lint.d.ts.map +1 -0
- package/dist/core/lint.js +90 -0
- package/dist/core/lint.js.map +1 -0
- package/dist/core/log-manager.d.ts +2 -0
- package/dist/core/log-manager.d.ts.map +1 -0
- package/dist/core/log-manager.js +14 -0
- package/dist/core/log-manager.js.map +1 -0
- package/dist/core/obsidian.d.ts +89 -0
- package/dist/core/obsidian.d.ts.map +1 -0
- package/dist/core/obsidian.js +123 -0
- package/dist/core/obsidian.js.map +1 -0
- package/dist/core/query.d.ts +16 -0
- package/dist/core/query.d.ts.map +1 -0
- package/dist/core/query.js +77 -0
- package/dist/core/query.js.map +1 -0
- package/dist/core/scrape.d.ts +13 -0
- package/dist/core/scrape.d.ts.map +1 -0
- package/dist/core/scrape.js +103 -0
- package/dist/core/scrape.js.map +1 -0
- package/dist/core/vault.d.ts +35 -0
- package/dist/core/vault.d.ts.map +1 -0
- package/dist/core/vault.js +119 -0
- package/dist/core/vault.js.map +1 -0
- package/dist/core/watcher.d.ts +4 -0
- package/dist/core/watcher.d.ts.map +1 -0
- package/dist/core/watcher.js +34 -0
- package/dist/core/watcher.js.map +1 -0
- package/dist/index.d.ts +3 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +5 -0
- package/dist/index.js.map +1 -0
- package/dist/processors/audio.d.ts +10 -0
- package/dist/processors/audio.d.ts.map +1 -0
- package/dist/processors/audio.js +139 -0
- package/dist/processors/audio.js.map +1 -0
- package/dist/processors/docx.d.ts +12 -0
- package/dist/processors/docx.d.ts.map +1 -0
- package/dist/processors/docx.js +98 -0
- package/dist/processors/docx.js.map +1 -0
- package/dist/processors/image.d.ts +9 -0
- package/dist/processors/image.d.ts.map +1 -0
- package/dist/processors/image.js +94 -0
- package/dist/processors/image.js.map +1 -0
- package/dist/processors/pdf.d.ts +10 -0
- package/dist/processors/pdf.d.ts.map +1 -0
- package/dist/processors/pdf.js +92 -0
- package/dist/processors/pdf.js.map +1 -0
- package/dist/processors/pptx.d.ts +13 -0
- package/dist/processors/pptx.d.ts.map +1 -0
- package/dist/processors/pptx.js +165 -0
- package/dist/processors/pptx.js.map +1 -0
- package/dist/processors/text.d.ts +7 -0
- package/dist/processors/text.d.ts.map +1 -0
- package/dist/processors/text.js +9 -0
- package/dist/processors/text.js.map +1 -0
- package/dist/processors/url.d.ts +7 -0
- package/dist/processors/url.d.ts.map +1 -0
- package/dist/processors/url.js +61 -0
- package/dist/processors/url.js.map +1 -0
- package/dist/processors/video.d.ts +10 -0
- package/dist/processors/video.d.ts.map +1 -0
- package/dist/processors/video.js +115 -0
- package/dist/processors/video.js.map +1 -0
- package/dist/processors/xlsx.d.ts +13 -0
- package/dist/processors/xlsx.d.ts.map +1 -0
- package/dist/processors/xlsx.js +138 -0
- package/dist/processors/xlsx.js.map +1 -0
- package/dist/providers/claude.d.ts +10 -0
- package/dist/providers/claude.d.ts.map +1 -0
- package/dist/providers/claude.js +44 -0
- package/dist/providers/claude.js.map +1 -0
- package/dist/providers/embeddings.d.ts +62 -0
- package/dist/providers/embeddings.d.ts.map +1 -0
- package/dist/providers/embeddings.js +206 -0
- package/dist/providers/embeddings.js.map +1 -0
- package/dist/providers/index.d.ts +7 -0
- package/dist/providers/index.d.ts.map +1 -0
- package/dist/providers/index.js +19 -0
- package/dist/providers/index.js.map +1 -0
- package/dist/providers/ollama.d.ts +10 -0
- package/dist/providers/ollama.d.ts.map +1 -0
- package/dist/providers/ollama.js +48 -0
- package/dist/providers/ollama.js.map +1 -0
- package/dist/providers/openai.d.ts +10 -0
- package/dist/providers/openai.d.ts.map +1 -0
- package/dist/providers/openai.js +38 -0
- package/dist/providers/openai.js.map +1 -0
- package/dist/providers/types.d.ts +33 -0
- package/dist/providers/types.d.ts.map +1 -0
- package/dist/providers/types.js +2 -0
- package/dist/providers/types.js.map +1 -0
- package/dist/search/bm25.d.ts +18 -0
- package/dist/search/bm25.d.ts.map +1 -0
- package/dist/search/bm25.js +52 -0
- package/dist/search/bm25.js.map +1 -0
- package/dist/search/index.d.ts +12 -0
- package/dist/search/index.d.ts.map +1 -0
- package/dist/search/index.js +64 -0
- package/dist/search/index.js.map +1 -0
- package/dist/search/semantic.d.ts +30 -0
- package/dist/search/semantic.d.ts.map +1 -0
- package/dist/search/semantic.js +162 -0
- package/dist/search/semantic.js.map +1 -0
- package/dist/templates/agents-md.d.ts +2 -0
- package/dist/templates/agents-md.d.ts.map +1 -0
- package/dist/templates/agents-md.js +85 -0
- package/dist/templates/agents-md.js.map +1 -0
- package/dist/templates/config-yaml.d.ts +2 -0
- package/dist/templates/config-yaml.d.ts.map +1 -0
- package/dist/templates/config-yaml.js +81 -0
- package/dist/templates/config-yaml.js.map +1 -0
- package/dist/web/server.d.ts +2 -0
- package/dist/web/server.d.ts.map +1 -0
- package/dist/web/server.js +170 -0
- package/dist/web/server.js.map +1 -0
- package/package.json +68 -0
|
@@ -0,0 +1,139 @@
|
|
|
1
|
+
import { spawnSync } from 'node:child_process';
|
|
2
|
+
import { basename, extname, join } from 'node:path';
|
|
3
|
+
import { existsSync, mkdtempSync, readFileSync, rmSync, unlinkSync } from 'node:fs';
|
|
4
|
+
import { tmpdir } from 'node:os';
|
|
5
|
+
/** Lower-case extensions (leading dot included) that the audio processor accepts. */
const SUPPORTED_EXTENSIONS = new Set(['.mp3', '.wav', '.m4a', '.ogg', '.flac', '.aac', '.wma']);
/**
 * Tells whether a path names a supported audio file.
 *
 * Only the extension is inspected (case-insensitively); the file is never opened.
 *
 * @param filePath Path or file name to classify.
 * @returns `true` when the extension is in `SUPPORTED_EXTENSIONS`.
 */
export function isAudioFile(filePath) {
    const suffix = extname(filePath);
    const normalized = suffix.toLowerCase();
    return SUPPORTED_EXTENSIONS.has(normalized);
}
|
|
9
|
+
/**
 * Turns an audio file into a transcript result for the wiki.
 *
 * Backends are tried in order: Deepgram when `DEEPGRAM_API_KEY` is set, then the
 * local `whisper` CLI when it is available, and finally a placeholder document
 * with an empty transcript.
 *
 * @param filePath Path of the audio file.
 * @returns Object with `title`, `transcript`, `markdown`, `sourcePath` and, when a
 *   backend reports it, `duration`.
 * @throws Error when the extension is not in `SUPPORTED_EXTENSIONS`; errors raised
 *   by the selected backend propagate unchanged.
 */
export async function processAudio(filePath) {
    const rawExt = extname(filePath);
    const ext = rawExt.toLowerCase();
    // path.basename() matches the suffix case-sensitively, so strip the extension
    // exactly as written in the path; passing the lower-cased form would leave
    // "Talk.MP3" as the title.
    const title = basename(filePath, rawExt);
    if (!SUPPORTED_EXTENSIONS.has(ext)) {
        throw new Error(`Unsupported audio format: ${ext}. Supported: ${[...SUPPORTED_EXTENSIONS].join(', ')}`);
    }
    // Try Deepgram API first
    const deepgramKey = process.env['DEEPGRAM_API_KEY'];
    if (deepgramKey) {
        return await transcribeWithDeepgram(filePath, title, deepgramKey);
    }
    // Try local Whisper CLI
    if (isWhisperAvailable()) {
        return await transcribeWithWhisper(filePath, title);
    }
    // Fallback: reference without transcription
    return {
        title,
        transcript: '',
        markdown: buildMarkdown(title, filePath, '[Audio file — install Whisper or set DEEPGRAM_API_KEY for transcription]'),
        sourcePath: filePath,
    };
}
|
|
32
|
+
/**
 * Transcribes an audio file with the local `whisper` CLI.
 *
 * Whisper is asked for a `.txt` transcript, which is read back from
 * `<output dir>/<input base name>.txt`. Each call gets its own temporary output
 * directory, so concurrent runs or inputs that share a base name cannot read or
 * delete each other's transcript. The directory is removed afterwards, also when
 * Whisper fails.
 *
 * @param filePath Path of the audio file handed to Whisper.
 * @param title Title used in the generated markdown.
 * @returns Object with `title`, `transcript` ('' when no .txt file was produced),
 *   `markdown`, `duration` (from `getDuration`, possibly undefined) and `sourcePath`.
 * @throws Error when Whisper cannot be started, times out or exits non-zero.
 */
async function transcribeWithWhisper(filePath, title) {
    const outputDir = mkdtempSync(join(tmpdir(), 'llmwiki-whisper-'));
    let transcript = '';
    try {
        const result = spawnSync('whisper', [
            filePath,
            '--model', 'base',
            '--output_format', 'txt',
            '--output_dir', outputDir,
            '--fp16', 'False',
        ], { encoding: 'utf-8', timeout: 300000 });
        if (result.status !== 0) {
            // On spawn failure or timeout there is no exit status and stderr may be
            // empty, so prefer the spawn error when one is reported.
            const reason = result.error ? result.error.message : result.stderr;
            throw new Error(`Whisper failed: ${reason}`);
        }
        // Find the output .txt file
        const txtPath = join(outputDir, `${basename(filePath, extname(filePath))}.txt`);
        if (existsSync(txtPath)) {
            transcript = readFileSync(txtPath, 'utf-8').trim();
        }
    }
    finally {
        // Clean up the private output directory and everything Whisper wrote into it.
        rmSync(outputDir, { recursive: true, force: true });
    }
    // Get duration via ffprobe if available
    const duration = getDuration(filePath);
    return {
        title,
        transcript,
        markdown: buildMarkdown(title, filePath, transcript, duration),
        duration,
        sourcePath: filePath,
    };
}
|
|
60
|
+
/**
 * Transcribes an audio file through the Deepgram pre-recorded audio endpoint.
 *
 * The whole file is read into memory and POSTed as the request body.
 *
 * @param filePath Path of the audio file to upload.
 * @param title Title used in the generated markdown.
 * @param apiKey Deepgram API key, sent as `Authorization: Token <key>`.
 * @returns Object with `title`, `transcript` ('' when the response carries none),
 *   `markdown`, `duration` (undefined when the response reports none) and `sourcePath`.
 * @throws Error when the HTTP response status is not OK.
 */
async function transcribeWithDeepgram(filePath, title, apiKey) {
    const audioData = readFileSync(filePath);
    // The query string must read "&paragraphs=true"; "&para" followed by "graphs"
    // is easily corrupted into a pilcrow by HTML entity decoding.
    const response = await fetch('https://api.deepgram.com/v1/listen?model=nova-2&smart_format=true&paragraphs=true', {
        method: 'POST',
        headers: {
            'Authorization': `Token ${apiKey}`,
            'Content-Type': getContentType(extname(filePath)),
        },
        body: audioData,
    });
    if (!response.ok) {
        throw new Error(`Deepgram API error: ${response.status} ${response.statusText}`);
    }
    const data = (await response.json());
    // Guard every level: a payload without results/metadata yields an empty
    // transcript instead of a TypeError.
    const transcript = data?.results?.channels?.[0]?.alternatives?.[0]?.transcript ?? '';
    const durationSec = data?.metadata?.duration;
    const duration = durationSec ? formatDuration(durationSec) : undefined;
    return {
        title,
        transcript,
        markdown: buildMarkdown(title, filePath, transcript, duration),
        duration,
        sourcePath: filePath,
    };
}
|
|
85
|
+
/**
 * Renders the markdown page for an audio source.
 *
 * @param title Page heading.
 * @param filePath Path of the audio file; used as link target and, via its base name, as link text.
 * @param transcript Transcript text; a placeholder line is used when it is empty.
 * @param duration Optional human-readable duration; the Duration line is omitted when falsy.
 * @returns The markdown document, ending with a newline.
 */
function buildMarkdown(title, filePath, transcript, duration) {
    const sourceLink = `[${basename(filePath)}](${filePath})`;
    const durationLine = duration ? `\n> **Duration:** ${duration}` : '';
    // Date part of the ISO timestamp (UTC), e.g. 2024-01-31.
    const processedOn = new Date().toISOString().split('T')[0];
    const body = transcript || '_No transcript available._';
    return [
        `# ${title}`,
        '',
        `> **Source:** ${sourceLink}`,
        `> **Type:** Audio${durationLine}`,
        `> **Processed:** ${processedOn}`,
        '',
        '## Transcript',
        '',
        body,
        '',
    ].join('\n');
}
|
|
97
|
+
/**
 * Probes for a usable `whisper` executable by running `whisper --help`
 * (5 second timeout).
 *
 * @returns `true` only when the probe exits with status 0; `false` otherwise,
 *   including when the probe throws.
 */
function isWhisperAvailable() {
    let probe;
    try {
        probe = spawnSync('whisper', ['--help'], { encoding: 'utf-8', timeout: 5000 });
    }
    catch {
        return false;
    }
    return probe.status === 0;
}
|
|
106
|
+
/**
 * Asks `ffprobe` for the duration of a media file (10 second timeout).
 *
 * @param filePath Path of the media file to inspect.
 * @returns The duration formatted by `formatDuration`, or `undefined` when
 *   ffprobe's output is not a number or anything in the probe throws.
 */
function getDuration(filePath) {
    const probeArgs = [
        '-v', 'error', '-show_entries', 'format=duration',
        '-of', 'default=noprint_wrappers=1:nokey=1', filePath,
    ];
    try {
        const { stdout } = spawnSync('ffprobe', probeArgs, { encoding: 'utf-8', timeout: 10000 });
        // A non-string stdout (presumably when ffprobe cannot be launched) makes
        // .trim() throw, which the catch below turns into `undefined`.
        const seconds = parseFloat(stdout.trim());
        if (isNaN(seconds)) {
            return undefined;
        }
        return formatDuration(seconds);
    }
    catch {
        return undefined;
    }
}
|
|
119
|
+
/**
 * Formats a number of seconds as `"Hh Mm Ss"`, `"Mm Ss"` or `"Ss"`.
 *
 * Leading units are dropped while they are zero; fractional seconds are floored.
 *
 * @param seconds Duration in seconds.
 * @returns The formatted duration, e.g. `"1h 2m 5s"`, `"1m 1s"`, `"59s"`.
 */
function formatDuration(seconds) {
    const hours = Math.floor(seconds / 3600);
    const minutes = Math.floor((seconds % 3600) / 60);
    const secs = Math.floor(seconds % 60);
    const tail = `${secs}s`;
    if (hours > 0) {
        return `${hours}h ${minutes}m ${tail}`;
    }
    return minutes > 0 ? `${minutes}m ${tail}` : tail;
}
|
|
129
|
+
/**
 * Maps an audio file extension to the MIME type sent as `Content-Type`.
 *
 * Every extension in the supported set has an explicit entry, so `.aac` and
 * `.wma` uploads are no longer labelled as MP3 data.
 *
 * @param ext File extension including the leading dot; matched case-insensitively.
 * @returns The MIME type, or `'audio/mpeg'` for an unrecognised extension.
 */
function getContentType(ext) {
    switch (ext.toLowerCase()) {
        case '.mp3': return 'audio/mpeg';
        case '.wav': return 'audio/wav';
        case '.m4a': return 'audio/mp4';
        case '.ogg': return 'audio/ogg';
        case '.flac': return 'audio/flac';
        case '.aac': return 'audio/aac';
        case '.wma': return 'audio/x-ms-wma';
        default: return 'audio/mpeg';
    }
}
|
|
139
|
+
//# sourceMappingURL=audio.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"audio.js","sourceRoot":"","sources":["../../src/processors/audio.ts"],"names":[],"mappings":"AAAA,OAAO,EAAY,SAAS,EAAE,MAAM,oBAAoB,CAAC;AACzD,OAAO,EAAE,QAAQ,EAAE,OAAO,EAAE,IAAI,EAAE,MAAM,WAAW,CAAC;AACpD,OAAO,EAAE,UAAU,EAAE,YAAY,EAAE,UAAU,EAAE,MAAM,SAAS,CAAC;AAC/D,OAAO,EAAE,MAAM,EAAE,MAAM,SAAS,CAAC;AAUjC,MAAM,oBAAoB,GAAG,IAAI,GAAG,CAAC,CAAC,MAAM,EAAE,MAAM,EAAE,MAAM,EAAE,MAAM,EAAE,OAAO,EAAE,MAAM,EAAE,MAAM,CAAC,CAAC,CAAC;AAEhG,MAAM,UAAU,WAAW,CAAC,QAAgB;IAC1C,OAAO,oBAAoB,CAAC,GAAG,CAAC,OAAO,CAAC,QAAQ,CAAC,CAAC,WAAW,EAAE,CAAC,CAAC;AACnE,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,YAAY,CAAC,QAAgB;IACjD,MAAM,GAAG,GAAG,OAAO,CAAC,QAAQ,CAAC,CAAC,WAAW,EAAE,CAAC;IAC5C,MAAM,KAAK,GAAG,QAAQ,CAAC,QAAQ,EAAE,GAAG,CAAC,CAAC;IAEtC,IAAI,CAAC,oBAAoB,CAAC,GAAG,CAAC,GAAG,CAAC,EAAE,CAAC;QACnC,MAAM,IAAI,KAAK,CAAC,6BAA6B,GAAG,gBAAgB,CAAC,GAAG,oBAAoB,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;IAC1G,CAAC;IAED,yBAAyB;IACzB,MAAM,WAAW,GAAG,OAAO,CAAC,GAAG,CAAC,kBAAkB,CAAC,CAAC;IACpD,IAAI,WAAW,EAAE,CAAC;QAChB,OAAO,MAAM,sBAAsB,CAAC,QAAQ,EAAE,KAAK,EAAE,WAAW,CAAC,CAAC;IACpE,CAAC;IAED,wBAAwB;IACxB,IAAI,kBAAkB,EAAE,EAAE,CAAC;QACzB,OAAO,MAAM,qBAAqB,CAAC,QAAQ,EAAE,KAAK,CAAC,CAAC;IACtD,CAAC;IAED,4CAA4C;IAC5C,OAAO;QACL,KAAK;QACL,UAAU,EAAE,EAAE;QACd,QAAQ,EAAE,aAAa,CAAC,KAAK,EAAE,QAAQ,EAAE,0EAA0E,CAAC;QACpH,UAAU,EAAE,QAAQ;KACrB,CAAC;AACJ,CAAC;AAED,KAAK,UAAU,qBAAqB,CAAC,QAAgB,EAAE,KAAa;IAClE,MAAM,SAAS,GAAG,IAAI,CAAC,MAAM,EAAE,EAAE,mBAAmB,IAAI,CAAC,GAAG,EAAE,EAAE,CAAC,CAAC;IAElE,MAAM,MAAM,GAAG,SAAS,CAAC,SAAS,EAAE;QAClC,QAAQ;QACR,SAAS,EAAE,MAAM;QACjB,iBAAiB,EAAE,KAAK;QACxB,cAAc,EAAE,MAAM,EAAE;QACxB,QAAQ,EAAE,OAAO;KAClB,EAAE,EAAE,QAAQ,EAAE,OAAO,EAAE,OAAO,EAAE,MAAM,EAAE,CAAC,CAAC;IAE3C,IAAI,MAAM,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACxB,MAAM,IAAI,KAAK,CAAC,mBAAmB,MAAM,CAAC,MAAM,EAAE,CAAC,CAAC;IACtD,CAAC;IAED,4BAA4B;IAC5B,MAAM,OAAO,GAAG,IAAI,CAAC,MAAM,EAAE,EAAE,GAAG,QAAQ,CAAC,QAAQ,EAAE,OAAO,CAAC,QAAQ,CAAC,CAAC,MAAM,CAAC,CAAC;IAC/E,MAAM,UAAU,GAAG,UAAU,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,YAAY,CAAC,OAAO,EAAE,OAAO,CAAC,CAAC
,IAAI,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;IAEpF,WAAW;IACX,IAAI,UAAU,CAAC,OAAO,CAAC;QAAE,UAAU,CAAC,OAAO,CAAC,CAAC;IAE7C,wCAAwC;IACxC,MAAM,QAAQ,GAAG,WAAW,CAAC,QAAQ,CAAC,CAAC;IAEvC,OAAO;QACL,KAAK;QACL,UAAU;QACV,QAAQ,EAAE,aAAa,CAAC,KAAK,EAAE,QAAQ,EAAE,UAAU,EAAE,QAAQ,CAAC;QAC9D,QAAQ;QACR,UAAU,EAAE,QAAQ;KACrB,CAAC;AACJ,CAAC;AAED,KAAK,UAAU,sBAAsB,CAAC,QAAgB,EAAE,KAAa,EAAE,MAAc;IACnF,MAAM,SAAS,GAAG,YAAY,CAAC,QAAQ,CAAC,CAAC;IAEzC,MAAM,QAAQ,GAAG,MAAM,KAAK,CAAC,mFAAmF,EAAE;QAChH,MAAM,EAAE,MAAM;QACd,OAAO,EAAE;YACP,eAAe,EAAE,SAAS,MAAM,EAAE;YAClC,cAAc,EAAE,cAAc,CAAC,OAAO,CAAC,QAAQ,CAAC,CAAC;SAClD;QACD,IAAI,EAAE,SAAS;KAChB,CAAC,CAAC;IAEH,IAAI,CAAC,QAAQ,CAAC,EAAE,EAAE,CAAC;QACjB,MAAM,IAAI,KAAK,CAAC,uBAAuB,QAAQ,CAAC,MAAM,IAAI,QAAQ,CAAC,UAAU,EAAE,CAAC,CAAC;IACnF,CAAC;IAED,MAAM,IAAI,GAAG,CAAC,MAAM,QAAQ,CAAC,IAAI,EAAE,CAOlC,CAAC;IAEF,MAAM,UAAU,GAAG,IAAI,CAAC,OAAO,CAAC,QAAQ,CAAC,CAAC,CAAC,EAAE,YAAY,CAAC,CAAC,CAAC,EAAE,UAAU,IAAI,EAAE,CAAC;IAC/E,MAAM,WAAW,GAAG,IAAI,CAAC,QAAQ,CAAC,QAAQ,CAAC;IAC3C,MAAM,QAAQ,GAAG,WAAW,CAAC,CAAC,CAAC,cAAc,CAAC,WAAW,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC;IAEvE,OAAO;QACL,KAAK;QACL,UAAU;QACV,QAAQ,EAAE,aAAa,CAAC,KAAK,EAAE,QAAQ,EAAE,UAAU,EAAE,QAAQ,CAAC;QAC9D,QAAQ;QACR,UAAU,EAAE,QAAQ;KACrB,CAAC;AACJ,CAAC;AAED,SAAS,aAAa,CAAC,KAAa,EAAE,QAAgB,EAAE,UAAkB,EAAE,QAAiB;IAC3F,OAAO,KAAK,KAAK;;iBAEF,QAAQ,CAAC,QAAQ,CAAC,KAAK,QAAQ;mBAC7B,QAAQ,CAAC,CAAC,CAAC,qBAAqB,QAAQ,EAAE,CAAC,CAAC,CAAC,EAAE;mBAC/C,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC;;;;EAIvD,UAAU,IAAI,4BAA4B;CAC3C,CAAC;AACF,CAAC;AAED,SAAS,kBAAkB;IACzB,IAAI,CAAC;QACH,MAAM,MAAM,GAAG,SAAS,CAAC,SAAS,EAAE,CAAC,QAAQ,CAAC,EAAE,EAAE,QAAQ,EAAE,OAAO,EAAE,OAAO,EAAE,IAAI,EAAE,CAAC,CAAC;QACtF,OAAO,MAAM,CAAC,MAAM,KAAK,CAAC,CAAC;IAC7B,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,KAAK,CAAC;IACf,CAAC;AACH,CAAC;AAED,SAAS,WAAW,CAAC,QAAgB;IACnC,IAAI,CAAC;QACH,MAAM,MAAM,GAAG,SAAS,CAAC,SAAS,EAAE;YAClC,IAAI,EAAE,OAAO,EAAE,eAAe,EAAE,iBAAiB;YACjD,KAAK,EAAE,oCAAoC,EAAE,QAAQ;SACtD,EAAE,EAAE,QAAQ,EAAE,OAAO,EAAE,OAAO,EAAE,KAAK,EAAE,CAAC,CAAC;QAC1C,M
AAM,OAAO,GAAG,UAAU,CAAC,MAAM,CAAC,MAAM,CAAC,IAAI,EAAE,CAAC,CAAC;QACjD,OAAO,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC,CAAC,cAAc,CAAC,OAAO,CAAC,CAAC;IAC9D,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,SAAS,CAAC;IACnB,CAAC;AACH,CAAC;AAED,SAAS,cAAc,CAAC,OAAe;IACrC,MAAM,CAAC,GAAG,IAAI,CAAC,KAAK,CAAC,OAAO,GAAG,IAAI,CAAC,CAAC;IACrC,MAAM,CAAC,GAAG,IAAI,CAAC,KAAK,CAAC,CAAC,OAAO,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC,CAAC;IAC5C,MAAM,CAAC,GAAG,IAAI,CAAC,KAAK,CAAC,OAAO,GAAG,EAAE,CAAC,CAAC;IACnC,IAAI,CAAC,GAAG,CAAC;QAAE,OAAO,GAAG,CAAC,KAAK,CAAC,KAAK,CAAC,GAAG,CAAC;IACtC,IAAI,CAAC,GAAG,CAAC;QAAE,OAAO,GAAG,CAAC,KAAK,CAAC,GAAG,CAAC;IAChC,OAAO,GAAG,CAAC,GAAG,CAAC;AACjB,CAAC;AAED,SAAS,cAAc,CAAC,GAAW;IACjC,QAAQ,GAAG,CAAC,WAAW,EAAE,EAAE,CAAC;QAC1B,KAAK,MAAM,CAAC,CAAC,OAAO,YAAY,CAAC;QACjC,KAAK,MAAM,CAAC,CAAC,OAAO,WAAW,CAAC;QAChC,KAAK,MAAM,CAAC,CAAC,OAAO,WAAW,CAAC;QAChC,KAAK,MAAM,CAAC,CAAC,OAAO,WAAW,CAAC;QAChC,KAAK,OAAO,CAAC,CAAC,OAAO,YAAY,CAAC;QAClC,OAAO,CAAC,CAAC,OAAO,YAAY,CAAC;IAC/B,CAAC;AACH,CAAC"}
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Word document (.docx) processor.
|
|
3
|
+
* Uses mammoth for HTML extraction, with a built-in XML fallback.
|
|
4
|
+
*/
|
|
5
|
+
/**
 * Value resolved by `processDocx`.
 *
 * - `title`: file name with the `.docx` suffix removed.
 * - `content`: extracted text, or a bracketed placeholder when nothing was extracted.
 * - `markdown`: full markdown page (heading, source link, type, processing date, content).
 * - `sourcePath`: the `filePath` that was passed in.
 */
export interface DocxResult {
|
|
6
|
+
title: string;
|
|
7
|
+
content: string;
|
|
8
|
+
markdown: string;
|
|
9
|
+
sourcePath: string;
|
|
10
|
+
}
|
|
11
|
+
/**
 * Converts a Word document into a `DocxResult`.
 *
 * Extraction is attempted with mammoth first; if that throws, a raw scan of the
 * file for `<w:t>` text runs is used instead.
 *
 * @param filePath Path of the .docx file.
 * @returns The extracted content together with its markdown rendering.
 */
export declare function processDocx(filePath: string): Promise<DocxResult>;
|
|
12
|
+
//# sourceMappingURL=docx.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"docx.d.ts","sourceRoot":"","sources":["../../src/processors/docx.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAKH,MAAM,WAAW,UAAU;IACzB,KAAK,EAAE,MAAM,CAAC;IACd,OAAO,EAAE,MAAM,CAAC;IAChB,QAAQ,EAAE,MAAM,CAAC;IACjB,UAAU,EAAE,MAAM,CAAC;CACpB;AAED,wBAAsB,WAAW,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC,UAAU,CAAC,CAqBvE"}
|
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Word document (.docx) processor.
|
|
3
|
+
* Uses mammoth for HTML extraction, with a built-in XML fallback.
|
|
4
|
+
*/
|
|
5
|
+
import { readFileSync } from 'node:fs';
|
|
6
|
+
import { basename } from 'node:path';
|
|
7
|
+
/**
 * Converts a Word document into text plus a markdown page.
 *
 * mammoth is tried first; any failure there (including mammoth not being
 * installed) falls back to the raw XML scan. When neither path yields text, a
 * bracketed placeholder is used as the content.
 *
 * @param filePath Path of the .docx file.
 * @returns Object with `title`, `content`, `markdown` and `sourcePath`.
 */
export async function processDocx(filePath) {
    // path.basename() compares the suffix case-sensitively, so strip the extension
    // as written in the path; otherwise "Report.DOCX" keeps its extension.
    const ext = extname(filePath);
    const title = ext.toLowerCase() === '.docx' ? basename(filePath, ext) : basename(filePath);
    // Try mammoth first (best quality), fall back to raw XML extraction
    let content;
    try {
        content = await extractWithMammoth(filePath);
    }
    catch {
        content = extractFromRawXml(filePath);
    }
    if (!content.trim()) {
        content = `[Word document — no text content extracted from ${basename(filePath)}]`;
    }
    return {
        title,
        content,
        markdown: buildMarkdown(title, filePath, content),
        sourcePath: filePath,
    };
}
|
|
27
|
+
/**
 * Extracts a document's text with mammoth and reduces the HTML to simple markdown.
 *
 * Headings (h1-h3), paragraphs, bold, italics and list items are mapped to
 * markdown; every other tag is dropped. HTML character references are then
 * decoded so text such as "R&D" is not emitted as "R&amp;D". Conversion warnings
 * reported by mammoth are appended as a trailing block quote.
 *
 * @param filePath Path of the .docx file.
 * @returns The markdown text.
 * @throws When mammoth cannot be imported or the conversion fails; the caller
 *   treats this as the signal to use the raw XML fallback.
 */
async function extractWithMammoth(filePath) {
    // Dynamic import — mammoth is an optional dependency
    const mammoth = await import('mammoth');
    const buffer = readFileSync(filePath);
    const result = await mammoth.convertToHtml({ buffer });
    // Convert HTML to simplified markdown (strip tags, keep structure)
    let md = result.value
        .replace(/<h1>(.*?)<\/h1>/gi, '# $1\n')
        .replace(/<h2>(.*?)<\/h2>/gi, '## $1\n')
        .replace(/<h3>(.*?)<\/h3>/gi, '### $1\n')
        .replace(/<p>(.*?)<\/p>/gi, '$1\n\n')
        .replace(/<strong>(.*?)<\/strong>/gi, '**$1**')
        .replace(/<em>(.*?)<\/em>/gi, '*$1*')
        .replace(/<li>(.*?)<\/li>/gi, '- $1\n')
        .replace(/<[^>]+>/g, '')
        // Decode character references only after the tags are gone; "&amp;" goes
        // last so an escaped "&amp;lt;" is not decoded twice.
        .replace(/&lt;/g, '<')
        .replace(/&gt;/g, '>')
        .replace(/&quot;/g, '"')
        .replace(/&#39;/g, "'")
        .replace(/&amp;/g, '&')
        .replace(/\n{3,}/g, '\n\n')
        .trim();
    // Add any warnings as comments
    if (result.messages.length > 0) {
        const warnings = result.messages
            .filter((m) => m.type === 'warning')
            .map((m) => m.message)
            .join(', ');
        if (warnings) {
            md += `\n\n> **Conversion notes:** ${warnings}`;
        }
    }
    return md;
}
|
|
56
|
+
/**
 * Lightweight fallback used when mammoth is unavailable: scans the file's raw
 * bytes for Word text runs (`<w:t>` elements) without unpacking the archive.
 *
 * If no text run is found, all tags and non-printable characters are stripped and
 * the remainder is returned only when it looks like prose (more than 30% ASCII
 * letters), capped at 20000 characters.
 *
 * @param filePath Path of the .docx file.
 * @returns The extracted text with whitespace collapsed, or `''` when nothing usable is found.
 */
function extractFromRawXml(filePath) {
    const buffer = readFileSync(filePath);
    // latin1 maps each byte to exactly one code unit, so the regexes can scan
    // binary data without decoding errors.
    const content = buffer.toString('latin1');
    const textParts = [];
    // Look for w:t elements in the XML (Word text runs). "w:t" must be followed by
    // whitespace or ">" so that <w:tab/>, <w:tbl>, <w:tc> and <w:tr> are not
    // mistaken for text runs (which would drag raw markup into the output).
    const textRegex = /<w:t(?:\s[^>]*)?>([\s\S]*?)<\/w:t>/g;
    for (const match of content.matchAll(textRegex)) {
        if (match[1]) {
            textParts.push(match[1]);
        }
    }
    if (textParts.length > 0) {
        // Re-read the captured bytes as UTF-8 before collapsing whitespace;
        // otherwise multi-byte characters stay split into latin1 code units.
        const decoded = Buffer.from(textParts.join(' '), 'latin1').toString('utf8');
        return decoded.replace(/\s+/g, ' ').trim();
    }
    // Broader fallback: extract any readable text between XML tags
    const anyText = content
        .replace(/<[^>]+>/g, ' ')
        .replace(/[^\x20-\x7E\r\n]/g, '')
        .replace(/\s+/g, ' ')
        .trim();
    // Only return if it looks like actual text (>30% alphabetic)
    const alphaCount = (anyText.match(/[a-zA-Z]/g) ?? []).length;
    if (anyText.length > 0 && alphaCount / anyText.length > 0.3) {
        return anyText.substring(0, 20000);
    }
    return '';
}
|
|
86
|
+
/**
 * Renders the markdown page for a Word document.
 *
 * @param title Page heading.
 * @param filePath Path of the document; used as link target and, via its base name, as link text.
 * @param content Extracted text placed under the "Content" heading.
 * @returns The markdown document, ending with a newline.
 */
function buildMarkdown(title, filePath, content) {
    const sourceLink = `[${basename(filePath)}](${filePath})`;
    // Date part of the ISO timestamp (UTC).
    const processedOn = new Date().toISOString().split('T')[0];
    return [
        `# ${title}`,
        '',
        `> **Source:** ${sourceLink}`,
        '> **Type:** Word Document (.docx)',
        `> **Processed:** ${processedOn}`,
        '',
        '## Content',
        '',
        `${content}`,
        '',
    ].join('\n');
}
|
|
98
|
+
//# sourceMappingURL=docx.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"docx.js","sourceRoot":"","sources":["../../src/processors/docx.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAEH,OAAO,EAAE,YAAY,EAAE,MAAM,SAAS,CAAC;AACvC,OAAO,EAAE,QAAQ,EAAE,MAAM,WAAW,CAAC;AASrC,MAAM,CAAC,KAAK,UAAU,WAAW,CAAC,QAAgB;IAChD,MAAM,KAAK,GAAG,QAAQ,CAAC,QAAQ,EAAE,OAAO,CAAC,CAAC;IAE1C,oEAAoE;IACpE,IAAI,OAAe,CAAC;IACpB,IAAI,CAAC;QACH,OAAO,GAAG,MAAM,kBAAkB,CAAC,QAAQ,CAAC,CAAC;IAC/C,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,GAAG,iBAAiB,CAAC,QAAQ,CAAC,CAAC;IACxC,CAAC;IAED,IAAI,CAAC,OAAO,CAAC,IAAI,EAAE,EAAE,CAAC;QACpB,OAAO,GAAG,mDAAmD,QAAQ,CAAC,QAAQ,CAAC,GAAG,CAAC;IACrF,CAAC;IAED,OAAO;QACL,KAAK;QACL,OAAO;QACP,QAAQ,EAAE,aAAa,CAAC,KAAK,EAAE,QAAQ,EAAE,OAAO,CAAC;QACjD,UAAU,EAAE,QAAQ;KACrB,CAAC;AACJ,CAAC;AAED,KAAK,UAAU,kBAAkB,CAAC,QAAgB;IAChD,qDAAqD;IACrD,MAAM,OAAO,GAAG,MAAM,MAAM,CAAC,SAAS,CAAC,CAAC;IACxC,MAAM,MAAM,GAAG,YAAY,CAAC,QAAQ,CAAC,CAAC;IACtC,MAAM,MAAM,GAAG,MAAM,OAAO,CAAC,aAAa,CAAC,EAAE,MAAM,EAAE,CAAC,CAAC;IAEvD,mEAAmE;IACnE,IAAI,EAAE,GAAG,MAAM,CAAC,KAAK;SAClB,OAAO,CAAC,mBAAmB,EAAE,QAAQ,CAAC;SACtC,OAAO,CAAC,mBAAmB,EAAE,SAAS,CAAC;SACvC,OAAO,CAAC,mBAAmB,EAAE,UAAU,CAAC;SACxC,OAAO,CAAC,iBAAiB,EAAE,QAAQ,CAAC;SACpC,OAAO,CAAC,2BAA2B,EAAE,QAAQ,CAAC;SAC9C,OAAO,CAAC,mBAAmB,EAAE,MAAM,CAAC;SACpC,OAAO,CAAC,mBAAmB,EAAE,QAAQ,CAAC;SACtC,OAAO,CAAC,UAAU,EAAE,EAAE,CAAC;SACvB,OAAO,CAAC,SAAS,EAAE,MAAM,CAAC;SAC1B,IAAI,EAAE,CAAC;IAEV,+BAA+B;IAC/B,IAAI,MAAM,CAAC,QAAQ,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QAC/B,MAAM,QAAQ,GAAG,MAAM,CAAC,QAAQ;aAC7B,MAAM,CAAC,CAAC,CAAmB,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,KAAK,SAAS,CAAC;aACrD,GAAG,CAAC,CAAC,CAAsB,EAAE,EAAE,CAAC,CAAC,CAAC,OAAO,CAAC;aAC1C,IAAI,CAAC,IAAI,CAAC,CAAC;QACd,IAAI,QAAQ,EAAE,CAAC;YACb,EAAE,IAAI,+BAA+B,QAAQ,EAAE,CAAC;QAClD,CAAC;IACH,CAAC;IAED,OAAO,EAAE,CAAC;AACZ,CAAC;AAED,SAAS,iBAAiB,CAAC,QAAgB;IACzC,0EAA0E;IAC1E,+DAA+D;IAC/D,MAAM,MAAM,GAAG,YAAY,CAAC,QAAQ,CAAC,CAAC;IACtC,MAAM,OAAO,GAAG,MAAM,CAAC,QAAQ,CAAC,QAAQ,CAAC,CAAC;IAE1C,MAAM,SAAS,GAAa,EAAE,CAAC;IAE/B,oDAAoD;IACpD,MAAM,SAAS,GAAG,8BAA8B,CAAC;IACjD,IAAI,KAA6B,CAAC;IAElC,OAAO,CAAC,KAAK,GAAG,
SAAS,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC,KAAK,IAAI,EAAE,CAAC;QAClD,IAAI,KAAK,CAAC,CAAC,CAAC,EAAE,CAAC;YACb,SAAS,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC;QAC3B,CAAC;IACH,CAAC;IAED,IAAI,SAAS,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QACzB,OAAO,SAAS,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC,IAAI,EAAE,CAAC;IACzD,CAAC;IAED,+DAA+D;IAC/D,MAAM,OAAO,GAAG,OAAO;SACpB,OAAO,CAAC,UAAU,EAAE,GAAG,CAAC;SACxB,OAAO,CAAC,mBAAmB,EAAE,EAAE,CAAC;SAChC,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC;SACpB,IAAI,EAAE,CAAC;IAEV,6DAA6D;IAC7D,MAAM,UAAU,GAAG,CAAC,OAAO,CAAC,KAAK,CAAC,WAAW,CAAC,IAAI,EAAE,CAAC,CAAC,MAAM,CAAC;IAC7D,IAAI,OAAO,CAAC,MAAM,GAAG,CAAC,IAAI,UAAU,GAAG,OAAO,CAAC,MAAM,GAAG,GAAG,EAAE,CAAC;QAC5D,OAAO,OAAO,CAAC,SAAS,CAAC,CAAC,EAAE,KAAK,CAAC,CAAC;IACrC,CAAC;IAED,OAAO,EAAE,CAAC;AACZ,CAAC;AAED,SAAS,aAAa,CAAC,KAAa,EAAE,QAAgB,EAAE,OAAe;IACrE,OAAO,KAAK,KAAK;;iBAEF,QAAQ,CAAC,QAAQ,CAAC,KAAK,QAAQ;;mBAE7B,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC;;;;EAIvD,OAAO;CACR,CAAC;AACF,CAAC"}
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
/**
 * Value resolved by `processImage`.
 *
 * - `title`: file name with its extension removed.
 * - `description`: the model-written description, or `Image file: <name>` when no API key is configured.
 * - `markdown`: full markdown page (heading, source link, type, processing date, description).
 * - `sourcePath`: the `filePath` that was passed in.
 */
export interface ImageResult {
|
|
2
|
+
title: string;
|
|
3
|
+
description: string;
|
|
4
|
+
markdown: string;
|
|
5
|
+
sourcePath: string;
|
|
6
|
+
}
|
|
7
|
+
/**
 * Reports whether a path has a supported image extension
 * (.jpg, .jpeg, .png, .gif, .webp, .bmp), compared case-insensitively.
 */
export declare function isImageFile(filePath: string): boolean;
|
|
8
|
+
/**
 * Produces an `ImageResult` for an image file.
 *
 * With `ANTHROPIC_API_KEY` set, the image is sent to the Anthropic Messages API
 * and the returned text becomes the description; without it a placeholder
 * result is returned.
 *
 * @param filePath Path of the image file.
 * @throws Error when the extension is not a supported image format.
 */
export declare function processImage(filePath: string): Promise<ImageResult>;
|
|
9
|
+
//# sourceMappingURL=image.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"image.d.ts","sourceRoot":"","sources":["../../src/processors/image.ts"],"names":[],"mappings":"AAIA,MAAM,WAAW,WAAW;IAC1B,KAAK,EAAE,MAAM,CAAC;IACd,WAAW,EAAE,MAAM,CAAC;IACpB,QAAQ,EAAE,MAAM,CAAC;IACjB,UAAU,EAAE,MAAM,CAAC;CACpB;AAID,wBAAgB,WAAW,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAErD;AAED,wBAAsB,YAAY,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC,WAAW,CAAC,CA+DzE"}
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
import { readFileSync } from 'node:fs';
|
|
2
|
+
import { basename, extname } from 'node:path';
|
|
3
|
+
import Anthropic from '@anthropic-ai/sdk';
|
|
4
|
+
/** Lower-case extensions (leading dot included) that the image processor accepts. */
const SUPPORTED_EXTENSIONS = new Set(['.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp']);
/**
 * Tells whether a path names a supported image file.
 *
 * Only the extension is inspected (case-insensitively); the file is never opened.
 *
 * @param filePath Path or file name to classify.
 * @returns `true` when the extension is in `SUPPORTED_EXTENSIONS`.
 */
export function isImageFile(filePath) {
    const suffix = extname(filePath);
    const normalized = suffix.toLowerCase();
    return SUPPORTED_EXTENSIONS.has(normalized);
}
|
|
8
|
+
/**
 * Produces a wiki-ready description of an image file.
 *
 * The file is read and base64-encoded. When `ANTHROPIC_API_KEY` is set, the
 * image is sent to the Anthropic Messages API and the text blocks of the reply
 * are joined into the description. Without a key, a placeholder result is returned.
 *
 * @param filePath Path of the image file.
 * @returns Object with `title`, `description`, `markdown` and `sourcePath`.
 * @throws Error when the extension is not in `SUPPORTED_EXTENSIONS`; file-read and
 *   API errors propagate unchanged.
 */
export async function processImage(filePath) {
    const rawExt = extname(filePath);
    const ext = rawExt.toLowerCase();
    // path.basename() matches the suffix case-sensitively, so strip the extension
    // exactly as written in the path; passing the lower-cased form would leave
    // "Photo.JPG" as the title.
    const title = basename(filePath, rawExt);
    if (!SUPPORTED_EXTENSIONS.has(ext)) {
        throw new Error(`Unsupported image format: ${ext}. Supported: ${[...SUPPORTED_EXTENSIONS].join(', ')}`);
    }
    const imageData = readFileSync(filePath);
    const base64 = imageData.toString('base64');
    // NOTE(review): getMediaType has no entry for '.bmp', so BMP data is labelled
    // 'image/jpeg' — confirm how the API treats that.
    const mediaType = getMediaType(ext);
    // Use Claude vision to describe the image
    const apiKey = process.env['ANTHROPIC_API_KEY'];
    if (!apiKey) {
        // Fallback: create a basic markdown reference without AI description
        return {
            title,
            description: `Image file: ${basename(filePath)}`,
            markdown: buildMarkdown(title, filePath, `[Image file — set ANTHROPIC_API_KEY for AI description]`),
            sourcePath: filePath,
        };
    }
    const client = new Anthropic({ apiKey });
    const response = await client.messages.create({
        model: 'claude-sonnet-4-20250514',
        max_tokens: 2048,
        messages: [
            {
                role: 'user',
                content: [
                    {
                        type: 'image',
                        source: { type: 'base64', media_type: mediaType, data: base64 },
                    },
                    {
                        type: 'text',
                        text: `Describe this image in detail for a knowledge base. Include:
1. What the image shows (objects, people, text, diagrams, charts)
2. Key information or data visible
3. Any text content (OCR)
4. Context and significance

Be thorough but concise. This description will represent the image in a markdown wiki where agents need to understand its content without seeing it directly.`,
                    },
                ],
            },
        ],
    });
    // Keep only the text blocks of the reply and concatenate them.
    const description = response.content
        .filter((block) => block.type === 'text')
        .map((block) => block.text)
        .join('');
    return {
        title,
        description,
        markdown: buildMarkdown(title, filePath, description),
        sourcePath: filePath,
    };
}
|
|
65
|
+
/**
 * Renders the markdown page for an image source.
 *
 * The page has a heading, a metadata block quote, an embed of the image itself
 * and the description under its own heading.
 *
 * @param title Page heading; also used as the alt text of the embedded image.
 * @param filePath Path of the image; used as link target, embed target and, via its base name, as link text.
 * @param description Text placed under the "Description" heading.
 * @returns The markdown document, ending with a newline.
 */
function buildMarkdown(title, filePath, description) {
    return `# ${title}

> **Source:** [${basename(filePath)}](${filePath})
> **Type:** Image
> **Processed:** ${new Date().toISOString().split('T')[0]}

![${title}](${filePath})

## Description

${description}
`;
}
|
|
79
|
+
/**
 * Maps an image extension to the media type sent with the image data.
 *
 * The lookup is exact (no case folding), so callers pass a lower-cased
 * extension. Anything without an entry, including '.bmp', yields 'image/jpeg'.
 *
 * @param ext File extension including the leading dot.
 * @returns The media type string.
 */
function getMediaType(ext) {
    const knownTypes = new Map([
        ['.jpg', 'image/jpeg'],
        ['.jpeg', 'image/jpeg'],
        ['.png', 'image/png'],
        ['.gif', 'image/gif'],
        ['.webp', 'image/webp'],
    ]);
    const mediaType = knownTypes.get(ext);
    return mediaType === undefined ? 'image/jpeg' : mediaType;
}
|
|
94
|
+
//# sourceMappingURL=image.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"image.js","sourceRoot":"","sources":["../../src/processors/image.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,YAAY,EAAE,MAAM,SAAS,CAAC;AACvC,OAAO,EAAE,QAAQ,EAAE,OAAO,EAAE,MAAM,WAAW,CAAC;AAC9C,OAAO,SAAS,MAAM,mBAAmB,CAAC;AAS1C,MAAM,oBAAoB,GAAG,IAAI,GAAG,CAAC,CAAC,MAAM,EAAE,OAAO,EAAE,MAAM,EAAE,MAAM,EAAE,OAAO,EAAE,MAAM,CAAC,CAAC,CAAC;AAEzF,MAAM,UAAU,WAAW,CAAC,QAAgB;IAC1C,OAAO,oBAAoB,CAAC,GAAG,CAAC,OAAO,CAAC,QAAQ,CAAC,CAAC,WAAW,EAAE,CAAC,CAAC;AACnE,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,YAAY,CAAC,QAAgB;IACjD,MAAM,GAAG,GAAG,OAAO,CAAC,QAAQ,CAAC,CAAC,WAAW,EAAE,CAAC;IAC5C,MAAM,KAAK,GAAG,QAAQ,CAAC,QAAQ,EAAE,GAAG,CAAC,CAAC;IAEtC,IAAI,CAAC,oBAAoB,CAAC,GAAG,CAAC,GAAG,CAAC,EAAE,CAAC;QACnC,MAAM,IAAI,KAAK,CAAC,6BAA6B,GAAG,gBAAgB,CAAC,GAAG,oBAAoB,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;IAC1G,CAAC;IAED,MAAM,SAAS,GAAG,YAAY,CAAC,QAAQ,CAAC,CAAC;IACzC,MAAM,MAAM,GAAG,SAAS,CAAC,QAAQ,CAAC,QAAQ,CAAC,CAAC;IAC5C,MAAM,SAAS,GAAG,YAAY,CAAC,GAAG,CAAC,CAAC;IAEpC,0CAA0C;IAC1C,MAAM,MAAM,GAAG,OAAO,CAAC,GAAG,CAAC,mBAAmB,CAAC,CAAC;IAChD,IAAI,CAAC,MAAM,EAAE,CAAC;QACZ,qEAAqE;QACrE,OAAO;YACL,KAAK;YACL,WAAW,EAAE,eAAe,QAAQ,CAAC,QAAQ,CAAC,EAAE;YAChD,QAAQ,EAAE,aAAa,CAAC,KAAK,EAAE,QAAQ,EAAE,yDAAyD,CAAC;YACnG,UAAU,EAAE,QAAQ;SACrB,CAAC;IACJ,CAAC;IAED,MAAM,MAAM,GAAG,IAAI,SAAS,CAAC,EAAE,MAAM,EAAE,CAAC,CAAC;IAEzC,MAAM,QAAQ,GAAG,MAAM,MAAM,CAAC,QAAQ,CAAC,MAAM,CAAC;QAC5C,KAAK,EAAE,0BAA0B;QACjC,UAAU,EAAE,IAAI;QAChB,QAAQ,EAAE;YACR;gBACE,IAAI,EAAE,MAAM;gBACZ,OAAO,EAAE;oBACP;wBACE,IAAI,EAAE,OAAO;wBACb,MAAM,EAAE,EAAE,IAAI,EAAE,QAAQ,EAAE,UAAU,EAAE,SAAS,EAAE,IAAI,EAAE,MAAM,EAAE;qBAChE;oBACD;wBACE,IAAI,EAAE,MAAM;wBACZ,IAAI,EAAE;;;;;;8JAM4I;qBACnJ;iBACF;aACF;SACF;KACF,CAAC,CAAC;IAEH,MAAM,WAAW,GAAG,QAAQ,CAAC,OAAO;SACjC,MAAM,CAAC,CAAC,KAAK,EAAgC,EAAE,CAAC,KAAK,CAAC,IAAI,KAAK,MAAM,CAAC;SACtE,GAAG,CAAC,CAAC,KAAK,EAAE,EAAE,CAAC,KAAK,CAAC,IAAI,CAAC;SAC1B,IAAI,CAAC,EAAE,CAAC,CAAC;IAEZ,OAAO;QACL,KAAK;QACL,WAAW;QACX,QAAQ,EAAE,aAAa,CAAC,KAAK,EAAE,QAAQ,EAAE,WAAW,CAAC;QACrD,UAAU,EAAE,QAAQ;KACrB,CAAC;AACJ,CAAC;AAED,SAAS,aAAa,
CAAC,KAAa,EAAE,QAAgB,EAAE,WAAmB;IACzE,OAAO,KAAK,KAAK;;iBAEF,QAAQ,CAAC,QAAQ,CAAC,KAAK,QAAQ;;mBAE7B,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC;;IAErD,KAAK,KAAK,QAAQ;;;;EAIpB,WAAW;CACZ,CAAC;AACF,CAAC;AAED,SAAS,YAAY,CAAC,GAAW;IAC/B,QAAQ,GAAG,EAAE,CAAC;QACZ,KAAK,MAAM,CAAC;QACZ,KAAK,OAAO;YACV,OAAO,YAAY,CAAC;QACtB,KAAK,MAAM;YACT,OAAO,WAAW,CAAC;QACrB,KAAK,MAAM;YACT,OAAO,WAAW,CAAC;QACrB,KAAK,OAAO;YACV,OAAO,YAAY,CAAC;QACtB;YACE,OAAO,YAAY,CAAC;IACxB,CAAC;AACH,CAAC"}
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
/**
 * Result of processing a single PDF file with `processPdf`.
 *
 * - `title`: base name of the source file with the `.pdf` suffix removed.
 * - `content`: text pulled out of the PDF, or a bracketed placeholder sentence
 *   when extraction produced nothing.
 * - `markdown`: Markdown document (heading, source link, type, optional page
 *   count, processing date, then `content`).
 * - `pageCount`: number of page objects found by a textual scan of the file;
 *   left undefined when the scan finds none.
 * - `sourcePath`: the path that was passed to `processPdf`, unchanged.
 */
export interface PdfResult {
|
|
2
|
+
title: string;
|
|
3
|
+
content: string;
|
|
4
|
+
markdown: string;
|
|
5
|
+
pageCount?: number;
|
|
6
|
+
sourcePath: string;
|
|
7
|
+
}
|
|
8
|
+
/**
 * Reports whether `filePath` looks like a PDF, judged only by its extension
 * (compared case-insensitively against `.pdf`); the file itself is not read.
 */
export declare function isPdfFile(filePath: string): boolean;
|
|
9
|
+
/**
 * Reads the file at `filePath` synchronously, extracts whatever text a
 * dependency-free scan of the raw bytes can find, and resolves with a
 * {@link PdfResult}. When no text is found, `content` is a placeholder
 * sentence rather than an empty string.
 *
 * Rejects if the file cannot be read.
 */
export declare function processPdf(filePath: string): Promise<PdfResult>;
|
|
10
|
+
//# sourceMappingURL=pdf.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"pdf.d.ts","sourceRoot":"","sources":["../../src/processors/pdf.ts"],"names":[],"mappings":"AAGA,MAAM,WAAW,SAAS;IACxB,KAAK,EAAE,MAAM,CAAC;IACd,OAAO,EAAE,MAAM,CAAC;IAChB,QAAQ,EAAE,MAAM,CAAC;IACjB,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,UAAU,EAAE,MAAM,CAAC;CACpB;AAED,wBAAgB,SAAS,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAEnD;AAED,wBAAsB,UAAU,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC,SAAS,CAAC,CAiBrE"}
|
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
import { readFileSync } from 'node:fs';
|
|
2
|
+
import { basename, extname } from 'node:path';
|
|
3
|
+
/**
 * Decide whether a path refers to a PDF by looking at its extension only.
 *
 * @param filePath Path (or bare file name) to inspect; the file is not opened.
 * @returns `true` when the extension is `.pdf` in any letter case.
 */
export function isPdfFile(filePath) {
    const extension = extname(filePath);
    const normalized = extension.toLowerCase();
    return normalized === '.pdf';
}
|
|
6
|
+
/**
 * Read a PDF from disk and turn it into a title, plain-text content and a
 * Markdown rendering.
 *
 * The title is the file's base name without its PDF extension. The extension
 * is stripped case-insensitively so that `Report.PDF` (which `isPdfFile`
 * accepts) yields `Report` rather than `Report.PDF`.
 *
 * @param filePath Path of the PDF to read (read synchronously).
 * @returns An object with `title`, `content`, `markdown`, `pageCount`
 *          (undefined when no page objects are found) and `sourcePath`.
 * @throws Whatever `readFileSync` throws when the file cannot be read; because
 *         the function is async this surfaces as a rejected promise.
 */
export async function processPdf(filePath) {
    // Use the extension exactly as it is spelled in the path when it is some
    // casing of ".pdf"; otherwise fall back to the literal '.pdf' suffix so
    // non-PDF names behave exactly as before.
    const ext = extname(filePath);
    const suffix = ext.toLowerCase() === '.pdf' ? ext : '.pdf';
    const title = basename(filePath, suffix);
    const buffer = readFileSync(filePath);
    // Extract text from PDF
    const text = extractTextFromPdf(buffer);
    const pageCount = countPages(buffer);
    // Never hand back empty content: downstream consumers get an explicit
    // placeholder when the scan found no text.
    const content = text.trim() || `[PDF content from ${title} — text extraction yielded no results]`;
    return {
        title,
        content,
        markdown: buildMarkdown(title, filePath, content, pageCount),
        pageCount,
        sourcePath: filePath,
    };
}
|
|
21
|
+
/**
 * Render the Markdown document for a processed PDF.
 *
 * Layout: `# title`, a block quote with the source link, the type, an optional
 * page count and the processing date (UTC, `YYYY-MM-DD`), then a `## Content`
 * section holding `content`.
 *
 * @param title     Heading text.
 * @param filePath  Path of the source file; its base name is the link text and
 *                  the path itself is the link destination.
 * @param content   Body text placed under `## Content`.
 * @param pageCount Page count; the "Pages" line is omitted when this is falsy.
 * @returns The Markdown document as a string, ending with a newline.
 */
function buildMarkdown(title, filePath, content, pageCount) {
    // A bare Markdown link destination may not contain whitespace or
    // unbalanced parentheses, so such paths are written in the <...> form,
    // with literal angle brackets backslash-escaped. Simple paths are emitted
    // unchanged.
    const destination = /[\s()<>]/.test(filePath)
        ? `<${filePath.replace(/[<>]/g, '\\$&')}>`
        : filePath;
    const pagesLine = pageCount ? `\n> **Pages:** ${pageCount}` : '';
    const processedOn = new Date().toISOString().split('T')[0];
    return `# ${title}

> **Source:** [${basename(filePath)}](${destination})
> **Type:** PDF${pagesLine}
> **Processed:** ${processedOn}

## Content

${content}
`;
}
|
|
33
|
+
/**
 * Best-effort, dependency-free text extraction from raw PDF bytes.
 *
 * Method 1 scans every `BT ... ET` text object for literal strings shown with
 * the `Tj` operator or inside a `TJ` array and decodes their escape sequences.
 * Strings are collected in the order they appear, so mixed `Tj`/`TJ` content
 * keeps its reading order, and backslash-escaped parentheses inside a string
 * no longer cut the string short.
 *
 * Method 2 runs only when Method 1 found nothing: any `stream ... endstream`
 * body that is mostly printable ASCII (more than half of its characters, and
 * more than 50 of them) is taken as text.
 *
 * Limitations visible in this code: stream contents are not decompressed, hex
 * strings (`<...>`) are ignored, and unescaped nested parentheses inside a
 * literal string are not matched.
 *
 * @param buffer Raw file contents (a Node `Buffer`).
 * @returns The collected fragments joined by single spaces with all runs of
 *          whitespace collapsed; an empty string when nothing was found.
 */
function extractTextFromPdf(buffer) {
    // latin1 maps every byte to exactly one character, so binary sections
    // cannot corrupt the surrounding text during decoding.
    const content = buffer.toString('latin1');
    const textParts = [];
    // Body of a literal string: an escaped pair (backslash + any character)
    // or any character that is neither a backslash nor a parenthesis.
    const literalRegex = /\(((?:\\[\s\S]|[^\\()])*)\)/g;
    // Group 1: the string shown by `(...) Tj`.
    // Group 2: the inside of `[...] TJ`, where complete literal strings may
    // themselves contain `]`.
    const showTextRegex = /\(((?:\\[\s\S]|[^\\()])*)\)\s*Tj|\[((?:\((?:\\[\s\S]|[^\\()])*\)|[^\]()])*)\]\s*TJ/g;
    // Method 1: Extract from BT...ET text objects (basic PDF text extraction)
    for (const [, block = ''] of content.matchAll(/BT\s([\s\S]*?)\sET/g)) {
        for (const [, single, array] of block.matchAll(showTextRegex)) {
            if (single !== undefined) {
                // Tj operator (single string)
                if (single)
                    textParts.push(decodePdfString(single));
                continue;
            }
            // TJ operator (array of strings interleaved with kerning numbers)
            for (const [, fragment] of (array ?? '').matchAll(literalRegex)) {
                if (fragment)
                    textParts.push(decodePdfString(fragment));
            }
        }
    }
    // Method 2: Extract from stream objects containing plain text
    if (textParts.length === 0) {
        for (const [, streamContent = ''] of content.matchAll(/stream\r?\n([\s\S]*?)\r?\nendstream/g)) {
            // Only include streams that look like text (have readable ASCII)
            const readableChars = streamContent.replace(/[^\x20-\x7E\r\n]/g, '');
            if (readableChars.length > streamContent.length * 0.5 && readableChars.length > 50) {
                textParts.push(readableChars.trim());
            }
        }
    }
    return textParts.join(' ').replace(/\s+/g, ' ').trim();
}
/**
 * Decode the escape sequences of a PDF literal string body (the text between
 * the outer parentheses).
 *
 * Decoding is done in a single left-to-right pass so that an escaped backslash
 * is never re-read as the start of another escape (`\\n` is a backslash
 * followed by `n`, not a newline). Handles `\n \r \t \b \f \( \) \\`, one- to
 * three-digit octal codes, and a backslash followed by a line break (line
 * continuation, removed). For any other escaped character the backslash is
 * dropped and the character kept. A lone trailing backslash is left as is.
 *
 * @param s Raw string body as it appears in the file.
 * @returns The decoded string.
 */
function decodePdfString(s) {
    return s.replace(/\\(?:([0-7]{1,3})|(\r\n?|\n)|([\s\S]))/g, (_match, octal, lineBreak, ch) => {
        if (octal !== undefined) {
            // Octal codes denote a single byte; overflow above 255 is discarded.
            return String.fromCharCode(parseInt(octal, 8) & 0xff);
        }
        if (lineBreak !== undefined) {
            return '';
        }
        switch (ch) {
            case 'n':
                return '\n';
            case 'r':
                return '\r';
            case 't':
                return '\t';
            case 'b':
                return '\b';
            case 'f':
                return '\f';
            default:
                // Covers \( \) \\ and unknown escapes (backslash ignored).
                return ch;
        }
    });
}
|
|
86
|
+
/**
 * Estimate the page count by counting `/Type /Page` dictionary entries in the
 * raw file text.
 *
 * Only the exact name `/Page` is counted: a name that continues with another
 * letter or digit (`/Pages`, the page-tree node, or `/PageLabel`, a page-label
 * dictionary) is rejected. This is a textual scan, so page objects that are
 * not present as plain text in the file are not seen.
 *
 * @param buffer Raw file contents (a Node `Buffer`).
 * @returns The number of matches, or `undefined` when there are none.
 */
function countPages(buffer) {
    const content = buffer.toString('latin1');
    // The lookahead requires the name to end right after "Page".
    const pageMatches = content.match(/\/Type\s*\/Page(?![A-Za-z0-9])/g);
    return pageMatches ? pageMatches.length : undefined;
}
|
|
92
|
+
//# sourceMappingURL=pdf.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"pdf.js","sourceRoot":"","sources":["../../src/processors/pdf.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,YAAY,EAAE,MAAM,SAAS,CAAC;AACvC,OAAO,EAAE,QAAQ,EAAE,OAAO,EAAE,MAAM,WAAW,CAAC;AAU9C,MAAM,UAAU,SAAS,CAAC,QAAgB;IACxC,OAAO,OAAO,CAAC,QAAQ,CAAC,CAAC,WAAW,EAAE,KAAK,MAAM,CAAC;AACpD,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,UAAU,CAAC,QAAgB;IAC/C,MAAM,KAAK,GAAG,QAAQ,CAAC,QAAQ,EAAE,MAAM,CAAC,CAAC;IACzC,MAAM,MAAM,GAAG,YAAY,CAAC,QAAQ,CAAC,CAAC;IAEtC,wBAAwB;IACxB,MAAM,IAAI,GAAG,kBAAkB,CAAC,MAAM,CAAC,CAAC;IACxC,MAAM,SAAS,GAAG,UAAU,CAAC,MAAM,CAAC,CAAC;IAErC,MAAM,OAAO,GAAG,IAAI,CAAC,IAAI,EAAE,IAAI,qBAAqB,KAAK,wCAAwC,CAAC;IAElG,OAAO;QACL,KAAK;QACL,OAAO;QACP,QAAQ,EAAE,aAAa,CAAC,KAAK,EAAE,QAAQ,EAAE,OAAO,EAAE,SAAS,CAAC;QAC5D,SAAS;QACT,UAAU,EAAE,QAAQ;KACrB,CAAC;AACJ,CAAC;AAED,SAAS,aAAa,CAAC,KAAa,EAAE,QAAgB,EAAE,OAAe,EAAE,SAAkB;IACzF,OAAO,KAAK,KAAK;;iBAEF,QAAQ,CAAC,QAAQ,CAAC,KAAK,QAAQ;iBAC/B,SAAS,CAAC,CAAC,CAAC,kBAAkB,SAAS,EAAE,CAAC,CAAC,CAAC,EAAE;mBAC5C,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC;;;;EAIvD,OAAO;CACR,CAAC;AACF,CAAC;AAED,SAAS,kBAAkB,CAAC,MAAc;IACxC,MAAM,OAAO,GAAG,MAAM,CAAC,QAAQ,CAAC,QAAQ,CAAC,CAAC;IAC1C,MAAM,SAAS,GAAa,EAAE,CAAC;IAE/B,0EAA0E;IAC1E,MAAM,SAAS,GAAG,qBAAqB,CAAC;IACxC,IAAI,KAA6B,CAAC;IAElC,OAAO,CAAC,KAAK,GAAG,SAAS,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC,KAAK,IAAI,EAAE,CAAC;QAClD,MAAM,KAAK,GAAG,KAAK,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;QAE7B,2CAA2C;QAC3C,MAAM,OAAO,GAAG,mBAAmB,CAAC;QACpC,IAAI,OAA+B,CAAC;QACpC,OAAO,CAAC,OAAO,GAAG,OAAO,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,KAAK,IAAI,EAAE,CAAC;YAChD,IAAI,OAAO,CAAC,CAAC,CAAC;gBAAE,SAAS,CAAC,IAAI,CAAC,eAAe,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;QAC9D,CAAC;QAED,8CAA8C;QAC9C,MAAM,YAAY,GAAG,oBAAoB,CAAC;QAC1C,IAAI,UAAkC,CAAC;QACvC,OAAO,CAAC,UAAU,GAAG,YAAY,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,KAAK,IAAI,EAAE,CAAC;YACxD,MAAM,GAAG,GAAG,UAAU,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;YAChC,MAAM,QAAQ,GAAG,cAAc,CAAC;YAChC,IAAI,QAAgC,CAAC;YACrC,OAAO,CAAC,QAAQ,GAAG,QAAQ,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,KAAK,IAAI,EAAE,CAAC;gBAChD,IAAI,QAAQ,C
AAC,CAAC,CAAC;oBAAE,SAAS,CAAC,IAAI,CAAC,eAAe,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;YAChE,CAAC;QACH,CAAC;IACH,CAAC;IAED,8DAA8D;IAC9D,IAAI,SAAS,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QAC3B,MAAM,WAAW,GAAG,sCAAsC,CAAC;QAC3D,IAAI,WAAmC,CAAC;QACxC,OAAO,CAAC,WAAW,GAAG,WAAW,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC,KAAK,IAAI,EAAE,CAAC;YAC1D,MAAM,aAAa,GAAG,WAAW,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;YAC3C,iEAAiE;YACjE,MAAM,aAAa,GAAG,aAAa,CAAC,OAAO,CAAC,mBAAmB,EAAE,EAAE,CAAC,CAAC;YACrE,IAAI,aAAa,CAAC,MAAM,GAAG,aAAa,CAAC,MAAM,GAAG,GAAG,IAAI,aAAa,CAAC,MAAM,GAAG,EAAE,EAAE,CAAC;gBACnF,SAAS,CAAC,IAAI,CAAC,aAAa,CAAC,IAAI,EAAE,CAAC,CAAC;YACvC,CAAC;QACH,CAAC;IACH,CAAC;IAED,OAAO,SAAS,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC,IAAI,EAAE,CAAC;AACzD,CAAC;AAED,SAAS,eAAe,CAAC,CAAS;IAChC,oCAAoC;IACpC,OAAO,CAAC;SACL,OAAO,CAAC,MAAM,EAAE,IAAI,CAAC;SACrB,OAAO,CAAC,MAAM,EAAE,IAAI,CAAC;SACrB,OAAO,CAAC,MAAM,EAAE,IAAI,CAAC;SACrB,OAAO,CAAC,OAAO,EAAE,GAAG,CAAC;SACrB,OAAO,CAAC,OAAO,EAAE,GAAG,CAAC;SACrB,OAAO,CAAC,OAAO,EAAE,IAAI,CAAC,CAAC;AAC5B,CAAC;AAED,SAAS,UAAU,CAAC,MAAc;IAChC,MAAM,OAAO,GAAG,MAAM,CAAC,QAAQ,CAAC,QAAQ,CAAC,CAAC;IAC1C,mDAAmD;IACnD,MAAM,WAAW,GAAG,OAAO,CAAC,KAAK,CAAC,0BAA0B,CAAC,CAAC;IAC9D,OAAO,WAAW,CAAC,CAAC,CAAC,WAAW,CAAC,MAAM,CAAC,CAAC,CAAC,SAAS,CAAC;AACtD,CAAC"}
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* PowerPoint (.pptx) processor.
|
|
3
|
+
* Extracts slide text and speaker notes from raw XML (no external deps).
|
|
4
|
+
*/
|
|
5
|
+
/**
 * Result of processing a single PowerPoint (.pptx) file with `processPptx`.
 *
 * NOTE(review): the implementation is not visible from this declaration file;
 * the field meanings below are inferred from the names and should be confirmed
 * against pptx.js.
 * - `title`: presumably a title derived from the presentation or its file name.
 * - `content`: presumably the extracted slide text and speaker notes.
 * - `markdown`: presumably a Markdown rendering of `content`.
 * - `slideCount`: presumably the number of slides found (always present).
 * - `sourcePath`: presumably the path that was passed to `processPptx`.
 */
export interface PptxResult {
|
|
6
|
+
title: string;
|
|
7
|
+
content: string;
|
|
8
|
+
markdown: string;
|
|
9
|
+
slideCount: number;
|
|
10
|
+
sourcePath: string;
|
|
11
|
+
}
|
|
12
|
+
/**
 * Processes the .pptx file at `filePath` and resolves with a {@link PptxResult}.
 *
 * NOTE(review): error behavior (unreadable file, not a valid .pptx) cannot be
 * determined from this declaration — confirm against the implementation.
 */
export declare function processPptx(filePath: string): Promise<PptxResult>;
|
|
13
|
+
//# sourceMappingURL=pptx.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"pptx.d.ts","sourceRoot":"","sources":["../../src/processors/pptx.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAKH,MAAM,WAAW,UAAU;IACzB,KAAK,EAAE,MAAM,CAAC;IACd,OAAO,EAAE,MAAM,CAAC;IAChB,QAAQ,EAAE,MAAM,CAAC;IACjB,UAAU,EAAE,MAAM,CAAC;IACnB,UAAU,EAAE,MAAM,CAAC;CACpB;AAQD,wBAAsB,WAAW,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC,UAAU,CAAC,CAsBvE"}
|