gnosys 4.4.6 → 5.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +12 -7
- package/dist/cli.js +140 -0
- package/dist/cli.js.map +1 -1
- package/dist/index.js +65 -0
- package/dist/index.js.map +1 -1
- package/dist/lib/attachments.d.ts +43 -0
- package/dist/lib/attachments.d.ts.map +1 -0
- package/dist/lib/attachments.js +154 -0
- package/dist/lib/attachments.js.map +1 -0
- package/dist/lib/audioExtract.d.ts +39 -0
- package/dist/lib/audioExtract.d.ts.map +1 -0
- package/dist/lib/audioExtract.js +220 -0
- package/dist/lib/audioExtract.js.map +1 -0
- package/dist/lib/chunkSplitter.d.ts +46 -0
- package/dist/lib/chunkSplitter.d.ts.map +1 -0
- package/dist/lib/chunkSplitter.js +233 -0
- package/dist/lib/chunkSplitter.js.map +1 -0
- package/dist/lib/config.d.ts +70 -1
- package/dist/lib/config.d.ts.map +1 -1
- package/dist/lib/config.js +62 -6
- package/dist/lib/config.js.map +1 -1
- package/dist/lib/db.d.ts +7 -1
- package/dist/lib/db.d.ts.map +1 -1
- package/dist/lib/db.js +31 -4
- package/dist/lib/db.js.map +1 -1
- package/dist/lib/dbWrite.d.ts.map +1 -1
- package/dist/lib/dbWrite.js +11 -0
- package/dist/lib/dbWrite.js.map +1 -1
- package/dist/lib/docxExtract.d.ts +27 -0
- package/dist/lib/docxExtract.d.ts.map +1 -0
- package/dist/lib/docxExtract.js +80 -0
- package/dist/lib/docxExtract.js.map +1 -0
- package/dist/lib/fileDetect.d.ts +20 -0
- package/dist/lib/fileDetect.d.ts.map +1 -0
- package/dist/lib/fileDetect.js +124 -0
- package/dist/lib/fileDetect.js.map +1 -0
- package/dist/lib/imageExtract.d.ts +26 -0
- package/dist/lib/imageExtract.d.ts.map +1 -0
- package/dist/lib/imageExtract.js +113 -0
- package/dist/lib/imageExtract.js.map +1 -0
- package/dist/lib/llm.d.ts +9 -0
- package/dist/lib/llm.d.ts.map +1 -1
- package/dist/lib/llm.js +102 -0
- package/dist/lib/llm.js.map +1 -1
- package/dist/lib/multimodalIngest.d.ts +68 -0
- package/dist/lib/multimodalIngest.d.ts.map +1 -0
- package/dist/lib/multimodalIngest.js +463 -0
- package/dist/lib/multimodalIngest.js.map +1 -0
- package/dist/lib/pdfExtract.d.ts +29 -0
- package/dist/lib/pdfExtract.d.ts.map +1 -0
- package/dist/lib/pdfExtract.js +163 -0
- package/dist/lib/pdfExtract.js.map +1 -0
- package/dist/lib/setup.d.ts +9 -0
- package/dist/lib/setup.d.ts.map +1 -1
- package/dist/lib/setup.js +241 -16
- package/dist/lib/setup.js.map +1 -1
- package/dist/lib/store.d.ts +3 -0
- package/dist/lib/store.d.ts.map +1 -1
- package/dist/lib/store.js.map +1 -1
- package/dist/lib/videoExtract.d.ts +30 -0
- package/dist/lib/videoExtract.d.ts.map +1 -0
- package/dist/lib/videoExtract.js +92 -0
- package/dist/lib/videoExtract.js.map +1 -0
- package/package.json +3 -1
|
@@ -0,0 +1,154 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Gnosys Attachments — File attachment management for multimodal ingestion.
|
|
3
|
+
*
|
|
4
|
+
* Stores binary files in .gnosys/attachments/<uuid>.<ext> with a JSON manifest
|
|
5
|
+
* at .gnosys/attachments/attachments.json for tracking metadata and memory links.
|
|
6
|
+
*/
|
|
7
|
+
import * as fs from "fs/promises";
|
|
8
|
+
import * as path from "path";
|
|
9
|
+
import * as crypto from "crypto";
|
|
10
|
+
// ─── MIME type lookup from extension ────────────────────────────────────
|
|
11
|
+
/**
 * Extension → MIME type table for the file kinds the attachment store knows.
 * Keys are lowercase extensions without the leading dot.
 */
const MIME_MAP = {
    // Documents
    pdf: "application/pdf",
    docx: "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    txt: "text/plain",
    md: "text/markdown",
    // Images
    png: "image/png",
    jpg: "image/jpeg",
    jpeg: "image/jpeg",
    gif: "image/gif",
    webp: "image/webp",
    svg: "image/svg+xml",
    // Audio
    mp3: "audio/mpeg",
    wav: "audio/wav",
    m4a: "audio/mp4",
    ogg: "audio/ogg",
    flac: "audio/flac",
    // Video
    mp4: "video/mp4",
    mkv: "video/x-matroska",
    mov: "video/quicktime",
    avi: "video/x-msvideo",
    webm: "video/webm",
};
/**
 * Look up the MIME type for a file extension.
 *
 * @param ext Extension without the leading dot; matched case-insensitively.
 * @returns The mapped MIME type, or "application/octet-stream" when the
 *          extension is not in MIME_MAP.
 */
function mimeFromExtension(ext) {
    const key = ext.toLowerCase();
    // Only own keys count. A plain `MIME_MAP[key]` read would also resolve
    // members inherited from Object.prototype (e.g. "constructor",
    // "__proto__") and return a function/object instead of a MIME string.
    if (Object.prototype.hasOwnProperty.call(MIME_MAP, key)) {
        return MIME_MAP[key];
    }
    return "application/octet-stream";
}
|
|
40
|
+
// ─── Helpers ────────────────────────────────────────────────────────────
|
|
41
|
+
/**
 * Resolve the directory that holds a store's attachment files.
 *
 * @param storePath Root directory of the store.
 * @returns `<storePath>/attachments`.
 */
function getAttachmentsDir(storePath) {
    const attachmentsDir = path.join(storePath, "attachments");
    return attachmentsDir;
}
|
|
44
|
+
/**
 * Resolve the path of the JSON manifest that tracks a store's attachments.
 *
 * @param storePath Root directory of the store.
 * @returns Path of `attachments.json` inside the attachments directory.
 */
function getManifestPath(storePath) {
    const attachmentsDir = getAttachmentsDir(storePath);
    return path.join(attachmentsDir, "attachments.json");
}
|
|
47
|
+
/**
 * Load the attachment manifest of a store.
 *
 * A manifest that does not exist yet is reported as empty. Every other
 * failure (unreadable file, invalid JSON, wrong shape) is raised instead of
 * being treated as "empty": callers write the returned object back, so
 * masking a damaged manifest would silently discard all existing records.
 *
 * @param storePath Root directory of the store.
 * @returns The parsed manifest, always carrying an `attachments` array.
 * @throws If the manifest exists but cannot be read or is not a valid manifest.
 */
async function readManifest(storePath) {
    const manifestPath = getManifestPath(storePath);
    let raw;
    try {
        raw = await fs.readFile(manifestPath, "utf-8");
    }
    catch (err) {
        const missing = typeof err === "object" && err !== null && "code" in err && err.code === "ENOENT";
        if (missing) {
            return { attachments: [] };
        }
        throw err;
    }
    let manifest;
    try {
        manifest = JSON.parse(raw);
    }
    catch (err) {
        const reason = err instanceof Error ? err.message : String(err);
        throw new Error(`Attachment manifest is not valid JSON: ${manifestPath} (${reason})`);
    }
    if (!manifest || !Array.isArray(manifest.attachments)) {
        throw new Error(`Attachment manifest has no "attachments" array: ${manifestPath}`);
    }
    return manifest;
}
|
|
57
|
+
/**
 * Persist the attachment manifest of a store as pretty-printed JSON.
 *
 * The content is written to a temporary sibling file and then renamed over
 * the manifest, so an interrupted write (crash, full disk) cannot leave a
 * truncated `attachments.json` behind.
 *
 * @param storePath Root directory of the store.
 * @param manifest Manifest object to serialise.
 * @throws Any filesystem error from writing or renaming; the temporary file
 *         is removed on a best-effort basis before rethrowing.
 */
async function writeManifest(storePath, manifest) {
    const manifestPath = getManifestPath(storePath);
    const payload = JSON.stringify(manifest, null, 2) + "\n";
    // pid + timestamp keeps temp names of separate processes from colliding.
    const tempPath = `${manifestPath}.${process.pid}.${Date.now()}.tmp`;
    try {
        await fs.writeFile(tempPath, payload, "utf-8");
        await fs.rename(tempPath, manifestPath);
    }
    catch (err) {
        // Best-effort cleanup; the original failure is what the caller needs to see.
        await fs.rm(tempPath, { force: true }).catch(() => { });
        throw err;
    }
}
|
|
61
|
+
/**
 * Compute the SHA-256 hash of a file's contents.
 *
 * The file is read in fixed-size chunks rather than loaded whole, so large
 * media files do not have to fit in memory (or under the size limit of a
 * single `readFile` call).
 *
 * @param filePath Path of the file to hash.
 * @returns Lowercase hex digest of the file contents.
 * @throws Any filesystem error from opening or reading the file.
 */
async function hashFile(filePath) {
    const hash = crypto.createHash("sha256");
    const handle = await fs.open(filePath, "r");
    try {
        const buffer = Buffer.allocUnsafe(1024 * 1024);
        for (;;) {
            // position = null → continue from the handle's current offset.
            const { bytesRead } = await handle.read(buffer, 0, buffer.length, null);
            if (bytesRead === 0) {
                break;
            }
            // update() consumes the bytes synchronously, so the buffer can be reused.
            hash.update(buffer.subarray(0, bytesRead));
        }
    }
    finally {
        await handle.close();
    }
    return hash.digest("hex");
}
|
|
68
|
+
// ─── Public API ─────────────────────────────────────────────────────────
|
|
69
|
+
/**
 * Ensure a store has an attachments directory and a manifest file.
 *
 * Idempotent: the directory is created recursively, and an empty manifest is
 * written only when the manifest file cannot be accessed.
 *
 * @param storePath Root directory of the store.
 */
export async function initAttachments(storePath) {
    await fs.mkdir(getAttachmentsDir(storePath), { recursive: true });
    let manifestMissing = false;
    try {
        await fs.access(getManifestPath(storePath));
    }
    catch {
        manifestMissing = true;
    }
    if (manifestMissing) {
        // No manifest yet — start with an empty one
        await writeManifest(storePath, { attachments: [] });
    }
}
|
|
85
|
+
/**
 * Copy a file into the store as `attachments/<uuid>.<ext>` and register it in
 * the manifest.
 *
 * Files are de-duplicated by SHA-256 content hash: when the manifest already
 * holds a record with the same hash, that record is returned and nothing is
 * copied.
 *
 * @param storePath Root directory of the store.
 * @param filePath Path of the file to import.
 * @returns The attachment record (existing one for duplicate content,
 *          otherwise the newly created one with an empty `memoryIds` list).
 * @throws Any filesystem error from reading the source, copying it, or
 *         updating the manifest. If the manifest update fails, the copy made
 *         by this call is removed again so no untracked file is left behind.
 */
export async function storeAttachment(storePath, filePath) {
    // Make sure attachments dir and manifest exist
    await initAttachments(storePath);
    // Gather file info
    const stat = await fs.stat(filePath);
    const originalName = path.basename(filePath);
    // Files without an extension are stored as ".bin"
    const ext = path.extname(filePath).slice(1).toLowerCase() || "bin";
    const contentHash = await hashFile(filePath);
    // De-duplicate by content hash
    const manifest = await readManifest(storePath);
    const existing = manifest.attachments.find((a) => a.contentHash === contentHash);
    if (existing) {
        return existing;
    }
    // Copy under a fresh UUID
    const uuid = crypto.randomUUID();
    const destPath = path.join(getAttachmentsDir(storePath), `${uuid}.${ext}`);
    await fs.copyFile(filePath, destPath);
    const record = {
        uuid,
        originalName,
        extension: ext,
        mimeType: mimeFromExtension(ext),
        sizeBytes: stat.size,
        contentHash,
        createdAt: new Date().toISOString(),
        memoryIds: [],
    };
    manifest.attachments.push(record);
    try {
        await writeManifest(storePath, manifest);
    }
    catch (err) {
        // Roll back the copy: without a manifest entry the file would be an
        // orphan that no later call can find or de-duplicate against.
        await fs.rm(destPath, { force: true }).catch(() => { });
        throw err;
    }
    return record;
}
|
|
126
|
+
/**
 * Return every attachment record stored in the manifest.
 *
 * @param storePath Root directory of the store.
 * @returns The manifest's `attachments` array.
 */
export async function listAttachments(storePath) {
    const { attachments } = await readManifest(storePath);
    return attachments;
}
|
|
133
|
+
/**
 * Build the filesystem path of a stored attachment.
 *
 * @param storePath Root directory of the store.
 * @param uuid Attachment UUID.
 * @param ext File extension without the leading dot.
 * @returns `<attachments dir>/<uuid>.<ext>`; the file is not checked for existence.
 */
export function getAttachmentPath(storePath, uuid, ext) {
    const fileName = `${uuid}.${ext}`;
    return path.join(getAttachmentsDir(storePath), fileName);
}
|
|
139
|
+
/**
 * Record that a memory references an attachment.
 *
 * The memory ID is appended to the attachment's `memoryIds` and the manifest
 * is rewritten; if the ID is already listed nothing is written.
 *
 * @param storePath Root directory of the store.
 * @param uuid UUID of the attachment to link.
 * @param memoryId ID of the referencing memory.
 * @throws Error when no attachment with the given UUID is in the manifest.
 */
export async function linkMemoryToAttachment(storePath, uuid, memoryId) {
    const manifest = await readManifest(storePath);
    const target = manifest.attachments.find((entry) => entry.uuid === uuid);
    if (!target) {
        throw new Error(`Attachment not found: ${uuid}`);
    }
    const alreadyLinked = target.memoryIds.includes(memoryId);
    if (alreadyLinked) {
        return;
    }
    target.memoryIds.push(memoryId);
    await writeManifest(storePath, manifest);
}
|
|
154
|
+
//# sourceMappingURL=attachments.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"attachments.js","sourceRoot":"","sources":["../../src/lib/attachments.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,KAAK,EAAE,MAAM,aAAa,CAAC;AAClC,OAAO,KAAK,IAAI,MAAM,MAAM,CAAC;AAC7B,OAAO,KAAK,MAAM,MAAM,QAAQ,CAAC;AAmBjC,2EAA2E;AAE3E,MAAM,QAAQ,GAA2B;IACvC,YAAY;IACZ,GAAG,EAAE,iBAAiB;IACtB,IAAI,EAAE,yEAAyE;IAC/E,GAAG,EAAE,YAAY;IACjB,EAAE,EAAE,eAAe;IACnB,SAAS;IACT,GAAG,EAAE,WAAW;IAChB,GAAG,EAAE,YAAY;IACjB,IAAI,EAAE,YAAY;IAClB,GAAG,EAAE,WAAW;IAChB,IAAI,EAAE,YAAY;IAClB,GAAG,EAAE,eAAe;IACpB,QAAQ;IACR,GAAG,EAAE,YAAY;IACjB,GAAG,EAAE,WAAW;IAChB,GAAG,EAAE,WAAW;IAChB,GAAG,EAAE,WAAW;IAChB,IAAI,EAAE,YAAY;IAClB,QAAQ;IACR,GAAG,EAAE,WAAW;IAChB,GAAG,EAAE,kBAAkB;IACvB,GAAG,EAAE,iBAAiB;IACtB,GAAG,EAAE,iBAAiB;IACtB,IAAI,EAAE,YAAY;CACnB,CAAC;AAEF,SAAS,iBAAiB,CAAC,GAAW;IACpC,OAAO,QAAQ,CAAC,GAAG,CAAC,WAAW,EAAE,CAAC,IAAI,0BAA0B,CAAC;AACnE,CAAC;AAED,2EAA2E;AAE3E,SAAS,iBAAiB,CAAC,SAAiB;IAC1C,OAAO,IAAI,CAAC,IAAI,CAAC,SAAS,EAAE,aAAa,CAAC,CAAC;AAC7C,CAAC;AAED,SAAS,eAAe,CAAC,SAAiB;IACxC,OAAO,IAAI,CAAC,IAAI,CAAC,iBAAiB,CAAC,SAAS,CAAC,EAAE,kBAAkB,CAAC,CAAC;AACrE,CAAC;AAED,KAAK,UAAU,YAAY,CAAC,SAAiB;IAC3C,MAAM,YAAY,GAAG,eAAe,CAAC,SAAS,CAAC,CAAC;IAChD,IAAI,CAAC;QACH,MAAM,GAAG,GAAG,MAAM,EAAE,CAAC,QAAQ,CAAC,YAAY,EAAE,OAAO,CAAC,CAAC;QACrD,OAAO,IAAI,CAAC,KAAK,CAAC,GAAG,CAAuB,CAAC;IAC/C,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,EAAE,WAAW,EAAE,EAAE,EAAE,CAAC;IAC7B,CAAC;AACH,CAAC;AAED,KAAK,UAAU,aAAa,CAAC,SAAiB,EAAE,QAA4B;IAC1E,MAAM,YAAY,GAAG,eAAe,CAAC,SAAS,CAAC,CAAC;IAChD,MAAM,EAAE,CAAC,SAAS,CAAC,YAAY,EAAE,IAAI,CAAC,SAAS,CAAC,QAAQ,EAAE,IAAI,EAAE,CAAC,CAAC,GAAG,IAAI,EAAE,OAAO,CAAC,CAAC;AACtF,CAAC;AAED;;GAEG;AACH,KAAK,UAAU,QAAQ,CAAC,QAAgB;IACtC,MAAM,OAAO,GAAG,MAAM,EAAE,CAAC,QAAQ,CAAC,QAAQ,CAAC,CAAC;IAC5C,OAAO,MAAM,CAAC,UAAU,CAAC,QAAQ,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;AACnE,CAAC;AAED,2EAA2E;AAE3E;;;GAGG;AACH,MAAM,CAAC,KAAK,UAAU,eAAe,CAAC,SAAiB;IACrD,MAAM,GAAG,GAAG,iBAAiB,CAAC,SAAS,CAAC,CAAC;IACzC,MAAM,EAAE,CAAC,KAAK,CAAC,GAAG,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;IAEzC
,MAAM,YAAY,GAAG,eAAe,CAAC,SAAS,CAAC,CAAC;IAChD,IAAI,CAAC;QACH,MAAM,EAAE,CAAC,MAAM,CAAC,YAAY,CAAC,CAAC;IAChC,CAAC;IAAC,MAAM,CAAC;QACP,4CAA4C;QAC5C,MAAM,aAAa,CAAC,SAAS,EAAE,EAAE,WAAW,EAAE,EAAE,EAAE,CAAC,CAAC;IACtD,CAAC;AACH,CAAC;AAED;;;;;;GAMG;AACH,MAAM,CAAC,KAAK,UAAU,eAAe,CACnC,SAAiB,EACjB,QAAgB;IAEhB,mCAAmC;IACnC,MAAM,eAAe,CAAC,SAAS,CAAC,CAAC;IAEjC,gBAAgB;IAChB,MAAM,IAAI,GAAG,MAAM,EAAE,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC;IACrC,MAAM,YAAY,GAAG,IAAI,CAAC,QAAQ,CAAC,QAAQ,CAAC,CAAC;IAC7C,MAAM,GAAG,GAAG,IAAI,CAAC,OAAO,CAAC,QAAQ,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,WAAW,EAAE,IAAI,KAAK,CAAC;IACnE,MAAM,WAAW,GAAG,MAAM,QAAQ,CAAC,QAAQ,CAAC,CAAC;IAE7C,sCAAsC;IACtC,MAAM,QAAQ,GAAG,MAAM,YAAY,CAAC,SAAS,CAAC,CAAC;IAC/C,MAAM,QAAQ,GAAG,QAAQ,CAAC,WAAW,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,WAAW,KAAK,WAAW,CAAC,CAAC;IACjF,IAAI,QAAQ,EAAE,CAAC;QACb,OAAO,QAAQ,CAAC;IAClB,CAAC;IAED,8BAA8B;IAC9B,MAAM,IAAI,GAAG,MAAM,CAAC,UAAU,EAAE,CAAC;IACjC,MAAM,QAAQ,GAAG,IAAI,CAAC,IAAI,CAAC,iBAAiB,CAAC,SAAS,CAAC,EAAE,GAAG,IAAI,IAAI,GAAG,EAAE,CAAC,CAAC;IAC3E,MAAM,EAAE,CAAC,QAAQ,CAAC,QAAQ,EAAE,QAAQ,CAAC,CAAC;IAEtC,gBAAgB;IAChB,MAAM,MAAM,GAAqB;QAC/B,IAAI;QACJ,YAAY;QACZ,SAAS,EAAE,GAAG;QACd,QAAQ,EAAE,iBAAiB,CAAC,GAAG,CAAC;QAChC,SAAS,EAAE,IAAI,CAAC,IAAI;QACpB,WAAW;QACX,SAAS,EAAE,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE;QACnC,SAAS,EAAE,EAAE;KACd,CAAC;IAEF,kBAAkB;IAClB,QAAQ,CAAC,WAAW,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;IAClC,MAAM,aAAa,CAAC,SAAS,EAAE,QAAQ,CAAC,CAAC;IAEzC,OAAO,MAAM,CAAC;AAChB,CAAC;AAED;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,eAAe,CAAC,SAAiB;IACrD,MAAM,QAAQ,GAAG,MAAM,YAAY,CAAC,SAAS,CAAC,CAAC;IAC/C,OAAO,QAAQ,CAAC,WAAW,CAAC;AAC9B,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,iBAAiB,CAAC,SAAiB,EAAE,IAAY,EAAE,GAAW;IAC5E,OAAO,IAAI,CAAC,IAAI,CAAC,iBAAiB,CAAC,SAAS,CAAC,EAAE,GAAG,IAAI,IAAI,GAAG,EAAE,CAAC,CAAC;AACnE,CAAC;AAED;;;GAGG;AACH,MAAM,CAAC,KAAK,UAAU,sBAAsB,CAC1C,SAAiB,EACjB,IAAY,EACZ,QAAgB;IAEhB,MAAM,QAAQ,GAAG,MAAM,YAAY,CAAC,SAAS,CAAC,CAAC;IAC/C,MAAM,MAAM,GAAG,QAAQ,CAAC,WAAW,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,KAAK,IAAI,CAAC
,CAAC;IACjE,IAAI,CAAC,MAAM,EAAE,CAAC;QACZ,MAAM,IAAI,KAAK,CAAC,yBAAyB,IAAI,EAAE,CAAC,CAAC;IACnD,CAAC;IAED,IAAI,CAAC,MAAM,CAAC,SAAS,CAAC,QAAQ,CAAC,QAAQ,CAAC,EAAE,CAAC;QACzC,MAAM,CAAC,SAAS,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC;QAChC,MAAM,aAAa,CAAC,SAAS,EAAE,QAAQ,CAAC,CAAC;IAC3C,CAAC;AACH,CAAC"}
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Gnosys Audio Extraction — Transcribe audio files with timestamps.
|
|
3
|
+
*
|
|
4
|
+
* API-first approach: Groq Whisper API ($0.02/hr) -> OpenAI Whisper API -> local Whisper (opt-in).
|
|
5
|
+
*
|
|
6
|
+
* Part of v5.0 Phase 4: Multimodal Ingestion — Audio support.
|
|
7
|
+
*/
|
|
8
|
+
/** One timestamped piece of a transcript. */
export interface TranscriptSegment {
    /** Transcribed text of this segment. */
    text: string;
    /** Offset, in seconds, at which the segment starts. */
    startTime: number;
    /** Offset, in seconds, at which the segment ends. */
    endTime: number;
}
|
|
15
|
+
/** Complete result of transcribing one audio file. */
export interface TranscriptResult {
    /** Timestamped segments, in the order the provider returned them. */
    segments: Array<TranscriptSegment>;
    /** The whole transcript as a single string. */
    fullText: string;
    /** Length of the audio in seconds. */
    duration: number;
    /** Language reported by the provider, when it reports one. */
    language?: string;
}
|
|
22
|
+
/** Optional settings for `transcribeAudio`. */
export interface TranscriptionOptions {
    /** Force a specific provider instead of the automatic Groq → OpenAI → local order. */
    provider?: "groq" | "openai" | "local";
    /** API key to use; takes precedence over the provider's environment variables. */
    apiKey?: string;
    /** Model name to request; each provider has its own default when omitted. */
    model?: string;
    /** Language hint forwarded to the provider when set. */
    language?: string;
}
|
|
28
|
+
/**
 * Transcribe an audio file into timestamped segments.
 *
 * Providers are tried in this order:
 *   1. The provider named in `options.provider`, when given
 *   2. Groq, when GNOSYS_GROQ_KEY or GROQ_API_KEY is set
 *   3. OpenAI, when GNOSYS_OPENAI_KEY or OPENAI_API_KEY is set
 *   4. Local Whisper, when @xenova/transformers is installed
 *   5. Otherwise an error with setup instructions is thrown
 *
 * @param filePath Path of the audio file to transcribe.
 * @param options Provider, credentials, model and language overrides.
 * @returns The transcript with segments, full text and duration.
 */
export declare function transcribeAudio(
    filePath: string,
    options?: TranscriptionOptions,
): Promise<TranscriptResult>;
|
|
39
|
+
//# sourceMappingURL=audioExtract.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"audioExtract.d.ts","sourceRoot":"","sources":["../../src/lib/audioExtract.ts"],"names":[],"mappings":"AAAA;;;;;;GAMG;AAOH,MAAM,WAAW,iBAAiB;IAChC,IAAI,EAAE,MAAM,CAAC;IACb,4BAA4B;IAC5B,SAAS,EAAE,MAAM,CAAC;IAClB,0BAA0B;IAC1B,OAAO,EAAE,MAAM,CAAC;CACjB;AAED,MAAM,WAAW,gBAAgB;IAC/B,QAAQ,EAAE,iBAAiB,EAAE,CAAC;IAC9B,QAAQ,EAAE,MAAM,CAAC;IACjB,sCAAsC;IACtC,QAAQ,EAAE,MAAM,CAAC;IACjB,QAAQ,CAAC,EAAE,MAAM,CAAC;CACnB;AAED,MAAM,WAAW,oBAAoB;IACnC,QAAQ,CAAC,EAAE,MAAM,GAAG,QAAQ,GAAG,OAAO,CAAC;IACvC,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,QAAQ,CAAC,EAAE,MAAM,CAAC;CACnB;AA8ND;;;;;;;;;GASG;AACH,wBAAsB,eAAe,CACnC,QAAQ,EAAE,MAAM,EAChB,OAAO,CAAC,EAAE,oBAAoB,GAC7B,OAAO,CAAC,gBAAgB,CAAC,CA+D3B"}
|
|
@@ -0,0 +1,220 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Gnosys Audio Extraction — Transcribe audio files with timestamps.
|
|
3
|
+
*
|
|
4
|
+
* API-first approach: Groq Whisper API ($0.02/hr) -> OpenAI Whisper API -> local Whisper (opt-in).
|
|
5
|
+
*
|
|
6
|
+
* Part of v5.0 Phase 4: Multimodal Ingestion — Audio support.
|
|
7
|
+
*/
|
|
8
|
+
import * as fs from "fs/promises";
|
|
9
|
+
import * as path from "path";
|
|
10
|
+
// ─── MIME type detection for audio files ─────────────────────────────────
|
|
11
|
+
/** Audio MIME types keyed by lowercase file extension, including the leading dot. */
const AUDIO_MIME = {
    ".mp3": "audio/mpeg",
    ".wav": "audio/wav",
    ".m4a": "audio/mp4",
    ".ogg": "audio/ogg",
    ".flac": "audio/flac",
    ".webm": "audio/webm",
    ".aac": "audio/aac",
};
/**
 * Guess the MIME type of an audio file from its extension.
 *
 * @param filePath Path or file name of the audio file.
 * @returns The mapped MIME type, or "audio/wav" when the extension is unknown.
 */
function detectAudioMime(filePath) {
    const extension = path.extname(filePath).toLowerCase();
    const known = AUDIO_MIME[extension];
    if (known) {
        return known;
    }
    return "audio/wav";
}
|
|
24
|
+
// ─── Groq / OpenAI Whisper APIs ─────────────────────────────────────────
/**
 * Per-provider settings for the two Whisper HTTP APIs. Both take the same
 * multipart request, so only endpoint, default model and the label used in
 * error messages differ.
 */
const WHISPER_APIS = {
    groq: {
        label: "Groq",
        endpoint: "https://api.groq.com/openai/v1/audio/transcriptions",
        defaultModel: "whisper-large-v3-turbo",
    },
    openai: {
        label: "OpenAI",
        endpoint: "https://api.openai.com/v1/audio/transcriptions",
        defaultModel: "whisper-1",
    },
};
/**
 * Upload audio to a Whisper transcription endpoint and parse the reply.
 *
 * Requests `verbose_json` with segment-level timestamps; `options.model`
 * overrides the provider default and `options.language` is sent when set.
 *
 * @throws Error `<label> Whisper API error (<status>): <body>` on a non-2xx reply.
 */
async function requestWhisperTranscription(api, audioBuffer, fileName, mimeType, apiKey, options) {
    const formData = new FormData();
    formData.append("file", new Blob([new Uint8Array(audioBuffer)], { type: mimeType }), fileName);
    formData.append("model", options?.model || api.defaultModel);
    formData.append("response_format", "verbose_json");
    formData.append("timestamp_granularities[]", "segment");
    if (options?.language) {
        formData.append("language", options.language);
    }
    // No Content-Type header: fetch derives the multipart boundary from the FormData body.
    const response = await fetch(api.endpoint, {
        method: "POST",
        headers: {
            Authorization: `Bearer ${apiKey}`,
        },
        body: formData,
    });
    if (!response.ok) {
        const errorText = await response.text();
        throw new Error(`${api.label} Whisper API error (${response.status}): ${errorText}`);
    }
    const data = (await response.json());
    return parseWhisperResponse(data);
}
/**
 * Transcribe audio through the Groq Whisper API.
 *
 * @param audioBuffer Raw bytes of the audio file.
 * @param fileName File name sent with the upload.
 * @param mimeType MIME type sent with the upload.
 * @param apiKey Groq API key.
 * @param options Optional model / language overrides (default model "whisper-large-v3-turbo").
 * @returns The parsed transcript.
 * @throws Error when the API answers with a non-2xx status.
 */
async function transcribeWithGroq(audioBuffer, fileName, mimeType, apiKey, options) {
    return requestWhisperTranscription(WHISPER_APIS.groq, audioBuffer, fileName, mimeType, apiKey, options);
}
// ─── OpenAI Whisper API ─────────────────────────────────────────────────
/**
 * Transcribe audio through the OpenAI Whisper API.
 *
 * @param audioBuffer Raw bytes of the audio file.
 * @param fileName File name sent with the upload.
 * @param mimeType MIME type sent with the upload.
 * @param apiKey OpenAI API key.
 * @param options Optional model / language overrides (default model "whisper-1").
 * @returns The parsed transcript.
 * @throws Error when the API answers with a non-2xx status.
 */
async function transcribeWithOpenAI(audioBuffer, fileName, mimeType, apiKey, options) {
    return requestWhisperTranscription(WHISPER_APIS.openai, audioBuffer, fileName, mimeType, apiKey, options);
}
|
|
71
|
+
/**
 * Transcribe an audio file locally with a Whisper model run through the
 * optional `@xenova/transformers` package.
 *
 * @param filePath Path of the audio file, passed straight to the pipeline.
 * @param options Optional model (default "Xenova/whisper-small") and language.
 * @returns Transcript built from the pipeline's timestamped chunks. No
 *          `language` is set on this path.
 * @throws Error with install instructions when `@xenova/transformers` cannot
 *         be imported; any error raised by the pipeline itself propagates.
 */
async function transcribeWithLocal(filePath, options) {
    let pipeline;
    try {
        // Dynamic import — @xenova/transformers is an optional dependency
        const transformers = await import("@xenova/transformers");
        pipeline = transformers.pipeline;
    }
    catch {
        throw new Error("Local Whisper transcription requires @xenova/transformers. " +
            'Install it with: npm install @xenova/transformers\n' +
            "Or set a Groq/OpenAI API key for cloud transcription.");
    }
    const modelName = options?.model || "Xenova/whisper-small";
    const transcriber = await pipeline("automatic-speech-recognition", modelName);
    // NOTE(review): transformers.js may only accept a path/URL where an
    // AudioContext exists and expect decoded samples under Node — confirm
    // that passing filePath works in the supported runtimes.
    const result = await transcriber(filePath, {
        return_timestamps: true,
        ...(options?.language ? { language: options.language } : {}),
    });
    const segments = [];
    let duration = 0;
    if (result.chunks && Array.isArray(result.chunks)) {
        for (const chunk of result.chunks) {
            const startTime = chunk.timestamp[0];
            // A missing end timestamp collapses the chunk to its start time.
            const endTime = chunk.timestamp[1] ?? startTime;
            segments.push({
                text: chunk.text.trim(),
                startTime,
                endTime,
            });
            if (endTime > duration) {
                duration = endTime;
            }
        }
    }
    // If no segments were returned, create a single segment from the full text
    if (segments.length === 0 && result.text) {
        segments.push({
            text: result.text.trim(),
            startTime: 0,
            endTime: 0,
        });
    }
    // result.text may be absent; fall back to the joined segment texts instead
    // of throwing on `.trim()`, matching how the API responses are parsed.
    const fullText = (typeof result.text === "string" ? result.text.trim() : "") ||
        segments.map((s) => s.text).join(" ");
    return {
        segments,
        fullText,
        duration,
    };
}
|
|
119
|
+
// ─── Response parsing ───────────────────────────────────────────────────
|
|
120
|
+
/**
 * Convert a Whisper `verbose_json` response into a transcript result.
 *
 * - Each entry of `data.segments` becomes a segment (text trimmed,
 *   `start`/`end` copied). Entries that are not objects are skipped and a
 *   missing `text` is treated as empty, so a malformed segment cannot abort
 *   the whole parse.
 * - `duration` is `data.duration` (0 when absent), raised to the largest
 *   segment end if that is later.
 * - With no segments but a `text`, a single segment spanning 0..duration is
 *   created.
 * - `fullText` is the trimmed `data.text`, or the non-empty segment texts
 *   joined by spaces when `data.text` is missing or blank.
 *
 * @param data Parsed JSON body of the transcription response.
 * @returns Segments, full text, duration and the response's `language`.
 */
function parseWhisperResponse(data) {
    const segments = [];
    let duration = data.duration ?? 0;
    if (Array.isArray(data.segments)) {
        for (const seg of data.segments) {
            if (!seg || typeof seg !== "object") {
                continue;
            }
            segments.push({
                text: typeof seg.text === "string" ? seg.text.trim() : "",
                startTime: seg.start,
                endTime: seg.end,
            });
            if (seg.end > duration) {
                duration = seg.end;
            }
        }
    }
    const trimmedText = typeof data.text === "string" ? data.text.trim() : "";
    // If the API returned no segments, create one from the full text
    if (segments.length === 0 && data.text) {
        segments.push({
            text: trimmedText,
            startTime: 0,
            endTime: duration,
        });
    }
    return {
        segments,
        fullText: trimmedText || segments.map((s) => s.text).filter((t) => t.length > 0).join(" "),
        duration,
        language: data.language,
    };
}
|
|
150
|
+
// ─── API key resolution ─────────────────────────────────────────────────
|
|
151
|
+
/**
 * Pick the Groq API key: `options.apiKey` first, then GNOSYS_GROQ_KEY, then
 * GROQ_API_KEY. Empty values are skipped.
 *
 * @param options Transcription options, may be undefined.
 * @returns The first non-empty candidate, otherwise the value of GROQ_API_KEY as-is.
 */
function resolveGroqKey(options) {
    if (options?.apiKey) {
        return options.apiKey;
    }
    const gnosysKey = process.env.GNOSYS_GROQ_KEY;
    if (gnosysKey) {
        return gnosysKey;
    }
    return process.env.GROQ_API_KEY;
}
|
|
154
|
+
/**
 * Pick the OpenAI API key: `options.apiKey` first, then GNOSYS_OPENAI_KEY,
 * then OPENAI_API_KEY. Empty values are skipped.
 *
 * @param options Transcription options, may be undefined.
 * @returns The first non-empty candidate, otherwise the value of OPENAI_API_KEY as-is.
 */
function resolveOpenAIKey(options) {
    if (options?.apiKey) {
        return options.apiKey;
    }
    const gnosysKey = process.env.GNOSYS_OPENAI_KEY;
    if (gnosysKey) {
        return gnosysKey;
    }
    return process.env.OPENAI_API_KEY;
}
|
|
157
|
+
// ─── Public API ─────────────────────────────────────────────────────────
|
|
158
|
+
/**
 * Transcribe an audio file with timestamps.
 *
 * Provider resolution order:
 *   1. If `options.provider` is specified, use that provider
 *   2. Try Groq (if GNOSYS_GROQ_KEY or GROQ_API_KEY is set)
 *   3. Try OpenAI (if GNOSYS_OPENAI_KEY or OPENAI_API_KEY is set)
 *   4. Try local Whisper (if @xenova/transformers is installed)
 *   5. Throw an error with setup instructions
 *
 * @param filePath Path of the audio file to transcribe.
 * @param options Provider, API key, model and language overrides.
 * @returns The transcript produced by the selected provider.
 * @throws Error when an explicitly requested API provider has no key, when
 *         the selected provider fails, or when no provider is available.
 *         A failure of an installed local Whisper is raised as-is rather
 *         than being reported as "no provider available".
 */
export async function transcribeAudio(filePath, options) {
    // Read the audio file up front; this also surfaces a missing or
    // unreadable file before any provider is contacted.
    const audioBuffer = await fs.readFile(filePath);
    const fileName = path.basename(filePath);
    const mimeType = detectAudioMime(filePath);
    // 1. Explicit provider requested
    if (options?.provider) {
        switch (options.provider) {
            case "groq": {
                const key = resolveGroqKey(options);
                if (!key) {
                    throw new Error("Groq transcription requires an API key. " +
                        "Set GNOSYS_GROQ_KEY or GROQ_API_KEY in your environment.");
                }
                return transcribeWithGroq(audioBuffer, fileName, mimeType, key, options);
            }
            case "openai": {
                const key = resolveOpenAIKey(options);
                if (!key) {
                    throw new Error("OpenAI transcription requires an API key. " +
                        "Set GNOSYS_OPENAI_KEY or OPENAI_API_KEY in your environment.");
                }
                return transcribeWithOpenAI(audioBuffer, fileName, mimeType, key, options);
            }
            case "local":
                return transcribeWithLocal(filePath, options);
            // Any other value is not handled here and continues with the
            // automatic resolution below.
        }
    }
    // 2. Try Groq (cheapest API option)
    const groqKey = resolveGroqKey(options);
    if (groqKey) {
        return transcribeWithGroq(audioBuffer, fileName, mimeType, groqKey, options);
    }
    // 3. Try OpenAI
    const openaiKey = resolveOpenAIKey(options);
    if (openaiKey) {
        return transcribeWithOpenAI(audioBuffer, fileName, mimeType, openaiKey, options);
    }
    // 4. Try local Whisper as a fallback — but only treat a missing optional
    //    dependency as "not available". Probing the import here keeps a real
    //    transcription failure from being replaced by the setup message below.
    let localAvailable = true;
    try {
        await import("@xenova/transformers");
    }
    catch {
        localAvailable = false;
    }
    if (localAvailable) {
        return transcribeWithLocal(filePath, options);
    }
    // 5. No provider available
    throw new Error("No transcription provider available. Set up one of:\n" +
        " 1. Groq API key: export GROQ_API_KEY=your-key (cheapest, $0.02/hr)\n" +
        " 2. OpenAI API key: export OPENAI_API_KEY=your-key\n" +
        " 3. Local Whisper: npm install @xenova/transformers\n" +
        "Or set multimodal.transcriptionProvider in gnosys.json.");
}
|
|
220
|
+
//# sourceMappingURL=audioExtract.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"audioExtract.js","sourceRoot":"","sources":["../../src/lib/audioExtract.ts"],"names":[],"mappings":"AAAA;;;;;;GAMG;AAEH,OAAO,KAAK,EAAE,MAAM,aAAa,CAAC;AAClC,OAAO,KAAK,IAAI,MAAM,MAAM,CAAC;AA2B7B,4EAA4E;AAE5E,MAAM,UAAU,GAA2B;IACzC,MAAM,EAAE,YAAY;IACpB,MAAM,EAAE,WAAW;IACnB,MAAM,EAAE,WAAW;IACnB,MAAM,EAAE,WAAW;IACnB,OAAO,EAAE,YAAY;IACrB,OAAO,EAAE,YAAY;IACrB,MAAM,EAAE,WAAW;CACpB,CAAC;AAEF,SAAS,eAAe,CAAC,QAAgB;IACvC,MAAM,GAAG,GAAG,IAAI,CAAC,OAAO,CAAC,QAAQ,CAAC,CAAC,WAAW,EAAE,CAAC;IACjD,OAAO,UAAU,CAAC,GAAG,CAAC,IAAI,WAAW,CAAC;AACxC,CAAC;AAiBD,KAAK,UAAU,kBAAkB,CAC/B,WAAmB,EACnB,QAAgB,EAChB,QAAgB,EAChB,MAAc,EACd,OAA8B;IAE9B,MAAM,QAAQ,GAAG,IAAI,QAAQ,EAAE,CAAC;IAChC,QAAQ,CAAC,MAAM,CAAC,MAAM,EAAE,IAAI,IAAI,CAAC,CAAC,IAAI,UAAU,CAAC,WAAW,CAAC,CAAC,EAAE,EAAE,IAAI,EAAE,QAAQ,EAAE,CAAC,EAAE,QAAQ,CAAC,CAAC;IAC/F,QAAQ,CAAC,MAAM,CAAC,OAAO,EAAE,OAAO,EAAE,KAAK,IAAI,wBAAwB,CAAC,CAAC;IACrE,QAAQ,CAAC,MAAM,CAAC,iBAAiB,EAAE,cAAc,CAAC,CAAC;IACnD,QAAQ,CAAC,MAAM,CAAC,2BAA2B,EAAE,SAAS,CAAC,CAAC;IAExD,IAAI,OAAO,EAAE,QAAQ,EAAE,CAAC;QACtB,QAAQ,CAAC,MAAM,CAAC,UAAU,EAAE,OAAO,CAAC,QAAQ,CAAC,CAAC;IAChD,CAAC;IAED,MAAM,QAAQ,GAAG,MAAM,KAAK,CAAC,qDAAqD,EAAE;QAClF,MAAM,EAAE,MAAM;QACd,OAAO,EAAE;YACP,aAAa,EAAE,UAAU,MAAM,EAAE;SAClC;QACD,IAAI,EAAE,QAAQ;KACf,CAAC,CAAC;IAEH,IAAI,CAAC,QAAQ,CAAC,EAAE,EAAE,CAAC;QACjB,MAAM,SAAS,GAAG,MAAM,QAAQ,CAAC,IAAI,EAAE,CAAC;QACxC,MAAM,IAAI,KAAK,CAAC,2BAA2B,QAAQ,CAAC,MAAM,MAAM,SAAS,EAAE,CAAC,CAAC;IAC/E,CAAC;IAED,MAAM,IAAI,GAAG,CAAC,MAAM,QAAQ,CAAC,IAAI,EAAE,CAAuB,CAAC;IAC3D,OAAO,oBAAoB,CAAC,IAAI,CAAC,CAAC;AACpC,CAAC;AAED,2EAA2E;AAE3E,KAAK,UAAU,oBAAoB,CACjC,WAAmB,EACnB,QAAgB,EAChB,QAAgB,EAChB,MAAc,EACd,OAA8B;IAE9B,MAAM,QAAQ,GAAG,IAAI,QAAQ,EAAE,CAAC;IAChC,QAAQ,CAAC,MAAM,CAAC,MAAM,EAAE,IAAI,IAAI,CAAC,CAAC,IAAI,UAAU,CAAC,WAAW,CAAC,CAAC,EAAE,EAAE,IAAI,EAAE,QAAQ,EAAE,CAAC,EAAE,QAAQ,CAAC,CAAC;IAC/F,QAAQ,CAAC,MAAM,CAAC,OAAO,EAAE,OAAO,EAAE,KAAK,IAAI,WAAW,CAAC,CAAC;IACxD,QAAQ,CAAC,MAAM,CAAC,iBAAiB,EAAE,cAAc,CAAC,CAAC;IACnD,QAAQ,CAAC,MAAM,CAAC,2BAA2B,EAAE,SAAS,CAAC,CAAC;IAEx
D,IAAI,OAAO,EAAE,QAAQ,EAAE,CAAC;QACtB,QAAQ,CAAC,MAAM,CAAC,UAAU,EAAE,OAAO,CAAC,QAAQ,CAAC,CAAC;IAChD,CAAC;IAED,MAAM,QAAQ,GAAG,MAAM,KAAK,CAAC,gDAAgD,EAAE;QAC7E,MAAM,EAAE,MAAM;QACd,OAAO,EAAE;YACP,aAAa,EAAE,UAAU,MAAM,EAAE;SAClC;QACD,IAAI,EAAE,QAAQ;KACf,CAAC,CAAC;IAEH,IAAI,CAAC,QAAQ,CAAC,EAAE,EAAE,CAAC;QACjB,MAAM,SAAS,GAAG,MAAM,QAAQ,CAAC,IAAI,EAAE,CAAC;QACxC,MAAM,IAAI,KAAK,CAAC,6BAA6B,QAAQ,CAAC,MAAM,MAAM,SAAS,EAAE,CAAC,CAAC;IACjF,CAAC;IAED,MAAM,IAAI,GAAG,CAAC,MAAM,QAAQ,CAAC,IAAI,EAAE,CAAuB,CAAC;IAC3D,OAAO,oBAAoB,CAAC,IAAI,CAAC,CAAC;AACpC,CAAC;AASD,KAAK,UAAU,mBAAmB,CAChC,QAAgB,EAChB,OAA8B;IAE9B,IAAI,QAA2D,CAAC;IAEhE,IAAI,CAAC;QACH,kEAAkE;QAClE,MAAM,YAAY,GAAG,MAAM,MAAM,CAAC,sBAAsB,CAAC,CAAC;QAC1D,QAAQ,GAAG,YAAY,CAAC,QAA2B,CAAC;IACtD,CAAC;IAAC,MAAM,CAAC;QACP,MAAM,IAAI,KAAK,CACb,6DAA6D;YAC3D,qDAAqD;YACrD,uDAAuD,CAC1D,CAAC;IACJ,CAAC;IAED,MAAM,SAAS,GAAG,OAAO,EAAE,KAAK,IAAI,sBAAsB,CAAC;IAC3D,MAAM,WAAW,GAAG,MAAM,QAAQ,CAAC,8BAA8B,EAAE,SAAS,CAGlB,CAAC;IAE3D,MAAM,MAAM,GAAG,MAAM,WAAW,CAAC,QAAQ,EAAE;QACzC,iBAAiB,EAAE,IAAI;QACvB,GAAG,CAAC,OAAO,EAAE,QAAQ,CAAC,CAAC,CAAC,EAAE,QAAQ,EAAE,OAAO,CAAC,QAAQ,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;KAC7D,CAAC,CAAC;IAEH,MAAM,QAAQ,GAAwB,EAAE,CAAC;IACzC,IAAI,QAAQ,GAAG,CAAC,CAAC;IAEjB,IAAI,MAAM,CAAC,MAAM,IAAI,KAAK,CAAC,OAAO,CAAC,MAAM,CAAC,MAAM,CAAC,EAAE,CAAC;QAClD,KAAK,MAAM,KAAK,IAAI,MAAM,CAAC,MAAM,EAAE,CAAC;YAClC,MAAM,SAAS,GAAG,KAAK,CAAC,SAAS,CAAC,CAAC,CAAC,CAAC;YACrC,MAAM,OAAO,GAAG,KAAK,CAAC,SAAS,CAAC,CAAC,CAAC,IAAI,SAAS,CAAC;YAChD,QAAQ,CAAC,IAAI,CAAC;gBACZ,IAAI,EAAE,KAAK,CAAC,IAAI,CAAC,IAAI,EAAE;gBACvB,SAAS;gBACT,OAAO;aACR,CAAC,CAAC;YACH,IAAI,OAAO,GAAG,QAAQ,EAAE,CAAC;gBACvB,QAAQ,GAAG,OAAO,CAAC;YACrB,CAAC;QACH,CAAC;IACH,CAAC;IAED,2EAA2E;IAC3E,IAAI,QAAQ,CAAC,MAAM,KAAK,CAAC,IAAI,MAAM,CAAC,IAAI,EAAE,CAAC;QACzC,QAAQ,CAAC,IAAI,CAAC;YACZ,IAAI,EAAE,MAAM,CAAC,IAAI,CAAC,IAAI,EAAE;YACxB,SAAS,EAAE,CAAC;YACZ,OAAO,EAAE,CAAC;SACX,CAAC,CAAC;IACL,CAAC;IAED,OAAO;QACL,QAAQ;QACR,QAAQ,EAAE,MAAM,CAAC,IAAI,CAAC,IAAI,EAAE;QAC5B,QAAQ;KACT,CAAC;AACJ,CAAC;AAED,2EAA2E;AAE3E,SAAS,oBAAoB
,CAAC,IAAwB;IACpD,MAAM,QAAQ,GAAwB,EAAE,CAAC;IACzC,IAAI,QAAQ,GAAG,IAAI,CAAC,QAAQ,IAAI,CAAC,CAAC;IAElC,IAAI,IAAI,CAAC,QAAQ,IAAI,KAAK,CAAC,OAAO,CAAC,IAAI,CAAC,QAAQ,CAAC,EAAE,CAAC;QAClD,KAAK,MAAM,GAAG,IAAI,IAAI,CAAC,QAAQ,EAAE,CAAC;YAChC,QAAQ,CAAC,IAAI,CAAC;gBACZ,IAAI,EAAE,GAAG,CAAC,IAAI,CAAC,IAAI,EAAE;gBACrB,SAAS,EAAE,GAAG,CAAC,KAAK;gBACpB,OAAO,EAAE,GAAG,CAAC,GAAG;aACjB,CAAC,CAAC;YACH,IAAI,GAAG,CAAC,GAAG,GAAG,QAAQ,EAAE,CAAC;gBACvB,QAAQ,GAAG,GAAG,CAAC,GAAG,CAAC;YACrB,CAAC;QACH,CAAC;IACH,CAAC;IAED,iEAAiE;IACjE,IAAI,QAAQ,CAAC,MAAM,KAAK,CAAC,IAAI,IAAI,CAAC,IAAI,EAAE,CAAC;QACvC,QAAQ,CAAC,IAAI,CAAC;YACZ,IAAI,EAAE,IAAI,CAAC,IAAI,CAAC,IAAI,EAAE;YACtB,SAAS,EAAE,CAAC;YACZ,OAAO,EAAE,QAAQ;SAClB,CAAC,CAAC;IACL,CAAC;IAED,OAAO;QACL,QAAQ;QACR,QAAQ,EAAE,IAAI,CAAC,IAAI,EAAE,IAAI,EAAE,IAAI,QAAQ,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC;QACpE,QAAQ;QACR,QAAQ,EAAE,IAAI,CAAC,QAAQ;KACxB,CAAC;AACJ,CAAC;AAED,2EAA2E;AAE3E,SAAS,cAAc,CAAC,OAA8B;IACpD,OAAO,OAAO,EAAE,MAAM,IAAI,OAAO,CAAC,GAAG,CAAC,eAAe,IAAI,OAAO,CAAC,GAAG,CAAC,YAAY,CAAC;AACpF,CAAC;AAED,SAAS,gBAAgB,CAAC,OAA8B;IACtD,OAAO,OAAO,EAAE,MAAM,IAAI,OAAO,CAAC,GAAG,CAAC,iBAAiB,IAAI,OAAO,CAAC,GAAG,CAAC,cAAc,CAAC;AACxF,CAAC;AAED,2EAA2E;AAE3E;;;;;;;;;GASG;AACH,MAAM,CAAC,KAAK,UAAU,eAAe,CACnC,QAAgB,EAChB,OAA8B;IAE9B,sBAAsB;IACtB,MAAM,WAAW,GAAG,MAAM,EAAE,CAAC,QAAQ,CAAC,QAAQ,CAAC,CAAC;IAChD,MAAM,QAAQ,GAAG,IAAI,CAAC,QAAQ,CAAC,QAAQ,CAAC,CAAC;IACzC,MAAM,QAAQ,GAAG,eAAe,CAAC,QAAQ,CAAC,CAAC;IAE3C,iCAAiC;IACjC,IAAI,OAAO,EAAE,QAAQ,EAAE,CAAC;QACtB,QAAQ,OAAO,CAAC,QAAQ,EAAE,CAAC;YACzB,KAAK,MAAM,CAAC,CAAC,CAAC;gBACZ,MAAM,GAAG,GAAG,cAAc,CAAC,OAAO,CAAC,CAAC;gBACpC,IAAI,CAAC,GAAG,EAAE,CAAC;oBACT,MAAM,IAAI,KAAK,CACb,0CAA0C;wBACxC,0DAA0D,CAC7D,CAAC;gBACJ,CAAC;gBACD,OAAO,kBAAkB,CAAC,WAAW,EAAE,QAAQ,EAAE,QAAQ,EAAE,GAAG,EAAE,OAAO,CAAC,CAAC;YAC3E,CAAC;YAED,KAAK,QAAQ,CAAC,CAAC,CAAC;gBACd,MAAM,GAAG,GAAG,gBAAgB,CAAC,OAAO,CAAC,CAAC;gBACtC,IAAI,CAAC,GAAG,EAAE,CAAC;oBACT,MAAM,IAAI,KAAK,CACb,4CAA4C;wBAC1C,8DAA8D,CACjE,CAAC;gBACJ,CAAC;gBACD
,OAAO,oBAAoB,CAAC,WAAW,EAAE,QAAQ,EAAE,QAAQ,EAAE,GAAG,EAAE,OAAO,CAAC,CAAC;YAC7E,CAAC;YAED,KAAK,OAAO;gBACV,OAAO,mBAAmB,CAAC,QAAQ,EAAE,OAAO,CAAC,CAAC;QAClD,CAAC;IACH,CAAC;IAED,oCAAoC;IACpC,MAAM,OAAO,GAAG,cAAc,CAAC,OAAO,CAAC,CAAC;IACxC,IAAI,OAAO,EAAE,CAAC;QACZ,OAAO,kBAAkB,CAAC,WAAW,EAAE,QAAQ,EAAE,QAAQ,EAAE,OAAO,EAAE,OAAO,CAAC,CAAC;IAC/E,CAAC;IAED,gBAAgB;IAChB,MAAM,SAAS,GAAG,gBAAgB,CAAC,OAAO,CAAC,CAAC;IAC5C,IAAI,SAAS,EAAE,CAAC;QACd,OAAO,oBAAoB,CAAC,WAAW,EAAE,QAAQ,EAAE,QAAQ,EAAE,SAAS,EAAE,OAAO,CAAC,CAAC;IACnF,CAAC;IAED,qCAAqC;IACrC,IAAI,CAAC;QACH,OAAO,MAAM,mBAAmB,CAAC,QAAQ,EAAE,OAAO,CAAC,CAAC;IACtD,CAAC;IAAC,MAAM,CAAC;QACP,gEAAgE;IAClE,CAAC;IAED,2BAA2B;IAC3B,MAAM,IAAI,KAAK,CACb,uDAAuD;QACrD,yEAAyE;QACzE,uDAAuD;QACvD,wDAAwD;QACxD,yDAAyD,CAC5D,CAAC;AACJ,CAAC"}
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Gnosys Chunk Splitter — Text chunking for multi-page content.
|
|
3
|
+
*
|
|
4
|
+
* Splits long text into memory-sized chunks at natural boundaries
|
|
5
|
+
* (paragraphs first, then sentences). Used by multimodal ingestion
|
|
6
|
+
* to break PDFs, transcripts, and documents into atomic memories.
|
|
7
|
+
*/
|
|
8
|
+
/** One chunk of text produced by the splitter. */
export interface TextChunk {
    /** The chunk's text. */
    text: string;
    /** Position of the chunk in the output — presumably zero-based; confirm against chunkSplitter.ts. */
    index: number;
    /** Page the chunk came from, when the input segment carried a page. */
    sourcePage?: string;
    /** Time range the chunk came from, when the input segment carried a timerange. */
    sourceTimerange?: string;
    /** Free-form extra data attached to the chunk. */
    metadata?: Record<string, unknown>;
}
|
|
15
|
+
/** Size limits, in characters, that steer how text is split into chunks. */
export interface ChunkOptions {
    /** Size the splitter aims for when accumulating paragraphs. Default: 1500. */
    targetSize?: number;
    /** Chunks shorter than this are merged with the next chunk. Default: 200. */
    minSize?: number;
    /** Paragraphs longer than this are split at sentence boundaries. Default: 4000. */
    maxSize?: number;
}
|
|
23
|
+
/**
 * Break a block of text into chunks along paragraph boundaries.
 *
 * Steps:
 *   1. Cut the text at double newlines (paragraphs)
 *   2. Collect paragraphs into a chunk until `targetSize` is reached
 *   3. Split any single paragraph longer than `maxSize` at sentence boundaries
 *   4. Merge chunks shorter than `minSize` with the chunk that follows
 *
 * @param text Text to split.
 * @param options Size limits; see ChunkOptions for the defaults.
 * @returns The resulting chunks.
 */
export declare function splitIntoChunks(
    text: string,
    options?: ChunkOptions,
): TextChunk[];
|
|
33
|
+
/**
 * Chunk content that is already divided into segments, such as PDF pages or
 * transcript segments.
 *
 * Every segment carries its own page or timerange and is chunked on its own,
 * so that source metadata is kept. A segment shorter than `minSize` is merged
 * with the following segment, but only when both belong to the same page.
 *
 * @param segments Input segments with their optional page / timerange.
 * @param options Size limits; see ChunkOptions for the defaults.
 * @returns The resulting chunks.
 */
export declare function splitSegments(
    segments: {
        text: string;
        page?: string;
        timerange?: string;
    }[],
    options?: ChunkOptions,
): TextChunk[];
|
|
46
|
+
//# sourceMappingURL=chunkSplitter.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"chunkSplitter.d.ts","sourceRoot":"","sources":["../../src/lib/chunkSplitter.ts"],"names":[],"mappings":"AAAA;;;;;;GAMG;AAIH,MAAM,WAAW,SAAS;IACxB,IAAI,EAAE,MAAM,CAAC;IACb,KAAK,EAAE,MAAM,CAAC;IACd,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,eAAe,CAAC,EAAE,MAAM,CAAC;IACzB,QAAQ,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;CACpC;AAED,MAAM,WAAW,YAAY;IAC3B,sDAAsD;IACtD,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,gFAAgF;IAChF,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,4FAA4F;IAC5F,OAAO,CAAC,EAAE,MAAM,CAAC;CAClB;AAyFD;;;;;;;;GAQG;AACH,wBAAgB,eAAe,CAAC,IAAI,EAAE,MAAM,EAAE,OAAO,CAAC,EAAE,YAAY,GAAG,SAAS,EAAE,CAkEjF;AAED;;;;;;;GAOG;AACH,wBAAgB,aAAa,CAC3B,QAAQ,EAAE,KAAK,CAAC;IAAE,IAAI,EAAE,MAAM,CAAC;IAAC,IAAI,CAAC,EAAE,MAAM,CAAC;IAAC,SAAS,CAAC,EAAE,MAAM,CAAA;CAAE,CAAC,EACpE,OAAO,CAAC,EAAE,YAAY,GACrB,SAAS,EAAE,CAkFb"}
|