@datasynx/agentic-crm 1.0.0 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (81) hide show
  1. package/README.md +8 -1
  2. package/dist/{ask-D8iYqDAr.js → ask-CDysGnRg.js} +2 -2
  3. package/dist/{ask-D8iYqDAr.js.map → ask-CDysGnRg.js.map} +1 -1
  4. package/dist/attachments-CX2GAtsw.cjs +517 -0
  5. package/dist/attachments-CX2GAtsw.cjs.map +1 -0
  6. package/dist/attachments-D207gXfN.js +514 -0
  7. package/dist/attachments-D207gXfN.js.map +1 -0
  8. package/dist/attachments-rLa96rOK.js +514 -0
  9. package/dist/attachments-rLa96rOK.js.map +1 -0
  10. package/dist/chunk-BfDYWZQ8.cjs +32 -0
  11. package/dist/chunk-BfDYWZQ8.cjs.map +1 -0
  12. package/dist/chunk-BhUZmQg5.js +32 -0
  13. package/dist/chunk-BhUZmQg5.js.map +1 -0
  14. package/dist/chunk-ChC83jai.js +2 -0
  15. package/dist/chunk-e_w8qqtP.js +32 -0
  16. package/dist/chunk-e_w8qqtP.js.map +1 -0
  17. package/dist/cli.js +16 -15
  18. package/dist/cli.js.map +1 -1
  19. package/dist/daemon/worker.js +3 -3
  20. package/dist/email-body-BFSRa0AW.cjs +42 -0
  21. package/dist/email-body-BFSRa0AW.cjs.map +1 -0
  22. package/dist/email-body-BOd7U-D2.js +42 -0
  23. package/dist/email-body-BOd7U-D2.js.map +1 -0
  24. package/dist/{gmail-sync-DueE6tl5.js → gmail-sync-B4Iu3AQb.js} +45 -15
  25. package/dist/gmail-sync-B4Iu3AQb.js.map +1 -0
  26. package/dist/{gmail-sync-GEy3oVvw.cjs → gmail-sync-BpSVESSe.cjs} +45 -15
  27. package/dist/gmail-sync-BpSVESSe.cjs.map +1 -0
  28. package/dist/{gmail-sync-C-NmibzS.js → gmail-sync-DIbrPnTK.js} +45 -15
  29. package/dist/gmail-sync-DIbrPnTK.js.map +1 -0
  30. package/dist/{gmail-webhook-handler-kGKpbY9h.js → gmail-webhook-handler-BzOFbvgh.js} +2 -2
  31. package/dist/{gmail-webhook-handler-kGKpbY9h.js.map → gmail-webhook-handler-BzOFbvgh.js.map} +1 -1
  32. package/dist/{gmail-webhook-handler-B26COilD.js → gmail-webhook-handler-CvSDW_Js.js} +1 -1
  33. package/dist/{google-drive-sync-D1n7WKZn.js → google-drive-sync-B_I1d54Y.js} +2 -2
  34. package/dist/{google-drive-sync-D1n7WKZn.js.map → google-drive-sync-B_I1d54Y.js.map} +1 -1
  35. package/dist/html-BaeOCZKE.js +36 -0
  36. package/dist/html-BaeOCZKE.js.map +1 -0
  37. package/dist/html-CmOku6jS.cjs +47 -0
  38. package/dist/html-CmOku6jS.cjs.map +1 -0
  39. package/dist/{import-hubspot-DB4n89jy.js → import-hubspot-CTId9IGV.js} +2 -2
  40. package/dist/{import-hubspot-DB4n89jy.js.map → import-hubspot-CTId9IGV.js.map} +1 -1
  41. package/dist/{index-pY7tYXwH.d.cts → index-BAutNcAT.d.cts} +13 -9
  42. package/dist/index-BAutNcAT.d.cts.map +1 -0
  43. package/dist/{index-B0IMMrp_.d.ts → index-FzDsNSSb.d.ts} +5 -1
  44. package/dist/index-FzDsNSSb.d.ts.map +1 -0
  45. package/dist/index.d.cts +13 -9
  46. package/dist/index.d.cts.map +1 -1
  47. package/dist/index.d.ts +5 -1
  48. package/dist/index.d.ts.map +1 -1
  49. package/dist/{interactions-writer-RJB8SWf2.js → interactions-writer-B2y-73lh.js} +1 -1
  50. package/dist/{interactions-writer-DbSyI2rt.js → interactions-writer-B8XAzdqR.js} +3 -2
  51. package/dist/interactions-writer-B8XAzdqR.js.map +1 -0
  52. package/dist/{interactions-writer-a2yzBd7T.cjs → interactions-writer-BRJNrefF.cjs} +3 -2
  53. package/dist/interactions-writer-BRJNrefF.cjs.map +1 -0
  54. package/dist/{interactions-writer-BZzUIgJd.js → interactions-writer-ZQcpFOh9.js} +3 -2
  55. package/dist/interactions-writer-ZQcpFOh9.js.map +1 -0
  56. package/dist/{knowledge-base-DHNc4hVj.js → knowledge-base--063Kpa3.js} +9 -7
  57. package/dist/{knowledge-base-DHNc4hVj.js.map → knowledge-base--063Kpa3.js.map} +1 -1
  58. package/dist/mcp.cjs +44 -22
  59. package/dist/mcp.cjs.map +1 -1
  60. package/dist/mcp.js +44 -22
  61. package/dist/mcp.js.map +1 -1
  62. package/dist/{microsoft-calendar-jIu9K5zX.js → microsoft-calendar-BgVR8GDv.js} +3 -3
  63. package/dist/{microsoft-calendar-jIu9K5zX.js.map → microsoft-calendar-BgVR8GDv.js.map} +1 -1
  64. package/dist/{microsoft-sync-R_r8HL-B.js → microsoft-sync-D30_XksI.js} +3 -3
  65. package/dist/{microsoft-sync-R_r8HL-B.js.map → microsoft-sync-D30_XksI.js.map} +1 -1
  66. package/dist/{nba-mTJ4yEqD.js → nba-DwdfM93s.js} +2 -2
  67. package/dist/{nba-mTJ4yEqD.js.map → nba-DwdfM93s.js.map} +1 -1
  68. package/dist/{server-DqSMYhSA.js → server-DoRPPOeR.js} +39 -19
  69. package/dist/server-DoRPPOeR.js.map +1 -0
  70. package/dist/{transcript-watcher-0mh2ZhmH.js → transcript-watcher-BoClrJAz.js} +2 -2
  71. package/dist/{transcript-watcher-0mh2ZhmH.js.map → transcript-watcher-BoClrJAz.js.map} +1 -1
  72. package/package.json +12 -1
  73. package/dist/gmail-sync-C-NmibzS.js.map +0 -1
  74. package/dist/gmail-sync-DueE6tl5.js.map +0 -1
  75. package/dist/gmail-sync-GEy3oVvw.cjs.map +0 -1
  76. package/dist/index-B0IMMrp_.d.ts.map +0 -1
  77. package/dist/index-pY7tYXwH.d.cts.map +0 -1
  78. package/dist/interactions-writer-BZzUIgJd.js.map +0 -1
  79. package/dist/interactions-writer-DbSyI2rt.js.map +0 -1
  80. package/dist/interactions-writer-a2yzBd7T.cjs.map +0 -1
  81. package/dist/server-DqSMYhSA.js.map +0 -1
@@ -0,0 +1,514 @@
1
+ import { t as assertSafeSlug } from "./customer-dir-CkMMXhb0.js";
2
+ import { n as logger } from "./logger-Dyl4VcLO.js";
3
+ import { t as chunkText } from "./chunk-BhUZmQg5.js";
4
+ import { n as htmlToMarkdown, t as htmlConverter } from "./html-BaeOCZKE.js";
5
+ import path from "path";
6
+ import fs from "fs";
7
+ //#region src/sync/converters/text.ts
8
/**
 * Make a raw cell value safe to embed in a Markdown pipe-table cell.
 *
 * Backslashes are doubled first so the escapes added afterwards are not
 * themselves re-escaped, pipes are backslash-escaped, every LF / CRLF becomes a
 * single space, and surrounding whitespace is trimmed.
 */
function mdCell(value) {
	const escaped = value.split("\\").join("\\\\").split("|").join("\\|");
	const singleLine = escaped.replace(/\r?\n/g, " ");
	return singleLine.trim();
}
12
/**
 * Split one CSV line into its fields.
 *
 * A double quote toggles "quoted" mode; while quoted, commas are literal and a
 * doubled quote (`""`) yields one literal quote character. Outside quotes a
 * comma ends the current field. The quote characters themselves are never part
 * of the output. Always returns at least one field (an empty line gives `[""]`).
 */
function parseCsvLine(line) {
	const fields = [];
	let field = "";
	let quoted = false;
	let pos = 0;
	while (pos < line.length) {
		const c = line[pos];
		pos += 1;
		if (c === "\"") {
			if (quoted && line[pos] === "\"") {
				// Escaped quote inside a quoted field: emit one quote, skip its twin.
				field += "\"";
				pos += 1;
			} else quoted = !quoted;
			continue;
		}
		if (c === "," && !quoted) {
			fields.push(field);
			field = "";
			continue;
		}
		field += c;
	}
	fields.push(field);
	return fields;
}
37
/**
 * Render a matrix of cells as a GitHub-flavored Markdown pipe table.
 *
 * The first row is used as the header. Every row is padded with empty cells to
 * the width of the widest row, so ragged input still yields a rectangular
 * table, and each cell is passed through `mdCell` before being emitted.
 *
 * @param rows Table rows; each row is an array of cell strings.
 * @returns The table as newline-joined lines, or "" when `rows` is empty.
 */
function rowsToMarkdown(rows) {
	if (rows.length === 0) return "";
	// Fold instead of `Math.max(...lengths)`: spreading one argument per row
	// throws a RangeError once the row count exceeds the engine's argument
	// limit, which a large CSV/XLSX attachment can reach.
	const width = rows.reduce((widest, r) => Math.max(widest, r.length), 0);
	const pad = (r) => Array.from({ length: width }, (_, i) => mdCell(r[i] ?? ""));
	const header = pad(rows[0] ?? []);
	return [
		`| ${header.join(" | ")} |`,
		`| ${header.map(() => "---").join(" | ")} |`,
		...rows.slice(1).map((r) => `| ${pad(r).join(" | ")} |`)
	].join("\n");
}
49
/**
 * Convert CSV text into a GitHub-flavored Markdown pipe table.
 * The text is split on LF / CRLF, whitespace-only lines are dropped, and every
 * remaining line is parsed with `parseCsvLine` into one table row.
 */
function csvToMarkdown(csv) {
	const rows = [];
	for (const line of csv.split(/\r?\n/)) {
		if (line.trim().length === 0) continue;
		rows.push(parseCsvLine(line));
	}
	return rowsToMarkdown(rows);
}
53
/** Extensions that are passed through as plain text / Markdown. */
const TEXT_EXTENSIONS = [
	"txt",
	"text",
	"log",
	"md",
	"markdown"
];
/** Extension → code-fence language tag for structured text formats. */
const CODE_FENCE_EXTENSIONS = {
	json: "json",
	xml: "xml",
	yaml: "yaml",
	yml: "yaml"
};
/**
 * Re-encode tab-separated text as CSV so it can be rendered by `csvToMarkdown`.
 *
 * Each tab-delimited field becomes a quoted CSV field with embedded quotes
 * doubled, so commas and stray quotes inside a TSV cell stay inside that cell.
 * A field that is already wrapped in double quotes is passed through unchanged
 * so CSV-style quoting in the input is still unquoted by the CSV parser.
 * Whitespace-only lines without tabs are left as-is.
 */
function tsvToCsv(tsv) {
	const toCsvField = (field) => {
		const alreadyQuoted = field.length >= 2 && field.startsWith("\"") && field.endsWith("\"");
		return alreadyQuoted ? field : `"${field.replace(/"/g, "\"\"")}"`;
	};
	return tsv.split(/\r?\n/).map((line) => {
		if (!line.includes("\t") && line.trim() === "") return line;
		return line.split("\t").map(toCsvField).join(",");
	}).join("\n");
}
/**
 * Converter for plain-text-ish attachments: Markdown/text passthrough, CSV/TSV
 * to Markdown tables, and structured text (JSON/XML/YAML) into fenced code
 * blocks so they stay readable and searchable without a heavy parser.
 *
 * `convert(buffer, filename)` decodes the buffer as UTF-8, picks the rendering
 * from the lower-cased text after the last "." of `filename`, and resolves to
 * `{ markdown, meta: { format } }`.
 */
const textConverter = {
	name: "text",
	extensions: [
		...TEXT_EXTENSIONS,
		"csv",
		"tsv",
		...Object.keys(CODE_FENCE_EXTENSIONS)
	],
	mimeTypes: [
		"text/plain",
		"text/csv",
		"text/markdown",
		"application/json",
		"text/*"
	],
	convert(buffer, filename) {
		const ext = filename.split(".").pop()?.toLowerCase() ?? "";
		const content = buffer.toString("utf-8");
		if (ext === "csv") return Promise.resolve({
			markdown: csvToMarkdown(content),
			meta: { format: "csv" }
		});
		// TSV: re-encode field by field instead of swapping tabs for commas, which
		// split any cell that itself contained a comma.
		if (ext === "tsv") return Promise.resolve({
			markdown: csvToMarkdown(tsvToCsv(content)),
			meta: { format: "tsv" }
		});
		const fence = CODE_FENCE_EXTENSIONS[ext];
		if (fence) return Promise.resolve({
			markdown: `\`\`\`${fence}\n${content.trim()}\n\`\`\``,
			meta: { format: fence }
		});
		return Promise.resolve({
			markdown: content.trim(),
			meta: { format: "text" }
		});
	}
};
111
+ //#endregion
112
+ //#region src/sync/converters/docx.ts
113
/**
 * Word (.docx) attachments → Markdown.
 *
 * mammoth (imported on first use) turns the document into HTML, which is then
 * handed to `htmlToMarkdown`. The number of mammoth messages of type
 * "warning" is reported in `meta.warnings`.
 */
const docxConverter = {
	name: "docx",
	extensions: ["docx"],
	mimeTypes: ["application/vnd.openxmlformats-officedocument.wordprocessingml.document"],
	async convert(buffer) {
		const mammoth = (await import("mammoth")).default;
		const conversion = await mammoth.convertToHtml({ buffer });
		const markdown = await htmlToMarkdown(conversion.value);
		let warnings = 0;
		for (const message of conversion.messages) {
			if (message.type === "warning") warnings += 1;
		}
		return {
			markdown,
			meta: {
				format: "docx",
				warnings
			}
		};
	}
};
133
+ //#endregion
134
+ //#region src/sync/converters/xlsx.ts
135
/**
 * Render a single ExcelJS cell value as plain text.
 *
 * Handles the value shapes a cell can carry: null/undefined (""), Date
 * (ISO `YYYY-MM-DD`), primitives (`String(value)`), and the object forms —
 * `{ text }`, formula `{ result }`, `{ richText: [...] }`, `{ hyperlink }` and
 * error `{ error }`. Nested values (a formula whose result is a Date or an
 * error object, a hyperlink whose text is rich text) are rendered recursively
 * so they no longer come out as "[object Object]".
 *
 * @param value Raw `cell.value`.
 * @returns The text to show for the cell; never throws for an invalid Date.
 */
function cellText(value) {
	if (value === null || value === void 0) return "";
	if (value instanceof Date) {
		// toISOString() throws a RangeError on an invalid Date; render it as empty.
		return Number.isNaN(value.getTime()) ? "" : value.toISOString().slice(0, 10);
	}
	if (typeof value !== "object") return String(value);
	if (typeof value.text === "string") return value.text;
	// Formula cell: the cached result may itself be a Date, error object, etc.
	if ("result" in value) return cellText(value.result);
	if (Array.isArray(value.richText)) return value.richText.map((run) => run.text ?? "").join("");
	// Hyperlink whose text is not a plain string (e.g. rich text): fall back to the URL.
	if ("hyperlink" in value) return cellText(value.text ?? value.hyperlink);
	if (typeof value.error === "string") return value.error;
	// Formula without a cached result: nothing meaningful to display.
	if ("formula" in value || "sharedFormula" in value) return "";
	return String(value);
}
148
/**
 * Excel workbook (.xlsx / .xlsm) → Markdown using ExcelJS, imported on first use.
 *
 * Every worksheet is visited in workbook order. Its non-empty rows are
 * collected cell by cell (empty cells inside a row are kept), rendered with
 * `rowsToMarkdown`, and emitted as a `## <sheet name>` section. Sheets whose
 * table renders empty produce no section but are still listed in `meta.sheets`.
 */
const xlsxConverter = {
	name: "xlsx",
	extensions: ["xlsx", "xlsm"],
	mimeTypes: ["application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "application/vnd.ms-excel"],
	async convert(buffer) {
		const ExcelJS = (await import("exceljs")).default;
		const workbook = new ExcelJS.Workbook();
		await workbook.xlsx.load(buffer);
		const names = [];
		const rendered = [];
		workbook.eachSheet((worksheet) => {
			names.push(worksheet.name);
			const matrix = [];
			worksheet.eachRow({ includeEmpty: false }, (row) => {
				const line = [];
				row.eachCell({ includeEmpty: true }, (cell) => {
					line.push(cellText(cell.value));
				});
				matrix.push(line);
			});
			const table = rowsToMarkdown(matrix);
			if (!table) return;
			rendered.push(`## ${worksheet.name}\n\n${table}`);
		});
		return {
			markdown: rendered.join("\n\n"),
			meta: {
				format: "xlsx",
				sheets: names
			}
		};
	}
};
184
+ //#endregion
185
+ //#region src/sync/converters/pptx.ts
186
/**
 * Extract the visible text runs (`<a:t>…</a:t>`) from one slide's XML.
 *
 * Runs are joined with single spaces and all whitespace is collapsed. Entity
 * references are decoded in a single pass, so an escaped ampersand such as
 * `&amp;lt;` yields the literal text `&lt;` rather than being decoded twice.
 * The five predefined XML entities and decimal/hex character references are
 * supported; anything else is left untouched. `<a:t>` start tags carrying
 * attributes are matched as well; self-closing tags are not.
 *
 * @param xml Raw slide XML.
 * @returns The slide's text, or "" when it has no text runs.
 */
function extractSlideText(xml) {
	const named = {
		amp: "&",
		lt: "<",
		gt: ">",
		quot: "\"",
		apos: "'"
	};
	const decode = (text) => text.replace(/&(?:#(\d+)|#x([0-9a-fA-F]+)|(amp|lt|gt|quot|apos));/g, (entity, dec, hex, name) => {
		if (name) return named[name];
		const codePoint = dec ? Number.parseInt(dec, 10) : Number.parseInt(hex, 16);
		// Out-of-range references would make fromCodePoint throw; keep them verbatim.
		return codePoint <= 1114111 ? String.fromCodePoint(codePoint) : entity;
	});
	const runs = [...xml.matchAll(/<a:t(?:\s[^>]*)?(?<!\/)>([\s\S]*?)<\/a:t>/g)];
	return runs.map((m) => decode(m[1] ?? "")).join(" ").replace(/\s+/g, " ").trim();
}
190
/**
 * Read the slide index out of a zip entry name ending in `slide<N>.xml`.
 * Returns 0 when the name does not end that way.
 */
function slideNumber(entryName) {
	const found = entryName.match(/slide(\d+)\.xml$/);
	if (!found) return 0;
	const digits = found[1] ?? "0";
	return parseInt(digits, 10);
}
194
/**
 * PowerPoint (.pptx) → Markdown.
 *
 * The file is opened as a zip with adm-zip (imported on first use). Entries
 * named `ppt/slides/slide<N>.xml` are ordered by N and each slide that has any
 * text becomes a `## Slide N` section. `meta.slides` counts every slide entry,
 * including those without text.
 */
const pptxConverter = {
	name: "pptx",
	extensions: ["pptx"],
	mimeTypes: ["application/vnd.openxmlformats-officedocument.presentationml.presentation"],
	async convert(buffer) {
		const { default: AdmZip } = await import("adm-zip");
		const isSlide = (entry) => /^ppt\/slides\/slide\d+\.xml$/.test(entry.entryName);
		const bySlideNumber = (a, b) => slideNumber(a.entryName) - slideNumber(b.entryName);
		const slideEntries = new AdmZip(buffer).getEntries().filter(isSlide).sort(bySlideNumber);
		const sections = slideEntries.flatMap((entry) => {
			const text = extractSlideText(entry.getData().toString("utf-8"));
			return text ? [`## Slide ${slideNumber(entry.entryName)}\n\n${text}`] : [];
		});
		return {
			markdown: sections.join("\n\n"),
			meta: {
				format: "pptx",
				slides: slideEntries.length
			}
		};
	}
};
220
+ //#endregion
221
+ //#region src/sync/converters/pdf.ts
222
/**
 * Report whether scanned-PDF OCR has been switched on through the
 * `DXCRM_PDF_OCR` environment variable. The value is compared
 * case-insensitively against "1", "true", "yes" and "on"; anything else,
 * including an unset variable, means disabled.
 */
function isPdfOcrEnabled() {
	const flag = (process.env["DXCRM_PDF_OCR"] ?? "").toLowerCase();
	return ["1", "true", "yes", "on"].includes(flag);
}
227
/**
 * OCR a PDF that has no text layer.
 *
 * First verifies that `@napi-rs/canvas` can be imported and throws a
 * descriptive Error if not. Then each page, up to
 * `min(totalPages, DXCRM_PDF_OCR_MAX_PAGES)` (default 20, also used when the
 * variable is unset, zero or not a number), is rendered with unpdf's
 * `renderPageAsImage` at scale 2 and passed to tesseract.js `recognize` using
 * the language from `DXCRM_OCR_LANG` (default "eng").
 *
 * @returns `## Page N` sections for pages that produced text, joined by blank lines.
 */
async function ocrPdf(pdf, totalPages) {
	const canvasImport = () => import("@napi-rs/canvas");
	try {
		await canvasImport();
	} catch {
		throw new Error("PDF OCR requires the optional '@napi-rs/canvas' package — run `npm install @napi-rs/canvas`");
	}
	const { renderPageAsImage } = await import("unpdf");
	const { recognize } = await import("tesseract.js");
	const lang = process.env["DXCRM_OCR_LANG"] ?? "eng";
	const configuredCap = Number(process.env["DXCRM_PDF_OCR_MAX_PAGES"] ?? 20) || 20;
	const lastPage = Math.min(totalPages, configuredCap);
	const sections = [];
	let pageNo = 1;
	while (pageNo <= lastPage) {
		const png = await renderPageAsImage(pdf, pageNo, {
			canvasImport,
			scale: 2
		});
		const recognized = await recognize(Buffer.from(png), lang);
		const pageText = recognized.data.text.trim();
		if (pageText) sections.push(`## Page ${pageNo}\n\n${pageText}`);
		pageNo += 1;
	}
	return sections.join("\n\n");
}
257
/**
 * PDF → Markdown.
 *
 * unpdf (imported on first use) extracts the text layer with pages merged. If
 * that yields text it is returned as-is. Otherwise, when `isPdfOcrEnabled()`
 * is true, the document is OCR'd via `ocrPdf`; an OCR failure is written to
 * stderr and not rethrown. If there is still no text, an empty result flagged
 * `ocrCandidate: true` is returned.
 */
const pdfConverter = {
	name: "pdf",
	extensions: ["pdf"],
	mimeTypes: ["application/pdf"],
	async convert(buffer, filename) {
		const { extractText, getDocumentProxy } = await import("unpdf");
		const pdf = await getDocumentProxy(new Uint8Array(buffer));
		const extracted = await extractText(pdf, { mergePages: true });
		const pages = extracted.totalPages;
		const raw = extracted.text;
		const body = (Array.isArray(raw) ? raw.join("\n\n") : raw).trim();
		if (body) return {
			markdown: body,
			meta: {
				format: "pdf",
				pages
			}
		};
		if (isPdfOcrEnabled()) {
			try {
				const recognized = (await ocrPdf(pdf, pages)).trim();
				if (recognized) return {
					markdown: `> _OCR of \`${filename}\` (${pages} pages):_\n\n${recognized}`,
					meta: {
						format: "pdf",
						pages,
						ocr: true
					}
				};
			} catch (err) {
				process.stderr.write(`[converters] pdf OCR failed for ${filename}: ${err.message}\n`);
			}
		}
		return {
			markdown: "",
			meta: {
				format: "pdf",
				pages,
				ocrCandidate: true
			}
		};
	}
};
302
+ //#endregion
303
+ //#region src/sync/converters/image.ts
304
/**
 * Image → Markdown by OCR with tesseract.js, which is imported only when an
 * image is actually converted. The recognition language comes from
 * `DXCRM_OCR_LANG` and defaults to "eng". When OCR finds no text the markdown
 * is empty; `meta` always records `ocr: true` and the language used.
 */
const imageConverter = {
	name: "image",
	extensions: [
		"png",
		"jpg",
		"jpeg",
		"tif",
		"tiff",
		"bmp",
		"webp",
		"gif",
		"pbm"
	],
	mimeTypes: ["image/*"],
	async convert(buffer, filename) {
		const lang = process.env["DXCRM_OCR_LANG"] ?? "eng";
		const { recognize } = await import("tesseract.js");
		const recognized = await recognize(buffer, lang);
		const ocrText = recognized.data.text.trim();
		let markdown = "";
		if (ocrText) markdown = `> _OCR of \`${filename}\`:_\n\n${ocrText}`;
		return {
			markdown,
			meta: {
				format: "image",
				ocr: true,
				lang
			}
		};
	}
};
339
+ //#endregion
340
+ //#region src/sync/converters/registry.ts
341
/**
 * Last-resort converter for attachments no other converter handles.
 * Instead of decoding the bytes it resolves to a one-line Markdown note with
 * the filename and the size in KB (rounded, never less than 1), and records
 * the exact byte count in `meta.bytes`.
 */
const fallbackConverter = {
	name: "binary",
	extensions: [],
	convert(buffer, filename) {
		const bytes = buffer.length;
		const sizeKb = Math.max(1, Math.round(bytes / 1024));
		const note = `> _Binary attachment \`${filename}\` (${sizeKb} KB) — no text representation available._`;
		return Promise.resolve({
			markdown: note,
			meta: {
				format: "binary",
				bytes
			}
		});
	}
};
359
/**
 * Converter lookup table, searched front to back by `matchConverter`, so when
 * two converters claim the same extension or MIME type the earlier one is
 * used. The generic text converter sits at the end on purpose: its `text/*`
 * MIME pattern would otherwise shadow more specific converters such as HTML.
 */
const CONVERTERS = [
	docxConverter,
	xlsxConverter,
	pptxConverter,
	pdfConverter,
	imageConverter,
	htmlConverter,
	textConverter
];
373
/**
 * Return the lower-cased extension of a filename: the text after its last ".".
 *
 * A name with no dot at all has no extension and yields "". Previously the
 * whole name was returned in that case, so a file literally called `pdf` or
 * `json` was treated as having that extension.
 *
 * @param filename Attachment filename.
 * @returns The extension without the dot, or "" when there is none.
 */
function extensionOf(filename) {
	const dot = filename.lastIndexOf(".");
	if (dot === -1) return "";
	return filename.slice(dot + 1).toLowerCase();
}
376
/**
 * Test whether a MIME type is claimed by a converter.
 *
 * Comparison is case-insensitive. A converter pattern ending in "/*" matches
 * any subtype of that top-level type; every other pattern must match exactly.
 * Parameters (everything from the first ";", e.g. `; charset=utf-8`) and
 * surrounding whitespace in `mime` are ignored so a full Content-Type value
 * matches as well.
 *
 * @param converter Converter whose optional `mimeTypes` list is consulted.
 * @param mime MIME type (or Content-Type value) of the attachment.
 * @returns true when some pattern matches; false when the converter declares no MIME types.
 */
function mimeMatches(converter, mime) {
	if (!converter.mimeTypes) return false;
	const essence = mime.split(";", 1)[0].trim().toLowerCase();
	return converter.mimeTypes.some((pattern) => {
		const wanted = pattern.toLowerCase();
		// "type/*" → compare against the "type/" prefix.
		if (wanted.endsWith("/*")) return essence.startsWith(wanted.slice(0, -1));
		return wanted === essence;
	});
}
385
/**
 * Choose a converter for an attachment.
 *
 * The filename's extension is tried first: the first entry of `CONVERTERS`
 * listing it wins. Only if that finds nothing, and a MIME type was supplied,
 * is the first entry accepted by `mimeMatches` used. Yields `undefined` when
 * neither lookup succeeds.
 */
function matchConverter(filename, mime) {
	const ext = extensionOf(filename);
	const byExtension = ext ? CONVERTERS.find((candidate) => candidate.extensions.includes(ext)) : void 0;
	if (byExtension) return byExtension;
	if (!mime) return void 0;
	const byMime = CONVERTERS.find((candidate) => mimeMatches(candidate, mime));
	return byMime ? byMime : void 0;
}
401
/**
 * Turn an attachment into Markdown without ever rejecting because of a
 * converter failure.
 *
 * Uses the converter chosen by `matchConverter`, or `fallbackConverter` when
 * none matches. A result whose markdown is blank is replaced by a short
 * "no extractable text" note with `meta.empty: true`. If the converter throws,
 * the failure is written to stderr and the fallback stub is returned instead.
 */
async function convertAttachment(buffer, filename, mime) {
	const converter = matchConverter(filename, mime) ?? fallbackConverter;
	try {
		const result = await converter.convert(buffer, filename);
		const hasText = result.markdown.trim().length > 0;
		if (hasText) return result;
		return {
			markdown: `> _Attachment \`${filename}\` contained no extractable text._`,
			meta: {
				...result.meta,
				empty: true
			}
		};
	} catch (err) {
		process.stderr.write(`[converters] ${converter.name} failed for ${filename}: ${err.message}\n`);
		return fallbackConverter.convert(buffer, filename);
	}
}
423
/**
 * Walk a Gmail message payload depth-first (a part before its children, in
 * order) and list every part that has both a non-empty `filename` and a
 * `body.attachmentId`. Missing `mimeType` defaults to
 * "application/octet-stream" and missing `body.size` to 0. Parts lacking
 * either a filename or an attachment id are skipped, but their children are
 * still visited.
 */
function collectAttachmentParts(payload) {
	function* attachmentsIn(part) {
		if (!part) return;
		const name = part.filename ?? "";
		const id = part.body?.attachmentId ?? "";
		if (name && id) yield {
			filename: name,
			mimeType: part.mimeType ?? "application/octet-stream",
			attachmentId: id,
			size: part.body?.size ?? 0
		};
		for (const child of part.parts ?? []) yield* attachmentsIn(child);
	}
	return [...attachmentsIn(payload)];
}
445
/**
 * Reduce an arbitrary filename to a safe single path segment.
 * Keeps only what follows the last "/" or "\", collapses each run of
 * characters outside `[a-zA-Z0-9._-]` into one underscore, strips leading and
 * trailing underscores, and truncates to 120 characters. Falls back to
 * "attachment" when nothing is left.
 */
function sanitizeFilename(name) {
	const segments = name.split(/[\\/]/);
	const base = segments.pop() ?? name;
	const collapsed = base.replace(/[^a-zA-Z0-9._-]+/g, "_");
	const stripped = collapsed.replace(/^_+|_+$/g, "");
	const limited = stripped.slice(0, 120);
	return limited || "attachment";
}
449
/**
 * Download, convert and index every attachment of a single Gmail message.
 *
 * For each attachment: the raw bytes are saved under
 * `customers/<slug>/attachments/<messageId>__<name>`, converted to a sibling
 * `.md` file, and the Markdown is chunked and indexed via `indexInLanceDB`.
 * Failures on a single attachment are logged and skipped — they never abort
 * the message sync.
 *
 * Stored names are unique within one call: when two attachments sanitize to
 * the same name (or one would collide with another's `.md` sibling), later
 * ones are stored as `<messageId>__<n>_<name>` with n starting at 2, instead
 * of silently overwriting the earlier file. Names that do not collide are
 * unchanged.
 *
 * @param opts.payload Gmail message payload to scan for attachment parts.
 * @param opts.slug Customer slug; validated with `assertSafeSlug`.
 * @param opts.dataDir Root data directory.
 * @param opts.gmail Gmail API client used to fetch attachment bodies.
 * @param opts.messageId Gmail message id.
 * @param opts.source Source reference written into the Markdown header and index ref.
 * @param opts.date Date written into the Markdown header and index metadata.
 * @param opts.maxBytes Per-attachment size cap in bytes (default 26214400).
 * @returns One record per attachment that was saved and converted.
 */
async function processMessageAttachments(opts) {
	const parts = collectAttachmentParts(opts.payload);
	if (parts.length === 0) return [];
	assertSafeSlug(opts.slug);
	const maxBytes = opts.maxBytes ?? 26214400;
	const attachmentsDir = path.join(opts.dataDir, "customers", opts.slug, "attachments");
	fs.mkdirSync(attachmentsDir, { recursive: true });
	const { indexInLanceDB } = await import("./lancedb-CswQEE5K.js");
	// Names already written during this call, lower-cased so the check also
	// holds on case-insensitive filesystems. Holds raw and `.md` names alike.
	const usedNames = new Set();
	const reserveStoredName = (filename) => {
		const base = sanitizeFilename(filename);
		const isTaken = (name) => usedNames.has(name.toLowerCase()) || usedNames.has(`${name}.md`.toLowerCase());
		let candidate = `${opts.messageId}__${base}`;
		for (let n = 2; isTaken(candidate); n++) candidate = `${opts.messageId}__${n}_${base}`;
		usedNames.add(candidate.toLowerCase());
		usedNames.add(`${candidate}.md`.toLowerCase());
		return candidate;
	};
	const saved = [];
	for (const part of parts) try {
		if (part.size > maxBytes) {
			logger.warn("gmail-sync", "skipping oversized attachment", {
				filename: part.filename,
				bytes: part.size
			});
			continue;
		}
		const data = (await opts.gmail.users.messages.attachments.get({
			userId: "me",
			messageId: opts.messageId,
			id: part.attachmentId
		})).data.data;
		if (!data) continue;
		const buffer = Buffer.from(data, "base64url");
		// The reported part size defaults to 0 when absent, so enforce the cap
		// again on the bytes actually received.
		if (buffer.length > maxBytes) {
			logger.warn("gmail-sync", "skipping oversized attachment", {
				filename: part.filename,
				bytes: buffer.length
			});
			continue;
		}
		const storedName = reserveStoredName(part.filename);
		const markdownName = `${storedName}.md`;
		fs.writeFileSync(path.join(attachmentsDir, storedName), buffer);
		const { markdown } = await convertAttachment(buffer, part.filename, part.mimeType);
		const mdBody = `# ${part.filename}\n\n_Source: ${opts.source} · ${opts.date}_\n\n${markdown}\n`;
		fs.writeFileSync(path.join(attachmentsDir, markdownName), mdBody);
		// NOTE(review): `ref` is still derived from the original filename, so two
		// same-named attachments share index refs — confirm how indexInLanceDB
		// treats duplicate refs before changing the ref format.
		const ref = `${opts.source}#att:${part.filename}`;
		const chunks = chunkText(markdown);
		for (let i = 0; i < chunks.length; i++) await indexInLanceDB(opts.dataDir, opts.slug, chunks[i], `${ref}#${i}`, {
			date: opts.date,
			type: "attachment"
		}).catch((err) => {
			// Indexing is best-effort: log and keep going with the next chunk.
			logger.error("gmail-sync", "attachment index failed", { error: err.message });
		});
		saved.push({
			originalName: part.filename,
			storedName,
			markdownName,
			ref,
			chunks: chunks.length
		});
	} catch (err) {
		logger.warn("gmail-sync", "attachment failed", {
			filename: part.filename,
			error: err.message
		});
	}
	return saved;
}
511
+ //#endregion
512
+ export { processMessageAttachments };
513
+
514
+ //# sourceMappingURL=attachments-rLa96rOK.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"attachments-rLa96rOK.js","names":[],"sources":["../src/sync/converters/text.ts","../src/sync/converters/docx.ts","../src/sync/converters/xlsx.ts","../src/sync/converters/pptx.ts","../src/sync/converters/pdf.ts","../src/sync/converters/image.ts","../src/sync/converters/registry.ts","../src/sync/attachments.ts"],"sourcesContent":["// src/sync/converters/text.ts\nimport type { Converter, ConversionResult } from \"./types.js\";\n\n/** Escape a CSV cell for safe inclusion in a Markdown table cell. */\nfunction mdCell(value: string): string {\n return value.replace(/\\\\/g, \"\\\\\\\\\").replace(/\\|/g, \"\\\\|\").replace(/\\r?\\n/g, \" \").trim();\n}\n\n/**\n * Minimal RFC-4180-ish CSV line splitter: handles quoted fields containing\n * commas and escaped double quotes. Good enough for rendering CSV attachments\n * as readable Markdown tables (we are not round-tripping data).\n */\nexport function parseCsvLine(line: string): string[] {\n const out: string[] = [];\n let cur = \"\";\n let inQuotes = false;\n for (let i = 0; i < line.length; i++) {\n const ch = line[i];\n if (inQuotes) {\n if (ch === '\"') {\n if (line[i + 1] === '\"') {\n cur += '\"';\n i++;\n } else {\n inQuotes = false;\n }\n } else {\n cur += ch;\n }\n } else if (ch === '\"') {\n inQuotes = true;\n } else if (ch === \",\") {\n out.push(cur);\n cur = \"\";\n } else {\n cur += ch;\n }\n }\n out.push(cur);\n return out;\n}\n\n/** Render a matrix of cells as a GitHub-flavored Markdown pipe table. */\nexport function rowsToMarkdown(rows: string[][]): string {\n if (rows.length === 0) return \"\";\n const width = Math.max(...rows.map((r) => r.length));\n const pad = (r: string[]): string[] =>\n Array.from({ length: width }, (_, i) => mdCell(r[i] ?? \"\"));\n\n const header = pad(rows[0] ?? 
[]);\n const lines = [\n `| ${header.join(\" | \")} |`,\n `| ${header.map(() => \"---\").join(\" | \")} |`,\n ...rows.slice(1).map((r) => `| ${pad(r).join(\" | \")} |`),\n ];\n return lines.join(\"\\n\");\n}\n\n/** Render CSV text as a GitHub-flavored Markdown pipe table. */\nexport function csvToMarkdown(csv: string): string {\n const rows = csv\n .split(/\\r?\\n/)\n .filter((l) => l.trim().length > 0)\n .map(parseCsvLine);\n return rowsToMarkdown(rows);\n}\n\nconst TEXT_EXTENSIONS = [\"txt\", \"text\", \"log\", \"md\", \"markdown\"];\nconst CODE_FENCE_EXTENSIONS: Record<string, string> = {\n json: \"json\",\n xml: \"xml\",\n yaml: \"yaml\",\n yml: \"yaml\",\n};\n\n/**\n * Converter for plain-text-ish attachments: Markdown/text passthrough, CSV/TSV\n * to Markdown tables, and structured text (JSON/XML/YAML) into fenced code\n * blocks so they stay readable and searchable without a heavy parser.\n */\nexport const textConverter: Converter = {\n name: \"text\",\n extensions: [...TEXT_EXTENSIONS, \"csv\", \"tsv\", ...Object.keys(CODE_FENCE_EXTENSIONS)],\n mimeTypes: [\"text/plain\", \"text/csv\", \"text/markdown\", \"application/json\", \"text/*\"],\n convert(buffer: Buffer, filename: string): Promise<ConversionResult> {\n const ext = filename.split(\".\").pop()?.toLowerCase() ?? 
\"\";\n const content = buffer.toString(\"utf-8\");\n\n if (ext === \"csv\") {\n return Promise.resolve({ markdown: csvToMarkdown(content), meta: { format: \"csv\" } });\n }\n if (ext === \"tsv\") {\n const asCsv = content.replace(/\\t/g, \",\");\n return Promise.resolve({ markdown: csvToMarkdown(asCsv), meta: { format: \"tsv\" } });\n }\n const fence = CODE_FENCE_EXTENSIONS[ext];\n if (fence) {\n return Promise.resolve({\n markdown: `\\`\\`\\`${fence}\\n${content.trim()}\\n\\`\\`\\``,\n meta: { format: fence },\n });\n }\n // Markdown / plain text: pass through verbatim.\n return Promise.resolve({ markdown: content.trim(), meta: { format: \"text\" } });\n },\n};\n","// src/sync/converters/docx.ts\nimport type { Converter, ConversionResult } from \"./types.js\";\nimport { htmlToMarkdown } from \"./html.js\";\n\n/**\n * DOCX → Markdown via mammoth (DOCX → semantic HTML) then Turndown (HTML →\n * Markdown). Mammoth's own Markdown output is deprecated; the HTML route keeps\n * tables, lists and headings intact. mammoth is loaded lazily.\n */\nexport const docxConverter: Converter = {\n name: \"docx\",\n extensions: [\"docx\"],\n mimeTypes: [\"application/vnd.openxmlformats-officedocument.wordprocessingml.document\"],\n async convert(buffer: Buffer): Promise<ConversionResult> {\n const mammoth = (await import(\"mammoth\")).default;\n const { value: html, messages } = await mammoth.convertToHtml({ buffer });\n const markdown = await htmlToMarkdown(html);\n return {\n markdown,\n meta: { format: \"docx\", warnings: messages.filter((m) => m.type === \"warning\").length },\n };\n },\n};\n","// src/sync/converters/xlsx.ts\nimport type { Converter, ConversionResult } from \"./types.js\";\nimport { rowsToMarkdown } from \"./text.js\";\n\n/** Render a single ExcelJS cell value as plain text. 
*/\nfunction cellText(value: unknown): string {\n if (value === null || value === undefined) return \"\";\n if (value instanceof Date) return value.toISOString().slice(0, 10);\n if (typeof value === \"object\") {\n const v = value as Record<string, unknown>;\n if (typeof v[\"text\"] === \"string\") return v[\"text\"];\n if (\"result\" in v) return String(v[\"result\"] ?? \"\");\n if (Array.isArray(v[\"richText\"])) {\n return (v[\"richText\"] as Array<{ text?: string }>).map((r) => r.text ?? \"\").join(\"\");\n }\n if (\"hyperlink\" in v) return String(v[\"text\"] ?? v[\"hyperlink\"] ?? \"\");\n }\n return String(value);\n}\n\n/**\n * Spreadsheet (XLSX) → Markdown via ExcelJS. Each worksheet becomes a\n * `## <sheet name>` section followed by a GitHub-flavored Markdown table.\n * ExcelJS is loaded lazily.\n */\nexport const xlsxConverter: Converter = {\n name: \"xlsx\",\n extensions: [\"xlsx\", \"xlsm\"],\n mimeTypes: [\n \"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet\",\n \"application/vnd.ms-excel\",\n ],\n async convert(buffer: Buffer): Promise<ConversionResult> {\n const ExcelJS = (await import(\"exceljs\")).default;\n const wb = new ExcelJS.Workbook();\n // ExcelJS's typings predate the @types/node generic Buffer; widen via ArrayBuffer.\n await wb.xlsx.load(buffer as unknown as ArrayBuffer);\n\n const sections: string[] = [];\n const sheetNames: string[] = [];\n wb.eachSheet((sheet) => {\n sheetNames.push(sheet.name);\n const rows: string[][] = [];\n sheet.eachRow({ includeEmpty: false }, (row) => {\n const cells: string[] = [];\n row.eachCell({ includeEmpty: true }, (cell) => {\n cells.push(cellText(cell.value));\n });\n rows.push(cells);\n });\n const table = rowsToMarkdown(rows);\n if (table) sections.push(`## ${sheet.name}\\n\\n${table}`);\n });\n\n return { markdown: sections.join(\"\\n\\n\"), meta: { format: \"xlsx\", sheets: sheetNames } };\n },\n};\n","// src/sync/converters/pptx.ts\nimport type { Converter, ConversionResult } from 
\"./types.js\";\n\n/** Extract the visible text runs (`<a:t>…</a:t>`) from one slide's XML. */\nexport function extractSlideText(xml: string): string {\n const runs = [...xml.matchAll(/<a:t>([\\s\\S]*?)<\\/a:t>/g)].map((m) =>\n (m[1] ?? \"\")\n .replace(/&amp;/g, \"&\")\n .replace(/&lt;/g, \"<\")\n .replace(/&gt;/g, \">\")\n .replace(/&quot;/g, '\"')\n .replace(/&apos;/g, \"'\")\n );\n return runs.join(\" \").replace(/\\s+/g, \" \").trim();\n}\n\nfunction slideNumber(entryName: string): number {\n const m = entryName.match(/slide(\\d+)\\.xml$/);\n return m ? parseInt(m[1] ?? \"0\", 10) : 0;\n}\n\n/**\n * PPTX → Markdown. A .pptx is a zip; slide text lives in `ppt/slides/slideN.xml`\n * as `<a:t>` runs. We unzip with adm-zip (already a dependency) and emit one\n * `## Slide N` section per slide — no extra native parser needed.\n */\nexport const pptxConverter: Converter = {\n name: \"pptx\",\n extensions: [\"pptx\"],\n mimeTypes: [\"application/vnd.openxmlformats-officedocument.presentationml.presentation\"],\n async convert(buffer: Buffer): Promise<ConversionResult> {\n const AdmZip = (await import(\"adm-zip\")).default;\n const zip = new AdmZip(buffer);\n const slides = zip\n .getEntries()\n .filter((e) => /^ppt\\/slides\\/slide\\d+\\.xml$/.test(e.entryName))\n .sort((a, b) => slideNumber(a.entryName) - slideNumber(b.entryName));\n\n const sections: string[] = [];\n for (const entry of slides) {\n const text = extractSlideText(entry.getData().toString(\"utf-8\"));\n if (text) sections.push(`## Slide ${slideNumber(entry.entryName)}\\n\\n${text}`);\n }\n return { markdown: sections.join(\"\\n\\n\"), meta: { format: \"pptx\", slides: slides.length } };\n },\n};\n","// src/sync/converters/pdf.ts\nimport type { Converter, ConversionResult } from \"./types.js\";\n\n/** Whether scanned-PDF OCR is enabled (opt-in, requires @napi-rs/canvas). */\nexport function isPdfOcrEnabled(): boolean {\n const v = (process.env[\"DXCRM_PDF_OCR\"] ?? 
\"\").toLowerCase();\n return v === \"1\" || v === \"true\" || v === \"yes\" || v === \"on\";\n}\n\n/**\n * OCR a scanned PDF by rendering each page to a PNG (unpdf + @napi-rs/canvas)\n * and running tesseract.js over it. The canvas backend is an optional peer\n * dependency; if it isn't installed we throw a clear, catchable error so the\n * caller can fall back to the OCR-candidate stub. Page count is capped via\n * DXCRM_PDF_OCR_MAX_PAGES (default 20) to keep this slow path bounded.\n */\nasync function ocrPdf(pdf: unknown, totalPages: number): Promise<string> {\n // Fail fast with a helpful message when the optional canvas backend is absent.\n try {\n await import(\"@napi-rs/canvas\");\n } catch {\n throw new Error(\n \"PDF OCR requires the optional '@napi-rs/canvas' package — run `npm install @napi-rs/canvas`\"\n );\n }\n\n const { renderPageAsImage } = await import(\"unpdf\");\n const { recognize } = await import(\"tesseract.js\");\n const lang = process.env[\"DXCRM_OCR_LANG\"] ?? \"eng\";\n const canvasImport = (): Promise<unknown> => import(\"@napi-rs/canvas\");\n\n const maxPages = Number(process.env[\"DXCRM_PDF_OCR_MAX_PAGES\"] ?? 20) || 20;\n const pageCount = Math.min(totalPages, maxPages);\n\n const pages: string[] = [];\n for (let p = 1; p <= pageCount; p++) {\n const png = await renderPageAsImage(pdf as never, p, { canvasImport, scale: 2 } as never);\n const {\n data: { text },\n } = await recognize(Buffer.from(png), lang);\n if (text.trim()) pages.push(`## Page ${p}\\n\\n${text.trim()}`);\n }\n return pages.join(\"\\n\\n\");\n}\n\n/**\n * PDF → Markdown. Extracts the digital text layer with unpdf (a serverless\n * pdf.js build). Scanned PDFs have no text layer: when DXCRM_PDF_OCR is enabled\n * they are rendered and OCR'd page-by-page, otherwise they're flagged as OCR\n * candidates rather than emitting garbage. 
unpdf is loaded lazily.\n */\nexport const pdfConverter: Converter = {\n name: \"pdf\",\n extensions: [\"pdf\"],\n mimeTypes: [\"application/pdf\"],\n async convert(buffer: Buffer, filename: string): Promise<ConversionResult> {\n const { extractText, getDocumentProxy } = await import(\"unpdf\");\n const pdf = await getDocumentProxy(new Uint8Array(buffer));\n const { totalPages, text } = await extractText(pdf, { mergePages: true });\n const merged = (Array.isArray(text) ? text.join(\"\\n\\n\") : text).trim();\n\n if (merged) return { markdown: merged, meta: { format: \"pdf\", pages: totalPages } };\n\n // No text layer — scanned PDF. OCR if opted in, else flag as a candidate.\n if (isPdfOcrEnabled()) {\n try {\n const ocr = await ocrPdf(pdf, totalPages);\n if (ocr.trim()) {\n return {\n markdown: `> _OCR of \\`${filename}\\` (${totalPages} pages):_\\n\\n${ocr.trim()}`,\n meta: { format: \"pdf\", pages: totalPages, ocr: true },\n };\n }\n } catch (err) {\n process.stderr.write(\n `[converters] pdf OCR failed for ${filename}: ${(err as Error).message}\\n`\n );\n }\n }\n\n return { markdown: \"\", meta: { format: \"pdf\", pages: totalPages, ocrCandidate: true } };\n },\n};\n","// src/sync/converters/image.ts\nimport type { Converter, ConversionResult } from \"./types.js\";\n\n/**\n * Image → Markdown via Tesseract.js OCR (pure-JS, 100+ languages, fully local).\n * This is the heaviest converter: tesseract.js downloads a WASM core and\n * language data on first use, so it is loaded lazily and only invoked for image\n * attachments. Language defaults to English, override with DXCRM_OCR_LANG.\n */\nexport const imageConverter: Converter = {\n name: \"image\",\n extensions: [\"png\", \"jpg\", \"jpeg\", \"tif\", \"tiff\", \"bmp\", \"webp\", \"gif\", \"pbm\"],\n mimeTypes: [\"image/*\"],\n async convert(buffer: Buffer, filename: string): Promise<ConversionResult> {\n const lang = process.env[\"DXCRM_OCR_LANG\"] ?? 
\"eng\";\n const { recognize } = await import(\"tesseract.js\");\n const {\n data: { text },\n } = await recognize(buffer, lang);\n const ocr = text.trim();\n return {\n markdown: ocr ? `> _OCR of \\`${filename}\\`:_\\n\\n${ocr}` : \"\",\n meta: { format: \"image\", ocr: true, lang },\n };\n },\n};\n","// src/sync/converters/registry.ts\nimport type { Converter, ConversionResult } from \"./types.js\";\nimport { textConverter } from \"./text.js\";\nimport { docxConverter } from \"./docx.js\";\nimport { xlsxConverter } from \"./xlsx.js\";\nimport { pptxConverter } from \"./pptx.js\";\nimport { pdfConverter } from \"./pdf.js\";\nimport { htmlConverter } from \"./html.js\";\nimport { imageConverter } from \"./image.js\";\n\n/**\n * Fallback converter for unknown/binary attachments: emit a small metadata stub\n * instead of garbage bytes, so the attachment is still recorded and linkable.\n */\nexport const fallbackConverter: Converter = {\n name: \"binary\",\n extensions: [],\n convert(buffer: Buffer, filename: string): Promise<ConversionResult> {\n const kb = Math.max(1, Math.round(buffer.length / 1024));\n return Promise.resolve({\n markdown: `> _Binary attachment \\`${filename}\\` (${kb} KB) — no text representation available._`,\n meta: { format: \"binary\", bytes: buffer.length },\n });\n },\n};\n\n/**\n * Ordered converter registry. Earlier entries win on extension conflicts. The\n * text converter is intentionally last among the \"real\" converters so that more\n * specific formats (html, etc.) take precedence over generic text matching.\n */\nexport const CONVERTERS: Converter[] = [\n docxConverter,\n xlsxConverter,\n pptxConverter,\n pdfConverter,\n imageConverter,\n htmlConverter,\n textConverter,\n];\n\nfunction extensionOf(filename: string): string {\n return filename.split(\".\").pop()?.toLowerCase() ?? 
\"\";\n}\n\nfunction mimeMatches(converter: Converter, mime: string): boolean {\n if (!converter.mimeTypes) return false;\n const lower = mime.toLowerCase();\n return converter.mimeTypes.some((m) => {\n const ml = m.toLowerCase();\n if (ml.endsWith(\"/*\")) return lower.startsWith(ml.slice(0, -1));\n return ml === lower;\n });\n}\n\n/**\n * Pick the converter for an attachment by file extension first (most reliable\n * for Gmail attachments, which always carry a filename), then by MIME type.\n * Returns `undefined` when nothing matches.\n */\nexport function matchConverter(filename: string, mime?: string): Converter | undefined {\n const ext = extensionOf(filename);\n if (ext) {\n const byExt = CONVERTERS.find((c) => c.extensions.includes(ext));\n if (byExt) return byExt;\n }\n if (mime) {\n const byMime = CONVERTERS.find((c) => mimeMatches(c, mime));\n if (byMime) return byMime;\n }\n return undefined;\n}\n\n/**\n * Convert an attachment to Markdown, dispatching to the best converter and\n * falling back to a metadata stub. Converter errors never throw: they are\n * swallowed into the fallback so a single bad attachment can't break a sync.\n */\nexport async function convertAttachment(\n buffer: Buffer,\n filename: string,\n mime?: string\n): Promise<ConversionResult> {\n const converter = matchConverter(filename, mime) ?? 
fallbackConverter;\n try {\n const result = await converter.convert(buffer, filename);\n if (!result.markdown.trim()) {\n return {\n markdown: `> _Attachment \\`${filename}\\` contained no extractable text._`,\n meta: { ...result.meta, empty: true },\n };\n }\n return result;\n } catch (err) {\n process.stderr.write(\n `[converters] ${converter.name} failed for ${filename}: ${(err as Error).message}\\n`\n );\n return fallbackConverter.convert(buffer, filename);\n }\n}\n","// src/sync/attachments.ts\nimport fs from \"fs\";\nimport path from \"path\";\nimport type { gmail_v1 } from \"@googleapis/gmail\";\nimport { convertAttachment } from \"./converters/registry.js\";\nimport { chunkText } from \"../core/chunk.js\";\nimport { assertSafeSlug } from \"../fs/customer-dir.js\";\nimport { logger } from \"../core/logger.js\";\n\n/** Default per-attachment size cap (skip larger blobs to keep syncs bounded). */\nexport const DEFAULT_MAX_ATTACHMENT_BYTES = 25 * 1024 * 1024;\n\nexport interface AttachmentPart {\n filename: string;\n mimeType: string;\n attachmentId: string;\n size: number;\n}\n\nexport interface SavedAttachment {\n /** Original filename as sent. */\n originalName: string;\n /** Stored raw filename (sanitized, message-prefixed) under attachments/. */\n storedName: string;\n /** Markdown filename under attachments/. */\n markdownName: string;\n /** Source ref used for LanceDB indexing. */\n ref: string;\n /** Number of indexed chunks produced from the Markdown. */\n chunks: number;\n}\n\n/**\n * Recursively collect downloadable attachment parts from a Gmail message\n * payload — any MIME part that carries both a filename and a body.attachmentId.\n * Inline parts without a filename (e.g. 
signature logos) are ignored.\n */\nexport function collectAttachmentParts(\n payload: gmail_v1.Schema$MessagePart | undefined\n): AttachmentPart[] {\n const out: AttachmentPart[] = [];\n const walk = (part?: gmail_v1.Schema$MessagePart): void => {\n if (!part) return;\n const filename = part.filename ?? \"\";\n const attachmentId = part.body?.attachmentId ?? \"\";\n if (filename && attachmentId) {\n out.push({\n filename,\n mimeType: part.mimeType ?? \"application/octet-stream\",\n attachmentId,\n size: part.body?.size ?? 0,\n });\n }\n for (const child of part.parts ?? []) walk(child);\n };\n walk(payload);\n return out;\n}\n\n/** Make a filename safe for use as a single path segment. */\nexport function sanitizeFilename(name: string): string {\n const base = name.split(/[\\\\/]/).pop() ?? name;\n return (\n base\n .replace(/[^a-zA-Z0-9._-]+/g, \"_\")\n .replace(/^_+|_+$/g, \"\")\n .slice(0, 120) || \"attachment\"\n );\n}\n\n/**\n * Download, convert and index every attachment of a single Gmail message.\n *\n * For each attachment: the raw bytes are saved under\n * `customers/<slug>/attachments/<messageId>__<name>`, converted to a sibling\n * `.md` file, and the Markdown is chunked and indexed into LanceDB so the\n * attachment's content is semantically searchable. Failures on a single\n * attachment are logged and skipped — they never abort the message sync.\n */\nexport async function processMessageAttachments(opts: {\n gmail: gmail_v1.Gmail;\n dataDir: string;\n slug: string;\n messageId: string;\n source: string;\n payload: gmail_v1.Schema$MessagePart | undefined;\n date: string;\n maxBytes?: number;\n}): Promise<SavedAttachment[]> {\n const parts = collectAttachmentParts(opts.payload);\n if (parts.length === 0) return [];\n\n assertSafeSlug(opts.slug);\n const maxBytes = opts.maxBytes ?? 
DEFAULT_MAX_ATTACHMENT_BYTES;\n const attachmentsDir = path.join(opts.dataDir, \"customers\", opts.slug, \"attachments\");\n fs.mkdirSync(attachmentsDir, { recursive: true });\n\n const { indexInLanceDB } = await import(\"../core/lancedb.js\");\n const saved: SavedAttachment[] = [];\n\n for (const part of parts) {\n try {\n if (part.size > maxBytes) {\n logger.warn(\"gmail-sync\", \"skipping oversized attachment\", {\n filename: part.filename,\n bytes: part.size,\n });\n continue;\n }\n\n const resp = await opts.gmail.users.messages.attachments.get({\n userId: \"me\",\n messageId: opts.messageId,\n id: part.attachmentId,\n });\n const data = resp.data.data;\n if (!data) continue;\n const buffer = Buffer.from(data, \"base64url\");\n\n const storedName = `${opts.messageId}__${sanitizeFilename(part.filename)}`;\n const markdownName = `${storedName}.md`;\n fs.writeFileSync(path.join(attachmentsDir, storedName), buffer);\n\n const { markdown } = await convertAttachment(buffer, part.filename, part.mimeType);\n const mdBody = `# ${part.filename}\\n\\n_Source: ${opts.source} · ${opts.date}_\\n\\n${markdown}\\n`;\n fs.writeFileSync(path.join(attachmentsDir, markdownName), mdBody);\n\n const ref = `${opts.source}#att:${part.filename}`;\n const chunks = chunkText(markdown);\n for (let i = 0; i < chunks.length; i++) {\n await indexInLanceDB(opts.dataDir, opts.slug, chunks[i]!, `${ref}#${i}`, {\n date: opts.date,\n type: \"attachment\",\n }).catch((err: unknown) => {\n logger.error(\"gmail-sync\", \"attachment index failed\", {\n error: (err as Error).message,\n });\n });\n }\n\n saved.push({\n originalName: part.filename,\n storedName,\n markdownName,\n ref,\n chunks: chunks.length,\n });\n } catch (err) {\n logger.warn(\"gmail-sync\", \"attachment failed\", {\n filename: part.filename,\n error: (err as Error).message,\n });\n }\n }\n\n return 
saved;\n}\n"],"mappings":";;;;;;;;AAIA,SAAS,OAAO,OAAuB;CACrC,OAAO,MAAM,QAAQ,OAAO,MAAM,EAAE,QAAQ,OAAO,KAAK,EAAE,QAAQ,UAAU,GAAG,EAAE,KAAK;AACxF;;;;;;AAOA,SAAgB,aAAa,MAAwB;CACnD,MAAM,MAAgB,CAAC;CACvB,IAAI,MAAM;CACV,IAAI,WAAW;CACf,KAAK,IAAI,IAAI,GAAG,IAAI,KAAK,QAAQ,KAAK;EACpC,MAAM,KAAK,KAAK;EAChB,IAAI,UACF,IAAI,OAAO,MACT,IAAI,KAAK,IAAI,OAAO,MAAK;GACvB,OAAO;GACP;EACF,OACE,WAAW;OAGb,OAAO;OAEJ,IAAI,OAAO,MAChB,WAAW;OACN,IAAI,OAAO,KAAK;GACrB,IAAI,KAAK,GAAG;GACZ,MAAM;EACR,OACE,OAAO;CAEX;CACA,IAAI,KAAK,GAAG;CACZ,OAAO;AACT;;AAGA,SAAgB,eAAe,MAA0B;CACvD,IAAI,KAAK,WAAW,GAAG,OAAO;CAC9B,MAAM,QAAQ,KAAK,IAAI,GAAG,KAAK,KAAK,MAAM,EAAE,MAAM,CAAC;CACnD,MAAM,OAAO,MACX,MAAM,KAAK,EAAE,QAAQ,MAAM,IAAI,GAAG,MAAM,OAAO,EAAE,MAAM,EAAE,CAAC;CAE5D,MAAM,SAAS,IAAI,KAAK,MAAM,CAAC,CAAC;CAMhC,OAAO;EAJL,KAAK,OAAO,KAAK,KAAK,EAAE;EACxB,KAAK,OAAO,UAAU,KAAK,EAAE,KAAK,KAAK,EAAE;EACzC,GAAG,KAAK,MAAM,CAAC,EAAE,KAAK,MAAM,KAAK,IAAI,CAAC,EAAE,KAAK,KAAK,EAAE,GAAG;CAE9C,EAAE,KAAK,IAAI;AACxB;;AAGA,SAAgB,cAAc,KAAqB;CAKjD,OAAO,eAJM,IACV,MAAM,OAAO,EACb,QAAQ,MAAM,EAAE,KAAK,EAAE,SAAS,CAAC,EACjC,IAAI,YACkB,CAAC;AAC5B;AAEA,MAAM,kBAAkB;CAAC;CAAO;CAAQ;CAAO;CAAM;AAAU;AAC/D,MAAM,wBAAgD;CACpD,MAAM;CACN,KAAK;CACL,MAAM;CACN,KAAK;AACP;;;;;;AAOA,MAAa,gBAA2B;CACtC,MAAM;CACN,YAAY;EAAC,GAAG;EAAiB;EAAO;EAAO,GAAG,OAAO,KAAK,qBAAqB;CAAC;CACpF,WAAW;EAAC;EAAc;EAAY;EAAiB;EAAoB;CAAQ;CACnF,QAAQ,QAAgB,UAA6C;EACnE,MAAM,MAAM,SAAS,MAAM,GAAG,EAAE,IAAI,GAAG,YAAY,KAAK;EACxD,MAAM,UAAU,OAAO,SAAS,OAAO;EAEvC,IAAI,QAAQ,OACV,OAAO,QAAQ,QAAQ;GAAE,UAAU,cAAc,OAAO;GAAG,MAAM,EAAE,QAAQ,MAAM;EAAE,CAAC;EAEtF,IAAI,QAAQ,OAAO;GACjB,MAAM,QAAQ,QAAQ,QAAQ,OAAO,GAAG;GACxC,OAAO,QAAQ,QAAQ;IAAE,UAAU,cAAc,KAAK;IAAG,MAAM,EAAE,QAAQ,MAAM;GAAE,CAAC;EACpF;EACA,MAAM,QAAQ,sBAAsB;EACpC,IAAI,OACF,OAAO,QAAQ,QAAQ;GACrB,UAAU,SAAS,MAAM,IAAI,QAAQ,KAAK,EAAE;GAC5C,MAAM,EAAE,QAAQ,MAAM;EACxB,CAAC;EAGH,OAAO,QAAQ,QAAQ;GAAE,UAAU,QAAQ,KAAK;GAAG,MAAM,EAAE,QAAQ,OAAO;EAAE,CAAC;CAC/E;AACF;;;;;;;;ACjGA,MAAa,gBAA2B;CACtC,MAAM;CACN,YAAY,CAAC,MAAM;CACnB,WAAW,CAAC,yEAAyE;CACrF,MAAM,QAAQ,QAA2C;EAEvD,MAAM,EAAE,
OAAO,MAAM,aAAa,OADjB,MAAM,OAAO,YAAY,QACM,cAAc,EAAE,OAAO,CAAC;EAExE,OAAO;GACL,UAAA,MAFqB,eAAe,IAAI;GAGxC,MAAM;IAAE,QAAQ;IAAQ,UAAU,SAAS,QAAQ,MAAM,EAAE,SAAS,SAAS,EAAE;GAAO;EACxF;CACF;AACF;;;;ACjBA,SAAS,SAAS,OAAwB;CACxC,IAAI,UAAU,QAAQ,UAAU,KAAA,GAAW,OAAO;CAClD,IAAI,iBAAiB,MAAM,OAAO,MAAM,YAAY,EAAE,MAAM,GAAG,EAAE;CACjE,IAAI,OAAO,UAAU,UAAU;EAC7B,MAAM,IAAI;EACV,IAAI,OAAO,EAAE,YAAY,UAAU,OAAO,EAAE;EAC5C,IAAI,YAAY,GAAG,OAAO,OAAO,EAAE,aAAa,EAAE;EAClD,IAAI,MAAM,QAAQ,EAAE,WAAW,GAC7B,OAAQ,EAAE,YAAyC,KAAK,MAAM,EAAE,QAAQ,EAAE,EAAE,KAAK,EAAE;EAErF,IAAI,eAAe,GAAG,OAAO,OAAO,EAAE,WAAW,EAAE,gBAAgB,EAAE;CACvE;CACA,OAAO,OAAO,KAAK;AACrB;;;;;;AAOA,MAAa,gBAA2B;CACtC,MAAM;CACN,YAAY,CAAC,QAAQ,MAAM;CAC3B,WAAW,CACT,qEACA,0BACF;CACA,MAAM,QAAQ,QAA2C;EAEvD,MAAM,KAAK,KADM,OAAM,OAAO,aAAY,QACnB,SAAS;EAEhC,MAAM,GAAG,KAAK,KAAK,MAAgC;EAEnD,MAAM,WAAqB,CAAC;EAC5B,MAAM,aAAuB,CAAC;EAC9B,GAAG,WAAW,UAAU;GACtB,WAAW,KAAK,MAAM,IAAI;GAC1B,MAAM,OAAmB,CAAC;GAC1B,MAAM,QAAQ,EAAE,cAAc,MAAM,IAAI,QAAQ;IAC9C,MAAM,QAAkB,CAAC;IACzB,IAAI,SAAS,EAAE,cAAc,KAAK,IAAI,SAAS;KAC7C,MAAM,KAAK,SAAS,KAAK,KAAK,CAAC;IACjC,CAAC;IACD,KAAK,KAAK,KAAK;GACjB,CAAC;GACD,MAAM,QAAQ,eAAe,IAAI;GACjC,IAAI,OAAO,SAAS,KAAK,MAAM,MAAM,KAAK,MAAM,OAAO;EACzD,CAAC;EAED,OAAO;GAAE,UAAU,SAAS,KAAK,MAAM;GAAG,MAAM;IAAE,QAAQ;IAAQ,QAAQ;GAAW;EAAE;CACzF;AACF;;;;ACpDA,SAAgB,iBAAiB,KAAqB;CASpD,OARa,CAAC,GAAG,IAAI,SAAS,yBAAyB,CAAC,EAAE,KAAK,OAC5D,EAAE,MAAM,IACN,QAAQ,UAAU,GAAG,EACrB,QAAQ,SAAS,GAAG,EACpB,QAAQ,SAAS,GAAG,EACpB,QAAQ,WAAW,IAAG,EACtB,QAAQ,WAAW,GAAG,CAEjB,EAAE,KAAK,GAAG,EAAE,QAAQ,QAAQ,GAAG,EAAE,KAAK;AAClD;AAEA,SAAS,YAAY,WAA2B;CAC9C,MAAM,IAAI,UAAU,MAAM,kBAAkB;CAC5C,OAAO,IAAI,SAAS,EAAE,MAAM,KAAK,EAAE,IAAI;AACzC;;;;;;AAOA,MAAa,gBAA2B;CACtC,MAAM;CACN,YAAY,CAAC,MAAM;CACnB,WAAW,CAAC,2EAA2E;CACvF,MAAM,QAAQ,QAA2C;EACvD,MAAM,UAAU,MAAM,OAAO,YAAY;EAEzC,MAAM,SAAS,IADC,OAAO,MACN,EACd,WAAW,EACX,QAAQ,MAAM,+BAA+B,KAAK,EAAE,SAAS,CAAC,EAC9D,MAAM,GAAG,MAAM,YAAY,EAAE,SAAS,IAAI,YAAY,EAAE,SAAS,CAAC;EAErE,MAAM,WAAqB,CAAC;EAC5B,KAAK,MAAM,SAAS,QAAQ;GAC1B,MAAM,OAAO,iBAAiB,MAAM,QAAQ,EAAE,SAA
S,OAAO,CAAC;GAC/D,IAAI,MAAM,SAAS,KAAK,YAAY,YAAY,MAAM,SAAS,EAAE,MAAM,MAAM;EAC/E;EACA,OAAO;GAAE,UAAU,SAAS,KAAK,MAAM;GAAG,MAAM;IAAE,QAAQ;IAAQ,QAAQ,OAAO;GAAO;EAAE;CAC5F;AACF;;;;ACzCA,SAAgB,kBAA2B;CACzC,MAAM,KAAK,QAAQ,IAAI,oBAAoB,IAAI,YAAY;CAC3D,OAAO,MAAM,OAAO,MAAM,UAAU,MAAM,SAAS,MAAM;AAC3D;;;;;;;;AASA,eAAe,OAAO,KAAc,YAAqC;CAEvE,IAAI;EACF,MAAM,OAAO;CACf,QAAQ;EACN,MAAM,IAAI,MACR,6FACF;CACF;CAEA,MAAM,EAAE,sBAAsB,MAAM,OAAO;CAC3C,MAAM,EAAE,cAAc,MAAM,OAAO;CACnC,MAAM,OAAO,QAAQ,IAAI,qBAAqB;CAC9C,MAAM,qBAAuC,OAAO;CAEpD,MAAM,WAAW,OAAO,QAAQ,IAAI,8BAA8B,EAAE,KAAK;CACzE,MAAM,YAAY,KAAK,IAAI,YAAY,QAAQ;CAE/C,MAAM,QAAkB,CAAC;CACzB,KAAK,IAAI,IAAI,GAAG,KAAK,WAAW,KAAK;EACnC,MAAM,MAAM,MAAM,kBAAkB,KAAc,GAAG;GAAE;GAAc,OAAO;EAAE,CAAU;EACxF,MAAM,EACJ,MAAM,EAAE,WACN,MAAM,UAAU,OAAO,KAAK,GAAG,GAAG,IAAI;EAC1C,IAAI,KAAK,KAAK,GAAG,MAAM,KAAK,WAAW,EAAE,MAAM,KAAK,KAAK,GAAG;CAC9D;CACA,OAAO,MAAM,KAAK,MAAM;AAC1B;;;;;;;AAQA,MAAa,eAA0B;CACrC,MAAM;CACN,YAAY,CAAC,KAAK;CAClB,WAAW,CAAC,iBAAiB;CAC7B,MAAM,QAAQ,QAAgB,UAA6C;EACzE,MAAM,EAAE,aAAa,qBAAqB,MAAM,OAAO;EACvD,MAAM,MAAM,MAAM,iBAAiB,IAAI,WAAW,MAAM,CAAC;EACzD,MAAM,EAAE,YAAY,SAAS,MAAM,YAAY,KAAK,EAAE,YAAY,KAAK,CAAC;EACxE,MAAM,UAAU,MAAM,QAAQ,IAAI,IAAI,KAAK,KAAK,MAAM,IAAI,MAAM,KAAK;EAErE,IAAI,QAAQ,OAAO;GAAE,UAAU;GAAQ,MAAM;IAAE,QAAQ;IAAO,OAAO;GAAW;EAAE;EAGlF,IAAI,gBAAgB,GAClB,IAAI;GACF,MAAM,MAAM,MAAM,OAAO,KAAK,UAAU;GACxC,IAAI,IAAI,KAAK,GACX,OAAO;IACL,UAAU,eAAe,SAAS,MAAM,WAAW,eAAe,IAAI,KAAK;IAC3E,MAAM;KAAE,QAAQ;KAAO,OAAO;KAAY,KAAK;IAAK;GACtD;EAEJ,SAAS,KAAK;GACZ,QAAQ,OAAO,MACb,mCAAmC,SAAS,IAAK,IAAc,QAAQ,GACzE;EACF;EAGF,OAAO;GAAE,UAAU;GAAI,MAAM;IAAE,QAAQ;IAAO,OAAO;IAAY,cAAc;GAAK;EAAE;CACxF;AACF;;;;;;;;;ACzEA,MAAa,iBAA4B;CACvC,MAAM;CACN,YAAY;EAAC;EAAO;EAAO;EAAQ;EAAO;EAAQ;EAAO;EAAQ;EAAO;CAAK;CAC7E,WAAW,CAAC,SAAS;CACrB,MAAM,QAAQ,QAAgB,UAA6C;EACzE,MAAM,OAAO,QAAQ,IAAI,qBAAqB;EAC9C,MAAM,EAAE,cAAc,MAAM,OAAO;EACnC,MAAM,EACJ,MAAM,EAAE,WACN,MAAM,UAAU,QAAQ,IAAI;EAChC,MAAM,MAAM,KAAK,KAAK;EACtB,OAAO;GACL,UAAU,MAAM,eAAe,SAAS,UAAU,QAAQ;GAC1D,MAAM;IAAE,QAAQ;IAAS,KAAK;IAAM;GAAK;EAC3C;C
ACF;AACF;;;;;;;ACXA,MAAa,oBAA+B;CAC1C,MAAM;CACN,YAAY,CAAC;CACb,QAAQ,QAAgB,UAA6C;EACnE,MAAM,KAAK,KAAK,IAAI,GAAG,KAAK,MAAM,OAAO,SAAS,IAAI,CAAC;EACvD,OAAO,QAAQ,QAAQ;GACrB,UAAU,0BAA0B,SAAS,MAAM,GAAG;GACtD,MAAM;IAAE,QAAQ;IAAU,OAAO,OAAO;GAAO;EACjD,CAAC;CACH;AACF;;;;;;AAOA,MAAa,aAA0B;CACrC;CACA;CACA;CACA;CACA;CACA;CACA;AACF;AAEA,SAAS,YAAY,UAA0B;CAC7C,OAAO,SAAS,MAAM,GAAG,EAAE,IAAI,GAAG,YAAY,KAAK;AACrD;AAEA,SAAS,YAAY,WAAsB,MAAuB;CAChE,IAAI,CAAC,UAAU,WAAW,OAAO;CACjC,MAAM,QAAQ,KAAK,YAAY;CAC/B,OAAO,UAAU,UAAU,MAAM,MAAM;EACrC,MAAM,KAAK,EAAE,YAAY;EACzB,IAAI,GAAG,SAAS,IAAI,GAAG,OAAO,MAAM,WAAW,GAAG,MAAM,GAAG,EAAE,CAAC;EAC9D,OAAO,OAAO;CAChB,CAAC;AACH;;;;;;AAOA,SAAgB,eAAe,UAAkB,MAAsC;CACrF,MAAM,MAAM,YAAY,QAAQ;CAChC,IAAI,KAAK;EACP,MAAM,QAAQ,WAAW,MAAM,MAAM,EAAE,WAAW,SAAS,GAAG,CAAC;EAC/D,IAAI,OAAO,OAAO;CACpB;CACA,IAAI,MAAM;EACR,MAAM,SAAS,WAAW,MAAM,MAAM,YAAY,GAAG,IAAI,CAAC;EAC1D,IAAI,QAAQ,OAAO;CACrB;AAEF;;;;;;AAOA,eAAsB,kBACpB,QACA,UACA,MAC2B;CAC3B,MAAM,YAAY,eAAe,UAAU,IAAI,KAAK;CACpD,IAAI;EACF,MAAM,SAAS,MAAM,UAAU,QAAQ,QAAQ,QAAQ;EACvD,IAAI,CAAC,OAAO,SAAS,KAAK,GACxB,OAAO;GACL,UAAU,mBAAmB,SAAS;GACtC,MAAM;IAAE,GAAG,OAAO;IAAM,OAAO;GAAK;EACtC;EAEF,OAAO;CACT,SAAS,KAAK;EACZ,QAAQ,OAAO,MACb,gBAAgB,UAAU,KAAK,cAAc,SAAS,IAAK,IAAc,QAAQ,GACnF;EACA,OAAO,kBAAkB,QAAQ,QAAQ,QAAQ;CACnD;AACF;;;;;;AC9DA,SAAgB,uBACd,SACkB;CAClB,MAAM,MAAwB,CAAC;CAC/B,MAAM,QAAQ,SAA6C;EACzD,IAAI,CAAC,MAAM;EACX,MAAM,WAAW,KAAK,YAAY;EAClC,MAAM,eAAe,KAAK,MAAM,gBAAgB;EAChD,IAAI,YAAY,cACd,IAAI,KAAK;GACP;GACA,UAAU,KAAK,YAAY;GAC3B;GACA,MAAM,KAAK,MAAM,QAAQ;EAC3B,CAAC;EAEH,KAAK,MAAM,SAAS,KAAK,SAAS,CAAC,GAAG,KAAK,KAAK;CAClD;CACA,KAAK,OAAO;CACZ,OAAO;AACT;;AAGA,SAAgB,iBAAiB,MAAsB;CAErD,QADa,KAAK,MAAM,OAAO,EAAE,IAAI,KAAK,MAGrC,QAAQ,qBAAqB,GAAG,EAChC,QAAQ,YAAY,EAAE,EACtB,MAAM,GAAG,GAAG,KAAK;AAExB;;;;;;;;;;AAWA,eAAsB,0BAA0B,MASjB;CAC7B,MAAM,QAAQ,uBAAuB,KAAK,OAAO;CACjD,IAAI,MAAM,WAAW,GAAG,OAAO,CAAC;CAEhC,eAAe,KAAK,IAAI;CACxB,MAAM,WAAW,KAAK,YAAA;CACtB,MAAM,iBAAiB,KAAK,KAAK,KAAK,SAAS,aAAa,KAAK,MAAM,aAAa;CACpF,GAAG,UAAU,gBAAgB,EAAE,WAAW,KAAK,CAAC;CAEh
D,MAAM,EAAE,mBAAmB,MAAM,OAAO;CACxC,MAAM,QAA2B,CAAC;CAElC,KAAK,MAAM,QAAQ,OACjB,IAAI;EACF,IAAI,KAAK,OAAO,UAAU;GACxB,OAAO,KAAK,cAAc,iCAAiC;IACzD,UAAU,KAAK;IACf,OAAO,KAAK;GACd,CAAC;GACD;EACF;EAOA,MAAM,QAAO,MALM,KAAK,MAAM,MAAM,SAAS,YAAY,IAAI;GAC3D,QAAQ;GACR,WAAW,KAAK;GAChB,IAAI,KAAK;EACX,CAAC,GACiB,KAAK;EACvB,IAAI,CAAC,MAAM;EACX,MAAM,SAAS,OAAO,KAAK,MAAM,WAAW;EAE5C,MAAM,aAAa,GAAG,KAAK,UAAU,IAAI,iBAAiB,KAAK,QAAQ;EACvE,MAAM,eAAe,GAAG,WAAW;EACnC,GAAG,cAAc,KAAK,KAAK,gBAAgB,UAAU,GAAG,MAAM;EAE9D,MAAM,EAAE,aAAa,MAAM,kBAAkB,QAAQ,KAAK,UAAU,KAAK,QAAQ;EACjF,MAAM,SAAS,KAAK,KAAK,SAAS,eAAe,KAAK,OAAO,KAAK,KAAK,KAAK,OAAO,SAAS;EAC5F,GAAG,cAAc,KAAK,KAAK,gBAAgB,YAAY,GAAG,MAAM;EAEhE,MAAM,MAAM,GAAG,KAAK,OAAO,OAAO,KAAK;EACvC,MAAM,SAAS,UAAU,QAAQ;EACjC,KAAK,IAAI,IAAI,GAAG,IAAI,OAAO,QAAQ,KACjC,MAAM,eAAe,KAAK,SAAS,KAAK,MAAM,OAAO,IAAK,GAAG,IAAI,GAAG,KAAK;GACvE,MAAM,KAAK;GACX,MAAM;EACR,CAAC,EAAE,OAAO,QAAiB;GACzB,OAAO,MAAM,cAAc,2BAA2B,EACpD,OAAQ,IAAc,QACxB,CAAC;EACH,CAAC;EAGH,MAAM,KAAK;GACT,cAAc,KAAK;GACnB;GACA;GACA;GACA,QAAQ,OAAO;EACjB,CAAC;CACH,SAAS,KAAK;EACZ,OAAO,KAAK,cAAc,qBAAqB;GAC7C,UAAU,KAAK;GACf,OAAQ,IAAc;EACxB,CAAC;CACH;CAGF,OAAO;AACT"}
@@ -0,0 +1,32 @@
1
+ //#region src/core/chunk.ts
2
/**
 * Split long text into overlapping chunks for embedding/indexing.
 *
 * The input is trimmed first; empty or whitespace-only text yields no chunks
 * and text that already fits within `maxChars` is returned as a single chunk.
 * Longer text is walked with a sliding window of `maxChars` characters. Each
 * window is cut at the last space character (`" "`) inside it, but only when
 * that space lies past the midpoint of the window, so a break never shrinks a
 * chunk to less than half the limit. The next window starts `overlap`
 * characters before the previous cut so a query straddling a boundary still
 * matches. Every chunk is trimmed and empty chunks are dropped.
 *
 * Limits are normalised so the window always moves forward: `maxChars` is
 * floored and raised to at least 1 (a non-positive limit used to loop forever),
 * and `overlap` is floored and clamped to `[0, floor(maxChars / 2)]` (a negative
 * overlap used to skip text between chunks). `NaN` falls back to the defaults.
 *
 * @param {string} text Text to split.
 * @param {{ maxChars?: number, overlap?: number }} [options] Chunk size limit
 *   (default 1500) and overlap between consecutive chunks (default 150).
 * @returns {string[]} Non-empty, trimmed chunks in document order.
 */
function chunkText(text, options = {}) {
	const maxFloor = Math.floor(options["maxChars"] ?? 1500);
	// A window must hold at least one character, otherwise `start` never advances.
	const maxChars = Number.isNaN(maxFloor) ? 1500 : Math.max(1, maxFloor);
	const overlapFloor = Math.floor(options["overlap"] ?? 150);
	// Keep the overlap within [0, half a window]: each step then moves forward by
	// at least one character and never jumps over unread text.
	const overlap = Math.min(Number.isNaN(overlapFloor) ? 150 : Math.max(0, overlapFloor), Math.floor(maxChars / 2));
	const trimmed = text.trim();
	if (!trimmed) return [];
	if (trimmed.length <= maxChars) return [trimmed];
	const chunks = [];
	let start = 0;
	while (start < trimmed.length) {
		let end = Math.min(start + maxChars, trimmed.length);
		if (end < trimmed.length) {
			// Prefer breaking on a space, but only if it doesn't shrink the chunk too much.
			const lastSpace = trimmed.lastIndexOf(" ", end);
			if (lastSpace > start + maxChars / 2) end = lastSpace;
		}
		chunks.push(trimmed.slice(start, end).trim());
		if (end >= trimmed.length) break;
		start = end - overlap;
	}
	return chunks.filter((c) => c.length > 0);
}
29
+ //#endregion
30
+ exports.chunkText = chunkText;
31
+
32
+ //# sourceMappingURL=chunk-BfDYWZQ8.cjs.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"chunk-BfDYWZQ8.cjs","names":[],"sources":["../src/core/chunk.ts"],"sourcesContent":["// src/core/chunk.ts\n\nexport interface ChunkOptions {\n /** Target maximum characters per chunk. */\n maxChars?: number;\n /** Characters of overlap carried from the end of one chunk into the next. */\n overlap?: number;\n}\n\n/**\n * Split long text into overlapping chunks for embedding/indexing. Each chunk is\n * at most `maxChars`; chunks overlap by `overlap` characters so a query that\n * straddles a boundary still matches. Splits prefer the nearest whitespace\n * before the limit to avoid cutting words mid-token. Short text returns a single\n * trimmed chunk; empty/whitespace-only text returns no chunks.\n */\nexport function chunkText(text: string, options: ChunkOptions = {}): string[] {\n const maxChars = options.maxChars ?? 1500;\n const overlap = Math.min(options.overlap ?? 150, Math.floor(maxChars / 2));\n const trimmed = text.trim();\n if (!trimmed) return [];\n if (trimmed.length <= maxChars) return [trimmed];\n\n const chunks: string[] = [];\n let start = 0;\n while (start < trimmed.length) {\n let end = Math.min(start + maxChars, trimmed.length);\n // Prefer breaking on whitespace, but only if it doesn't shrink the chunk too much.\n if (end < trimmed.length) {\n const lastSpace = trimmed.lastIndexOf(\" \", end);\n if (lastSpace > start + maxChars / 2) end = lastSpace;\n }\n chunks.push(trimmed.slice(start, end).trim());\n if (end >= trimmed.length) break;\n start = end - overlap;\n }\n return chunks.filter((c) => c.length > 
0);\n}\n"],"mappings":";;;;;;;;AAgBA,SAAgB,UAAU,MAAc,UAAwB,CAAC,GAAa;CAC5E,MAAM,WAAW,QAAQ,YAAY;CACrC,MAAM,UAAU,KAAK,IAAI,QAAQ,WAAW,KAAK,KAAK,MAAM,WAAW,CAAC,CAAC;CACzE,MAAM,UAAU,KAAK,KAAK;CAC1B,IAAI,CAAC,SAAS,OAAO,CAAC;CACtB,IAAI,QAAQ,UAAU,UAAU,OAAO,CAAC,OAAO;CAE/C,MAAM,SAAmB,CAAC;CAC1B,IAAI,QAAQ;CACZ,OAAO,QAAQ,QAAQ,QAAQ;EAC7B,IAAI,MAAM,KAAK,IAAI,QAAQ,UAAU,QAAQ,MAAM;EAEnD,IAAI,MAAM,QAAQ,QAAQ;GACxB,MAAM,YAAY,QAAQ,YAAY,KAAK,GAAG;GAC9C,IAAI,YAAY,QAAQ,WAAW,GAAG,MAAM;EAC9C;EACA,OAAO,KAAK,QAAQ,MAAM,OAAO,GAAG,EAAE,KAAK,CAAC;EAC5C,IAAI,OAAO,QAAQ,QAAQ;EAC3B,QAAQ,MAAM;CAChB;CACA,OAAO,OAAO,QAAQ,MAAM,EAAE,SAAS,CAAC;AAC1C"}
@@ -0,0 +1,32 @@
1
+ //#region src/core/chunk.ts
2
/**
 * Split long text into overlapping chunks for embedding/indexing.
 *
 * The input is trimmed first. Empty or whitespace-only text yields no chunks,
 * and text that already fits within `maxChars` is returned as a single chunk.
 * Longer text is walked left to right: each window spans at most `maxChars`
 * characters, and the next window starts `overlap` characters before the
 * previous one ended, so a query that straddles a boundary still matches.
 * A window that is not the last one is cut back to the last space character
 * (U+0020 only; tabs and newlines are not considered) when that space lies past
 * the midpoint of the window, to avoid cutting a word in half without shrinking
 * the chunk too much. Every chunk is trimmed and empty chunks are dropped.
 *
 * @param {string} text Text to split.
 * @param {{ maxChars?: number, overlap?: number }} [options]
 *   `maxChars` is the maximum characters per chunk (default 1500); fractional
 *   values are rounded down. `overlap` is the number of characters carried from
 *   the end of one chunk into the next (default 150); it is rounded down and
 *   clamped to the range `0 .. floor(maxChars / 2)`.
 * @returns {string[]} The non-empty, trimmed chunks in document order.
 * @throws {RangeError} If `maxChars` is not a number >= 1 or `overlap` is NaN.
 */
function chunkText(text, options) {
	const { maxChars: requestedMax, overlap: requestedOverlap } = options ?? {};

	// A limit below 1 (or NaN) can never advance the window: the loop below would
	// spin forever pushing empty strings, or silently return nothing. Fail fast.
	const maxChars = Math.floor(requestedMax ?? 1500);
	if (!(maxChars >= 1)) {
		throw new RangeError(`chunkText: maxChars must be a number >= 1 (received ${requestedMax})`);
	}

	// A NaN overlap would turn `start` into NaN and drop everything after the
	// first chunk; a negative one would skip characters between chunks.
	const wholeOverlap = Math.floor(requestedOverlap ?? 150);
	if (Number.isNaN(wholeOverlap)) {
		throw new RangeError(`chunkText: overlap must be a number (received ${requestedOverlap})`);
	}
	// Capping at half the window guarantees every iteration moves `start` forward.
	const overlap = Math.min(Math.max(wholeOverlap, 0), Math.floor(maxChars / 2));

	const trimmed = text.trim();
	if (!trimmed) return [];
	if (trimmed.length <= maxChars) return [trimmed];

	const chunks = [];
	let start = 0;
	while (start < trimmed.length) {
		let end = Math.min(start + maxChars, trimmed.length);
		// Prefer breaking on a space, but only if it keeps more than half the window.
		if (end < trimmed.length) {
			const lastSpace = trimmed.lastIndexOf(" ", end);
			if (lastSpace > start + maxChars / 2) end = lastSpace;
		}
		const chunk = trimmed.slice(start, end).trim();
		if (chunk) chunks.push(chunk);
		if (end >= trimmed.length) break;
		start = end - overlap;
	}
	return chunks;
}
29
+ //#endregion
30
+ export { chunkText as t };
31
+
32
+ //# sourceMappingURL=chunk-BhUZmQg5.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"chunk-BhUZmQg5.js","names":[],"sources":["../src/core/chunk.ts"],"sourcesContent":["// src/core/chunk.ts\n\nexport interface ChunkOptions {\n /** Target maximum characters per chunk. */\n maxChars?: number;\n /** Characters of overlap carried from the end of one chunk into the next. */\n overlap?: number;\n}\n\n/**\n * Split long text into overlapping chunks for embedding/indexing. Each chunk is\n * at most `maxChars`; chunks overlap by `overlap` characters so a query that\n * straddles a boundary still matches. Splits prefer the nearest whitespace\n * before the limit to avoid cutting words mid-token. Short text returns a single\n * trimmed chunk; empty/whitespace-only text returns no chunks.\n */\nexport function chunkText(text: string, options: ChunkOptions = {}): string[] {\n const maxChars = options.maxChars ?? 1500;\n const overlap = Math.min(options.overlap ?? 150, Math.floor(maxChars / 2));\n const trimmed = text.trim();\n if (!trimmed) return [];\n if (trimmed.length <= maxChars) return [trimmed];\n\n const chunks: string[] = [];\n let start = 0;\n while (start < trimmed.length) {\n let end = Math.min(start + maxChars, trimmed.length);\n // Prefer breaking on whitespace, but only if it doesn't shrink the chunk too much.\n if (end < trimmed.length) {\n const lastSpace = trimmed.lastIndexOf(\" \", end);\n if (lastSpace > start + maxChars / 2) end = lastSpace;\n }\n chunks.push(trimmed.slice(start, end).trim());\n if (end >= trimmed.length) break;\n start = end - overlap;\n }\n return chunks.filter((c) => c.length > 
0);\n}\n"],"mappings":";;;;;;;;AAgBA,SAAgB,UAAU,MAAc,UAAwB,CAAC,GAAa;CAC5E,MAAM,WAAW,QAAQ,YAAY;CACrC,MAAM,UAAU,KAAK,IAAI,QAAQ,WAAW,KAAK,KAAK,MAAM,WAAW,CAAC,CAAC;CACzE,MAAM,UAAU,KAAK,KAAK;CAC1B,IAAI,CAAC,SAAS,OAAO,CAAC;CACtB,IAAI,QAAQ,UAAU,UAAU,OAAO,CAAC,OAAO;CAE/C,MAAM,SAAmB,CAAC;CAC1B,IAAI,QAAQ;CACZ,OAAO,QAAQ,QAAQ,QAAQ;EAC7B,IAAI,MAAM,KAAK,IAAI,QAAQ,UAAU,QAAQ,MAAM;EAEnD,IAAI,MAAM,QAAQ,QAAQ;GACxB,MAAM,YAAY,QAAQ,YAAY,KAAK,GAAG;GAC9C,IAAI,YAAY,QAAQ,WAAW,GAAG,MAAM;EAC9C;EACA,OAAO,KAAK,QAAQ,MAAM,OAAO,GAAG,EAAE,KAAK,CAAC;EAC5C,IAAI,OAAO,QAAQ,QAAQ;EAC3B,QAAQ,MAAM;CAChB;CACA,OAAO,OAAO,QAAQ,MAAM,EAAE,SAAS,CAAC;AAC1C"}
@@ -0,0 +1,2 @@
1
+ import { t as chunkText } from "./chunk-BhUZmQg5.js";
2
+ export { chunkText };