pptx-browser 4.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/extract.js ADDED
@@ -0,0 +1,535 @@
1
+ /**
2
+ * extract.js — Structured text extraction from PPTX slides.
3
+ *
4
+ * Extracts all readable text content for:
5
+ * - Search indexing
6
+ * - Accessibility (screen readers, alt text generation)
7
+ * - Copy/paste of slide content
8
+ * - Generating slide outlines / summaries
9
+ * - Translation pipelines
10
+ *
11
+ * Public API (re-exported from index.js):
12
+ *
13
+ * renderer.extractSlide(slideIndex) → SlideContent
14
+ * renderer.extractAll() → SlideContent[]
15
+ * renderer.extractText(slideIndex) → string (plain text, all content)
16
+ * renderer.searchSlides(query) → SearchResult[]
17
+ *
18
+ * @typedef {object} TextRun
19
+ * @property {string} text
20
+ * @property {boolean} bold
21
+ * @property {boolean} italic
22
+ * @property {boolean} underline
23
+ * @property {number} fontSize pt
24
+ * @property {string} color CSS colour string
25
+ *
26
+ * @typedef {object} Paragraph
27
+ * @property {TextRun[]} runs
28
+ * @property {string} text — joined plain text
29
+ * @property {string} align l | ctr | r | just
30
+ * @property {number} level indent level 0–8
31
+ * @property {string|null} bullet bullet char, number string, or null
32
+ *
33
+ * @typedef {object} TextShape
34
+ * @property {string} id
35
+ * @property {string} name
36
+ * @property {string} type title | subtitle | body | textBox | other
37
+ * @property {Paragraph[]} paragraphs
38
+ * @property {string} text plain text
39
+ *
40
+ * @typedef {object} TableCell
41
+ * @property {number} row
42
+ * @property {number} col
43
+ * @property {number} rowSpan
44
+ * @property {number} colSpan
45
+ * @property {string} text
46
+ * @property {Paragraph[]} paragraphs
47
+ *
48
+ * @typedef {object} TableShape
49
+ * @property {string} id
50
+ * @property {string} name
51
+ * @property {TableCell[][]} rows
52
+ * @property {string} text all cell text joined with tabs/newlines
53
+ *
54
+ * @typedef {object} ImageShape
55
+ * @property {string} id
56
+ * @property {string} name
57
+ * @property {string} altText
58
+ * @property {string} title
59
+ *
60
+ * @typedef {object} ChartShape
61
+ * @property {string} id
62
+ * @property {string} name
63
+ * @property {string} chartType
64
+ * @property {string[]} seriesNames
65
+ * @property {string[]} categories
66
+ *
67
+ * @typedef {object} SlideContent
68
+ * @property {number} index
69
+ * @property {string} title — first title shape text
70
+ * @property {string} subtitle — first subtitle shape text
71
+ * @property {TextShape[]} textShapes
72
+ * @property {TableShape[]} tables
73
+ * @property {ImageShape[]} images
74
+ * @property {ChartShape[]} charts
75
+ * @property {string} notes — speaker notes plain text
76
+ * @property {string} text — all text joined, for full-text search
77
+ */
78
+
79
+ // ── XML helpers ───────────────────────────────────────────────────────────────
80
+
81
/**
 * Find the first descendant element of `node` whose local name (tag name
 * without namespace prefix) equals `name`.
 *
 * @param {Element|Document|null} node  root of the subtree to search
 * @param {string} name                 local name to look for
 * @returns {Element|null} the first match in the order produced by
 *   `getElementsByTagName('*')`, or null when `node` is falsy or nothing matches
 */
function g1(node, name) {
  if (!node) return null;
  const descendants = node.getElementsByTagName('*');
  const hit = Array.prototype.find.call(descendants, (el) => el.localName === name);
  return hit ?? null;
}
87
/**
 * Collect every descendant element of `node` whose local name equals `name`.
 *
 * @param {Element|Document|null} node  root of the subtree to search
 * @param {string} name                 local name to look for
 * @returns {Element[]} a fresh array of matches (empty when `node` is falsy)
 */
function gtn(node, name) {
  if (!node) return [];
  const descendants = node.getElementsByTagName('*');
  return Array.prototype.filter.call(descendants, (el) => el.localName === name);
}
94
/**
 * Read an attribute from an element, substituting a default when the element
 * is missing or `getAttribute` reports null.
 *
 * @param {Element|null} el   element to read from
 * @param {string} name       attribute name
 * @param {*} [def=null]      value returned when the attribute is unavailable
 * @returns {*} the attribute value, or `def`
 */
function attr(el, name, def = null) {
  const value = el ? el.getAttribute(name) : null;
  return value === null ? def : value;
}
99
/**
 * Read an attribute and parse it as a base-10 integer.
 *
 * Falls back to `def` when the element is missing, the attribute is absent,
 * or the value does not start with a number. Previously a non-numeric value
 * leaked `NaN` to callers.
 *
 * @param {Element|null} el  element to read from
 * @param {string} name      attribute name
 * @param {number} [def=0]   fallback value
 * @returns {number} the parsed integer, or `def`
 */
function attrInt(el, name, def = 0) {
  const raw = el ? el.getAttribute(name) : null;
  if (raw === null || raw === undefined) return def;
  const parsed = Number.parseInt(raw, 10);
  return Number.isNaN(parsed) ? def : parsed;
}
103
+
104
+ // ── Text run extraction ───────────────────────────────────────────────────────
105
+
106
/**
 * Convert one run element into a TextRun.
 *
 * Changes from the previous version:
 *  - the font size falls back to `defRPr` when the run has an `rPr` element
 *    that carries no usable `sz` (it used to drop straight to 12pt);
 *  - `b` / `i` accept both spellings of a true XML boolean, `'1'` and `'true'`;
 *  - the `lumMod` colour fallback is gone: it only ever looked for an
 *    `srgbClr` inside a `lumMod` element, which never produced a colour.
 *
 * @param {Element} rEl            run element; must contain a `t` descendant
 * @param {Element|null} defRPr    paragraph-level default run properties
 * @returns {TextRun|null} null when the run has no `t` element
 */
function extractRun(rEl, defRPr) {
  const tEl = g1(rEl, 't');
  if (!tEl) return null;

  const rPr = g1(rEl, 'rPr');
  const text = tEl.textContent || '';

  // `sz` is divided by 100 to get points; a missing or unparsable value is
  // treated as "not set" so the next source is consulted.
  const rawSize = (el) => (el ? Number.parseInt(el.getAttribute('sz') || '0', 10) : 0);
  const szRaw = rawSize(rPr) || rawSize(defRPr);
  const fontSize = szRaw ? szRaw / 100 : 12; // pt

  const isOn = (name) => {
    const v = rPr ? rPr.getAttribute(name) : null;
    return v === '1' || v === 'true';
  };
  const underlineKind = rPr ? (rPr.getAttribute('u') || 'none') : 'none';

  // Colour: only an explicit sRGB solid fill is resolved; anything else
  // keeps the black default.
  let color = '#000000';
  const srgb = g1(g1(rPr, 'solidFill'), 'srgbClr');
  if (srgb) color = '#' + (srgb.getAttribute('val') || '000000');

  return {
    text,
    bold: isOn('b'),
    italic: isOn('i'),
    underline: underlineKind !== 'none',
    fontSize,
    color,
  };
}
132
+
133
/**
 * Convert one paragraph element into a Paragraph record.
 *
 * Only direct children are turned into runs: `r` goes through `extractRun`,
 * `br` becomes a synthetic "\n" run, and `fld` becomes a run holding the
 * text of the field's `t` element. Every other child is ignored.
 *
 * @param {Element} paraEl  paragraph element
 * @returns {Paragraph} `bullet` is the `buChar` character (or '•' when that
 *   attribute is empty), the literal '{auto}' for `buAutoNum`, or null
 */
function extractParagraph(paraEl) {
  const props = g1(paraEl, 'pPr');
  const fallbackRPr = g1(props, 'defRPr');

  // Bullet marker: only looked up when paragraph properties exist and do not
  // contain `buNone`.
  let bullet = null;
  const bulletsAllowed = Boolean(props) && !g1(props, 'buNone');
  if (bulletsAllowed) {
    const charEl = g1(props, 'buChar');
    if (charEl) {
      bullet = charEl.getAttribute('char') || '•';
    } else if (g1(props, 'buAutoNum')) {
      bullet = '{auto}'; // caller can format
    }
  }

  // Synthetic runs (line breaks, fields) share fixed formatting values.
  const syntheticRun = (text, color) => ({
    text, bold: false, italic: false, underline: false, fontSize: 12, color,
  });

  const runs = [];
  for (const child of paraEl.children) {
    switch (child.localName) {
      case 'r': {
        const run = extractRun(child, fallbackRPr);
        if (run) runs.push(run);
        break;
      }
      case 'br':
        runs.push(syntheticRun('\n', '#000'));
        break;
      case 'fld': {
        // Field (slide number, date, etc.)
        const fieldText = g1(child, 't');
        if (fieldText) runs.push(syntheticRun(fieldText.textContent, '#555'));
        break;
      }
      default:
        break;
    }
  }

  return {
    runs,
    text: runs.map((run) => run.text).join(''),
    align: attr(props, 'algn', 'l'),
    level: attrInt(props, 'lvl', 0),
    bullet,
  };
}
165
+
166
/**
 * Extract every paragraph found under a text body, dropping paragraphs whose
 * text is empty or whitespace-only.
 *
 * @param {Element|null} txBody  text body element
 * @returns {Paragraph[]} non-blank paragraphs (empty when `txBody` is falsy)
 */
function extractTextBody(txBody) {
  if (!txBody) return [];
  const kept = [];
  for (const paraEl of gtn(txBody, 'p')) {
    const para = extractParagraph(paraEl);
    if (para.text.trim()) kept.push(para);
  }
  return kept;
}
170
+
171
+ // ── Shape type detection ──────────────────────────────────────────────────────
172
+
173
/**
 * Classify a shape element as 'title', 'subtitle', 'body', 'textBox' or 'other'.
 *
 * A placeholder (`nvSpPr` → `nvPr` → `ph`) is consulted first; a placeholder
 * with no `type` attribute is treated as 'body'. Placeholder types other than
 * title / ctrTitle / subTitle / body fall through to the text-body check.
 *
 * @param {Element} spEl  shape element
 * @returns {string} the detected shape type
 */
function detectShapeType(spEl) {
  const placeholder = g1(g1(g1(spEl, 'nvSpPr'), 'nvPr'), 'ph');

  if (placeholder) {
    switch (attr(placeholder, 'type', 'body')) {
      case 'title':
      case 'ctrTitle':
        return 'title';
      case 'subTitle':
        return 'subtitle';
      case 'body':
        return 'body';
      default:
        break;
    }
  }

  // Anything else that carries a text body counts as a text box.
  return g1(spEl, 'txBody') ? 'textBox' : 'other';
}
192
+
193
/**
 * Read the `id` attribute of a shape's `cNvPr` element (found under `nvSpPr`).
 *
 * @param {Element} spEl  shape element
 * @returns {string} the id, or '' when the element or attribute is missing
 */
function getShapeId(spEl) {
  const props = g1(g1(spEl, 'nvSpPr'), 'cNvPr');
  if (!props) return '';
  return attr(props, 'id', '');
}
198
+
199
/**
 * Read the `name` attribute of a shape's `cNvPr` element (found under `nvSpPr`).
 *
 * @param {Element} spEl  shape element
 * @returns {string} the name, or '' when the element or attribute is missing
 */
function getShapeName(spEl) {
  const props = g1(g1(spEl, 'nvSpPr'), 'cNvPr');
  if (!props) return '';
  return attr(props, 'name', '');
}
204
+
205
/**
 * Look up a textual description for a shape.
 *
 * The extension list under `nvSpPr` → `nvPr` → `extLst` is checked first: the
 * first `ext` holding an `lpUserStr` or `ud` element wins, returning its
 * `val` attribute, else its text content, else ''. Without such an extension
 * the `descr` attribute of `cNvPr` is used, then its `title` attribute.
 *
 * @param {Element} spEl  shape element
 * @returns {string} the description, or '' when none is found
 */
function getAltText(spEl) {
  const nonVisual = g1(spEl, 'nvSpPr');
  const extList = g1(g1(nonVisual, 'nvPr'), 'extLst');

  const described = gtn(extList, 'ext')
    .map((ext) => g1(ext, 'lpUserStr') || g1(ext, 'ud'))
    .find(Boolean);
  if (described) {
    return described.getAttribute('val') || described.textContent || '';
  }

  // Fall back to the descr / title attributes on cNvPr.
  const props = g1(nonVisual, 'cNvPr');
  if (!props) return '';
  return attr(props, 'descr', '') || attr(props, 'title', '');
}
219
+
220
+ // ── Table extraction ──────────────────────────────────────────────────────────
221
+
222
/**
 * Extract a table from a graphic frame.
 *
 * Each `tr` found under the `tbl` element becomes one row and each `tc` under
 * that row becomes one cell; `row` / `col` are the zero-based positions of the
 * element within those lists. Cell text joins the cell's paragraphs with
 * "\n"; table text joins cells with "\t" and rows with "\n".
 *
 * @param {Element} graphicFrame  graphic frame element
 * @returns {TableShape|null} null when the frame holds no `tbl` element
 */
function extractTable(graphicFrame) {
  const tbl = g1(graphicFrame, 'tbl');
  if (!tbl) return null;

  const props = g1(g1(graphicFrame, 'nvGraphicFramePr'), 'cNvPr');

  const rows = gtn(tbl, 'tr').map((rowEl, rowIndex) =>
    gtn(rowEl, 'tc').map((cellEl, colIndex) => {
      const colSpan = attrInt(cellEl, 'gridSpan', 1);
      const rowSpan = attrInt(cellEl, 'rowSpan', 1);
      const paragraphs = extractTextBody(g1(cellEl, 'txBody'));
      return {
        row: rowIndex,
        col: colIndex,
        rowSpan,
        colSpan,
        text: paragraphs.map((para) => para.text).join('\n'),
        paragraphs,
      };
    })
  );

  return {
    id: props ? attr(props, 'id', '') : '',
    name: props ? attr(props, 'name', '') : '',
    rows,
    text: rows.map((cells) => cells.map((cell) => cell.text).join('\t')).join('\n'),
  };
}
251
+
252
+ // ── Chart extraction ──────────────────────────────────────────────────────────
253
+
254
/**
 * Read the identifying data of a chart embedded in a graphic frame.
 *
 * The chart element is located via `graphic` → `graphicData` → `chart`; its
 * relationship id is read from the `r:id` attribute, falling back to `id`.
 *
 * @param {Element} graphicFrame  graphic frame element
 * @param {object} slideRels      accepted for call-site symmetry; not read here
 * @returns {{id: string, name: string, rId: (string|null)}|null} null when the
 *   frame contains no chart element
 */
function extractChartRef(graphicFrame, slideRels) {
  const chartEl = g1(g1(g1(graphicFrame, 'graphic'), 'graphicData'), 'chart');
  if (!chartEl) return null;

  const props = g1(g1(graphicFrame, 'nvGraphicFramePr'), 'cNvPr');
  const rId = chartEl.getAttribute('r:id') || chartEl.getAttribute('id');
  const id = props ? attr(props, 'id', '') : '';
  const name = props ? attr(props, 'name', '') : '';

  return { id, name, rId };
}
270
+
271
/**
 * Summarise a parsed chart document.
 *
 * The chart type is the first tag from a fixed list that occurs under
 * `plotArea`, with the 'Chart' / '3DChart' suffix removed (so 'bar3DChart'
 * becomes 'bar'). Series names are the trimmed text of the first `v` element
 * under each series' `tx`. Categories are the `pt` → `v` values cached under
 * the first series' `cat` (or `xVal`) element. Only series of the matched
 * chart element are considered.
 *
 * @param {Document|null} chartDoc  parsed chart XML
 * @returns {{chartType: string, seriesNames: string[], categories: string[]}}
 *   chartType is 'unknown' when there is no document, no plot area, or no
 *   recognised chart element
 */
function extractChartContent(chartDoc) {
  const plotArea = chartDoc ? g1(chartDoc, 'plotArea') : null;
  if (!plotArea) return { chartType: 'unknown', seriesNames: [], categories: [] };

  const KNOWN_CHART_TAGS = [
    'barChart', 'lineChart', 'pieChart', 'areaChart', 'scatterChart',
    'doughnutChart', 'radarChart', 'bubbleChart', 'bar3DChart',
    'line3DChart', 'pie3DChart', 'area3DChart',
  ];
  const matchedTag = KNOWN_CHART_TAGS.find((tag) => g1(plotArea, tag));
  const chartNode = matchedTag ? g1(plotArea, matchedTag) : null;
  const chartType = matchedTag
    ? matchedTag.replace('3DChart', 'Chart').replace('Chart', '')
    : 'unknown';

  const seriesEls = chartNode ? gtn(chartNode, 'ser') : [];

  // The descendant search for `v` already reaches cached string points, so
  // no separate strCache lookup is needed.
  const seriesNames = [];
  for (const ser of seriesEls) {
    const valueEl = g1(g1(ser, 'tx'), 'v');
    const label = valueEl ? valueEl.textContent.trim() : null;
    if (label) seriesNames.push(label);
  }

  // Categories from first series
  let categories = [];
  if (seriesEls.length > 0) {
    const firstSeries = seriesEls[0];
    const catEl = g1(firstSeries, 'cat') || g1(firstSeries, 'xVal');
    const cache = catEl ? (g1(catEl, 'strCache') || g1(catEl, 'numCache')) : null;
    if (cache) {
      categories = gtn(cache, 'pt')
        .map((pt) => g1(pt, 'v')?.textContent || '')
        .filter(Boolean);
    }
  }

  return { chartType, seriesNames, categories };
}
310
+
311
+ // ── Notes extraction ──────────────────────────────────────────────────────────
312
+
313
/**
 * Extract speaker-notes text from a parsed notes-slide document.
 *
 * Every shape under `cSld` → `spTree` that has a text body contributes one
 * block, except slide-number placeholders. Within a block, the runs of each
 * paragraph are concatenated and paragraphs are separated by "\n".
 * Previously all runs of a shape were joined with no separator, fusing the
 * last word of one paragraph with the first word of the next. Blocks are
 * separated by a blank line.
 *
 * @param {Document|null} notesDoc  parsed notes-slide XML
 * @returns {string} plain text, '' when there is nothing to extract
 */
function extractNotes(notesDoc) {
  if (!notesDoc) return '';
  const spTree = g1(g1(notesDoc, 'cSld'), 'spTree');
  if (!spTree) return '';

  const runText = (container) =>
    gtn(container, 'r')
      .map((r) => g1(r, 't')?.textContent || '')
      .join('');

  const parts = [];
  for (const spEl of gtn(spTree, 'sp')) {
    const ph = g1(g1(g1(spEl, 'nvSpPr'), 'nvPr'), 'ph');
    // Skip slide number placeholders
    if (ph && attr(ph, 'type') === 'sldNum') continue;

    const txBody = g1(spEl, 'txBody');
    if (!txBody) continue;

    const paraEls = gtn(txBody, 'p');
    // A body without paragraph elements is read as one block of runs.
    const text = (paraEls.length > 0 ? paraEls.map(runText).join('\n') : runText(txBody)).trim();
    if (text) parts.push(text);
  }
  return parts.join('\n\n');
}
336
+
337
+ // ── Main extractor ─────────────────────────────────────────────────────────────
338
+
339
/**
 * Decode raw XML bytes and parse them into a DOM document.
 * @param {BufferSource} bytes
 * @returns {Document}
 */
function parseXmlBytes(bytes) {
  return new DOMParser().parseFromString(new TextDecoder().decode(bytes), 'application/xml');
}

/**
 * Append a TextShape for `spEl` to `textShapes` when the shape has a text
 * body containing non-blank text; otherwise leave the list untouched.
 * @param {Element} spEl
 * @param {TextShape[]} textShapes  list to append to
 */
function collectTextShape(spEl, textShapes) {
  const txBody = g1(spEl, 'txBody');
  if (!txBody) return;
  const type = detectShapeType(spEl);
  const id = getShapeId(spEl);
  const name = getShapeName(spEl);
  const paragraphs = extractTextBody(txBody);
  const text = paragraphs.map((p) => p.text).join('\n');
  if (text.trim()) textShapes.push({ id, name, type, paragraphs, text });
}

/**
 * Extract all text content from a single slide.
 *
 * Walks the direct children of the slide's shape tree: `sp` shapes become
 * text shapes, `pic` elements become images, `graphicFrame` elements become
 * tables or charts, and `grpSp` groups contribute the text of every `sp`
 * nested inside them (pictures, tables and charts inside groups are not
 * collected). Speaker notes are read from the slide's notesSlide relationship.
 *
 * @param {number} slideIndex  zero-based slide index
 * @param {object} renderer    loaded PptxRenderer instance; `_files` and
 *   `slidePaths` are read from it
 * @returns {Promise<SlideContent>} an empty SlideContent when the slide's
 *   file is missing from `_files`
 * @throws {Error} when `slideIndex` is negative or not below `slidePaths.length`
 */
export async function extractSlide(slideIndex, renderer) {
  const { _files: files, slidePaths } = renderer;
  if (slideIndex < 0 || slideIndex >= slidePaths.length) {
    throw new Error('Slide index out of range');
  }

  const slidePath = slidePaths[slideIndex];
  const slideXml = files[slidePath];
  if (!slideXml) return emptySlide(slideIndex);

  const slideDoc = parseXmlBytes(slideXml);

  // Slide rels (for charts, notes)
  const { getRels } = await import('./render.js');
  const slideRels = await getRels(files, slidePath);

  // Notes
  const notesRel = Object.values(slideRels).find((r) => r.type?.includes('notesSlide'));
  const notesBytes = notesRel ? files[notesRel.fullPath] : null;
  const notes = notesBytes ? extractNotes(parseXmlBytes(notesBytes)) : '';

  const spTree = g1(g1(slideDoc, 'cSld'), 'spTree');
  if (!spTree) return { ...emptySlide(slideIndex), notes, text: notes };

  const textShapes = [];
  const tables = [];
  const images = [];
  const charts = [];

  for (const child of spTree.children) {
    const ln = child.localName;

    if (ln === 'sp') {
      collectTextShape(child, textShapes);
    } else if (ln === 'pic') {
      const cNvPr = g1(g1(child, 'nvPicPr'), 'cNvPr');
      const id = cNvPr ? attr(cNvPr, 'id', '') : '';
      const name = cNvPr ? attr(cNvPr, 'name', '') : '';
      const altText = cNvPr ? (attr(cNvPr, 'descr', '') || attr(cNvPr, 'title', '')) : '';
      // NOTE(review): `title` mirrors the shape name, not the cNvPr `title`
      // attribute — kept as-is so existing consumers see the same value.
      images.push({ id, name, altText, title: name });
    } else if (ln === 'graphicFrame') {
      const graphicData = g1(g1(child, 'graphic'), 'graphicData');
      const uri = graphicData ? attr(graphicData, 'uri', '') : '';

      if (uri.includes('table') || g1(child, 'tbl')) {
        const table = extractTable(child);
        if (table) tables.push(table);
      } else if (uri.includes('chart')) {
        const ref = extractChartRef(child, slideRels);
        if (ref) {
          const rel = ref.rId ? slideRels[ref.rId] : null;
          // Placeholder summary used when the chart part cannot be loaded.
          let chartContent = { chartType: 'chart', seriesNames: [], categories: [] };
          if (rel && files[rel.fullPath]) {
            chartContent = extractChartContent(parseXmlBytes(files[rel.fullPath]));
          }
          charts.push({ id: ref.id, name: ref.name, ...chartContent });
        }
      }
    } else if (ln === 'grpSp') {
      // Text of every shape nested anywhere inside the group.
      for (const spEl of gtn(child, 'sp')) collectTextShape(spEl, textShapes);
    }
  }

  // Derive title and subtitle
  const title = textShapes.find((s) => s.type === 'title')?.text || '';
  const subtitle = textShapes.find((s) => s.type === 'subtitle')?.text || '';

  // Full text blob: title and subtitle first, then the remaining content.
  const allText = [
    title,
    subtitle,
    ...textShapes.filter((s) => s.type !== 'title' && s.type !== 'subtitle').map((s) => s.text),
    ...tables.map((t) => t.text),
    ...charts.map((c) => [c.name, ...c.seriesNames, ...c.categories].join(' ')),
    notes,
  ].filter(Boolean).join('\n\n');

  return { index: slideIndex, title, subtitle, textShapes, tables, images, charts, notes, text: allText };
}
465
+
466
/**
 * Build a SlideContent record with no content for the given slide index.
 *
 * @param {number} index  slide index to stamp on the record
 * @returns {SlideContent} a fresh record with empty strings and empty arrays
 */
function emptySlide(index) {
  const blank = {
    index,
    title: '',
    subtitle: '',
    textShapes: [],
    tables: [],
    images: [],
    charts: [],
    notes: '',
    text: '',
  };
  return blank;
}
469
+
470
/**
 * Extract the content of every slide, one after another, in slide order.
 *
 * @param {object} renderer  loaded renderer; `slideCount` gives the number of slides
 * @returns {Promise<SlideContent[]>} one entry per slide, indexed by slide number
 */
export async function extractAll(renderer) {
  const slides = [];
  let index = 0;
  while (index < renderer.slideCount) {
    slides.push(await extractSlide(index, renderer));
    index += 1;
  }
  return slides;
}
482
+
483
/**
 * Return the full plain-text blob of one slide (the `text` field of its
 * extracted content).
 *
 * @param {number} slideIndex  zero-based slide index
 * @param {object} renderer    loaded renderer instance
 * @returns {Promise<string>}
 */
export async function extractText(slideIndex, renderer) {
  const { text } = await extractSlide(slideIndex, renderer);
  return text;
}
493
+
494
/**
 * Lower-case a string one code point at a time, leaving any code point whose
 * lower-case form has a different length unchanged. The result therefore has
 * exactly the same length as the input, so match offsets found in it can be
 * used to slice the original text.
 * @param {string} s
 * @returns {string}
 */
function foldCasePreservingLength(s) {
  let out = '';
  for (const ch of s) {
    const lower = ch.toLowerCase();
    out += lower.length === ch.length ? lower : ch;
  }
  return out;
}

/**
 * @typedef {object} SearchExcerpt
 * @property {string} before  up to 60 characters preceding the match ('…'-prefixed when cut)
 * @property {string} match   the matched text as it appears on the slide
 * @property {string} after   up to 60 characters following the match ('…'-suffixed when cut)
 *
 * @typedef {object} SearchResult
 * @property {number} slideIndex
 * @property {string} title
 * @property {number} score     number of excerpts, plus 10 when the title matches
 * @property {SearchExcerpt[]} excerpts  at most 3 per slide
 */

/**
 * Full-text search across all slides.
 * Case-insensitive, returns slide index + matching excerpts, best score first.
 *
 * A non-string or blank query yields an empty result instead of throwing.
 * Case folding is length-preserving, so excerpt offsets always line up with
 * the original slide text (plain `toLowerCase()` can lengthen a string).
 *
 * @param {string} query
 * @param {object} renderer  loaded renderer; `slideCount` gives the number of slides
 * @returns {Promise<SearchResult[]>}
 */
export async function searchSlides(query, renderer) {
  if (typeof query !== 'string') return [];
  const q = foldCasePreservingLength(query).trim();
  if (!q) return [];

  const CONTEXT = 60; // characters of context kept on each side of a match
  const MAX_EXCERPTS = 3; // max 3 excerpts per slide

  const results = [];
  for (let i = 0; i < renderer.slideCount; i++) {
    const content = await extractSlide(i, renderer);
    const source = content.text;
    const haystack = foldCasePreservingLength(source);
    if (!haystack.includes(q)) continue;

    // Find match positions for excerpt generation
    const excerpts = [];
    let pos = 0;
    while ((pos = haystack.indexOf(q, pos)) !== -1) {
      const matchEnd = pos + q.length;
      const start = Math.max(0, pos - CONTEXT);
      const end = Math.min(source.length, matchEnd + CONTEXT);
      excerpts.push({
        before: (start > 0 ? '…' : '') + source.slice(start, pos),
        match: source.slice(pos, matchEnd),
        after: source.slice(matchEnd, end) + (end < source.length ? '…' : ''),
      });
      pos = matchEnd;
      if (excerpts.length >= MAX_EXCERPTS) break;
    }

    const titleHit = foldCasePreservingLength(content.title).includes(q);
    results.push({
      slideIndex: i,
      title: content.title,
      score: excerpts.length + (titleHit ? 10 : 0),
      excerpts,
    });
  }

  return results.sort((a, b) => b.score - a.score);
}