pptx-extractor 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md ADDED
@@ -0,0 +1,38 @@
1
+ # pptx-extractor
2
+
3
+ Extract raw text and structured fields from PowerPoint `.pptx` files.
4
+
5
+ ## Features
6
+
7
+ - Reads slide text directly from the PPTX archive
8
+ - Returns per-slide text and text runs
9
+ - Supports caller-defined field extractors
10
+ - Works in Node.js and other ArrayBuffer-capable runtimes
11
+
12
+ ## Install
13
+
14
+ ```bash
15
+ npm install pptx-extractor
16
+ ```
17
+
18
+ ## Usage
19
+
20
+ ```ts
21
+ import { extractSlides, parseSlideFields } from "pptx-extractor";
22
+
23
+ const slides = extractSlides(arrayBuffer);
24
+
25
+ const fields = parseSlideFields(slides[0].slideText, slides[0].textParts, {
26
+ questionId: { pattern: /\b(Q\d+)\b/, group: 1 },
27
+ headline: { strategy: "longest-insight" },
28
+ });
29
+ ```
30
+
31
+ ## Main Exports
32
+
33
+ - `extractSlides(buffer)`
34
+ - `parseSlideFields(slideText, textParts, extractors)`
35
+ - `extractAndParse(buffer, extractors, minTextLength?)`
36
+ - `SURVEY_DECK_EXTRACTORS`
37
+
38
+ Useful for document pipelines, research ingestion, and slide-analysis tools.
package/package.json ADDED
@@ -0,0 +1,28 @@
1
+ {
2
+ "name": "pptx-extractor",
3
+ "version": "1.0.0",
4
+ "description": "Extract per-slide text and structured metadata from PPTX files — generic, configurable, no external API calls",
5
+ "license": "MIT",
6
+ "type": "module",
7
+ "main": "./dist/index.js",
8
+ "types": "./dist/index.d.ts",
9
+ "exports": {
10
+ ".": {
11
+ "import": "./dist/index.js",
12
+ "types": "./dist/index.d.ts"
13
+ }
14
+ },
15
+ "scripts": {
16
+ "build": "tsc",
17
+ "dev": "tsc --watch",
18
+ "typecheck": "tsc --noEmit --pretty false"
19
+ },
20
+ "keywords": ["pptx", "powerpoint", "slides", "extract", "parse", "office", "research"],
21
+ "dependencies": {
22
+ "fflate": "^0.8"
23
+ },
24
+ "devDependencies": {
25
+ "@types/node": "^22.0.0",
26
+ "typescript": "^5"
27
+ }
28
+ }
package/src/index.ts ADDED
@@ -0,0 +1,287 @@
1
+ /**
2
+ * pptx-extractor
3
+ *
4
+ * Extract per-slide text and structured metadata from a PPTX buffer.
5
+ *
6
+ * A PPTX file is a ZIP archive. This package uses fflate to unzip it,
7
+ * parses each `ppt/slides/slideN.xml` for `<a:t>` text run elements,
8
+ * and optionally runs configurable regex extractors over the slide text
9
+ * to pull out structured fields (percentages, IDs, headlines, etc.).
10
+ *
11
+ * Designed for processing research deck slides where each slide contains
12
+ * a consistent pattern of structured data (e.g. survey stats, percentages,
13
+ * labelled values). The extractor patterns are fully caller-configurable —
14
+ * no hardcoded assumptions about your slide format.
15
+ *
16
+ * Works in any runtime that supports ArrayBuffer: Node.js, Cloudflare Workers,
17
+ * Deno, browsers.
18
+ *
19
+ * Usage:
20
+ * import { extractSlides, parseSlideFields } from "pptx-extractor";
21
+ *
22
+ * // Step 1 — extract raw slide text
23
+ * const slides = extractSlides(arrayBuffer);
24
+ * // → [{ slideNum: 1, slideText: "...", textParts: ["...", "..."] }, ...]
25
+ *
26
+ * // Step 2 — parse structured fields from each slide
27
+ * const fields = parseSlideFields(slides[0].slideText, slides[0].textParts, {
28
+ * id: { pattern: /\b(Q\d{1,4})\b/, group: 1 },
29
+ * leaderPct: { pattern: /[Ll]eaders?\s*:?\s*(\d{1,3}(?:\.\d)?)\s*%/, group: 1, asFloat: true },
30
+ * headline: { strategy: "longest-insight" },
31
+ * });
32
+ * // → { id: "Q263", leaderPct: 72.0, headline: "Leaders are 2.3x more likely to..." }
33
+ *
34
+ * // Or use the combined helper:
35
+ * const parsed = extractAndParse(arrayBuffer, myExtractorConfig);
36
+ */
37
+
38
+ import { unzipSync } from "fflate";
39
+
40
+ // ── Types ──────────────────────────────────────────────────────────────────────
41
+
42
+ /** A single extracted slide; `slideNum` is the number parsed from its `ppt/slides/slideN.xml` entry name. */
43
+ export interface Slide {
44
+ slideNum: number;
45
+ /** All `textParts` joined with a single space */
46
+ slideText: string;
47
+ /** Individual <a:t> text run values (trimmed, empty runs dropped), in document order */
48
+ textParts: string[];
49
+ }
50
+
51
+ /** Configuration for a single extracted field: a regex, "longest-insight", or "section-header" extractor. */
52
+ export type FieldExtractorConfig =
53
+ | RegexExtractorConfig
54
+ | HeadlineExtractorConfig
55
+ | SectionExtractorConfig;
56
+
57
+ /** Extract via regex matched against the full slide text. `strategy` may be omitted. `group` defaults to 1. Set `asFloat: true` to run the captured text through `parseFloat`. */
58
+ export interface RegexExtractorConfig {
59
+ strategy?: "regex";
60
+ pattern: RegExp;
61
+ group?: number;
62
+ asFloat?: boolean;
63
+ }
64
+
65
+ /**
66
+ * Find the best "headline" — the longest text part that reads like an
67
+ * insight sentence (contains verbs or comparative language).
68
+ * Falls back to the longest text part overall.
69
+ */
70
+ export interface HeadlineExtractorConfig {
71
+ strategy: "longest-insight";
72
+ /** Minimum character length to consider, inclusive (default 40) */
73
+ minLength?: number;
74
+ /** Maximum character length to consider, inclusive (default 300) */
75
+ maxLength?: number;
76
+ /** Regex that marks a part as insight-like (default: a case-insensitive list of comparative words and verbs such as "likely", "more", "report", "increase") */
77
+ insightPattern?: RegExp;
78
+ /** Parts matching this pattern are skipped (default: blank parts, bare integers, `NN%`, `N.Nx` multipliers, `n = NN` labels) */
79
+ skipPattern?: RegExp;
80
+ }
81
+
82
+ /**
83
+ * Extract a section header — the first text part whose length is within `minLength`..`maxLength` (inclusive; defaults 20 and 200), that is not skipped, and that does not start with a digit.
84
+ */
85
+ export interface SectionExtractorConfig {
86
+ strategy: "section-header";
87
+ minLength?: number;
88
+ maxLength?: number;
89
+ /** Parts matching this pattern are skipped (default: blank parts, bare integers, `NN%`, `N.Nx` multipliers, `n = NN` labels) */
90
+ skipPattern?: RegExp;
91
+ }
92
+
93
+ /** Map of field names to their extractor configurations; each key becomes a key of the parsed result */
94
+ export type ExtractorMap = Record<string, FieldExtractorConfig>;
95
+
96
+ /** Extracted field values keyed by field name; a value is `null` when its extractor produced nothing */
97
+ export type ParsedFields = Record<string, string | number | null>;
98
+
99
+ // ── Core extraction ────────────────────────────────────────────────────────────
100
+
101
/** Matches a slide part inside the archive and captures its number. */
const SLIDE_ENTRY = /^ppt\/slides\/slide(\d+)\.xml$/;

/**
 * Matches one `<a:t>` text run, with or without attributes (for example
 * `xml:space="preserve"`). Requiring `>` or whitespace right after `a:t`
 * keeps elements such as `<a:tab>` or `<a:tbl>` from matching.
 */
const TEXT_RUN = /<a:t(?:\s[^>]*)?>([^<]*)<\/a:t>/g;

/** The five entities predefined by XML. */
const XML_NAMED_ENTITIES: ReadonlyMap<string, string> = new Map([
  ["amp", "&"],
  ["lt", "<"],
  ["gt", ">"],
  ["quot", '"'],
  ["apos", "'"],
]);

/**
 * Replace XML predefined entities and numeric character references with the
 * characters they stand for. Unknown or out-of-range references are left
 * untouched. Decoding happens in a single pass, so `&amp;lt;` becomes `&lt;`
 * rather than `<`.
 */
function decodeXmlEntities(text: string): string {
  return text.replace(
    /&(#x[0-9a-fA-F]+|#[0-9]+|[A-Za-z]+);/g,
    (whole: string, body: string): string => {
      if (body[0] !== "#") {
        return XML_NAMED_ENTITIES.get(body) ?? whole;
      }
      const isHex = body[1] === "x";
      const codePoint = Number.parseInt(body.slice(isHex ? 2 : 1), isHex ? 16 : 10);
      // String.fromCodePoint throws above U+10FFFF, so keep such references verbatim.
      return codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : whole;
    },
  );
}

/**
 * Unzip a PPTX buffer and return the text of every slide.
 *
 * Only archive entries named `ppt/slides/slideN.xml` are read. Each `<a:t>`
 * run is entity-decoded and trimmed; runs that are empty after trimming are
 * dropped. Errors raised by `unzipSync` for an unreadable archive propagate
 * to the caller.
 *
 * NOTE(review): ordering follows the number in the entry name, which is
 * assumed to match presentation order — confirm if reordered decks matter.
 *
 * @param buffer ArrayBuffer of the PPTX file
 * @returns Array of Slide objects, sorted by slide number (ascending)
 */
export function extractSlides(buffer: ArrayBuffer): Slide[] {
  const unzipped = unzipSync(new Uint8Array(buffer));
  const decoder = new TextDecoder();

  return Object.keys(unzipped)
    .flatMap((path) => {
      // Parse the slide number once, while filtering, instead of re-matching later.
      const m = SLIDE_ENTRY.exec(path);
      return m ? [{ path, slideNum: Number.parseInt(m[1], 10) }] : [];
    })
    .sort((a, b) => a.slideNum - b.slideNum)
    .map(({ path, slideNum }) => {
      const xml = decoder.decode(unzipped[path]);
      const textParts = [...xml.matchAll(TEXT_RUN)]
        .map((m) => decodeXmlEntities(m[1]).trim())
        .filter(Boolean);
      return { slideNum, slideText: textParts.join(" "), textParts };
    });
}
129
+
130
+ // ── Field parser ───────────────────────────────────────────────────────────────
131
+
132
/** Default `skipPattern`: blank parts, bare integers, `NN%`, `N.Nx` multipliers and `n = NN` labels. */
const DEFAULT_SKIP = /^\s*$|^\d+%$|^\d+$|^[\d.]+[xX]$|^[nN]\s*=\s*\d+$/;
/** Default `insightPattern`: comparative words and common verbs, matched case-insensitively. */
const DEFAULT_INSIGHT = /\b(likely|more|less|better|higher|lower|greater|tend|report|have|use|deploy|show|find|achieve|enable|reduce|increase)\b/i;

/**
 * Run `pattern` against `text` starting at index 0, regardless of any `g`/`y`
 * flag state left on the RegExp object. With those flags `exec`/`test` resume
 * from `lastIndex`, which would make results depend on earlier calls; resetting
 * it before and after keeps every evaluation independent. Capture groups are
 * always available on the returned match.
 */
function execFromStart(pattern: RegExp, text: string): RegExpExecArray | null {
  pattern.lastIndex = 0;
  const match = pattern.exec(text);
  pattern.lastIndex = 0;
  return match;
}

/** Return the longest string in `parts` (the earliest one on ties), or null when empty. */
function longestPart(parts: readonly string[]): string | null {
  let best: string | null = null;
  for (const part of parts) {
    if (best === null || part.length > best.length) best = part;
  }
  return best;
}

/** Evaluate a single extractor configuration against one slide. */
function extractField(
  slideText: string,
  textParts: string[],
  config: FieldExtractorConfig,
): string | number | null {
  switch (config.strategy) {
    case undefined:
    case "regex": {
      const match = execFromStart(config.pattern, slideText);
      // A missing match, an out-of-range group, or an unmatched optional group all yield null.
      const raw = match?.[config.group ?? 1] ?? null;
      if (raw === null || !config.asFloat) return raw;
      const value = Number.parseFloat(raw);
      // A capture that is not numeric is reported as "nothing extracted" rather than NaN.
      return Number.isNaN(value) ? null : value;
    }

    case "longest-insight": {
      const minLen = config.minLength ?? 40;
      const maxLen = config.maxLength ?? 300;
      const skip = config.skipPattern ?? DEFAULT_SKIP;
      const insightLike = config.insightPattern ?? DEFAULT_INSIGHT;

      const candidates = textParts.filter(
        (t) => t.length >= minLen && t.length <= maxLen && execFromStart(skip, t) === null,
      );
      const insights = candidates.filter((t) => execFromStart(insightLike, t) !== null);
      // Prefer insight-like parts; otherwise fall back to every candidate. Longest wins.
      return longestPart(insights.length > 0 ? insights : candidates);
    }

    case "section-header": {
      const minLen = config.minLength ?? 20;
      const maxLen = config.maxLength ?? 200;
      const skip = config.skipPattern ?? DEFAULT_SKIP;

      const header = textParts.find(
        (t) =>
          t.length >= minLen &&
          t.length <= maxLen &&
          execFromStart(skip, t) === null &&
          !/^\d/.test(t), // titles are not expected to start with a digit
      );
      return header ?? null;
    }

    default:
      // Unknown strategy (only reachable from untyped callers).
      return null;
  }
}

/**
 * Parse structured fields from a single slide's text using caller-supplied
 * extractor configurations.
 *
 * - regex (default strategy): first match of `pattern` in `slideText`; returns
 *   capture `group` (default 1), parsed with `parseFloat` when `asFloat` is
 *   set. Non-numeric captures with `asFloat` yield `null`.
 * - "longest-insight": the longest text part within the length bounds that is
 *   not skipped and matches the insight pattern; falls back to the longest
 *   non-skipped part within bounds.
 * - "section-header": the first text part within the length bounds that is
 *   not skipped and does not start with a digit.
 *
 * Patterns carrying the `g` or `y` flag are evaluated from the start of the
 * string on every use; their `lastIndex` is reset to 0.
 *
 * @param slideText Full slide text (textParts joined with spaces)
 * @param textParts Individual text run values from the slide
 * @param extractors Map of field name → extractor configuration
 * @returns Map of field name → extracted value (string, number, or null)
 */
export function parseSlideFields(
  slideText: string,
  textParts: string[],
  extractors: ExtractorMap,
): ParsedFields {
  const result: ParsedFields = {};

  for (const [field, config] of Object.entries(extractors)) {
    // A null/undefined config can only come from untyped callers; report it as "no value".
    result[field] = config ? extractField(slideText, textParts, config) : null;
  }

  return result;
}
209
+
210
+ // ── Combined helper ────────────────────────────────────────────────────────────
211
+
212
/** A slide together with the fields parsed from its text. */
export interface ParsedSlide extends Slide {
  fields: ParsedFields;
}

/**
 * One-call helper: unzip the PPTX, drop slides whose trimmed text is shorter
 * than `minTextLength` (blank or divider slides), and run the extractors on
 * every remaining slide.
 *
 * @param buffer ArrayBuffer of the PPTX file
 * @param extractors Field extractor configuration
 * @param minTextLength Minimum slide text length to include (default 20)
 * @returns The kept slides, each extended with its parsed `fields`
 */
export function extractAndParse(
  buffer: ArrayBuffer,
  extractors: ExtractorMap,
  minTextLength = 20,
): ParsedSlide[] {
  const parsed: ParsedSlide[] = [];

  for (const slide of extractSlides(buffer)) {
    const hasEnoughText = slide.slideText.trim().length >= minTextLength;
    if (!hasEnoughText) {
      continue;
    }
    const fields = parseSlideFields(slide.slideText, slide.textParts, extractors);
    parsed.push({ ...slide, fields });
  }

  return parsed;
}
237
+
238
+ // ── Pre-built extractor configs ───────────────────────────────────────────────
239
+
240
+ /**
241
+ * Ready-to-use extractor config for survey and benchmark slides that label values as "Leaders: NN%", "Followers: NN%", "N.Nx more likely" and "n = NNN".
242
+ *
243
+ * Extracts: questionId, leaderPct, followerPct, lfMultiplier, sampleN,
244
+ * chartHeadline, slideSection. The four numeric fields set `asFloat` and come back as numbers.
245
+ *
246
+ * Usage:
247
+ * const slides = extractAndParse(buffer, SURVEY_DECK_EXTRACTORS);
248
+ * slides[0].fields.leaderPct // → 72.0
249
+ * slides[0].fields.chartHeadline // → "Leaders are 2.3x more likely to..."
250
+ */
251
+ export const SURVEY_DECK_EXTRACTORS: ExtractorMap = {
252
+ questionId: {
253
+ pattern: /\b(Q\d{1,4})\b/,
254
+ group: 1,
255
+ },
256
+ leaderPct: {
257
+ pattern: /[Ll]eaders?\s*:?\s*(\d{1,3}(?:\.\d)?)\s*%/,
258
+ group: 1,
259
+ asFloat: true,
260
+ },
261
+ followerPct: {
262
+ pattern: /[Ff]ollowers?\s*:?\s*(\d{1,3}(?:\.\d)?)\s*%/,
263
+ group: 1,
264
+ asFloat: true,
265
+ },
266
+ lfMultiplier: {
267
+ pattern: /(\d+(?:\.\d+)?)[xX]\s*more likely/i,
268
+ group: 1,
269
+ asFloat: true,
270
+ },
271
+ sampleN: {
272
+ pattern: /\bn\s*=\s*(\d{2,4})\b/i,
273
+ group: 1,
274
+ asFloat: true,
275
+ },
276
+ chartHeadline: {
277
+ strategy: "longest-insight",
278
+ minLength: 40,
279
+ maxLength: 300,
280
+ skipPattern: /^\s*$|^\d+%$|^[Ll]eaders?$|^[Ff]ollowers?$|^n\s*=|^Source:|^\d+$|^[\d.]+[xX]$/,
281
+ },
282
+ slideSection: {
283
+ strategy: "section-header",
284
+ minLength: 20,
285
+ maxLength: 200,
286
+ },
287
+ };
package/tsconfig.json ADDED
@@ -0,0 +1,14 @@
1
+ {
2
+ "compilerOptions": {
3
+ "target": "ES2022",
4
+ "module": "NodeNext",
5
+ "moduleResolution": "NodeNext",
6
+ "outDir": "./dist",
7
+ "declaration": true,
8
+ "declarationMap": true,
9
+ "sourceMap": true,
10
+ "strict": true,
11
+ "esModuleInterop": true
12
+ },
13
+ "include": ["src"]
14
+ }