@roj-ai/sdk 0.1.16 → 0.1.17

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32) hide show
  1. package/dist/bootstrap.d.ts.map +1 -1
  2. package/dist/bootstrap.js +12 -2
  3. package/dist/bootstrap.js.map +1 -1
  4. package/dist/core/image/types.d.ts +2 -0
  5. package/dist/core/image/types.d.ts.map +1 -1
  6. package/dist/core/image/vips-resizer.d.ts.map +1 -1
  7. package/dist/core/image/vips-resizer.js +12 -11
  8. package/dist/core/image/vips-resizer.js.map +1 -1
  9. package/dist/plugins/uploads/preprocessors/image-classifier.d.ts +20 -0
  10. package/dist/plugins/uploads/preprocessors/image-classifier.d.ts.map +1 -1
  11. package/dist/plugins/uploads/preprocessors/image-classifier.js +78 -26
  12. package/dist/plugins/uploads/preprocessors/image-classifier.js.map +1 -1
  13. package/dist/plugins/uploads/preprocessors/index.d.ts +1 -0
  14. package/dist/plugins/uploads/preprocessors/index.d.ts.map +1 -1
  15. package/dist/plugins/uploads/preprocessors/index.js +1 -0
  16. package/dist/plugins/uploads/preprocessors/index.js.map +1 -1
  17. package/dist/plugins/uploads/preprocessors/markitdown-preprocessor.d.ts +52 -5
  18. package/dist/plugins/uploads/preprocessors/markitdown-preprocessor.d.ts.map +1 -1
  19. package/dist/plugins/uploads/preprocessors/markitdown-preprocessor.js +152 -97
  20. package/dist/plugins/uploads/preprocessors/markitdown-preprocessor.js.map +1 -1
  21. package/dist/plugins/uploads/preprocessors/pdf-preprocessor.d.ts +71 -0
  22. package/dist/plugins/uploads/preprocessors/pdf-preprocessor.d.ts.map +1 -0
  23. package/dist/plugins/uploads/preprocessors/pdf-preprocessor.js +274 -0
  24. package/dist/plugins/uploads/preprocessors/pdf-preprocessor.js.map +1 -0
  25. package/package.json +2 -2
  26. package/src/bootstrap.ts +12 -2
  27. package/src/core/image/types.ts +2 -0
  28. package/src/core/image/vips-resizer.ts +12 -11
  29. package/src/plugins/uploads/preprocessors/image-classifier.ts +93 -27
  30. package/src/plugins/uploads/preprocessors/index.ts +1 -0
  31. package/src/plugins/uploads/preprocessors/markitdown-preprocessor.ts +173 -108
  32. package/src/plugins/uploads/preprocessors/pdf-preprocessor.ts +342 -0
@@ -2,11 +2,14 @@
2
2
  * Markitdown Preprocessor
3
3
  *
4
4
  * Converts documents to markdown using Microsoft's markitdown CLI.
5
- * Supports PDF, DOCX, XLSX, PPTX, HTML, CSV, JSON, XML, EPUB, and more.
5
+ * Supports DOCX, XLSX, PPTX, HTML, CSV, JSON, XML, EPUB, RTF, ODT.
6
+ *
7
+ * PDFs are handled by `PdfPreprocessor` instead — markitdown's PDF backend
8
+ * (pdfminer.six) is ~20× slower than pdftotext for no real gain on the
9
+ * mostly-unstructured PDFs we see in practice.
6
10
  *
7
11
  * Image extraction:
8
- * - PDF: uses pdfimages (poppler-utils)
9
- * - DOCX/ODT/EPUB: uses pandoc --extract-media
12
+ * - DOCX/ODT/EPUB: uses pandoc --extract-media (runs in parallel with markitdown)
10
13
  *
11
14
  * Extracted images are classified via the image classifier preprocessor.
12
15
  * Full content is written to disk; extractedContent contains a structured manifest.
@@ -14,20 +17,31 @@
14
17
  import { dirname } from 'node:path';
15
18
  import { mapWithConcurrency } from '../../../lib/utils/concurrency.js';
16
19
  import { Err, Ok } from '../../../lib/utils/result.js';
17
- const MAX_IMAGES = 50;
20
+ const MAX_IMAGES = 20;
18
21
  const IMAGE_CLASSIFY_CONCURRENCY = 10;
19
- // markitdown converts a text-only document; even large PDFs finish in seconds.
22
+ /**
23
+ * Density filter for extracted images. Bytes-per-pixel ratio below this
24
+ * threshold typically means the image is an alpha mask, overlay layer, or
25
+ * essentially-empty region — not worth a vision call.
26
+ *
27
+ * Empirical reference points:
28
+ * - Dense photo JPEG: 0.3–1.0 B/px
29
+ * - Logo / icon PNG: 0.1–0.5 B/px
30
+ * - Brand PDF layer mask: <0.005 B/px
31
+ */
32
+ export const MIN_IMAGE_DENSITY_BYTES_PER_PX = 0.05;
33
+ export const MIN_IMAGE_PIXELS = 50 * 50;
34
+ // markitdown converts a text-only document; even large files finish in seconds.
20
35
  const MARKITDOWN_TIMEOUT_MS = 60_000;
21
- // Image extractors (pdfimages, pandoc --extract-media) scale with image count
22
- // and resolution. Real-world large brand PDFs (40 pages, 5MB images) can take
23
- // 60–90s. Upload preprocessing is async/background, so allow generous headroom.
36
+ // Image extractors (pandoc --extract-media) scale with image count
37
+ // and resolution. Real-world large docs can take 60–90s. Upload preprocessing
38
+ // is async/background, so allow generous headroom.
24
39
  const IMAGE_EXTRACT_TIMEOUT_MS = 5 * 60_000;
25
40
  function makeExec(processRunner) {
26
41
  return (cmd, args, timeoutMs = MARKITDOWN_TIMEOUT_MS) => processRunner.execFile(cmd, args, { timeout: timeoutMs, maxBuffer: 50 * 1024 * 1024 });
27
42
  }
28
- /** MIME types where markitdown converts to markdown (non-ZIP, non-image) */
43
+ /** MIME types where markitdown converts to markdown (non-ZIP, non-image, non-PDF) */
29
44
  const SUPPORTED_MIME_TYPES = [
30
- 'application/pdf',
31
45
  'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
32
46
  'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
33
47
  'application/vnd.openxmlformats-officedocument.presentationml.presentation',
@@ -52,37 +66,66 @@ const PANDOC_FORMAT_MAP = {
52
66
  'application/vnd.oasis.opendocument.text': 'odt',
53
67
  'application/epub+zip': 'epub',
54
68
  };
55
- /** MIME types where pdfimages can extract images */
56
- const PDFIMAGES_MIMES = new Set([
57
- 'application/pdf',
58
- ]);
59
69
  export class MarkitdownPreprocessor {
60
70
  name = 'markitdown';
61
71
  supportedMimeTypes = SUPPORTED_MIME_TYPES;
62
72
  registry;
63
73
  logger;
64
74
  fs;
75
+ processRunner;
65
76
  exec;
66
77
  constructor(config) {
67
78
  this.registry = config.registry;
68
79
  this.logger = config.logger;
69
80
  this.fs = config.fs;
81
+ this.processRunner = config.process;
70
82
  this.exec = makeExec(config.process);
71
83
  }
72
84
  async process(filePath, mimeType, ctx) {
73
85
  const totalStart = Date.now();
74
- const derivedPaths = [];
75
- const imageEntries = [];
76
86
  this.logger.info('Markitdown processing started', { filePath, mimeType });
77
- // 1. Convert to markdown via markitdown
78
87
  const contentPathResult = ctx.files.realPath('content.md');
79
88
  if (!contentPathResult.ok) {
80
89
  return Err(new Error('Failed to resolve output path'));
81
90
  }
91
+ await this.fs.mkdir(dirname(contentPathResult.value), { recursive: true });
92
+ // Race markitdown text conversion and image extraction — they're
93
+ // independent, so there's no reason to serialize them. For documents
94
+ // where pandoc extraction isn't applicable, the image task resolves
95
+ // immediately.
96
+ const markdownTask = this.runMarkitdown(filePath, mimeType, contentPathResult.value);
97
+ const imageTask = PANDOC_EXTRACT_MIMES.has(mimeType)
98
+ ? this.extractImagesWithPandoc(filePath, mimeType, ctx)
99
+ : Promise.resolve([]);
100
+ const [markdownResult, images] = await Promise.all([markdownTask, imageTask]);
101
+ if (!markdownResult.ok)
102
+ return markdownResult;
103
+ const markdown = markdownResult.value;
104
+ const derivedPaths = ['content.md'];
105
+ const imageEntries = [];
106
+ for (const img of images) {
107
+ derivedPaths.push(img.relativePath);
108
+ imageEntries.push(`- ${img.relativePath} — ${img.description}`);
109
+ }
110
+ const manifestLines = ['Extracted files:'];
111
+ manifestLines.push(`- content.md (markdown, ${markdown.length} chars)`);
112
+ manifestLines.push(...imageEntries);
113
+ this.logger.info('Markitdown processing complete', {
114
+ filePath,
115
+ mimeType,
116
+ contentLength: markdown.length,
117
+ imagesExtracted: imageEntries.length,
118
+ totalDurationMs: Date.now() - totalStart,
119
+ });
120
+ return Ok({
121
+ extractedContent: manifestLines.join('\n'),
122
+ derivedPaths,
123
+ });
124
+ }
125
+ async runMarkitdown(filePath, mimeType, outputPath) {
82
126
  const markitdownStart = Date.now();
83
127
  try {
84
- await this.fs.mkdir(dirname(contentPathResult.value), { recursive: true });
85
- await this.exec('markitdown', [filePath, '-o', contentPathResult.value]);
128
+ await this.exec('markitdown', [filePath, '-o', outputPath]);
86
129
  }
87
130
  catch (error) {
88
131
  const message = error instanceof Error ? error.message : String(error);
@@ -92,48 +135,20 @@ export class MarkitdownPreprocessor {
92
135
  }
93
136
  return Err(new Error(`markitdown failed: ${message}`));
94
137
  }
95
- const contentResult = await ctx.files.read('content.md');
96
- const markdown = contentResult.ok ? contentResult.value : '';
97
- derivedPaths.push('content.md');
98
- this.logger.info('Markitdown conversion complete', {
99
- filePath,
100
- mimeType,
101
- durationMs: Date.now() - markitdownStart,
102
- contentLength: markdown.length,
103
- });
104
- // 2. Extract images based on file type
105
- const imagePhaseStart = Date.now();
106
- if (PANDOC_EXTRACT_MIMES.has(mimeType)) {
107
- const images = await this.extractImagesWithPandoc(filePath, mimeType, ctx);
108
- for (const img of images) {
109
- derivedPaths.push(img.relativePath);
110
- imageEntries.push(`- ${img.relativePath} — ${img.description}`);
111
- }
138
+ let markdown = '';
139
+ try {
140
+ markdown = await this.fs.readFile(outputPath, 'utf-8');
112
141
  }
113
- else if (PDFIMAGES_MIMES.has(mimeType)) {
114
- const images = await this.extractImagesWithPdfimages(filePath, ctx);
115
- for (const img of images) {
116
- derivedPaths.push(img.relativePath);
117
- imageEntries.push(`- ${img.relativePath} — ${img.description}`);
118
- }
142
+ catch {
143
+ // Output missing markitdown completed but produced nothing.
119
144
  }
120
- const imagePhaseDurationMs = Date.now() - imagePhaseStart;
121
- // 3. Build manifest
122
- const manifestLines = ['Extracted files:'];
123
- manifestLines.push(`- content.md (markdown, ${markdown.length} chars)`);
124
- manifestLines.push(...imageEntries);
125
- this.logger.info('Markitdown processing complete', {
145
+ this.logger.info('Markitdown conversion complete', {
126
146
  filePath,
127
147
  mimeType,
148
+ durationMs: Date.now() - markitdownStart,
128
149
  contentLength: markdown.length,
129
- imagesExtracted: imageEntries.length,
130
- imagePhaseDurationMs,
131
- totalDurationMs: Date.now() - totalStart,
132
- });
133
- return Ok({
134
- extractedContent: manifestLines.join('\n'),
135
- derivedPaths,
136
150
  });
151
+ return Ok(markdown);
137
152
  }
138
153
  async extractImagesWithPandoc(filePath, mimeType, ctx) {
139
154
  const mediaStore = ctx.files.scoped('media');
@@ -164,7 +179,7 @@ export class MarkitdownPreprocessor {
164
179
  });
165
180
  }
166
181
  const classifyStart = Date.now();
167
- const images = await classifyExtractedImages(mediaStore, 'media', ctx, this.registry, this.logger);
182
+ const images = await classifyExtractedImages(mediaStore, 'media', ctx, this.registry, this.logger, this.fs, this.processRunner);
168
183
  this.logger.info('Image classification complete', {
169
184
  source: 'pandoc',
170
185
  count: images.length,
@@ -173,46 +188,16 @@ export class MarkitdownPreprocessor {
173
188
  });
174
189
  return images;
175
190
  }
176
- async extractImagesWithPdfimages(filePath, ctx) {
177
- const imageStore = ctx.files.scoped('images');
178
- const imagesDirResult = imageStore.realPath('');
179
- if (!imagesDirResult.ok)
180
- return [];
181
- const pdfimagesStart = Date.now();
182
- let extractSucceeded = true;
183
- try {
184
- await this.fs.mkdir(imagesDirResult.value, { recursive: true });
185
- await this.exec('pdfimages', ['-png', filePath, `${imagesDirResult.value}/img`], IMAGE_EXTRACT_TIMEOUT_MS);
186
- }
187
- catch (error) {
188
- extractSucceeded = false;
189
- this.logger.warn('pdfimages failed (will classify any partial output)', {
190
- filePath,
191
- durationMs: Date.now() - pdfimagesStart,
192
- error: error instanceof Error ? error.message : String(error),
193
- });
194
- }
195
- if (extractSucceeded) {
196
- this.logger.info('pdfimages complete', {
197
- filePath,
198
- durationMs: Date.now() - pdfimagesStart,
199
- });
200
- }
201
- const classifyStart = Date.now();
202
- const images = await classifyExtractedImages(imageStore, 'images', ctx, this.registry, this.logger);
203
- this.logger.info('Image classification complete', {
204
- source: 'pdfimages',
205
- count: images.length,
206
- partial: !extractSucceeded,
207
- durationMs: Date.now() - classifyStart,
208
- });
209
- return images;
210
- }
211
191
  }
212
192
  // ============================================================================
213
193
  // Shared image helpers
214
194
  // ============================================================================
215
- const IMAGE_EXT_RE = /\.(png|jpe?g|gif|webp|tiff?|bmp|svg)$/i;
195
+ /**
196
+ * Recognized by Anthropic vision API. Other pdfimages outputs (pbm, ppm,
197
+ * jb2e, jp2) are ignored — they'd require local conversion before being
198
+ * useful for classification.
199
+ */
200
+ export const IMAGE_EXT_RE = /\.(png|jpe?g|gif|webp|tiff?|bmp|svg)$/i;
216
201
  const IMAGE_MIME_MAP = {
217
202
  png: 'image/png',
218
203
  jpg: 'image/jpeg',
@@ -228,15 +213,85 @@ export function guessImageMime(filename) {
228
213
  const ext = filename.split('.').pop()?.toLowerCase();
229
214
  return IMAGE_MIME_MAP[ext ?? ''] ?? 'image/png';
230
215
  }
231
- export async function classifyExtractedImages(imageStore, relativePrefix, ctx, registry, logger) {
216
+ /**
217
+ * Reject images that are unlikely to carry useful visual information.
218
+ *
219
+ * `bytesPerPixel` filters out alpha masks, sparse overlays, and essentially-
220
+ * empty pages — brand PDFs typically emit a real photo (~1 B/px) plus a
221
+ * matching transparency/overlay layer at the same dimensions but a fraction
222
+ * of a percent of the size (<0.005 B/px).
223
+ *
224
+ * The minimum pixel count protects against tiny icons whose density alone
225
+ * doesn't disqualify them.
226
+ */
227
+ export function shouldClassifyImage(meta) {
228
+ const pixels = meta.width * meta.height;
229
+ if (pixels < MIN_IMAGE_PIXELS)
230
+ return false;
231
+ const density = meta.sizeBytes / pixels;
232
+ return density >= MIN_IMAGE_DENSITY_BYTES_PER_PX;
233
+ }
234
+ /**
235
+ * Read image dimensions via vipsheader. Returns null when the tool isn't
236
+ * available or output is unparseable — caller should treat that as
237
+ * "include without filtering".
238
+ */
239
+ export async function getImageDimensions(filePath, processRunner) {
240
+ try {
241
+ const { stdout } = await processRunner.execFile('vipsheader', ['-f', 'width', '-f', 'height', filePath], { timeout: 10_000 });
242
+ const lines = stdout.trim().split('\n');
243
+ if (lines.length < 2)
244
+ return null;
245
+ const width = parseInt(lines[0], 10);
246
+ const height = parseInt(lines[1], 10);
247
+ if (!Number.isFinite(width) || !Number.isFinite(height))
248
+ return null;
249
+ return { width, height };
250
+ }
251
+ catch {
252
+ return null;
253
+ }
254
+ }
255
+ export async function classifyExtractedImages(imageStore, relativePrefix, ctx, registry, logger, fs, processRunner) {
232
256
  const listResult = await imageStore.list('', { maxDepth: 3 });
233
257
  if (!listResult.ok)
234
258
  return [];
235
- const imageFiles = listResult.value
236
- .filter(e => e.type === 'file' && IMAGE_EXT_RE.test(e.name))
237
- .sort((a, b) => a.name.localeCompare(b.name))
259
+ const candidates = listResult.value.filter(e => e.type === 'file' && IMAGE_EXT_RE.test(e.name));
260
+ // Stat + density filter, then keep the top MAX_IMAGES by file size.
261
+ const inspected = await mapWithConcurrency(candidates, 8, async (entry) => {
262
+ const pathResult = imageStore.realPath(entry.name);
263
+ if (!pathResult.ok)
264
+ return null;
265
+ let sizeBytes = 0;
266
+ try {
267
+ sizeBytes = (await fs.stat(pathResult.value)).size;
268
+ }
269
+ catch {
270
+ return null;
271
+ }
272
+ const dims = await getImageDimensions(pathResult.value, processRunner);
273
+ if (!dims) {
274
+ // Unknown dims — include but warn; better to classify than silently drop.
275
+ return { name: entry.name, sizeBytes, width: 0, height: 0, kept: true };
276
+ }
277
+ const kept = shouldClassifyImage({ width: dims.width, height: dims.height, sizeBytes });
278
+ return { name: entry.name, sizeBytes, width: dims.width, height: dims.height, kept };
279
+ });
280
+ const filtered = inspected
281
+ .filter((r) => r !== null && r.kept)
282
+ .sort((a, b) => b.sizeBytes - a.sizeBytes)
238
283
  .slice(0, MAX_IMAGES);
239
- const settled = await mapWithConcurrency(imageFiles, IMAGE_CLASSIFY_CONCURRENCY, async (imgFile) => {
284
+ const droppedCount = inspected.filter(r => r !== null && !r.kept).length;
285
+ if (droppedCount > 0 || inspected.length > MAX_IMAGES) {
286
+ logger.info('Image filter applied', {
287
+ source: relativePrefix,
288
+ candidates: candidates.length,
289
+ passedDensityFilter: candidates.length - droppedCount,
290
+ selected: filtered.length,
291
+ droppedByDensity: droppedCount,
292
+ });
293
+ }
294
+ const settled = await mapWithConcurrency(filtered, IMAGE_CLASSIFY_CONCURRENCY, async (imgFile) => {
240
295
  const imgPathResult = imageStore.realPath(imgFile.name);
241
296
  if (!imgPathResult.ok)
242
297
  return null;
@@ -1 +1 @@
1
- {"version":3,"file":"markitdown-preprocessor.js","sourceRoot":"","sources":["../../../../src/plugins/uploads/preprocessors/markitdown-preprocessor.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;GAYG;AAEH,OAAO,EAAE,OAAO,EAAE,MAAM,WAAW,CAAA;AACnC,OAAO,EAAE,kBAAkB,EAAE,MAAM,4BAA4B,CAAA;AAE/D,OAAO,EAAE,GAAG,EAAE,EAAE,EAAE,MAAM,uBAAuB,CAAA;AAO/C,MAAM,UAAU,GAAG,EAAE,CAAA;AACrB,MAAM,0BAA0B,GAAG,EAAE,CAAA;AAErC,+EAA+E;AAC/E,MAAM,qBAAqB,GAAG,MAAM,CAAA;AACpC,8EAA8E;AAC9E,8EAA8E;AAC9E,gFAAgF;AAChF,MAAM,wBAAwB,GAAG,CAAC,GAAG,MAAM,CAAA;AAE3C,SAAS,QAAQ,CAAC,aAA4B;IAC7C,OAAO,CAAC,GAAW,EAAE,IAAc,EAAE,YAAoB,qBAAqB,EAAE,EAAE,CACjF,aAAa,CAAC,QAAQ,CAAC,GAAG,EAAE,IAAI,EAAE,EAAE,OAAO,EAAE,SAAS,EAAE,SAAS,EAAE,EAAE,GAAG,IAAI,GAAG,IAAI,EAAE,CAAC,CAAA;AACxF,CAAC;AAED,4EAA4E;AAC5E,MAAM,oBAAoB,GAAG;IAC5B,iBAAiB;IACjB,yEAAyE;IACzE,mEAAmE;IACnE,2EAA2E;IAC3E,yCAAyC;IACzC,iBAAiB;IACjB,sBAAsB;IACtB,WAAW;IACX,uBAAuB;IACvB,UAAU;IACV,kBAAkB;IAClB,iBAAiB;IACjB,UAAU;CACV,CAAA;AAED,yDAAyD;AACzD,MAAM,oBAAoB,GAAG,IAAI,GAAG,CAAC;IACpC,yEAAyE;IACzE,yCAAyC;IACzC,sBAAsB;CACtB,CAAC,CAAA;AAEF,MAAM,iBAAiB,GAA2B;IACjD,yEAAyE,EAAE,MAAM;IACjF,yCAAyC,EAAE,KAAK;IAChD,sBAAsB,EAAE,MAAM;CAC9B,CAAA;AAED,oDAAoD;AACpD,MAAM,eAAe,GAAG,IAAI,GAAG,CAAC;IAC/B,iBAAiB;CACjB,CAAC,CAAA;AASF,MAAM,OAAO,sBAAsB;IACzB,IAAI,GAAG,YAAY,CAAA;IACnB,kBAAkB,GAAG,oBAAoB,CAAA;IAEjC,QAAQ,CAAsB;IAC9B,MAAM,CAAQ;IACd,EAAE,CAAY;IACd,IAAI,CAAkG;IAEvH,YAAY,MAAoC;QAC/C,IAAI,CAAC,QAAQ,GAAG,MAAM,CAAC,QAAQ,CAAA;QAC/B,IAAI,CAAC,MAAM,GAAG,MAAM,CAAC,MAAM,CAAA;QAC3B,IAAI,CAAC,EAAE,GAAG,MAAM,CAAC,EAAE,CAAA;QACnB,IAAI,CAAC,IAAI,GAAG,QAAQ,CAAC,MAAM,CAAC,OAAO,CAAC,CAAA;IACrC,CAAC;IAED,KAAK,CAAC,OAAO,CACZ,QAAgB,EAChB,QAAgB,EAChB,GAAwB;QAExB,MAAM,UAAU,GAAG,IAAI,CAAC,GAAG,EAAE,CAAA;QAC7B,MAAM,YAAY,GAAa,EAAE,CAAA;QACjC,MAAM,YAAY,GAAa,EAAE,CAAA;QAEjC,IAAI,CAAC,MAAM,CAAC,IAAI,CAAC,+BAA+B,EAAE,EAAE,QAAQ,EAAE,QAAQ,EAAE,CAAC,CAAA;QAEzE,wCAAwC;QACxC,MAAM,iBAAiB,GAAG,GAAG,CAAC,KAAK,CAAC,QAAQ,CAAC,YAAY,CAAC,CAAA;QAC1D,IAAI,CAAC,iBAAiB,CAAC,EAAE,EAAE,CAAC;YAC3B,OAAO,GAAG,CAAC,IAAI,KAAK,CAAC,+BAA+B,CAAC,CAAC,CAAA;QACvD,CAAC;QAED,MAAM,eAAe,GAAG,IAAI,CAAC,GAAG,EAAE,CAAA;QAClC,IAAI,CAAC;YACJ,MAAM,IAAI,CAAC,EAAE,CAAC,KAAK,CAAC,OAAO,CAAC,iBAAiB,CAAC,KAAK,CAAC,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAA;YAC1E,MAAM,IAAI,CAAC,IAAI,CAAC,YAAY,EAAE,CAAC,QAAQ,EAAE,IAAI,EAAE,iBAAiB,CAAC,KAAK,CAAC,CAAC,CAAA;QACzE,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YAChB,MAAM,OAAO,GAAG,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,CAAA;YACtE,IAAI,CAAC,MAAM,CAAC,KAAK,CAChB,uBAAuB,EACvB,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,SAAS,EAC1C,EAAE,QAAQ,EAAE,QAAQ,EAAE,UAAU,EAAE,IAAI,CAAC,GAAG,EAAE,GAAG,eAAe,EAAE,CAChE,CAAA;YACD,IAAI,OAAO,CAAC,QAAQ,CAAC,QAAQ,CAAC,EAAE,CAAC;gBAChC,OAAO,GAAG,CAAC,IAAI,KAAK,CAAC,mEAAmE,CAAC,CAAC,CAAA;YAC3F,CAAC;YACD,OAAO,GAAG,CAAC,IAAI,KAAK,CAAC,sBAAsB,OAAO,EAAE,CAAC,CAAC,CAAA;QACvD,CAAC;QAED,MAAM,aAAa,GAAG,MAAM,GAAG,CAAC,KAAK,CAAC,IAAI,CAAC,YAAY,CAAC,CAAA;QACxD,MAAM,QAAQ,GAAG,aAAa,CAAC,EAAE,CAAC,CAAC,CAAC,aAAa,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,CAAA;QAE5D,YAAY,CAAC,IAAI,CAAC,YAAY,CAAC,CAAA;QAE/B,IAAI,CAAC,MAAM,CAAC,IAAI,CAAC,gCAAgC,EAAE;YAClD,QAAQ;YACR,QAAQ;YACR,UAAU,EAAE,IAAI,CAAC,GAAG,EAAE,GAAG,eAAe;YACxC,aAAa,EAAE,QAAQ,CAAC,MAAM;SAC9B,CAAC,CAAA;QAEF,uCAAuC;QACvC,MAAM,eAAe,GAAG,IAAI,CAAC,GAAG,EAAE,CAAA;QAClC,IAAI,oBAAoB,CAAC,GAAG,CAAC,QAAQ,CAAC,EAAE,CAAC;YACxC,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,uBAAuB,CAAC,QAAQ,EAAE,QAAQ,EAAE,GAAG,CAAC,CAAA;YAC1E,KAAK,MAAM,GAAG,IAAI,MAAM,EAAE,CAAC;gBAC1B,YAAY,CAAC,IAAI,CAAC,GAAG,CAAC,YAAY,CAAC,CAAA;gBACnC,YAAY,CAAC,IAAI,CAAC,KAAK,GAAG,CAAC,YAAY,MAAM,GAAG,CAAC,WAAW,EAAE,CAAC,CAAA;YAChE,CAAC;QACF,CAAC;aAAM,IAAI,eAAe,CAAC,GAAG,CAAC,QAAQ,CAAC,EAAE,CAAC;YAC1C,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,0BAA0B,CAAC,QAAQ,EAAE,GAAG,CAAC,CAAA;YACnE,KAAK,MAAM,GAAG,IAAI,MAAM,EAAE,CAAC;gBAC1B,YAAY,CAAC,IAAI,CAAC,GAAG,CAAC,YAAY,CAAC,CAAA;gBACnC,YAAY,CAAC,IAAI,CAAC,KAAK,GAAG,CAAC,YAAY,MAAM,GAAG,CAAC,WAAW,EAAE,CAAC,CAAA;YAChE,CAAC;QACF,CAAC;QACD,MAAM,oBAAoB,GAAG,IAAI,CAAC,GAAG,EAAE,GAAG,eAAe,CAAA;QAEzD,oBAAoB;QACpB,MAAM,aAAa,GAAa,CAAC,kBAAkB,CAAC,CAAA;QACpD,aAAa,CAAC,IAAI,CAAC,2BAA2B,QAAQ,CAAC,MAAM,SAAS,CAAC,CAAA;QACvE,aAAa,CAAC,IAAI,CAAC,GAAG,YAAY,CAAC,CAAA;QAEnC,IAAI,CAAC,MAAM,CAAC,IAAI,CAAC,gCAAgC,EAAE;YAClD,QAAQ;YACR,QAAQ;YACR,aAAa,EAAE,QAAQ,CAAC,MAAM;YAC9B,eAAe,EAAE,YAAY,CAAC,MAAM;YACpC,oBAAoB;YACpB,eAAe,EAAE,IAAI,CAAC,GAAG,EAAE,GAAG,UAAU;SACxC,CAAC,CAAA;QAEF,OAAO,EAAE,CAAC;YACT,gBAAgB,EAAE,aAAa,CAAC,IAAI,CAAC,IAAI,CAAC;YAC1C,YAAY;SACZ,CAAC,CAAA;IACH,CAAC;IAEO,KAAK,CAAC,uBAAuB,CACpC,QAAgB,EAChB,QAAgB,EAChB,GAAwB;QAExB,MAAM,UAAU,GAAG,GAAG,CAAC,KAAK,CAAC,MAAM,CAAC,OAAO,CAAC,CAAA;QAC5C,MAAM,cAAc,GAAG,UAAU,CAAC,QAAQ,CAAC,EAAE,CAAC,CAAA;QAC9C,IAAI,CAAC,cAAc,CAAC,EAAE;YAAE,OAAO,EAAE,CAAA;QAEjC,MAAM,MAAM,GAAG,iBAAiB,CAAC,QAAQ,CAAC,CAAA;QAC1C,IAAI,CAAC,MAAM;YAAE,OAAO,EAAE,CAAA;QAEtB,MAAM,WAAW,GAAG,IAAI,CAAC,GAAG,EAAE,CAAA;QAC9B,IAAI,gBAAgB,GAAG,IAAI,CAAA;QAC3B,IAAI,CAAC;YACJ,MAAM,IAAI,CAAC,IAAI,CACd,QAAQ,EACR,CAAC,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,KAAK,EAAE,QAAQ,EAAE,IAAI,EAAE,WAAW,EAAE,mBAAmB,cAAc,CAAC,KAAK,EAAE,CAAC,EACnG,wBAAwB,CACxB,CAAA;QACF,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YAChB,gBAAgB,GAAG,KAAK,CAAA;YACxB,IAAI,CAAC,MAAM,CAAC,IAAI,CAAC,kEAAkE,EAAE;gBACpF,QAAQ;gBACR,UAAU,EAAE,IAAI,CAAC,GAAG,EAAE,GAAG,WAAW;gBACpC,KAAK,EAAE,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC;aAC7D,CAAC,CAAA;QACH,CAAC;QACD,IAAI,gBAAgB,EAAE,CAAC;YACtB,IAAI,CAAC,MAAM,CAAC,IAAI,CAAC,iCAAiC,EAAE;gBACnD,QAAQ;gBACR,MAAM;gBACN,UAAU,EAAE,IAAI,CAAC,GAAG,EAAE,GAAG,WAAW;aACpC,CAAC,CAAA;QACH,CAAC;QAED,MAAM,aAAa,GAAG,IAAI,CAAC,GAAG,EAAE,CAAA;QAChC,MAAM,MAAM,GAAG,MAAM,uBAAuB,CAAC,UAAU,EAAE,OAAO,EAAE,GAAG,EAAE,IAAI,CAAC,QAAQ,EAAE,IAAI,CAAC,MAAM,CAAC,CAAA;QAClG,IAAI,CAAC,MAAM,CAAC,IAAI,CAAC,+BAA+B,EAAE;YACjD,MAAM,EAAE,QAAQ;YAChB,KAAK,EAAE,MAAM,CAAC,MAAM;YACpB,OAAO,EAAE,CAAC,gBAAgB;YAC1B,UAAU,EAAE,IAAI,CAAC,GAAG,EAAE,GAAG,aAAa;SACtC,CAAC,CAAA;QACF,OAAO,MAAM,CAAA;IACd,CAAC;IAEO,KAAK,CAAC,0BAA0B,CACvC,QAAgB,EAChB,GAAwB;QAExB,MAAM,UAAU,GAAG,GAAG,CAAC,KAAK,CAAC,MAAM,CAAC,QAAQ,CAAC,CAAA;QAC7C,MAAM,eAAe,GAAG,UAAU,CAAC,QAAQ,CAAC,EAAE,CAAC,CAAA;QAC/C,IAAI,CAAC,eAAe,CAAC,EAAE;YAAE,OAAO,EAAE,CAAA;QAElC,MAAM,cAAc,GAAG,IAAI,CAAC,GAAG,EAAE,CAAA;QACjC,IAAI,gBAAgB,GAAG,IAAI,CAAA;QAC3B,IAAI,CAAC;YACJ,MAAM,IAAI,CAAC,EAAE,CAAC,KAAK,CAAC,eAAe,CAAC,KAAK,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAA;YAC/D,MAAM,IAAI,CAAC,IAAI,CACd,WAAW,EACX,CAAC,MAAM,EAAE,QAAQ,EAAE,GAAG,eAAe,CAAC,KAAK,MAAM,CAAC,EAClD,wBAAwB,CACxB,CAAA;QACF,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YAChB,gBAAgB,GAAG,KAAK,CAAA;YACxB,IAAI,CAAC,MAAM,CAAC,IAAI,CAAC,qDAAqD,EAAE;gBACvE,QAAQ;gBACR,UAAU,EAAE,IAAI,CAAC,GAAG,EAAE,GAAG,cAAc;gBACvC,KAAK,EAAE,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC;aAC7D,CAAC,CAAA;QACH,CAAC;QACD,IAAI,gBAAgB,EAAE,CAAC;YACtB,IAAI,CAAC,MAAM,CAAC,IAAI,CAAC,oBAAoB,EAAE;gBACtC,QAAQ;gBACR,UAAU,EAAE,IAAI,CAAC,GAAG,EAAE,GAAG,cAAc;aACvC,CAAC,CAAA;QACH,CAAC;QAED,MAAM,aAAa,GAAG,IAAI,CAAC,GAAG,EAAE,CAAA;QAChC,MAAM,MAAM,GAAG,MAAM,uBAAuB,CAAC,UAAU,EAAE,QAAQ,EAAE,GAAG,EAAE,IAAI,CAAC,QAAQ,EAAE,IAAI,CAAC,MAAM,CAAC,CAAA;QACnG,IAAI,CAAC,MAAM,CAAC,IAAI,CAAC,+BAA+B,EAAE;YACjD,MAAM,EAAE,WAAW;YACnB,KAAK,EAAE,MAAM,CAAC,MAAM;YACpB,OAAO,EAAE,CAAC,gBAAgB;YAC1B,UAAU,EAAE,IAAI,CAAC,GAAG,EAAE,GAAG,aAAa;SACtC,CAAC,CAAA;QACF,OAAO,MAAM,CAAA;IACd,CAAC;CACD;AAED,+EAA+E;AAC/E,uBAAuB;AACvB,+EAA+E;AAE/E,MAAM,YAAY,GAAG,wCAAwC,CAAA;AAE7D,MAAM,cAAc,GAA2B;IAC9C,GAAG,EAAE,WAAW;IAChB,GAAG,EAAE,YAAY;IACjB,IAAI,EAAE,YAAY;IAClB,GAAG,EAAE,WAAW;IAChB,IAAI,EAAE,YAAY;IAClB,GAAG,EAAE,eAAe;IACpB,GAAG,EAAE,YAAY;IACjB,IAAI,EAAE,YAAY;IAClB,GAAG,EAAE,WAAW;CAChB,CAAA;AAED,MAAM,UAAU,cAAc,CAAC,QAAgB;IAC9C,MAAM,GAAG,GAAG,QAAQ,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,GAAG,EAAE,EAAE,WAAW,EAAE,CAAA;IACpD,OAAO,cAAc,CAAC,GAAG,IAAI,EAAE,CAAC,IAAI,WAAW,CAAA;AAChD,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,uBAAuB,CAC5C,UAAqB,EACrB,cAAsB,EACtB,GAAwB,EACxB,QAA8B,EAC9B,MAAc;IAEd,MAAM,UAAU,GAAG,MAAM,UAAU,CAAC,IAAI,CAAC,EAAE,EAAE,EAAE,QAAQ,EAAE,CAAC,EAAE,CAAC,CAAA;IAC7D,IAAI,CAAC,UAAU,CAAC,EAAE;QAAE,OAAO,EAAE,CAAA;IAE7B,MAAM,UAAU,GAAG,UAAU,CAAC,KAAK;SACjC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,KAAK,MAAM,IAAI,YAAY,CAAC,IAAI,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC;SAC3D,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,aAAa,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC;SAC5C,KAAK,CAAC,CAAC,EAAE,UAAU,CAAC,CAAA;IAEtB,MAAM,OAAO,GAAG,MAAM,kBAAkB,CAAC,UAAU,EAAE,0BAA0B,EAAE,KAAK,EAAE,OAAO,EAAE,EAAE;QAClG,MAAM,aAAa,GAAG,UAAU,CAAC,QAAQ,CAAC,OAAO,CAAC,IAAI,CAAC,CAAA;QACvD,IAAI,CAAC,aAAa,CAAC,EAAE;YAAE,OAAO,IAAI,CAAA;QAElC,MAAM,OAAO,GAAG,cAAc,CAAC,OAAO,CAAC,IAAI,CAAC,CAAA;QAC5C,IAAI,WAAW,GAAG,OAAO,CAAA;QAEzB,MAAM,UAAU,GAAG,QAAQ,CAAC,cAAc,CAAC,OAAO,CAAC,CAAA;QACnD,IAAI,UAAU,EAAE,CAAC;YAChB,MAAM,cAAc,GAAG,MAAM,UAAU,CAAC,OAAO,CAAC,aAAa,CAAC,KAAK,EAAE,OAAO,EAAE;gBAC7E,KAAK,EAAE,GAAG,CAAC,KAAK,CAAC,MAAM,CAAC,GAAG,cAAc,IAAI,OAAO,CAAC,IAAI,OAAO,CAAC;aACjE,CAAC,CAAA;YACF,IAAI,cAAc,CAAC,EAAE,IAAI,cAAc,CAAC,KAAK,CAAC,gBAAgB,EAAE,CAAC;gBAChE,WAAW,GAAG,cAAc,CAAC,KAAK,CAAC,gBAAgB,CAAA;YACpD,CAAC;QACF,CAAC;QAED,OAAO,EAAE,YAAY,EAAE,GAAG,cAAc,IAAI,OAAO,CAAC,IAAI,EAAE,EAAE,WAAW,EAAE,CAAA;IAC1E,CAAC,CAAC,CAAA;IAEF,OAAO,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,EAAsD,EAAE,CAAC,CAAC,KAAK,IAAI,CAAC,CAAA;AAC7F,CAAC"}
1
+ {"version":3,"file":"markitdown-preprocessor.js","sourceRoot":"","sources":["../../../../src/plugins/uploads/preprocessors/markitdown-preprocessor.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;GAeG;AAEH,OAAO,EAAE,OAAO,EAAE,MAAM,WAAW,CAAA;AACnC,OAAO,EAAE,kBAAkB,EAAE,MAAM,4BAA4B,CAAA;AAE/D,OAAO,EAAE,GAAG,EAAE,EAAE,EAAE,MAAM,uBAAuB,CAAA;AAO/C,MAAM,UAAU,GAAG,EAAE,CAAA;AACrB,MAAM,0BAA0B,GAAG,EAAE,CAAA;AAErC;;;;;;;;;GASG;AACH,MAAM,CAAC,MAAM,8BAA8B,GAAG,IAAI,CAAA;AAClD,MAAM,CAAC,MAAM,gBAAgB,GAAG,EAAE,GAAG,EAAE,CAAA;AAEvC,gFAAgF;AAChF,MAAM,qBAAqB,GAAG,MAAM,CAAA;AACpC,mEAAmE;AACnE,8EAA8E;AAC9E,mDAAmD;AACnD,MAAM,wBAAwB,GAAG,CAAC,GAAG,MAAM,CAAA;AAE3C,SAAS,QAAQ,CAAC,aAA4B;IAC7C,OAAO,CAAC,GAAW,EAAE,IAAc,EAAE,YAAoB,qBAAqB,EAAE,EAAE,CACjF,aAAa,CAAC,QAAQ,CAAC,GAAG,EAAE,IAAI,EAAE,EAAE,OAAO,EAAE,SAAS,EAAE,SAAS,EAAE,EAAE,GAAG,IAAI,GAAG,IAAI,EAAE,CAAC,CAAA;AACxF,CAAC;AAED,qFAAqF;AACrF,MAAM,oBAAoB,GAAG;IAC5B,yEAAyE;IACzE,mEAAmE;IACnE,2EAA2E;IAC3E,yCAAyC;IACzC,iBAAiB;IACjB,sBAAsB;IACtB,WAAW;IACX,uBAAuB;IACvB,UAAU;IACV,kBAAkB;IAClB,iBAAiB;IACjB,UAAU;CACV,CAAA;AAED,yDAAyD;AACzD,MAAM,oBAAoB,GAAG,IAAI,GAAG,CAAC;IACpC,yEAAyE;IACzE,yCAAyC;IACzC,sBAAsB;CACtB,CAAC,CAAA;AAEF,MAAM,iBAAiB,GAA2B;IACjD,yEAAyE,EAAE,MAAM;IACjF,yCAAyC,EAAE,KAAK;IAChD,sBAAsB,EAAE,MAAM;CAC9B,CAAA;AASD,MAAM,OAAO,sBAAsB;IACzB,IAAI,GAAG,YAAY,CAAA;IACnB,kBAAkB,GAAG,oBAAoB,CAAA;IAEjC,QAAQ,CAAsB;IAC9B,MAAM,CAAQ;IACd,EAAE,CAAY;IACd,aAAa,CAAe;IAC5B,IAAI,CAAkG;IAEvH,YAAY,MAAoC;QAC/C,IAAI,CAAC,QAAQ,GAAG,MAAM,CAAC,QAAQ,CAAA;QAC/B,IAAI,CAAC,MAAM,GAAG,MAAM,CAAC,MAAM,CAAA;QAC3B,IAAI,CAAC,EAAE,GAAG,MAAM,CAAC,EAAE,CAAA;QACnB,IAAI,CAAC,aAAa,GAAG,MAAM,CAAC,OAAO,CAAA;QACnC,IAAI,CAAC,IAAI,GAAG,QAAQ,CAAC,MAAM,CAAC,OAAO,CAAC,CAAA;IACrC,CAAC;IAED,KAAK,CAAC,OAAO,CACZ,QAAgB,EAChB,QAAgB,EAChB,GAAwB;QAExB,MAAM,UAAU,GAAG,IAAI,CAAC,GAAG,EAAE,CAAA;QAE7B,IAAI,CAAC,MAAM,CAAC,IAAI,CAAC,+BAA+B,EAAE,EAAE,QAAQ,EAAE,QAAQ,EAAE,CAAC,CAAA;QAEzE,MAAM,iBAAiB,GAAG,GAAG,CAAC,KAAK,CAAC,QAAQ,CAAC,YAAY,CAAC,CAAA;QAC1D,IAAI,CAAC,iBAAiB,CAAC,EAAE,EAAE,CAAC;YAC3B,OAAO,GAAG,CAAC,IAAI,KAAK,CAAC,+BAA+B,CAAC,CAAC,CAAA;QACvD,CAAC;QACD,MAAM,IAAI,CAAC,EAAE,CAAC,KAAK,CAAC,OAAO,CAAC,iBAAiB,CAAC,KAAK,CAAC,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAA;QAE1E,iEAAiE;QACjE,qEAAqE;QACrE,oEAAoE;QACpE,eAAe;QACf,MAAM,YAAY,GAAG,IAAI,CAAC,aAAa,CAAC,QAAQ,EAAE,QAAQ,EAAE,iBAAiB,CAAC,KAAK,CAAC,CAAA;QACpF,MAAM,SAAS,GAAG,oBAAoB,CAAC,GAAG,CAAC,QAAQ,CAAC;YACnD,CAAC,CAAC,IAAI,CAAC,uBAAuB,CAAC,QAAQ,EAAE,QAAQ,EAAE,GAAG,CAAC;YACvD,CAAC,CAAC,OAAO,CAAC,OAAO,CAAuD,EAAE,CAAC,CAAA;QAE5E,MAAM,CAAC,cAAc,EAAE,MAAM,CAAC,GAAG,MAAM,OAAO,CAAC,GAAG,CAAC,CAAC,YAAY,EAAE,SAAS,CAAC,CAAC,CAAA;QAE7E,IAAI,CAAC,cAAc,CAAC,EAAE;YAAE,OAAO,cAAc,CAAA;QAE7C,MAAM,QAAQ,GAAG,cAAc,CAAC,KAAK,CAAA;QAErC,MAAM,YAAY,GAAa,CAAC,YAAY,CAAC,CAAA;QAC7C,MAAM,YAAY,GAAa,EAAE,CAAA;QACjC,KAAK,MAAM,GAAG,IAAI,MAAM,EAAE,CAAC;YAC1B,YAAY,CAAC,IAAI,CAAC,GAAG,CAAC,YAAY,CAAC,CAAA;YACnC,YAAY,CAAC,IAAI,CAAC,KAAK,GAAG,CAAC,YAAY,MAAM,GAAG,CAAC,WAAW,EAAE,CAAC,CAAA;QAChE,CAAC;QAED,MAAM,aAAa,GAAa,CAAC,kBAAkB,CAAC,CAAA;QACpD,aAAa,CAAC,IAAI,CAAC,2BAA2B,QAAQ,CAAC,MAAM,SAAS,CAAC,CAAA;QACvE,aAAa,CAAC,IAAI,CAAC,GAAG,YAAY,CAAC,CAAA;QAEnC,IAAI,CAAC,MAAM,CAAC,IAAI,CAAC,gCAAgC,EAAE;YAClD,QAAQ;YACR,QAAQ;YACR,aAAa,EAAE,QAAQ,CAAC,MAAM;YAC9B,eAAe,EAAE,YAAY,CAAC,MAAM;YACpC,eAAe,EAAE,IAAI,CAAC,GAAG,EAAE,GAAG,UAAU;SACxC,CAAC,CAAA;QAEF,OAAO,EAAE,CAAC;YACT,gBAAgB,EAAE,aAAa,CAAC,IAAI,CAAC,IAAI,CAAC;YAC1C,YAAY;SACZ,CAAC,CAAA;IACH,CAAC;IAEO,KAAK,CAAC,aAAa,CAC1B,QAAgB,EAChB,QAAgB,EAChB,UAAkB;QAElB,MAAM,eAAe,GAAG,IAAI,CAAC,GAAG,EAAE,CAAA;QAClC,IAAI,CAAC;YACJ,MAAM,IAAI,CAAC,IAAI,CAAC,YAAY,EAAE,CAAC,QAAQ,EAAE,IAAI,EAAE,UAAU,CAAC,CAAC,CAAA;QAC5D,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YAChB,MAAM,OAAO,GAAG,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,CAAA;YACtE,IAAI,CAAC,MAAM,CAAC,KAAK,CAChB,uBAAuB,EACvB,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,SAAS,EAC1C,EAAE,QAAQ,EAAE,QAAQ,EAAE,UAAU,EAAE,IAAI,CAAC,GAAG,EAAE,GAAG,eAAe,EAAE,CAChE,CAAA;YACD,IAAI,OAAO,CAAC,QAAQ,CAAC,QAAQ,CAAC,EAAE,CAAC;gBAChC,OAAO,GAAG,CAAC,IAAI,KAAK,CAAC,mEAAmE,CAAC,CAAC,CAAA;YAC3F,CAAC;YACD,OAAO,GAAG,CAAC,IAAI,KAAK,CAAC,sBAAsB,OAAO,EAAE,CAAC,CAAC,CAAA;QACvD,CAAC;QAED,IAAI,QAAQ,GAAG,EAAE,CAAA;QACjB,IAAI,CAAC;YACJ,QAAQ,GAAG,MAAM,IAAI,CAAC,EAAE,CAAC,QAAQ,CAAC,UAAU,EAAE,OAAO,CAAC,CAAA;QACvD,CAAC;QAAC,MAAM,CAAC;YACR,8DAA8D;QAC/D,CAAC;QAED,IAAI,CAAC,MAAM,CAAC,IAAI,CAAC,gCAAgC,EAAE;YAClD,QAAQ;YACR,QAAQ;YACR,UAAU,EAAE,IAAI,CAAC,GAAG,EAAE,GAAG,eAAe;YACxC,aAAa,EAAE,QAAQ,CAAC,MAAM;SAC9B,CAAC,CAAA;QAEF,OAAO,EAAE,CAAC,QAAQ,CAAC,CAAA;IACpB,CAAC;IAEO,KAAK,CAAC,uBAAuB,CACpC,QAAgB,EAChB,QAAgB,EAChB,GAAwB;QAExB,MAAM,UAAU,GAAG,GAAG,CAAC,KAAK,CAAC,MAAM,CAAC,OAAO,CAAC,CAAA;QAC5C,MAAM,cAAc,GAAG,UAAU,CAAC,QAAQ,CAAC,EAAE,CAAC,CAAA;QAC9C,IAAI,CAAC,cAAc,CAAC,EAAE;YAAE,OAAO,EAAE,CAAA;QAEjC,MAAM,MAAM,GAAG,iBAAiB,CAAC,QAAQ,CAAC,CAAA;QAC1C,IAAI,CAAC,MAAM;YAAE,OAAO,EAAE,CAAA;QAEtB,MAAM,WAAW,GAAG,IAAI,CAAC,GAAG,EAAE,CAAA;QAC9B,IAAI,gBAAgB,GAAG,IAAI,CAAA;QAC3B,IAAI,CAAC;YACJ,MAAM,IAAI,CAAC,IAAI,CACd,QAAQ,EACR,CAAC,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,KAAK,EAAE,QAAQ,EAAE,IAAI,EAAE,WAAW,EAAE,mBAAmB,cAAc,CAAC,KAAK,EAAE,CAAC,EACnG,wBAAwB,CACxB,CAAA;QACF,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YAChB,gBAAgB,GAAG,KAAK,CAAA;YACxB,IAAI,CAAC,MAAM,CAAC,IAAI,CAAC,kEAAkE,EAAE;gBACpF,QAAQ;gBACR,UAAU,EAAE,IAAI,CAAC,GAAG,EAAE,GAAG,WAAW;gBACpC,KAAK,EAAE,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC;aAC7D,CAAC,CAAA;QACH,CAAC;QACD,IAAI,gBAAgB,EAAE,CAAC;YACtB,IAAI,CAAC,MAAM,CAAC,IAAI,CAAC,iCAAiC,EAAE;gBACnD,QAAQ;gBACR,MAAM;gBACN,UAAU,EAAE,IAAI,CAAC,GAAG,EAAE,GAAG,WAAW;aACpC,CAAC,CAAA;QACH,CAAC;QAED,MAAM,aAAa,GAAG,IAAI,CAAC,GAAG,EAAE,CAAA;QAChC,MAAM,MAAM,GAAG,MAAM,uBAAuB,CAAC,UAAU,EAAE,OAAO,EAAE,GAAG,EAAE,IAAI,CAAC,QAAQ,EAAE,IAAI,CAAC,MAAM,EAAE,IAAI,CAAC,EAAE,EAAE,IAAI,CAAC,aAAa,CAAC,CAAA;QAC/H,IAAI,CAAC,MAAM,CAAC,IAAI,CAAC,+BAA+B,EAAE;YACjD,MAAM,EAAE,QAAQ;YAChB,KAAK,EAAE,MAAM,CAAC,MAAM;YACpB,OAAO,EAAE,CAAC,gBAAgB;YAC1B,UAAU,EAAE,IAAI,CAAC,GAAG,EAAE,GAAG,aAAa;SACtC,CAAC,CAAA;QACF,OAAO,MAAM,CAAA;IACd,CAAC;CACD;AAED,+EAA+E;AAC/E,uBAAuB;AACvB,+EAA+E;AAE/E;;;;GAIG;AACH,MAAM,CAAC,MAAM,YAAY,GAAG,wCAAwC,CAAA;AAEpE,MAAM,cAAc,GAA2B;IAC9C,GAAG,EAAE,WAAW;IAChB,GAAG,EAAE,YAAY;IACjB,IAAI,EAAE,YAAY;IAClB,GAAG,EAAE,WAAW;IAChB,IAAI,EAAE,YAAY;IAClB,GAAG,EAAE,eAAe;IACpB,GAAG,EAAE,YAAY;IACjB,IAAI,EAAE,YAAY;IAClB,GAAG,EAAE,WAAW;CAChB,CAAA;AAED,MAAM,UAAU,cAAc,CAAC,QAAgB;IAC9C,MAAM,GAAG,GAAG,QAAQ,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,GAAG,EAAE,EAAE,WAAW,EAAE,CAAA;IACpD,OAAO,cAAc,CAAC,GAAG,IAAI,EAAE,CAAC,IAAI,WAAW,CAAA;AAChD,CAAC;AAED;;;;;;;;;;GAUG;AACH,MAAM,UAAU,mBAAmB,CAAC,IAA0D;IAC7F,MAAM,MAAM,GAAG,IAAI,CAAC,KAAK,GAAG,IAAI,CAAC,MAAM,CAAA;IACvC,IAAI,MAAM,GAAG,gBAAgB;QAAE,OAAO,KAAK,CAAA;IAC3C,MAAM,OAAO,GAAG,IAAI,CAAC,SAAS,GAAG,MAAM,CAAA;IACvC,OAAO,OAAO,IAAI,8BAA8B,CAAA;AACjD,CAAC;AAED;;;;GAIG;AACH,MAAM,CAAC,KAAK,UAAU,kBAAkB,CACvC,QAAgB,EAChB,aAA4B;IAE5B,IAAI,CAAC;QACJ,MAAM,EAAE,MAAM,EAAE,GAAG,MAAM,aAAa,CAAC,QAAQ,CAC9C,YAAY,EACZ,CAAC,IAAI,EAAE,OAAO,EAAE,IAAI,EAAE,QAAQ,EAAE,QAAQ,CAAC,EACzC,EAAE,OAAO,EAAE,MAAM,EAAE,CACnB,CAAA;QACD,MAAM,KAAK,GAAG,MAAM,CAAC,IAAI,EAAE,CAAC,KAAK,CAAC,IAAI,CAAC,CAAA;QACvC,IAAI,KAAK,CAAC,MAAM,GAAG,CAAC;YAAE,OAAO,IAAI,CAAA;QACjC,MAAM,KAAK,GAAG,QAAQ,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAA;QACpC,MAAM,MAAM,GAAG,QAAQ,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAA;QACrC,IAAI,CAAC,MAAM,CAAC,QAAQ,CAAC,KAAK,CAAC,IAAI,CAAC,MAAM,CAAC,QAAQ,CAAC,MAAM,CAAC;YAAE,OAAO,IAAI,CAAA;QACpE,OAAO,EAAE,KAAK,EAAE,MAAM,EAAE,CAAA;IACzB,CAAC;IAAC,MAAM,CAAC;QACR,OAAO,IAAI,CAAA;IACZ,CAAC;AACF,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,uBAAuB,CAC5C,UAAqB,EACrB,cAAsB,EACtB,GAAwB,EACxB,QAA8B,EAC9B,MAAc,EACd,EAAc,EACd,aAA4B;IAE5B,MAAM,UAAU,GAAG,MAAM,UAAU,CAAC,IAAI,CAAC,EAAE,EAAE,EAAE,QAAQ,EAAE,CAAC,EAAE,CAAC,CAAA;IAC7D,IAAI,CAAC,UAAU,CAAC,EAAE;QAAE,OAAO,EAAE,CAAA;IAE7B,MAAM,UAAU,GAAG,UAAU,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,KAAK,MAAM,IAAI,YAAY,CAAC,IAAI,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAA;IAE/F,oEAAoE;IACpE,MAAM,SAAS,GAAG,MAAM,kBAAkB,CAAC,UAAU,EAAE,CAAC,EAAE,KAAK,EAAE,KAAK,EAAE,EAAE;QACzE,MAAM,UAAU,GAAG,UAAU,CAAC,QAAQ,CAAC,KAAK,CAAC,IAAI,CAAC,CAAA;QAClD,IAAI,CAAC,UAAU,CAAC,EAAE;YAAE,OAAO,IAAI,CAAA;QAE/B,IAAI,SAAS,GAAG,CAAC,CAAA;QACjB,IAAI,CAAC;YACJ,SAAS,GAAG,CAAC,MAAM,EAAE,CAAC,IAAI,CAAC,UAAU,CAAC,KAAK,CAAC,CAAC,CAAC,IAAI,CAAA;QACnD,CAAC;QAAC,MAAM,CAAC;YACR,OAAO,IAAI,CAAA;QACZ,CAAC;QAED,MAAM,IAAI,GAAG,MAAM,kBAAkB,CAAC,UAAU,CAAC,KAAK,EAAE,aAAa,CAAC,CAAA;QACtE,IAAI,CAAC,IAAI,EAAE,CAAC;YACX,0EAA0E;YAC1E,OAAO,EAAE,IAAI,EAAE,KAAK,CAAC,IAAI,EAAE,SAAS,EAAE,KAAK,EAAE,CAAC,EAAE,MAAM,EAAE,CAAC,EAAE,IAAI,EAAE,IAAI,EAAE,CAAA;QACxE,CAAC;QAED,MAAM,IAAI,GAAG,mBAAmB,CAAC,EAAE,KAAK,EAAE,IAAI,CAAC,KAAK,EAAE,MAAM,EAAE,IAAI,CAAC,MAAM,EAAE,SAAS,EAAE,CAAC,CAAA;QACvF,OAAO,EAAE,IAAI,EAAE,KAAK,CAAC,IAAI,EAAE,SAAS,EAAE,KAAK,EAAE,IAAI,CAAC,KAAK,EAAE,MAAM,EAAE,IAAI,CAAC,MAAM,EAAE,IAAI,EAAE,CAAA;IACrF,CAAC,CAAC,CAAA;IAEF,MAAM,QAAQ,GAAG,SAAS;SACxB,MAAM,CAAC,CAAC,CAAC,EAA8B,EAAE,CAAC,CAAC,KAAK,IAAI,IAAI,CAAC,CAAC,IAAI,CAAC;SAC/D,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,SAAS,GAAG,CAAC,CAAC,SAAS,CAAC;SACzC,KAAK,CAAC,CAAC,EAAE,UAAU,CAAC,CAAA;IAEtB,MAAM,YAAY,GAAG,SAAS,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,KAAK,IAAI,IAAI,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,MAAM,CAAA;IACxE,IAAI,YAAY,GAAG,CAAC,IAAI,SAAS,CAAC,MAAM,GAAG,UAAU,EAAE,CAAC;QACvD,MAAM,CAAC,IAAI,CAAC,sBAAsB,EAAE;YACnC,MAAM,EAAE,cAAc;YACtB,UAAU,EAAE,UAAU,CAAC,MAAM;YAC7B,mBAAmB,EAAE,UAAU,CAAC,MAAM,GAAG,YAAY;YACrD,QAAQ,EAAE,QAAQ,CAAC,MAAM;YACzB,gBAAgB,EAAE,YAAY;SAC9B,CAAC,CAAA;IACH,CAAC;IAED,MAAM,OAAO,GAAG,MAAM,kBAAkB,CAAC,QAAQ,EAAE,0BAA0B,EAAE,KAAK,EAAE,OAAO,EAAE,EAAE;QAChG,MAAM,aAAa,GAAG,UAAU,CAAC,QAAQ,CAAC,OAAO,CAAC,IAAI,CAAC,CAAA;QACvD,IAAI,CAAC,aAAa,CAAC,EAAE;YAAE,OAAO,IAAI,CAAA;QAElC,MAAM,OAAO,GAAG,cAAc,CAAC,OAAO,CAAC,IAAI,CAAC,CAAA;QAC5C,IAAI,WAAW,GAAG,OAAO,CAAA;QAEzB,MAAM,UAAU,GAAG,QAAQ,CAAC,cAAc,CAAC,OAAO,CAAC,CAAA;QACnD,IAAI,UAAU,EAAE,CAAC;YAChB,MAAM,cAAc,GAAG,MAAM,UAAU,CAAC,OAAO,CAAC,aAAa,CAAC,KAAK,EAAE,OAAO,EAAE;gBAC7E,KAAK,EAAE,GAAG,CAAC,KAAK,CAAC,MAAM,CAAC,GAAG,cAAc,IAAI,OAAO,CAAC,IAAI,OAAO,CAAC;aACjE,CAAC,CAAA;YACF,IAAI,cAAc,CAAC,EAAE,IAAI,cAAc,CAAC,KAAK,CAAC,gBAAgB,EAAE,CAAC;gBAChE,WAAW,GAAG,cAAc,CAAC,KAAK,CAAC,gBAAgB,CAAA;YACpD,CAAC;QACF,CAAC;QAED,OAAO,EAAE,YAAY,EAAE,GAAG,cAAc,IAAI,OAAO,CAAC,IAAI,EAAE,EAAE,WAAW,EAAE,CAAA;IAC1E,CAAC,CAAC,CAAA;IAEF,OAAO,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,EAAsD,EAAE,CAAC,CAAC,KAAK,IAAI,CAAC,CAAA;AAC7F,CAAC"}
@@ -0,0 +1,71 @@
1
+ /**
2
+ * PDF Preprocessor
3
+ *
4
+ * Dedicated PDF pipeline:
5
+ *
6
+ * 1. Text extraction via `pdftotext` (poppler-utils, C++) — ~1 s for a
7
+ * 3 MB PDF. Replaces markitdown/pdfminer.six (~22 s for the same file)
8
+ * because PDFs in practice don't carry the rich markdown structure
9
+ * that justifies the slower backend.
10
+ *
11
+ * 2. Image extraction via `pdfimages -all` — keeps the original embedded
12
+ * format (JPEG stays JPEG) instead of re-encoding everything to PNG
13
+ * (~10× faster, much smaller files).
14
+ *
15
+ * 3. Text and image extraction run in parallel.
16
+ *
17
+ * 4. Images stream into the classifier as soon as `pdfimages` writes them
18
+ * to disk — the classifier doesn't wait for the whole extraction to
19
+ * finish. A density filter (bytes/pixel) drops alpha masks and overlay
20
+ * layers before the vision call.
21
+ */
22
+ import type { Result } from '../../../lib/utils/result.js';
23
+ import type { FileSystem } from '../../../platform/fs.js';
24
+ import type { ProcessRunner } from '../../../platform/process.js';
25
+ import type { Logger } from '../../../lib/logger/logger.js';
26
+ import type { Preprocessor, PreprocessorContext, PreprocessorRegistry, PreprocessorResult } from '../preprocessor.js';
27
+ export interface PdfPreprocessorConfig {
28
+ registry: PreprocessorRegistry;
29
+ logger: Logger;
30
+ fs: FileSystem;
31
+ process: ProcessRunner;
32
+ }
33
+ export declare class PdfPreprocessor implements Preprocessor {
34
+ readonly name = "pdf";
35
+ readonly supportedMimeTypes: string[];
36
+ private readonly registry;
37
+ private readonly logger;
38
+ private readonly fs;
39
+ private readonly processRunner;
40
+ constructor(config: PdfPreprocessorConfig);
41
+ process(filePath: string, mimeType: string, ctx: PreprocessorContext): Promise<Result<PreprocessorResult, Error>>;
42
+ /**
43
+ * Extract plain text via pdftotext. Writes to content.md verbatim — no
44
+ * markdown structure to preserve, but the file extension stays .md for
45
+ * consistency with the markitdown pipeline (downstream consumers expect
46
+ * "content.md" in the upload directory).
47
+ *
48
+ * `-layout` preserves the original visual layout (columns, tables),
49
+ * which is what users typically expect when looking at PDFs.
50
+ */
51
+ private extractText;
52
+ /**
53
+ * Extract images via pdfimages and classify them as they appear on disk.
54
+ *
55
+ * pdfimages writes files atomically per image (open temp, write, rename
56
+ * to final name), so polling `readdir` is safe — we either see a name or
57
+ * we don't, never a half-written file.
58
+ *
59
+ * Streaming overlaps the extraction tail with the first classification
60
+ * batches. Hard cap of MAX_IMAGES applies across the *filtered* set: as
61
+ * soon as MAX_IMAGES images have passed the density filter, further
62
+ * candidates are stat-checked but not classified.
63
+ *
64
+ * `-all` keeps the embedded format (JPEG, JBIG2, JP2). We only classify
65
+ * those Anthropic vision accepts (PNG/JPEG/GIF/WebP); other formats are
66
+ * extracted to disk for reference but skipped at the classification step.
67
+ */
68
+ private extractAndClassifyImages;
69
+ private scanAndDispatch;
70
+ }
71
+ //# sourceMappingURL=pdf-preprocessor.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"pdf-preprocessor.d.ts","sourceRoot":"","sources":["../../../../src/plugins/uploads/preprocessors/pdf-preprocessor.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;GAoBG;AAGH,OAAO,KAAK,EAAE,MAAM,EAAE,MAAM,uBAAuB,CAAA;AAEnD,OAAO,KAAK,EAAE,UAAU,EAAE,MAAM,kBAAkB,CAAA;AAClD,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,uBAAuB,CAAA;AAC1D,OAAO,KAAK,EAAE,MAAM,EAAE,MAAM,+BAA+B,CAAA;AAC3D,OAAO,KAAK,EAAE,YAAY,EAAE,mBAAmB,EAAE,oBAAoB,EAAE,kBAAkB,EAAE,MAAM,oBAAoB,CAAA;AAmBrH,MAAM,WAAW,qBAAqB;IACrC,QAAQ,EAAE,oBAAoB,CAAA;IAC9B,MAAM,EAAE,MAAM,CAAA;IACd,EAAE,EAAE,UAAU,CAAA;IACd,OAAO,EAAE,aAAa,CAAA;CACtB;AAED,qBAAa,eAAgB,YAAW,YAAY;IACnD,QAAQ,CAAC,IAAI,SAAQ;IACrB,QAAQ,CAAC,kBAAkB,WAAuB;IAElD,OAAO,CAAC,QAAQ,CAAC,QAAQ,CAAsB;IAC/C,OAAO,CAAC,QAAQ,CAAC,MAAM,CAAQ;IAC/B,OAAO,CAAC,QAAQ,CAAC,EAAE,CAAY;IAC/B,OAAO,CAAC,QAAQ,CAAC,aAAa,CAAe;gBAEjC,MAAM,EAAE,qBAAqB;IAOnC,OAAO,CACZ,QAAQ,EAAE,MAAM,EAChB,QAAQ,EAAE,MAAM,EAChB,GAAG,EAAE,mBAAmB,GACtB,OAAO,CAAC,MAAM,CAAC,kBAAkB,EAAE,KAAK,CAAC,CAAC;IA8C7C;;;;;;;;OAQG;YACW,WAAW;IA8BzB;;;;;;;;;;;;;;;OAeG;YACW,wBAAwB;YAoJxB,eAAe;CAc7B"}