@roj-ai/sdk 0.1.15 → 0.1.17
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/bootstrap.d.ts.map +1 -1
- package/dist/bootstrap.js +12 -2
- package/dist/bootstrap.js.map +1 -1
- package/dist/core/image/types.d.ts +2 -0
- package/dist/core/image/types.d.ts.map +1 -1
- package/dist/core/image/vips-resizer.d.ts.map +1 -1
- package/dist/core/image/vips-resizer.js +12 -11
- package/dist/core/image/vips-resizer.js.map +1 -1
- package/dist/core/sessions/session.d.ts.map +1 -1
- package/dist/core/sessions/session.js +0 -7
- package/dist/core/sessions/session.js.map +1 -1
- package/dist/plugins/uploads/preprocessors/image-classifier.d.ts +20 -0
- package/dist/plugins/uploads/preprocessors/image-classifier.d.ts.map +1 -1
- package/dist/plugins/uploads/preprocessors/image-classifier.js +93 -28
- package/dist/plugins/uploads/preprocessors/image-classifier.js.map +1 -1
- package/dist/plugins/uploads/preprocessors/index.d.ts +1 -0
- package/dist/plugins/uploads/preprocessors/index.d.ts.map +1 -1
- package/dist/plugins/uploads/preprocessors/index.js +1 -0
- package/dist/plugins/uploads/preprocessors/index.js.map +1 -1
- package/dist/plugins/uploads/preprocessors/markitdown-preprocessor.d.ts +52 -5
- package/dist/plugins/uploads/preprocessors/markitdown-preprocessor.d.ts.map +1 -1
- package/dist/plugins/uploads/preprocessors/markitdown-preprocessor.js +183 -75
- package/dist/plugins/uploads/preprocessors/markitdown-preprocessor.js.map +1 -1
- package/dist/plugins/uploads/preprocessors/pdf-preprocessor.d.ts +71 -0
- package/dist/plugins/uploads/preprocessors/pdf-preprocessor.d.ts.map +1 -0
- package/dist/plugins/uploads/preprocessors/pdf-preprocessor.js +274 -0
- package/dist/plugins/uploads/preprocessors/pdf-preprocessor.js.map +1 -0
- package/package.json +2 -2
- package/src/bootstrap.ts +12 -2
- package/src/core/image/types.ts +2 -0
- package/src/core/image/vips-resizer.ts +12 -11
- package/src/core/sessions/session.ts +0 -8
- package/src/plugins/uploads/preprocessors/image-classifier.ts +108 -29
- package/src/plugins/uploads/preprocessors/index.ts +1 -0
- package/src/plugins/uploads/preprocessors/markitdown-preprocessor.ts +213 -79
- package/src/plugins/uploads/preprocessors/pdf-preprocessor.ts +342 -0
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../../../src/plugins/uploads/preprocessors/index.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,EAAE,iCAAiC,EAA8B,2BAA2B,EAAE,MAAM,uBAAuB,CAAA;AAClI,OAAO,EAAE,sBAAsB,EAAqC,MAAM,8BAA8B,CAAA;AACxG,OAAO,EAAE,eAAe,EAA8B,MAAM,uBAAuB,CAAA"}
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../../../src/plugins/uploads/preprocessors/index.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,EAAE,iCAAiC,EAA8B,2BAA2B,EAAE,MAAM,uBAAuB,CAAA;AAClI,OAAO,EAAE,sBAAsB,EAAqC,MAAM,8BAA8B,CAAA;AACxG,OAAO,EAAE,eAAe,EAA8B,MAAM,uBAAuB,CAAA;AACnF,OAAO,EAAE,eAAe,EAA8B,MAAM,uBAAuB,CAAA"}
|
|
@@ -2,11 +2,14 @@
|
|
|
2
2
|
* Markitdown Preprocessor
|
|
3
3
|
*
|
|
4
4
|
* Converts documents to markdown using Microsoft's markitdown CLI.
|
|
5
|
-
* Supports
|
|
5
|
+
* Supports DOCX, XLSX, PPTX, HTML, CSV, JSON, XML, EPUB, RTF, ODT.
|
|
6
|
+
*
|
|
7
|
+
* PDFs are handled by `PdfPreprocessor` instead — markitdown's PDF backend
|
|
8
|
+
* (pdfminer.six) is ~20× slower than pdftotext for no real gain on the
|
|
9
|
+
* mostly-unstructured PDFs we see in practice.
|
|
6
10
|
*
|
|
7
11
|
* Image extraction:
|
|
8
|
-
* -
|
|
9
|
-
* - DOCX/ODT/EPUB: uses pandoc --extract-media
|
|
12
|
+
* - DOCX/ODT/EPUB: uses pandoc --extract-media (runs in parallel with markitdown)
|
|
10
13
|
*
|
|
11
14
|
* Extracted images are classified via the image classifier preprocessor.
|
|
12
15
|
* Full content is written to disk; extractedContent contains a structured manifest.
|
|
@@ -17,6 +20,18 @@ import type { ProcessRunner } from '../../../platform/process.js';
|
|
|
17
20
|
import type { FileStore } from '../../../core/file-store/types.js';
|
|
18
21
|
import type { Logger } from '../../../lib/logger/logger.js';
|
|
19
22
|
import type { Preprocessor, PreprocessorContext, PreprocessorRegistry, PreprocessorResult } from '../preprocessor.js';
|
|
23
|
+
/**
|
|
24
|
+
* Density filter for extracted images. Bytes-per-pixel ratio below this
|
|
25
|
+
* threshold typically means the image is an alpha mask, overlay layer, or
|
|
26
|
+
* essentially-empty region — not worth a vision call.
|
|
27
|
+
*
|
|
28
|
+
* Empirical reference points:
|
|
29
|
+
* - Dense photo JPEG: 0.3–1.0 B/px
|
|
30
|
+
* - Logo / icon PNG: 0.1–0.5 B/px
|
|
31
|
+
* - Brand PDF layer mask: <0.005 B/px
|
|
32
|
+
*/
|
|
33
|
+
export declare const MIN_IMAGE_DENSITY_BYTES_PER_PX = 0.05;
|
|
34
|
+
export declare const MIN_IMAGE_PIXELS: number;
|
|
20
35
|
export interface MarkitdownPreprocessorConfig {
|
|
21
36
|
registry: PreprocessorRegistry;
|
|
22
37
|
logger: Logger;
|
|
@@ -29,14 +44,46 @@ export declare class MarkitdownPreprocessor implements Preprocessor {
|
|
|
29
44
|
private readonly registry;
|
|
30
45
|
private readonly logger;
|
|
31
46
|
private readonly fs;
|
|
47
|
+
private readonly processRunner;
|
|
32
48
|
private readonly exec;
|
|
33
49
|
constructor(config: MarkitdownPreprocessorConfig);
|
|
34
50
|
process(filePath: string, mimeType: string, ctx: PreprocessorContext): Promise<Result<PreprocessorResult, Error>>;
|
|
51
|
+
private runMarkitdown;
|
|
35
52
|
private extractImagesWithPandoc;
|
|
36
|
-
private extractImagesWithPdfimages;
|
|
37
53
|
}
|
|
54
|
+
/**
|
|
55
|
+
* Recognized by Anthropic vision API. Other pdfimages outputs (pbm, ppm,
|
|
56
|
+
* jb2e, jp2) are ignored — they'd require local conversion before being
|
|
57
|
+
* useful for classification.
|
|
58
|
+
*/
|
|
59
|
+
export declare const IMAGE_EXT_RE: RegExp;
|
|
38
60
|
export declare function guessImageMime(filename: string): string;
|
|
39
|
-
|
|
61
|
+
/**
|
|
62
|
+
* Reject images that are unlikely to carry useful visual information.
|
|
63
|
+
*
|
|
64
|
+
* `bytesPerPixel` filters out alpha masks, sparse overlays, and essentially-
|
|
65
|
+
* empty pages — brand PDFs typically emit a real photo (~1 B/px) plus a
|
|
66
|
+
* matching transparency/overlay layer at the same dimensions but a fraction
|
|
67
|
+
* of a percent of the size (<0.005 B/px).
|
|
68
|
+
*
|
|
69
|
+
* The minimum pixel count protects against tiny icons whose density alone
|
|
70
|
+
* doesn't disqualify them.
|
|
71
|
+
*/
|
|
72
|
+
export declare function shouldClassifyImage(meta: {
|
|
73
|
+
width: number;
|
|
74
|
+
height: number;
|
|
75
|
+
sizeBytes: number;
|
|
76
|
+
}): boolean;
|
|
77
|
+
/**
|
|
78
|
+
* Read image dimensions via vipsheader. Returns null when the tool isn't
|
|
79
|
+
* available or output is unparseable — caller should treat that as
|
|
80
|
+
* "include without filtering".
|
|
81
|
+
*/
|
|
82
|
+
export declare function getImageDimensions(filePath: string, processRunner: ProcessRunner): Promise<{
|
|
83
|
+
width: number;
|
|
84
|
+
height: number;
|
|
85
|
+
} | null>;
|
|
86
|
+
export declare function classifyExtractedImages(imageStore: FileStore, relativePrefix: string, ctx: PreprocessorContext, registry: PreprocessorRegistry, logger: Logger, fs: FileSystem, processRunner: ProcessRunner): Promise<Array<{
|
|
40
87
|
relativePath: string;
|
|
41
88
|
description: string;
|
|
42
89
|
}>>;
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"markitdown-preprocessor.d.ts","sourceRoot":"","sources":["../../../../src/plugins/uploads/preprocessors/markitdown-preprocessor.ts"],"names":[],"mappings":"AAAA
|
|
1
|
+
{"version":3,"file":"markitdown-preprocessor.d.ts","sourceRoot":"","sources":["../../../../src/plugins/uploads/preprocessors/markitdown-preprocessor.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;GAeG;AAIH,OAAO,KAAK,EAAE,MAAM,EAAE,MAAM,uBAAuB,CAAA;AAEnD,OAAO,KAAK,EAAE,UAAU,EAAE,MAAM,kBAAkB,CAAA;AAClD,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,uBAAuB,CAAA;AAC1D,OAAO,KAAK,EAAE,SAAS,EAAE,MAAM,mCAAmC,CAAA;AAClE,OAAO,KAAK,EAAE,MAAM,EAAE,MAAM,+BAA+B,CAAA;AAC3D,OAAO,KAAK,EAAE,YAAY,EAAE,mBAAmB,EAAE,oBAAoB,EAAE,kBAAkB,EAAE,MAAM,oBAAoB,CAAA;AAKrH;;;;;;;;;GASG;AACH,eAAO,MAAM,8BAA8B,OAAO,CAAA;AAClD,eAAO,MAAM,gBAAgB,QAAU,CAAA;AA2CvC,MAAM,WAAW,4BAA4B;IAC5C,QAAQ,EAAE,oBAAoB,CAAA;IAC9B,MAAM,EAAE,MAAM,CAAA;IACd,EAAE,EAAE,UAAU,CAAA;IACd,OAAO,EAAE,aAAa,CAAA;CACtB;AAED,qBAAa,sBAAuB,YAAW,YAAY;IAC1D,QAAQ,CAAC,IAAI,gBAAe;IAC5B,QAAQ,CAAC,kBAAkB,WAAuB;IAElD,OAAO,CAAC,QAAQ,CAAC,QAAQ,CAAsB;IAC/C,OAAO,CAAC,QAAQ,CAAC,MAAM,CAAQ;IAC/B,OAAO,CAAC,QAAQ,CAAC,EAAE,CAAY;IAC/B,OAAO,CAAC,QAAQ,CAAC,aAAa,CAAe;IAC7C,OAAO,CAAC,QAAQ,CAAC,IAAI,CAAkG;gBAE3G,MAAM,EAAE,4BAA4B;IAQ1C,OAAO,CACZ,QAAQ,EAAE,MAAM,EAChB,QAAQ,EAAE,MAAM,EAChB,GAAG,EAAE,mBAAmB,GACtB,OAAO,CAAC,MAAM,CAAC,kBAAkB,EAAE,KAAK,CAAC,CAAC;YAmD/B,aAAa;YAsCb,uBAAuB;CA8CrC;AAMD;;;;GAIG;AACH,eAAO,MAAM,YAAY,QAA2C,CAAA;AAcpE,wBAAgB,cAAc,CAAC,QAAQ,EAAE,MAAM,GAAG,MAAM,CAGvD;AAED;;;;;;;;;;GAUG;AACH,wBAAgB,mBAAmB,CAAC,IAAI,EAAE;IAAE,KAAK,EAAE,MAAM,CAAC;IAAC,MAAM,EAAE,MAAM,CAAC;IAAC,SAAS,EAAE,MAAM,CAAA;CAAE,GAAG,OAAO,CAKvG;AAED;;;;GAIG;AACH,wBAAsB,kBAAkB,CACvC,QAAQ,EAAE,MAAM,EAChB,aAAa,EAAE,aAAa,GAC1B,OAAO,CAAC;IAAE,KAAK,EAAE,MAAM,CAAC;IAAC,MAAM,EAAE,MAAM,CAAA;CAAE,GAAG,IAAI,CAAC,CAgBnD;AAED,wBAAsB,uBAAuB,CAC5C,UAAU,EAAE,SAAS,EACrB,cAAc,EAAE,MAAM,EACtB,GAAG,EAAE,mBAAmB,EACxB,QAAQ,EAAE,oBAAoB,EAC9B,MAAM,EAAE,MAAM,EACd,EAAE,EAAE,UAAU,EACd,aAAa,EAAE,aAAa,GAC1B,OAAO,CAAC,KAAK,CAAC;IAAE,YAAY,EAAE,MAAM,CAAC;IAAC,WAAW,EAAE,MAAM,CAAA;CAAE,CAAC,CAAC,CAiE/D"}
|
|
@@ -2,11 +2,14 @@
|
|
|
2
2
|
* Markitdown Preprocessor
|
|
3
3
|
*
|
|
4
4
|
* Converts documents to markdown using Microsoft's markitdown CLI.
|
|
5
|
-
* Supports
|
|
5
|
+
* Supports DOCX, XLSX, PPTX, HTML, CSV, JSON, XML, EPUB, RTF, ODT.
|
|
6
|
+
*
|
|
7
|
+
* PDFs are handled by `PdfPreprocessor` instead — markitdown's PDF backend
|
|
8
|
+
* (pdfminer.six) is ~20× slower than pdftotext for no real gain on the
|
|
9
|
+
* mostly-unstructured PDFs we see in practice.
|
|
6
10
|
*
|
|
7
11
|
* Image extraction:
|
|
8
|
-
* -
|
|
9
|
-
* - DOCX/ODT/EPUB: uses pandoc --extract-media
|
|
12
|
+
* - DOCX/ODT/EPUB: uses pandoc --extract-media (runs in parallel with markitdown)
|
|
10
13
|
*
|
|
11
14
|
* Extracted images are classified via the image classifier preprocessor.
|
|
12
15
|
* Full content is written to disk; extractedContent contains a structured manifest.
|
|
@@ -14,14 +17,31 @@
|
|
|
14
17
|
import { dirname } from 'node:path';
|
|
15
18
|
import { mapWithConcurrency } from '../../../lib/utils/concurrency.js';
|
|
16
19
|
import { Err, Ok } from '../../../lib/utils/result.js';
|
|
17
|
-
const MAX_IMAGES =
|
|
20
|
+
const MAX_IMAGES = 20;
|
|
18
21
|
const IMAGE_CLASSIFY_CONCURRENCY = 10;
|
|
22
|
+
/**
|
|
23
|
+
* Density filter for extracted images. Bytes-per-pixel ratio below this
|
|
24
|
+
* threshold typically means the image is an alpha mask, overlay layer, or
|
|
25
|
+
* essentially-empty region — not worth a vision call.
|
|
26
|
+
*
|
|
27
|
+
* Empirical reference points:
|
|
28
|
+
* - Dense photo JPEG: 0.3–1.0 B/px
|
|
29
|
+
* - Logo / icon PNG: 0.1–0.5 B/px
|
|
30
|
+
* - Brand PDF layer mask: <0.005 B/px
|
|
31
|
+
*/
|
|
32
|
+
export const MIN_IMAGE_DENSITY_BYTES_PER_PX = 0.05;
|
|
33
|
+
export const MIN_IMAGE_PIXELS = 50 * 50;
|
|
34
|
+
// markitdown converts a text-only document; even large files finish in seconds.
|
|
35
|
+
const MARKITDOWN_TIMEOUT_MS = 60_000;
|
|
36
|
+
// Image extractors (pandoc --extract-media) scale with image count
|
|
37
|
+
// and resolution. Real-world large docs can take 60–90s. Upload preprocessing
|
|
38
|
+
// is async/background, so allow generous headroom.
|
|
39
|
+
const IMAGE_EXTRACT_TIMEOUT_MS = 5 * 60_000;
|
|
19
40
|
function makeExec(processRunner) {
|
|
20
|
-
return (cmd, args) => processRunner.execFile(cmd, args, { timeout:
|
|
41
|
+
return (cmd, args, timeoutMs = MARKITDOWN_TIMEOUT_MS) => processRunner.execFile(cmd, args, { timeout: timeoutMs, maxBuffer: 50 * 1024 * 1024 });
|
|
21
42
|
}
|
|
22
|
-
/** MIME types where markitdown converts to markdown (non-ZIP, non-image) */
|
|
43
|
+
/** MIME types where markitdown converts to markdown (non-ZIP, non-image, non-PDF) */
|
|
23
44
|
const SUPPORTED_MIME_TYPES = [
|
|
24
|
-
'application/pdf',
|
|
25
45
|
'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
|
|
26
46
|
'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
|
|
27
47
|
'application/vnd.openxmlformats-officedocument.presentationml.presentation',
|
|
@@ -46,74 +66,89 @@ const PANDOC_FORMAT_MAP = {
|
|
|
46
66
|
'application/vnd.oasis.opendocument.text': 'odt',
|
|
47
67
|
'application/epub+zip': 'epub',
|
|
48
68
|
};
|
|
49
|
-
/** MIME types where pdfimages can extract images */
|
|
50
|
-
const PDFIMAGES_MIMES = new Set([
|
|
51
|
-
'application/pdf',
|
|
52
|
-
]);
|
|
53
69
|
export class MarkitdownPreprocessor {
|
|
54
70
|
name = 'markitdown';
|
|
55
71
|
supportedMimeTypes = SUPPORTED_MIME_TYPES;
|
|
56
72
|
registry;
|
|
57
73
|
logger;
|
|
58
74
|
fs;
|
|
75
|
+
processRunner;
|
|
59
76
|
exec;
|
|
60
77
|
constructor(config) {
|
|
61
78
|
this.registry = config.registry;
|
|
62
79
|
this.logger = config.logger;
|
|
63
80
|
this.fs = config.fs;
|
|
81
|
+
this.processRunner = config.process;
|
|
64
82
|
this.exec = makeExec(config.process);
|
|
65
83
|
}
|
|
66
84
|
async process(filePath, mimeType, ctx) {
|
|
67
|
-
const
|
|
68
|
-
|
|
69
|
-
// 1. Convert to markdown via markitdown
|
|
85
|
+
const totalStart = Date.now();
|
|
86
|
+
this.logger.info('Markitdown processing started', { filePath, mimeType });
|
|
70
87
|
const contentPathResult = ctx.files.realPath('content.md');
|
|
71
88
|
if (!contentPathResult.ok) {
|
|
72
89
|
return Err(new Error('Failed to resolve output path'));
|
|
73
90
|
}
|
|
91
|
+
await this.fs.mkdir(dirname(contentPathResult.value), { recursive: true });
|
|
92
|
+
// Race markitdown text conversion and image extraction — they're
|
|
93
|
+
// independent, so there's no reason to serialize them. For documents
|
|
94
|
+
// where pandoc extraction isn't applicable, the image task resolves
|
|
95
|
+
// immediately.
|
|
96
|
+
const markdownTask = this.runMarkitdown(filePath, mimeType, contentPathResult.value);
|
|
97
|
+
const imageTask = PANDOC_EXTRACT_MIMES.has(mimeType)
|
|
98
|
+
? this.extractImagesWithPandoc(filePath, mimeType, ctx)
|
|
99
|
+
: Promise.resolve([]);
|
|
100
|
+
const [markdownResult, images] = await Promise.all([markdownTask, imageTask]);
|
|
101
|
+
if (!markdownResult.ok)
|
|
102
|
+
return markdownResult;
|
|
103
|
+
const markdown = markdownResult.value;
|
|
104
|
+
const derivedPaths = ['content.md'];
|
|
105
|
+
const imageEntries = [];
|
|
106
|
+
for (const img of images) {
|
|
107
|
+
derivedPaths.push(img.relativePath);
|
|
108
|
+
imageEntries.push(`- ${img.relativePath} — ${img.description}`);
|
|
109
|
+
}
|
|
110
|
+
const manifestLines = ['Extracted files:'];
|
|
111
|
+
manifestLines.push(`- content.md (markdown, ${markdown.length} chars)`);
|
|
112
|
+
manifestLines.push(...imageEntries);
|
|
113
|
+
this.logger.info('Markitdown processing complete', {
|
|
114
|
+
filePath,
|
|
115
|
+
mimeType,
|
|
116
|
+
contentLength: markdown.length,
|
|
117
|
+
imagesExtracted: imageEntries.length,
|
|
118
|
+
totalDurationMs: Date.now() - totalStart,
|
|
119
|
+
});
|
|
120
|
+
return Ok({
|
|
121
|
+
extractedContent: manifestLines.join('\n'),
|
|
122
|
+
derivedPaths,
|
|
123
|
+
});
|
|
124
|
+
}
|
|
125
|
+
async runMarkitdown(filePath, mimeType, outputPath) {
|
|
126
|
+
const markitdownStart = Date.now();
|
|
74
127
|
try {
|
|
75
|
-
await this.
|
|
76
|
-
await this.exec('markitdown', [filePath, '-o', contentPathResult.value]);
|
|
128
|
+
await this.exec('markitdown', [filePath, '-o', outputPath]);
|
|
77
129
|
}
|
|
78
130
|
catch (error) {
|
|
79
131
|
const message = error instanceof Error ? error.message : String(error);
|
|
132
|
+
this.logger.error('markitdown CLI failed', error instanceof Error ? error : undefined, { filePath, mimeType, durationMs: Date.now() - markitdownStart });
|
|
80
133
|
if (message.includes('ENOENT')) {
|
|
81
134
|
return Err(new Error('markitdown not found. Install with: pip install "markitdown[all]"'));
|
|
82
135
|
}
|
|
83
136
|
return Err(new Error(`markitdown failed: ${message}`));
|
|
84
137
|
}
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
// 2. Extract images based on file type
|
|
89
|
-
if (PANDOC_EXTRACT_MIMES.has(mimeType)) {
|
|
90
|
-
const images = await this.extractImagesWithPandoc(filePath, mimeType, ctx);
|
|
91
|
-
for (const img of images) {
|
|
92
|
-
derivedPaths.push(img.relativePath);
|
|
93
|
-
imageEntries.push(`- ${img.relativePath} — ${img.description}`);
|
|
94
|
-
}
|
|
138
|
+
let markdown = '';
|
|
139
|
+
try {
|
|
140
|
+
markdown = await this.fs.readFile(outputPath, 'utf-8');
|
|
95
141
|
}
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
for (const img of images) {
|
|
99
|
-
derivedPaths.push(img.relativePath);
|
|
100
|
-
imageEntries.push(`- ${img.relativePath} — ${img.description}`);
|
|
101
|
-
}
|
|
142
|
+
catch {
|
|
143
|
+
// Output missing — markitdown completed but produced nothing.
|
|
102
144
|
}
|
|
103
|
-
|
|
104
|
-
const manifestLines = ['Extracted files:'];
|
|
105
|
-
manifestLines.push(`- content.md (markdown, ${markdown.length} chars)`);
|
|
106
|
-
manifestLines.push(...imageEntries);
|
|
107
|
-
this.logger.debug('Markitdown processed', {
|
|
145
|
+
this.logger.info('Markitdown conversion complete', {
|
|
108
146
|
filePath,
|
|
109
147
|
mimeType,
|
|
148
|
+
durationMs: Date.now() - markitdownStart,
|
|
110
149
|
contentLength: markdown.length,
|
|
111
|
-
imagesExtracted: imageEntries.length,
|
|
112
|
-
});
|
|
113
|
-
return Ok({
|
|
114
|
-
extractedContent: manifestLines.join('\n'),
|
|
115
|
-
derivedPaths,
|
|
116
150
|
});
|
|
151
|
+
return Ok(markdown);
|
|
117
152
|
}
|
|
118
153
|
async extractImagesWithPandoc(filePath, mimeType, ctx) {
|
|
119
154
|
const mediaStore = ctx.files.scoped('media');
|
|
@@ -123,43 +158,46 @@ export class MarkitdownPreprocessor {
|
|
|
123
158
|
const format = PANDOC_FORMAT_MAP[mimeType];
|
|
124
159
|
if (!format)
|
|
125
160
|
return [];
|
|
161
|
+
const pandocStart = Date.now();
|
|
162
|
+
let extractSucceeded = true;
|
|
126
163
|
try {
|
|
127
|
-
await this.exec('pandoc', [
|
|
128
|
-
'-f',
|
|
129
|
-
format,
|
|
130
|
-
'-t',
|
|
131
|
-
'gfm',
|
|
132
|
-
filePath,
|
|
133
|
-
'-o',
|
|
134
|
-
'/dev/null',
|
|
135
|
-
`--extract-media=${mediaDirResult.value}`,
|
|
136
|
-
]);
|
|
137
|
-
}
|
|
138
|
-
catch {
|
|
139
|
-
this.logger.warn('pandoc --extract-media failed', { filePath });
|
|
140
|
-
return [];
|
|
164
|
+
await this.exec('pandoc', ['-f', format, '-t', 'gfm', filePath, '-o', '/dev/null', `--extract-media=${mediaDirResult.value}`], IMAGE_EXTRACT_TIMEOUT_MS);
|
|
141
165
|
}
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
try {
|
|
150
|
-
await this.fs.mkdir(imagesDirResult.value, { recursive: true });
|
|
151
|
-
await this.exec('pdfimages', ['-png', filePath, `${imagesDirResult.value}/img`]);
|
|
166
|
+
catch (error) {
|
|
167
|
+
extractSucceeded = false;
|
|
168
|
+
this.logger.warn('pandoc --extract-media failed (will classify any partial output)', {
|
|
169
|
+
filePath,
|
|
170
|
+
durationMs: Date.now() - pandocStart,
|
|
171
|
+
error: error instanceof Error ? error.message : String(error),
|
|
172
|
+
});
|
|
152
173
|
}
|
|
153
|
-
|
|
154
|
-
|
|
174
|
+
if (extractSucceeded) {
|
|
175
|
+
this.logger.info('pandoc --extract-media complete', {
|
|
176
|
+
filePath,
|
|
177
|
+
format,
|
|
178
|
+
durationMs: Date.now() - pandocStart,
|
|
179
|
+
});
|
|
155
180
|
}
|
|
156
|
-
|
|
181
|
+
const classifyStart = Date.now();
|
|
182
|
+
const images = await classifyExtractedImages(mediaStore, 'media', ctx, this.registry, this.logger, this.fs, this.processRunner);
|
|
183
|
+
this.logger.info('Image classification complete', {
|
|
184
|
+
source: 'pandoc',
|
|
185
|
+
count: images.length,
|
|
186
|
+
partial: !extractSucceeded,
|
|
187
|
+
durationMs: Date.now() - classifyStart,
|
|
188
|
+
});
|
|
189
|
+
return images;
|
|
157
190
|
}
|
|
158
191
|
}
|
|
159
192
|
// ============================================================================
|
|
160
193
|
// Shared image helpers
|
|
161
194
|
// ============================================================================
|
|
162
|
-
|
|
195
|
+
/**
|
|
196
|
+
* Recognized by Anthropic vision API. Other pdfimages outputs (pbm, ppm,
|
|
197
|
+
* jb2e, jp2) are ignored — they'd require local conversion before being
|
|
198
|
+
* useful for classification.
|
|
199
|
+
*/
|
|
200
|
+
export const IMAGE_EXT_RE = /\.(png|jpe?g|gif|webp|tiff?|bmp|svg)$/i;
|
|
163
201
|
const IMAGE_MIME_MAP = {
|
|
164
202
|
png: 'image/png',
|
|
165
203
|
jpg: 'image/jpeg',
|
|
@@ -175,15 +213,85 @@ export function guessImageMime(filename) {
|
|
|
175
213
|
const ext = filename.split('.').pop()?.toLowerCase();
|
|
176
214
|
return IMAGE_MIME_MAP[ext ?? ''] ?? 'image/png';
|
|
177
215
|
}
|
|
178
|
-
|
|
216
|
+
/**
|
|
217
|
+
* Reject images that are unlikely to carry useful visual information.
|
|
218
|
+
*
|
|
219
|
+
* `bytesPerPixel` filters out alpha masks, sparse overlays, and essentially-
|
|
220
|
+
* empty pages — brand PDFs typically emit a real photo (~1 B/px) plus a
|
|
221
|
+
* matching transparency/overlay layer at the same dimensions but a fraction
|
|
222
|
+
* of a percent of the size (<0.005 B/px).
|
|
223
|
+
*
|
|
224
|
+
* The minimum pixel count protects against tiny icons whose density alone
|
|
225
|
+
* doesn't disqualify them.
|
|
226
|
+
*/
|
|
227
|
+
export function shouldClassifyImage(meta) {
|
|
228
|
+
const pixels = meta.width * meta.height;
|
|
229
|
+
if (pixels < MIN_IMAGE_PIXELS)
|
|
230
|
+
return false;
|
|
231
|
+
const density = meta.sizeBytes / pixels;
|
|
232
|
+
return density >= MIN_IMAGE_DENSITY_BYTES_PER_PX;
|
|
233
|
+
}
|
|
234
|
+
/**
|
|
235
|
+
* Read image dimensions via vipsheader. Returns null when the tool isn't
|
|
236
|
+
* available or output is unparseable — caller should treat that as
|
|
237
|
+
* "include without filtering".
|
|
238
|
+
*/
|
|
239
|
+
export async function getImageDimensions(filePath, processRunner) {
|
|
240
|
+
try {
|
|
241
|
+
const { stdout } = await processRunner.execFile('vipsheader', ['-f', 'width', '-f', 'height', filePath], { timeout: 10_000 });
|
|
242
|
+
const lines = stdout.trim().split('\n');
|
|
243
|
+
if (lines.length < 2)
|
|
244
|
+
return null;
|
|
245
|
+
const width = parseInt(lines[0], 10);
|
|
246
|
+
const height = parseInt(lines[1], 10);
|
|
247
|
+
if (!Number.isFinite(width) || !Number.isFinite(height))
|
|
248
|
+
return null;
|
|
249
|
+
return { width, height };
|
|
250
|
+
}
|
|
251
|
+
catch {
|
|
252
|
+
return null;
|
|
253
|
+
}
|
|
254
|
+
}
|
|
255
|
+
export async function classifyExtractedImages(imageStore, relativePrefix, ctx, registry, logger, fs, processRunner) {
|
|
179
256
|
const listResult = await imageStore.list('', { maxDepth: 3 });
|
|
180
257
|
if (!listResult.ok)
|
|
181
258
|
return [];
|
|
182
|
-
const
|
|
183
|
-
|
|
184
|
-
|
|
259
|
+
const candidates = listResult.value.filter(e => e.type === 'file' && IMAGE_EXT_RE.test(e.name));
|
|
260
|
+
// Stat + density filter, then keep the top MAX_IMAGES by file size.
|
|
261
|
+
const inspected = await mapWithConcurrency(candidates, 8, async (entry) => {
|
|
262
|
+
const pathResult = imageStore.realPath(entry.name);
|
|
263
|
+
if (!pathResult.ok)
|
|
264
|
+
return null;
|
|
265
|
+
let sizeBytes = 0;
|
|
266
|
+
try {
|
|
267
|
+
sizeBytes = (await fs.stat(pathResult.value)).size;
|
|
268
|
+
}
|
|
269
|
+
catch {
|
|
270
|
+
return null;
|
|
271
|
+
}
|
|
272
|
+
const dims = await getImageDimensions(pathResult.value, processRunner);
|
|
273
|
+
if (!dims) {
|
|
274
|
+
// Unknown dims — include but warn; better to classify than silently drop.
|
|
275
|
+
return { name: entry.name, sizeBytes, width: 0, height: 0, kept: true };
|
|
276
|
+
}
|
|
277
|
+
const kept = shouldClassifyImage({ width: dims.width, height: dims.height, sizeBytes });
|
|
278
|
+
return { name: entry.name, sizeBytes, width: dims.width, height: dims.height, kept };
|
|
279
|
+
});
|
|
280
|
+
const filtered = inspected
|
|
281
|
+
.filter((r) => r !== null && r.kept)
|
|
282
|
+
.sort((a, b) => b.sizeBytes - a.sizeBytes)
|
|
185
283
|
.slice(0, MAX_IMAGES);
|
|
186
|
-
const
|
|
284
|
+
const droppedCount = inspected.filter(r => r !== null && !r.kept).length;
|
|
285
|
+
if (droppedCount > 0 || inspected.length > MAX_IMAGES) {
|
|
286
|
+
logger.info('Image filter applied', {
|
|
287
|
+
source: relativePrefix,
|
|
288
|
+
candidates: candidates.length,
|
|
289
|
+
passedDensityFilter: candidates.length - droppedCount,
|
|
290
|
+
selected: filtered.length,
|
|
291
|
+
droppedByDensity: droppedCount,
|
|
292
|
+
});
|
|
293
|
+
}
|
|
294
|
+
const settled = await mapWithConcurrency(filtered, IMAGE_CLASSIFY_CONCURRENCY, async (imgFile) => {
|
|
187
295
|
const imgPathResult = imageStore.realPath(imgFile.name);
|
|
188
296
|
if (!imgPathResult.ok)
|
|
189
297
|
return null;
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"markitdown-preprocessor.js","sourceRoot":"","sources":["../../../../src/plugins/uploads/preprocessors/markitdown-preprocessor.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;GAYG;AAEH,OAAO,EAAE,OAAO,EAAE,MAAM,WAAW,CAAA;AACnC,OAAO,EAAE,kBAAkB,EAAE,MAAM,4BAA4B,CAAA;AAE/D,OAAO,EAAE,GAAG,EAAE,EAAE,EAAE,MAAM,uBAAuB,CAAA;AAO/C,MAAM,UAAU,GAAG,EAAE,CAAA;AACrB,MAAM,0BAA0B,GAAG,EAAE,CAAA;AAErC,SAAS,QAAQ,CAAC,aAA4B;IAC7C,OAAO,CAAC,GAAW,EAAE,IAAc,EAAE,EAAE,CAAC,aAAa,CAAC,QAAQ,CAAC,GAAG,EAAE,IAAI,EAAE,EAAE,OAAO,EAAE,MAAM,EAAE,SAAS,EAAE,EAAE,GAAG,IAAI,GAAG,IAAI,EAAE,CAAC,CAAA;AAC5H,CAAC;AAED,4EAA4E;AAC5E,MAAM,oBAAoB,GAAG;IAC5B,iBAAiB;IACjB,yEAAyE;IACzE,mEAAmE;IACnE,2EAA2E;IAC3E,yCAAyC;IACzC,iBAAiB;IACjB,sBAAsB;IACtB,WAAW;IACX,uBAAuB;IACvB,UAAU;IACV,kBAAkB;IAClB,iBAAiB;IACjB,UAAU;CACV,CAAA;AAED,yDAAyD;AACzD,MAAM,oBAAoB,GAAG,IAAI,GAAG,CAAC;IACpC,yEAAyE;IACzE,yCAAyC;IACzC,sBAAsB;CACtB,CAAC,CAAA;AAEF,MAAM,iBAAiB,GAA2B;IACjD,yEAAyE,EAAE,MAAM;IACjF,yCAAyC,EAAE,KAAK;IAChD,sBAAsB,EAAE,MAAM;CAC9B,CAAA;AAED,oDAAoD;AACpD,MAAM,eAAe,GAAG,IAAI,GAAG,CAAC;IAC/B,iBAAiB;CACjB,CAAC,CAAA;AASF,MAAM,OAAO,sBAAsB;IACzB,IAAI,GAAG,YAAY,CAAA;IACnB,kBAAkB,GAAG,oBAAoB,CAAA;IAEjC,QAAQ,CAAsB;IAC9B,MAAM,CAAQ;IACd,EAAE,CAAY;IACd,IAAI,CAA8E;IAEnG,YAAY,MAAoC;QAC/C,IAAI,CAAC,QAAQ,GAAG,MAAM,CAAC,QAAQ,CAAA;QAC/B,IAAI,CAAC,MAAM,GAAG,MAAM,CAAC,MAAM,CAAA;QAC3B,IAAI,CAAC,EAAE,GAAG,MAAM,CAAC,EAAE,CAAA;QACnB,IAAI,CAAC,IAAI,GAAG,QAAQ,CAAC,MAAM,CAAC,OAAO,CAAC,CAAA;IACrC,CAAC;IAED,KAAK,CAAC,OAAO,CACZ,QAAgB,EAChB,QAAgB,EAChB,GAAwB;QAExB,MAAM,YAAY,GAAa,EAAE,CAAA;QACjC,MAAM,YAAY,GAAa,EAAE,CAAA;QAEjC,wCAAwC;QACxC,MAAM,iBAAiB,GAAG,GAAG,CAAC,KAAK,CAAC,QAAQ,CAAC,YAAY,CAAC,CAAA;QAC1D,IAAI,CAAC,iBAAiB,CAAC,EAAE,EAAE,CAAC;YAC3B,OAAO,GAAG,CAAC,IAAI,KAAK,CAAC,+BAA+B,CAAC,CAAC,CAAA;QACvD,CAAC;QAED,IAAI,CAAC;YACJ,MAAM,IAAI,CAAC,EAAE,CAAC,KAAK,CAAC,OAAO,CAAC,iBAAiB,CAAC,KAAK,CAAC,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAA;YAC1E,MAAM,IAAI,CAAC,IAAI,CAAC,YAAY,EAAE,CAAC,QAAQ,EAAE,IAAI,EAAE,iBAAiB,CAAC,KAAK,CAAC,CAAC,CAAA;QACzE,CA
AC;QAAC,OAAO,KAAK,EAAE,CAAC;YAChB,MAAM,OAAO,GAAG,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,CAAA;YACtE,IAAI,OAAO,CAAC,QAAQ,CAAC,QAAQ,CAAC,EAAE,CAAC;gBAChC,OAAO,GAAG,CAAC,IAAI,KAAK,CAAC,mEAAmE,CAAC,CAAC,CAAA;YAC3F,CAAC;YACD,OAAO,GAAG,CAAC,IAAI,KAAK,CAAC,sBAAsB,OAAO,EAAE,CAAC,CAAC,CAAA;QACvD,CAAC;QAED,MAAM,aAAa,GAAG,MAAM,GAAG,CAAC,KAAK,CAAC,IAAI,CAAC,YAAY,CAAC,CAAA;QACxD,MAAM,QAAQ,GAAG,aAAa,CAAC,EAAE,CAAC,CAAC,CAAC,aAAa,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,CAAA;QAE5D,YAAY,CAAC,IAAI,CAAC,YAAY,CAAC,CAAA;QAE/B,uCAAuC;QACvC,IAAI,oBAAoB,CAAC,GAAG,CAAC,QAAQ,CAAC,EAAE,CAAC;YACxC,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,uBAAuB,CAAC,QAAQ,EAAE,QAAQ,EAAE,GAAG,CAAC,CAAA;YAC1E,KAAK,MAAM,GAAG,IAAI,MAAM,EAAE,CAAC;gBAC1B,YAAY,CAAC,IAAI,CAAC,GAAG,CAAC,YAAY,CAAC,CAAA;gBACnC,YAAY,CAAC,IAAI,CAAC,KAAK,GAAG,CAAC,YAAY,MAAM,GAAG,CAAC,WAAW,EAAE,CAAC,CAAA;YAChE,CAAC;QACF,CAAC;aAAM,IAAI,eAAe,CAAC,GAAG,CAAC,QAAQ,CAAC,EAAE,CAAC;YAC1C,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,0BAA0B,CAAC,QAAQ,EAAE,GAAG,CAAC,CAAA;YACnE,KAAK,MAAM,GAAG,IAAI,MAAM,EAAE,CAAC;gBAC1B,YAAY,CAAC,IAAI,CAAC,GAAG,CAAC,YAAY,CAAC,CAAA;gBACnC,YAAY,CAAC,IAAI,CAAC,KAAK,GAAG,CAAC,YAAY,MAAM,GAAG,CAAC,WAAW,EAAE,CAAC,CAAA;YAChE,CAAC;QACF,CAAC;QAED,oBAAoB;QACpB,MAAM,aAAa,GAAa,CAAC,kBAAkB,CAAC,CAAA;QACpD,aAAa,CAAC,IAAI,CAAC,2BAA2B,QAAQ,CAAC,MAAM,SAAS,CAAC,CAAA;QACvE,aAAa,CAAC,IAAI,CAAC,GAAG,YAAY,CAAC,CAAA;QAEnC,IAAI,CAAC,MAAM,CAAC,KAAK,CAAC,sBAAsB,EAAE;YACzC,QAAQ;YACR,QAAQ;YACR,aAAa,EAAE,QAAQ,CAAC,MAAM;YAC9B,eAAe,EAAE,YAAY,CAAC,MAAM;SACpC,CAAC,CAAA;QAEF,OAAO,EAAE,CAAC;YACT,gBAAgB,EAAE,aAAa,CAAC,IAAI,CAAC,IAAI,CAAC;YAC1C,YAAY;SACZ,CAAC,CAAA;IACH,CAAC;IAEO,KAAK,CAAC,uBAAuB,CACpC,QAAgB,EAChB,QAAgB,EAChB,GAAwB;QAExB,MAAM,UAAU,GAAG,GAAG,CAAC,KAAK,CAAC,MAAM,CAAC,OAAO,CAAC,CAAA;QAC5C,MAAM,cAAc,GAAG,UAAU,CAAC,QAAQ,CAAC,EAAE,CAAC,CAAA;QAC9C,IAAI,CAAC,cAAc,CAAC,EAAE;YAAE,OAAO,EAAE,CAAA;QAEjC,MAAM,MAAM,GAAG,iBAAiB,CAAC,QAAQ,CAAC,CAAA;QAC1C,IAAI,CAAC,MAAM;YAAE,OAAO,EAAE,CAAA;QAEtB,IAAI,CAAC;YACJ,MAAM,IAAI,CAAC,IAAI,CAAC,QAAQ,EAAE;gBACzB,IAAI;
gBACJ,MAAM;gBACN,IAAI;gBACJ,KAAK;gBACL,QAAQ;gBACR,IAAI;gBACJ,WAAW;gBACX,mBAAmB,cAAc,CAAC,KAAK,EAAE;aACzC,CAAC,CAAA;QACH,CAAC;QAAC,MAAM,CAAC;YACR,IAAI,CAAC,MAAM,CAAC,IAAI,CAAC,+BAA+B,EAAE,EAAE,QAAQ,EAAE,CAAC,CAAA;YAC/D,OAAO,EAAE,CAAA;QACV,CAAC;QAED,OAAO,uBAAuB,CAAC,UAAU,EAAE,OAAO,EAAE,GAAG,EAAE,IAAI,CAAC,QAAQ,EAAE,IAAI,CAAC,MAAM,CAAC,CAAA;IACrF,CAAC;IAEO,KAAK,CAAC,0BAA0B,CACvC,QAAgB,EAChB,GAAwB;QAExB,MAAM,UAAU,GAAG,GAAG,CAAC,KAAK,CAAC,MAAM,CAAC,QAAQ,CAAC,CAAA;QAC7C,MAAM,eAAe,GAAG,UAAU,CAAC,QAAQ,CAAC,EAAE,CAAC,CAAA;QAC/C,IAAI,CAAC,eAAe,CAAC,EAAE;YAAE,OAAO,EAAE,CAAA;QAElC,IAAI,CAAC;YACJ,MAAM,IAAI,CAAC,EAAE,CAAC,KAAK,CAAC,eAAe,CAAC,KAAK,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAA;YAC/D,MAAM,IAAI,CAAC,IAAI,CAAC,WAAW,EAAE,CAAC,MAAM,EAAE,QAAQ,EAAE,GAAG,eAAe,CAAC,KAAK,MAAM,CAAC,CAAC,CAAA;QACjF,CAAC;QAAC,MAAM,CAAC;YACR,OAAO,EAAE,CAAA;QACV,CAAC;QAED,OAAO,uBAAuB,CAAC,UAAU,EAAE,QAAQ,EAAE,GAAG,EAAE,IAAI,CAAC,QAAQ,EAAE,IAAI,CAAC,MAAM,CAAC,CAAA;IACtF,CAAC;CACD;AAED,+EAA+E;AAC/E,uBAAuB;AACvB,+EAA+E;AAE/E,MAAM,YAAY,GAAG,wCAAwC,CAAA;AAE7D,MAAM,cAAc,GAA2B;IAC9C,GAAG,EAAE,WAAW;IAChB,GAAG,EAAE,YAAY;IACjB,IAAI,EAAE,YAAY;IAClB,GAAG,EAAE,WAAW;IAChB,IAAI,EAAE,YAAY;IAClB,GAAG,EAAE,eAAe;IACpB,GAAG,EAAE,YAAY;IACjB,IAAI,EAAE,YAAY;IAClB,GAAG,EAAE,WAAW;CAChB,CAAA;AAED,MAAM,UAAU,cAAc,CAAC,QAAgB;IAC9C,MAAM,GAAG,GAAG,QAAQ,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,GAAG,EAAE,EAAE,WAAW,EAAE,CAAA;IACpD,OAAO,cAAc,CAAC,GAAG,IAAI,EAAE,CAAC,IAAI,WAAW,CAAA;AAChD,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,uBAAuB,CAC5C,UAAqB,EACrB,cAAsB,EACtB,GAAwB,EACxB,QAA8B,EAC9B,MAAc;IAEd,MAAM,UAAU,GAAG,MAAM,UAAU,CAAC,IAAI,CAAC,EAAE,EAAE,EAAE,QAAQ,EAAE,CAAC,EAAE,CAAC,CAAA;IAC7D,IAAI,CAAC,UAAU,CAAC,EAAE;QAAE,OAAO,EAAE,CAAA;IAE7B,MAAM,UAAU,GAAG,UAAU,CAAC,KAAK;SACjC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,KAAK,MAAM,IAAI,YAAY,CAAC,IAAI,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC;SAC3D,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,aAAa,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC;SAC5C,KAAK,CAAC,CAAC,EAAE,UAAU,CAAC,CAAA;IAEtB,MAAM,OAAO,GAAG,MAAM,kBAAkB,CAAC,UAA
U,EAAE,0BAA0B,EAAE,KAAK,EAAE,OAAO,EAAE,EAAE;QAClG,MAAM,aAAa,GAAG,UAAU,CAAC,QAAQ,CAAC,OAAO,CAAC,IAAI,CAAC,CAAA;QACvD,IAAI,CAAC,aAAa,CAAC,EAAE;YAAE,OAAO,IAAI,CAAA;QAElC,MAAM,OAAO,GAAG,cAAc,CAAC,OAAO,CAAC,IAAI,CAAC,CAAA;QAC5C,IAAI,WAAW,GAAG,OAAO,CAAA;QAEzB,MAAM,UAAU,GAAG,QAAQ,CAAC,cAAc,CAAC,OAAO,CAAC,CAAA;QACnD,IAAI,UAAU,EAAE,CAAC;YAChB,MAAM,cAAc,GAAG,MAAM,UAAU,CAAC,OAAO,CAAC,aAAa,CAAC,KAAK,EAAE,OAAO,EAAE;gBAC7E,KAAK,EAAE,GAAG,CAAC,KAAK,CAAC,MAAM,CAAC,GAAG,cAAc,IAAI,OAAO,CAAC,IAAI,OAAO,CAAC;aACjE,CAAC,CAAA;YACF,IAAI,cAAc,CAAC,EAAE,IAAI,cAAc,CAAC,KAAK,CAAC,gBAAgB,EAAE,CAAC;gBAChE,WAAW,GAAG,cAAc,CAAC,KAAK,CAAC,gBAAgB,CAAA;YACpD,CAAC;QACF,CAAC;QAED,OAAO,EAAE,YAAY,EAAE,GAAG,cAAc,IAAI,OAAO,CAAC,IAAI,EAAE,EAAE,WAAW,EAAE,CAAA;IAC1E,CAAC,CAAC,CAAA;IAEF,OAAO,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,EAAsD,EAAE,CAAC,CAAC,KAAK,IAAI,CAAC,CAAA;AAC7F,CAAC"}
|
|
1
|
+
{"version":3,"file":"markitdown-preprocessor.js","sourceRoot":"","sources":["../../../../src/plugins/uploads/preprocessors/markitdown-preprocessor.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;GAeG;AAEH,OAAO,EAAE,OAAO,EAAE,MAAM,WAAW,CAAA;AACnC,OAAO,EAAE,kBAAkB,EAAE,MAAM,4BAA4B,CAAA;AAE/D,OAAO,EAAE,GAAG,EAAE,EAAE,EAAE,MAAM,uBAAuB,CAAA;AAO/C,MAAM,UAAU,GAAG,EAAE,CAAA;AACrB,MAAM,0BAA0B,GAAG,EAAE,CAAA;AAErC;;;;;;;;;GASG;AACH,MAAM,CAAC,MAAM,8BAA8B,GAAG,IAAI,CAAA;AAClD,MAAM,CAAC,MAAM,gBAAgB,GAAG,EAAE,GAAG,EAAE,CAAA;AAEvC,gFAAgF;AAChF,MAAM,qBAAqB,GAAG,MAAM,CAAA;AACpC,mEAAmE;AACnE,8EAA8E;AAC9E,mDAAmD;AACnD,MAAM,wBAAwB,GAAG,CAAC,GAAG,MAAM,CAAA;AAE3C,SAAS,QAAQ,CAAC,aAA4B;IAC7C,OAAO,CAAC,GAAW,EAAE,IAAc,EAAE,YAAoB,qBAAqB,EAAE,EAAE,CACjF,aAAa,CAAC,QAAQ,CAAC,GAAG,EAAE,IAAI,EAAE,EAAE,OAAO,EAAE,SAAS,EAAE,SAAS,EAAE,EAAE,GAAG,IAAI,GAAG,IAAI,EAAE,CAAC,CAAA;AACxF,CAAC;AAED,qFAAqF;AACrF,MAAM,oBAAoB,GAAG;IAC5B,yEAAyE;IACzE,mEAAmE;IACnE,2EAA2E;IAC3E,yCAAyC;IACzC,iBAAiB;IACjB,sBAAsB;IACtB,WAAW;IACX,uBAAuB;IACvB,UAAU;IACV,kBAAkB;IAClB,iBAAiB;IACjB,UAAU;CACV,CAAA;AAED,yDAAyD;AACzD,MAAM,oBAAoB,GAAG,IAAI,GAAG,CAAC;IACpC,yEAAyE;IACzE,yCAAyC;IACzC,sBAAsB;CACtB,CAAC,CAAA;AAEF,MAAM,iBAAiB,GAA2B;IACjD,yEAAyE,EAAE,MAAM;IACjF,yCAAyC,EAAE,KAAK;IAChD,sBAAsB,EAAE,MAAM;CAC9B,CAAA;AASD,MAAM,OAAO,sBAAsB;IACzB,IAAI,GAAG,YAAY,CAAA;IACnB,kBAAkB,GAAG,oBAAoB,CAAA;IAEjC,QAAQ,CAAsB;IAC9B,MAAM,CAAQ;IACd,EAAE,CAAY;IACd,aAAa,CAAe;IAC5B,IAAI,CAAkG;IAEvH,YAAY,MAAoC;QAC/C,IAAI,CAAC,QAAQ,GAAG,MAAM,CAAC,QAAQ,CAAA;QAC/B,IAAI,CAAC,MAAM,GAAG,MAAM,CAAC,MAAM,CAAA;QAC3B,IAAI,CAAC,EAAE,GAAG,MAAM,CAAC,EAAE,CAAA;QACnB,IAAI,CAAC,aAAa,GAAG,MAAM,CAAC,OAAO,CAAA;QACnC,IAAI,CAAC,IAAI,GAAG,QAAQ,CAAC,MAAM,CAAC,OAAO,CAAC,CAAA;IACrC,CAAC;IAED,KAAK,CAAC,OAAO,CACZ,QAAgB,EAChB,QAAgB,EAChB,GAAwB;QAExB,MAAM,UAAU,GAAG,IAAI,CAAC,GAAG,EAAE,CAAA;QAE7B,IAAI,CAAC,MAAM,CAAC,IAAI,CAAC,+BAA+B,EAAE,EAAE,QAAQ,EAAE,QAAQ,EAAE,CAAC,CAAA;QAEzE,MAAM,iBAAiB,GAAG,GAAG,CAAC,KAAK,CAAC,QAAQ,CAAC,YAAY,CAAC,CAAA;QAC1D,IAAI,CAAC,iBAAiB,CAAC,EAAE,EAAE,CAAC;YAC3B,OAAO,G
AAG,CAAC,IAAI,KAAK,CAAC,+BAA+B,CAAC,CAAC,CAAA;QACvD,CAAC;QACD,MAAM,IAAI,CAAC,EAAE,CAAC,KAAK,CAAC,OAAO,CAAC,iBAAiB,CAAC,KAAK,CAAC,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAA;QAE1E,iEAAiE;QACjE,qEAAqE;QACrE,oEAAoE;QACpE,eAAe;QACf,MAAM,YAAY,GAAG,IAAI,CAAC,aAAa,CAAC,QAAQ,EAAE,QAAQ,EAAE,iBAAiB,CAAC,KAAK,CAAC,CAAA;QACpF,MAAM,SAAS,GAAG,oBAAoB,CAAC,GAAG,CAAC,QAAQ,CAAC;YACnD,CAAC,CAAC,IAAI,CAAC,uBAAuB,CAAC,QAAQ,EAAE,QAAQ,EAAE,GAAG,CAAC;YACvD,CAAC,CAAC,OAAO,CAAC,OAAO,CAAuD,EAAE,CAAC,CAAA;QAE5E,MAAM,CAAC,cAAc,EAAE,MAAM,CAAC,GAAG,MAAM,OAAO,CAAC,GAAG,CAAC,CAAC,YAAY,EAAE,SAAS,CAAC,CAAC,CAAA;QAE7E,IAAI,CAAC,cAAc,CAAC,EAAE;YAAE,OAAO,cAAc,CAAA;QAE7C,MAAM,QAAQ,GAAG,cAAc,CAAC,KAAK,CAAA;QAErC,MAAM,YAAY,GAAa,CAAC,YAAY,CAAC,CAAA;QAC7C,MAAM,YAAY,GAAa,EAAE,CAAA;QACjC,KAAK,MAAM,GAAG,IAAI,MAAM,EAAE,CAAC;YAC1B,YAAY,CAAC,IAAI,CAAC,GAAG,CAAC,YAAY,CAAC,CAAA;YACnC,YAAY,CAAC,IAAI,CAAC,KAAK,GAAG,CAAC,YAAY,MAAM,GAAG,CAAC,WAAW,EAAE,CAAC,CAAA;QAChE,CAAC;QAED,MAAM,aAAa,GAAa,CAAC,kBAAkB,CAAC,CAAA;QACpD,aAAa,CAAC,IAAI,CAAC,2BAA2B,QAAQ,CAAC,MAAM,SAAS,CAAC,CAAA;QACvE,aAAa,CAAC,IAAI,CAAC,GAAG,YAAY,CAAC,CAAA;QAEnC,IAAI,CAAC,MAAM,CAAC,IAAI,CAAC,gCAAgC,EAAE;YAClD,QAAQ;YACR,QAAQ;YACR,aAAa,EAAE,QAAQ,CAAC,MAAM;YAC9B,eAAe,EAAE,YAAY,CAAC,MAAM;YACpC,eAAe,EAAE,IAAI,CAAC,GAAG,EAAE,GAAG,UAAU;SACxC,CAAC,CAAA;QAEF,OAAO,EAAE,CAAC;YACT,gBAAgB,EAAE,aAAa,CAAC,IAAI,CAAC,IAAI,CAAC;YAC1C,YAAY;SACZ,CAAC,CAAA;IACH,CAAC;IAEO,KAAK,CAAC,aAAa,CAC1B,QAAgB,EAChB,QAAgB,EAChB,UAAkB;QAElB,MAAM,eAAe,GAAG,IAAI,CAAC,GAAG,EAAE,CAAA;QAClC,IAAI,CAAC;YACJ,MAAM,IAAI,CAAC,IAAI,CAAC,YAAY,EAAE,CAAC,QAAQ,EAAE,IAAI,EAAE,UAAU,CAAC,CAAC,CAAA;QAC5D,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YAChB,MAAM,OAAO,GAAG,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,CAAA;YACtE,IAAI,CAAC,MAAM,CAAC,KAAK,CAChB,uBAAuB,EACvB,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,SAAS,EAC1C,EAAE,QAAQ,EAAE,QAAQ,EAAE,UAAU,EAAE,IAAI,CAAC,GAAG,EAAE,GAAG,eAAe,EAAE,CAChE,CAAA;YACD,IAAI,OAAO,CAAC,QAAQ,CAAC,QAAQ,CAAC,EAAE,CAAC;gBAChC,OAAO,GAAG,CAAC,IAAI,KAAK,CAAC,mE
AAmE,CAAC,CAAC,CAAA;YAC3F,CAAC;YACD,OAAO,GAAG,CAAC,IAAI,KAAK,CAAC,sBAAsB,OAAO,EAAE,CAAC,CAAC,CAAA;QACvD,CAAC;QAED,IAAI,QAAQ,GAAG,EAAE,CAAA;QACjB,IAAI,CAAC;YACJ,QAAQ,GAAG,MAAM,IAAI,CAAC,EAAE,CAAC,QAAQ,CAAC,UAAU,EAAE,OAAO,CAAC,CAAA;QACvD,CAAC;QAAC,MAAM,CAAC;YACR,8DAA8D;QAC/D,CAAC;QAED,IAAI,CAAC,MAAM,CAAC,IAAI,CAAC,gCAAgC,EAAE;YAClD,QAAQ;YACR,QAAQ;YACR,UAAU,EAAE,IAAI,CAAC,GAAG,EAAE,GAAG,eAAe;YACxC,aAAa,EAAE,QAAQ,CAAC,MAAM;SAC9B,CAAC,CAAA;QAEF,OAAO,EAAE,CAAC,QAAQ,CAAC,CAAA;IACpB,CAAC;IAEO,KAAK,CAAC,uBAAuB,CACpC,QAAgB,EAChB,QAAgB,EAChB,GAAwB;QAExB,MAAM,UAAU,GAAG,GAAG,CAAC,KAAK,CAAC,MAAM,CAAC,OAAO,CAAC,CAAA;QAC5C,MAAM,cAAc,GAAG,UAAU,CAAC,QAAQ,CAAC,EAAE,CAAC,CAAA;QAC9C,IAAI,CAAC,cAAc,CAAC,EAAE;YAAE,OAAO,EAAE,CAAA;QAEjC,MAAM,MAAM,GAAG,iBAAiB,CAAC,QAAQ,CAAC,CAAA;QAC1C,IAAI,CAAC,MAAM;YAAE,OAAO,EAAE,CAAA;QAEtB,MAAM,WAAW,GAAG,IAAI,CAAC,GAAG,EAAE,CAAA;QAC9B,IAAI,gBAAgB,GAAG,IAAI,CAAA;QAC3B,IAAI,CAAC;YACJ,MAAM,IAAI,CAAC,IAAI,CACd,QAAQ,EACR,CAAC,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,KAAK,EAAE,QAAQ,EAAE,IAAI,EAAE,WAAW,EAAE,mBAAmB,cAAc,CAAC,KAAK,EAAE,CAAC,EACnG,wBAAwB,CACxB,CAAA;QACF,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YAChB,gBAAgB,GAAG,KAAK,CAAA;YACxB,IAAI,CAAC,MAAM,CAAC,IAAI,CAAC,kEAAkE,EAAE;gBACpF,QAAQ;gBACR,UAAU,EAAE,IAAI,CAAC,GAAG,EAAE,GAAG,WAAW;gBACpC,KAAK,EAAE,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC;aAC7D,CAAC,CAAA;QACH,CAAC;QACD,IAAI,gBAAgB,EAAE,CAAC;YACtB,IAAI,CAAC,MAAM,CAAC,IAAI,CAAC,iCAAiC,EAAE;gBACnD,QAAQ;gBACR,MAAM;gBACN,UAAU,EAAE,IAAI,CAAC,GAAG,EAAE,GAAG,WAAW;aACpC,CAAC,CAAA;QACH,CAAC;QAED,MAAM,aAAa,GAAG,IAAI,CAAC,GAAG,EAAE,CAAA;QAChC,MAAM,MAAM,GAAG,MAAM,uBAAuB,CAAC,UAAU,EAAE,OAAO,EAAE,GAAG,EAAE,IAAI,CAAC,QAAQ,EAAE,IAAI,CAAC,MAAM,EAAE,IAAI,CAAC,EAAE,EAAE,IAAI,CAAC,aAAa,CAAC,CAAA;QAC/H,IAAI,CAAC,MAAM,CAAC,IAAI,CAAC,+BAA+B,EAAE;YACjD,MAAM,EAAE,QAAQ;YAChB,KAAK,EAAE,MAAM,CAAC,MAAM;YACpB,OAAO,EAAE,CAAC,gBAAgB;YAC1B,UAAU,EAAE,IAAI,CAAC,GAAG,EAAE,GAAG,aAAa;SACtC,CAAC,CAAA;QACF,OAAO,MAAM,CAAA;IACd,CAAC;CACD;AAED,+EAA+E;AAC/E,uBAAuB;AACvB,+EAA+E;AAE/E;
;;;GAIG;AACH,MAAM,CAAC,MAAM,YAAY,GAAG,wCAAwC,CAAA;AAEpE,MAAM,cAAc,GAA2B;IAC9C,GAAG,EAAE,WAAW;IAChB,GAAG,EAAE,YAAY;IACjB,IAAI,EAAE,YAAY;IAClB,GAAG,EAAE,WAAW;IAChB,IAAI,EAAE,YAAY;IAClB,GAAG,EAAE,eAAe;IACpB,GAAG,EAAE,YAAY;IACjB,IAAI,EAAE,YAAY;IAClB,GAAG,EAAE,WAAW;CAChB,CAAA;AAED,MAAM,UAAU,cAAc,CAAC,QAAgB;IAC9C,MAAM,GAAG,GAAG,QAAQ,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,GAAG,EAAE,EAAE,WAAW,EAAE,CAAA;IACpD,OAAO,cAAc,CAAC,GAAG,IAAI,EAAE,CAAC,IAAI,WAAW,CAAA;AAChD,CAAC;AAED;;;;;;;;;;GAUG;AACH,MAAM,UAAU,mBAAmB,CAAC,IAA0D;IAC7F,MAAM,MAAM,GAAG,IAAI,CAAC,KAAK,GAAG,IAAI,CAAC,MAAM,CAAA;IACvC,IAAI,MAAM,GAAG,gBAAgB;QAAE,OAAO,KAAK,CAAA;IAC3C,MAAM,OAAO,GAAG,IAAI,CAAC,SAAS,GAAG,MAAM,CAAA;IACvC,OAAO,OAAO,IAAI,8BAA8B,CAAA;AACjD,CAAC;AAED;;;;GAIG;AACH,MAAM,CAAC,KAAK,UAAU,kBAAkB,CACvC,QAAgB,EAChB,aAA4B;IAE5B,IAAI,CAAC;QACJ,MAAM,EAAE,MAAM,EAAE,GAAG,MAAM,aAAa,CAAC,QAAQ,CAC9C,YAAY,EACZ,CAAC,IAAI,EAAE,OAAO,EAAE,IAAI,EAAE,QAAQ,EAAE,QAAQ,CAAC,EACzC,EAAE,OAAO,EAAE,MAAM,EAAE,CACnB,CAAA;QACD,MAAM,KAAK,GAAG,MAAM,CAAC,IAAI,EAAE,CAAC,KAAK,CAAC,IAAI,CAAC,CAAA;QACvC,IAAI,KAAK,CAAC,MAAM,GAAG,CAAC;YAAE,OAAO,IAAI,CAAA;QACjC,MAAM,KAAK,GAAG,QAAQ,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAA;QACpC,MAAM,MAAM,GAAG,QAAQ,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAA;QACrC,IAAI,CAAC,MAAM,CAAC,QAAQ,CAAC,KAAK,CAAC,IAAI,CAAC,MAAM,CAAC,QAAQ,CAAC,MAAM,CAAC;YAAE,OAAO,IAAI,CAAA;QACpE,OAAO,EAAE,KAAK,EAAE,MAAM,EAAE,CAAA;IACzB,CAAC;IAAC,MAAM,CAAC;QACR,OAAO,IAAI,CAAA;IACZ,CAAC;AACF,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,uBAAuB,CAC5C,UAAqB,EACrB,cAAsB,EACtB,GAAwB,EACxB,QAA8B,EAC9B,MAAc,EACd,EAAc,EACd,aAA4B;IAE5B,MAAM,UAAU,GAAG,MAAM,UAAU,CAAC,IAAI,CAAC,EAAE,EAAE,EAAE,QAAQ,EAAE,CAAC,EAAE,CAAC,CAAA;IAC7D,IAAI,CAAC,UAAU,CAAC,EAAE;QAAE,OAAO,EAAE,CAAA;IAE7B,MAAM,UAAU,GAAG,UAAU,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,KAAK,MAAM,IAAI,YAAY,CAAC,IAAI,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAA;IAE/F,oEAAoE;IACpE,MAAM,SAAS,GAAG,MAAM,kBAAkB,CAAC,UAAU,EAAE,CAAC,EAAE,KAAK,EAAE,KAAK,EAAE,EAAE;QACzE,MAAM,UAAU,GAAG,UAAU,CAAC,QAAQ,CAAC,KAAK,CAAC,IAAI,CA
AC,CAAA;QAClD,IAAI,CAAC,UAAU,CAAC,EAAE;YAAE,OAAO,IAAI,CAAA;QAE/B,IAAI,SAAS,GAAG,CAAC,CAAA;QACjB,IAAI,CAAC;YACJ,SAAS,GAAG,CAAC,MAAM,EAAE,CAAC,IAAI,CAAC,UAAU,CAAC,KAAK,CAAC,CAAC,CAAC,IAAI,CAAA;QACnD,CAAC;QAAC,MAAM,CAAC;YACR,OAAO,IAAI,CAAA;QACZ,CAAC;QAED,MAAM,IAAI,GAAG,MAAM,kBAAkB,CAAC,UAAU,CAAC,KAAK,EAAE,aAAa,CAAC,CAAA;QACtE,IAAI,CAAC,IAAI,EAAE,CAAC;YACX,0EAA0E;YAC1E,OAAO,EAAE,IAAI,EAAE,KAAK,CAAC,IAAI,EAAE,SAAS,EAAE,KAAK,EAAE,CAAC,EAAE,MAAM,EAAE,CAAC,EAAE,IAAI,EAAE,IAAI,EAAE,CAAA;QACxE,CAAC;QAED,MAAM,IAAI,GAAG,mBAAmB,CAAC,EAAE,KAAK,EAAE,IAAI,CAAC,KAAK,EAAE,MAAM,EAAE,IAAI,CAAC,MAAM,EAAE,SAAS,EAAE,CAAC,CAAA;QACvF,OAAO,EAAE,IAAI,EAAE,KAAK,CAAC,IAAI,EAAE,SAAS,EAAE,KAAK,EAAE,IAAI,CAAC,KAAK,EAAE,MAAM,EAAE,IAAI,CAAC,MAAM,EAAE,IAAI,EAAE,CAAA;IACrF,CAAC,CAAC,CAAA;IAEF,MAAM,QAAQ,GAAG,SAAS;SACxB,MAAM,CAAC,CAAC,CAAC,EAA8B,EAAE,CAAC,CAAC,KAAK,IAAI,IAAI,CAAC,CAAC,IAAI,CAAC;SAC/D,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,SAAS,GAAG,CAAC,CAAC,SAAS,CAAC;SACzC,KAAK,CAAC,CAAC,EAAE,UAAU,CAAC,CAAA;IAEtB,MAAM,YAAY,GAAG,SAAS,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,KAAK,IAAI,IAAI,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,MAAM,CAAA;IACxE,IAAI,YAAY,GAAG,CAAC,IAAI,SAAS,CAAC,MAAM,GAAG,UAAU,EAAE,CAAC;QACvD,MAAM,CAAC,IAAI,CAAC,sBAAsB,EAAE;YACnC,MAAM,EAAE,cAAc;YACtB,UAAU,EAAE,UAAU,CAAC,MAAM;YAC7B,mBAAmB,EAAE,UAAU,CAAC,MAAM,GAAG,YAAY;YACrD,QAAQ,EAAE,QAAQ,CAAC,MAAM;YACzB,gBAAgB,EAAE,YAAY;SAC9B,CAAC,CAAA;IACH,CAAC;IAED,MAAM,OAAO,GAAG,MAAM,kBAAkB,CAAC,QAAQ,EAAE,0BAA0B,EAAE,KAAK,EAAE,OAAO,EAAE,EAAE;QAChG,MAAM,aAAa,GAAG,UAAU,CAAC,QAAQ,CAAC,OAAO,CAAC,IAAI,CAAC,CAAA;QACvD,IAAI,CAAC,aAAa,CAAC,EAAE;YAAE,OAAO,IAAI,CAAA;QAElC,MAAM,OAAO,GAAG,cAAc,CAAC,OAAO,CAAC,IAAI,CAAC,CAAA;QAC5C,IAAI,WAAW,GAAG,OAAO,CAAA;QAEzB,MAAM,UAAU,GAAG,QAAQ,CAAC,cAAc,CAAC,OAAO,CAAC,CAAA;QACnD,IAAI,UAAU,EAAE,CAAC;YAChB,MAAM,cAAc,GAAG,MAAM,UAAU,CAAC,OAAO,CAAC,aAAa,CAAC,KAAK,EAAE,OAAO,EAAE;gBAC7E,KAAK,EAAE,GAAG,CAAC,KAAK,CAAC,MAAM,CAAC,GAAG,cAAc,IAAI,OAAO,CAAC,IAAI,OAAO,CAAC;aACjE,CAAC,CAAA;YACF,IAAI,cAAc,CAAC,EAAE,IAAI,cAAc,CAAC,KAAK,CAAC
,gBAAgB,EAAE,CAAC;gBAChE,WAAW,GAAG,cAAc,CAAC,KAAK,CAAC,gBAAgB,CAAA;YACpD,CAAC;QACF,CAAC;QAED,OAAO,EAAE,YAAY,EAAE,GAAG,cAAc,IAAI,OAAO,CAAC,IAAI,EAAE,EAAE,WAAW,EAAE,CAAA;IAC1E,CAAC,CAAC,CAAA;IAEF,OAAO,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,EAAsD,EAAE,CAAC,CAAC,KAAK,IAAI,CAAC,CAAA;AAC7F,CAAC"}
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* PDF Preprocessor
|
|
3
|
+
*
|
|
4
|
+
* Dedicated PDF pipeline:
|
|
5
|
+
*
|
|
6
|
+
* 1. Text extraction via `pdftotext` (poppler-utils, C++) — ~1 s for a
|
|
7
|
+
* 3 MB PDF. Replaces markitdown/pdfminer.six (~22 s for the same file)
|
|
8
|
+
* because PDFs in practice don't carry the rich markdown structure
|
|
9
|
+
* that justifies the slower backend.
|
|
10
|
+
*
|
|
11
|
+
* 2. Image extraction via `pdfimages -all` — keeps the original embedded
|
|
12
|
+
* format (JPEG stays JPEG) instead of re-encoding everything to PNG
|
|
13
|
+
* (~10× faster, much smaller files).
|
|
14
|
+
*
|
|
15
|
+
* 3. Text and image extraction run in parallel.
|
|
16
|
+
*
|
|
17
|
+
* 4. Images stream into the classifier as soon as `pdfimages` writes them
|
|
18
|
+
* to disk — the classifier doesn't wait for the whole extraction to
|
|
19
|
+
* finish. A density filter (bytes/pixel) drops alpha masks and overlay
|
|
20
|
+
* layers before the vision call.
|
|
21
|
+
*/
|
|
22
|
+
import type { Result } from '../../../lib/utils/result.js';
|
|
23
|
+
import type { FileSystem } from '../../../platform/fs.js';
|
|
24
|
+
import type { ProcessRunner } from '../../../platform/process.js';
|
|
25
|
+
import type { Logger } from '../../../lib/logger/logger.js';
|
|
26
|
+
import type { Preprocessor, PreprocessorContext, PreprocessorRegistry, PreprocessorResult } from '../preprocessor.js';
|
|
27
|
+
/**
 * Constructor argument of PdfPreprocessor: the collaborators handed to the
 * preprocessor when it is created.
 *
 * NOTE(review): the class declares private members named registry, logger, fs
 * and processRunner; presumably each field below is stored in the matching
 * member (with `process` becoming `processRunner`) — confirm in the
 * implementation file.
 */
export interface PdfPreprocessorConfig {
|
|
28
|
+
registry: PreprocessorRegistry;
|
|
29
|
+
logger: Logger;
|
|
30
|
+
fs: FileSystem;
|
|
31
|
+
process: ProcessRunner;
|
|
32
|
+
}
|
|
33
|
+
/**
 * Preprocessor implementation whose `name` is "pdf".
 *
 * Declaration only: the public surface is the constructor and `process`;
 * text extraction and image extraction/classification are private members
 * documented on their own declarations.
 */
export declare class PdfPreprocessor implements Preprocessor {
|
|
34
|
+
readonly name = "pdf";
|
|
35
|
+
readonly supportedMimeTypes: string[];
|
|
36
|
+
private readonly registry;
|
|
37
|
+
private readonly logger;
|
|
38
|
+
private readonly fs;
|
|
39
|
+
private readonly processRunner;
|
|
40
|
+
constructor(config: PdfPreprocessorConfig);
|
|
41
|
+
/**
 * Public entry point of the preprocessor.
 *
 * @param filePath path of the file to preprocess
 * @param mimeType MIME type reported for that file
 * @param ctx preprocessor context supplied by the caller
 * @returns a promise of a Result carrying either a PreprocessorResult or an Error
 *
 * NOTE(review): whether failures are always reported through the Result
 * (rather than a rejected promise) cannot be told from this declaration —
 * confirm in the implementation.
 */
process(filePath: string, mimeType: string, ctx: PreprocessorContext): Promise<Result<PreprocessorResult, Error>>;
|
|
42
|
+
/**
|
|
43
|
+
* Extract plain text via pdftotext. Writes to content.md verbatim — no
|
|
44
|
+
* markdown structure to preserve, but the file extension stays .md for
|
|
45
|
+
* consistency with the markitdown pipeline (downstream consumers expect
|
|
46
|
+
* "content.md" in the upload directory).
|
|
47
|
+
*
|
|
48
|
+
* `-layout` preserves the original visual layout (columns, tables),
|
|
49
|
+
* which is what users typically expect when looking at PDFs.
|
|
50
|
+
*/
|
|
51
|
+
private extractText;
|
|
52
|
+
/**
|
|
53
|
+
* Extract images via pdfimages and classify them as they appear on disk.
|
|
54
|
+
*
|
|
55
|
+
* pdfimages writes files atomically per image (open temp, write, rename
|
|
56
|
+
* to final name), so polling `readdir` is safe — we either see a name or
|
|
57
|
+
* we don't, never a half-written file.
* NOTE(review): the temp-file-then-rename behaviour is asserted here, not
* established by anything in this package. Poppler's pdfimages appears to
* open the final output path directly, in which case a poll could observe a
* partially written file — confirm against the pdfimages version in use.
|
|
58
|
+
*
|
|
59
|
+
* Streaming overlaps the extraction tail with the first classification
|
|
60
|
+
* batches. Hard cap of MAX_IMAGES applies across the *filtered* set: as
|
|
61
|
+
* soon as MAX_IMAGES images have passed the density filter, further
|
|
62
|
+
* candidates are stat-checked but not classified.
|
|
63
|
+
*
|
|
64
|
+
* `-all` keeps the embedded format (JPEG, JBIG2, JP2). We only classify
|
|
65
|
+
* those Anthropic vision accepts (PNG/JPEG/GIF/WebP); other formats are
|
|
66
|
+
* extracted to disk for reference but skipped at the classification step.
|
|
67
|
+
*/
|
|
68
|
+
private extractAndClassifyImages;
|
|
69
|
+
/**
 * NOTE(review): undocumented in the source. Presumably a single directory-scan
 * pass of the readdir polling used by extractAndClassifyImages, handing newly
 * seen images to the classifier — confirm in the implementation.
 */
private scanAndDispatch;
|
|
70
|
+
}
|
|
71
|
+
//# sourceMappingURL=pdf-preprocessor.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"pdf-preprocessor.d.ts","sourceRoot":"","sources":["../../../../src/plugins/uploads/preprocessors/pdf-preprocessor.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;GAoBG;AAGH,OAAO,KAAK,EAAE,MAAM,EAAE,MAAM,uBAAuB,CAAA;AAEnD,OAAO,KAAK,EAAE,UAAU,EAAE,MAAM,kBAAkB,CAAA;AAClD,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,uBAAuB,CAAA;AAC1D,OAAO,KAAK,EAAE,MAAM,EAAE,MAAM,+BAA+B,CAAA;AAC3D,OAAO,KAAK,EAAE,YAAY,EAAE,mBAAmB,EAAE,oBAAoB,EAAE,kBAAkB,EAAE,MAAM,oBAAoB,CAAA;AAmBrH,MAAM,WAAW,qBAAqB;IACrC,QAAQ,EAAE,oBAAoB,CAAA;IAC9B,MAAM,EAAE,MAAM,CAAA;IACd,EAAE,EAAE,UAAU,CAAA;IACd,OAAO,EAAE,aAAa,CAAA;CACtB;AAED,qBAAa,eAAgB,YAAW,YAAY;IACnD,QAAQ,CAAC,IAAI,SAAQ;IACrB,QAAQ,CAAC,kBAAkB,WAAuB;IAElD,OAAO,CAAC,QAAQ,CAAC,QAAQ,CAAsB;IAC/C,OAAO,CAAC,QAAQ,CAAC,MAAM,CAAQ;IAC/B,OAAO,CAAC,QAAQ,CAAC,EAAE,CAAY;IAC/B,OAAO,CAAC,QAAQ,CAAC,aAAa,CAAe;gBAEjC,MAAM,EAAE,qBAAqB;IAOnC,OAAO,CACZ,QAAQ,EAAE,MAAM,EAChB,QAAQ,EAAE,MAAM,EAChB,GAAG,EAAE,mBAAmB,GACtB,OAAO,CAAC,MAAM,CAAC,kBAAkB,EAAE,KAAK,CAAC,CAAC;IA8C7C;;;;;;;;OAQG;YACW,WAAW;IA8BzB;;;;;;;;;;;;;;;OAeG;YACW,wBAAwB;YAoJxB,eAAe;CAc7B"}
|