npm - @roj-ai/sdk - Versions diffs - 0.1.15 → 0.1.17 - Mend

@roj-ai/sdk 0.1.15 → 0.1.17

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (36) hide show

package/src/plugins/uploads/preprocessors/markitdown-preprocessor.ts CHANGED Viewed

@@ -2,11 +2,14 @@
  * Markitdown Preprocessor
  *
  * Converts documents to markdown using Microsoft's markitdown CLI.
- * Supports PDF, DOCX, XLSX, PPTX, HTML, CSV, JSON, XML, EPUB, and more.
+ * Supports DOCX, XLSX, PPTX, HTML, CSV, JSON, XML, EPUB, RTF, ODT.
+ *
+ * PDFs are handled by `PdfPreprocessor` instead — markitdown's PDF backend
+ * (pdfminer.six) is ~20× slower than pdftotext for no real gain on the
+ * mostly-unstructured PDFs we see in practice.
  *
  * Image extraction:
- * - PDF: uses pdfimages (poppler-utils)
- * - DOCX/ODT/EPUB: uses pandoc --extract-media
+ * - DOCX/ODT/EPUB: uses pandoc --extract-media (runs in parallel with markitdown)
  *
  * Extracted images are classified via the image classifier preprocessor.
  * Full content is written to disk; extractedContent contains a structured manifest.
@@ -22,16 +25,36 @@ import type { FileStore } from '../../../core/file-store/types.js'
 import type { Logger } from '../../../lib/logger/logger.js'
 import type { Preprocessor, PreprocessorContext, PreprocessorRegistry, PreprocessorResult } from '../preprocessor.js'
-const MAX_IMAGES = 50
+const MAX_IMAGES = 20
 const IMAGE_CLASSIFY_CONCURRENCY = 10
+/**
+ * Density filter for extracted images. Bytes-per-pixel ratio below this
+ * threshold typically means the image is an alpha mask, overlay layer, or
+ * essentially-empty region — not worth a vision call.
+ *
+ * Empirical reference points:
+ * - Dense photo JPEG: 0.3–1.0 B/px
+ * - Logo / icon PNG: 0.1–0.5 B/px
+ * - Brand PDF layer mask: <0.005 B/px
+ */
+export const MIN_IMAGE_DENSITY_BYTES_PER_PX = 0.05
+export const MIN_IMAGE_PIXELS = 50 * 50
+// markitdown converts a text-only document; even large files finish in seconds.
+const MARKITDOWN_TIMEOUT_MS = 60_000
+// Image extractors (pandoc --extract-media) scale with image count
+// and resolution. Real-world large docs can take 60–90s. Upload preprocessing
+// is async/background, so allow generous headroom.
+const IMAGE_EXTRACT_TIMEOUT_MS = 5 * 60_000
 function makeExec(processRunner: ProcessRunner) {
-	return (cmd: string, args: string[]) => processRunner.execFile(cmd, args, { timeout: 60_000, maxBuffer: 50 * 1024 * 1024 })
+	return (cmd: string, args: string[], timeoutMs: number = MARKITDOWN_TIMEOUT_MS) =>
+		processRunner.execFile(cmd, args, { timeout: timeoutMs, maxBuffer: 50 * 1024 * 1024 })
 }
-/** MIME types where markitdown converts to markdown (non-ZIP, non-image) */
+/** MIME types where markitdown converts to markdown (non-ZIP, non-image, non-PDF) */
 const SUPPORTED_MIME_TYPES = [
-	'application/pdf',
 	'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
 	'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
 	'application/vnd.openxmlformats-officedocument.presentationml.presentation',
@@ -59,11 +82,6 @@ const PANDOC_FORMAT_MAP: Record<string, string> = {
 	'application/epub+zip': 'epub',
 }
-/** MIME types where pdfimages can extract images */
-const PDFIMAGES_MIMES = new Set([
-	'application/pdf',
-])
 export interface MarkitdownPreprocessorConfig {
 	registry: PreprocessorRegistry
 	logger: Logger
@@ -78,12 +96,14 @@ export class MarkitdownPreprocessor implements Preprocessor {
 	private readonly registry: PreprocessorRegistry
 	private readonly logger: Logger
 	private readonly fs: FileSystem
-	private readonly exec: (cmd: string, args: string[]) => Promise<{ stdout: string; stderr: string }>
+	private readonly processRunner: ProcessRunner
+	private readonly exec: (cmd: string, args: string[], timeoutMs?: number) => Promise<{ stdout: string; stderr: string }>
 	constructor(config: MarkitdownPreprocessorConfig) {
 		this.registry = config.registry
 		this.logger = config.logger
 		this.fs = config.fs
+		this.processRunner = config.process
 		this.exec = makeExec(config.process)
 	}
@@ -92,56 +112,48 @@ export class MarkitdownPreprocessor implements Preprocessor {
 		mimeType: string,
 		ctx: PreprocessorContext,
 	): Promise<Result<PreprocessorResult, Error>> {
-		const derivedPaths: string[] = []
-		const imageEntries: string[] = []
+		const totalStart = Date.now()
+		this.logger.info('Markitdown processing started', { filePath, mimeType })
-		// 1. Convert to markdown via markitdown
 		const contentPathResult = ctx.files.realPath('content.md')
 		if (!contentPathResult.ok) {
 			return Err(new Error('Failed to resolve output path'))
 		}
+		await this.fs.mkdir(dirname(contentPathResult.value), { recursive: true })
-		try {
-			await this.fs.mkdir(dirname(contentPathResult.value), { recursive: true })
-			await this.exec('markitdown', [filePath, '-o', contentPathResult.value])
-		} catch (error) {
-			const message = error instanceof Error ? error.message : String(error)
-			if (message.includes('ENOENT')) {
-				return Err(new Error('markitdown not found. Install with: pip install "markitdown[all]"'))
-			}
-			return Err(new Error(`markitdown failed: ${message}`))
-		}
+		// Run markitdown text conversion and image extraction concurrently —
+		// they're independent, so there's no reason to serialize them. For
+		// documents where pandoc extraction isn't applicable, the image task
+		// resolves immediately.
+		const markdownTask = this.runMarkitdown(filePath, mimeType, contentPathResult.value)
+		const imageTask = PANDOC_EXTRACT_MIMES.has(mimeType)
+			? this.extractImagesWithPandoc(filePath, mimeType, ctx)
+			: Promise.resolve<Array<{ relativePath: string; description: string }>>([])
-		const contentResult = await ctx.files.read('content.md')
-		const markdown = contentResult.ok ? contentResult.value : ''
+		const [markdownResult, images] = await Promise.all([markdownTask, imageTask])
-		derivedPaths.push('content.md')
+		if (!markdownResult.ok) return markdownResult
-		// 2. Extract images based on file type
-		if (PANDOC_EXTRACT_MIMES.has(mimeType)) {
-			const images = await this.extractImagesWithPandoc(filePath, mimeType, ctx)
-			for (const img of images) {
-				derivedPaths.push(img.relativePath)
-				imageEntries.push(`- ${img.relativePath} — ${img.description}`)
-			}
-		} else if (PDFIMAGES_MIMES.has(mimeType)) {
-			const images = await this.extractImagesWithPdfimages(filePath, ctx)
-			for (const img of images) {
-				derivedPaths.push(img.relativePath)
-				imageEntries.push(`- ${img.relativePath} — ${img.description}`)
-			}
+		const markdown = markdownResult.value
+		const derivedPaths: string[] = ['content.md']
+		const imageEntries: string[] = []
+		for (const img of images) {
+			derivedPaths.push(img.relativePath)
+			imageEntries.push(`- ${img.relativePath} — ${img.description}`)
 		}
-		// 3. Build manifest
 		const manifestLines: string[] = ['Extracted files:']
 		manifestLines.push(`- content.md (markdown, ${markdown.length} chars)`)
 		manifestLines.push(...imageEntries)
-		this.logger.debug('Markitdown processed', {
+		this.logger.info('Markitdown processing complete', {
 			filePath,
 			mimeType,
 			contentLength: markdown.length,
 			imagesExtracted: imageEntries.length,
+			totalDurationMs: Date.now() - totalStart,
 		})
 		return Ok({
@@ -150,6 +162,44 @@ export class MarkitdownPreprocessor implements Preprocessor {
 		})
 	}
+	private async runMarkitdown(
+		filePath: string,
+		mimeType: string,
+		outputPath: string,
+	): Promise<Result<string, Error>> {
+		const markitdownStart = Date.now()
+		try {
+			await this.exec('markitdown', [filePath, '-o', outputPath])
+		} catch (error) {
+			const message = error instanceof Error ? error.message : String(error)
+			this.logger.error(
+				'markitdown CLI failed',
+				error instanceof Error ? error : undefined,
+				{ filePath, mimeType, durationMs: Date.now() - markitdownStart },
+			)
+			if (message.includes('ENOENT')) {
+				return Err(new Error('markitdown not found. Install with: pip install "markitdown[all]"'))
+			}
+			return Err(new Error(`markitdown failed: ${message}`))
+		}
+		let markdown = ''
+		try {
+			markdown = await this.fs.readFile(outputPath, 'utf-8')
+		} catch {
+			// Output missing — markitdown completed but produced nothing.
+		}
+		this.logger.info('Markitdown conversion complete', {
+			filePath,
+			mimeType,
+			durationMs: Date.now() - markitdownStart,
+			contentLength: markdown.length,
+		})
+		return Ok(markdown)
+	}
 	private async extractImagesWithPandoc(
 		filePath: string,
 		mimeType: string,
@@ -162,41 +212,39 @@ export class MarkitdownPreprocessor implements Preprocessor {
 		const format = PANDOC_FORMAT_MAP[mimeType]
 		if (!format) return []
+		const pandocStart = Date.now()
+		let extractSucceeded = true
 		try {
-			await this.exec('pandoc', [
-				'-f',
-				format,
-				'-t',
-				'gfm',
+			await this.exec(
+				'pandoc',
+				['-f', format, '-t', 'gfm', filePath, '-o', '/dev/null', `--extract-media=${mediaDirResult.value}`],
+				IMAGE_EXTRACT_TIMEOUT_MS,
+			)
+		} catch (error) {
+			extractSucceeded = false
+			this.logger.warn('pandoc --extract-media failed (will classify any partial output)', {
 				filePath,
-				'-o',
-				'/dev/null',
-				`--extract-media=${mediaDirResult.value}`,
-			])
-		} catch {
-			this.logger.warn('pandoc --extract-media failed', { filePath })
-			return []
+				durationMs: Date.now() - pandocStart,
+				error: error instanceof Error ? error.message : String(error),
+			})
 		}
-		return classifyExtractedImages(mediaStore, 'media', ctx, this.registry, this.logger)
-	}
-	private async extractImagesWithPdfimages(
-		filePath: string,
-		ctx: PreprocessorContext,
-	): Promise<Array<{ relativePath: string; description: string }>> {
-		const imageStore = ctx.files.scoped('images')
-		const imagesDirResult = imageStore.realPath('')
-		if (!imagesDirResult.ok) return []
-		try {
-			await this.fs.mkdir(imagesDirResult.value, { recursive: true })
-			await this.exec('pdfimages', ['-png', filePath, `${imagesDirResult.value}/img`])
-		} catch {
-			return []
+		if (extractSucceeded) {
+			this.logger.info('pandoc --extract-media complete', {
+				filePath,
+				format,
+				durationMs: Date.now() - pandocStart,
+			})
 		}
-		return classifyExtractedImages(imageStore, 'images', ctx, this.registry, this.logger)
+		const classifyStart = Date.now()
+		const images = await classifyExtractedImages(mediaStore, 'media', ctx, this.registry, this.logger, this.fs, this.processRunner)
+		this.logger.info('Image classification complete', {
+			source: 'pandoc',
+			count: images.length,
+			partial: !extractSucceeded,
+			durationMs: Date.now() - classifyStart,
+		})
+		return images
 	}
 }
@@ -204,7 +252,12 @@ export class MarkitdownPreprocessor implements Preprocessor {
 // Shared image helpers
 // ============================================================================
-const IMAGE_EXT_RE = /\.(png|jpe?g|gif|webp|tiff?|bmp|svg)$/i
+/**
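+ * Extensions treated as classifiable images. Other pdfimages outputs (pbm,
+ * ppm, jb2e, jp2) are ignored — they'd require local conversion before being
+ * useful for classification.
+ *
+ * NOTE(review): the Anthropic vision API documents only JPEG, PNG, GIF and
+ * WebP as supported — confirm tiff/bmp/svg are converted before upload.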
+ */
+export const IMAGE_EXT_RE = /\.(png|jpe?g|gif|webp|tiff?|bmp|svg)$/i
 const IMAGE_MIME_MAP: Record<string, string> = {
 	png: 'image/png',
@@ -223,22 +276,103 @@ export function guessImageMime(filename: string): string {
 	return IMAGE_MIME_MAP[ext ?? ''] ?? 'image/png'
 }
+/**
+ * Reject images that are unlikely to carry useful visual information.
+ *
+ * `bytesPerPixel` filters out alpha masks, sparse overlays, and essentially-
+ * empty pages — brand PDFs typically emit a real photo (~1 B/px) plus a
+ * matching transparency/overlay layer at the same dimensions but a fraction
+ * of a percent of the size (<0.005 B/px).
+ *
+ * The minimum pixel count protects against tiny icons whose density alone
+ * doesn't disqualify them.
+ */
+export function shouldClassifyImage(meta: { width: number; height: number; sizeBytes: number }): boolean {
+	const pixels = meta.width * meta.height
+	if (pixels < MIN_IMAGE_PIXELS) return false
+	const density = meta.sizeBytes / pixels
+	return density >= MIN_IMAGE_DENSITY_BYTES_PER_PX
+}
+/**
+ * Read image dimensions via vipsheader. Returns null when the tool isn't
+ * available or output is unparseable — caller should treat that as
+ * "include without filtering".
+ */
+export async function getImageDimensions(
+	filePath: string,
+	processRunner: ProcessRunner,
+): Promise<{ width: number; height: number } | null> {
+	try {
+		const { stdout } = await processRunner.execFile(
+			'vipsheader',
+			['-f', 'width', '-f', 'height', filePath],
+			{ timeout: 10_000 },
+		)
+		const lines = stdout.trim().split('\n')
+		if (lines.length < 2) return null
+		const width = parseInt(lines[0], 10)
+		const height = parseInt(lines[1], 10)
+		if (!Number.isFinite(width) || !Number.isFinite(height)) return null
+		return { width, height }
+	} catch {
+		return null
+	}
+}
 export async function classifyExtractedImages(
 	imageStore: FileStore,
 	relativePrefix: string,
 	ctx: PreprocessorContext,
 	registry: PreprocessorRegistry,
 	logger: Logger,
+	fs: FileSystem,
+	processRunner: ProcessRunner,
 ): Promise<Array<{ relativePath: string; description: string }>> {
 	const listResult = await imageStore.list('', { maxDepth: 3 })
 	if (!listResult.ok) return []
-	const imageFiles = listResult.value
-		.filter(e => e.type === 'file' && IMAGE_EXT_RE.test(e.name))
-		.sort((a, b) => a.name.localeCompare(b.name))
+	const candidates = listResult.value.filter(e => e.type === 'file' && IMAGE_EXT_RE.test(e.name))
+	// Stat + density filter, then keep the top MAX_IMAGES by file size.
+	const inspected = await mapWithConcurrency(candidates, 8, async (entry) => {
+		const pathResult = imageStore.realPath(entry.name)
+		if (!pathResult.ok) return null
+		let sizeBytes = 0
+		try {
+			sizeBytes = (await fs.stat(pathResult.value)).size
+		} catch {
+			return null
+		}
+		const dims = await getImageDimensions(pathResult.value, processRunner)
+		if (!dims) {
+			// Unknown dims — include unfiltered; better to classify than silently drop.
+			return { name: entry.name, sizeBytes, width: 0, height: 0, kept: true }
+		}
+		const kept = shouldClassifyImage({ width: dims.width, height: dims.height, sizeBytes })
+		return { name: entry.name, sizeBytes, width: dims.width, height: dims.height, kept }
+	})
+	const filtered = inspected
+		.filter((r): r is NonNullable<typeof r> => r !== null && r.kept)
+		.sort((a, b) => b.sizeBytes - a.sizeBytes)
 		.slice(0, MAX_IMAGES)
-	const settled = await mapWithConcurrency(imageFiles, IMAGE_CLASSIFY_CONCURRENCY, async (imgFile) => {
+	const droppedCount = inspected.filter(r => r !== null && !r.kept).length
+	if (droppedCount > 0 || inspected.length > MAX_IMAGES) {
+		logger.info('Image filter applied', {
+			source: relativePrefix,
+			candidates: candidates.length,
+			passedDensityFilter: candidates.length - droppedCount,
+			selected: filtered.length,
+			droppedByDensity: droppedCount,
+		})
+	}
+	const settled = await mapWithConcurrency(filtered, IMAGE_CLASSIFY_CONCURRENCY, async (imgFile) => {
 		const imgPathResult = imageStore.realPath(imgFile.name)
 		if (!imgPathResult.ok) return null