@roj-ai/sdk 0.1.15 → 0.1.17

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. package/dist/bootstrap.d.ts.map +1 -1
  2. package/dist/bootstrap.js +12 -2
  3. package/dist/bootstrap.js.map +1 -1
  4. package/dist/core/image/types.d.ts +2 -0
  5. package/dist/core/image/types.d.ts.map +1 -1
  6. package/dist/core/image/vips-resizer.d.ts.map +1 -1
  7. package/dist/core/image/vips-resizer.js +12 -11
  8. package/dist/core/image/vips-resizer.js.map +1 -1
  9. package/dist/core/sessions/session.d.ts.map +1 -1
  10. package/dist/core/sessions/session.js +0 -7
  11. package/dist/core/sessions/session.js.map +1 -1
  12. package/dist/plugins/uploads/preprocessors/image-classifier.d.ts +20 -0
  13. package/dist/plugins/uploads/preprocessors/image-classifier.d.ts.map +1 -1
  14. package/dist/plugins/uploads/preprocessors/image-classifier.js +93 -28
  15. package/dist/plugins/uploads/preprocessors/image-classifier.js.map +1 -1
  16. package/dist/plugins/uploads/preprocessors/index.d.ts +1 -0
  17. package/dist/plugins/uploads/preprocessors/index.d.ts.map +1 -1
  18. package/dist/plugins/uploads/preprocessors/index.js +1 -0
  19. package/dist/plugins/uploads/preprocessors/index.js.map +1 -1
  20. package/dist/plugins/uploads/preprocessors/markitdown-preprocessor.d.ts +52 -5
  21. package/dist/plugins/uploads/preprocessors/markitdown-preprocessor.d.ts.map +1 -1
  22. package/dist/plugins/uploads/preprocessors/markitdown-preprocessor.js +183 -75
  23. package/dist/plugins/uploads/preprocessors/markitdown-preprocessor.js.map +1 -1
  24. package/dist/plugins/uploads/preprocessors/pdf-preprocessor.d.ts +71 -0
  25. package/dist/plugins/uploads/preprocessors/pdf-preprocessor.d.ts.map +1 -0
  26. package/dist/plugins/uploads/preprocessors/pdf-preprocessor.js +274 -0
  27. package/dist/plugins/uploads/preprocessors/pdf-preprocessor.js.map +1 -0
  28. package/package.json +2 -2
  29. package/src/bootstrap.ts +12 -2
  30. package/src/core/image/types.ts +2 -0
  31. package/src/core/image/vips-resizer.ts +12 -11
  32. package/src/core/sessions/session.ts +0 -8
  33. package/src/plugins/uploads/preprocessors/image-classifier.ts +108 -29
  34. package/src/plugins/uploads/preprocessors/index.ts +1 -0
  35. package/src/plugins/uploads/preprocessors/markitdown-preprocessor.ts +213 -79
  36. package/src/plugins/uploads/preprocessors/pdf-preprocessor.ts +342 -0
@@ -2,11 +2,14 @@
2
2
  * Markitdown Preprocessor
3
3
  *
4
4
  * Converts documents to markdown using Microsoft's markitdown CLI.
5
- * Supports PDF, DOCX, XLSX, PPTX, HTML, CSV, JSON, XML, EPUB, and more.
5
+ * Supports DOCX, XLSX, PPTX, HTML, CSV, JSON, XML, EPUB, RTF, ODT.
6
+ *
7
+ * PDFs are handled by `PdfPreprocessor` instead — markitdown's PDF backend
8
+ * (pdfminer.six) is ~20× slower than pdftotext for no real gain on the
9
+ * mostly-unstructured PDFs we see in practice.
6
10
  *
7
11
  * Image extraction:
8
- * - PDF: uses pdfimages (poppler-utils)
9
- * - DOCX/ODT/EPUB: uses pandoc --extract-media
12
+ * - DOCX/ODT/EPUB: uses pandoc --extract-media (runs in parallel with markitdown)
10
13
  *
11
14
  * Extracted images are classified via the image classifier preprocessor.
12
15
  * Full content is written to disk; extractedContent contains a structured manifest.
@@ -22,16 +25,36 @@ import type { FileStore } from '../../../core/file-store/types.js'
22
25
  import type { Logger } from '../../../lib/logger/logger.js'
23
26
  import type { Preprocessor, PreprocessorContext, PreprocessorRegistry, PreprocessorResult } from '../preprocessor.js'
24
27
 
25
- const MAX_IMAGES = 50
28
+ const MAX_IMAGES = 20
26
29
  const IMAGE_CLASSIFY_CONCURRENCY = 10
27
30
 
31
+ /**
32
+ * Density filter for extracted images. Bytes-per-pixel ratio below this
33
+ * threshold typically means the image is an alpha mask, overlay layer, or
34
+ * essentially-empty region — not worth a vision call.
35
+ *
36
+ * Empirical reference points:
37
+ * - Dense photo JPEG: 0.3–1.0 B/px
38
+ * - Logo / icon PNG: 0.1–0.5 B/px
39
+ * - Brand PDF layer mask: <0.005 B/px
40
+ */
41
+ export const MIN_IMAGE_DENSITY_BYTES_PER_PX = 0.05
42
+ export const MIN_IMAGE_PIXELS = 50 * 50
43
+
44
+ // markitdown converts a text-only document; even large files finish in seconds.
45
+ const MARKITDOWN_TIMEOUT_MS = 60_000
46
+ // Image extractors (pandoc --extract-media) scale with image count
47
+ // and resolution. Real-world large docs can take 60–90s. Upload preprocessing
48
+ // is async/background, so allow generous headroom.
49
+ const IMAGE_EXTRACT_TIMEOUT_MS = 5 * 60_000
50
+
28
51
  function makeExec(processRunner: ProcessRunner) {
29
- return (cmd: string, args: string[]) => processRunner.execFile(cmd, args, { timeout: 60_000, maxBuffer: 50 * 1024 * 1024 })
52
+ return (cmd: string, args: string[], timeoutMs: number = MARKITDOWN_TIMEOUT_MS) =>
53
+ processRunner.execFile(cmd, args, { timeout: timeoutMs, maxBuffer: 50 * 1024 * 1024 })
30
54
  }
31
55
 
32
- /** MIME types where markitdown converts to markdown (non-ZIP, non-image) */
56
+ /** MIME types where markitdown converts to markdown (non-ZIP, non-image, non-PDF) */
33
57
  const SUPPORTED_MIME_TYPES = [
34
- 'application/pdf',
35
58
  'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
36
59
  'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
37
60
  'application/vnd.openxmlformats-officedocument.presentationml.presentation',
@@ -59,11 +82,6 @@ const PANDOC_FORMAT_MAP: Record<string, string> = {
59
82
  'application/epub+zip': 'epub',
60
83
  }
61
84
 
62
- /** MIME types where pdfimages can extract images */
63
- const PDFIMAGES_MIMES = new Set([
64
- 'application/pdf',
65
- ])
66
-
67
85
  export interface MarkitdownPreprocessorConfig {
68
86
  registry: PreprocessorRegistry
69
87
  logger: Logger
@@ -78,12 +96,14 @@ export class MarkitdownPreprocessor implements Preprocessor {
78
96
  private readonly registry: PreprocessorRegistry
79
97
  private readonly logger: Logger
80
98
  private readonly fs: FileSystem
81
- private readonly exec: (cmd: string, args: string[]) => Promise<{ stdout: string; stderr: string }>
99
+ private readonly processRunner: ProcessRunner
100
+ private readonly exec: (cmd: string, args: string[], timeoutMs?: number) => Promise<{ stdout: string; stderr: string }>
82
101
 
83
102
  constructor(config: MarkitdownPreprocessorConfig) {
84
103
  this.registry = config.registry
85
104
  this.logger = config.logger
86
105
  this.fs = config.fs
106
+ this.processRunner = config.process
87
107
  this.exec = makeExec(config.process)
88
108
  }
89
109
 
@@ -92,56 +112,48 @@ export class MarkitdownPreprocessor implements Preprocessor {
92
112
  mimeType: string,
93
113
  ctx: PreprocessorContext,
94
114
  ): Promise<Result<PreprocessorResult, Error>> {
95
- const derivedPaths: string[] = []
96
- const imageEntries: string[] = []
115
+ const totalStart = Date.now()
116
+
117
+ this.logger.info('Markitdown processing started', { filePath, mimeType })
97
118
 
98
- // 1. Convert to markdown via markitdown
99
119
  const contentPathResult = ctx.files.realPath('content.md')
100
120
  if (!contentPathResult.ok) {
101
121
  return Err(new Error('Failed to resolve output path'))
102
122
  }
123
+ await this.fs.mkdir(dirname(contentPathResult.value), { recursive: true })
103
124
 
104
- try {
105
- await this.fs.mkdir(dirname(contentPathResult.value), { recursive: true })
106
- await this.exec('markitdown', [filePath, '-o', contentPathResult.value])
107
- } catch (error) {
108
- const message = error instanceof Error ? error.message : String(error)
109
- if (message.includes('ENOENT')) {
110
- return Err(new Error('markitdown not found. Install with: pip install "markitdown[all]"'))
111
- }
112
- return Err(new Error(`markitdown failed: ${message}`))
113
- }
125
+ // Run markitdown text conversion and image extraction concurrently — they're
126
+ // independent, so there's no reason to serialize them. For documents
127
+ // where pandoc extraction isn't applicable, the image task resolves
128
+ // immediately.
129
+ const markdownTask = this.runMarkitdown(filePath, mimeType, contentPathResult.value)
130
+ const imageTask = PANDOC_EXTRACT_MIMES.has(mimeType)
131
+ ? this.extractImagesWithPandoc(filePath, mimeType, ctx)
132
+ : Promise.resolve<Array<{ relativePath: string; description: string }>>([])
114
133
 
115
- const contentResult = await ctx.files.read('content.md')
116
- const markdown = contentResult.ok ? contentResult.value : ''
134
+ const [markdownResult, images] = await Promise.all([markdownTask, imageTask])
117
135
 
118
- derivedPaths.push('content.md')
136
+ if (!markdownResult.ok) return markdownResult
119
137
 
120
- // 2. Extract images based on file type
121
- if (PANDOC_EXTRACT_MIMES.has(mimeType)) {
122
- const images = await this.extractImagesWithPandoc(filePath, mimeType, ctx)
123
- for (const img of images) {
124
- derivedPaths.push(img.relativePath)
125
- imageEntries.push(`- ${img.relativePath} — ${img.description}`)
126
- }
127
- } else if (PDFIMAGES_MIMES.has(mimeType)) {
128
- const images = await this.extractImagesWithPdfimages(filePath, ctx)
129
- for (const img of images) {
130
- derivedPaths.push(img.relativePath)
131
- imageEntries.push(`- ${img.relativePath} — ${img.description}`)
132
- }
138
+ const markdown = markdownResult.value
139
+
140
+ const derivedPaths: string[] = ['content.md']
141
+ const imageEntries: string[] = []
142
+ for (const img of images) {
143
+ derivedPaths.push(img.relativePath)
144
+ imageEntries.push(`- ${img.relativePath} — ${img.description}`)
133
145
  }
134
146
 
135
- // 3. Build manifest
136
147
  const manifestLines: string[] = ['Extracted files:']
137
148
  manifestLines.push(`- content.md (markdown, ${markdown.length} chars)`)
138
149
  manifestLines.push(...imageEntries)
139
150
 
140
- this.logger.debug('Markitdown processed', {
151
+ this.logger.info('Markitdown processing complete', {
141
152
  filePath,
142
153
  mimeType,
143
154
  contentLength: markdown.length,
144
155
  imagesExtracted: imageEntries.length,
156
+ totalDurationMs: Date.now() - totalStart,
145
157
  })
146
158
 
147
159
  return Ok({
@@ -150,6 +162,44 @@ export class MarkitdownPreprocessor implements Preprocessor {
150
162
  })
151
163
  }
152
164
 
165
+ private async runMarkitdown(
166
+ filePath: string,
167
+ mimeType: string,
168
+ outputPath: string,
169
+ ): Promise<Result<string, Error>> {
170
+ const markitdownStart = Date.now()
171
+ try {
172
+ await this.exec('markitdown', [filePath, '-o', outputPath])
173
+ } catch (error) {
174
+ const message = error instanceof Error ? error.message : String(error)
175
+ this.logger.error(
176
+ 'markitdown CLI failed',
177
+ error instanceof Error ? error : undefined,
178
+ { filePath, mimeType, durationMs: Date.now() - markitdownStart },
179
+ )
180
+ if (message.includes('ENOENT')) {
181
+ return Err(new Error('markitdown not found. Install with: pip install "markitdown[all]"'))
182
+ }
183
+ return Err(new Error(`markitdown failed: ${message}`))
184
+ }
185
+
186
+ let markdown = ''
187
+ try {
188
+ markdown = await this.fs.readFile(outputPath, 'utf-8')
189
+ } catch {
190
+ // Output missing — markitdown completed but produced nothing.
191
+ }
192
+
193
+ this.logger.info('Markitdown conversion complete', {
194
+ filePath,
195
+ mimeType,
196
+ durationMs: Date.now() - markitdownStart,
197
+ contentLength: markdown.length,
198
+ })
199
+
200
+ return Ok(markdown)
201
+ }
202
+
153
203
  private async extractImagesWithPandoc(
154
204
  filePath: string,
155
205
  mimeType: string,
@@ -162,41 +212,39 @@ export class MarkitdownPreprocessor implements Preprocessor {
162
212
  const format = PANDOC_FORMAT_MAP[mimeType]
163
213
  if (!format) return []
164
214
 
215
+ const pandocStart = Date.now()
216
+ let extractSucceeded = true
165
217
  try {
166
- await this.exec('pandoc', [
167
- '-f',
168
- format,
169
- '-t',
170
- 'gfm',
218
+ await this.exec(
219
+ 'pandoc',
220
+ ['-f', format, '-t', 'gfm', filePath, '-o', '/dev/null', `--extract-media=${mediaDirResult.value}`],
221
+ IMAGE_EXTRACT_TIMEOUT_MS,
222
+ )
223
+ } catch (error) {
224
+ extractSucceeded = false
225
+ this.logger.warn('pandoc --extract-media failed (will classify any partial output)', {
171
226
  filePath,
172
- '-o',
173
- '/dev/null',
174
- `--extract-media=${mediaDirResult.value}`,
175
- ])
176
- } catch {
177
- this.logger.warn('pandoc --extract-media failed', { filePath })
178
- return []
227
+ durationMs: Date.now() - pandocStart,
228
+ error: error instanceof Error ? error.message : String(error),
229
+ })
179
230
  }
180
-
181
- return classifyExtractedImages(mediaStore, 'media', ctx, this.registry, this.logger)
182
- }
183
-
184
- private async extractImagesWithPdfimages(
185
- filePath: string,
186
- ctx: PreprocessorContext,
187
- ): Promise<Array<{ relativePath: string; description: string }>> {
188
- const imageStore = ctx.files.scoped('images')
189
- const imagesDirResult = imageStore.realPath('')
190
- if (!imagesDirResult.ok) return []
191
-
192
- try {
193
- await this.fs.mkdir(imagesDirResult.value, { recursive: true })
194
- await this.exec('pdfimages', ['-png', filePath, `${imagesDirResult.value}/img`])
195
- } catch {
196
- return []
231
+ if (extractSucceeded) {
232
+ this.logger.info('pandoc --extract-media complete', {
233
+ filePath,
234
+ format,
235
+ durationMs: Date.now() - pandocStart,
236
+ })
197
237
  }
198
238
 
199
- return classifyExtractedImages(imageStore, 'images', ctx, this.registry, this.logger)
239
+ const classifyStart = Date.now()
240
+ const images = await classifyExtractedImages(mediaStore, 'media', ctx, this.registry, this.logger, this.fs, this.processRunner)
241
+ this.logger.info('Image classification complete', {
242
+ source: 'pandoc',
243
+ count: images.length,
244
+ partial: !extractSucceeded,
245
+ durationMs: Date.now() - classifyStart,
246
+ })
247
+ return images
200
248
  }
201
249
  }
202
250
 
@@ -204,7 +252,12 @@ export class MarkitdownPreprocessor implements Preprocessor {
204
252
  // Shared image helpers
205
253
  // ============================================================================
206
254
 
207
- const IMAGE_EXT_RE = /\.(png|jpe?g|gif|webp|tiff?|bmp|svg)$/i
255
+ /**
256
+ * Extensions treated as classifiable image candidates. Other pdfimages outputs (pbm, ppm,
257
+ * jb2e, jp2) are ignored — they'd require local conversion before being
258
+ * useful for classification.
259
+ */
260
+ export const IMAGE_EXT_RE = /\.(png|jpe?g|gif|webp|tiff?|bmp|svg)$/i
208
261
 
209
262
  const IMAGE_MIME_MAP: Record<string, string> = {
210
263
  png: 'image/png',
@@ -223,22 +276,103 @@ export function guessImageMime(filename: string): string {
223
276
  return IMAGE_MIME_MAP[ext ?? ''] ?? 'image/png'
224
277
  }
225
278
 
279
+ /**
280
+ * Reject images that are unlikely to carry useful visual information.
281
+ *
282
+ * `bytesPerPixel` filters out alpha masks, sparse overlays, and essentially-
283
+ * empty pages — brand PDFs typically emit a real photo (~1 B/px) plus a
284
+ * matching transparency/overlay layer at the same dimensions but a fraction
285
+ * of a percent of the size (<0.005 B/px).
286
+ *
287
+ * The minimum pixel count protects against tiny icons whose density alone
288
+ * doesn't disqualify them.
289
+ */
290
+ export function shouldClassifyImage(meta: { width: number; height: number; sizeBytes: number }): boolean {
291
+ const pixels = meta.width * meta.height
292
+ if (pixels < MIN_IMAGE_PIXELS) return false
293
+ const density = meta.sizeBytes / pixels
294
+ return density >= MIN_IMAGE_DENSITY_BYTES_PER_PX
295
+ }
296
+
297
+ /**
298
+ * Read image dimensions via vipsheader. Returns null when the tool isn't
299
+ * available or output is unparseable — caller should treat that as
300
+ * "include without filtering".
301
+ */
302
+ export async function getImageDimensions(
303
+ filePath: string,
304
+ processRunner: ProcessRunner,
305
+ ): Promise<{ width: number; height: number } | null> {
306
+ try {
307
+ const { stdout } = await processRunner.execFile(
308
+ 'vipsheader',
309
+ ['-f', 'width', '-f', 'height', filePath],
310
+ { timeout: 10_000 },
311
+ )
312
+ const lines = stdout.trim().split('\n')
313
+ if (lines.length < 2) return null
314
+ const width = parseInt(lines[0], 10)
315
+ const height = parseInt(lines[1], 10)
316
+ if (!Number.isFinite(width) || !Number.isFinite(height)) return null
317
+ return { width, height }
318
+ } catch {
319
+ return null
320
+ }
321
+ }
322
+
226
323
  export async function classifyExtractedImages(
227
324
  imageStore: FileStore,
228
325
  relativePrefix: string,
229
326
  ctx: PreprocessorContext,
230
327
  registry: PreprocessorRegistry,
231
328
  logger: Logger,
329
+ fs: FileSystem,
330
+ processRunner: ProcessRunner,
232
331
  ): Promise<Array<{ relativePath: string; description: string }>> {
233
332
  const listResult = await imageStore.list('', { maxDepth: 3 })
234
333
  if (!listResult.ok) return []
235
334
 
236
- const imageFiles = listResult.value
237
- .filter(e => e.type === 'file' && IMAGE_EXT_RE.test(e.name))
238
- .sort((a, b) => a.name.localeCompare(b.name))
335
+ const candidates = listResult.value.filter(e => e.type === 'file' && IMAGE_EXT_RE.test(e.name))
336
+
337
+ // Stat + density filter, then keep the top MAX_IMAGES by file size.
338
+ const inspected = await mapWithConcurrency(candidates, 8, async (entry) => {
339
+ const pathResult = imageStore.realPath(entry.name)
340
+ if (!pathResult.ok) return null
341
+
342
+ let sizeBytes = 0
343
+ try {
344
+ sizeBytes = (await fs.stat(pathResult.value)).size
345
+ } catch {
346
+ return null
347
+ }
348
+
349
+ const dims = await getImageDimensions(pathResult.value, processRunner)
350
+ if (!dims) {
351
+ // Unknown dims — include without density filtering; better to classify than silently drop.
352
+ return { name: entry.name, sizeBytes, width: 0, height: 0, kept: true }
353
+ }
354
+
355
+ const kept = shouldClassifyImage({ width: dims.width, height: dims.height, sizeBytes })
356
+ return { name: entry.name, sizeBytes, width: dims.width, height: dims.height, kept }
357
+ })
358
+
359
+ const filtered = inspected
360
+ .filter((r): r is NonNullable<typeof r> => r !== null && r.kept)
361
+ .sort((a, b) => b.sizeBytes - a.sizeBytes)
239
362
  .slice(0, MAX_IMAGES)
240
363
 
241
- const settled = await mapWithConcurrency(imageFiles, IMAGE_CLASSIFY_CONCURRENCY, async (imgFile) => {
364
+ const droppedCount = inspected.filter(r => r !== null && !r.kept).length
365
+ if (droppedCount > 0 || inspected.length > MAX_IMAGES) {
366
+ logger.info('Image filter applied', {
367
+ source: relativePrefix,
368
+ candidates: candidates.length,
369
+ passedDensityFilter: candidates.length - droppedCount,
370
+ selected: filtered.length,
371
+ droppedByDensity: droppedCount,
372
+ })
373
+ }
374
+
375
+ const settled = await mapWithConcurrency(filtered, IMAGE_CLASSIFY_CONCURRENCY, async (imgFile) => {
242
376
  const imgPathResult = imageStore.realPath(imgFile.name)
243
377
  if (!imgPathResult.ok) return null
244
378