@roj-ai/sdk 0.1.16 → 0.1.18

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45) hide show
  1. package/dist/bootstrap.d.ts.map +1 -1
  2. package/dist/bootstrap.js +12 -2
  3. package/dist/bootstrap.js.map +1 -1
  4. package/dist/config.d.ts +12 -0
  5. package/dist/config.d.ts.map +1 -1
  6. package/dist/config.js.map +1 -1
  7. package/dist/core/image/types.d.ts +2 -0
  8. package/dist/core/image/types.d.ts.map +1 -1
  9. package/dist/core/image/vips-resizer.d.ts.map +1 -1
  10. package/dist/core/image/vips-resizer.js +12 -11
  11. package/dist/core/image/vips-resizer.js.map +1 -1
  12. package/dist/info.d.ts +7 -0
  13. package/dist/info.d.ts.map +1 -0
  14. package/dist/info.js +8 -0
  15. package/dist/info.js.map +1 -0
  16. package/dist/plugins/uploads/preprocessors/image-classifier.d.ts +20 -0
  17. package/dist/plugins/uploads/preprocessors/image-classifier.d.ts.map +1 -1
  18. package/dist/plugins/uploads/preprocessors/image-classifier.js +78 -26
  19. package/dist/plugins/uploads/preprocessors/image-classifier.js.map +1 -1
  20. package/dist/plugins/uploads/preprocessors/index.d.ts +1 -0
  21. package/dist/plugins/uploads/preprocessors/index.d.ts.map +1 -1
  22. package/dist/plugins/uploads/preprocessors/index.js +1 -0
  23. package/dist/plugins/uploads/preprocessors/index.js.map +1 -1
  24. package/dist/plugins/uploads/preprocessors/markitdown-preprocessor.d.ts +52 -5
  25. package/dist/plugins/uploads/preprocessors/markitdown-preprocessor.d.ts.map +1 -1
  26. package/dist/plugins/uploads/preprocessors/markitdown-preprocessor.js +152 -97
  27. package/dist/plugins/uploads/preprocessors/markitdown-preprocessor.js.map +1 -1
  28. package/dist/plugins/uploads/preprocessors/pdf-preprocessor.d.ts +71 -0
  29. package/dist/plugins/uploads/preprocessors/pdf-preprocessor.d.ts.map +1 -0
  30. package/dist/plugins/uploads/preprocessors/pdf-preprocessor.js +274 -0
  31. package/dist/plugins/uploads/preprocessors/pdf-preprocessor.js.map +1 -0
  32. package/dist/transport/http/app.d.ts.map +1 -1
  33. package/dist/transport/http/app.js +6 -1
  34. package/dist/transport/http/app.js.map +1 -1
  35. package/package.json +2 -2
  36. package/src/bootstrap.ts +12 -2
  37. package/src/config.ts +13 -0
  38. package/src/core/image/types.ts +2 -0
  39. package/src/core/image/vips-resizer.ts +12 -11
  40. package/src/info.ts +9 -0
  41. package/src/plugins/uploads/preprocessors/image-classifier.ts +93 -27
  42. package/src/plugins/uploads/preprocessors/index.ts +1 -0
  43. package/src/plugins/uploads/preprocessors/markitdown-preprocessor.ts +173 -108
  44. package/src/plugins/uploads/preprocessors/pdf-preprocessor.ts +342 -0
  45. package/src/transport/http/app.ts +6 -1
@@ -2,11 +2,14 @@
2
2
  * Markitdown Preprocessor
3
3
  *
4
4
  * Converts documents to markdown using Microsoft's markitdown CLI.
5
- * Supports PDF, DOCX, XLSX, PPTX, HTML, CSV, JSON, XML, EPUB, and more.
5
+ * Supports DOCX, XLSX, PPTX, HTML, CSV, JSON, XML, EPUB, RTF, ODT.
6
+ *
7
+ * PDFs are handled by `PdfPreprocessor` instead — markitdown's PDF backend
8
+ * (pdfminer.six) is ~20× slower than pdftotext for no real gain on the
9
+ * mostly-unstructured PDFs we see in practice.
6
10
  *
7
11
  * Image extraction:
8
- * - PDF: uses pdfimages (poppler-utils)
9
- * - DOCX/ODT/EPUB: uses pandoc --extract-media
12
+ * - DOCX/ODT/EPUB: uses pandoc --extract-media (runs in parallel with markitdown)
10
13
  *
11
14
  * Extracted images are classified via the image classifier preprocessor.
12
15
  * Full content is written to disk; extractedContent contains a structured manifest.
@@ -22,14 +25,27 @@ import type { FileStore } from '../../../core/file-store/types.js'
22
25
  import type { Logger } from '../../../lib/logger/logger.js'
23
26
  import type { Preprocessor, PreprocessorContext, PreprocessorRegistry, PreprocessorResult } from '../preprocessor.js'
24
27
 
25
- const MAX_IMAGES = 50
28
+ const MAX_IMAGES = 20
26
29
  const IMAGE_CLASSIFY_CONCURRENCY = 10
27
30
 
28
- // markitdown converts a text-only document; even large PDFs finish in seconds.
31
+ /**
32
+ * Density filter for extracted images. Bytes-per-pixel ratio below this
33
+ * threshold typically means the image is an alpha mask, overlay layer, or
34
+ * essentially-empty region — not worth a vision call.
35
+ *
36
+ * Empirical reference points:
37
+ * - Dense photo JPEG: 0.3–1.0 B/px
38
+ * - Logo / icon PNG: 0.1–0.5 B/px
39
+ * - Brand PDF layer mask: <0.005 B/px
40
+ */
41
+ export const MIN_IMAGE_DENSITY_BYTES_PER_PX = 0.05
42
+ export const MIN_IMAGE_PIXELS = 50 * 50
43
+
44
+ // markitdown converts a text-only document; even large files finish in seconds.
29
45
  const MARKITDOWN_TIMEOUT_MS = 60_000
30
- // Image extractors (pdfimages, pandoc --extract-media) scale with image count
31
- // and resolution. Real-world large brand PDFs (40 pages, 5MB images) can take
32
- // 60–90s. Upload preprocessing is async/background, so allow generous headroom.
46
+ // Image extractors (pandoc --extract-media) scale with image count
47
+ // and resolution. Real-world large docs can take 60–90s. Upload preprocessing
48
+ // is async/background, so allow generous headroom.
33
49
  const IMAGE_EXTRACT_TIMEOUT_MS = 5 * 60_000
34
50
 
35
51
  function makeExec(processRunner: ProcessRunner) {
@@ -37,9 +53,8 @@ function makeExec(processRunner: ProcessRunner) {
37
53
  processRunner.execFile(cmd, args, { timeout: timeoutMs, maxBuffer: 50 * 1024 * 1024 })
38
54
  }
39
55
 
40
- /** MIME types where markitdown converts to markdown (non-ZIP, non-image) */
56
+ /** MIME types where markitdown converts to markdown (non-ZIP, non-image, non-PDF) */
41
57
  const SUPPORTED_MIME_TYPES = [
42
- 'application/pdf',
43
58
  'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
44
59
  'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
45
60
  'application/vnd.openxmlformats-officedocument.presentationml.presentation',
@@ -67,11 +82,6 @@ const PANDOC_FORMAT_MAP: Record<string, string> = {
67
82
  'application/epub+zip': 'epub',
68
83
  }
69
84
 
70
- /** MIME types where pdfimages can extract images */
71
- const PDFIMAGES_MIMES = new Set([
72
- 'application/pdf',
73
- ])
74
-
75
85
  export interface MarkitdownPreprocessorConfig {
76
86
  registry: PreprocessorRegistry
77
87
  logger: Logger
@@ -86,12 +96,14 @@ export class MarkitdownPreprocessor implements Preprocessor {
86
96
  private readonly registry: PreprocessorRegistry
87
97
  private readonly logger: Logger
88
98
  private readonly fs: FileSystem
99
+ private readonly processRunner: ProcessRunner
89
100
  private readonly exec: (cmd: string, args: string[], timeoutMs?: number) => Promise<{ stdout: string; stderr: string }>
90
101
 
91
102
  constructor(config: MarkitdownPreprocessorConfig) {
92
103
  this.registry = config.registry
93
104
  this.logger = config.logger
94
105
  this.fs = config.fs
106
+ this.processRunner = config.process
95
107
  this.exec = makeExec(config.process)
96
108
  }
97
109
 
@@ -101,21 +113,63 @@ export class MarkitdownPreprocessor implements Preprocessor {
101
113
  ctx: PreprocessorContext,
102
114
  ): Promise<Result<PreprocessorResult, Error>> {
103
115
  const totalStart = Date.now()
104
- const derivedPaths: string[] = []
105
- const imageEntries: string[] = []
106
116
 
107
117
  this.logger.info('Markitdown processing started', { filePath, mimeType })
108
118
 
109
- // 1. Convert to markdown via markitdown
110
119
  const contentPathResult = ctx.files.realPath('content.md')
111
120
  if (!contentPathResult.ok) {
112
121
  return Err(new Error('Failed to resolve output path'))
113
122
  }
123
+ await this.fs.mkdir(dirname(contentPathResult.value), { recursive: true })
124
+
125
+ // Run markitdown text conversion and image extraction concurrently — they're
126
+ // independent, so there's no reason to serialize them. For documents
127
+ // where pandoc extraction isn't applicable, the image task resolves
128
+ // immediately.
129
+ const markdownTask = this.runMarkitdown(filePath, mimeType, contentPathResult.value)
130
+ const imageTask = PANDOC_EXTRACT_MIMES.has(mimeType)
131
+ ? this.extractImagesWithPandoc(filePath, mimeType, ctx)
132
+ : Promise.resolve<Array<{ relativePath: string; description: string }>>([])
133
+
134
+ const [markdownResult, images] = await Promise.all([markdownTask, imageTask])
135
+
136
+ if (!markdownResult.ok) return markdownResult
137
+
138
+ const markdown = markdownResult.value
139
+
140
+ const derivedPaths: string[] = ['content.md']
141
+ const imageEntries: string[] = []
142
+ for (const img of images) {
143
+ derivedPaths.push(img.relativePath)
144
+ imageEntries.push(`- ${img.relativePath} — ${img.description}`)
145
+ }
146
+
147
+ const manifestLines: string[] = ['Extracted files:']
148
+ manifestLines.push(`- content.md (markdown, ${markdown.length} chars)`)
149
+ manifestLines.push(...imageEntries)
150
+
151
+ this.logger.info('Markitdown processing complete', {
152
+ filePath,
153
+ mimeType,
154
+ contentLength: markdown.length,
155
+ imagesExtracted: imageEntries.length,
156
+ totalDurationMs: Date.now() - totalStart,
157
+ })
158
+
159
+ return Ok({
160
+ extractedContent: manifestLines.join('\n'),
161
+ derivedPaths,
162
+ })
163
+ }
114
164
 
165
+ private async runMarkitdown(
166
+ filePath: string,
167
+ mimeType: string,
168
+ outputPath: string,
169
+ ): Promise<Result<string, Error>> {
115
170
  const markitdownStart = Date.now()
116
171
  try {
117
- await this.fs.mkdir(dirname(contentPathResult.value), { recursive: true })
118
- await this.exec('markitdown', [filePath, '-o', contentPathResult.value])
172
+ await this.exec('markitdown', [filePath, '-o', outputPath])
119
173
  } catch (error) {
120
174
  const message = error instanceof Error ? error.message : String(error)
121
175
  this.logger.error(
@@ -129,10 +183,12 @@ export class MarkitdownPreprocessor implements Preprocessor {
129
183
  return Err(new Error(`markitdown failed: ${message}`))
130
184
  }
131
185
 
132
- const contentResult = await ctx.files.read('content.md')
133
- const markdown = contentResult.ok ? contentResult.value : ''
134
-
135
- derivedPaths.push('content.md')
186
+ let markdown = ''
187
+ try {
188
+ markdown = await this.fs.readFile(outputPath, 'utf-8')
189
+ } catch {
190
+ // Output missing — markitdown completed but produced nothing.
191
+ }
136
192
 
137
193
  this.logger.info('Markitdown conversion complete', {
138
194
  filePath,
@@ -141,41 +197,7 @@ export class MarkitdownPreprocessor implements Preprocessor {
141
197
  contentLength: markdown.length,
142
198
  })
143
199
 
144
- // 2. Extract images based on file type
145
- const imagePhaseStart = Date.now()
146
- if (PANDOC_EXTRACT_MIMES.has(mimeType)) {
147
- const images = await this.extractImagesWithPandoc(filePath, mimeType, ctx)
148
- for (const img of images) {
149
- derivedPaths.push(img.relativePath)
150
- imageEntries.push(`- ${img.relativePath} — ${img.description}`)
151
- }
152
- } else if (PDFIMAGES_MIMES.has(mimeType)) {
153
- const images = await this.extractImagesWithPdfimages(filePath, ctx)
154
- for (const img of images) {
155
- derivedPaths.push(img.relativePath)
156
- imageEntries.push(`- ${img.relativePath} — ${img.description}`)
157
- }
158
- }
159
- const imagePhaseDurationMs = Date.now() - imagePhaseStart
160
-
161
- // 3. Build manifest
162
- const manifestLines: string[] = ['Extracted files:']
163
- manifestLines.push(`- content.md (markdown, ${markdown.length} chars)`)
164
- manifestLines.push(...imageEntries)
165
-
166
- this.logger.info('Markitdown processing complete', {
167
- filePath,
168
- mimeType,
169
- contentLength: markdown.length,
170
- imagesExtracted: imageEntries.length,
171
- imagePhaseDurationMs,
172
- totalDurationMs: Date.now() - totalStart,
173
- })
174
-
175
- return Ok({
176
- extractedContent: manifestLines.join('\n'),
177
- derivedPaths,
178
- })
200
+ return Ok(markdown)
179
201
  }
180
202
 
181
203
  private async extractImagesWithPandoc(
@@ -215,7 +237,7 @@ export class MarkitdownPreprocessor implements Preprocessor {
215
237
  }
216
238
 
217
239
  const classifyStart = Date.now()
218
- const images = await classifyExtractedImages(mediaStore, 'media', ctx, this.registry, this.logger)
240
+ const images = await classifyExtractedImages(mediaStore, 'media', ctx, this.registry, this.logger, this.fs, this.processRunner)
219
241
  this.logger.info('Image classification complete', {
220
242
  source: 'pandoc',
221
243
  count: images.length,
@@ -224,56 +246,18 @@ export class MarkitdownPreprocessor implements Preprocessor {
224
246
  })
225
247
  return images
226
248
  }
227
-
228
- private async extractImagesWithPdfimages(
229
- filePath: string,
230
- ctx: PreprocessorContext,
231
- ): Promise<Array<{ relativePath: string; description: string }>> {
232
- const imageStore = ctx.files.scoped('images')
233
- const imagesDirResult = imageStore.realPath('')
234
- if (!imagesDirResult.ok) return []
235
-
236
- const pdfimagesStart = Date.now()
237
- let extractSucceeded = true
238
- try {
239
- await this.fs.mkdir(imagesDirResult.value, { recursive: true })
240
- await this.exec(
241
- 'pdfimages',
242
- ['-png', filePath, `${imagesDirResult.value}/img`],
243
- IMAGE_EXTRACT_TIMEOUT_MS,
244
- )
245
- } catch (error) {
246
- extractSucceeded = false
247
- this.logger.warn('pdfimages failed (will classify any partial output)', {
248
- filePath,
249
- durationMs: Date.now() - pdfimagesStart,
250
- error: error instanceof Error ? error.message : String(error),
251
- })
252
- }
253
- if (extractSucceeded) {
254
- this.logger.info('pdfimages complete', {
255
- filePath,
256
- durationMs: Date.now() - pdfimagesStart,
257
- })
258
- }
259
-
260
- const classifyStart = Date.now()
261
- const images = await classifyExtractedImages(imageStore, 'images', ctx, this.registry, this.logger)
262
- this.logger.info('Image classification complete', {
263
- source: 'pdfimages',
264
- count: images.length,
265
- partial: !extractSucceeded,
266
- durationMs: Date.now() - classifyStart,
267
- })
268
- return images
269
- }
270
249
  }
271
250
 
272
251
  // ============================================================================
273
252
  // Shared image helpers
274
253
  // ============================================================================
275
254
 
276
- const IMAGE_EXT_RE = /\.(png|jpe?g|gif|webp|tiff?|bmp|svg)$/i
255
+ /**
256
+ * Recognized by Anthropic vision API. Other pdfimages outputs (pbm, ppm,
257
+ * jb2e, jp2) are ignored — they'd require local conversion before being
258
+ * useful for classification.
259
+ */
260
+ export const IMAGE_EXT_RE = /\.(png|jpe?g|gif|webp|tiff?|bmp|svg)$/i
277
261
 
278
262
  const IMAGE_MIME_MAP: Record<string, string> = {
279
263
  png: 'image/png',
@@ -292,22 +276,103 @@ export function guessImageMime(filename: string): string {
292
276
  return IMAGE_MIME_MAP[ext ?? ''] ?? 'image/png'
293
277
  }
294
278
 
279
/**
 * Decide whether an extracted image is worth sending to the classifier.
 *
 * Two gates are applied in order:
 *
 * 1. Area — an image with fewer than `MIN_IMAGE_PIXELS` pixels is rejected
 *    outright. This catches tiny icons whose density alone would not
 *    disqualify them.
 * 2. Density — the file's bytes-per-pixel ratio must reach
 *    `MIN_IMAGE_DENSITY_BYTES_PER_PX`. Alpha masks, sparse overlays, and
 *    essentially-empty pages fall below it: brand PDFs typically emit a real
 *    photo (~1 B/px) plus a matching transparency/overlay layer at the same
 *    dimensions but a fraction of a percent of the size (<0.005 B/px).
 *
 * @param meta - Pixel dimensions and on-disk size of the image.
 * @returns `true` when the image passes both gates.
 */
export function shouldClassifyImage(meta: { width: number; height: number; sizeBytes: number }): boolean {
  const { width, height, sizeBytes } = meta
  const area = width * height
  return area < MIN_IMAGE_PIXELS
    ? false
    : sizeBytes / area >= MIN_IMAGE_DENSITY_BYTES_PER_PX
}
296
+
297
/**
 * Read an image's pixel dimensions via the `vipsheader` CLI.
 *
 * Strategy:
 * 1. Ask for both fields in a single invocation (`-f width -f height`). When
 *    the tool prints at least two lines, the first is taken as the width and
 *    the second as the height.
 * 2. If fewer than two lines come back, query each field with its own
 *    invocation. NOTE(review): `vipsheader`'s `-f` looks like a single-valued
 *    option, in which case only the last `-f` is honored and step 1 can never
 *    yield two lines — confirm against the deployed libvips version.
 *
 * Values are parsed strictly: only plain non-negative decimal integers are
 * accepted, so error text or partially numeric output (e.g. `"12abc"`) is
 * treated as unparseable rather than silently truncated.
 *
 * @param filePath - Path of the image file to inspect.
 * @param processRunner - Runner used to spawn `vipsheader`.
 * @returns The dimensions, or `null` when the tool is unavailable, fails, or
 *   prints output that cannot be parsed — callers should treat `null` as
 *   "include without filtering".
 */
export async function getImageDimensions(
  filePath: string,
  processRunner: ProcessRunner,
): Promise<{ width: number; height: number } | null> {
  // Strict parse: digits only, within the safe-integer range.
  const parseDimension = (raw: string | undefined): number | null => {
    const text = raw?.trim() ?? ''
    if (!/^\d+$/.test(text)) return null
    const value = Number(text)
    return Number.isSafeInteger(value) ? value : null
  }

  // Run vipsheader for the given header fields and return its output lines.
  const queryFields = async (fields: string[]): Promise<string[]> => {
    const { stdout } = await processRunner.execFile(
      'vipsheader',
      [...fields.flatMap(field => ['-f', field]), filePath],
      { timeout: 10_000 },
    )
    return stdout.trim().split('\n')
  }

  try {
    let widthText: string | undefined
    let heightText: string | undefined

    const combined = await queryFields(['width', 'height'])
    if (combined.length >= 2) {
      widthText = combined[0]
      heightText = combined[1]
    } else {
      // Only one value was printed for two requested fields; we cannot tell
      // which field it belongs to, so ask for each one separately.
      const [widthLines, heightLines] = await Promise.all([
        queryFields(['width']),
        queryFields(['height']),
      ])
      widthText = widthLines[0]
      heightText = heightLines[0]
    }

    const width = parseDimension(widthText)
    const height = parseDimension(heightText)
    if (width === null || height === null) return null
    return { width, height }
  } catch {
    // Tool missing, timed out, or exited non-zero — dimensions are unknown.
    return null
  }
}
322
+
295
323
  export async function classifyExtractedImages(
296
324
  imageStore: FileStore,
297
325
  relativePrefix: string,
298
326
  ctx: PreprocessorContext,
299
327
  registry: PreprocessorRegistry,
300
328
  logger: Logger,
329
+ fs: FileSystem,
330
+ processRunner: ProcessRunner,
301
331
  ): Promise<Array<{ relativePath: string; description: string }>> {
302
332
  const listResult = await imageStore.list('', { maxDepth: 3 })
303
333
  if (!listResult.ok) return []
304
334
 
305
- const imageFiles = listResult.value
306
- .filter(e => e.type === 'file' && IMAGE_EXT_RE.test(e.name))
307
- .sort((a, b) => a.name.localeCompare(b.name))
335
+ const candidates = listResult.value.filter(e => e.type === 'file' && IMAGE_EXT_RE.test(e.name))
336
+
337
+ // Stat + density filter, then keep the top MAX_IMAGES by file size.
338
+ const inspected = await mapWithConcurrency(candidates, 8, async (entry) => {
339
+ const pathResult = imageStore.realPath(entry.name)
340
+ if (!pathResult.ok) return null
341
+
342
+ let sizeBytes = 0
343
+ try {
344
+ sizeBytes = (await fs.stat(pathResult.value)).size
345
+ } catch {
346
+ return null
347
+ }
348
+
349
+ const dims = await getImageDimensions(pathResult.value, processRunner)
350
+ if (!dims) {
351
+ // Unknown dims — include unfiltered; better to classify than silently drop.
352
+ return { name: entry.name, sizeBytes, width: 0, height: 0, kept: true }
353
+ }
354
+
355
+ const kept = shouldClassifyImage({ width: dims.width, height: dims.height, sizeBytes })
356
+ return { name: entry.name, sizeBytes, width: dims.width, height: dims.height, kept }
357
+ })
358
+
359
+ const filtered = inspected
360
+ .filter((r): r is NonNullable<typeof r> => r !== null && r.kept)
361
+ .sort((a, b) => b.sizeBytes - a.sizeBytes)
308
362
  .slice(0, MAX_IMAGES)
309
363
 
310
- const settled = await mapWithConcurrency(imageFiles, IMAGE_CLASSIFY_CONCURRENCY, async (imgFile) => {
364
+ const droppedCount = inspected.filter(r => r !== null && !r.kept).length
365
+ if (droppedCount > 0 || inspected.length > MAX_IMAGES) {
366
+ logger.info('Image filter applied', {
367
+ source: relativePrefix,
368
+ candidates: candidates.length,
369
+ passedDensityFilter: candidates.length - droppedCount,
370
+ selected: filtered.length,
371
+ droppedByDensity: droppedCount,
372
+ })
373
+ }
374
+
375
+ const settled = await mapWithConcurrency(filtered, IMAGE_CLASSIFY_CONCURRENCY, async (imgFile) => {
311
376
  const imgPathResult = imageStore.realPath(imgFile.name)
312
377
  if (!imgPathResult.ok) return null
313
378