@roj-ai/sdk 0.1.15 → 0.1.17

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. package/dist/bootstrap.d.ts.map +1 -1
  2. package/dist/bootstrap.js +12 -2
  3. package/dist/bootstrap.js.map +1 -1
  4. package/dist/core/image/types.d.ts +2 -0
  5. package/dist/core/image/types.d.ts.map +1 -1
  6. package/dist/core/image/vips-resizer.d.ts.map +1 -1
  7. package/dist/core/image/vips-resizer.js +12 -11
  8. package/dist/core/image/vips-resizer.js.map +1 -1
  9. package/dist/core/sessions/session.d.ts.map +1 -1
  10. package/dist/core/sessions/session.js +0 -7
  11. package/dist/core/sessions/session.js.map +1 -1
  12. package/dist/plugins/uploads/preprocessors/image-classifier.d.ts +20 -0
  13. package/dist/plugins/uploads/preprocessors/image-classifier.d.ts.map +1 -1
  14. package/dist/plugins/uploads/preprocessors/image-classifier.js +93 -28
  15. package/dist/plugins/uploads/preprocessors/image-classifier.js.map +1 -1
  16. package/dist/plugins/uploads/preprocessors/index.d.ts +1 -0
  17. package/dist/plugins/uploads/preprocessors/index.d.ts.map +1 -1
  18. package/dist/plugins/uploads/preprocessors/index.js +1 -0
  19. package/dist/plugins/uploads/preprocessors/index.js.map +1 -1
  20. package/dist/plugins/uploads/preprocessors/markitdown-preprocessor.d.ts +52 -5
  21. package/dist/plugins/uploads/preprocessors/markitdown-preprocessor.d.ts.map +1 -1
  22. package/dist/plugins/uploads/preprocessors/markitdown-preprocessor.js +183 -75
  23. package/dist/plugins/uploads/preprocessors/markitdown-preprocessor.js.map +1 -1
  24. package/dist/plugins/uploads/preprocessors/pdf-preprocessor.d.ts +71 -0
  25. package/dist/plugins/uploads/preprocessors/pdf-preprocessor.d.ts.map +1 -0
  26. package/dist/plugins/uploads/preprocessors/pdf-preprocessor.js +274 -0
  27. package/dist/plugins/uploads/preprocessors/pdf-preprocessor.js.map +1 -0
  28. package/package.json +2 -2
  29. package/src/bootstrap.ts +12 -2
  30. package/src/core/image/types.ts +2 -0
  31. package/src/core/image/vips-resizer.ts +12 -11
  32. package/src/core/sessions/session.ts +0 -8
  33. package/src/plugins/uploads/preprocessors/image-classifier.ts +108 -29
  34. package/src/plugins/uploads/preprocessors/index.ts +1 -0
  35. package/src/plugins/uploads/preprocessors/markitdown-preprocessor.ts +213 -79
  36. package/src/plugins/uploads/preprocessors/pdf-preprocessor.ts +342 -0
@@ -0,0 +1,342 @@
1
+ /**
2
+ * PDF Preprocessor
3
+ *
4
+ * Dedicated PDF pipeline:
5
+ *
6
+ * 1. Text extraction via `pdftotext` (poppler-utils, C++) — ~1 s for a
7
+ * 3 MB PDF. Replaces markitdown/pdfminer.six (~22 s for the same file)
8
+ * because PDFs in practice don't carry the rich markdown structure
9
+ * that justifies the slower backend.
10
+ *
11
+ * 2. Image extraction via `pdfimages -all` — keeps the original embedded
12
+ * format (JPEG stays JPEG) instead of re-encoding everything to PNG
13
+ * (~10× faster, much smaller files).
14
+ *
15
+ * 3. Text and image extraction run in parallel.
16
+ *
17
+ * 4. Images stream into the classifier as soon as `pdfimages` writes them
18
+ * to disk — the classifier doesn't wait for the whole extraction to
19
+ * finish. A density filter (bytes/pixel) drops alpha masks and overlay
20
+ * layers before the vision call.
21
+ */
22
+
23
+ import { dirname } from 'node:path'
24
+ import type { Result } from '~/lib/utils/result.js'
25
+ import { Err, Ok } from '~/lib/utils/result.js'
26
+ import type { FileSystem } from '~/platform/fs.js'
27
+ import type { ProcessRunner } from '~/platform/process.js'
28
+ import type { Logger } from '../../../lib/logger/logger.js'
29
+ import type { Preprocessor, PreprocessorContext, PreprocessorRegistry, PreprocessorResult } from '../preprocessor.js'
30
+ import {
31
+ getImageDimensions,
32
+ guessImageMime,
33
+ IMAGE_EXT_RE,
34
+ MIN_IMAGE_DENSITY_BYTES_PER_PX,
35
+ MIN_IMAGE_PIXELS,
36
+ shouldClassifyImage,
37
+ } from './markitdown-preprocessor.js'
38
+
/** Time limit passed to the `pdftotext` invocation, in milliseconds. */
const PDFTOTEXT_TIMEOUT_MS = 60 * 1_000
/** Time limit passed to the `pdfimages` invocation, in milliseconds (five minutes). */
const PDFIMAGES_TIMEOUT_MS = 5 * 60 * 1_000

/** Upper bound on how many extracted images are accepted for classification per PDF. */
const MAX_IMAGES = 20
/** Upper bound on classifier calls that may be in flight at the same time. */
const CLASSIFY_CONCURRENCY = 10
/** Pause between two scans of the images directory while `pdfimages` runs, in milliseconds. */
const STREAM_POLL_INTERVAL_MS = 250

/** MIME types advertised by the PDF preprocessor. */
const SUPPORTED_MIME_TYPES = ['application/pdf']
47
+
/** Collaborators handed to the `PdfPreprocessor` constructor. */
export interface PdfPreprocessorConfig {
  /** Queried by MIME type to find a preprocessor that can describe an extracted image. */
  registry: PreprocessorRegistry
  /** Sink for progress, timing and failure messages. */
  logger: Logger
  /** File-system access: creating output directories, listing/stat-ing images, reading extracted text. */
  fs: FileSystem
  /** Launches the external `pdftotext` and `pdfimages` commands. */
  process: ProcessRunner
}
54
+
/**
 * Preprocessor for PDF uploads.
 *
 * Runs `pdftotext` and `pdfimages` concurrently, writes the extracted text to
 * `content.md`, hands the extracted images to whatever preprocessor the
 * registry returns for their MIME type, and reports a manifest of the files
 * that were produced.
 */
export class PdfPreprocessor implements Preprocessor {
  readonly name = 'pdf'
  readonly supportedMimeTypes = SUPPORTED_MIME_TYPES

  private readonly registry: PreprocessorRegistry
  private readonly logger: Logger
  private readonly fs: FileSystem
  private readonly processRunner: ProcessRunner

  constructor(config: PdfPreprocessorConfig) {
    this.registry = config.registry
    this.logger = config.logger
    this.fs = config.fs
    this.processRunner = config.process
  }

  /**
   * Extract text and images from one PDF.
   *
   * @param filePath - Path of the PDF to process.
   * @param mimeType - MIME type of the upload; not consulted by this implementation.
   * @param ctx - Supplies the file store that output paths are resolved against.
   * @returns `Err` when an output path cannot be resolved; otherwise `Ok` with
   *   a plain-text manifest (`extractedContent`) and the relative paths of the
   *   derived files. A failed text extraction is tolerated: the manifest then
   *   lists `content.md` with 0 chars.
   */
  async process(
    filePath: string,
    mimeType: string,
    ctx: PreprocessorContext,
  ): Promise<Result<PreprocessorResult, Error>> {
    const totalStart = Date.now()
    this.logger.info('PDF processing started', { filePath })

    const contentPathResult = ctx.files.realPath('content.md')
    if (!contentPathResult.ok) return Err(new Error('Failed to resolve content output path'))

    const imagesDirResult = ctx.files.scoped('images').realPath('')
    if (!imagesDirResult.ok) return Err(new Error('Failed to resolve images output path'))

    await this.fs.mkdir(dirname(contentPathResult.value), { recursive: true })
    await this.fs.mkdir(imagesDirResult.value, { recursive: true })

    // Run text extraction and image extraction (with streaming classification)
    // in parallel. They share no state and don't block each other.
    const [textResult, images] = await Promise.all([
      this.extractText(filePath, contentPathResult.value),
      this.extractAndClassifyImages(filePath, imagesDirResult.value, ctx),
    ])

    const markdown = textResult.ok ? textResult.value : ''

    const derivedPaths: string[] = ['content.md']
    const imageEntries: string[] = []
    for (const img of images) {
      derivedPaths.push(img.relativePath)
      imageEntries.push(`- ${img.relativePath} — ${img.description}`)
    }

    const manifestLines: string[] = ['Extracted files:']
    manifestLines.push(`- content.md (text, ${markdown.length} chars)`)
    manifestLines.push(...imageEntries)

    this.logger.info('PDF processing complete', {
      filePath,
      contentLength: markdown.length,
      imagesClassified: imageEntries.length,
      totalDurationMs: Date.now() - totalStart,
    })

    return Ok({
      extractedContent: manifestLines.join('\n'),
      derivedPaths,
    })
  }

  /**
   * Extract plain text via pdftotext. Writes to content.md verbatim — no
   * markdown structure to preserve, but the file extension stays .md for
   * consistency with the markitdown pipeline (downstream consumers expect
   * "content.md" in the upload directory).
   *
   * `-layout` preserves the original visual layout (columns, tables),
   * which is what users typically expect when looking at PDFs.
   *
   * @param filePath - PDF to read.
   * @param outputPath - Absolute path `pdftotext` writes to and that is read back afterwards.
   * @returns `Err` when the `pdftotext` invocation throws; otherwise `Ok` with
   *   the file's content, or `Ok('')` when the output file cannot be read.
   */
  private async extractText(filePath: string, outputPath: string): Promise<Result<string, Error>> {
    const start = Date.now()
    try {
      await this.processRunner.execFile(
        'pdftotext',
        ['-layout', filePath, outputPath],
        { timeout: PDFTOTEXT_TIMEOUT_MS, maxBuffer: 50 * 1024 * 1024 },
      )
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error)
      this.logger.warn('pdftotext failed', { filePath, durationMs: Date.now() - start, error: message })
      return Err(new Error(`pdftotext failed: ${message}`))
    }

    let text = ''
    try {
      text = await this.fs.readFile(outputPath, 'utf-8')
    } catch {
      // File missing — pdftotext succeeded but produced no output.
      // Deliberately best-effort: an unreadable output is reported as empty text.
    }

    this.logger.info('pdftotext complete', {
      filePath,
      durationMs: Date.now() - start,
      contentLength: text.length,
    })

    return Ok(text)
  }

  /**
   * Extract images via pdfimages and classify them as they appear on disk.
   *
   * While `pdfimages` runs, the images directory is polled with `readdir`
   * every STREAM_POLL_INTERVAL_MS and each newly seen file is inspected.
   * NOTE(review): this assumes `pdfimages` never exposes a half-written file
   * under its final name — confirm against poppler's behaviour; if it writes
   * in place, a poll could stat/classify a partially written image.
   *
   * Streaming overlaps the extraction tail with the first classification
   * batches. A hard cap of MAX_IMAGES applies across the *filtered* set: once
   * MAX_IMAGES images have passed the filter, no further image is accepted.
   * The cap is re-checked after the asynchronous inspection of each file,
   * because all files found by one scan are inspected concurrently.
   *
   * `-all` keeps the embedded format. Files whose name does not match
   * IMAGE_EXT_RE stay on disk but are skipped at the classification step.
   *
   * A failing `pdfimages` run or a failing classifier is logged and
   * tolerated; this method is meant to resolve rather than reject.
   *
   * @param filePath - PDF to read.
   * @param imagesDir - Absolute directory `pdfimages` writes into (file prefix `img`).
   * @param ctx - Supplies the file store scoped per image for the classifier.
   * @returns One entry per accepted image: its path relative to the upload
   *   directory and a description (classifier output, or the MIME type when
   *   no description could be obtained).
   */
  private async extractAndClassifyImages(
    filePath: string,
    imagesDir: string,
    ctx: PreprocessorContext,
  ): Promise<Array<{ relativePath: string; description: string }>> {
    const extractStart = Date.now()
    const seen = new Set<string>()
    const classifyPromises: Array<Promise<{ relativePath: string; description: string } | null>> = []
    let acceptedCount = 0
    let stopAccepting = false
    let droppedByDensity = 0
    let skippedUnsupportedExt = 0

    // Active classification gate — caps in-flight vision calls at
    // CLASSIFY_CONCURRENCY; excess callers wait in FIFO order.
    let active = 0
    const waiters: Array<() => void> = []
    const acquire = () => new Promise<void>(resolve => {
      if (active < CLASSIFY_CONCURRENCY) { active++; resolve() }
      else waiters.push(() => { active++; resolve() })
    })
    const release = () => {
      active--
      const next = waiters.shift()
      if (next) next()
    }

    const classifyOne = async (name: string): Promise<{ relativePath: string; description: string } | null> => {
      await acquire()
      try {
        const mime = guessImageMime(name)
        const fullPath = `${imagesDir}/${name}`
        // Fallback description when no classifier exists or it yields nothing.
        let description = mime

        try {
          const classifier = this.registry.getForMimeType(mime)
          if (classifier) {
            const result = await classifier.process(fullPath, mime, {
              files: ctx.files.scoped(`images/${name}-meta`),
            })
            if (result.ok && result.value.extractedContent) {
              description = result.value.extractedContent
            }
          }
        } catch (error) {
          // A throwing classifier must not reject this promise: it is only
          // awaited after pdfimages exits, so a rejection in the meantime
          // would be unhandled and would later fail the whole PDF. Treat it
          // like an unsuccessful result and keep the fallback description.
          this.logger.warn('PDF image classification failed', {
            filePath,
            image: name,
            error: error instanceof Error ? error.message : String(error),
          })
        }

        return { relativePath: `images/${name}`, description }
      } finally {
        release()
      }
    }

    const inspectAndMaybeClassify = async (name: string) => {
      if (seen.has(name) || stopAccepting) return
      seen.add(name)

      if (!IMAGE_EXT_RE.test(name)) {
        skippedUnsupportedExt++
        return
      }

      const fullPath = `${imagesDir}/${name}`
      let sizeBytes = 0
      try {
        sizeBytes = (await this.fs.stat(fullPath)).size
      } catch {
        return
      }

      // A failure to read dimensions is handled like "dimensions unknown" so
      // that the background poll loop cannot reject.
      const dims = await getImageDimensions(fullPath, this.processRunner).catch(() => null)
      const hasDims = dims !== null
      const passesFilter = hasDims
        ? shouldClassifyImage({ width: dims.width, height: dims.height, sizeBytes })
        : sizeBytes >= MIN_IMAGE_PIXELS * MIN_IMAGE_DENSITY_BYTES_PER_PX // fall back to absolute byte floor

      if (!passesFilter) {
        droppedByDensity++
        return
      }

      // Re-check the cap: other files from the same scan were inspected
      // concurrently and may have filled it while this one was awaiting.
      // No await separates this check from the increment below, so the cap
      // is exact.
      if (stopAccepting) return

      acceptedCount++
      if (acceptedCount >= MAX_IMAGES) {
        stopAccepting = true
      }

      classifyPromises.push(classifyOne(name))
    }

    // Run pdfimages and a parallel poll loop. The poll calls readdir
    // periodically and dispatches `inspectAndMaybeClassify` for newly
    // appeared files; doing it this way avoids fs.watch quirks (some
    // container filesystems don't deliver events).
    const pdfimagesPromise = this.processRunner.execFile(
      'pdfimages',
      ['-all', filePath, `${imagesDir}/img`],
      { timeout: PDFIMAGES_TIMEOUT_MS, maxBuffer: 1024 * 1024 },
    ).then(() => true).catch((error) => {
      this.logger.warn('pdfimages failed (will classify any partial output)', {
        filePath,
        durationMs: Date.now() - extractStart,
        error: error instanceof Error ? error.message : String(error),
      })
      return false
    })

    let extractionDone = false
    const poll = async () => {
      while (!extractionDone) {
        await this.scanAndDispatch(imagesDir, inspectAndMaybeClassify)
        await sleep(STREAM_POLL_INTERVAL_MS)
      }
      // Final sweep — pick up anything that landed between the last poll
      // and pdfimages exiting.
      await this.scanAndDispatch(imagesDir, inspectAndMaybeClassify)
    }

    const pollPromise = poll()

    const extractSucceeded = await pdfimagesPromise
    extractionDone = true
    await pollPromise

    this.logger.info(extractSucceeded ? 'pdfimages complete' : 'pdfimages failed (partial)', {
      filePath,
      durationMs: Date.now() - extractStart,
      filesEmitted: seen.size,
      passedFilter: acceptedCount,
      droppedByDensity,
      skippedUnsupportedExt,
    })

    const classifyStart = Date.now()
    const settled = await Promise.all(classifyPromises)
    const images = settled.filter((r): r is { relativePath: string; description: string } => r !== null)

    this.logger.info('PDF image classification complete', {
      filePath,
      count: images.length,
      durationMs: Date.now() - classifyStart,
    })

    return images
  }

  /**
   * List `dir` once and invoke `handle` for every entry, concurrently.
   *
   * A failing `readdir` is treated as "nothing to do". The handler is
   * expected to ignore names it has already seen.
   *
   * @param dir - Directory to list.
   * @param handle - Called with each entry name; all calls are awaited together.
   */
  private async scanAndDispatch(
    dir: string,
    handle: (name: string) => Promise<void>,
  ): Promise<void> {
    let entries: string[]
    try {
      entries = await this.fs.readdir(dir)
    } catch {
      return
    }
    // Fire dispatches in parallel — `inspectAndMaybeClassify` is internally
    // idempotent for already-seen names.
    await Promise.all(entries.map(handle))
  }
}
339
+
/**
 * Resolve after roughly `ms` milliseconds.
 *
 * @param ms - Delay handed to `setTimeout`.
 * @returns A promise that resolves with no value once the timer fires.
 */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((wake) => {
    setTimeout(wake, ms)
  })
}