@roj-ai/sdk 0.1.16 → 0.1.18
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/bootstrap.d.ts.map +1 -1
- package/dist/bootstrap.js +12 -2
- package/dist/bootstrap.js.map +1 -1
- package/dist/config.d.ts +12 -0
- package/dist/config.d.ts.map +1 -1
- package/dist/config.js.map +1 -1
- package/dist/core/image/types.d.ts +2 -0
- package/dist/core/image/types.d.ts.map +1 -1
- package/dist/core/image/vips-resizer.d.ts.map +1 -1
- package/dist/core/image/vips-resizer.js +12 -11
- package/dist/core/image/vips-resizer.js.map +1 -1
- package/dist/info.d.ts +7 -0
- package/dist/info.d.ts.map +1 -0
- package/dist/info.js +8 -0
- package/dist/info.js.map +1 -0
- package/dist/plugins/uploads/preprocessors/image-classifier.d.ts +20 -0
- package/dist/plugins/uploads/preprocessors/image-classifier.d.ts.map +1 -1
- package/dist/plugins/uploads/preprocessors/image-classifier.js +78 -26
- package/dist/plugins/uploads/preprocessors/image-classifier.js.map +1 -1
- package/dist/plugins/uploads/preprocessors/index.d.ts +1 -0
- package/dist/plugins/uploads/preprocessors/index.d.ts.map +1 -1
- package/dist/plugins/uploads/preprocessors/index.js +1 -0
- package/dist/plugins/uploads/preprocessors/index.js.map +1 -1
- package/dist/plugins/uploads/preprocessors/markitdown-preprocessor.d.ts +52 -5
- package/dist/plugins/uploads/preprocessors/markitdown-preprocessor.d.ts.map +1 -1
- package/dist/plugins/uploads/preprocessors/markitdown-preprocessor.js +152 -97
- package/dist/plugins/uploads/preprocessors/markitdown-preprocessor.js.map +1 -1
- package/dist/plugins/uploads/preprocessors/pdf-preprocessor.d.ts +71 -0
- package/dist/plugins/uploads/preprocessors/pdf-preprocessor.d.ts.map +1 -0
- package/dist/plugins/uploads/preprocessors/pdf-preprocessor.js +274 -0
- package/dist/plugins/uploads/preprocessors/pdf-preprocessor.js.map +1 -0
- package/dist/transport/http/app.d.ts.map +1 -1
- package/dist/transport/http/app.js +6 -1
- package/dist/transport/http/app.js.map +1 -1
- package/package.json +2 -2
- package/src/bootstrap.ts +12 -2
- package/src/config.ts +13 -0
- package/src/core/image/types.ts +2 -0
- package/src/core/image/vips-resizer.ts +12 -11
- package/src/info.ts +9 -0
- package/src/plugins/uploads/preprocessors/image-classifier.ts +93 -27
- package/src/plugins/uploads/preprocessors/index.ts +1 -0
- package/src/plugins/uploads/preprocessors/markitdown-preprocessor.ts +173 -108
- package/src/plugins/uploads/preprocessors/pdf-preprocessor.ts +342 -0
- package/src/transport/http/app.ts +6 -1
|
@@ -2,11 +2,14 @@
|
|
|
2
2
|
* Markitdown Preprocessor
|
|
3
3
|
*
|
|
4
4
|
* Converts documents to markdown using Microsoft's markitdown CLI.
|
|
5
|
-
* Supports
|
|
5
|
+
* Supports DOCX, XLSX, PPTX, HTML, CSV, JSON, XML, EPUB, RTF, ODT.
|
|
6
|
+
*
|
|
7
|
+
* PDFs are handled by `PdfPreprocessor` instead — markitdown's PDF backend
|
|
8
|
+
* (pdfminer.six) is ~20× slower than pdftotext for no real gain on the
|
|
9
|
+
* mostly-unstructured PDFs we see in practice.
|
|
6
10
|
*
|
|
7
11
|
* Image extraction:
|
|
8
|
-
* -
|
|
9
|
-
* - DOCX/ODT/EPUB: uses pandoc --extract-media
|
|
12
|
+
* - DOCX/ODT/EPUB: uses pandoc --extract-media (runs in parallel with markitdown)
|
|
10
13
|
*
|
|
11
14
|
* Extracted images are classified via the image classifier preprocessor.
|
|
12
15
|
* Full content is written to disk; extractedContent contains a structured manifest.
|
|
@@ -22,14 +25,27 @@ import type { FileStore } from '../../../core/file-store/types.js'
|
|
|
22
25
|
import type { Logger } from '../../../lib/logger/logger.js'
|
|
23
26
|
import type { Preprocessor, PreprocessorContext, PreprocessorRegistry, PreprocessorResult } from '../preprocessor.js'
|
|
24
27
|
|
|
25
|
-
const MAX_IMAGES =
|
|
28
|
+
const MAX_IMAGES = 20
|
|
26
29
|
const IMAGE_CLASSIFY_CONCURRENCY = 10
|
|
27
30
|
|
|
28
|
-
|
|
31
|
+
/**
|
|
32
|
+
* Density filter for extracted images. Bytes-per-pixel ratio below this
|
|
33
|
+
* threshold typically means the image is an alpha mask, overlay layer, or
|
|
34
|
+
* essentially-empty region — not worth a vision call.
|
|
35
|
+
*
|
|
36
|
+
* Empirical reference points:
|
|
37
|
+
* - Dense photo JPEG: 0.3–1.0 B/px
|
|
38
|
+
* - Logo / icon PNG: 0.1–0.5 B/px
|
|
39
|
+
* - Brand PDF layer mask: <0.005 B/px
|
|
40
|
+
*/
|
|
41
|
+
export const MIN_IMAGE_DENSITY_BYTES_PER_PX = 0.05
|
|
42
|
+
export const MIN_IMAGE_PIXELS = 50 * 50
|
|
43
|
+
|
|
44
|
+
// markitdown converts a text-only document; even large files finish in seconds.
|
|
29
45
|
const MARKITDOWN_TIMEOUT_MS = 60_000
|
|
30
|
-
// Image extractors (
|
|
31
|
-
// and resolution. Real-world large
|
|
32
|
-
//
|
|
46
|
+
// Image extractors (pandoc --extract-media) scale with image count
|
|
47
|
+
// and resolution. Real-world large docs can take 60–90s. Upload preprocessing
|
|
48
|
+
// is async/background, so allow generous headroom.
|
|
33
49
|
const IMAGE_EXTRACT_TIMEOUT_MS = 5 * 60_000
|
|
34
50
|
|
|
35
51
|
function makeExec(processRunner: ProcessRunner) {
|
|
@@ -37,9 +53,8 @@ function makeExec(processRunner: ProcessRunner) {
|
|
|
37
53
|
processRunner.execFile(cmd, args, { timeout: timeoutMs, maxBuffer: 50 * 1024 * 1024 })
|
|
38
54
|
}
|
|
39
55
|
|
|
40
|
-
/** MIME types where markitdown converts to markdown (non-ZIP, non-image) */
|
|
56
|
+
/** MIME types where markitdown converts to markdown (non-ZIP, non-image, non-PDF) */
|
|
41
57
|
const SUPPORTED_MIME_TYPES = [
|
|
42
|
-
'application/pdf',
|
|
43
58
|
'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
|
|
44
59
|
'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
|
|
45
60
|
'application/vnd.openxmlformats-officedocument.presentationml.presentation',
|
|
@@ -67,11 +82,6 @@ const PANDOC_FORMAT_MAP: Record<string, string> = {
|
|
|
67
82
|
'application/epub+zip': 'epub',
|
|
68
83
|
}
|
|
69
84
|
|
|
70
|
-
/** MIME types where pdfimages can extract images */
|
|
71
|
-
const PDFIMAGES_MIMES = new Set([
|
|
72
|
-
'application/pdf',
|
|
73
|
-
])
|
|
74
|
-
|
|
75
85
|
export interface MarkitdownPreprocessorConfig {
|
|
76
86
|
registry: PreprocessorRegistry
|
|
77
87
|
logger: Logger
|
|
@@ -86,12 +96,14 @@ export class MarkitdownPreprocessor implements Preprocessor {
|
|
|
86
96
|
private readonly registry: PreprocessorRegistry
|
|
87
97
|
private readonly logger: Logger
|
|
88
98
|
private readonly fs: FileSystem
|
|
99
|
+
private readonly processRunner: ProcessRunner
|
|
89
100
|
private readonly exec: (cmd: string, args: string[], timeoutMs?: number) => Promise<{ stdout: string; stderr: string }>
|
|
90
101
|
|
|
91
102
|
constructor(config: MarkitdownPreprocessorConfig) {
|
|
92
103
|
this.registry = config.registry
|
|
93
104
|
this.logger = config.logger
|
|
94
105
|
this.fs = config.fs
|
|
106
|
+
this.processRunner = config.process
|
|
95
107
|
this.exec = makeExec(config.process)
|
|
96
108
|
}
|
|
97
109
|
|
|
@@ -101,21 +113,63 @@ export class MarkitdownPreprocessor implements Preprocessor {
|
|
|
101
113
|
ctx: PreprocessorContext,
|
|
102
114
|
): Promise<Result<PreprocessorResult, Error>> {
|
|
103
115
|
const totalStart = Date.now()
|
|
104
|
-
const derivedPaths: string[] = []
|
|
105
|
-
const imageEntries: string[] = []
|
|
106
116
|
|
|
107
117
|
this.logger.info('Markitdown processing started', { filePath, mimeType })
|
|
108
118
|
|
|
109
|
-
// 1. Convert to markdown via markitdown
|
|
110
119
|
const contentPathResult = ctx.files.realPath('content.md')
|
|
111
120
|
if (!contentPathResult.ok) {
|
|
112
121
|
return Err(new Error('Failed to resolve output path'))
|
|
113
122
|
}
|
|
123
|
+
await this.fs.mkdir(dirname(contentPathResult.value), { recursive: true })
|
|
124
|
+
|
|
125
|
+
// Run markitdown text conversion and image extraction concurrently — they're
|
|
126
|
+
// independent, so there's no reason to serialize them. For documents
|
|
127
|
+
// where pandoc extraction isn't applicable, the image task resolves
|
|
128
|
+
// immediately.
|
|
129
|
+
const markdownTask = this.runMarkitdown(filePath, mimeType, contentPathResult.value)
|
|
130
|
+
const imageTask = PANDOC_EXTRACT_MIMES.has(mimeType)
|
|
131
|
+
? this.extractImagesWithPandoc(filePath, mimeType, ctx)
|
|
132
|
+
: Promise.resolve<Array<{ relativePath: string; description: string }>>([])
|
|
133
|
+
|
|
134
|
+
const [markdownResult, images] = await Promise.all([markdownTask, imageTask])
|
|
135
|
+
|
|
136
|
+
if (!markdownResult.ok) return markdownResult
|
|
137
|
+
|
|
138
|
+
const markdown = markdownResult.value
|
|
139
|
+
|
|
140
|
+
const derivedPaths: string[] = ['content.md']
|
|
141
|
+
const imageEntries: string[] = []
|
|
142
|
+
for (const img of images) {
|
|
143
|
+
derivedPaths.push(img.relativePath)
|
|
144
|
+
imageEntries.push(`- ${img.relativePath} — ${img.description}`)
|
|
145
|
+
}
|
|
146
|
+
|
|
147
|
+
const manifestLines: string[] = ['Extracted files:']
|
|
148
|
+
manifestLines.push(`- content.md (markdown, ${markdown.length} chars)`)
|
|
149
|
+
manifestLines.push(...imageEntries)
|
|
150
|
+
|
|
151
|
+
this.logger.info('Markitdown processing complete', {
|
|
152
|
+
filePath,
|
|
153
|
+
mimeType,
|
|
154
|
+
contentLength: markdown.length,
|
|
155
|
+
imagesExtracted: imageEntries.length,
|
|
156
|
+
totalDurationMs: Date.now() - totalStart,
|
|
157
|
+
})
|
|
158
|
+
|
|
159
|
+
return Ok({
|
|
160
|
+
extractedContent: manifestLines.join('\n'),
|
|
161
|
+
derivedPaths,
|
|
162
|
+
})
|
|
163
|
+
}
|
|
114
164
|
|
|
165
|
+
private async runMarkitdown(
|
|
166
|
+
filePath: string,
|
|
167
|
+
mimeType: string,
|
|
168
|
+
outputPath: string,
|
|
169
|
+
): Promise<Result<string, Error>> {
|
|
115
170
|
const markitdownStart = Date.now()
|
|
116
171
|
try {
|
|
117
|
-
await this.
|
|
118
|
-
await this.exec('markitdown', [filePath, '-o', contentPathResult.value])
|
|
172
|
+
await this.exec('markitdown', [filePath, '-o', outputPath])
|
|
119
173
|
} catch (error) {
|
|
120
174
|
const message = error instanceof Error ? error.message : String(error)
|
|
121
175
|
this.logger.error(
|
|
@@ -129,10 +183,12 @@ export class MarkitdownPreprocessor implements Preprocessor {
|
|
|
129
183
|
return Err(new Error(`markitdown failed: ${message}`))
|
|
130
184
|
}
|
|
131
185
|
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
186
|
+
let markdown = ''
|
|
187
|
+
try {
|
|
188
|
+
markdown = await this.fs.readFile(outputPath, 'utf-8')
|
|
189
|
+
} catch {
|
|
190
|
+
// Output missing or unreadable — markitdown completed but produced nothing usable; fall back to empty markdown.
|
|
191
|
+
}
|
|
136
192
|
|
|
137
193
|
this.logger.info('Markitdown conversion complete', {
|
|
138
194
|
filePath,
|
|
@@ -141,41 +197,7 @@ export class MarkitdownPreprocessor implements Preprocessor {
|
|
|
141
197
|
contentLength: markdown.length,
|
|
142
198
|
})
|
|
143
199
|
|
|
144
|
-
|
|
145
|
-
const imagePhaseStart = Date.now()
|
|
146
|
-
if (PANDOC_EXTRACT_MIMES.has(mimeType)) {
|
|
147
|
-
const images = await this.extractImagesWithPandoc(filePath, mimeType, ctx)
|
|
148
|
-
for (const img of images) {
|
|
149
|
-
derivedPaths.push(img.relativePath)
|
|
150
|
-
imageEntries.push(`- ${img.relativePath} — ${img.description}`)
|
|
151
|
-
}
|
|
152
|
-
} else if (PDFIMAGES_MIMES.has(mimeType)) {
|
|
153
|
-
const images = await this.extractImagesWithPdfimages(filePath, ctx)
|
|
154
|
-
for (const img of images) {
|
|
155
|
-
derivedPaths.push(img.relativePath)
|
|
156
|
-
imageEntries.push(`- ${img.relativePath} — ${img.description}`)
|
|
157
|
-
}
|
|
158
|
-
}
|
|
159
|
-
const imagePhaseDurationMs = Date.now() - imagePhaseStart
|
|
160
|
-
|
|
161
|
-
// 3. Build manifest
|
|
162
|
-
const manifestLines: string[] = ['Extracted files:']
|
|
163
|
-
manifestLines.push(`- content.md (markdown, ${markdown.length} chars)`)
|
|
164
|
-
manifestLines.push(...imageEntries)
|
|
165
|
-
|
|
166
|
-
this.logger.info('Markitdown processing complete', {
|
|
167
|
-
filePath,
|
|
168
|
-
mimeType,
|
|
169
|
-
contentLength: markdown.length,
|
|
170
|
-
imagesExtracted: imageEntries.length,
|
|
171
|
-
imagePhaseDurationMs,
|
|
172
|
-
totalDurationMs: Date.now() - totalStart,
|
|
173
|
-
})
|
|
174
|
-
|
|
175
|
-
return Ok({
|
|
176
|
-
extractedContent: manifestLines.join('\n'),
|
|
177
|
-
derivedPaths,
|
|
178
|
-
})
|
|
200
|
+
return Ok(markdown)
|
|
179
201
|
}
|
|
180
202
|
|
|
181
203
|
private async extractImagesWithPandoc(
|
|
@@ -215,7 +237,7 @@ export class MarkitdownPreprocessor implements Preprocessor {
|
|
|
215
237
|
}
|
|
216
238
|
|
|
217
239
|
const classifyStart = Date.now()
|
|
218
|
-
const images = await classifyExtractedImages(mediaStore, 'media', ctx, this.registry, this.logger)
|
|
240
|
+
const images = await classifyExtractedImages(mediaStore, 'media', ctx, this.registry, this.logger, this.fs, this.processRunner)
|
|
219
241
|
this.logger.info('Image classification complete', {
|
|
220
242
|
source: 'pandoc',
|
|
221
243
|
count: images.length,
|
|
@@ -224,56 +246,18 @@ export class MarkitdownPreprocessor implements Preprocessor {
|
|
|
224
246
|
})
|
|
225
247
|
return images
|
|
226
248
|
}
|
|
227
|
-
|
|
228
|
-
private async extractImagesWithPdfimages(
|
|
229
|
-
filePath: string,
|
|
230
|
-
ctx: PreprocessorContext,
|
|
231
|
-
): Promise<Array<{ relativePath: string; description: string }>> {
|
|
232
|
-
const imageStore = ctx.files.scoped('images')
|
|
233
|
-
const imagesDirResult = imageStore.realPath('')
|
|
234
|
-
if (!imagesDirResult.ok) return []
|
|
235
|
-
|
|
236
|
-
const pdfimagesStart = Date.now()
|
|
237
|
-
let extractSucceeded = true
|
|
238
|
-
try {
|
|
239
|
-
await this.fs.mkdir(imagesDirResult.value, { recursive: true })
|
|
240
|
-
await this.exec(
|
|
241
|
-
'pdfimages',
|
|
242
|
-
['-png', filePath, `${imagesDirResult.value}/img`],
|
|
243
|
-
IMAGE_EXTRACT_TIMEOUT_MS,
|
|
244
|
-
)
|
|
245
|
-
} catch (error) {
|
|
246
|
-
extractSucceeded = false
|
|
247
|
-
this.logger.warn('pdfimages failed (will classify any partial output)', {
|
|
248
|
-
filePath,
|
|
249
|
-
durationMs: Date.now() - pdfimagesStart,
|
|
250
|
-
error: error instanceof Error ? error.message : String(error),
|
|
251
|
-
})
|
|
252
|
-
}
|
|
253
|
-
if (extractSucceeded) {
|
|
254
|
-
this.logger.info('pdfimages complete', {
|
|
255
|
-
filePath,
|
|
256
|
-
durationMs: Date.now() - pdfimagesStart,
|
|
257
|
-
})
|
|
258
|
-
}
|
|
259
|
-
|
|
260
|
-
const classifyStart = Date.now()
|
|
261
|
-
const images = await classifyExtractedImages(imageStore, 'images', ctx, this.registry, this.logger)
|
|
262
|
-
this.logger.info('Image classification complete', {
|
|
263
|
-
source: 'pdfimages',
|
|
264
|
-
count: images.length,
|
|
265
|
-
partial: !extractSucceeded,
|
|
266
|
-
durationMs: Date.now() - classifyStart,
|
|
267
|
-
})
|
|
268
|
-
return images
|
|
269
|
-
}
|
|
270
249
|
}
|
|
271
250
|
|
|
272
251
|
// ============================================================================
|
|
273
252
|
// Shared image helpers
|
|
274
253
|
// ============================================================================
|
|
275
254
|
|
|
276
|
-
|
|
255
|
+
/**
|
|
256
|
+
* Recognized by Anthropic vision API. Other pdfimages outputs (pbm, ppm,
|
|
257
|
+
* jb2e, jp2) are ignored — they'd require local conversion before being
|
|
258
|
+
* useful for classification.
|
|
259
|
+
*/
|
|
260
|
+
export const IMAGE_EXT_RE = /\.(png|jpe?g|gif|webp|tiff?|bmp|svg)$/i
|
|
277
261
|
|
|
278
262
|
const IMAGE_MIME_MAP: Record<string, string> = {
|
|
279
263
|
png: 'image/png',
|
|
@@ -292,22 +276,103 @@ export function guessImageMime(filename: string): string {
|
|
|
292
276
|
return IMAGE_MIME_MAP[ext ?? ''] ?? 'image/png'
|
|
293
277
|
}
|
|
294
278
|
|
|
279
|
+
/**
|
|
280
|
+
* Reject images that are unlikely to carry useful visual information.
|
|
281
|
+
*
|
|
282
|
+
* `bytesPerPixel` filters out alpha masks, sparse overlays, and essentially-
|
|
283
|
+
* empty pages — brand PDFs typically emit a real photo (~1 B/px) plus a
|
|
284
|
+
* matching transparency/overlay layer at the same dimensions but a fraction
|
|
285
|
+
* of a percent of the size (<0.005 B/px).
|
|
286
|
+
*
|
|
287
|
+
* The minimum pixel count protects against tiny icons whose density alone
|
|
288
|
+
* doesn't disqualify them.
|
|
289
|
+
*/
|
|
290
|
+
export function shouldClassifyImage(meta: { width: number; height: number; sizeBytes: number }): boolean {
|
|
291
|
+
const pixels = meta.width * meta.height
|
|
292
|
+
if (pixels < MIN_IMAGE_PIXELS) return false
|
|
293
|
+
const density = meta.sizeBytes / pixels
|
|
294
|
+
return density >= MIN_IMAGE_DENSITY_BYTES_PER_PX
|
|
295
|
+
}
|
|
296
|
+
|
|
297
|
+
/**
|
|
298
|
+
* Read image dimensions via vipsheader. Returns null when the tool isn't
|
|
299
|
+
* available or output is unparseable — caller should treat that as
|
|
300
|
+
* "include without filtering".
|
|
301
|
+
*/
|
|
302
|
+
export async function getImageDimensions(
|
|
303
|
+
filePath: string,
|
|
304
|
+
processRunner: ProcessRunner,
|
|
305
|
+
): Promise<{ width: number; height: number } | null> {
|
|
306
|
+
try {
|
|
307
|
+
const { stdout } = await processRunner.execFile(
|
|
308
|
+
'vipsheader',
|
|
309
|
+
['-f', 'width', '-f', 'height', filePath],
|
|
310
|
+
{ timeout: 10_000 },
|
|
311
|
+
)
|
|
312
|
+
const lines = stdout.trim().split('\n')
|
|
313
|
+
if (lines.length < 2) return null
|
|
314
|
+
const width = parseInt(lines[0], 10)
|
|
315
|
+
const height = parseInt(lines[1], 10)
|
|
316
|
+
if (!Number.isFinite(width) || !Number.isFinite(height)) return null
|
|
317
|
+
return { width, height }
|
|
318
|
+
} catch {
|
|
319
|
+
return null
|
|
320
|
+
}
|
|
321
|
+
}
|
|
322
|
+
|
|
295
323
|
export async function classifyExtractedImages(
|
|
296
324
|
imageStore: FileStore,
|
|
297
325
|
relativePrefix: string,
|
|
298
326
|
ctx: PreprocessorContext,
|
|
299
327
|
registry: PreprocessorRegistry,
|
|
300
328
|
logger: Logger,
|
|
329
|
+
fs: FileSystem,
|
|
330
|
+
processRunner: ProcessRunner,
|
|
301
331
|
): Promise<Array<{ relativePath: string; description: string }>> {
|
|
302
332
|
const listResult = await imageStore.list('', { maxDepth: 3 })
|
|
303
333
|
if (!listResult.ok) return []
|
|
304
334
|
|
|
305
|
-
const
|
|
306
|
-
|
|
307
|
-
|
|
335
|
+
const candidates = listResult.value.filter(e => e.type === 'file' && IMAGE_EXT_RE.test(e.name))
|
|
336
|
+
|
|
337
|
+
// Stat + density filter, then keep the top MAX_IMAGES by file size.
|
|
338
|
+
const inspected = await mapWithConcurrency(candidates, 8, async (entry) => {
|
|
339
|
+
const pathResult = imageStore.realPath(entry.name)
|
|
340
|
+
if (!pathResult.ok) return null
|
|
341
|
+
|
|
342
|
+
let sizeBytes = 0
|
|
343
|
+
try {
|
|
344
|
+
sizeBytes = (await fs.stat(pathResult.value)).size
|
|
345
|
+
} catch {
|
|
346
|
+
return null
|
|
347
|
+
}
|
|
348
|
+
|
|
349
|
+
const dims = await getImageDimensions(pathResult.value, processRunner)
|
|
350
|
+
if (!dims) {
|
|
351
|
+
// Unknown dims — include without density filtering (no warning is logged here); better to classify than silently drop.
|
|
352
|
+
return { name: entry.name, sizeBytes, width: 0, height: 0, kept: true }
|
|
353
|
+
}
|
|
354
|
+
|
|
355
|
+
const kept = shouldClassifyImage({ width: dims.width, height: dims.height, sizeBytes })
|
|
356
|
+
return { name: entry.name, sizeBytes, width: dims.width, height: dims.height, kept }
|
|
357
|
+
})
|
|
358
|
+
|
|
359
|
+
const filtered = inspected
|
|
360
|
+
.filter((r): r is NonNullable<typeof r> => r !== null && r.kept)
|
|
361
|
+
.sort((a, b) => b.sizeBytes - a.sizeBytes)
|
|
308
362
|
.slice(0, MAX_IMAGES)
|
|
309
363
|
|
|
310
|
-
const
|
|
364
|
+
const droppedCount = inspected.filter(r => r !== null && !r.kept).length
|
|
365
|
+
if (droppedCount > 0 || inspected.length > MAX_IMAGES) {
|
|
366
|
+
logger.info('Image filter applied', {
|
|
367
|
+
source: relativePrefix,
|
|
368
|
+
candidates: candidates.length,
|
|
369
|
+
passedDensityFilter: candidates.length - droppedCount,
|
|
370
|
+
selected: filtered.length,
|
|
371
|
+
droppedByDensity: droppedCount,
|
|
372
|
+
})
|
|
373
|
+
}
|
|
374
|
+
|
|
375
|
+
const settled = await mapWithConcurrency(filtered, IMAGE_CLASSIFY_CONCURRENCY, async (imgFile) => {
|
|
311
376
|
const imgPathResult = imageStore.realPath(imgFile.name)
|
|
312
377
|
if (!imgPathResult.ok) return null
|
|
313
378
|
|