@heripo/pdf-parser 0.1.6 → 0.1.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.cts CHANGED
@@ -1,6 +1,8 @@
1
1
  import { LoggerMethods } from '@heripo/logger';
2
- import { ConversionOptions, VlmModelLocal } from 'docling-sdk';
3
- export { DEFAULT_VLM_MODEL, VLM_MODELS, VlmModelPreset, resolveVlmModel } from './vlm-models.cjs';
2
+ import { OcrStrategy, TokenUsageReport } from '@heripo/model';
3
+ import { LanguageModel } from 'ai';
4
+ import { ConversionOptions } from 'docling-sdk';
5
+ import { LLMTokenUsageAggregator } from '@heripo/shared';
4
6
 
5
7
  /**
6
8
  * Callback function invoked after PDF conversion completes
@@ -8,19 +10,37 @@ export { DEFAULT_VLM_MODEL, VLM_MODELS, VlmModelPreset, resolveVlmModel } from '
8
10
  */
9
11
  type ConversionCompleteCallback = (outputPath: string) => Promise<void> | void;
10
12
  /**
11
- * Pipeline type for PDF conversion
12
- * - 'standard': Use OCR-based pipeline (default, uses ocrmac)
13
- * - 'vlm': Use Vision Language Model pipeline for better KCJ/complex layout handling
13
+ * Extended options for PDF conversion.
14
14
  */
15
- type PipelineType = 'standard' | 'vlm';
16
- /**
17
- * Extended options for PDF conversion including pipeline selection
18
- */
19
- type PDFConvertOptions = Omit<ConversionOptions, 'to_formats' | 'image_export_mode' | 'ocr_engine' | 'accelerator_options' | 'ocr_options' | 'generate_picture_images' | 'images_scale' | 'force_ocr' | 'pipeline' | 'vlm_pipeline_model_local'> & {
15
+ type PDFConvertOptions = Omit<ConversionOptions, 'to_formats' | 'image_export_mode' | 'ocr_engine' | 'accelerator_options' | 'ocr_options' | 'generate_picture_images' | 'images_scale' | 'force_ocr' | 'pipeline' | 'vlm_pipeline_model_local' | 'vlm_pipeline_model_api'> & {
20
16
  num_threads?: number;
21
- pipeline?: PipelineType;
22
- vlm_model?: string | VlmModelLocal;
17
+ /**
18
+ * Force pre-conversion to image-based PDF before processing.
19
+ * Requires ImageMagick and Ghostscript.
20
+ */
21
+ forceImagePdf?: boolean;
22
+ /** Vision model for OCR strategy sampling (enables new strategy-based flow) */
23
+ strategySamplerModel?: LanguageModel;
24
+ /** Vision model for VLM page processing (required when strategy selects VLM) */
25
+ vlmProcessorModel?: LanguageModel;
26
+ /** Concurrency for VLM page processing (default: 1) */
27
+ vlmConcurrency?: number;
28
+ /** Skip sampling and default to ocrmac */
29
+ skipSampling?: boolean;
30
+ /** Force a specific OCR method, bypassing sampling */
31
+ forcedMethod?: 'ocrmac' | 'vlm';
32
+ /** Token usage aggregator for tracking across sampling and VLM processing */
33
+ aggregator?: LLMTokenUsageAggregator;
34
+ /** Callback fired after each batch of VLM pages completes, with cumulative token usage */
35
+ onTokenUsage?: (report: TokenUsageReport) => void;
23
36
  };
37
+ /** Result of strategy-based conversion */
38
+ interface ConvertWithStrategyResult {
39
+ /** The OCR strategy that was determined */
40
+ strategy: OcrStrategy;
41
+ /** Token usage report from sampling and/or VLM processing (null when no LLM usage occurs) */
42
+ tokenUsageReport: TokenUsageReport | null;
43
+ }
24
44
 
25
45
  type Options = {
26
46
  logger: LoggerMethods;
@@ -120,7 +140,15 @@ declare class PDFParser {
120
140
  */
121
141
  private restartServer;
122
142
  private waitForServerReady;
123
- parse(url: string, reportId: string, onComplete: ConversionCompleteCallback, cleanupAfterCallback: boolean, options: PDFConvertOptions, abortSignal?: AbortSignal): Promise<void>;
143
+ parse(url: string, reportId: string, onComplete: ConversionCompleteCallback, cleanupAfterCallback: boolean, options: PDFConvertOptions, abortSignal?: AbortSignal): Promise<TokenUsageReport | null>;
144
+ /**
145
+ * Parse a PDF using OCR strategy sampling to decide between ocrmac and VLM.
146
+ * Delegates to PDFConverter.convertWithStrategy() and returns the token usage report.
147
+ *
148
+ * Server recovery (restart on ECONNREFUSED) is preserved because
149
+ * the ocrmac path still uses the Docling server.
150
+ */
151
+ private parseWithStrategy;
124
152
  /**
125
153
  * Dispose the parser instance.
126
154
  * - Sets the internal client to null
@@ -140,4 +168,112 @@ declare class ImagePdfFallbackError extends Error {
140
168
  constructor(originalError: Error, fallbackError: Error);
141
169
  }
142
170
 
143
- export { type ConversionCompleteCallback, ImagePdfFallbackError, type PDFConvertOptions, PDFParser, type PipelineType };
171
+ /**
172
+ * Intermediate format produced by VLM page-by-page processing.
173
+ * Intentionally kept simple so VLM prompts stay short and accurate.
174
+ * The DoclingDocumentAssembler converts these into a full DoclingDocument.
175
+ */
176
+ /** Allowed element types matching DoclingDocument text labels */
177
+ type VlmElementType = 'text' | 'section_header' | 'caption' | 'footnote' | 'page_header' | 'page_footer' | 'list_item' | 'picture' | 'table';
178
+ /**
179
+ * Normalized bounding box with coordinates in the range 0.0 to 1.0,
180
+ * using top-left origin (standard image coordinates).
181
+ */
182
+ interface VlmBBox {
183
+ /** Left edge (0.0 = left boundary) */
184
+ l: number;
185
+ /** Top edge (0.0 = top boundary) */
186
+ t: number;
187
+ /** Right edge (1.0 = right boundary) */
188
+ r: number;
189
+ /** Bottom edge (1.0 = bottom boundary) */
190
+ b: number;
191
+ }
192
+ /** A single content element detected on a page by VLM */
193
+ interface VlmPageElement {
194
+ /** Element type */
195
+ type: VlmElementType;
196
+ /** Text content (empty string for picture elements) */
197
+ content: string;
198
+ /** Heading depth for section_header (1 = top-level) */
199
+ level?: number;
200
+ /** List marker for list_item (e.g., "1)", "a.", "\u2022") */
201
+ marker?: string;
202
+ /** Reading order within the page (top-to-bottom, left-to-right) */
203
+ order: number;
204
+ /**
205
+ * Bounding box in normalized coordinates (0.0-1.0, top-left origin).
206
+ * - Text elements: optional (included in prov if present)
207
+ * - Picture elements: **required** (used for image cropping)
208
+ */
209
+ bbox?: VlmBBox;
210
+ }
211
+ /** Types of quality issues detected in VLM responses */
212
+ type VlmQualityIssueType = 'placeholder_text' | 'script_anomaly' | 'meta_description' | 'repetitive_pattern';
213
+ /** Quality metadata for a processed page */
214
+ interface VlmPageQuality {
215
+ /** Whether the page passed quality validation (after possible retry) */
216
+ isValid: boolean;
217
+ /** Whether a quality retry was performed for this page */
218
+ retried: boolean;
219
+ /** Issue types detected (may persist even after retry) */
220
+ issueTypes: VlmQualityIssueType[];
221
+ }
222
+
223
+ /** A single quality issue found during validation */
224
+ interface VlmQualityIssue {
225
+ /** Type of issue detected */
226
+ type: VlmQualityIssueType;
227
+ /** Human-readable description of the issue */
228
+ message: string;
229
+ /** Element reading-order indices that triggered the issue */
230
+ affectedElements: number[];
231
+ }
232
+ /** Result of VLM response quality validation */
233
+ interface VlmValidationResult {
234
+ /** Whether the response passes quality validation */
235
+ isValid: boolean;
236
+ /** List of quality issues found (empty if valid) */
237
+ issues: VlmQualityIssue[];
238
+ }
239
+ /**
240
+ * Lightweight, stateless validator for VLM page extraction responses.
241
+ *
242
+ * Detects four categories of hallucination without any additional VLM calls:
243
+ * 1. Placeholder text (Lorem ipsum and variants)
244
+ * 2. Script anomaly (expected Korean but got Latin-only text)
245
+ * 3. Meta description (VLM described the image instead of transcribing text)
246
+ * 4. Repetitive pattern (repeated character patterns like `: : : : :`)
247
+ */
248
+ declare class VlmResponseValidator {
249
+ /**
250
+ * Validate VLM page result quality.
251
+ *
252
+ * @param elements - Extracted page elements to validate
253
+ * @param documentLanguages - BCP 47 language tags (e.g., ['ko-KR', 'en-US'])
254
+ * @returns Validation result with issues list
255
+ */
256
+ static validate(elements: VlmPageElement[], documentLanguages?: string[]): VlmValidationResult;
257
+ /**
258
+ * Detect known placeholder / filler text in elements.
259
+ */
260
+ private static detectPlaceholderText;
261
+ /**
262
+ * Detect script anomaly: expected Korean content but found mostly Latin text.
263
+ * Counts Hangul + CJK characters and flags if the ratio is below threshold.
264
+ */
265
+ private static detectScriptAnomaly;
266
+ /**
267
+ * Detect meta description: VLM described the image/resolution instead
268
+ * of transcribing actual text content.
269
+ */
270
+ private static detectMetaDescription;
271
+ /**
272
+ * Detect repetitive character patterns (e.g., `: : : : :` or `= = = = =`).
273
+ * Flags when the same character repeats with spaces 5+ times and the
274
+ * repetitive portion exceeds 30% of total content.
275
+ */
276
+ private static detectRepetitivePattern;
277
+ }
278
+
279
+ export { type ConversionCompleteCallback, type ConvertWithStrategyResult, ImagePdfFallbackError, type PDFConvertOptions, PDFParser, type VlmPageQuality, type VlmQualityIssue, type VlmQualityIssueType, VlmResponseValidator, type VlmValidationResult };
package/dist/index.d.ts CHANGED
@@ -1,6 +1,8 @@
1
1
  import { LoggerMethods } from '@heripo/logger';
2
- import { ConversionOptions, VlmModelLocal } from 'docling-sdk';
3
- export { DEFAULT_VLM_MODEL, VLM_MODELS, VlmModelPreset, resolveVlmModel } from './vlm-models.js';
2
+ import { OcrStrategy, TokenUsageReport } from '@heripo/model';
3
+ import { LanguageModel } from 'ai';
4
+ import { ConversionOptions } from 'docling-sdk';
5
+ import { LLMTokenUsageAggregator } from '@heripo/shared';
4
6
 
5
7
  /**
6
8
  * Callback function invoked after PDF conversion completes
@@ -8,19 +10,37 @@ export { DEFAULT_VLM_MODEL, VLM_MODELS, VlmModelPreset, resolveVlmModel } from '
8
10
  */
9
11
  type ConversionCompleteCallback = (outputPath: string) => Promise<void> | void;
10
12
  /**
11
- * Pipeline type for PDF conversion
12
- * - 'standard': Use OCR-based pipeline (default, uses ocrmac)
13
- * - 'vlm': Use Vision Language Model pipeline for better KCJ/complex layout handling
13
+ * Extended options for PDF conversion.
14
14
  */
15
- type PipelineType = 'standard' | 'vlm';
16
- /**
17
- * Extended options for PDF conversion including pipeline selection
18
- */
19
- type PDFConvertOptions = Omit<ConversionOptions, 'to_formats' | 'image_export_mode' | 'ocr_engine' | 'accelerator_options' | 'ocr_options' | 'generate_picture_images' | 'images_scale' | 'force_ocr' | 'pipeline' | 'vlm_pipeline_model_local'> & {
15
+ type PDFConvertOptions = Omit<ConversionOptions, 'to_formats' | 'image_export_mode' | 'ocr_engine' | 'accelerator_options' | 'ocr_options' | 'generate_picture_images' | 'images_scale' | 'force_ocr' | 'pipeline' | 'vlm_pipeline_model_local' | 'vlm_pipeline_model_api'> & {
20
16
  num_threads?: number;
21
- pipeline?: PipelineType;
22
- vlm_model?: string | VlmModelLocal;
17
+ /**
18
+ * Force pre-conversion to image-based PDF before processing.
19
+ * Requires ImageMagick and Ghostscript.
20
+ */
21
+ forceImagePdf?: boolean;
22
+ /** Vision model for OCR strategy sampling (enables new strategy-based flow) */
23
+ strategySamplerModel?: LanguageModel;
24
+ /** Vision model for VLM page processing (required when strategy selects VLM) */
25
+ vlmProcessorModel?: LanguageModel;
26
+ /** Concurrency for VLM page processing (default: 1) */
27
+ vlmConcurrency?: number;
28
+ /** Skip sampling and default to ocrmac */
29
+ skipSampling?: boolean;
30
+ /** Force a specific OCR method, bypassing sampling */
31
+ forcedMethod?: 'ocrmac' | 'vlm';
32
+ /** Token usage aggregator for tracking across sampling and VLM processing */
33
+ aggregator?: LLMTokenUsageAggregator;
34
+ /** Callback fired after each batch of VLM pages completes, with cumulative token usage */
35
+ onTokenUsage?: (report: TokenUsageReport) => void;
23
36
  };
37
+ /** Result of strategy-based conversion */
38
+ interface ConvertWithStrategyResult {
39
+ /** The OCR strategy that was determined */
40
+ strategy: OcrStrategy;
41
+ /** Token usage report from sampling and/or VLM processing (null when no LLM usage occurs) */
42
+ tokenUsageReport: TokenUsageReport | null;
43
+ }
24
44
 
25
45
  type Options = {
26
46
  logger: LoggerMethods;
@@ -120,7 +140,15 @@ declare class PDFParser {
120
140
  */
121
141
  private restartServer;
122
142
  private waitForServerReady;
123
- parse(url: string, reportId: string, onComplete: ConversionCompleteCallback, cleanupAfterCallback: boolean, options: PDFConvertOptions, abortSignal?: AbortSignal): Promise<void>;
143
+ parse(url: string, reportId: string, onComplete: ConversionCompleteCallback, cleanupAfterCallback: boolean, options: PDFConvertOptions, abortSignal?: AbortSignal): Promise<TokenUsageReport | null>;
144
+ /**
145
+ * Parse a PDF using OCR strategy sampling to decide between ocrmac and VLM.
146
+ * Delegates to PDFConverter.convertWithStrategy() and returns the token usage report.
147
+ *
148
+ * Server recovery (restart on ECONNREFUSED) is preserved because
149
+ * the ocrmac path still uses the Docling server.
150
+ */
151
+ private parseWithStrategy;
124
152
  /**
125
153
  * Dispose the parser instance.
126
154
  * - Sets the internal client to null
@@ -140,4 +168,112 @@ declare class ImagePdfFallbackError extends Error {
140
168
  constructor(originalError: Error, fallbackError: Error);
141
169
  }
142
170
 
143
- export { type ConversionCompleteCallback, ImagePdfFallbackError, type PDFConvertOptions, PDFParser, type PipelineType };
171
+ /**
172
+ * Intermediate format produced by VLM page-by-page processing.
173
+ * Intentionally kept simple so VLM prompts stay short and accurate.
174
+ * The DoclingDocumentAssembler converts these into a full DoclingDocument.
175
+ */
176
+ /** Allowed element types matching DoclingDocument text labels */
177
+ type VlmElementType = 'text' | 'section_header' | 'caption' | 'footnote' | 'page_header' | 'page_footer' | 'list_item' | 'picture' | 'table';
178
+ /**
179
+ * Normalized bounding box with coordinates in the range 0.0 to 1.0,
180
+ * using top-left origin (standard image coordinates).
181
+ */
182
+ interface VlmBBox {
183
+ /** Left edge (0.0 = left boundary) */
184
+ l: number;
185
+ /** Top edge (0.0 = top boundary) */
186
+ t: number;
187
+ /** Right edge (1.0 = right boundary) */
188
+ r: number;
189
+ /** Bottom edge (1.0 = bottom boundary) */
190
+ b: number;
191
+ }
192
+ /** A single content element detected on a page by VLM */
193
+ interface VlmPageElement {
194
+ /** Element type */
195
+ type: VlmElementType;
196
+ /** Text content (empty string for picture elements) */
197
+ content: string;
198
+ /** Heading depth for section_header (1 = top-level) */
199
+ level?: number;
200
+ /** List marker for list_item (e.g., "1)", "a.", "\u2022") */
201
+ marker?: string;
202
+ /** Reading order within the page (top-to-bottom, left-to-right) */
203
+ order: number;
204
+ /**
205
+ * Bounding box in normalized coordinates (0.0-1.0, top-left origin).
206
+ * - Text elements: optional (included in prov if present)
207
+ * - Picture elements: **required** (used for image cropping)
208
+ */
209
+ bbox?: VlmBBox;
210
+ }
211
+ /** Types of quality issues detected in VLM responses */
212
+ type VlmQualityIssueType = 'placeholder_text' | 'script_anomaly' | 'meta_description' | 'repetitive_pattern';
213
+ /** Quality metadata for a processed page */
214
+ interface VlmPageQuality {
215
+ /** Whether the page passed quality validation (after possible retry) */
216
+ isValid: boolean;
217
+ /** Whether a quality retry was performed for this page */
218
+ retried: boolean;
219
+ /** Issue types detected (may persist even after retry) */
220
+ issueTypes: VlmQualityIssueType[];
221
+ }
222
+
223
+ /** A single quality issue found during validation */
224
+ interface VlmQualityIssue {
225
+ /** Type of issue detected */
226
+ type: VlmQualityIssueType;
227
+ /** Human-readable description of the issue */
228
+ message: string;
229
+ /** Element reading-order indices that triggered the issue */
230
+ affectedElements: number[];
231
+ }
232
+ /** Result of VLM response quality validation */
233
+ interface VlmValidationResult {
234
+ /** Whether the response passes quality validation */
235
+ isValid: boolean;
236
+ /** List of quality issues found (empty if valid) */
237
+ issues: VlmQualityIssue[];
238
+ }
239
+ /**
240
+ * Lightweight, stateless validator for VLM page extraction responses.
241
+ *
242
+ * Detects four categories of hallucination without any additional VLM calls:
243
+ * 1. Placeholder text (Lorem ipsum and variants)
244
+ * 2. Script anomaly (expected Korean but got Latin-only text)
245
+ * 3. Meta description (VLM described the image instead of transcribing text)
246
+ * 4. Repetitive pattern (repeated character patterns like `: : : : :`)
247
+ */
248
+ declare class VlmResponseValidator {
249
+ /**
250
+ * Validate VLM page result quality.
251
+ *
252
+ * @param elements - Extracted page elements to validate
253
+ * @param documentLanguages - BCP 47 language tags (e.g., ['ko-KR', 'en-US'])
254
+ * @returns Validation result with issues list
255
+ */
256
+ static validate(elements: VlmPageElement[], documentLanguages?: string[]): VlmValidationResult;
257
+ /**
258
+ * Detect known placeholder / filler text in elements.
259
+ */
260
+ private static detectPlaceholderText;
261
+ /**
262
+ * Detect script anomaly: expected Korean content but found mostly Latin text.
263
+ * Counts Hangul + CJK characters and flags if the ratio is below threshold.
264
+ */
265
+ private static detectScriptAnomaly;
266
+ /**
267
+ * Detect meta description: VLM described the image/resolution instead
268
+ * of transcribing actual text content.
269
+ */
270
+ private static detectMetaDescription;
271
+ /**
272
+ * Detect repetitive character patterns (e.g., `: : : : :` or `= = = = =`).
273
+ * Flags when the same character repeats with spaces 5+ times and the
274
+ * repetitive portion exceeds 30% of total content.
275
+ */
276
+ private static detectRepetitivePattern;
277
+ }
278
+
279
+ export { type ConversionCompleteCallback, type ConvertWithStrategyResult, ImagePdfFallbackError, type PDFConvertOptions, PDFParser, type VlmPageQuality, type VlmQualityIssue, type VlmQualityIssueType, VlmResponseValidator, type VlmValidationResult };