@llamaindex/liteparse-grpc 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +154 -0
- package/dist/client.js +1674 -0
- package/dist/protogen/parser.js +1542 -0
- package/dist/server.js +6224 -0
- package/dist/types/client.d.ts +3 -0
- package/dist/types/protogen/parser.d.ts +267 -0
- package/dist/types/server.d.ts +3 -0
- package/package.json +84 -0
- package/proto/parser.proto +213 -0
|
@@ -0,0 +1,267 @@
|
|
|
1
|
+
import { BinaryReader, BinaryWriter } from "@bufbuild/protobuf/wire";
|
|
2
|
+
import { type CallOptions, type ChannelCredentials, Client, type ClientOptions, type ClientUnaryCall, type handleUnaryCall, type Metadata, type ServiceError, type UntypedServiceImplementation } from "@grpc/grpc-js";
|
|
3
|
+
export declare const protobufPackage = "parser";
|
|
4
|
+
/**
|
|
5
|
+
* Image handling for the markdown emitter.
|
|
6
|
+
*
|
|
7
|
+
* * OFF — strip image references entirely.
|
|
8
|
+
* * PLACEHOLDER (default) — emit markdown image references (`![alt](src)`) in
|
|
9
|
+
* reading order at each image's y position, but do not extract or
|
|
10
|
+
* return pixel bytes. Keeps response size small while letting the LLM see
|
|
11
|
+
* where figures live in the document.
|
|
12
|
+
* * EMBED — same references, plus bytes returned via `ParseResult.images`.
|
|
13
|
+
* Opt-in because pixel bytes can dwarf the text payload on image-heavy
|
|
14
|
+
* PDFs. (Bytes plumbing lands in stage 11b — current variant is parsed but
|
|
15
|
+
* behaves like PLACEHOLDER until then.)
|
|
16
|
+
*/
|
|
17
|
+
export declare enum ImageMode {
|
|
18
|
+
IMAGE_MODE_UNSPECIFIED = 0,
|
|
19
|
+
IMAGE_MODE_OFF = 1,
|
|
20
|
+
IMAGE_MODE_PLACEHOLDER = 2,
|
|
21
|
+
IMAGE_MODE_EMBED = 3,
|
|
22
|
+
UNRECOGNIZED = -1
|
|
23
|
+
}
|
|
24
|
+
export declare function imageModeFromJSON(object: any): ImageMode;
|
|
25
|
+
export declare function imageModeToJSON(object: ImageMode): string;
|
|
26
|
+
/** Supported output formats. */
|
|
27
|
+
export declare enum OutputFormat {
|
|
28
|
+
OUTPUT_FORMAT_UNSPECIFIED = 0,
|
|
29
|
+
OUTPUT_FORMAT_JSON = 1,
|
|
30
|
+
OUTPUT_FORMAT_TEXT = 2,
|
|
31
|
+
OUTPUT_FORMAT_MARKDOWN = 3,
|
|
32
|
+
UNRECOGNIZED = -1
|
|
33
|
+
}
|
|
34
|
+
export declare function outputFormatFromJSON(object: any): OutputFormat;
|
|
35
|
+
export declare function outputFormatToJSON(object: OutputFormat): string;
|
|
36
|
+
/** Configuration for LiteParse document parsing. */
|
|
37
|
+
export interface LiteParseConfig {
|
|
38
|
+
/** OCR language code (Tesseract format: "eng", "fra", "deu", etc.). */
|
|
39
|
+
ocrLanguage: string;
|
|
40
|
+
/** Whether OCR is enabled. When true, runs on text-sparse pages and embedded images. */
|
|
41
|
+
ocrEnabled: boolean;
|
|
42
|
+
/** HTTP OCR server URL (uses Tesseract if not provided) */
|
|
43
|
+
ocrServerUrl?: string | undefined;
|
|
44
|
+
/**
|
|
45
|
+
* Extra HTTP headers sent with every request to `ocr_server_url`.
|
|
46
|
+
* Ignored when `ocr_server_url` is unset.
|
|
47
|
+
*/
|
|
48
|
+
ocrServerHeaders: HttpHeader[];
|
|
49
|
+
/** Path to tessdata directory. Falls back to TESSDATA_PREFIX env var if not set. */
|
|
50
|
+
tessdataPath?: string | undefined;
|
|
51
|
+
/** Maximum number of pages to parse. */
|
|
52
|
+
maxPages: number;
|
|
53
|
+
/** Specific pages to parse (e.g., "1-5,10,15-20"). Unset means all pages. */
|
|
54
|
+
targetPages?: string | undefined;
|
|
55
|
+
/** DPI for rendering pages (used for OCR and screenshots). */
|
|
56
|
+
dpi: number;
|
|
57
|
+
/** Output format. */
|
|
58
|
+
outputFormat: OutputFormat;
|
|
59
|
+
/** Keep very small text that would normally be filtered out. */
|
|
60
|
+
preserveVerySmallText: boolean;
|
|
61
|
+
/** Password for encrypted/protected documents. */
|
|
62
|
+
password?: string | undefined;
|
|
63
|
+
/** Suppress progress output. */
|
|
64
|
+
quiet: boolean;
|
|
65
|
+
/** Number of concurrent OCR workers. Defaults to (number of CPU cores - 1), minimum 1. */
|
|
66
|
+
numWorkers: number;
|
|
67
|
+
/**
|
|
68
|
+
* Controls how raster images are surfaced in markdown output. Has no
|
|
69
|
+
* effect on JSON / text outputs.
|
|
70
|
+
*/
|
|
71
|
+
imageMode: ImageMode;
|
|
72
|
+
/**
|
|
73
|
+
* Extract hyperlink annotations and render them as `[text](url)` in
|
|
74
|
+
* markdown output. Default on. Disable for benchmark parity with
|
|
75
|
+
* plain-text ground truth (the GT corpora never use link syntax).
|
|
76
|
+
*/
|
|
77
|
+
extractLinks: boolean;
|
|
78
|
+
/**
|
|
79
|
+
* Whether a systemic OCR failure (every OCR task failed *and* at least one
|
|
80
|
+
* was a text-sparse page whose primary text source was OCR) aborts the
|
|
81
|
+
* whole parse. Default `true`: surface the root cause instead of silently
|
|
82
|
+
* emitting blank pages. Set `false` to keep already-recovered native text
|
|
83
|
+
* and return partial results when OCR is unavailable — useful for callers
|
|
84
|
+
* that prefer a degraded document over a hard failure (e.g. when the host
|
|
85
|
+
* has its own OCR fallback or treats OCR as best-effort enrichment).
|
|
86
|
+
*/
|
|
87
|
+
ocrFailureFatal: boolean;
|
|
88
|
+
/**
|
|
89
|
+
* OCR request-hedging schedule (milliseconds) for the HTTP OCR engine.
|
|
90
|
+
* Empty (default) = no hedging. With multiple delays (e.g.
|
|
91
|
+
* `[0, 5000, 10000, 15000, 20000]`), each OCR attempt fires a duplicate
|
|
92
|
+
* request at every delay and takes the first to succeed — trading extra
|
|
93
|
+
* OCR-server load for lower tail latency on a slow/stuck pod. No effect on
|
|
94
|
+
* the Tesseract engine.
|
|
95
|
+
*/
|
|
96
|
+
ocrHedgeDelaysMs: number[];
|
|
97
|
+
/**
|
|
98
|
+
* Emit per-word sub-boxes on each `TextItem` (`TextItem.words`). Default
|
|
99
|
+
* `false`: a text item already carries its own box, and word boxes roughly
|
|
100
|
+
* double the text-item payload (size + napi marshalling), so they are only
|
|
101
|
+
* worth computing for callers doing word-level bbox attribution. When
|
|
102
|
+
* `false`, `TextItem.words` is always empty and the per-word tracking is
|
|
103
|
+
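* skipped entirely (zero allocation). NOTE(review): the `TextItem` message declared in this file has no `words` field, so it is unclear how word boxes are surfaced over gRPC — confirm.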
|
|
104
|
+
*/
|
|
105
|
+
emitWordBoxes: boolean;
|
|
106
|
+
}
|
|
107
|
+
/** A single HTTP header as a name/value pair. */
|
|
108
|
+
export interface HttpHeader {
|
|
109
|
+
name: string;
|
|
110
|
+
value: string;
|
|
111
|
+
}
|
|
112
|
+
/** Request to parse a file. */
|
|
113
|
+
export interface ParseRequest {
|
|
114
|
+
file: Buffer;
|
|
115
|
+
config?: LiteParseConfig | undefined;
|
|
116
|
+
}
|
|
117
|
+
/** Request to screenshot the pages of a file. */
|
|
118
|
+
export interface ScreenshotRequest {
|
|
119
|
+
file: Buffer;
|
|
120
|
+
config?: LiteParseConfig | undefined;
|
|
121
|
+
}
|
|
122
|
+
export interface TextItem {
|
|
123
|
+
text: string;
|
|
124
|
+
x: number;
|
|
125
|
+
y: number;
|
|
126
|
+
width: number;
|
|
127
|
+
height: number;
|
|
128
|
+
}
|
|
129
|
+
export interface ParsedPage {
|
|
130
|
+
pageNumber: number;
|
|
131
|
+
pageWidth: number;
|
|
132
|
+
pageHeight: number;
|
|
133
|
+
textItems: TextItem[];
|
|
134
|
+
}
|
|
135
|
+
/** Response with a parsed file */
|
|
136
|
+
export interface ParseResponse {
|
|
137
|
+
text: string;
|
|
138
|
+
pages: ParsedPage[];
|
|
139
|
+
}
|
|
140
|
+
export interface ScreenshotPage {
|
|
141
|
+
imageBytes: Buffer;
|
|
142
|
+
pageNumber: number;
|
|
143
|
+
height: number;
|
|
144
|
+
width: number;
|
|
145
|
+
mimeType: string;
|
|
146
|
+
}
|
|
147
|
+
/** Screenshot response for a PDF file */
|
|
148
|
+
export interface ScreenshotResponse {
|
|
149
|
+
screenshots: ScreenshotPage[];
|
|
150
|
+
}
|
|
151
|
+
/** Request to estimate the complexity of a file. */
|
|
152
|
+
export interface IsComplexRequest {
|
|
153
|
+
file: Buffer;
|
|
154
|
+
config?: LiteParseConfig | undefined;
|
|
155
|
+
}
|
|
156
|
+
export interface PageComplexityStats {
|
|
157
|
+
pageNumber: number;
|
|
158
|
+
textLength: number;
|
|
159
|
+
textCoverage: number;
|
|
160
|
+
hasSubstantialImages: boolean;
|
|
161
|
+
imageBlockCount: number;
|
|
162
|
+
imageCoverage: number;
|
|
163
|
+
largestImageCoverage: number;
|
|
164
|
+
fullPageImage: boolean;
|
|
165
|
+
uncovertedVectorArea?: number | undefined;
|
|
166
|
+
isGarbled: boolean;
|
|
167
|
+
pageArea: number;
|
|
168
|
+
needsOcr: boolean;
|
|
169
|
+
reasons: string[];
|
|
170
|
+
}
|
|
171
|
+
/** Response detailing the complexity estimation for a file */
|
|
172
|
+
export interface IsComplexResponse {
|
|
173
|
+
complexity: PageComplexityStats[];
|
|
174
|
+
}
|
|
175
|
+
export declare const LiteParseConfig: MessageFns<LiteParseConfig>;
|
|
176
|
+
export declare const HttpHeader: MessageFns<HttpHeader>;
|
|
177
|
+
export declare const ParseRequest: MessageFns<ParseRequest>;
|
|
178
|
+
export declare const ScreenshotRequest: MessageFns<ScreenshotRequest>;
|
|
179
|
+
export declare const TextItem: MessageFns<TextItem>;
|
|
180
|
+
export declare const ParsedPage: MessageFns<ParsedPage>;
|
|
181
|
+
export declare const ParseResponse: MessageFns<ParseResponse>;
|
|
182
|
+
export declare const ScreenshotPage: MessageFns<ScreenshotPage>;
|
|
183
|
+
export declare const ScreenshotResponse: MessageFns<ScreenshotResponse>;
|
|
184
|
+
export declare const IsComplexRequest: MessageFns<IsComplexRequest>;
|
|
185
|
+
export declare const PageComplexityStats: MessageFns<PageComplexityStats>;
|
|
186
|
+
export declare const IsComplexResponse: MessageFns<IsComplexResponse>;
|
|
187
|
+
/** The Parser service definition. */
|
|
188
|
+
export type ParserServiceService = typeof ParserServiceService;
|
|
189
|
+
export declare const ParserServiceService: {
|
|
190
|
+
/** Parse a file */
|
|
191
|
+
readonly parse: {
|
|
192
|
+
readonly path: "/parser.ParserService/Parse";
|
|
193
|
+
readonly requestStream: false;
|
|
194
|
+
readonly responseStream: false;
|
|
195
|
+
readonly requestSerialize: (value: ParseRequest) => Buffer;
|
|
196
|
+
readonly requestDeserialize: (value: Buffer) => ParseRequest;
|
|
197
|
+
readonly responseSerialize: (value: ParseResponse) => Buffer;
|
|
198
|
+
readonly responseDeserialize: (value: Buffer) => ParseResponse;
|
|
199
|
+
};
|
|
200
|
+
/** Screenshot the pages of a PDF file */
|
|
201
|
+
readonly screenshot: {
|
|
202
|
+
readonly path: "/parser.ParserService/Screenshot";
|
|
203
|
+
readonly requestStream: false;
|
|
204
|
+
readonly responseStream: false;
|
|
205
|
+
readonly requestSerialize: (value: ScreenshotRequest) => Buffer;
|
|
206
|
+
readonly requestDeserialize: (value: Buffer) => ScreenshotRequest;
|
|
207
|
+
readonly responseSerialize: (value: ScreenshotResponse) => Buffer;
|
|
208
|
+
readonly responseDeserialize: (value: Buffer) => ScreenshotResponse;
|
|
209
|
+
};
|
|
210
|
+
/** Estimate the complexity of a file and the need for OCR */
|
|
211
|
+
readonly isComplex: {
|
|
212
|
+
readonly path: "/parser.ParserService/IsComplex";
|
|
213
|
+
readonly requestStream: false;
|
|
214
|
+
readonly responseStream: false;
|
|
215
|
+
readonly requestSerialize: (value: IsComplexRequest) => Buffer;
|
|
216
|
+
readonly requestDeserialize: (value: Buffer) => IsComplexRequest;
|
|
217
|
+
readonly responseSerialize: (value: IsComplexResponse) => Buffer;
|
|
218
|
+
readonly responseDeserialize: (value: Buffer) => IsComplexResponse;
|
|
219
|
+
};
|
|
220
|
+
};
|
|
221
|
+
export interface ParserServiceServer extends UntypedServiceImplementation {
|
|
222
|
+
/** Parse a file */
|
|
223
|
+
parse: handleUnaryCall<ParseRequest, ParseResponse>;
|
|
224
|
+
/** Screenshot the pages of a PDF file */
|
|
225
|
+
screenshot: handleUnaryCall<ScreenshotRequest, ScreenshotResponse>;
|
|
226
|
+
/** Estimate the complexity of a file and the need for OCR */
|
|
227
|
+
isComplex: handleUnaryCall<IsComplexRequest, IsComplexResponse>;
|
|
228
|
+
}
|
|
229
|
+
export interface ParserServiceClient extends Client {
|
|
230
|
+
/** Parse a file */
|
|
231
|
+
parse(request: ParseRequest, callback: (error: ServiceError | null, response: ParseResponse) => void): ClientUnaryCall;
|
|
232
|
+
parse(request: ParseRequest, metadata: Metadata, callback: (error: ServiceError | null, response: ParseResponse) => void): ClientUnaryCall;
|
|
233
|
+
parse(request: ParseRequest, metadata: Metadata, options: Partial<CallOptions>, callback: (error: ServiceError | null, response: ParseResponse) => void): ClientUnaryCall;
|
|
234
|
+
/** Screenshot the pages of a PDF file */
|
|
235
|
+
screenshot(request: ScreenshotRequest, callback: (error: ServiceError | null, response: ScreenshotResponse) => void): ClientUnaryCall;
|
|
236
|
+
screenshot(request: ScreenshotRequest, metadata: Metadata, callback: (error: ServiceError | null, response: ScreenshotResponse) => void): ClientUnaryCall;
|
|
237
|
+
screenshot(request: ScreenshotRequest, metadata: Metadata, options: Partial<CallOptions>, callback: (error: ServiceError | null, response: ScreenshotResponse) => void): ClientUnaryCall;
|
|
238
|
+
/** Estimate the complexity of a file and the need for OCR */
|
|
239
|
+
isComplex(request: IsComplexRequest, callback: (error: ServiceError | null, response: IsComplexResponse) => void): ClientUnaryCall;
|
|
240
|
+
isComplex(request: IsComplexRequest, metadata: Metadata, callback: (error: ServiceError | null, response: IsComplexResponse) => void): ClientUnaryCall;
|
|
241
|
+
isComplex(request: IsComplexRequest, metadata: Metadata, options: Partial<CallOptions>, callback: (error: ServiceError | null, response: IsComplexResponse) => void): ClientUnaryCall;
|
|
242
|
+
}
|
|
243
|
+
export declare const ParserServiceClient: {
|
|
244
|
+
new (address: string, credentials: ChannelCredentials, options?: Partial<ClientOptions>): ParserServiceClient;
|
|
245
|
+
service: typeof ParserServiceService;
|
|
246
|
+
serviceName: string;
|
|
247
|
+
};
|
|
248
|
+
type Builtin = Date | Function | Uint8Array | string | number | boolean | undefined;
|
|
249
|
+
export type DeepPartial<T> = T extends Builtin ? T : T extends globalThis.Array<infer U> ? globalThis.Array<DeepPartial<U>> : T extends ReadonlyArray<infer U> ? ReadonlyArray<DeepPartial<U>> : T extends {} ? {
|
|
250
|
+
[K in keyof T]?: DeepPartial<T[K]>;
|
|
251
|
+
} : Partial<T>;
|
|
252
|
+
type KeysOfUnion<T> = T extends T ? keyof T : never;
|
|
253
|
+
export type Exact<P, I extends P> = P extends Builtin ? P : P & {
|
|
254
|
+
[K in keyof P]: Exact<P[K], I[K]>;
|
|
255
|
+
} & {
|
|
256
|
+
[K in Exclude<keyof I, KeysOfUnion<P>>]: never;
|
|
257
|
+
};
|
|
258
|
+
export interface MessageFns<T> {
|
|
259
|
+
encode(message: T, writer?: BinaryWriter): BinaryWriter;
|
|
260
|
+
decode(input: BinaryReader | Uint8Array, length?: number): T;
|
|
261
|
+
fromJSON(object: any): T;
|
|
262
|
+
toJSON(message: T): unknown;
|
|
263
|
+
create<I extends Exact<DeepPartial<T>, I>>(base?: I): T;
|
|
264
|
+
fromPartial<I extends Exact<DeepPartial<T>, I>>(object: I): T;
|
|
265
|
+
}
|
|
266
|
+
export {};
|
|
267
|
+
//# sourceMappingURL=parser.d.ts.map
|
package/package.json
ADDED
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@llamaindex/liteparse-grpc",
|
|
3
|
+
"description": "gRPC server powered by LiteParse to perform parsing, screenshotting and complexity estimation of unstructured documents",
|
|
4
|
+
"version": "0.1.0",
|
|
5
|
+
"keywords": [
|
|
6
|
+
"parsing",
|
|
7
|
+
"text-extraction",
|
|
8
|
+
"ocr",
|
|
9
|
+
"markdown",
|
|
10
|
+
"grpc",
|
|
11
|
+
"server"
|
|
12
|
+
],
|
|
13
|
+
"author": {
|
|
14
|
+
"name": "LlamaIndex",
|
|
15
|
+
"url": "https://llamaindex.ai"
|
|
16
|
+
},
|
|
17
|
+
"repository": {
|
|
18
|
+
"type": "git",
|
|
19
|
+
"url": "git+ssh://git@github.com/run-llama/liteparse-server.git"
|
|
20
|
+
},
|
|
21
|
+
"type": "module",
|
|
22
|
+
"main": "./dist/protogen/parser.js",
|
|
23
|
+
"types": "./dist/types/protogen/parser.d.ts",
|
|
24
|
+
"exports": {
|
|
25
|
+
".": {
|
|
26
|
+
"types": "./dist/types/protogen/parser.d.ts",
|
|
27
|
+
"import": "./dist/protogen/parser.js",
|
|
28
|
+
"default": "./dist/protogen/parser.js"
|
|
29
|
+
},
|
|
30
|
+
"./client": {
|
|
31
|
+
"types": "./dist/types/client.d.ts",
|
|
32
|
+
"import": "./dist/client.js",
|
|
33
|
+
"default": "./dist/client.js"
|
|
34
|
+
},
|
|
35
|
+
"./server": {
|
|
36
|
+
"types": "./dist/types/server.d.ts",
|
|
37
|
+
"import": "./dist/server.js",
|
|
38
|
+
"default": "./dist/server.js"
|
|
39
|
+
},
|
|
40
|
+
"./proto": "./proto/parser.proto",
|
|
41
|
+
"./package.json": "./package.json"
|
|
42
|
+
},
|
|
43
|
+
"bin": {
|
|
44
|
+
"liteparse-grpc-server": "dist/server.js",
|
|
45
|
+
"liteparse-grpc-client": "dist/client.js"
|
|
46
|
+
},
|
|
47
|
+
"devDependencies": {
|
|
48
|
+
"@eslint/js": "^10.0.1",
|
|
49
|
+
"@types/node": "^26.1.0",
|
|
50
|
+
"esbuild": "^0.28.1",
|
|
51
|
+
"eslint": "^10.6.0",
|
|
52
|
+
"globals": "^17.7.0",
|
|
53
|
+
"jiti": "^2.7.0",
|
|
54
|
+
"pino-pretty": "^13.1.3",
|
|
55
|
+
"prettier": "^3.9.4",
|
|
56
|
+
"ts-proto": "^2.11.10",
|
|
57
|
+
"typescript-eslint": "^8.62.1"
|
|
58
|
+
},
|
|
59
|
+
"peerDependencies": {
|
|
60
|
+
"typescript": "^5"
|
|
61
|
+
},
|
|
62
|
+
"dependencies": {
|
|
63
|
+
"@bufbuild/protobuf": "^2.12.1",
|
|
64
|
+
"@grpc/grpc-js": "^1.14.4",
|
|
65
|
+
"@llamaindex/liteparse": "^2.4.0",
|
|
66
|
+
"pino": "^10.3.1"
|
|
67
|
+
},
|
|
68
|
+
"files": [
|
|
69
|
+
"dist/**/*.js",
|
|
70
|
+
"dist/**/*.d.ts",
|
|
71
|
+
"proto/",
|
|
72
|
+
"README.md"
|
|
73
|
+
],
|
|
74
|
+
"scripts": {
|
|
75
|
+
"lint": "eslint src/**",
|
|
76
|
+
"prettier": "prettier --write .",
|
|
77
|
+
"prettier:check": "prettier --check .",
|
|
78
|
+
"build:server": "node buildscripts/build-server.cjs",
|
|
79
|
+
"build:client": "node buildscripts/build-client.cjs",
|
|
80
|
+
"build:proto": "node buildscripts/build-proto.cjs",
|
|
81
|
+
"build:types": "tsc -p tsconfig.build.json",
|
|
82
|
+
"build": "pnpm run build:proto && pnpm run build:server && pnpm run build:client && pnpm run build:types"
|
|
83
|
+
}
|
|
84
|
+
}
|
|
@@ -0,0 +1,213 @@
|
|
|
1
|
+
// Copyright 2015 gRPC authors.
|
|
2
|
+
//
|
|
3
|
+
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
// you may not use this file except in compliance with the License.
|
|
5
|
+
// You may obtain a copy of the License at
|
|
6
|
+
//
|
|
7
|
+
// http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
//
|
|
9
|
+
// Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
// See the License for the specific language governing permissions and
|
|
13
|
+
// limitations under the License.
|
|
14
|
+
|
|
15
|
+
syntax = "proto3";
|
|
16
|
+
|
|
17
|
+
package parser;
|
|
18
|
+
|
|
19
|
+
option java_multiple_files = true;
|
|
20
|
+
option java_outer_classname = "ParserProto";
|
|
21
|
+
option java_package = "io.grpc.parser";
|
|
22
|
+
|
|
23
|
+
// The Parser service definition.
|
|
24
|
+
service ParserService {
|
|
25
|
+
// Parse a file
|
|
26
|
+
rpc Parse(ParseRequest) returns (ParseResponse) {}
|
|
27
|
+
// Screenshot the pages of a PDF file
|
|
28
|
+
rpc Screenshot(ScreenshotRequest) returns (ScreenshotResponse) {}
|
|
29
|
+
// Estimate the complexity of a file and the need for OCR
|
|
30
|
+
rpc IsComplex(IsComplexRequest) returns (IsComplexResponse) {}
|
|
31
|
+
}
|
|
32
|
+
|
|
33
|
+
// Configuration for LiteParse document parsing.
|
|
34
|
+
message LiteParseConfig {
|
|
35
|
+
// OCR language code (Tesseract format: "eng", "fra", "deu", etc.).
|
|
36
|
+
string ocr_language = 1;
|
|
37
|
+
|
|
38
|
+
// Whether OCR is enabled. When true, runs on text-sparse pages and embedded images.
|
|
39
|
+
bool ocr_enabled = 2;
|
|
40
|
+
|
|
41
|
+
// HTTP OCR server URL (uses Tesseract if not provided)
|
|
42
|
+
optional string ocr_server_url = 3;
|
|
43
|
+
|
|
44
|
+
// Extra HTTP headers sent with every request to `ocr_server_url`.
|
|
45
|
+
// Ignored when `ocr_server_url` is unset.
|
|
46
|
+
repeated HttpHeader ocr_server_headers = 4;
|
|
47
|
+
|
|
48
|
+
// Path to tessdata directory. Falls back to TESSDATA_PREFIX env var if not set.
|
|
49
|
+
optional string tessdata_path = 5;
|
|
50
|
+
|
|
51
|
+
// Maximum number of pages to parse.
|
|
52
|
+
uint64 max_pages = 6;
|
|
53
|
+
|
|
54
|
+
// Specific pages to parse (e.g., "1-5,10,15-20"). Unset means all pages.
|
|
55
|
+
optional string target_pages = 7;
|
|
56
|
+
|
|
57
|
+
// DPI for rendering pages (used for OCR and screenshots).
|
|
58
|
+
float dpi = 8;
|
|
59
|
+
|
|
60
|
+
// Output format.
|
|
61
|
+
OutputFormat output_format = 9;
|
|
62
|
+
|
|
63
|
+
// Keep very small text that would normally be filtered out.
|
|
64
|
+
bool preserve_very_small_text = 10;
|
|
65
|
+
|
|
66
|
+
// Password for encrypted/protected documents.
|
|
67
|
+
optional string password = 11;
|
|
68
|
+
|
|
69
|
+
// Suppress progress output.
|
|
70
|
+
bool quiet = 12;
|
|
71
|
+
|
|
72
|
+
// Number of concurrent OCR workers. Defaults to (number of CPU cores - 1), minimum 1.
|
|
73
|
+
uint64 num_workers = 13;
|
|
74
|
+
|
|
75
|
+
// Controls how raster images are surfaced in markdown output. Has no
|
|
76
|
+
// effect on JSON / text outputs.
|
|
77
|
+
ImageMode image_mode = 14;
|
|
78
|
+
|
|
79
|
+
// Extract hyperlink annotations and render them as `[text](url)` in
|
|
80
|
+
// markdown output. Default on. Disable for benchmark parity with
|
|
81
|
+
// plain-text ground truth (the GT corpora never use link syntax).
|
|
82
|
+
bool extract_links = 15;
|
|
83
|
+
|
|
84
|
+
// Whether a systemic OCR failure (every OCR task failed *and* at least one
|
|
85
|
+
// was a text-sparse page whose primary text source was OCR) aborts the
|
|
86
|
+
// whole parse. Default `true`: surface the root cause instead of silently
|
|
87
|
+
// emitting blank pages. Set `false` to keep already-recovered native text
|
|
88
|
+
// and return partial results when OCR is unavailable — useful for callers
|
|
89
|
+
// that prefer a degraded document over a hard failure (e.g. when the host
|
|
90
|
+
// has its own OCR fallback or treats OCR as best-effort enrichment).
|
|
91
|
+
bool ocr_failure_fatal = 16;
|
|
92
|
+
|
|
93
|
+
// OCR request-hedging schedule (milliseconds) for the HTTP OCR engine.
|
|
94
|
+
// Empty (default) = no hedging. With multiple delays (e.g.
|
|
95
|
+
// `[0, 5000, 10000, 15000, 20000]`), each OCR attempt fires a duplicate
|
|
96
|
+
// request at every delay and takes the first to succeed — trading extra
|
|
97
|
+
// OCR-server load for lower tail latency on a slow/stuck pod. No effect on
|
|
98
|
+
// the Tesseract engine.
|
|
99
|
+
repeated uint64 ocr_hedge_delays_ms = 17;
|
|
100
|
+
|
|
101
|
+
// Emit per-word sub-boxes on each `TextItem` (`TextItem.words`). Default
|
|
102
|
+
// `false`: a text item already carries its own box, and word boxes roughly
|
|
103
|
+
// double the text-item payload (size + napi marshalling), so they are only
|
|
104
|
+
// worth computing for callers doing word-level bbox attribution. When
|
|
105
|
+
// `false`, `TextItem.words` is always empty and the per-word tracking is
|
|
106
|
+
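// skipped entirely (zero allocation). NOTE(review): the `TextItem` message below declares no `words` field, so it is unclear how word boxes are surfaced over gRPC — confirm.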
|
|
107
|
+
bool emit_word_boxes = 18;
|
|
108
|
+
}
|
|
109
|
+
|
|
110
|
+
// A single HTTP header as a name/value pair.
|
|
111
|
+
message HttpHeader {
|
|
112
|
+
string name = 1;
|
|
113
|
+
string value = 2;
|
|
114
|
+
}
|
|
115
|
+
|
|
116
|
+
// Image handling for the markdown emitter.
|
|
117
|
+
//
|
|
118
|
+
// * OFF — strip image references entirely.
|
|
119
|
+
// * PLACEHOLDER (default) — emit markdown image references (`![alt](src)`) in
|
|
120
|
+
// reading order at each image's y position, but do not extract or
|
|
121
|
+
// return pixel bytes. Keeps response size small while letting the LLM see
|
|
122
|
+
// where figures live in the document.
|
|
123
|
+
// * EMBED — same references, plus bytes returned via `ParseResult.images`.
|
|
124
|
+
// Opt-in because pixel bytes can dwarf the text payload on image-heavy
|
|
125
|
+
// PDFs. (Bytes plumbing lands in stage 11b — current variant is parsed but
|
|
126
|
+
// behaves like PLACEHOLDER until then.)
|
|
127
|
+
enum ImageMode {
|
|
128
|
+
IMAGE_MODE_UNSPECIFIED = 0;
|
|
129
|
+
IMAGE_MODE_OFF = 1;
|
|
130
|
+
IMAGE_MODE_PLACEHOLDER = 2;
|
|
131
|
+
IMAGE_MODE_EMBED = 3;
|
|
132
|
+
}
|
|
133
|
+
|
|
134
|
+
// Supported output formats.
|
|
135
|
+
enum OutputFormat {
|
|
136
|
+
OUTPUT_FORMAT_UNSPECIFIED = 0;
|
|
137
|
+
OUTPUT_FORMAT_JSON = 1;
|
|
138
|
+
OUTPUT_FORMAT_TEXT = 2;
|
|
139
|
+
OUTPUT_FORMAT_MARKDOWN = 3;
|
|
140
|
+
}
|
|
141
|
+
|
|
142
|
+
// Request to parse a file.
|
|
143
|
+
message ParseRequest {
|
|
144
|
+
bytes file = 1;
|
|
145
|
+
optional LiteParseConfig config = 2;
|
|
146
|
+
}
|
|
147
|
+
|
|
148
|
+
// Request to screenshot the pages of a file.
|
|
149
|
+
message ScreenshotRequest {
|
|
150
|
+
bytes file = 1;
|
|
151
|
+
optional LiteParseConfig config = 2;
|
|
152
|
+
}
|
|
153
|
+
|
|
154
|
+
message TextItem {
|
|
155
|
+
string text = 1;
|
|
156
|
+
float x = 2;
|
|
157
|
+
float y = 3;
|
|
158
|
+
float width = 4;
|
|
159
|
+
float height = 5;
|
|
160
|
+
}
|
|
161
|
+
|
|
162
|
+
message ParsedPage {
|
|
163
|
+
uint32 page_number = 1;
|
|
164
|
+
float page_width = 2;
|
|
165
|
+
float page_height = 3;
|
|
166
|
+
repeated TextItem text_items = 4;
|
|
167
|
+
}
|
|
168
|
+
|
|
169
|
+
// Response with a parsed file
|
|
170
|
+
message ParseResponse {
|
|
171
|
+
string text = 1;
|
|
172
|
+
repeated ParsedPage pages = 2;
|
|
173
|
+
}
|
|
174
|
+
|
|
175
|
+
message ScreenshotPage {
|
|
176
|
+
bytes image_bytes = 1;
|
|
177
|
+
uint32 page_number = 2;
|
|
178
|
+
uint32 height = 3;
|
|
179
|
+
uint32 width = 4;
|
|
180
|
+
string mime_type = 5;
|
|
181
|
+
}
|
|
182
|
+
|
|
183
|
+
// Screenshot response for a PDF file
|
|
184
|
+
message ScreenshotResponse {
|
|
185
|
+
repeated ScreenshotPage screenshots = 1;
|
|
186
|
+
}
|
|
187
|
+
|
|
188
|
+
// Request to estimate the complexity of a file.
|
|
189
|
+
message IsComplexRequest {
|
|
190
|
+
bytes file = 1;
|
|
191
|
+
optional LiteParseConfig config = 2;
|
|
192
|
+
}
|
|
193
|
+
|
|
194
|
+
message PageComplexityStats {
|
|
195
|
+
uint32 page_number = 1;
|
|
196
|
+
uint64 text_length = 2;
|
|
197
|
+
float text_coverage = 3;
|
|
198
|
+
bool has_substantial_images = 4;
|
|
199
|
+
uint32 image_block_count = 5;
|
|
200
|
+
float image_coverage = 6;
|
|
201
|
+
float largest_image_coverage = 7;
|
|
202
|
+
bool full_page_image = 8;
|
|
203
|
+
optional float uncoverted_vector_area = 9;
|
|
204
|
+
bool is_garbled = 10;
|
|
205
|
+
float page_area = 11;
|
|
206
|
+
bool needs_ocr = 12;
|
|
207
|
+
repeated string reasons = 13;
|
|
208
|
+
}
|
|
209
|
+
|
|
210
|
+
// Response detailing the complexity estimation for a file
|
|
211
|
+
message IsComplexResponse {
|
|
212
|
+
repeated PageComplexityStats complexity = 1;
|
|
213
|
+
}
|