@llamaindex/llama-cloud 1.8.0 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +58 -0
- package/README.md +10 -8
- package/client.d.mts +4 -6
- package/client.d.mts.map +1 -1
- package/client.d.ts +4 -6
- package/client.d.ts.map +1 -1
- package/client.js +7 -6
- package/client.js.map +1 -1
- package/client.mjs +7 -6
- package/client.mjs.map +1 -1
- package/core/pagination.d.mts +0 -23
- package/core/pagination.d.mts.map +1 -1
- package/core/pagination.d.ts +0 -23
- package/core/pagination.d.ts.map +1 -1
- package/core/pagination.js +1 -32
- package/core/pagination.js.map +1 -1
- package/core/pagination.mjs +0 -30
- package/core/pagination.mjs.map +1 -1
- package/package.json +12 -1
- package/resources/beta/batch/batch.d.mts +55 -30
- package/resources/beta/batch/batch.d.mts.map +1 -1
- package/resources/beta/batch/batch.d.ts +55 -30
- package/resources/beta/batch/batch.d.ts.map +1 -1
- package/resources/beta/batch/batch.js +14 -11
- package/resources/beta/batch/batch.js.map +1 -1
- package/resources/beta/batch/batch.mjs +14 -11
- package/resources/beta/batch/batch.mjs.map +1 -1
- package/resources/beta/batch/job-items.d.mts +36 -13
- package/resources/beta/batch/job-items.d.mts.map +1 -1
- package/resources/beta/batch/job-items.d.ts +36 -13
- package/resources/beta/batch/job-items.d.ts.map +1 -1
- package/resources/beta/batch/job-items.js +6 -8
- package/resources/beta/batch/job-items.js.map +1 -1
- package/resources/beta/batch/job-items.mjs +6 -8
- package/resources/beta/batch/job-items.mjs.map +1 -1
- package/resources/beta/sheets.d.mts +16 -0
- package/resources/beta/sheets.d.mts.map +1 -1
- package/resources/beta/sheets.d.ts +16 -0
- package/resources/beta/sheets.d.ts.map +1 -1
- package/resources/beta/split.d.mts +60 -16
- package/resources/beta/split.d.mts.map +1 -1
- package/resources/beta/split.d.ts +60 -16
- package/resources/beta/split.d.ts.map +1 -1
- package/resources/beta/split.js.map +1 -1
- package/resources/beta/split.mjs.map +1 -1
- package/resources/classifier/jobs.d.mts +12 -3
- package/resources/classifier/jobs.d.mts.map +1 -1
- package/resources/classifier/jobs.d.ts +12 -3
- package/resources/classifier/jobs.d.ts.map +1 -1
- package/resources/classify.d.mts +76 -29
- package/resources/classify.d.mts.map +1 -1
- package/resources/classify.d.ts +76 -29
- package/resources/classify.d.ts.map +1 -1
- package/resources/classify.js +19 -2
- package/resources/classify.js.map +1 -1
- package/resources/classify.mjs +19 -2
- package/resources/classify.mjs.map +1 -1
- package/resources/extract.d.mts +1588 -0
- package/resources/extract.d.mts.map +1 -0
- package/resources/extract.d.ts +1588 -0
- package/resources/extract.d.ts.map +1 -0
- package/resources/extract.js +217 -0
- package/resources/extract.js.map +1 -0
- package/resources/extract.mjs +213 -0
- package/resources/extract.mjs.map +1 -0
- package/resources/files.d.mts +52 -38
- package/resources/files.d.mts.map +1 -1
- package/resources/files.d.ts +52 -38
- package/resources/files.d.ts.map +1 -1
- package/resources/files.js +10 -9
- package/resources/files.js.map +1 -1
- package/resources/files.mjs +10 -9
- package/resources/files.mjs.map +1 -1
- package/resources/index.d.mts +1 -1
- package/resources/index.d.mts.map +1 -1
- package/resources/index.d.ts +1 -1
- package/resources/index.d.ts.map +1 -1
- package/resources/index.js +3 -3
- package/resources/index.js.map +1 -1
- package/resources/index.mjs +1 -1
- package/resources/index.mjs.map +1 -1
- package/resources/parsing.d.mts +292 -138
- package/resources/parsing.d.mts.map +1 -1
- package/resources/parsing.d.ts +292 -138
- package/resources/parsing.d.ts.map +1 -1
- package/resources/parsing.js +30 -4
- package/resources/parsing.js.map +1 -1
- package/resources/parsing.mjs +30 -4
- package/resources/parsing.mjs.map +1 -1
- package/resources/pipelines/pipelines.d.mts +59 -13
- package/resources/pipelines/pipelines.d.mts.map +1 -1
- package/resources/pipelines/pipelines.d.ts +59 -13
- package/resources/pipelines/pipelines.d.ts.map +1 -1
- package/resources/pipelines/pipelines.js +24 -9
- package/resources/pipelines/pipelines.js.map +1 -1
- package/resources/pipelines/pipelines.mjs +24 -9
- package/resources/pipelines/pipelines.mjs.map +1 -1
- package/resources/pipelines/sync.d.mts +5 -3
- package/resources/pipelines/sync.d.mts.map +1 -1
- package/resources/pipelines/sync.d.ts +5 -3
- package/resources/pipelines/sync.d.ts.map +1 -1
- package/resources/pipelines/sync.js +5 -3
- package/resources/pipelines/sync.js.map +1 -1
- package/resources/pipelines/sync.mjs +5 -3
- package/resources/pipelines/sync.mjs.map +1 -1
- package/src/client.ts +50 -15
- package/src/core/pagination.ts +0 -71
- package/src/resources/beta/batch/batch.ts +75 -30
- package/src/resources/beta/batch/job-items.ts +56 -13
- package/src/resources/beta/sheets.ts +20 -0
- package/src/resources/beta/split.ts +70 -17
- package/src/resources/classifier/jobs.ts +12 -3
- package/src/resources/classify.ts +82 -29
- package/src/resources/extract.ts +2045 -0
- package/src/resources/files.ts +52 -38
- package/src/resources/index.ts +22 -1
- package/src/resources/parsing.ts +323 -136
- package/src/resources/pipelines/pipelines.ts +80 -14
- package/src/resources/pipelines/sync.ts +5 -3
- package/src/version.ts +1 -1
- package/version.d.mts +1 -1
- package/version.d.ts +1 -1
- package/version.js +1 -1
- package/version.mjs +1 -1
- package/resources/extraction/extraction-agents/extraction-agents.d.mts +0 -126
- package/resources/extraction/extraction-agents/extraction-agents.d.mts.map +0 -1
- package/resources/extraction/extraction-agents/extraction-agents.d.ts +0 -126
- package/resources/extraction/extraction-agents/extraction-agents.d.ts.map +0 -1
- package/resources/extraction/extraction-agents/extraction-agents.js +0 -56
- package/resources/extraction/extraction-agents/extraction-agents.js.map +0 -1
- package/resources/extraction/extraction-agents/extraction-agents.mjs +0 -51
- package/resources/extraction/extraction-agents/extraction-agents.mjs.map +0 -1
- package/resources/extraction/extraction-agents/index.d.mts +0 -3
- package/resources/extraction/extraction-agents/index.d.mts.map +0 -1
- package/resources/extraction/extraction-agents/index.d.ts +0 -3
- package/resources/extraction/extraction-agents/index.d.ts.map +0 -1
- package/resources/extraction/extraction-agents/index.js +0 -9
- package/resources/extraction/extraction-agents/index.js.map +0 -1
- package/resources/extraction/extraction-agents/index.mjs +0 -4
- package/resources/extraction/extraction-agents/index.mjs.map +0 -1
- package/resources/extraction/extraction-agents/schema.d.mts +0 -75
- package/resources/extraction/extraction-agents/schema.d.mts.map +0 -1
- package/resources/extraction/extraction-agents/schema.d.ts +0 -75
- package/resources/extraction/extraction-agents/schema.d.ts.map +0 -1
- package/resources/extraction/extraction-agents/schema.js +0 -28
- package/resources/extraction/extraction-agents/schema.js.map +0 -1
- package/resources/extraction/extraction-agents/schema.mjs +0 -24
- package/resources/extraction/extraction-agents/schema.mjs.map +0 -1
- package/resources/extraction/extraction-agents.d.mts +0 -2
- package/resources/extraction/extraction-agents.d.mts.map +0 -1
- package/resources/extraction/extraction-agents.d.ts +0 -2
- package/resources/extraction/extraction-agents.d.ts.map +0 -1
- package/resources/extraction/extraction-agents.js +0 -6
- package/resources/extraction/extraction-agents.js.map +0 -1
- package/resources/extraction/extraction-agents.mjs +0 -3
- package/resources/extraction/extraction-agents.mjs.map +0 -1
- package/resources/extraction/extraction.d.mts +0 -118
- package/resources/extraction/extraction.d.mts.map +0 -1
- package/resources/extraction/extraction.d.ts +0 -118
- package/resources/extraction/extraction.d.ts.map +0 -1
- package/resources/extraction/extraction.js +0 -91
- package/resources/extraction/extraction.js.map +0 -1
- package/resources/extraction/extraction.mjs +0 -86
- package/resources/extraction/extraction.mjs.map +0 -1
- package/resources/extraction/index.d.mts +0 -5
- package/resources/extraction/index.d.mts.map +0 -1
- package/resources/extraction/index.d.ts +0 -5
- package/resources/extraction/index.d.ts.map +0 -1
- package/resources/extraction/index.js +0 -13
- package/resources/extraction/index.js.map +0 -1
- package/resources/extraction/index.mjs +0 -6
- package/resources/extraction/index.mjs.map +0 -1
- package/resources/extraction/jobs.d.mts +0 -280
- package/resources/extraction/jobs.d.mts.map +0 -1
- package/resources/extraction/jobs.d.ts +0 -280
- package/resources/extraction/jobs.d.ts.map +0 -1
- package/resources/extraction/jobs.js +0 -179
- package/resources/extraction/jobs.js.map +0 -1
- package/resources/extraction/jobs.mjs +0 -175
- package/resources/extraction/jobs.mjs.map +0 -1
- package/resources/extraction/runs.d.mts +0 -198
- package/resources/extraction/runs.d.mts.map +0 -1
- package/resources/extraction/runs.d.ts +0 -198
- package/resources/extraction/runs.d.ts.map +0 -1
- package/resources/extraction/runs.js +0 -42
- package/resources/extraction/runs.js.map +0 -1
- package/resources/extraction/runs.mjs +0 -38
- package/resources/extraction/runs.mjs.map +0 -1
- package/resources/extraction.d.mts +0 -2
- package/resources/extraction.d.mts.map +0 -1
- package/resources/extraction.d.ts +0 -2
- package/resources/extraction.d.ts.map +0 -1
- package/resources/extraction.js +0 -6
- package/resources/extraction.js.map +0 -1
- package/resources/extraction.mjs +0 -3
- package/resources/extraction.mjs.map +0 -1
- package/src/resources/extraction/extraction-agents/extraction-agents.ts +0 -196
- package/src/resources/extraction/extraction-agents/index.ts +0 -18
- package/src/resources/extraction/extraction-agents/schema.ts +0 -100
- package/src/resources/extraction/extraction-agents.ts +0 -3
- package/src/resources/extraction/extraction.ts +0 -224
- package/src/resources/extraction/index.ts +0 -34
- package/src/resources/extraction/jobs.ts +0 -414
- package/src/resources/extraction/runs.ts +0 -315
- package/src/resources/extraction.ts +0 -3
|
@@ -0,0 +1,1588 @@
|
|
|
1
|
+
import { APIResource } from "../core/resource.js";
|
|
2
|
+
import * as ExtractAPI from "./extract.js";
|
|
3
|
+
import * as ParsingAPI from "./parsing.js";
|
|
4
|
+
import * as SplitAPI from "./beta/split.js";
|
|
5
|
+
import { APIPromise } from "../core/api-promise.js";
|
|
6
|
+
import { PagePromise, PaginatedCursor, type PaginatedCursorParams } from "../core/pagination.js";
|
|
7
|
+
import { RequestOptions } from "../internal/request-options.js";
|
|
8
|
+
import { PollingOptions } from "../core/polling.js";
|
|
9
|
+
export declare class Extract extends APIResource {
|
|
10
|
+
/**
|
|
11
|
+
* Create an extraction job.
|
|
12
|
+
*
|
|
13
|
+
* Extracts structured data from a document using either a saved configuration or
|
|
14
|
+
* an inline JSON Schema.
|
|
15
|
+
*
|
|
16
|
+
* ## Input
|
|
17
|
+
*
|
|
18
|
+
* Provide exactly one of:
|
|
19
|
+
*
|
|
20
|
+
* - `configuration_id` — reference a saved extraction config
|
|
21
|
+
* - `configuration` — inline configuration with a `data_schema`
|
|
22
|
+
*
|
|
23
|
+
* ## Document input
|
|
24
|
+
*
|
|
25
|
+
* Set `document_input_value` to a file ID (`dfl-...`) or a completed parse job ID
|
|
26
|
+
* (`pjb-...`).
|
|
27
|
+
*
|
|
28
|
+
* The job runs asynchronously. Poll `GET /extract/{job_id}` or register a webhook
|
|
29
|
+
* to monitor completion.
|
|
30
|
+
*
|
|
31
|
+
* @example
|
|
32
|
+
* ```ts
|
|
33
|
+
* const extractV2Job = await client.extract.create({
|
|
34
|
+
* document_input_value:
|
|
35
|
+
* 'dfl-aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee',
|
|
36
|
+
* });
|
|
37
|
+
* ```
|
|
38
|
+
*/
|
|
39
|
+
create(params: ExtractCreateParams, options?: RequestOptions): APIPromise<ExtractV2Job>;
|
|
40
|
+
/**
|
|
41
|
+
* List extraction jobs with optional filtering and pagination.
|
|
42
|
+
*
|
|
43
|
+
* Filter by `configuration_id`, `status`, `document_input_value`, or creation date
|
|
44
|
+
* range. Results are returned newest-first. Use `expand=configuration` to include
|
|
45
|
+
* the full configuration used, and `expand=extract_metadata` for per-field
|
|
46
|
+
* metadata.
|
|
47
|
+
*
|
|
48
|
+
* @example
|
|
49
|
+
* ```ts
|
|
50
|
+
* // Automatically fetches more pages as needed.
|
|
51
|
+
* for await (const extractV2Job of client.extract.list()) {
|
|
52
|
+
* // ...
|
|
53
|
+
* }
|
|
54
|
+
* ```
|
|
55
|
+
*/
|
|
56
|
+
list(query?: ExtractListParams | null | undefined, options?: RequestOptions): PagePromise<ExtractV2JobsPaginatedCursor, ExtractV2Job>;
|
|
57
|
+
/**
|
|
58
|
+
* Delete an extraction job and its results.
|
|
59
|
+
*
|
|
60
|
+
* @example
|
|
61
|
+
* ```ts
|
|
62
|
+
* const extract = await client.extract.delete('job_id');
|
|
63
|
+
* ```
|
|
64
|
+
*/
|
|
65
|
+
delete(jobID: string, params?: ExtractDeleteParams | null | undefined, options?: RequestOptions): APIPromise<unknown>;
|
|
66
|
+
/**
|
|
67
|
+
* Generate a JSON schema and return a product configuration request.
|
|
68
|
+
*
|
|
69
|
+
* @example
|
|
70
|
+
* ```ts
|
|
71
|
+
* const response = await client.extract.generateSchema();
|
|
72
|
+
* ```
|
|
73
|
+
*/
|
|
74
|
+
generateSchema(params: ExtractGenerateSchemaParams, options?: RequestOptions): APIPromise<ExtractGenerateSchemaResponse>;
|
|
75
|
+
/**
|
|
76
|
+
* Get a single extraction job by ID.
|
|
77
|
+
*
|
|
78
|
+
* Returns the job status and results when complete. Use `expand=configuration` to
|
|
79
|
+
* include the full configuration used, and `expand=extract_metadata` for per-field
|
|
80
|
+
* metadata.
|
|
81
|
+
*
|
|
82
|
+
* @example
|
|
83
|
+
* ```ts
|
|
84
|
+
* const extractV2Job = await client.extract.get('job_id');
|
|
85
|
+
* ```
|
|
86
|
+
*/
|
|
87
|
+
get(jobID: string, query?: ExtractGetParams | null | undefined, options?: RequestOptions): APIPromise<ExtractV2Job>;
|
|
88
|
+
/**
|
|
89
|
+
* Validate a JSON schema for extraction.
|
|
90
|
+
*
|
|
91
|
+
* @example
|
|
92
|
+
* ```ts
|
|
93
|
+
* const extractV2SchemaValidateResponse =
|
|
94
|
+
* await client.extract.validateSchema({
|
|
95
|
+
* data_schema: { foo: { foo: 'bar' } },
|
|
96
|
+
* });
|
|
97
|
+
* ```
|
|
98
|
+
*/
|
|
99
|
+
validateSchema(body: ExtractValidateSchemaParams, options?: RequestOptions): APIPromise<ExtractV2SchemaValidateResponse>;
|
|
100
|
+
/**
|
|
101
|
+
* Wait for an extraction job to complete by polling until it reaches a terminal state.
|
|
102
|
+
*
|
|
103
|
+
* @param jobID - The ID of the extraction job to wait for
|
|
104
|
+
* @param query - Optional query parameters (organization_id, project_id)
|
|
105
|
+
* @param options - Polling configuration and request options
|
|
106
|
+
* @returns The completed extraction job
|
|
107
|
+
* @throws {PollingTimeoutError} If the job doesn't complete within the timeout period
|
|
108
|
+
* @throws {PollingError} If the job fails or is cancelled
|
|
109
|
+
*
|
|
110
|
+
* @example
|
|
111
|
+
* ```typescript
|
|
112
|
+
* const job = await client.extract.create({
|
|
113
|
+
* document_input_value: 'dfl-aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee',
|
|
114
|
+
* });
|
|
115
|
+
*
|
|
116
|
+
* const completed = await client.extract.waitForCompletion(job.id, undefined, { verbose: true });
|
|
117
|
+
* console.log(completed.extract_result);
|
|
118
|
+
* ```
|
|
119
|
+
*/
|
|
120
|
+
waitForCompletion(jobID: string, query?: ExtractGetParams, options?: PollingOptions & RequestOptions): Promise<ExtractV2Job>;
|
|
121
|
+
/**
|
|
122
|
+
* Create an extraction job, wait for it to complete, and return the result.
|
|
123
|
+
*
|
|
124
|
+
* This is a convenience method that combines create() and waitForCompletion()
|
|
125
|
+
* into a single call for the most common end-to-end workflow.
|
|
126
|
+
*
|
|
127
|
+
* @param params - Extract job creation parameters
|
|
128
|
+
* @param options - Polling configuration and request options
|
|
129
|
+
* @returns The completed extraction job with extract_result populated
|
|
130
|
+
* @throws {PollingTimeoutError} If the job doesn't complete within the timeout period
|
|
131
|
+
* @throws {PollingError} If the job fails or is cancelled
|
|
132
|
+
*
|
|
133
|
+
* @example
|
|
134
|
+
* ```typescript
|
|
135
|
+
* import { LlamaCloud } from '@llamaindex/llama-cloud';
|
|
136
|
+
*
|
|
137
|
+
* const client = new LlamaCloud({ apiKey: '...' });
|
|
138
|
+
*
|
|
139
|
+
* const result = await client.extract.run({
|
|
140
|
+
* document_input_value: 'dfl-aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee',
|
|
141
|
+
* configuration: {
|
|
142
|
+
* data_schema: { name: { type: 'string' }, age: { type: 'number' } },
|
|
143
|
+
* },
|
|
144
|
+
* }, { verbose: true });
|
|
145
|
+
*
|
|
146
|
+
* console.log(result.extract_result);
|
|
147
|
+
* ```
|
|
148
|
+
*/
|
|
149
|
+
run(params: ExtractCreateParams, options?: PollingOptions & RequestOptions): Promise<ExtractV2Job>;
|
|
150
|
+
}
|
|
151
|
+
export type ExtractV2JobsPaginatedCursor = PaginatedCursor<ExtractV2Job>;
|
|
152
|
+
/**
|
|
153
|
+
* Extract configuration combining parse and extract settings.
|
|
154
|
+
*/
|
|
155
|
+
export interface ExtractConfiguration {
|
|
156
|
+
/**
|
|
157
|
+
* JSON Schema defining the fields to extract. Validate with the /schema/validate
|
|
158
|
+
* endpoint first.
|
|
159
|
+
*/
|
|
160
|
+
data_schema: {
|
|
161
|
+
[key: string]: {
|
|
162
|
+
[key: string]: unknown;
|
|
163
|
+
} | Array<unknown> | string | number | boolean | null;
|
|
164
|
+
};
|
|
165
|
+
/**
|
|
166
|
+
* Include citations in results
|
|
167
|
+
*/
|
|
168
|
+
cite_sources?: boolean;
|
|
169
|
+
/**
|
|
170
|
+
* Include confidence scores in results
|
|
171
|
+
*/
|
|
172
|
+
confidence_scores?: boolean;
|
|
173
|
+
/**
|
|
174
|
+
* Extract algorithm version. Use 'latest' or a date string.
|
|
175
|
+
*/
|
|
176
|
+
extract_version?: string;
|
|
177
|
+
/**
|
|
178
|
+
* Granularity of extraction: per_doc returns one object per document, per_page
|
|
179
|
+
* returns one object per page, per_table_row returns one object per table row
|
|
180
|
+
*/
|
|
181
|
+
extraction_target?: 'per_doc' | 'per_page' | 'per_table_row';
|
|
182
|
+
/**
|
|
183
|
+
* ISO 639-1 language code for the document
|
|
184
|
+
*/
|
|
185
|
+
lang?: string;
|
|
186
|
+
/**
|
|
187
|
+
* Maximum number of pages to process. Omit for no limit.
|
|
188
|
+
*/
|
|
189
|
+
max_pages?: number | null;
|
|
190
|
+
/**
|
|
191
|
+
* Saved parse configuration ID to control how the document is parsed before
|
|
192
|
+
* extraction
|
|
193
|
+
*/
|
|
194
|
+
parse_config_id?: string | null;
|
|
195
|
+
/**
|
|
196
|
+
* Parse tier to use before extraction (fast, cost_effective, or agentic)
|
|
197
|
+
*/
|
|
198
|
+
parse_tier?: string | null;
|
|
199
|
+
/**
|
|
200
|
+
* Custom system prompt to guide extraction behavior
|
|
201
|
+
*/
|
|
202
|
+
system_prompt?: string | null;
|
|
203
|
+
/**
|
|
204
|
+
* Comma-separated page numbers or ranges to process (1-based). Omit to process all
|
|
205
|
+
* pages.
|
|
206
|
+
*/
|
|
207
|
+
target_pages?: string | null;
|
|
208
|
+
/**
|
|
209
|
+
* Extract tier: cost_effective (5 credits/page) or agentic (15 credits/page)
|
|
210
|
+
*/
|
|
211
|
+
tier?: 'cost_effective' | 'agentic';
|
|
212
|
+
}
|
|
213
|
+
/**
|
|
214
|
+
* Extraction metadata.
|
|
215
|
+
*/
|
|
216
|
+
export interface ExtractJobMetadata {
|
|
217
|
+
/**
|
|
218
|
+
* Metadata for extracted fields including document, page, and row level info.
|
|
219
|
+
*/
|
|
220
|
+
field_metadata?: ExtractedFieldMetadata | null;
|
|
221
|
+
/**
|
|
222
|
+
* Reference to the ParseJob ID used for parsing
|
|
223
|
+
*/
|
|
224
|
+
parse_job_id?: string | null;
|
|
225
|
+
/**
|
|
226
|
+
* Parse tier used for parsing the document
|
|
227
|
+
*/
|
|
228
|
+
parse_tier?: string | null;
|
|
229
|
+
}
|
|
230
|
+
/**
|
|
231
|
+
* Extraction usage metrics.
|
|
232
|
+
*/
|
|
233
|
+
export interface ExtractJobUsage {
|
|
234
|
+
/**
|
|
235
|
+
* Number of document tokens
|
|
236
|
+
*/
|
|
237
|
+
num_document_tokens?: number | null;
|
|
238
|
+
/**
|
|
239
|
+
* Number of output tokens
|
|
240
|
+
*/
|
|
241
|
+
num_output_tokens?: number | null;
|
|
242
|
+
/**
|
|
243
|
+
* Number of pages extracted
|
|
244
|
+
*/
|
|
245
|
+
num_pages_extracted?: number | null;
|
|
246
|
+
}
|
|
247
|
+
/**
|
|
248
|
+
* An extraction job.
|
|
249
|
+
*/
|
|
250
|
+
export interface ExtractV2Job {
|
|
251
|
+
/**
|
|
252
|
+
* Unique job identifier (job_id)
|
|
253
|
+
*/
|
|
254
|
+
id: string;
|
|
255
|
+
/**
|
|
256
|
+
* Creation timestamp
|
|
257
|
+
*/
|
|
258
|
+
created_at: string;
|
|
259
|
+
/**
|
|
260
|
+
* File ID or parse job ID that was extracted
|
|
261
|
+
*/
|
|
262
|
+
document_input_value: string;
|
|
263
|
+
/**
|
|
264
|
+
* Project this job belongs to
|
|
265
|
+
*/
|
|
266
|
+
project_id: string;
|
|
267
|
+
/**
|
|
268
|
+
* Current job status.
|
|
269
|
+
*
|
|
270
|
+
* - `PENDING` — queued, not yet started
|
|
271
|
+
* - `RUNNING` — actively processing
|
|
272
|
+
* - `COMPLETED` — finished successfully
|
|
273
|
+
* - `FAILED` — terminated with an error
|
|
274
|
+
* - `CANCELLED` — cancelled by user
|
|
275
|
+
*/
|
|
276
|
+
status: string;
|
|
277
|
+
/**
|
|
278
|
+
* Last update timestamp
|
|
279
|
+
*/
|
|
280
|
+
updated_at: string;
|
|
281
|
+
/**
|
|
282
|
+
* Extract configuration combining parse and extract settings.
|
|
283
|
+
*/
|
|
284
|
+
configuration?: ExtractConfiguration | null;
|
|
285
|
+
/**
|
|
286
|
+
* Saved extract configuration ID used for this job, if any
|
|
287
|
+
*/
|
|
288
|
+
configuration_id?: string | null;
|
|
289
|
+
/**
|
|
290
|
+
* Error details when status is FAILED
|
|
291
|
+
*/
|
|
292
|
+
error_message?: string | null;
|
|
293
|
+
/**
|
|
294
|
+
* Extraction metadata.
|
|
295
|
+
*/
|
|
296
|
+
extract_metadata?: ExtractJobMetadata | null;
|
|
297
|
+
/**
|
|
298
|
+
* Extracted data conforming to the data_schema. Returns a single object for
|
|
299
|
+
* per_doc, or an array for per_page / per_table_row.
|
|
300
|
+
*/
|
|
301
|
+
extract_result?: {
|
|
302
|
+
[key: string]: {
|
|
303
|
+
[key: string]: unknown;
|
|
304
|
+
} | Array<unknown> | string | number | boolean | null;
|
|
305
|
+
} | Array<{
|
|
306
|
+
[key: string]: {
|
|
307
|
+
[key: string]: unknown;
|
|
308
|
+
} | Array<unknown> | string | number | boolean | null;
|
|
309
|
+
}> | null;
|
|
310
|
+
/**
|
|
311
|
+
* Job-level metadata.
|
|
312
|
+
*/
|
|
313
|
+
metadata?: ExtractV2Job.Metadata | null;
|
|
314
|
+
}
|
|
315
|
+
export declare namespace ExtractV2Job {
|
|
316
|
+
/**
|
|
317
|
+
* Job-level metadata.
|
|
318
|
+
*/
|
|
319
|
+
interface Metadata {
|
|
320
|
+
/**
|
|
321
|
+
* Extraction usage metrics.
|
|
322
|
+
*/
|
|
323
|
+
usage?: ExtractAPI.ExtractJobUsage | null;
|
|
324
|
+
[k: string]: unknown;
|
|
325
|
+
}
|
|
326
|
+
}
|
|
327
|
+
/**
|
|
328
|
+
* Request to create an extraction job. Provide exactly one of configuration_id or an inline
|
|
329
|
+
* configuration.
|
|
330
|
+
*/
|
|
331
|
+
export interface ExtractV2JobCreate {
|
|
332
|
+
/**
|
|
333
|
+
* File ID or Parse Job ID to extract from
|
|
334
|
+
*/
|
|
335
|
+
document_input_value: string;
|
|
336
|
+
/**
|
|
337
|
+
* Extract configuration combining parse and extract settings.
|
|
338
|
+
*/
|
|
339
|
+
configuration?: ExtractConfiguration | null;
|
|
340
|
+
/**
|
|
341
|
+
* Saved extract configuration ID (mutually exclusive with configuration)
|
|
342
|
+
*/
|
|
343
|
+
configuration_id?: string | null;
|
|
344
|
+
/**
|
|
345
|
+
* Outbound webhook endpoints to notify on job status changes
|
|
346
|
+
*/
|
|
347
|
+
webhook_configurations?: Array<ExtractV2JobCreate.WebhookConfiguration> | null;
|
|
348
|
+
}
|
|
349
|
+
export declare namespace ExtractV2JobCreate {
|
|
350
|
+
/**
|
|
351
|
+
* Configuration for a single outbound webhook endpoint.
|
|
352
|
+
*/
|
|
353
|
+
interface WebhookConfiguration {
|
|
354
|
+
/**
|
|
355
|
+
* Events to subscribe to (e.g. 'parse.success', 'extract.error'). If null, all
|
|
356
|
+
* events are delivered.
|
|
357
|
+
*/
|
|
358
|
+
webhook_events?: Array<'extract.pending' | 'extract.success' | 'extract.error' | 'extract.partial_success' | 'extract.cancelled' | 'parse.pending' | 'parse.running' | 'parse.success' | 'parse.error' | 'parse.partial_success' | 'parse.cancelled' | 'classify.pending' | 'classify.success' | 'classify.error' | 'classify.partial_success' | 'classify.cancelled' | 'unmapped_event'> | null;
|
|
359
|
+
/**
|
|
360
|
+
* Custom HTTP headers sent with each webhook request (e.g. auth tokens)
|
|
361
|
+
*/
|
|
362
|
+
webhook_headers?: {
|
|
363
|
+
[key: string]: string;
|
|
364
|
+
} | null;
|
|
365
|
+
/**
|
|
366
|
+
* Response format sent to the webhook: 'string' (default) or 'json'
|
|
367
|
+
*/
|
|
368
|
+
webhook_output_format?: string | null;
|
|
369
|
+
/**
|
|
370
|
+
* URL to receive webhook POST notifications
|
|
371
|
+
*/
|
|
372
|
+
webhook_url?: string | null;
|
|
373
|
+
}
|
|
374
|
+
}
|
|
375
|
+
/**
|
|
376
|
+
* Paginated list of extraction jobs.
|
|
377
|
+
*/
|
|
378
|
+
export interface ExtractV2JobQueryResponse {
|
|
379
|
+
/**
|
|
380
|
+
* The list of items.
|
|
381
|
+
*/
|
|
382
|
+
items: Array<ExtractV2Job>;
|
|
383
|
+
/**
|
|
384
|
+
* A token, which can be sent as page_token to retrieve the next page. If this
|
|
385
|
+
* field is omitted, there are no subsequent pages.
|
|
386
|
+
*/
|
|
387
|
+
next_page_token?: string | null;
|
|
388
|
+
/**
|
|
389
|
+
* The total number of items available. This is only populated when specifically
|
|
390
|
+
* requested. The value may be an estimate and can be used for display purposes
|
|
391
|
+
* only.
|
|
392
|
+
*/
|
|
393
|
+
total_size?: number | null;
|
|
394
|
+
}
|
|
395
|
+
/**
|
|
396
|
+
* Request schema for generating an extraction schema.
|
|
397
|
+
*/
|
|
398
|
+
export interface ExtractV2SchemaGenerateRequest {
|
|
399
|
+
/**
|
|
400
|
+
* Optional schema to validate, refine, or extend
|
|
401
|
+
*/
|
|
402
|
+
data_schema?: {
|
|
403
|
+
[key: string]: {
|
|
404
|
+
[key: string]: unknown;
|
|
405
|
+
} | Array<unknown> | string | number | boolean | null;
|
|
406
|
+
} | null;
|
|
407
|
+
/**
|
|
408
|
+
* Optional file ID to analyze for schema generation
|
|
409
|
+
*/
|
|
410
|
+
file_id?: string | null;
|
|
411
|
+
/**
|
|
412
|
+
* Name for the generated configuration (auto-generated if omitted)
|
|
413
|
+
*/
|
|
414
|
+
name?: string | null;
|
|
415
|
+
/**
|
|
416
|
+
* Natural language description of the data structure to extract
|
|
417
|
+
*/
|
|
418
|
+
prompt?: string | null;
|
|
419
|
+
}
|
|
420
|
+
/**
|
|
421
|
+
* Request schema for validating an extraction schema.
|
|
422
|
+
*/
|
|
423
|
+
export interface ExtractV2SchemaValidateRequest {
|
|
424
|
+
/**
|
|
425
|
+
* JSON Schema to validate for use with extract jobs
|
|
426
|
+
*/
|
|
427
|
+
data_schema: {
|
|
428
|
+
[key: string]: {
|
|
429
|
+
[key: string]: unknown;
|
|
430
|
+
} | Array<unknown> | string | number | boolean | null;
|
|
431
|
+
};
|
|
432
|
+
}
|
|
433
|
+
/**
|
|
434
|
+
* Response schema for schema validation.
|
|
435
|
+
*/
|
|
436
|
+
export interface ExtractV2SchemaValidateResponse {
|
|
437
|
+
/**
|
|
438
|
+
* Validated JSON Schema, ready for use in extract jobs
|
|
439
|
+
*/
|
|
440
|
+
data_schema: {
|
|
441
|
+
[key: string]: {
|
|
442
|
+
[key: string]: unknown;
|
|
443
|
+
} | Array<unknown> | string | number | boolean | null;
|
|
444
|
+
};
|
|
445
|
+
}
|
|
446
|
+
/**
|
|
447
|
+
* Metadata for extracted fields including document, page, and row level info.
|
|
448
|
+
*/
|
|
449
|
+
export interface ExtractedFieldMetadata {
|
|
450
|
+
/**
|
|
451
|
+
* Document-level metadata (citations, confidence) keyed by field name
|
|
452
|
+
*/
|
|
453
|
+
document_metadata?: {
|
|
454
|
+
[key: string]: {
|
|
455
|
+
[key: string]: unknown;
|
|
456
|
+
} | Array<unknown> | string | number | boolean | null;
|
|
457
|
+
} | null;
|
|
458
|
+
/**
|
|
459
|
+
* Per-page metadata when extraction_target is per_page
|
|
460
|
+
*/
|
|
461
|
+
page_metadata?: Array<{
|
|
462
|
+
[key: string]: {
|
|
463
|
+
[key: string]: unknown;
|
|
464
|
+
} | Array<unknown> | string | number | boolean | null;
|
|
465
|
+
}> | null;
|
|
466
|
+
/**
|
|
467
|
+
* Per-row metadata when extraction_target is per_table_row
|
|
468
|
+
*/
|
|
469
|
+
row_metadata?: Array<{
|
|
470
|
+
[key: string]: {
|
|
471
|
+
[key: string]: unknown;
|
|
472
|
+
} | Array<unknown> | string | number | boolean | null;
|
|
473
|
+
}> | null;
|
|
474
|
+
}
|
|
475
|
+
export type ExtractDeleteResponse = unknown;
|
|
476
|
+
/**
|
|
477
|
+
* Generated product configuration, shaped as the request body for creating a product configuration.
|
|
478
|
+
*/
|
|
479
|
+
export interface ExtractGenerateSchemaResponse {
|
|
480
|
+
/**
|
|
481
|
+
* Human-readable name for this configuration.
|
|
482
|
+
*/
|
|
483
|
+
name: string;
|
|
484
|
+
/**
|
|
485
|
+
* Product-specific configuration parameters.
|
|
486
|
+
*/
|
|
487
|
+
parameters: ExtractGenerateSchemaResponse.SplitV1Parameters | ExtractGenerateSchemaResponse.ExtractV2Parameters | ExtractGenerateSchemaResponse.ClassifyV2Parameters | ExtractGenerateSchemaResponse.ParseV2Parameters | ExtractGenerateSchemaResponse.UntypedParameters;
|
|
488
|
+
}
|
|
489
|
+
export declare namespace ExtractGenerateSchemaResponse {
|
|
490
|
+
/**
|
|
491
|
+
* Typed parameters for a _split v1_ product configuration.
|
|
492
|
+
*/
|
|
493
|
+
interface SplitV1Parameters {
|
|
494
|
+
/**
|
|
495
|
+
* Categories to split documents into.
|
|
496
|
+
*/
|
|
497
|
+
categories: Array<SplitAPI.SplitCategory>;
|
|
498
|
+
/**
|
|
499
|
+
* Product type.
|
|
500
|
+
*/
|
|
501
|
+
product_type: 'split_v1';
|
|
502
|
+
/**
|
|
503
|
+
* Strategy for splitting documents.
|
|
504
|
+
*/
|
|
505
|
+
splitting_strategy?: SplitV1Parameters.SplittingStrategy;
|
|
506
|
+
}
|
|
507
|
+
namespace SplitV1Parameters {
|
|
508
|
+
/**
|
|
509
|
+
* Strategy for splitting documents.
|
|
510
|
+
*/
|
|
511
|
+
interface SplittingStrategy {
|
|
512
|
+
/**
|
|
513
|
+
* Controls handling of pages that don't match any category. 'include': pages can
|
|
514
|
+
* be grouped as 'uncategorized' and included in results. 'forbid': all pages must
|
|
515
|
+
* be assigned to a defined category. 'omit': pages can be classified as
|
|
516
|
+
* 'uncategorized' but are excluded from results.
|
|
517
|
+
*/
|
|
518
|
+
allow_uncategorized?: 'include' | 'forbid' | 'omit';
|
|
519
|
+
}
|
|
520
|
+
}
|
|
521
|
+
/**
|
|
522
|
+
* Typed parameters for an _extract v2_ product configuration.
|
|
523
|
+
*/
|
|
524
|
+
interface ExtractV2Parameters {
|
|
525
|
+
/**
|
|
526
|
+
* JSON Schema defining the fields to extract. Validate with the /schema/validate
|
|
527
|
+
* endpoint first.
|
|
528
|
+
*/
|
|
529
|
+
data_schema: {
|
|
530
|
+
[key: string]: {
|
|
531
|
+
[key: string]: unknown;
|
|
532
|
+
} | Array<unknown> | string | number | boolean | null;
|
|
533
|
+
};
|
|
534
|
+
/**
|
|
535
|
+
* Product type.
|
|
536
|
+
*/
|
|
537
|
+
product_type: 'extract_v2';
|
|
538
|
+
/**
|
|
539
|
+
* Include citations in results
|
|
540
|
+
*/
|
|
541
|
+
cite_sources?: boolean;
|
|
542
|
+
/**
|
|
543
|
+
* Include confidence scores in results
|
|
544
|
+
*/
|
|
545
|
+
confidence_scores?: boolean;
|
|
546
|
+
/**
|
|
547
|
+
* Extract algorithm version. Use 'latest' or a date string.
|
|
548
|
+
*/
|
|
549
|
+
extract_version?: string;
|
|
550
|
+
/**
|
|
551
|
+
* Granularity of extraction: per_doc returns one object per document, per_page
|
|
552
|
+
* returns one object per page, per_table_row returns one object per table row
|
|
553
|
+
*/
|
|
554
|
+
extraction_target?: 'per_doc' | 'per_page' | 'per_table_row';
|
|
555
|
+
/**
|
|
556
|
+
* ISO 639-1 language code for the document
|
|
557
|
+
*/
|
|
558
|
+
lang?: string;
|
|
559
|
+
/**
|
|
560
|
+
* Maximum number of pages to process. Omit for no limit.
|
|
561
|
+
*/
|
|
562
|
+
max_pages?: number | null;
|
|
563
|
+
/**
|
|
564
|
+
* Saved parse configuration ID to control how the document is parsed before
|
|
565
|
+
* extraction
|
|
566
|
+
*/
|
|
567
|
+
parse_config_id?: string | null;
|
|
568
|
+
/**
|
|
569
|
+
* Parse tier to use before extraction (fast, cost_effective, or agentic)
|
|
570
|
+
*/
|
|
571
|
+
parse_tier?: string | null;
|
|
572
|
+
/**
|
|
573
|
+
* Custom system prompt to guide extraction behavior
|
|
574
|
+
*/
|
|
575
|
+
system_prompt?: string | null;
|
|
576
|
+
/**
|
|
577
|
+
* Comma-separated page numbers or ranges to process (1-based). Omit to process all
|
|
578
|
+
* pages.
|
|
579
|
+
*/
|
|
580
|
+
target_pages?: string | null;
|
|
581
|
+
/**
|
|
582
|
+
* Extract tier: cost_effective (5 credits/page) or agentic (15 credits/page)
|
|
583
|
+
*/
|
|
584
|
+
tier?: 'cost_effective' | 'agentic';
|
|
585
|
+
}
|
|
586
|
+
/**
|
|
587
|
+
* Typed parameters for a _classify v2_ product configuration.
|
|
588
|
+
*/
|
|
589
|
+
interface ClassifyV2Parameters {
|
|
590
|
+
/**
|
|
591
|
+
* Product type.
|
|
592
|
+
*/
|
|
593
|
+
product_type: 'classify_v2';
|
|
594
|
+
/**
|
|
595
|
+
* Classify rules to evaluate against the document (at least one required)
|
|
596
|
+
*/
|
|
597
|
+
rules: Array<ClassifyV2Parameters.Rule>;
|
|
598
|
+
/**
|
|
599
|
+
* Classify execution mode
|
|
600
|
+
*/
|
|
601
|
+
mode?: 'FAST';
|
|
602
|
+
/**
|
|
603
|
+
* Parsing configuration for classify jobs.
|
|
604
|
+
*/
|
|
605
|
+
parsing_configuration?: ClassifyV2Parameters.ParsingConfiguration | null;
|
|
606
|
+
}
|
|
607
|
+
namespace ClassifyV2Parameters {
|
|
608
|
+
/**
|
|
609
|
+
* A rule for classifying documents.
|
|
610
|
+
*/
|
|
611
|
+
interface Rule {
|
|
612
|
+
/**
|
|
613
|
+
* Natural language criteria for matching this rule
|
|
614
|
+
*/
|
|
615
|
+
description: string;
|
|
616
|
+
/**
|
|
617
|
+
* Document type to assign when rule matches
|
|
618
|
+
*/
|
|
619
|
+
type: string;
|
|
620
|
+
}
|
|
621
|
+
/**
|
|
622
|
+
* Parsing configuration for classify jobs.
|
|
623
|
+
*/
|
|
624
|
+
interface ParsingConfiguration {
|
|
625
|
+
/**
|
|
626
|
+
* ISO 639-1 language code for the document
|
|
627
|
+
*/
|
|
628
|
+
lang?: string;
|
|
629
|
+
/**
|
|
630
|
+
* Maximum number of pages to process. Omit for no limit.
|
|
631
|
+
*/
|
|
632
|
+
max_pages?: number | null;
|
|
633
|
+
/**
|
|
634
|
+
* Comma-separated page numbers or ranges to process (1-based). Omit to process all
|
|
635
|
+
* pages.
|
|
636
|
+
*/
|
|
637
|
+
target_pages?: string | null;
|
|
638
|
+
}
|
|
639
|
+
}
|
|
640
|
+
/**
|
|
641
|
+
* Configuration for LlamaParse v2 document parsing.
|
|
642
|
+
*
|
|
643
|
+
* Includes tier selection, processing options, output formatting, page targeting,
|
|
644
|
+
* and webhook delivery. Refer to the LlamaParse documentation for details on each
|
|
645
|
+
* field.
|
|
646
|
+
*/
|
|
647
|
+
interface ParseV2Parameters {
|
|
648
|
+
/**
|
|
649
|
+
* Product type.
|
|
650
|
+
*/
|
|
651
|
+
product_type: 'parse_v2';
|
|
652
|
+
/**
|
|
653
|
+
* Parsing tier: 'fast' (rule-based, cheapest), 'cost_effective' (balanced),
|
|
654
|
+
* 'agentic' (AI-powered with custom prompts), or 'agentic_plus' (premium AI with
|
|
655
|
+
* highest accuracy)
|
|
656
|
+
*/
|
|
657
|
+
tier: 'fast' | 'cost_effective' | 'agentic' | 'agentic_plus';
|
|
658
|
+
/**
|
|
659
|
+
* Tier version. Use 'latest' for the current stable version, or specify a specific
|
|
660
|
+
* version date (e.g., '2025-12-11', '2026-03-27') for reproducible results
|
|
661
|
+
*/
|
|
662
|
+
version: '2025-12-11' | '2025-12-18' | '2025-12-31' | '2026-01-08' | '2026-01-09' | '2026-01-16' | '2026-01-21' | '2026-01-22' | '2026-01-24' | '2026-01-29' | '2026-01-30' | '2026-02-03' | '2026-02-18' | '2026-02-20' | '2026-02-24' | '2026-02-26' | '2026-03-02' | '2026-03-03' | '2026-03-04' | '2026-03-05' | '2026-03-09' | '2026-03-10' | '2026-03-11' | '2026-03-12' | '2026-03-17' | '2026-03-19' | '2026-03-20' | '2026-03-22' | '2026-03-23' | '2026-03-24' | '2026-03-25' | '2026-03-26' | '2026-03-27' | 'latest' | (string & {});
|
|
663
|
+
/**
|
|
664
|
+
* Options for AI-powered parsing tiers (cost_effective, agentic, agentic_plus).
|
|
665
|
+
*
|
|
666
|
+
* These options customize how the AI processes and interprets document content.
|
|
667
|
+
* Only applicable when using non-fast tiers.
|
|
668
|
+
*/
|
|
669
|
+
agentic_options?: ParseV2Parameters.AgenticOptions | null;
|
|
670
|
+
/**
|
|
671
|
+
* Identifier for the client/application making the request. Used for analytics and
|
|
672
|
+
* debugging. Example: 'my-app-v2'
|
|
673
|
+
*/
|
|
674
|
+
client_name?: string | null;
|
|
675
|
+
/**
|
|
676
|
+
* Crop boundaries to process only a portion of each page. Values are ratios 0-1
|
|
677
|
+
* from page edges
|
|
678
|
+
*/
|
|
679
|
+
crop_box?: ParseV2Parameters.CropBox;
|
|
680
|
+
/**
|
|
681
|
+
* Bypass result caching and force re-parsing. Use when document content may have
|
|
682
|
+
* changed or you need fresh results
|
|
683
|
+
*/
|
|
684
|
+
disable_cache?: boolean | null;
|
|
685
|
+
/**
|
|
686
|
+
* Options for fast tier parsing (rule-based, no AI).
|
|
687
|
+
*
|
|
688
|
+
* Fast tier uses deterministic algorithms for text extraction without AI
|
|
689
|
+
* enhancement. It's the fastest and most cost-effective option, best suited for
|
|
690
|
+
* simple documents with standard layouts. Currently has no configurable options
|
|
691
|
+
* but reserved for future expansion.
|
|
692
|
+
*/
|
|
693
|
+
fast_options?: unknown | null;
|
|
694
|
+
/**
|
|
695
|
+
* Format-specific options (HTML, PDF, spreadsheet, presentation). Applied based on
|
|
696
|
+
* detected input file type
|
|
697
|
+
*/
|
|
698
|
+
input_options?: ParseV2Parameters.InputOptions;
|
|
699
|
+
/**
|
|
700
|
+
* Output formatting options for markdown, text, and extracted images
|
|
701
|
+
*/
|
|
702
|
+
output_options?: ParseV2Parameters.OutputOptions;
|
|
703
|
+
/**
|
|
704
|
+
* Page selection: limit total pages or specify exact pages to process
|
|
705
|
+
*/
|
|
706
|
+
page_ranges?: ParseV2Parameters.PageRanges;
|
|
707
|
+
/**
|
|
708
|
+
* Job execution controls including timeouts and failure thresholds
|
|
709
|
+
*/
|
|
710
|
+
processing_control?: ParseV2Parameters.ProcessingControl;
|
|
711
|
+
/**
|
|
712
|
+
* Document processing options including OCR, table extraction, and chart parsing
|
|
713
|
+
*/
|
|
714
|
+
processing_options?: ParseV2Parameters.ProcessingOptions;
|
|
715
|
+
/**
|
|
716
|
+
* Webhook endpoints for job status notifications. Multiple webhooks can be
|
|
717
|
+
* configured for different events or services
|
|
718
|
+
*/
|
|
719
|
+
webhook_configurations?: Array<ParseV2Parameters.WebhookConfiguration>;
|
|
720
|
+
}
|
|
721
|
+
namespace ParseV2Parameters {
|
|
722
|
+
/**
|
|
723
|
+
* Options for AI-powered parsing tiers (cost_effective, agentic, agentic_plus).
|
|
724
|
+
*
|
|
725
|
+
* These options customize how the AI processes and interprets document content.
|
|
726
|
+
* Only applicable when using non-fast tiers.
|
|
727
|
+
*/
|
|
728
|
+
interface AgenticOptions {
|
|
729
|
+
/**
|
|
730
|
+
* Custom instructions for the AI parser. Use to guide extraction behavior, specify
|
|
731
|
+
* output formatting, or provide domain-specific context. Example: 'Extract
|
|
732
|
+
* financial tables with currency symbols. Format dates as YYYY-MM-DD.'
|
|
733
|
+
*/
|
|
734
|
+
custom_prompt?: string | null;
|
|
735
|
+
}
|
|
736
|
+
/**
|
|
737
|
+
* Crop boundaries to process only a portion of each page. Values are ratios 0-1
|
|
738
|
+
* from page edges
|
|
739
|
+
*/
|
|
740
|
+
interface CropBox {
|
|
741
|
+
/**
|
|
742
|
+
* Bottom boundary as ratio (0-1). 0=top edge, 1=bottom edge. Content below this
|
|
743
|
+
* line is excluded
|
|
744
|
+
*/
|
|
745
|
+
bottom?: number | null;
|
|
746
|
+
/**
|
|
747
|
+
* Left boundary as ratio (0-1). 0=left edge, 1=right edge. Content left of this
|
|
748
|
+
* line is excluded
|
|
749
|
+
*/
|
|
750
|
+
left?: number | null;
|
|
751
|
+
/**
|
|
752
|
+
* Right boundary as ratio (0-1). 0=left edge, 1=right edge. Content right of this
|
|
753
|
+
* line is excluded
|
|
754
|
+
*/
|
|
755
|
+
right?: number | null;
|
|
756
|
+
/**
|
|
757
|
+
* Top boundary as ratio (0-1). 0=top edge, 1=bottom edge. Content above this line
|
|
758
|
+
* is excluded
|
|
759
|
+
*/
|
|
760
|
+
top?: number | null;
|
|
761
|
+
}
|
|
762
|
+
/**
|
|
763
|
+
* Format-specific options (HTML, PDF, spreadsheet, presentation). Applied based on
|
|
764
|
+
* detected input file type
|
|
765
|
+
*/
|
|
766
|
+
interface InputOptions {
|
|
767
|
+
/**
|
|
768
|
+
* HTML/web page parsing options (applies to .html, .htm files)
|
|
769
|
+
*/
|
|
770
|
+
html?: InputOptions.HTML;
|
|
771
|
+
/**
|
|
772
|
+
* PDF-specific parsing options (applies to .pdf files)
|
|
773
|
+
*/
|
|
774
|
+
pdf?: unknown;
|
|
775
|
+
/**
|
|
776
|
+
* Presentation parsing options (applies to .pptx, .ppt, .odp, .key files)
|
|
777
|
+
*/
|
|
778
|
+
presentation?: InputOptions.Presentation;
|
|
779
|
+
/**
|
|
780
|
+
* Spreadsheet parsing options (applies to .xlsx, .xls, .csv, .ods files)
|
|
781
|
+
*/
|
|
782
|
+
spreadsheet?: InputOptions.Spreadsheet;
|
|
783
|
+
}
|
|
784
|
+
namespace InputOptions {
|
|
785
|
+
/**
|
|
786
|
+
* HTML/web page parsing options (applies to .html, .htm files)
|
|
787
|
+
*/
|
|
788
|
+
interface HTML {
|
|
789
|
+
/**
|
|
790
|
+
* Force all HTML elements to be visible by overriding CSS display/visibility
|
|
791
|
+
* properties. Useful for parsing pages with hidden content or collapsed sections
|
|
792
|
+
*/
|
|
793
|
+
make_all_elements_visible?: boolean | null;
|
|
794
|
+
/**
|
|
795
|
+
* Remove fixed-position elements (headers, footers, floating buttons) that appear
|
|
796
|
+
* on every page render
|
|
797
|
+
*/
|
|
798
|
+
remove_fixed_elements?: boolean | null;
|
|
799
|
+
/**
|
|
800
|
+
* Remove navigation elements (nav bars, sidebars, menus) to focus on main content
|
|
801
|
+
*/
|
|
802
|
+
remove_navigation_elements?: boolean | null;
|
|
803
|
+
}
|
|
804
|
+
/**
|
|
805
|
+
* Presentation parsing options (applies to .pptx, .ppt, .odp, .key files)
|
|
806
|
+
*/
|
|
807
|
+
interface Presentation {
|
|
808
|
+
/**
|
|
809
|
+
* Extract content positioned outside the visible slide area. Some presentations
|
|
810
|
+
* have hidden notes or content that extends beyond slide boundaries
|
|
811
|
+
*/
|
|
812
|
+
out_of_bounds_content?: boolean | null;
|
|
813
|
+
/**
|
|
814
|
+
* Skip extraction of embedded chart data tables. When true, only the visual
|
|
815
|
+
* representation of charts is captured, not the underlying data
|
|
816
|
+
*/
|
|
817
|
+
skip_embedded_data?: boolean | null;
|
|
818
|
+
}
|
|
819
|
+
/**
|
|
820
|
+
* Spreadsheet parsing options (applies to .xlsx, .xls, .csv, .ods files)
|
|
821
|
+
*/
|
|
822
|
+
interface Spreadsheet {
|
|
823
|
+
/**
|
|
824
|
+
* Detect and extract multiple tables within a single sheet. Useful when
|
|
825
|
+
* spreadsheets contain several data regions separated by blank rows/columns
|
|
826
|
+
*/
|
|
827
|
+
detect_sub_tables_in_sheets?: boolean | null;
|
|
828
|
+
/**
|
|
829
|
+
* Compute formula results instead of extracting formula text. Use when you need
|
|
830
|
+
* calculated values rather than formula definitions
|
|
831
|
+
*/
|
|
832
|
+
force_formula_computation_in_sheets?: boolean | null;
|
|
833
|
+
/**
|
|
834
|
+
* Parse hidden sheets in addition to visible ones. By default, hidden sheets are
|
|
835
|
+
* skipped
|
|
836
|
+
*/
|
|
837
|
+
include_hidden_sheets?: boolean | null;
|
|
838
|
+
}
|
|
839
|
+
}
|
|
840
|
+
/**
|
|
841
|
+
* Output formatting options for markdown, text, and extracted images
|
|
842
|
+
*/
|
|
843
|
+
interface OutputOptions {
|
|
844
|
+
/**
|
|
845
|
+
* Extract the printed page number as it appears in the document (e.g., 'Page 5 of
|
|
846
|
+
* 10', 'v', 'A-3'). Useful for referencing original page numbers
|
|
847
|
+
*/
|
|
848
|
+
extract_printed_page_number?: boolean | null;
|
|
849
|
+
/**
|
|
850
|
+
* Image categories to extract and save. Options: 'screenshot' (full page renders
|
|
851
|
+
* useful for visual QA), 'embedded' (images found within the document), 'layout'
|
|
852
|
+
* (cropped regions from layout detection like figures and diagrams). Empty list
|
|
853
|
+
* saves no images
|
|
854
|
+
*/
|
|
855
|
+
images_to_save?: Array<'screenshot' | 'embedded' | 'layout'>;
|
|
856
|
+
/**
|
|
857
|
+
* Markdown formatting options including table styles and link annotations
|
|
858
|
+
*/
|
|
859
|
+
markdown?: OutputOptions.Markdown;
|
|
860
|
+
/**
|
|
861
|
+
* Spatial text output options for preserving document layout structure
|
|
862
|
+
*/
|
|
863
|
+
spatial_text?: OutputOptions.SpatialText;
|
|
864
|
+
/**
|
|
865
|
+
* Options for exporting tables as XLSX spreadsheets
|
|
866
|
+
*/
|
|
867
|
+
tables_as_spreadsheet?: OutputOptions.TablesAsSpreadsheet;
|
|
868
|
+
}
|
|
869
|
+
namespace OutputOptions {
|
|
870
|
+
/**
|
|
871
|
+
* Markdown formatting options including table styles and link annotations
|
|
872
|
+
*/
|
|
873
|
+
interface Markdown {
|
|
874
|
+
/**
|
|
875
|
+
* Add link annotations to markdown output in the format [text](url). When false,
|
|
876
|
+
* only the link text is included
|
|
877
|
+
*/
|
|
878
|
+
annotate_links?: boolean | null;
|
|
879
|
+
/**
|
|
880
|
+
* Embed images directly in markdown as base64 data URIs instead of extracting them
|
|
881
|
+
* as separate files. Useful for self-contained markdown output
|
|
882
|
+
*/
|
|
883
|
+
inline_images?: boolean | null;
|
|
884
|
+
/**
|
|
885
|
+
* Table formatting options including markdown vs HTML format and merging behavior
|
|
886
|
+
*/
|
|
887
|
+
tables?: Markdown.Tables;
|
|
888
|
+
}
|
|
889
|
+
namespace Markdown {
|
|
890
|
+
/**
|
|
891
|
+
* Table formatting options including markdown vs HTML format and merging behavior
|
|
892
|
+
*/
|
|
893
|
+
interface Tables {
|
|
894
|
+
/**
|
|
895
|
+
* Remove extra whitespace padding in markdown table cells for more compact output
|
|
896
|
+
*/
|
|
897
|
+
compact_markdown_tables?: boolean | null;
|
|
898
|
+
/**
|
|
899
|
+
* Separator string for multiline cell content in markdown tables. Example:
|
|
900
|
+
* '<br>' to preserve line breaks, ' ' to join with spaces
|
|
901
|
+
*/
|
|
902
|
+
markdown_table_multiline_separator?: string | null;
|
|
903
|
+
/**
|
|
904
|
+
* Automatically merge tables that span multiple pages into a single table. The
|
|
905
|
+
* merged table appears on the first page with merged_from_pages metadata
|
|
906
|
+
*/
|
|
907
|
+
merge_continued_tables?: boolean | null;
|
|
908
|
+
/**
|
|
909
|
+
* Output tables as markdown pipe tables instead of HTML <table> tags.
|
|
910
|
+
* Markdown tables are simpler but cannot represent complex structures like merged
|
|
911
|
+
* cells
|
|
912
|
+
*/
|
|
913
|
+
output_tables_as_markdown?: boolean | null;
|
|
914
|
+
}
|
|
915
|
+
}
|
|
916
|
+
/**
|
|
917
|
+
* Spatial text output options for preserving document layout structure
|
|
918
|
+
*/
|
|
919
|
+
interface SpatialText {
|
|
920
|
+
/**
|
|
921
|
+
* Keep multi-column layouts intact instead of linearizing columns into sequential
|
|
922
|
+
* text. Automatically enabled for non-fast tiers
|
|
923
|
+
*/
|
|
924
|
+
do_not_unroll_columns?: boolean | null;
|
|
925
|
+
/**
|
|
926
|
+
* Maintain consistent text column alignment across page boundaries. Automatically
|
|
927
|
+
* enabled for document-level parsing modes
|
|
928
|
+
*/
|
|
929
|
+
preserve_layout_alignment_across_pages?: boolean | null;
|
|
930
|
+
/**
|
|
931
|
+
* Include text below the normal size threshold. Useful for footnotes, watermarks,
|
|
932
|
+
* or fine print that might otherwise be filtered out
|
|
933
|
+
*/
|
|
934
|
+
preserve_very_small_text?: boolean | null;
|
|
935
|
+
}
|
|
936
|
+
/**
|
|
937
|
+
* Options for exporting tables as XLSX spreadsheets
|
|
938
|
+
*/
|
|
939
|
+
interface TablesAsSpreadsheet {
|
|
940
|
+
/**
|
|
941
|
+
* Whether this option is enabled
|
|
942
|
+
*/
|
|
943
|
+
enable?: boolean | null;
|
|
944
|
+
/**
|
|
945
|
+
* Automatically generate descriptive sheet names from table context (headers,
|
|
946
|
+
* surrounding text) instead of using generic names like 'Table_1'
|
|
947
|
+
*/
|
|
948
|
+
guess_sheet_name?: boolean;
|
|
949
|
+
}
|
|
950
|
+
}
|
|
951
|
+
/**
|
|
952
|
+
* Page selection: limit total pages or specify exact pages to process
|
|
953
|
+
*/
|
|
954
|
+
interface PageRanges {
|
|
955
|
+
/**
|
|
956
|
+
* Maximum number of pages to process. Pages are processed in order starting from
|
|
957
|
+
* page 1. If both max_pages and target_pages are set, target_pages takes
|
|
958
|
+
* precedence
|
|
959
|
+
*/
|
|
960
|
+
max_pages?: number | null;
|
|
961
|
+
/**
|
|
962
|
+
* Comma-separated list of specific pages to process using 1-based indexing.
|
|
963
|
+
* Supports individual pages and ranges. Examples: '1,3,5' (pages 1, 3, 5), '1-5'
|
|
964
|
+
* (pages 1 through 5 inclusive), '1,3,5-8,10' (pages 1, 3, 5-8, and 10). Pages are
|
|
965
|
+
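* sorted and deduplicated automatically. Duplicate pages cause an error (NOTE(review): this contradicts the automatic deduplication stated just before; confirm which behavior the API implements)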
|
|
966
|
+
*/
|
|
967
|
+
target_pages?: string | null;
|
|
968
|
+
}
|
|
969
|
+
/**
|
|
970
|
+
* Job execution controls including timeouts and failure thresholds
|
|
971
|
+
*/
|
|
972
|
+
interface ProcessingControl {
|
|
973
|
+
/**
|
|
974
|
+
* Quality thresholds that determine when a job should fail vs complete with
|
|
975
|
+
* partial results
|
|
976
|
+
*/
|
|
977
|
+
job_failure_conditions?: ProcessingControl.JobFailureConditions;
|
|
978
|
+
/**
|
|
979
|
+
* Timeout settings for job execution. Increase for large or complex documents
|
|
980
|
+
*/
|
|
981
|
+
timeouts?: ProcessingControl.Timeouts;
|
|
982
|
+
}
|
|
983
|
+
namespace ProcessingControl {
|
|
984
|
+
/**
|
|
985
|
+
* Quality thresholds that determine when a job should fail vs complete with
|
|
986
|
+
* partial results
|
|
987
|
+
*/
|
|
988
|
+
interface JobFailureConditions {
|
|
989
|
+
/**
|
|
990
|
+
* Maximum ratio of pages allowed to fail before the job fails (0-1). Example: 0.1
|
|
991
|
+
* means job fails if more than 10% of pages fail. Default is 0.05 (5%)
|
|
992
|
+
*/
|
|
993
|
+
allowed_page_failure_ratio?: number | null;
|
|
994
|
+
/**
|
|
995
|
+
* Fail the job if a problematic font is detected that may cause incorrect text
|
|
996
|
+
* extraction. Buggy fonts can produce garbled or missing characters
|
|
997
|
+
*/
|
|
998
|
+
fail_on_buggy_font?: boolean | null;
|
|
999
|
+
/**
|
|
1000
|
+
* Fail the entire job if any embedded image cannot be extracted. By default, image
|
|
1001
|
+
* extraction errors are logged but don't fail the job
|
|
1002
|
+
*/
|
|
1003
|
+
fail_on_image_extraction_error?: boolean | null;
|
|
1004
|
+
/**
|
|
1005
|
+
* Fail the entire job if OCR fails on any image. By default, OCR errors result in
|
|
1006
|
+
* empty text for that image
|
|
1007
|
+
*/
|
|
1008
|
+
fail_on_image_ocr_error?: boolean | null;
|
|
1009
|
+
/**
|
|
1010
|
+
* Fail the entire job if markdown cannot be reconstructed for any page. By
|
|
1011
|
+
* default, failed pages use fallback text extraction
|
|
1012
|
+
*/
|
|
1013
|
+
fail_on_markdown_reconstruction_error?: boolean | null;
|
|
1014
|
+
}
|
|
1015
|
+
/**
|
|
1016
|
+
* Timeout settings for job execution. Increase for large or complex documents
|
|
1017
|
+
*/
|
|
1018
|
+
interface Timeouts {
|
|
1019
|
+
/**
|
|
1020
|
+
* Base timeout for the job in seconds (max 1800 = 30 minutes). This is the minimum
|
|
1021
|
+
* time allowed regardless of document size
|
|
1022
|
+
*/
|
|
1023
|
+
base_in_seconds?: number | null;
|
|
1024
|
+
/**
|
|
1025
|
+
* Additional timeout per page in seconds (max 300 = 5 minutes). Total timeout =
|
|
1026
|
+
* base + (this value × page count)
|
|
1027
|
+
*/
|
|
1028
|
+
extra_time_per_page_in_seconds?: number | null;
|
|
1029
|
+
}
|
|
1030
|
+
}
|
|
1031
|
+
/**
|
|
1032
|
+
* Document processing options including OCR, table extraction, and chart parsing
|
|
1033
|
+
*/
|
|
1034
|
+
interface ProcessingOptions {
|
|
1035
|
+
/**
|
|
1036
|
+
* Use aggressive heuristics to detect table boundaries, even without visible
|
|
1037
|
+
* borders. Useful for documents with borderless or complex tables
|
|
1038
|
+
*/
|
|
1039
|
+
aggressive_table_extraction?: boolean | null;
|
|
1040
|
+
/**
|
|
1041
|
+
* Conditional processing rules that apply different parsing options based on page
|
|
1042
|
+
* content, document structure, or filename patterns. Each entry defines trigger
|
|
1043
|
+
* conditions and the parsing configuration to apply when triggered
|
|
1044
|
+
*/
|
|
1045
|
+
auto_mode_configuration?: Array<ProcessingOptions.AutoModeConfiguration> | null;
|
|
1046
|
+
/**
|
|
1047
|
+
* Cost optimizer configuration for reducing parsing costs on simpler pages.
|
|
1048
|
+
*
|
|
1049
|
+
* When enabled, the parser analyzes each page and routes simpler pages to faster,
|
|
1050
|
+
* cheaper processing while preserving quality for complex pages. Only works with
|
|
1051
|
+
* 'agentic' or 'agentic_plus' tiers.
|
|
1052
|
+
*/
|
|
1053
|
+
cost_optimizer?: ProcessingOptions.CostOptimizer | null;
|
|
1054
|
+
/**
|
|
1055
|
+
* Disable automatic heuristics including outlined table extraction and adaptive
|
|
1056
|
+
* long table handling. Use when heuristics produce incorrect results
|
|
1057
|
+
*/
|
|
1058
|
+
disable_heuristics?: boolean | null;
|
|
1059
|
+
/**
|
|
1060
|
+
* Options for ignoring specific text types (diagonal, hidden, text in images)
|
|
1061
|
+
*/
|
|
1062
|
+
ignore?: ProcessingOptions.Ignore;
|
|
1063
|
+
/**
|
|
1064
|
+
* OCR configuration including language detection settings
|
|
1065
|
+
*/
|
|
1066
|
+
ocr_parameters?: ProcessingOptions.OcrParameters;
|
|
1067
|
+
/**
|
|
1068
|
+
* Enable AI-powered chart analysis. Modes: 'efficient' (fast, lower cost),
|
|
1069
|
+
* 'agentic' (balanced), 'agentic_plus' (highest accuracy). Automatically enables
|
|
1070
|
+
* extract_layout and precise_bounding_box when set
|
|
1071
|
+
*/
|
|
1072
|
+
specialized_chart_parsing?: 'agentic_plus' | 'agentic' | 'efficient' | null;
|
|
1073
|
+
}
|
|
1074
|
+
namespace ProcessingOptions {
|
|
1075
|
+
/**
|
|
1076
|
+
* A single auto mode rule with trigger conditions and parsing configuration.
|
|
1077
|
+
*
|
|
1078
|
+
* Auto mode allows conditional parsing where different configurations are applied
|
|
1079
|
+
* based on page content, structure, or filename. When triggers match, the
|
|
1080
|
+
* parsing_conf overrides default settings for that page.
|
|
1081
|
+
*/
|
|
1082
|
+
interface AutoModeConfiguration {
|
|
1083
|
+
/**
|
|
1084
|
+
* Parsing configuration to apply when trigger conditions are met
|
|
1085
|
+
*/
|
|
1086
|
+
parsing_conf: AutoModeConfiguration.ParsingConf;
|
|
1087
|
+
/**
|
|
1088
|
+
* Single glob pattern to match against filename
|
|
1089
|
+
*/
|
|
1090
|
+
filename_match_glob?: string | null;
|
|
1091
|
+
/**
|
|
1092
|
+
* List of glob patterns to match against filename
|
|
1093
|
+
*/
|
|
1094
|
+
filename_match_glob_list?: Array<string> | null;
|
|
1095
|
+
/**
|
|
1096
|
+
* Regex pattern to match against filename
|
|
1097
|
+
*/
|
|
1098
|
+
filename_regexp?: string | null;
|
|
1099
|
+
/**
|
|
1100
|
+
* Regex mode flags (e.g., 'i' for case-insensitive)
|
|
1101
|
+
*/
|
|
1102
|
+
filename_regexp_mode?: string | null;
|
|
1103
|
+
/**
|
|
1104
|
+
* Trigger if page contains a full-page image (scanned page detection)
|
|
1105
|
+
*/
|
|
1106
|
+
full_page_image_in_page?: boolean | null;
|
|
1107
|
+
/**
|
|
1108
|
+
* Threshold for full page image detection (0.0-1.0, default 0.8)
|
|
1109
|
+
*/
|
|
1110
|
+
full_page_image_in_page_threshold?: number | string | null;
|
|
1111
|
+
/**
|
|
1112
|
+
* Trigger if page contains non-screenshot images
|
|
1113
|
+
*/
|
|
1114
|
+
image_in_page?: boolean | null;
|
|
1115
|
+
/**
|
|
1116
|
+
* Trigger if page contains this layout element type
|
|
1117
|
+
*/
|
|
1118
|
+
layout_element_in_page?: string | null;
|
|
1119
|
+
/**
|
|
1120
|
+
* Confidence threshold for layout element detection
|
|
1121
|
+
*/
|
|
1122
|
+
layout_element_in_page_confidence_threshold?: number | string | null;
|
|
1123
|
+
/**
|
|
1124
|
+
* Trigger if page has at least N charts
|
|
1125
|
+
*/
|
|
1126
|
+
page_contains_at_least_n_charts?: number | string | null;
|
|
1127
|
+
/**
|
|
1128
|
+
* Trigger if page has at least N images
|
|
1129
|
+
*/
|
|
1130
|
+
page_contains_at_least_n_images?: number | string | null;
|
|
1131
|
+
/**
|
|
1132
|
+
* Trigger if page has at least N layout elements
|
|
1133
|
+
*/
|
|
1134
|
+
page_contains_at_least_n_layout_elements?: number | string | null;
|
|
1135
|
+
/**
|
|
1136
|
+
* Trigger if page has at least N lines
|
|
1137
|
+
*/
|
|
1138
|
+
page_contains_at_least_n_lines?: number | string | null;
|
|
1139
|
+
/**
|
|
1140
|
+
* Trigger if page has at least N links
|
|
1141
|
+
*/
|
|
1142
|
+
page_contains_at_least_n_links?: number | string | null;
|
|
1143
|
+
/**
|
|
1144
|
+
* Trigger if page has at least N numeric words
|
|
1145
|
+
*/
|
|
1146
|
+
page_contains_at_least_n_numbers?: number | string | null;
|
|
1147
|
+
/**
|
|
1148
|
+
* Trigger if page has at least N% numeric words
|
|
1149
|
+
*/
|
|
1150
|
+
page_contains_at_least_n_percent_numbers?: number | string | null;
|
|
1151
|
+
/**
|
|
1152
|
+
* Trigger if page has at least N tables
|
|
1153
|
+
*/
|
|
1154
|
+
page_contains_at_least_n_tables?: number | string | null;
|
|
1155
|
+
/**
|
|
1156
|
+
* Trigger if page has at least N words
|
|
1157
|
+
*/
|
|
1158
|
+
page_contains_at_least_n_words?: number | string | null;
|
|
1159
|
+
/**
|
|
1160
|
+
* Trigger if page has at most N charts
|
|
1161
|
+
*/
|
|
1162
|
+
page_contains_at_most_n_charts?: number | string | null;
|
|
1163
|
+
/**
|
|
1164
|
+
* Trigger if page has at most N images
|
|
1165
|
+
*/
|
|
1166
|
+
page_contains_at_most_n_images?: number | string | null;
|
|
1167
|
+
/**
|
|
1168
|
+
* Trigger if page has at most N layout elements
|
|
1169
|
+
*/
|
|
1170
|
+
page_contains_at_most_n_layout_elements?: number | string | null;
|
|
1171
|
+
/**
|
|
1172
|
+
* Trigger if page has at most N lines
|
|
1173
|
+
*/
|
|
1174
|
+
page_contains_at_most_n_lines?: number | string | null;
|
|
1175
|
+
/**
|
|
1176
|
+
* Trigger if page has at most N links
|
|
1177
|
+
*/
|
|
1178
|
+
page_contains_at_most_n_links?: number | string | null;
|
|
1179
|
+
/**
|
|
1180
|
+
* Trigger if page has at most N numeric words
|
|
1181
|
+
*/
|
|
1182
|
+
page_contains_at_most_n_numbers?: number | string | null;
|
|
1183
|
+
/**
|
|
1184
|
+
* Trigger if page has at most N% numeric words
|
|
1185
|
+
*/
|
|
1186
|
+
page_contains_at_most_n_percent_numbers?: number | string | null;
|
|
1187
|
+
/**
|
|
1188
|
+
* Trigger if page has at most N tables
|
|
1189
|
+
*/
|
|
1190
|
+
page_contains_at_most_n_tables?: number | string | null;
|
|
1191
|
+
/**
|
|
1192
|
+
* Trigger if page has at most N words
|
|
1193
|
+
*/
|
|
1194
|
+
page_contains_at_most_n_words?: number | string | null;
|
|
1195
|
+
/**
|
|
1196
|
+
* Trigger if page has more than N characters
|
|
1197
|
+
*/
|
|
1198
|
+
page_longer_than_n_chars?: number | string | null;
|
|
1199
|
+
/**
|
|
1200
|
+
* Trigger on pages with markdown extraction errors
|
|
1201
|
+
*/
|
|
1202
|
+
page_md_error?: boolean | null;
|
|
1203
|
+
/**
|
|
1204
|
+
* Trigger if page has fewer than N characters
|
|
1205
|
+
*/
|
|
1206
|
+
page_shorter_than_n_chars?: number | string | null;
|
|
1207
|
+
/**
|
|
1208
|
+
* Regex pattern to match in page content
|
|
1209
|
+
*/
|
|
1210
|
+
regexp_in_page?: string | null;
|
|
1211
|
+
/**
|
|
1212
|
+
* Regex mode flags for regexp_in_page
|
|
1213
|
+
*/
|
|
1214
|
+
regexp_in_page_mode?: string | null;
|
|
1215
|
+
/**
|
|
1216
|
+
* Trigger if page contains a table
|
|
1217
|
+
*/
|
|
1218
|
+
table_in_page?: boolean | null;
|
|
1219
|
+
/**
|
|
1220
|
+
* Trigger if page text/markdown contains this string
|
|
1221
|
+
*/
|
|
1222
|
+
text_in_page?: string | null;
|
|
1223
|
+
/**
|
|
1224
|
+
* How to combine multiple trigger conditions: 'and' (all conditions must match,
|
|
1225
|
+
* this is the default) or 'or' (any single condition can trigger)
|
|
1226
|
+
*/
|
|
1227
|
+
trigger_mode?: string | null;
|
|
1228
|
+
}
|
|
1229
|
+
namespace AutoModeConfiguration {
|
|
1230
|
+
/**
|
|
1231
|
+
* Parsing configuration to apply when trigger conditions are met
|
|
1232
|
+
*/
|
|
1233
|
+
interface ParsingConf {
|
|
1234
|
+
/**
|
|
1235
|
+
* Whether to use adaptive long table handling
|
|
1236
|
+
*/
|
|
1237
|
+
adaptive_long_table?: boolean | null;
|
|
1238
|
+
/**
|
|
1239
|
+
* Whether to use aggressive table extraction
|
|
1240
|
+
*/
|
|
1241
|
+
aggressive_table_extraction?: boolean | null;
|
|
1242
|
+
/**
|
|
1243
|
+
* Crop box options for auto mode parsing configuration.
|
|
1244
|
+
*/
|
|
1245
|
+
crop_box?: ParsingConf.CropBox | null;
|
|
1246
|
+
/**
|
|
1247
|
+
* Custom AI instructions for matched pages. Overrides the base custom_prompt
|
|
1248
|
+
*/
|
|
1249
|
+
custom_prompt?: string | null;
|
|
1250
|
+
/**
|
|
1251
|
+
* Whether to extract layout information
|
|
1252
|
+
*/
|
|
1253
|
+
extract_layout?: boolean | null;
|
|
1254
|
+
/**
|
|
1255
|
+
* Whether to use high resolution OCR
|
|
1256
|
+
*/
|
|
1257
|
+
high_res_ocr?: boolean | null;
|
|
1258
|
+
/**
|
|
1259
|
+
* Ignore options for auto mode parsing configuration.
|
|
1260
|
+
*/
|
|
1261
|
+
ignore?: ParsingConf.Ignore | null;
|
|
1262
|
+
/**
|
|
1263
|
+
* Primary language of the document
|
|
1264
|
+
*/
|
|
1265
|
+
language?: string | null;
|
|
1266
|
+
/**
|
|
1267
|
+
* Whether to use outlined table extraction
|
|
1268
|
+
*/
|
|
1269
|
+
outlined_table_extraction?: boolean | null;
|
|
1270
|
+
/**
|
|
1271
|
+
* Presentation-specific options for auto mode parsing configuration.
|
|
1272
|
+
*/
|
|
1273
|
+
presentation?: ParsingConf.Presentation | null;
|
|
1274
|
+
/**
|
|
1275
|
+
* Spatial text options for auto mode parsing configuration.
|
|
1276
|
+
*/
|
|
1277
|
+
spatial_text?: ParsingConf.SpatialText | null;
|
|
1278
|
+
/**
|
|
1279
|
+
* Enable specialized chart parsing with the specified mode
|
|
1280
|
+
*/
|
|
1281
|
+
specialized_chart_parsing?: 'agentic_plus' | 'agentic' | 'efficient' | null;
|
|
1282
|
+
/**
|
|
1283
|
+
* Override the parsing tier for matched pages. Must be paired with version
|
|
1284
|
+
*/
|
|
1285
|
+
tier?: 'fast' | 'cost_effective' | 'agentic' | 'agentic_plus' | null;
|
|
1286
|
+
/**
|
|
1287
|
+
* Tier version when overriding tier. Required when tier is specified
|
|
1288
|
+
*/
|
|
1289
|
+
version?: '2025-12-11' | '2025-12-18' | '2025-12-31' | '2026-01-08' | '2026-01-09' | '2026-01-16' | '2026-01-21' | '2026-01-22' | '2026-01-24' | '2026-01-29' | '2026-01-30' | '2026-02-03' | '2026-02-18' | '2026-02-20' | '2026-02-24' | '2026-02-26' | '2026-03-02' | '2026-03-03' | '2026-03-04' | '2026-03-05' | '2026-03-09' | '2026-03-10' | '2026-03-11' | '2026-03-12' | '2026-03-17' | '2026-03-19' | '2026-03-20' | '2026-03-22' | '2026-03-23' | '2026-03-24' | '2026-03-25' | '2026-03-26' | '2026-03-27' | 'latest' | (string & {}) | null;
|
|
1290
|
+
}
|
|
1291
|
+
namespace ParsingConf {
    /**
     * Crop box options for auto mode parsing configuration.
     *
     * All four boundaries are optional and nullable; each one is a ratio (0-1).
     */
    interface CropBox {
        /**
         * Bottom boundary of crop box as ratio (0-1)
         */
        bottom?: number | null;
        /**
         * Left boundary of crop box as ratio (0-1)
         */
        left?: number | null;
        /**
         * Right boundary of crop box as ratio (0-1)
         */
        right?: number | null;
        /**
         * Top boundary of crop box as ratio (0-1)
         */
        top?: number | null;
    }
    /**
     * Ignore options for auto mode parsing configuration.
     */
    interface Ignore {
        /**
         * Whether to ignore diagonal text in the document
         */
        ignore_diagonal_text?: boolean | null;
        /**
         * Whether to ignore hidden text in the document
         */
        ignore_hidden_text?: boolean | null;
    }
    /**
     * Presentation-specific options for auto mode parsing configuration.
     */
    interface Presentation {
        /**
         * Extract out of bounds content in presentation slides
         */
        out_of_bounds_content?: boolean | null;
        /**
         * Skip extraction of embedded data for charts in presentation slides
         */
        skip_embedded_data?: boolean | null;
    }
    /**
     * Spatial text options for auto mode parsing configuration.
     */
    interface SpatialText {
        /**
         * Keep column structure intact without unrolling
         */
        do_not_unroll_columns?: boolean | null;
        /**
         * Preserve text alignment across page boundaries
         */
        preserve_layout_alignment_across_pages?: boolean | null;
        /**
         * Include very small text in spatial output
         */
        preserve_very_small_text?: boolean | null;
    }
}
|
|
1357
|
+
}
|
|
1358
|
+
/**
 * Cost optimizer configuration for reducing parsing costs on simpler pages.
 *
 * When enabled, the parser analyzes each page and routes simpler pages to faster,
 * cheaper processing while preserving quality for complex pages. Only works with
 * 'agentic' or 'agentic_plus' tiers.
 */
interface CostOptimizer {
    /**
     * Enable cost-optimized parsing. Routes simpler pages to faster processing while
     * complex pages use full AI analysis. May reduce speed on some documents.
     * IMPORTANT: Only available with 'agentic' or 'agentic_plus' tiers.
     */
    enable?: boolean | null;
}
|
|
1373
|
+
/**
 * Options for ignoring specific text types (diagonal, hidden, text in images).
 *
 * Every flag is optional and nullable.
 */
interface Ignore {
    /**
     * Skip text rotated at an angle (not horizontal/vertical). Useful for ignoring
     * watermarks or decorative angled text
     */
    ignore_diagonal_text?: boolean | null;
    /**
     * Skip text marked as hidden in the document structure. Some PDFs contain
     * invisible text layers used for accessibility or search indexing
     */
    ignore_hidden_text?: boolean | null;
    /**
     * Skip OCR text extraction from embedded images. Use when images contain
     * irrelevant text (watermarks, logos) that shouldn't be in the output
     */
    ignore_text_in_image?: boolean | null;
}
|
|
1393
|
+
/**
 * OCR configuration including language detection settings
 */
interface OcrParameters {
    /**
     * Languages to use for OCR text recognition. Specify multiple languages if
     * document contains mixed-language content. Order matters - put primary language
     * first. Example: ['en', 'es'] for English with Spanish
     */
    languages?: Array<ParsingAPI.ParsingLanguages> | null;
}
|
|
1404
|
+
}
|
|
1405
|
+
/**
 * Webhook configuration for receiving parsing job notifications.
 *
 * Webhooks are called when specified events occur during job processing. Configure
 * multiple webhook configurations to send to different endpoints.
 */
interface WebhookConfiguration {
    /**
     * Events that trigger this webhook. Options: 'parse.success' (job completed),
     * 'parse.failure' (job failed), 'parse.partial' (some pages failed). If not
     * specified, webhook fires for all events.
     *
     * NOTE(review): the event names listed here differ from the literal union used
     * by `ExtractCreateParams.WebhookConfiguration` ('parse.error',
     * 'parse.partial_success') - confirm the accepted values against the API.
     */
    webhook_events?: Array<string> | null;
    /**
     * Custom HTTP headers to include in webhook requests. Use for authentication
     * tokens or custom routing. Example: {'Authorization': 'Bearer xyz'}
     */
    webhook_headers?: {
        [key: string]: unknown;
    } | null;
    /**
     * HTTPS URL to receive webhook POST requests. Must be publicly accessible
     */
    webhook_url?: string | null;
}
|
|
1430
|
+
}
|
|
1431
|
+
/**
 * Catch-all for configurations without a dedicated typed schema.
 *
 * Accepts arbitrary JSON fields alongside `product_type`.
 */
interface UntypedParameters {
    /**
     * Product type. Always the literal 'unknown' for this catch-all shape.
     */
    product_type: 'unknown';
    // Any additional field is accepted; its value is untyped and must be narrowed before use.
    [k: string]: unknown;
}
|
|
1443
|
+
}
|
|
1444
|
+
/**
 * Parameters for the extract create call: request-body fields plus the
 * `organization_id` / `project_id` query parameters.
 *
 * `configuration` and `configuration_id` are documented as mutually exclusive.
 */
export interface ExtractCreateParams {
    /**
     * Body param: File ID or Parse Job ID to extract from
     */
    document_input_value: string;
    /**
     * Query param
     */
    organization_id?: string | null;
    /**
     * Query param
     */
    project_id?: string | null;
    /**
     * Body param: Extract configuration combining parse and extract settings.
     */
    configuration?: ExtractConfiguration | null;
    /**
     * Body param: Saved extract configuration ID (mutually exclusive with
     * configuration)
     */
    configuration_id?: string | null;
    /**
     * Body param: Outbound webhook endpoints to notify on job status changes
     */
    webhook_configurations?: Array<ExtractCreateParams.WebhookConfiguration> | null;
}
|
|
1471
|
+
export declare namespace ExtractCreateParams {
    /**
     * Configuration for a single outbound webhook endpoint.
     */
    interface WebhookConfiguration {
        /**
         * Events to subscribe to (e.g. 'parse.success', 'extract.error'). If null, all
         * events are delivered.
         */
        webhook_events?: Array<
            | 'extract.pending'
            | 'extract.success'
            | 'extract.error'
            | 'extract.partial_success'
            | 'extract.cancelled'
            | 'parse.pending'
            | 'parse.running'
            | 'parse.success'
            | 'parse.error'
            | 'parse.partial_success'
            | 'parse.cancelled'
            | 'classify.pending'
            | 'classify.success'
            | 'classify.error'
            | 'classify.partial_success'
            | 'classify.cancelled'
            | 'unmapped_event'
        > | null;
        /**
         * Custom HTTP headers sent with each webhook request (e.g. auth tokens)
         */
        webhook_headers?: {
            [key: string]: string;
        } | null;
        /**
         * Response format sent to the webhook: 'string' (default) or 'json'
         */
        webhook_output_format?: string | null;
        /**
         * URL to receive webhook POST notifications
         */
        webhook_url?: string | null;
    }
}
|
|
1497
|
+
/**
 * Filters for listing extract jobs, in addition to the cursor pagination fields
 * inherited from `PaginatedCursorParams`. Every filter is optional.
 */
export interface ExtractListParams extends PaginatedCursorParams {
    /**
     * Filter by configuration ID
     */
    configuration_id?: string | null;
    /**
     * Include jobs created at or after this timestamp (inclusive)
     */
    created_at_on_or_after?: string | null;
    /**
     * Include jobs created at or before this timestamp (inclusive)
     */
    created_at_on_or_before?: string | null;
    /**
     * Filter by document input type (file_id or parse_job_id)
     */
    document_input_type?: string | null;
    /**
     * Filter by document input value
     */
    document_input_value?: string | null;
    /**
     * Additional fields to include: configuration, extract_metadata
     */
    expand?: Array<string>;
    /**
     * Filter by specific job IDs
     */
    job_ids?: Array<string> | null;
    // NOTE(review): undocumented upstream; presumably scopes the listing to an organization - confirm.
    organization_id?: string | null;
    // NOTE(review): undocumented upstream; presumably scopes the listing to a project - confirm.
    project_id?: string | null;
    /**
     * Filter by status
     */
    status?: 'PENDING' | 'THROTTLED' | 'RUNNING' | 'COMPLETED' | 'FAILED' | 'CANCELLED' | null;
}
|
|
1533
|
+
/**
 * Optional `organization_id` / `project_id` parameters; both may be omitted or `null`.
 *
 * NOTE(review): presumably the parameters of the extract delete call - confirm
 * against the resource method that consumes this type.
 */
export interface ExtractDeleteParams {
    organization_id?: string | null;
    project_id?: string | null;
}
|
|
1537
|
+
/**
 * Parameters for schema generation: request-body fields plus the
 * `organization_id` / `project_id` query parameters. Every field is optional.
 */
export interface ExtractGenerateSchemaParams {
    /**
     * Query param
     */
    organization_id?: string | null;
    /**
     * Query param
     */
    project_id?: string | null;
    /**
     * Body param: Optional schema to validate, refine, or extend
     */
    data_schema?: {
        [key: string]: {
            [key: string]: unknown;
        } | Array<unknown> | string | number | boolean | null;
    } | null;
    /**
     * Body param: Optional file ID to analyze for schema generation
     */
    file_id?: string | null;
    /**
     * Body param: Name for the generated configuration (auto-generated if omitted)
     */
    name?: string | null;
    /**
     * Body param: Natural language description of the data structure to extract
     */
    prompt?: string | null;
}
|
|
1567
|
+
/**
 * Optional parameters accepted when fetching a single extract result.
 *
 * NOTE(review): purpose inferred from the type name - confirm against the
 * resource method that consumes this type.
 */
export interface ExtractGetParams {
    /**
     * Additional fields to include: configuration, extract_metadata
     */
    expand?: Array<string>;
    organization_id?: string | null;
    project_id?: string | null;
}
|
|
1575
|
+
/**
 * Parameters for schema validation; `data_schema` is the only field and is required.
 */
export interface ExtractValidateSchemaParams {
    /**
     * JSON Schema to validate for use with extract jobs
     */
    data_schema: {
        [key: string]: {
            [key: string]: unknown;
        } | Array<unknown> | string | number | boolean | null;
    };
}
|
|
1585
|
+
/**
 * Re-exports the extract-related types under the `Extract` namespace so they can
 * be referenced as `Extract.<TypeName>`.
 */
export declare namespace Extract {
    export {
        type ExtractConfiguration as ExtractConfiguration,
        type ExtractJobMetadata as ExtractJobMetadata,
        type ExtractJobUsage as ExtractJobUsage,
        type ExtractV2Job as ExtractV2Job,
        type ExtractV2JobCreate as ExtractV2JobCreate,
        type ExtractV2JobQueryResponse as ExtractV2JobQueryResponse,
        type ExtractV2SchemaGenerateRequest as ExtractV2SchemaGenerateRequest,
        type ExtractV2SchemaValidateRequest as ExtractV2SchemaValidateRequest,
        type ExtractV2SchemaValidateResponse as ExtractV2SchemaValidateResponse,
        type ExtractedFieldMetadata as ExtractedFieldMetadata,
        type ExtractDeleteResponse as ExtractDeleteResponse,
        type ExtractGenerateSchemaResponse as ExtractGenerateSchemaResponse,
        type ExtractV2JobsPaginatedCursor as ExtractV2JobsPaginatedCursor,
        type ExtractCreateParams as ExtractCreateParams,
        type ExtractListParams as ExtractListParams,
        type ExtractDeleteParams as ExtractDeleteParams,
        type ExtractGenerateSchemaParams as ExtractGenerateSchemaParams,
        type ExtractGetParams as ExtractGetParams,
        type ExtractValidateSchemaParams as ExtractValidateSchemaParams,
    };
}
|
|
1588
|
+
//# sourceMappingURL=extract.d.ts.map
|