@llamaindex/llama-cloud 1.8.0 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (214) hide show
  1. package/CHANGELOG.md +72 -0
  2. package/README.md +10 -8
  3. package/client.d.mts +4 -6
  4. package/client.d.mts.map +1 -1
  5. package/client.d.ts +4 -6
  6. package/client.d.ts.map +1 -1
  7. package/client.js +7 -6
  8. package/client.js.map +1 -1
  9. package/client.mjs +7 -6
  10. package/client.mjs.map +1 -1
  11. package/core/pagination.d.mts +0 -23
  12. package/core/pagination.d.mts.map +1 -1
  13. package/core/pagination.d.ts +0 -23
  14. package/core/pagination.d.ts.map +1 -1
  15. package/core/pagination.js +1 -32
  16. package/core/pagination.js.map +1 -1
  17. package/core/pagination.mjs +0 -30
  18. package/core/pagination.mjs.map +1 -1
  19. package/package.json +12 -1
  20. package/resources/beta/batch/batch.d.mts +55 -30
  21. package/resources/beta/batch/batch.d.mts.map +1 -1
  22. package/resources/beta/batch/batch.d.ts +55 -30
  23. package/resources/beta/batch/batch.d.ts.map +1 -1
  24. package/resources/beta/batch/batch.js +14 -11
  25. package/resources/beta/batch/batch.js.map +1 -1
  26. package/resources/beta/batch/batch.mjs +14 -11
  27. package/resources/beta/batch/batch.mjs.map +1 -1
  28. package/resources/beta/batch/job-items.d.mts +36 -13
  29. package/resources/beta/batch/job-items.d.mts.map +1 -1
  30. package/resources/beta/batch/job-items.d.ts +36 -13
  31. package/resources/beta/batch/job-items.d.ts.map +1 -1
  32. package/resources/beta/batch/job-items.js +6 -8
  33. package/resources/beta/batch/job-items.js.map +1 -1
  34. package/resources/beta/batch/job-items.mjs +6 -8
  35. package/resources/beta/batch/job-items.mjs.map +1 -1
  36. package/resources/beta/parse-configurations.d.mts +7 -9
  37. package/resources/beta/parse-configurations.d.mts.map +1 -1
  38. package/resources/beta/parse-configurations.d.ts +7 -9
  39. package/resources/beta/parse-configurations.d.ts.map +1 -1
  40. package/resources/beta/parse-configurations.js +7 -9
  41. package/resources/beta/parse-configurations.js.map +1 -1
  42. package/resources/beta/parse-configurations.mjs +7 -9
  43. package/resources/beta/parse-configurations.mjs.map +1 -1
  44. package/resources/beta/sheets.d.mts +16 -0
  45. package/resources/beta/sheets.d.mts.map +1 -1
  46. package/resources/beta/sheets.d.ts +16 -0
  47. package/resources/beta/sheets.d.ts.map +1 -1
  48. package/resources/beta/split.d.mts +60 -16
  49. package/resources/beta/split.d.mts.map +1 -1
  50. package/resources/beta/split.d.ts +60 -16
  51. package/resources/beta/split.d.ts.map +1 -1
  52. package/resources/beta/split.js.map +1 -1
  53. package/resources/beta/split.mjs.map +1 -1
  54. package/resources/classifier/jobs.d.mts +12 -3
  55. package/resources/classifier/jobs.d.mts.map +1 -1
  56. package/resources/classifier/jobs.d.ts +12 -3
  57. package/resources/classifier/jobs.d.ts.map +1 -1
  58. package/resources/classify.d.mts +93 -38
  59. package/resources/classify.d.mts.map +1 -1
  60. package/resources/classify.d.ts +93 -38
  61. package/resources/classify.d.ts.map +1 -1
  62. package/resources/classify.js +19 -2
  63. package/resources/classify.js.map +1 -1
  64. package/resources/classify.mjs +19 -2
  65. package/resources/classify.mjs.map +1 -1
  66. package/resources/extract.d.mts +1593 -0
  67. package/resources/extract.d.mts.map +1 -0
  68. package/resources/extract.d.ts +1593 -0
  69. package/resources/extract.d.ts.map +1 -0
  70. package/resources/extract.js +215 -0
  71. package/resources/extract.js.map +1 -0
  72. package/resources/extract.mjs +211 -0
  73. package/resources/extract.mjs.map +1 -0
  74. package/resources/files.d.mts +53 -39
  75. package/resources/files.d.mts.map +1 -1
  76. package/resources/files.d.ts +53 -39
  77. package/resources/files.d.ts.map +1 -1
  78. package/resources/files.js +11 -10
  79. package/resources/files.js.map +1 -1
  80. package/resources/files.mjs +11 -10
  81. package/resources/files.mjs.map +1 -1
  82. package/resources/index.d.mts +1 -1
  83. package/resources/index.d.mts.map +1 -1
  84. package/resources/index.d.ts +1 -1
  85. package/resources/index.d.ts.map +1 -1
  86. package/resources/index.js +3 -3
  87. package/resources/index.js.map +1 -1
  88. package/resources/index.mjs +1 -1
  89. package/resources/index.mjs.map +1 -1
  90. package/resources/parsing.d.mts +292 -138
  91. package/resources/parsing.d.mts.map +1 -1
  92. package/resources/parsing.d.ts +292 -138
  93. package/resources/parsing.d.ts.map +1 -1
  94. package/resources/parsing.js +30 -4
  95. package/resources/parsing.js.map +1 -1
  96. package/resources/parsing.mjs +30 -4
  97. package/resources/parsing.mjs.map +1 -1
  98. package/resources/pipelines/pipelines.d.mts +59 -13
  99. package/resources/pipelines/pipelines.d.mts.map +1 -1
  100. package/resources/pipelines/pipelines.d.ts +59 -13
  101. package/resources/pipelines/pipelines.d.ts.map +1 -1
  102. package/resources/pipelines/pipelines.js +24 -9
  103. package/resources/pipelines/pipelines.js.map +1 -1
  104. package/resources/pipelines/pipelines.mjs +24 -9
  105. package/resources/pipelines/pipelines.mjs.map +1 -1
  106. package/resources/pipelines/sync.d.mts +5 -3
  107. package/resources/pipelines/sync.d.mts.map +1 -1
  108. package/resources/pipelines/sync.d.ts +5 -3
  109. package/resources/pipelines/sync.d.ts.map +1 -1
  110. package/resources/pipelines/sync.js +5 -3
  111. package/resources/pipelines/sync.js.map +1 -1
  112. package/resources/pipelines/sync.mjs +5 -3
  113. package/resources/pipelines/sync.mjs.map +1 -1
  114. package/src/client.ts +50 -15
  115. package/src/core/pagination.ts +0 -71
  116. package/src/resources/beta/batch/batch.ts +75 -30
  117. package/src/resources/beta/batch/job-items.ts +56 -13
  118. package/src/resources/beta/parse-configurations.ts +7 -9
  119. package/src/resources/beta/sheets.ts +20 -0
  120. package/src/resources/beta/split.ts +70 -17
  121. package/src/resources/classifier/jobs.ts +12 -3
  122. package/src/resources/classify.ts +101 -38
  123. package/src/resources/extract.ts +2055 -0
  124. package/src/resources/files.ts +53 -39
  125. package/src/resources/index.ts +22 -1
  126. package/src/resources/parsing.ts +327 -136
  127. package/src/resources/pipelines/pipelines.ts +80 -14
  128. package/src/resources/pipelines/sync.ts +5 -3
  129. package/src/version.ts +1 -1
  130. package/version.d.mts +1 -1
  131. package/version.d.ts +1 -1
  132. package/version.js +1 -1
  133. package/version.mjs +1 -1
  134. package/resources/extraction/extraction-agents/extraction-agents.d.mts +0 -126
  135. package/resources/extraction/extraction-agents/extraction-agents.d.mts.map +0 -1
  136. package/resources/extraction/extraction-agents/extraction-agents.d.ts +0 -126
  137. package/resources/extraction/extraction-agents/extraction-agents.d.ts.map +0 -1
  138. package/resources/extraction/extraction-agents/extraction-agents.js +0 -56
  139. package/resources/extraction/extraction-agents/extraction-agents.js.map +0 -1
  140. package/resources/extraction/extraction-agents/extraction-agents.mjs +0 -51
  141. package/resources/extraction/extraction-agents/extraction-agents.mjs.map +0 -1
  142. package/resources/extraction/extraction-agents/index.d.mts +0 -3
  143. package/resources/extraction/extraction-agents/index.d.mts.map +0 -1
  144. package/resources/extraction/extraction-agents/index.d.ts +0 -3
  145. package/resources/extraction/extraction-agents/index.d.ts.map +0 -1
  146. package/resources/extraction/extraction-agents/index.js +0 -9
  147. package/resources/extraction/extraction-agents/index.js.map +0 -1
  148. package/resources/extraction/extraction-agents/index.mjs +0 -4
  149. package/resources/extraction/extraction-agents/index.mjs.map +0 -1
  150. package/resources/extraction/extraction-agents/schema.d.mts +0 -75
  151. package/resources/extraction/extraction-agents/schema.d.mts.map +0 -1
  152. package/resources/extraction/extraction-agents/schema.d.ts +0 -75
  153. package/resources/extraction/extraction-agents/schema.d.ts.map +0 -1
  154. package/resources/extraction/extraction-agents/schema.js +0 -28
  155. package/resources/extraction/extraction-agents/schema.js.map +0 -1
  156. package/resources/extraction/extraction-agents/schema.mjs +0 -24
  157. package/resources/extraction/extraction-agents/schema.mjs.map +0 -1
  158. package/resources/extraction/extraction-agents.d.mts +0 -2
  159. package/resources/extraction/extraction-agents.d.mts.map +0 -1
  160. package/resources/extraction/extraction-agents.d.ts +0 -2
  161. package/resources/extraction/extraction-agents.d.ts.map +0 -1
  162. package/resources/extraction/extraction-agents.js +0 -6
  163. package/resources/extraction/extraction-agents.js.map +0 -1
  164. package/resources/extraction/extraction-agents.mjs +0 -3
  165. package/resources/extraction/extraction-agents.mjs.map +0 -1
  166. package/resources/extraction/extraction.d.mts +0 -118
  167. package/resources/extraction/extraction.d.mts.map +0 -1
  168. package/resources/extraction/extraction.d.ts +0 -118
  169. package/resources/extraction/extraction.d.ts.map +0 -1
  170. package/resources/extraction/extraction.js +0 -91
  171. package/resources/extraction/extraction.js.map +0 -1
  172. package/resources/extraction/extraction.mjs +0 -86
  173. package/resources/extraction/extraction.mjs.map +0 -1
  174. package/resources/extraction/index.d.mts +0 -5
  175. package/resources/extraction/index.d.mts.map +0 -1
  176. package/resources/extraction/index.d.ts +0 -5
  177. package/resources/extraction/index.d.ts.map +0 -1
  178. package/resources/extraction/index.js +0 -13
  179. package/resources/extraction/index.js.map +0 -1
  180. package/resources/extraction/index.mjs +0 -6
  181. package/resources/extraction/index.mjs.map +0 -1
  182. package/resources/extraction/jobs.d.mts +0 -280
  183. package/resources/extraction/jobs.d.mts.map +0 -1
  184. package/resources/extraction/jobs.d.ts +0 -280
  185. package/resources/extraction/jobs.d.ts.map +0 -1
  186. package/resources/extraction/jobs.js +0 -179
  187. package/resources/extraction/jobs.js.map +0 -1
  188. package/resources/extraction/jobs.mjs +0 -175
  189. package/resources/extraction/jobs.mjs.map +0 -1
  190. package/resources/extraction/runs.d.mts +0 -198
  191. package/resources/extraction/runs.d.mts.map +0 -1
  192. package/resources/extraction/runs.d.ts +0 -198
  193. package/resources/extraction/runs.d.ts.map +0 -1
  194. package/resources/extraction/runs.js +0 -42
  195. package/resources/extraction/runs.js.map +0 -1
  196. package/resources/extraction/runs.mjs +0 -38
  197. package/resources/extraction/runs.mjs.map +0 -1
  198. package/resources/extraction.d.mts +0 -2
  199. package/resources/extraction.d.mts.map +0 -1
  200. package/resources/extraction.d.ts +0 -2
  201. package/resources/extraction.d.ts.map +0 -1
  202. package/resources/extraction.js +0 -6
  203. package/resources/extraction.js.map +0 -1
  204. package/resources/extraction.mjs +0 -3
  205. package/resources/extraction.mjs.map +0 -1
  206. package/src/resources/extraction/extraction-agents/extraction-agents.ts +0 -196
  207. package/src/resources/extraction/extraction-agents/index.ts +0 -18
  208. package/src/resources/extraction/extraction-agents/schema.ts +0 -100
  209. package/src/resources/extraction/extraction-agents.ts +0 -3
  210. package/src/resources/extraction/extraction.ts +0 -224
  211. package/src/resources/extraction/index.ts +0 -34
  212. package/src/resources/extraction/jobs.ts +0 -414
  213. package/src/resources/extraction/runs.ts +0 -315
  214. package/src/resources/extraction.ts +0 -3
@@ -0,0 +1,2055 @@
1
+ // File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
2
+
3
+ import { APIResource } from '../core/resource';
4
+ import * as ExtractAPI from './extract';
5
+ import * as ParsingAPI from './parsing';
6
+ import * as SplitAPI from './beta/split';
7
+ import { APIPromise } from '../core/api-promise';
8
+ import { PagePromise, PaginatedCursor, type PaginatedCursorParams } from '../core/pagination';
9
+ import { RequestOptions } from '../internal/request-options';
10
+ import { path } from '../internal/utils/path';
11
+ import { pollUntilComplete, PollingOptions, DEFAULT_TIMEOUT } from '../core/polling';
12
+
13
+ export class Extract extends APIResource {
14
+ /**
15
+ * Create an extraction job.
16
+ *
17
+ * Extracts structured data from a document using either a saved configuration or
18
+ * an inline JSON Schema.
19
+ *
20
+ * ## Input
21
+ *
22
+ * Provide exactly one of:
23
+ *
24
+ * - `configuration_id` — reference a saved extraction config
25
+ * - `configuration` — inline configuration with a `data_schema`
26
+ *
27
+ * ## Document input
28
+ *
29
+ * Set `file_input` to a file ID (`dfl-...`) or a completed parse job ID
30
+ * (`pjb-...`).
31
+ *
32
+ * The job runs asynchronously. Poll `GET /api/v2/extract/{job_id}` or register a webhook
33
+ * to monitor completion.
34
+ *
35
+ * @example
36
+ * ```ts
37
+ * const extractV2Job = await client.extract.create({
38
+ * file_input: 'dfl-aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee',
39
+ * });
40
+ * ```
41
+ */
42
+ create(params: ExtractCreateParams, options?: RequestOptions): APIPromise<ExtractV2Job> {
43
+ const { organization_id, project_id, ...body } = params;
44
+ return this._client.post('/api/v2/extract', { query: { organization_id, project_id }, body, ...options });
45
+ }
46
+
47
+ /**
48
+ * List extraction jobs with optional filtering and pagination.
49
+ *
50
+ * Filter by `configuration_id`, `status`, `file_input`, or creation date range.
51
+ * Results are returned newest-first. Use `expand=configuration` to include the
52
+ * full configuration used, and `expand=extract_metadata` for per-field metadata.
53
+ *
54
+ * @example
55
+ * ```ts
56
+ * // Automatically fetches more pages as needed.
57
+ * for await (const extractV2Job of client.extract.list()) {
58
+ * // ...
59
+ * }
60
+ * ```
61
+ */
62
+ list(
63
+ query: ExtractListParams | null | undefined = {},
64
+ options?: RequestOptions,
65
+ ): PagePromise<ExtractV2JobsPaginatedCursor, ExtractV2Job> {
66
+ return this._client.getAPIList('/api/v2/extract', PaginatedCursor<ExtractV2Job>, { query, ...options });
67
+ }
68
+
69
+ /**
70
+ * Delete an extraction job and its results.
71
+ *
72
+ * @example
73
+ * ```ts
74
+ * const extract = await client.extract.delete('job_id');
75
+ * ```
76
+ */
77
+ delete(
78
+ jobID: string,
79
+ params: ExtractDeleteParams | null | undefined = {},
80
+ options?: RequestOptions,
81
+ ): APIPromise<unknown> {
82
+ const { organization_id, project_id } = params ?? {};
83
+ return this._client.delete(path`/api/v2/extract/${jobID}`, {
84
+ query: { organization_id, project_id },
85
+ ...options,
86
+ });
87
+ }
88
+
89
+ /**
90
+ * Generate a JSON schema and return a product configuration request.
91
+ *
92
+ * @example
93
+ * ```ts
94
+ * const response = await client.extract.generateSchema({ prompt: 'Invoice number, vendor, and total' });
95
+ * ```
96
+ */
97
+ generateSchema(
98
+ params: ExtractGenerateSchemaParams,
99
+ options?: RequestOptions,
100
+ ): APIPromise<ExtractGenerateSchemaResponse> {
101
+ const { organization_id, project_id, ...body } = params;
102
+ return this._client.post('/api/v2/extract/schema/generate', {
103
+ query: { organization_id, project_id },
104
+ body,
105
+ ...options,
106
+ });
107
+ }
108
+
109
+ /**
110
+ * Get a single extraction job by ID.
111
+ *
112
+ * Returns the job status and results when complete. Use `expand=configuration` to
113
+ * include the full configuration used, and `expand=extract_metadata` for per-field
114
+ * metadata.
115
+ *
116
+ * @example
117
+ * ```ts
118
+ * const extractV2Job = await client.extract.get('job_id');
119
+ * ```
120
+ */
121
+ get(
122
+ jobID: string,
123
+ query: ExtractGetParams | null | undefined = {},
124
+ options?: RequestOptions,
125
+ ): APIPromise<ExtractV2Job> {
126
+ return this._client.get(path`/api/v2/extract/${jobID}`, { query, ...options });
127
+ }
128
+
129
+ /**
130
+ * Validate a JSON schema for extraction.
131
+ *
132
+ * @example
133
+ * ```ts
134
+ * const extractV2SchemaValidateResponse =
135
+ * await client.extract.validateSchema({
136
+ * data_schema: { foo: { foo: 'bar' } },
137
+ * });
138
+ * ```
139
+ */
140
+ validateSchema(
141
+ body: ExtractValidateSchemaParams,
142
+ options?: RequestOptions,
143
+ ): APIPromise<ExtractV2SchemaValidateResponse> {
144
+ return this._client.post('/api/v2/extract/schema/validation', { body, ...options });
145
+ }
146
+
147
+ /**
148
+ * Wait for an extraction job to complete by polling until it reaches a terminal state.
149
+ *
150
+ * @param jobID - The ID of the extraction job to wait for
151
+ * @param query - Optional query parameters forwarded to `get()` on every poll (e.g. organization_id, project_id)
152
+ * @param options - Polling configuration and request options
153
+ * @returns The completed extraction job
154
+ * @throws {PollingTimeoutError} If the job doesn't complete within the timeout period
155
+ * @throws {PollingError} If the job fails or is cancelled
156
+ *
157
+ * @example
158
+ * ```typescript
159
+ * const job = await client.extract.create({
160
+ * file_input: 'dfl-aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee',
161
+ * });
162
+ *
163
+ * const completed = await client.extract.waitForCompletion(job.id, undefined, { verbose: true });
164
+ * console.log(completed.extract_result);
165
+ * ```
166
+ */
167
+ async waitForCompletion(
168
+ jobID: string,
169
+ query?: ExtractGetParams,
170
+ options?: PollingOptions & RequestOptions,
171
+ ): Promise<ExtractV2Job> {
172
+ const { pollingInterval, maxInterval, timeout, backoff, verbose, ...requestOptions } = options || {};
173
+
174
+ const getStatus = async (): Promise<ExtractV2Job> => {
175
+ return await this.get(jobID, query, requestOptions);
176
+ };
177
+
178
+ const isComplete = (job: ExtractV2Job): boolean => {
179
+ return job.status === 'COMPLETED';
180
+ };
181
+
182
+ const isError = (job: ExtractV2Job): boolean => {
183
+ return job.status === 'FAILED' || job.status === 'CANCELLED';
184
+ };
185
+
186
+ const getErrorMessage = (job: ExtractV2Job): string => {
187
+ const errorParts = [`Job ${jobID} failed with status: ${job.status}`];
188
+ if (job.error_message) {
189
+ errorParts.push(`Error: ${job.error_message}`);
190
+ }
191
+ return errorParts.join(' | ');
192
+ };
193
+
194
+ return await pollUntilComplete(getStatus, isComplete, isError, getErrorMessage, {
195
+ pollingInterval,
196
+ maxInterval,
197
+ timeout: timeout || DEFAULT_TIMEOUT,
198
+ backoff,
199
+ verbose,
200
+ });
201
+ }
202
+
203
+ /**
204
+ * Create an extraction job, wait for it to complete, and return the result.
205
+ *
206
+ * This is a convenience method that combines create() and waitForCompletion()
207
+ * into a single call for the most common end-to-end workflow.
208
+ *
209
+ * @param params - Extract job creation parameters
210
+ * @param options - Polling configuration and request options
211
+ * @returns The completed extraction job with extract_result populated
212
+ * @throws {PollingTimeoutError} If the job doesn't complete within the timeout period
213
+ * @throws {PollingError} If the job fails or is cancelled
214
+ *
215
+ * @example
216
+ * ```typescript
217
+ * import { LlamaCloud } from '@llamaindex/llama-cloud';
218
+ *
219
+ * const client = new LlamaCloud({ apiKey: '...' });
220
+ *
221
+ * const result = await client.extract.run({
222
+ * file_input: 'dfl-aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee',
223
+ * configuration: {
224
+ * data_schema: { name: { type: 'string' }, age: { type: 'number' } },
225
+ * },
226
+ * }, { verbose: true });
227
+ *
228
+ * console.log(result.extract_result);
229
+ * ```
230
+ */
231
+ async run(params: ExtractCreateParams, options?: PollingOptions & RequestOptions): Promise<ExtractV2Job> {
232
+ const { pollingInterval, maxInterval, timeout, backoff, verbose, ...requestOptions } = options || {};
233
+
234
+ const job = await this.create(params, requestOptions);
235
+
236
+ const getQuery: ExtractGetParams = {};
237
+ if (params.organization_id !== undefined) {
238
+ getQuery.organization_id = params.organization_id;
239
+ }
240
+ if (params.project_id !== undefined) {
241
+ getQuery.project_id = params.project_id;
242
+ }
243
+
244
+ return await this.waitForCompletion(job.id, getQuery, {
245
+ pollingInterval,
246
+ maxInterval,
247
+ timeout: timeout || DEFAULT_TIMEOUT,
248
+ backoff,
249
+ verbose,
250
+ ...requestOptions,
251
+ });
252
+ }
253
+ }
254
+
255
+ export type ExtractV2JobsPaginatedCursor = PaginatedCursor<ExtractV2Job>;
256
+
257
+ /**
258
+ * Extract configuration combining parse and extract settings.
259
+ */
260
+ export interface ExtractConfiguration {
261
+ /**
262
+ * JSON Schema defining the fields to extract. Validate with the /schema/validation
263
+ * endpoint first.
264
+ */
265
+ data_schema: {
266
+ [key: string]: { [key: string]: unknown } | Array<unknown> | string | number | boolean | null;
267
+ };
268
+
269
+ /**
270
+ * Include citations in results
271
+ */
272
+ cite_sources?: boolean;
273
+
274
+ /**
275
+ * Include confidence scores in results
276
+ */
277
+ confidence_scores?: boolean;
278
+
279
+ /**
280
+ * Extract algorithm version. Use 'latest' or a date string.
281
+ */
282
+ extract_version?: string;
283
+
284
+ /**
285
+ * Granularity of extraction: per_doc returns one object per document, per_page
286
+ * returns one object per page, per_table_row returns one object per table row
287
+ */
288
+ extraction_target?: 'per_doc' | 'per_page' | 'per_table_row';
289
+
290
+ /**
291
+ * ISO 639-1 language code for the document
292
+ */
293
+ lang?: string;
294
+
295
+ /**
296
+ * Maximum number of pages to process. Omit for no limit.
297
+ */
298
+ max_pages?: number | null;
299
+
300
+ /**
301
+ * Saved parse configuration ID to control how the document is parsed before
302
+ * extraction
303
+ */
304
+ parse_config_id?: string | null;
305
+
306
+ /**
307
+ * Parse tier to use before extraction (fast, cost_effective, or agentic)
308
+ */
309
+ parse_tier?: string | null;
310
+
311
+ /**
312
+ * Custom system prompt to guide extraction behavior
313
+ */
314
+ system_prompt?: string | null;
315
+
316
+ /**
317
+ * Comma-separated page numbers or ranges to process (1-based). Omit to process all
318
+ * pages.
319
+ */
320
+ target_pages?: string | null;
321
+
322
+ /**
323
+ * Extract tier: cost_effective (5 credits/page) or agentic (15 credits/page)
324
+ */
325
+ tier?: 'cost_effective' | 'agentic';
326
+ }
327
+
328
+ /**
329
+ * Extraction metadata.
330
+ */
331
+ export interface ExtractJobMetadata {
332
+ /**
333
+ * Metadata for extracted fields including document, page, and row level info.
334
+ */
335
+ field_metadata?: ExtractedFieldMetadata | null;
336
+
337
+ /**
338
+ * Reference to the ParseJob ID used for parsing
339
+ */
340
+ parse_job_id?: string | null;
341
+
342
+ /**
343
+ * Parse tier used for parsing the document
344
+ */
345
+ parse_tier?: string | null;
346
+ }
347
+
348
+ /**
349
+ * Extraction usage metrics.
350
+ */
351
+ export interface ExtractJobUsage {
352
+ /**
353
+ * Number of document tokens
354
+ */
355
+ num_document_tokens?: number | null;
356
+
357
+ /**
358
+ * Number of output tokens
359
+ */
360
+ num_output_tokens?: number | null;
361
+
362
+ /**
363
+ * Number of pages extracted
364
+ */
365
+ num_pages_extracted?: number | null;
366
+ }
367
+
368
+ /**
369
+ * An extraction job.
370
+ */
371
+ export interface ExtractV2Job {
372
+ /**
373
+ * Unique job identifier (job_id)
374
+ */
375
+ id: string;
376
+
377
+ /**
378
+ * Creation timestamp
379
+ */
380
+ created_at: string;
381
+
382
+ /**
383
+ * File ID or parse job ID that was extracted
384
+ */
385
+ file_input: string;
386
+
387
+ /**
388
+ * Project this job belongs to
389
+ */
390
+ project_id: string;
391
+
392
+ /**
393
+ * Current job status.
394
+ *
395
+ * - `PENDING` — queued, not yet started
396
+ * - `RUNNING` — actively processing
397
+ * - `COMPLETED` — finished successfully
398
+ * - `FAILED` — terminated with an error
399
+ * - `CANCELLED` — cancelled by user
400
+ */
401
+ status: string;
402
+
403
+ /**
404
+ * Last update timestamp
405
+ */
406
+ updated_at: string;
407
+
408
+ /**
409
+ * Extract configuration combining parse and extract settings.
410
+ */
411
+ configuration?: ExtractConfiguration | null;
412
+
413
+ /**
414
+ * Saved extract configuration ID used for this job, if any
415
+ */
416
+ configuration_id?: string | null;
417
+
418
+ /**
419
+ * Error details when status is FAILED
420
+ */
421
+ error_message?: string | null;
422
+
423
+ /**
424
+ * Extraction metadata.
425
+ */
426
+ extract_metadata?: ExtractJobMetadata | null;
427
+
428
+ /**
429
+ * Extracted data conforming to the data_schema. Returns a single object for
430
+ * per_doc, or an array for per_page / per_table_row.
431
+ */
432
+ extract_result?:
433
+ | { [key: string]: { [key: string]: unknown } | Array<unknown> | string | number | boolean | null }
434
+ | Array<{ [key: string]: { [key: string]: unknown } | Array<unknown> | string | number | boolean | null }>
435
+ | null;
436
+
437
+ /**
438
+ * Job-level metadata.
439
+ */
440
+ metadata?: ExtractV2Job.Metadata | null;
441
+ }
442
+
443
+ export namespace ExtractV2Job {
444
+ /**
445
+ * Job-level metadata.
446
+ */
447
+ export interface Metadata {
448
+ /**
449
+ * Extraction usage metrics.
450
+ */
451
+ usage?: ExtractAPI.ExtractJobUsage | null;
452
+
453
+ [k: string]: unknown;
454
+ }
455
+ }
456
+
457
+ /**
458
+ * Request to create an extraction job. Provide configuration_id or inline
459
+ * configuration.
460
+ */
461
+ export interface ExtractV2JobCreate {
462
+ /**
463
+ * File ID or parse job ID to extract from
464
+ */
465
+ file_input: string;
466
+
467
+ /**
468
+ * Extract configuration combining parse and extract settings.
469
+ */
470
+ configuration?: ExtractConfiguration | null;
471
+
472
+ /**
473
+ * Saved configuration ID
474
+ */
475
+ configuration_id?: string | null;
476
+
477
+ /**
478
+ * Outbound webhook endpoints to notify on job status changes
479
+ */
480
+ webhook_configurations?: Array<ExtractV2JobCreate.WebhookConfiguration> | null;
481
+ }
482
+
483
+ export namespace ExtractV2JobCreate {
484
+ /**
485
+ * Configuration for a single outbound webhook endpoint.
486
+ */
487
+ export interface WebhookConfiguration {
488
+ /**
489
+ * Events to subscribe to (e.g. 'parse.success', 'extract.error'). If null, all
490
+ * events are delivered.
491
+ */
492
+ webhook_events?: Array<
493
+ | 'extract.pending'
494
+ | 'extract.success'
495
+ | 'extract.error'
496
+ | 'extract.partial_success'
497
+ | 'extract.cancelled'
498
+ | 'parse.pending'
499
+ | 'parse.running'
500
+ | 'parse.success'
501
+ | 'parse.error'
502
+ | 'parse.partial_success'
503
+ | 'parse.cancelled'
504
+ | 'classify.pending'
505
+ | 'classify.success'
506
+ | 'classify.error'
507
+ | 'classify.partial_success'
508
+ | 'classify.cancelled'
509
+ | 'unmapped_event'
510
+ > | null;
511
+
512
+ /**
513
+ * Custom HTTP headers sent with each webhook request (e.g. auth tokens)
514
+ */
515
+ webhook_headers?: { [key: string]: string } | null;
516
+
517
+ /**
518
+ * Response format sent to the webhook: 'string' (default) or 'json'
519
+ */
520
+ webhook_output_format?: string | null;
521
+
522
+ /**
523
+ * URL to receive webhook POST notifications
524
+ */
525
+ webhook_url?: string | null;
526
+ }
527
+ }
528
+
529
+ /**
530
+ * Paginated list of extraction jobs.
531
+ */
532
+ export interface ExtractV2JobQueryResponse {
533
+ /**
534
+ * The list of items.
535
+ */
536
+ items: Array<ExtractV2Job>;
537
+
538
+ /**
539
+ * A token, which can be sent as page_token to retrieve the next page. If this
540
+ * field is omitted, there are no subsequent pages.
541
+ */
542
+ next_page_token?: string | null;
543
+
544
+ /**
545
+ * The total number of items available. This is only populated when specifically
546
+ * requested. The value may be an estimate and can be used for display purposes
547
+ * only.
548
+ */
549
+ total_size?: number | null;
550
+ }
551
+
552
+ /**
553
+ * Request schema for generating an extraction schema.
554
+ */
555
+ export interface ExtractV2SchemaGenerateRequest {
556
+ /**
557
+ * Optional schema to validate, refine, or extend
558
+ */
559
+ data_schema?: {
560
+ [key: string]: { [key: string]: unknown } | Array<unknown> | string | number | boolean | null;
561
+ } | null;
562
+
563
+ /**
564
+ * Optional file ID to analyze for schema generation
565
+ */
566
+ file_id?: string | null;
567
+
568
+ /**
569
+ * Name for the generated configuration (auto-generated if omitted)
570
+ */
571
+ name?: string | null;
572
+
573
+ /**
574
+ * Natural language description of the data structure to extract
575
+ */
576
+ prompt?: string | null;
577
+ }
578
+
579
+ /**
580
+ * Request schema for validating an extraction schema.
581
+ */
582
+ export interface ExtractV2SchemaValidateRequest {
583
+ /**
584
+ * JSON Schema to validate for use with extract jobs
585
+ */
586
+ data_schema: {
587
+ [key: string]: { [key: string]: unknown } | Array<unknown> | string | number | boolean | null;
588
+ };
589
+ }
590
+
591
+ /**
592
+ * Response schema for schema validation.
593
+ */
594
+ export interface ExtractV2SchemaValidateResponse {
595
+ /**
596
+ * Validated JSON Schema, ready for use in extract jobs
597
+ */
598
+ data_schema: {
599
+ [key: string]: { [key: string]: unknown } | Array<unknown> | string | number | boolean | null;
600
+ };
601
+ }
602
+
603
+ /**
604
+ * Metadata for extracted fields including document, page, and row level info.
605
+ */
606
+ export interface ExtractedFieldMetadata {
607
+ /**
608
+ * Per-field metadata keyed by field name from your schema. Scalar fields (e.g.
609
+ * `vendor`) map to a FieldMetadataEntry with citation and confidence. Array fields
610
+ * (e.g. `items`) map to a list where each element contains per-sub-field
611
+ * FieldMetadataEntry objects, indexed by array position. Nested objects contain
612
+ * sub-field entries recursively.
613
+ */
614
+ document_metadata?: {
615
+ [key: string]: { [key: string]: unknown } | Array<unknown> | string | number | boolean | null;
616
+ } | null;
617
+
618
+ /**
619
+ * Per-page metadata when extraction_target is per_page
620
+ */
621
+ page_metadata?: Array<{
622
+ [key: string]: { [key: string]: unknown } | Array<unknown> | string | number | boolean | null;
623
+ }> | null;
624
+
625
+ /**
626
+ * Per-row metadata when extraction_target is per_table_row
627
+ */
628
+ row_metadata?: Array<{
629
+ [key: string]: { [key: string]: unknown } | Array<unknown> | string | number | boolean | null;
630
+ }> | null;
631
+ }
632
+
633
+ export type ExtractDeleteResponse = unknown;
634
+
635
+ /**
636
+ * Request body for creating a product configuration.
637
+ */
638
+ export interface ExtractGenerateSchemaResponse {
639
+ /**
640
+ * Human-readable name for this configuration.
641
+ */
642
+ name: string;
643
+
644
+ /**
645
+ * Product-specific configuration parameters.
646
+ */
647
+ parameters:
648
+ | ExtractGenerateSchemaResponse.SplitV1Parameters
649
+ | ExtractGenerateSchemaResponse.ExtractV2Parameters
650
+ | ExtractGenerateSchemaResponse.ClassifyV2Parameters
651
+ | ExtractGenerateSchemaResponse.ParseV2Parameters
652
+ | ExtractGenerateSchemaResponse.UntypedParameters;
653
+ }
654
+
655
+ export namespace ExtractGenerateSchemaResponse {
656
+ /**
657
+ * Typed parameters for a _split v1_ product configuration.
658
+ */
659
+ export interface SplitV1Parameters {
660
+ /**
661
+ * Categories to split documents into.
662
+ */
663
+ categories: Array<SplitAPI.SplitCategory>;
664
+
665
+ /**
666
+ * Product type.
667
+ */
668
+ product_type: 'split_v1';
669
+
670
+ /**
671
+ * Strategy for splitting documents.
672
+ */
673
+ splitting_strategy?: SplitV1Parameters.SplittingStrategy;
674
+ }
675
+
676
+ export namespace SplitV1Parameters {
677
+ /**
678
+ * Strategy for splitting documents.
679
+ */
680
+ export interface SplittingStrategy {
681
+ /**
682
+ * Controls handling of pages that don't match any category. 'include': pages can
683
+ * be grouped as 'uncategorized' and included in results. 'forbid': all pages must
684
+ * be assigned to a defined category. 'omit': pages can be classified as
685
+ * 'uncategorized' but are excluded from results.
686
+ */
687
+ allow_uncategorized?: 'include' | 'forbid' | 'omit';
688
+ }
689
+ }
690
+
691
+ /**
692
+ * Typed parameters for an _extract v2_ product configuration.
693
+ */
694
+ export interface ExtractV2Parameters {
695
+ /**
696
+ * JSON Schema defining the fields to extract. Validate with the /schema/validate
697
+ * endpoint first.
698
+ */
699
+ data_schema: {
700
+ [key: string]: { [key: string]: unknown } | Array<unknown> | string | number | boolean | null;
701
+ };
702
+
703
+ /**
704
+ * Product type.
705
+ */
706
+ product_type: 'extract_v2';
707
+
708
+ /**
709
+ * Include citations in results
710
+ */
711
+ cite_sources?: boolean;
712
+
713
+ /**
714
+ * Include confidence scores in results
715
+ */
716
+ confidence_scores?: boolean;
717
+
718
+ /**
719
+ * Extract algorithm version. Use 'latest' or a date string.
720
+ */
721
+ extract_version?: string;
722
+
723
+ /**
724
+ * Granularity of extraction: per_doc returns one object per document, per_page
725
+ * returns one object per page, per_table_row returns one object per table row
726
+ */
727
+ extraction_target?: 'per_doc' | 'per_page' | 'per_table_row';
728
+
729
+ /**
730
+ * ISO 639-1 language code for the document
731
+ */
732
+ lang?: string;
733
+
734
+ /**
735
+ * Maximum number of pages to process. Omit for no limit.
736
+ */
737
+ max_pages?: number | null;
738
+
739
+ /**
740
+ * Saved parse configuration ID to control how the document is parsed before
741
+ * extraction
742
+ */
743
+ parse_config_id?: string | null;
744
+
745
+ /**
746
+ * Parse tier to use before extraction (fast, cost_effective, or agentic)
747
+ */
748
+ parse_tier?: string | null;
749
+
750
+ /**
751
+ * Custom system prompt to guide extraction behavior
752
+ */
753
+ system_prompt?: string | null;
754
+
755
+ /**
756
+ * Comma-separated page numbers or ranges to process (1-based). Omit to process all
757
+ * pages.
758
+ */
759
+ target_pages?: string | null;
760
+
761
+ /**
762
+ * Extract tier: cost_effective (5 credits/page) or agentic (15 credits/page)
763
+ */
764
+ tier?: 'cost_effective' | 'agentic';
765
+ }
766
+
767
+ /**
768
+ * Typed parameters for a _classify v2_ product configuration.
769
+ */
770
+ export interface ClassifyV2Parameters {
771
+ /**
772
+ * Product type.
773
+ */
774
+ product_type: 'classify_v2';
775
+
776
+ /**
777
+ * Classify rules to evaluate against the document (at least one required)
778
+ */
779
+ rules: Array<ClassifyV2Parameters.Rule>;
780
+
781
+ /**
782
+ * Classify execution mode
783
+ */
784
+ mode?: 'FAST';
785
+
786
+ /**
787
+ * Parsing configuration for classify jobs.
788
+ */
789
+ parsing_configuration?: ClassifyV2Parameters.ParsingConfiguration | null;
790
+ }
791
+
792
+ export namespace ClassifyV2Parameters {
793
+ /**
794
+ * A rule for classifying documents.
795
+ */
796
+ export interface Rule {
797
+ /**
798
+ * Natural language criteria for matching this rule
799
+ */
800
+ description: string;
801
+
802
+ /**
803
+ * Document type to assign when rule matches
804
+ */
805
+ type: string;
806
+ }
807
+
808
+ /**
809
+ * Parsing configuration for classify jobs.
810
+ */
811
+ export interface ParsingConfiguration {
812
+ /**
813
+ * ISO 639-1 language code for the document
814
+ */
815
+ lang?: string;
816
+
817
+ /**
818
+ * Maximum number of pages to process. Omit for no limit.
819
+ */
820
+ max_pages?: number | null;
821
+
822
+ /**
823
+ * Comma-separated page numbers or ranges to process (1-based). Omit to process all
824
+ * pages.
825
+ */
826
+ target_pages?: string | null;
827
+ }
828
+ }
829
+
830
+ /**
831
+ * Configuration for LlamaParse v2 document parsing.
832
+ *
833
+ * Includes tier selection, processing options, output formatting, page targeting,
834
+ * and webhook delivery. Refer to the LlamaParse documentation for details on each
835
+ * field.
836
+ */
837
+ export interface ParseV2Parameters {
838
+ /**
839
+ * Product type.
840
+ */
841
+ product_type: 'parse_v2';
842
+
843
+ /**
844
+ * Parsing tier: 'fast' (rule-based, cheapest), 'cost_effective' (balanced),
845
+ * 'agentic' (AI-powered with custom prompts), or 'agentic_plus' (premium AI with
846
+ * highest accuracy)
847
+ */
848
+ tier: 'fast' | 'cost_effective' | 'agentic' | 'agentic_plus';
849
+
850
+ /**
851
+ * Tier version. Use 'latest' for the current stable version, or pin a dated
852
+ * version (e.g., '2026-03-31') for reproducible results
853
+ */
854
+ version:
855
+ | '2025-12-11'
856
+ | '2025-12-18'
857
+ | '2025-12-31'
858
+ | '2026-01-08'
859
+ | '2026-01-09'
860
+ | '2026-01-16'
861
+ | '2026-01-21'
862
+ | '2026-01-22'
863
+ | '2026-01-24'
864
+ | '2026-01-29'
865
+ | '2026-01-30'
866
+ | '2026-02-03'
867
+ | '2026-02-18'
868
+ | '2026-02-20'
869
+ | '2026-02-24'
870
+ | '2026-02-26'
871
+ | '2026-03-02'
872
+ | '2026-03-03'
873
+ | '2026-03-04'
874
+ | '2026-03-05'
875
+ | '2026-03-09'
876
+ | '2026-03-10'
877
+ | '2026-03-11'
878
+ | '2026-03-12'
879
+ | '2026-03-17'
880
+ | '2026-03-19'
881
+ | '2026-03-20'
882
+ | '2026-03-22'
883
+ | '2026-03-23'
884
+ | '2026-03-24'
885
+ | '2026-03-25'
886
+ | '2026-03-26'
887
+ | '2026-03-27'
888
+ | '2026-03-30'
889
+ | '2026-03-31'
890
+ | 'latest'
891
+ | (string & {});
892
+
893
+ /**
894
+ * Options for AI-powered parsing tiers (cost_effective, agentic, agentic_plus).
895
+ *
896
+ * These options customize how the AI processes and interprets document content.
897
+ * Only applicable when using non-fast tiers.
898
+ */
899
+ agentic_options?: ParseV2Parameters.AgenticOptions | null;
900
+
901
+ /**
902
+ * Identifier for the client/application making the request. Used for analytics and
903
+ * debugging. Example: 'my-app-v2'
904
+ */
905
+ client_name?: string | null;
906
+
907
+ /**
908
+ * Crop boundaries to process only a portion of each page. Values are ratios 0-1
909
+ * from page edges
910
+ */
911
+ crop_box?: ParseV2Parameters.CropBox;
912
+
913
+ /**
914
+ * Bypass result caching and force re-parsing. Use when document content may have
915
+ * changed or you need fresh results
916
+ */
917
+ disable_cache?: boolean | null;
918
+
919
+ /**
920
+ * Options for fast tier parsing (rule-based, no AI).
921
+ *
922
+ * Fast tier uses deterministic algorithms for text extraction without AI
923
+ * enhancement. It's the fastest and most cost-effective option, best suited for
924
+ * simple documents with standard layouts. Currently has no configurable options
925
+ * but is reserved for future expansion.
926
+ */
927
+ fast_options?: unknown | null;
928
+
929
+ /**
930
+ * Format-specific options (HTML, PDF, spreadsheet, presentation). Applied based on
931
+ * detected input file type
932
+ */
933
+ input_options?: ParseV2Parameters.InputOptions;
934
+
935
+ /**
936
+ * Output formatting options for markdown, text, and extracted images
937
+ */
938
+ output_options?: ParseV2Parameters.OutputOptions;
939
+
940
+ /**
941
+ * Page selection: limit total pages or specify exact pages to process
942
+ */
943
+ page_ranges?: ParseV2Parameters.PageRanges;
944
+
945
+ /**
946
+ * Job execution controls including timeouts and failure thresholds
947
+ */
948
+ processing_control?: ParseV2Parameters.ProcessingControl;
949
+
950
+ /**
951
+ * Document processing options including OCR, table extraction, and chart parsing
952
+ */
953
+ processing_options?: ParseV2Parameters.ProcessingOptions;
954
+
955
+ /**
956
+ * Webhook endpoints for job status notifications. Multiple webhooks can be
957
+ * configured for different events or services
958
+ */
959
+ webhook_configurations?: Array<ParseV2Parameters.WebhookConfiguration>;
960
+ }
961
+
962
+ export namespace ParseV2Parameters {
963
+ /**
964
+ * Options for AI-powered parsing tiers (cost_effective, agentic, agentic_plus).
965
+ *
966
+ * These options customize how the AI processes and interprets document content.
967
+ * Only applicable when using non-fast tiers.
968
+ */
969
+ export interface AgenticOptions {
970
+ /**
971
+ * Custom instructions for the AI parser. Use to guide extraction behavior, specify
972
+ * output formatting, or provide domain-specific context. Example: 'Extract
973
+ * financial tables with currency symbols. Format dates as YYYY-MM-DD.'
974
+ */
975
+ custom_prompt?: string | null;
976
+ }
977
+
978
+ /**
979
+ * Crop boundaries to process only a portion of each page. Values are ratios 0-1
980
+ * from page edges
981
+ */
982
+ export interface CropBox {
983
+ /**
984
+ * Bottom boundary as ratio (0-1). 0=top edge, 1=bottom edge. Content below this
985
+ * line is excluded
986
+ */
987
+ bottom?: number | null;
988
+
989
+ /**
990
+ * Left boundary as ratio (0-1). 0=left edge, 1=right edge. Content left of this
991
+ * line is excluded
992
+ */
993
+ left?: number | null;
994
+
995
+ /**
996
+ * Right boundary as ratio (0-1). 0=left edge, 1=right edge. Content right of this
997
+ * line is excluded
998
+ */
999
+ right?: number | null;
1000
+
1001
+ /**
1002
+ * Top boundary as ratio (0-1). 0=top edge, 1=bottom edge. Content above this line
1003
+ * is excluded
1004
+ */
1005
+ top?: number | null;
1006
+ }
1007
+
1008
+ /**
1009
+ * Format-specific options (HTML, PDF, spreadsheet, presentation). Applied based on
1010
+ * detected input file type
1011
+ */
1012
+ export interface InputOptions {
1013
+ /**
1014
+ * HTML/web page parsing options (applies to .html, .htm files)
1015
+ */
1016
+ html?: InputOptions.HTML;
1017
+
1018
+ /**
1019
+ * PDF-specific parsing options (applies to .pdf files)
1020
+ */
1021
+ pdf?: unknown;
1022
+
1023
+ /**
1024
+ * Presentation parsing options (applies to .pptx, .ppt, .odp, .key files)
1025
+ */
1026
+ presentation?: InputOptions.Presentation;
1027
+
1028
+ /**
1029
+ * Spreadsheet parsing options (applies to .xlsx, .xls, .csv, .ods files)
1030
+ */
1031
+ spreadsheet?: InputOptions.Spreadsheet;
1032
+ }
1033
+
1034
+ export namespace InputOptions {
1035
+ /**
1036
+ * HTML/web page parsing options (applies to .html, .htm files)
1037
+ */
1038
+ export interface HTML {
1039
+ /**
1040
+ * Force all HTML elements to be visible by overriding CSS display/visibility
1041
+ * properties. Useful for parsing pages with hidden content or collapsed sections
1042
+ */
1043
+ make_all_elements_visible?: boolean | null;
1044
+
1045
+ /**
1046
+ * Remove fixed-position elements (headers, footers, floating buttons) that appear
1047
+ * on every page render
1048
+ */
1049
+ remove_fixed_elements?: boolean | null;
1050
+
1051
+ /**
1052
+ * Remove navigation elements (nav bars, sidebars, menus) to focus on main content
1053
+ */
1054
+ remove_navigation_elements?: boolean | null;
1055
+ }
1056
+
1057
+ /**
1058
+ * Presentation parsing options (applies to .pptx, .ppt, .odp, .key files)
1059
+ */
1060
+ export interface Presentation {
1061
+ /**
1062
+ * Extract content positioned outside the visible slide area. Some presentations
1063
+ * have hidden notes or content that extends beyond slide boundaries
1064
+ */
1065
+ out_of_bounds_content?: boolean | null;
1066
+
1067
+ /**
1068
+ * Skip extraction of embedded chart data tables. When true, only the visual
1069
+ * representation of charts is captured, not the underlying data
1070
+ */
1071
+ skip_embedded_data?: boolean | null;
1072
+ }
1073
+
1074
+ /**
1075
+ * Spreadsheet parsing options (applies to .xlsx, .xls, .csv, .ods files)
1076
+ */
1077
+ export interface Spreadsheet {
1078
+ /**
1079
+ * Detect and extract multiple tables within a single sheet. Useful when
1080
+ * spreadsheets contain several data regions separated by blank rows/columns
1081
+ */
1082
+ detect_sub_tables_in_sheets?: boolean | null;
1083
+
1084
+ /**
1085
+ * Compute formula results instead of extracting formula text. Use when you need
1086
+ * calculated values rather than formula definitions
1087
+ */
1088
+ force_formula_computation_in_sheets?: boolean | null;
1089
+
1090
+ /**
1091
+ * Parse hidden sheets in addition to visible ones. By default, hidden sheets are
1092
+ * skipped
1093
+ */
1094
+ include_hidden_sheets?: boolean | null;
1095
+ }
1096
+ }
1097
+
1098
+ /**
1099
+ * Output formatting options for markdown, text, and extracted images
1100
+ */
1101
+ export interface OutputOptions {
1102
+ /**
1103
+ * Extract the printed page number as it appears in the document (e.g., 'Page 5 of
1104
+ * 10', 'v', 'A-3'). Useful for referencing original page numbers
1105
+ */
1106
+ extract_printed_page_number?: boolean | null;
1107
+
1108
+ /**
1109
+ * Image categories to extract and save. Options: 'screenshot' (full page renders
1110
+ * useful for visual QA), 'embedded' (images found within the document), 'layout'
1111
+ * (cropped regions from layout detection like figures and diagrams). Empty list
1112
+ * saves no images
1113
+ */
1114
+ images_to_save?: Array<'screenshot' | 'embedded' | 'layout'>;
1115
+
1116
+ /**
1117
+ * Markdown formatting options including table styles and link annotations
1118
+ */
1119
+ markdown?: OutputOptions.Markdown;
1120
+
1121
+ /**
1122
+ * Spatial text output options for preserving document layout structure
1123
+ */
1124
+ spatial_text?: OutputOptions.SpatialText;
1125
+
1126
+ /**
1127
+ * Options for exporting tables as XLSX spreadsheets
1128
+ */
1129
+ tables_as_spreadsheet?: OutputOptions.TablesAsSpreadsheet;
1130
+ }
1131
+
1132
+ export namespace OutputOptions {
1133
+ /**
1134
+ * Markdown formatting options including table styles and link annotations
1135
+ */
1136
+ export interface Markdown {
1137
+ /**
1138
+ * Add link annotations to markdown output in the format [text](url). When false,
1139
+ * only the link text is included
1140
+ */
1141
+ annotate_links?: boolean | null;
1142
+
1143
+ /**
1144
+ * Embed images directly in markdown as base64 data URIs instead of extracting them
1145
+ * as separate files. Useful for self-contained markdown output
1146
+ */
1147
+ inline_images?: boolean | null;
1148
+
1149
+ /**
1150
+ * Table formatting options including markdown vs HTML format and merging behavior
1151
+ */
1152
+ tables?: Markdown.Tables;
1153
+ }
1154
+
1155
+ export namespace Markdown {
1156
+ /**
1157
+ * Table formatting options including markdown vs HTML format and merging behavior
1158
+ */
1159
+ export interface Tables {
1160
+ /**
1161
+ * Remove extra whitespace padding in markdown table cells for more compact output
1162
+ */
1163
+ compact_markdown_tables?: boolean | null;
1164
+
1165
+ /**
1166
+ * Separator string for multiline cell content in markdown tables. Example:
1167
+ * '<br>' to preserve line breaks, ' ' to join with spaces
1168
+ */
1169
+ markdown_table_multiline_separator?: string | null;
1170
+
1171
+ /**
1172
+ * Automatically merge tables that span multiple pages into a single table. The
1173
+ * merged table appears on the first page with merged_from_pages metadata
1174
+ */
1175
+ merge_continued_tables?: boolean | null;
1176
+
1177
+ /**
1178
+ * Output tables as markdown pipe tables instead of HTML <table> tags.
1179
+ * Markdown tables are simpler but cannot represent complex structures like merged
1180
+ * cells
1181
+ */
1182
+ output_tables_as_markdown?: boolean | null;
1183
+ }
1184
+ }
1185
+
1186
+ /**
1187
+ * Spatial text output options for preserving document layout structure
1188
+ */
1189
+ export interface SpatialText {
1190
+ /**
1191
+ * Keep multi-column layouts intact instead of linearizing columns into sequential
1192
+ * text. Automatically enabled for non-fast tiers
1193
+ */
1194
+ do_not_unroll_columns?: boolean | null;
1195
+
1196
+ /**
1197
+ * Maintain consistent text column alignment across page boundaries. Automatically
1198
+ * enabled for document-level parsing modes
1199
+ */
1200
+ preserve_layout_alignment_across_pages?: boolean | null;
1201
+
1202
+ /**
1203
+ * Include text below the normal size threshold. Useful for footnotes, watermarks,
1204
+ * or fine print that might otherwise be filtered out
1205
+ */
1206
+ preserve_very_small_text?: boolean | null;
1207
+ }
1208
+
1209
+ /**
1210
+ * Options for exporting tables as XLSX spreadsheets
1211
+ */
1212
+ export interface TablesAsSpreadsheet {
1213
+ /**
1214
+ * Whether this option is enabled
1215
+ */
1216
+ enable?: boolean | null;
1217
+
1218
+ /**
1219
+ * Automatically generate descriptive sheet names from table context (headers,
1220
+ * surrounding text) instead of using generic names like 'Table_1'
1221
+ */
1222
+ guess_sheet_name?: boolean;
1223
+ }
1224
+ }
1225
+
1226
+ /**
1227
+ * Page selection: limit total pages or specify exact pages to process
1228
+ */
1229
+ export interface PageRanges {
1230
+ /**
1231
+ * Maximum number of pages to process. Pages are processed in order starting from
1232
+ * page 1. If both max_pages and target_pages are set, target_pages takes
1233
+ * precedence
1234
+ */
1235
+ max_pages?: number | null;
1236
+
1237
+ /**
1238
+ * Comma-separated list of specific pages to process using 1-based indexing.
1239
+ * Supports individual pages and ranges. Examples: '1,3,5' (pages 1, 3, 5), '1-5'
1240
+ * (pages 1 through 5 inclusive), '1,3,5-8,10' (pages 1, 3, 5-8, and 10). Pages are
1241
+ * sorted and deduplicated automatically. Duplicate pages cause an error
1242
+ */
1243
+ target_pages?: string | null;
1244
+ }
1245
+
1246
+ /**
1247
+ * Job execution controls including timeouts and failure thresholds
1248
+ */
1249
+ export interface ProcessingControl {
1250
+ /**
1251
+ * Quality thresholds that determine when a job should fail vs complete with
1252
+ * partial results
1253
+ */
1254
+ job_failure_conditions?: ProcessingControl.JobFailureConditions;
1255
+
1256
+ /**
1257
+ * Timeout settings for job execution. Increase for large or complex documents
1258
+ */
1259
+ timeouts?: ProcessingControl.Timeouts;
1260
+ }
1261
+
1262
+ export namespace ProcessingControl {
1263
+ /**
1264
+ * Quality thresholds that determine when a job should fail vs complete with
1265
+ * partial results
1266
+ */
1267
+ export interface JobFailureConditions {
1268
+ /**
1269
+ * Maximum ratio of pages allowed to fail before the job fails (0-1). Example: 0.1
1270
+ * means job fails if more than 10% of pages fail. Default is 0.05 (5%)
1271
+ */
1272
+ allowed_page_failure_ratio?: number | null;
1273
+
1274
+ /**
1275
+ * Fail the job if a problematic font is detected that may cause incorrect text
1276
+ * extraction. Buggy fonts can produce garbled or missing characters
1277
+ */
1278
+ fail_on_buggy_font?: boolean | null;
1279
+
1280
+ /**
1281
+ * Fail the entire job if any embedded image cannot be extracted. By default, image
1282
+ * extraction errors are logged but don't fail the job
1283
+ */
1284
+ fail_on_image_extraction_error?: boolean | null;
1285
+
1286
+ /**
1287
+ * Fail the entire job if OCR fails on any image. By default, OCR errors result in
1288
+ * empty text for that image
1289
+ */
1290
+ fail_on_image_ocr_error?: boolean | null;
1291
+
1292
+ /**
1293
+ * Fail the entire job if markdown cannot be reconstructed for any page. By
1294
+ * default, failed pages use fallback text extraction
1295
+ */
1296
+ fail_on_markdown_reconstruction_error?: boolean | null;
1297
+ }
1298
+
1299
+ /**
1300
+ * Timeout settings for job execution. Increase for large or complex documents
1301
+ */
1302
+ export interface Timeouts {
1303
+ /**
1304
+ * Base timeout for the job in seconds (max 1800 = 30 minutes). This is the minimum
1305
+ * time allowed regardless of document size
1306
+ */
1307
+ base_in_seconds?: number | null;
1308
+
1309
+ /**
1310
+ * Additional timeout per page in seconds (max 300 = 5 minutes). Total timeout =
1311
+ * base + (this value × page count)
1312
+ */
1313
+ extra_time_per_page_in_seconds?: number | null;
1314
+ }
1315
+ }
1316
+
1317
+ /**
1318
+ * Document processing options including OCR, table extraction, and chart parsing
1319
+ */
1320
+ export interface ProcessingOptions {
1321
+ /**
1322
+ * Use aggressive heuristics to detect table boundaries, even without visible
1323
+ * borders. Useful for documents with borderless or complex tables
1324
+ */
1325
+ aggressive_table_extraction?: boolean | null;
1326
+
1327
+ /**
1328
+ * Conditional processing rules that apply different parsing options based on page
1329
+ * content, document structure, or filename patterns. Each entry defines trigger
1330
+ * conditions and the parsing configuration to apply when triggered
1331
+ */
1332
+ auto_mode_configuration?: Array<ProcessingOptions.AutoModeConfiguration> | null;
1333
+
1334
+ /**
1335
+ * Cost optimizer configuration for reducing parsing costs on simpler pages.
1336
+ *
1337
+ * When enabled, the parser analyzes each page and routes simpler pages to faster,
1338
+ * cheaper processing while preserving quality for complex pages. Only works with
1339
+ * 'agentic' or 'agentic_plus' tiers.
1340
+ */
1341
+ cost_optimizer?: ProcessingOptions.CostOptimizer | null;
1342
+
1343
+ /**
1344
+ * Disable automatic heuristics including outlined table extraction and adaptive
1345
+ * long table handling. Use when heuristics produce incorrect results
1346
+ */
1347
+ disable_heuristics?: boolean | null;
1348
+
1349
+ /**
1350
+ * Options for ignoring specific text types (diagonal, hidden, text in images)
1351
+ */
1352
+ ignore?: ProcessingOptions.Ignore;
1353
+
1354
+ /**
1355
+ * OCR configuration including language detection settings
1356
+ */
1357
+ ocr_parameters?: ProcessingOptions.OcrParameters;
1358
+
1359
+ /**
1360
+ * Enable AI-powered chart analysis. Modes: 'efficient' (fast, lower cost),
1361
+ * 'agentic' (balanced), 'agentic_plus' (highest accuracy). Automatically enables
1362
+ * extract_layout and precise_bounding_box when set
1363
+ */
1364
+ specialized_chart_parsing?: 'agentic_plus' | 'agentic' | 'efficient' | null;
1365
+ }
1366
+
1367
+ export namespace ProcessingOptions {
1368
+ /**
1369
+ * A single auto mode rule with trigger conditions and parsing configuration.
1370
+ *
1371
+ * Auto mode allows conditional parsing where different configurations are applied
1372
+ * based on page content, structure, or filename. When triggers match, the
1373
+ * parsing_conf overrides default settings for that page.
1374
+ */
1375
+ export interface AutoModeConfiguration {
1376
+ /**
1377
+ * Parsing configuration to apply when trigger conditions are met
1378
+ */
1379
+ parsing_conf: AutoModeConfiguration.ParsingConf;
1380
+
1381
+ /**
1382
+ * Single glob pattern to match against filename
1383
+ */
1384
+ filename_match_glob?: string | null;
1385
+
1386
+ /**
1387
+ * List of glob patterns to match against filename
1388
+ */
1389
+ filename_match_glob_list?: Array<string> | null;
1390
+
1391
+ /**
1392
+ * Regex pattern to match against filename
1393
+ */
1394
+ filename_regexp?: string | null;
1395
+
1396
+ /**
1397
+ * Regex mode flags (e.g., 'i' for case-insensitive)
1398
+ */
1399
+ filename_regexp_mode?: string | null;
1400
+
1401
+ /**
1402
+ * Trigger if page contains a full-page image (scanned page detection)
1403
+ */
1404
+ full_page_image_in_page?: boolean | null;
1405
+
1406
+ /**
1407
+ * Threshold for full page image detection (0.0-1.0, default 0.8)
1408
+ */
1409
+ full_page_image_in_page_threshold?: number | string | null;
1410
+
1411
+ /**
1412
+ * Trigger if page contains non-screenshot images
1413
+ */
1414
+ image_in_page?: boolean | null;
1415
+
1416
+ /**
1417
+ * Trigger if page contains this layout element type
1418
+ */
1419
+ layout_element_in_page?: string | null;
1420
+
1421
+ /**
1422
+ * Confidence threshold for layout element detection
1423
+ */
1424
+ layout_element_in_page_confidence_threshold?: number | string | null;
1425
+
1426
+ /**
1427
+ * Trigger if page has more than N charts
1428
+ */
1429
+ page_contains_at_least_n_charts?: number | string | null;
1430
+
1431
+ /**
1432
+ * Trigger if page has more than N images
1433
+ */
1434
+ page_contains_at_least_n_images?: number | string | null;
1435
+
1436
+ /**
1437
+ * Trigger if page has more than N layout elements
1438
+ */
1439
+ page_contains_at_least_n_layout_elements?: number | string | null;
1440
+
1441
+ /**
1442
+ * Trigger if page has more than N lines
1443
+ */
1444
+ page_contains_at_least_n_lines?: number | string | null;
1445
+
1446
+ /**
1447
+ * Trigger if page has more than N links
1448
+ */
1449
+ page_contains_at_least_n_links?: number | string | null;
1450
+
1451
+ /**
1452
+ * Trigger if page has more than N numeric words
1453
+ */
1454
+ page_contains_at_least_n_numbers?: number | string | null;
1455
+
1456
+ /**
1457
+ * Trigger if page has more than N% numeric words
1458
+ */
1459
+ page_contains_at_least_n_percent_numbers?: number | string | null;
1460
+
1461
+ /**
1462
+ * Trigger if page has more than N tables
1463
+ */
1464
+ page_contains_at_least_n_tables?: number | string | null;
1465
+
1466
+ /**
1467
+ * Trigger if page has more than N words
1468
+ */
1469
+ page_contains_at_least_n_words?: number | string | null;
1470
+
1471
+ /**
1472
+ * Trigger if page has fewer than N charts
1473
+ */
1474
+ page_contains_at_most_n_charts?: number | string | null;
1475
+
1476
+ /**
1477
+ * Trigger if page has fewer than N images
1478
+ */
1479
+ page_contains_at_most_n_images?: number | string | null;
1480
+
1481
+ /**
1482
+ * Trigger if page has fewer than N layout elements
1483
+ */
1484
+ page_contains_at_most_n_layout_elements?: number | string | null;
1485
+
1486
+ /**
1487
+ * Trigger if page has fewer than N lines
1488
+ */
1489
+ page_contains_at_most_n_lines?: number | string | null;
1490
+
1491
+ /**
1492
+ * Trigger if page has fewer than N links
1493
+ */
1494
+ page_contains_at_most_n_links?: number | string | null;
1495
+
1496
+ /**
1497
+ * Trigger if page has fewer than N numeric words
1498
+ */
1499
+ page_contains_at_most_n_numbers?: number | string | null;
1500
+
1501
+ /**
1502
+ * Trigger if page has fewer than N% numeric words
1503
+ */
1504
+ page_contains_at_most_n_percent_numbers?: number | string | null;
1505
+
1506
+ /**
1507
+ * Trigger if page has fewer than N tables
1508
+ */
1509
+ page_contains_at_most_n_tables?: number | string | null;
1510
+
1511
+ /**
1512
+ * Trigger if page has fewer than N words
1513
+ */
1514
+ page_contains_at_most_n_words?: number | string | null;
1515
+
1516
+ /**
1517
+ * Trigger if page has more than N characters
1518
+ */
1519
+ page_longer_than_n_chars?: number | string | null;
1520
+
1521
+ /**
1522
+ * Trigger on pages with markdown extraction errors
1523
+ */
1524
+ page_md_error?: boolean | null;
1525
+
1526
+ /**
1527
+ * Trigger if page has fewer than N characters
1528
+ */
1529
+ page_shorter_than_n_chars?: number | string | null;
1530
+
1531
+ /**
1532
+ * Regex pattern to match in page content
1533
+ */
1534
+ regexp_in_page?: string | null;
1535
+
1536
+ /**
1537
+ * Regex mode flags for regexp_in_page
1538
+ */
1539
+ regexp_in_page_mode?: string | null;
1540
+
1541
+ /**
1542
+ * Trigger if page contains a table
1543
+ */
1544
+ table_in_page?: boolean | null;
1545
+
1546
+ /**
1547
+ * Trigger if page text/markdown contains this string
1548
+ */
1549
+ text_in_page?: string | null;
1550
+
1551
+ /**
1552
+ * How to combine multiple trigger conditions: 'and' (all conditions must match,
1553
+ * this is the default) or 'or' (any single condition can trigger)
1554
+ */
1555
+ trigger_mode?: string | null;
1556
+ }
1557
+
1558
+ export namespace AutoModeConfiguration {
1559
+ /**
1560
+ * Parsing configuration to apply when trigger conditions are met
1561
+ */
1562
+ export interface ParsingConf {
1563
+ /**
1564
+ * Whether to use adaptive long table handling
1565
+ */
1566
+ adaptive_long_table?: boolean | null;
1567
+
1568
+ /**
1569
+ * Whether to use aggressive table extraction
1570
+ */
1571
+ aggressive_table_extraction?: boolean | null;
1572
+
1573
+ /**
1574
+ * Crop box options for auto mode parsing configuration.
1575
+ */
1576
+ crop_box?: ParsingConf.CropBox | null;
1577
+
1578
+ /**
1579
+ * Custom AI instructions for matched pages. Overrides the base custom_prompt
1580
+ */
1581
+ custom_prompt?: string | null;
1582
+
1583
+ /**
1584
+ * Whether to extract layout information
1585
+ */
1586
+ extract_layout?: boolean | null;
1587
+
1588
+ /**
1589
+ * Whether to use high resolution OCR
1590
+ */
1591
+ high_res_ocr?: boolean | null;
1592
+
1593
+ /**
1594
+ * Ignore options for auto mode parsing configuration.
1595
+ */
1596
+ ignore?: ParsingConf.Ignore | null;
1597
+
1598
+ /**
1599
+ * Primary language of the document
1600
+ */
1601
+ language?: string | null;
1602
+
1603
+ /**
1604
+ * Whether to use outlined table extraction
1605
+ */
1606
+ outlined_table_extraction?: boolean | null;
1607
+
1608
+ /**
1609
+ * Presentation-specific options for auto mode parsing configuration.
1610
+ */
1611
+ presentation?: ParsingConf.Presentation | null;
1612
+
1613
+ /**
1614
+ * Spatial text options for auto mode parsing configuration.
1615
+ */
1616
+ spatial_text?: ParsingConf.SpatialText | null;
1617
+
1618
+ /**
1619
+ * Enable specialized chart parsing with the specified mode
1620
+ */
1621
+ specialized_chart_parsing?: 'agentic_plus' | 'agentic' | 'efficient' | null;
1622
+
1623
+ /**
1624
+ * Override the parsing tier for matched pages. Must be paired with version
1625
+ */
1626
+ tier?: 'fast' | 'cost_effective' | 'agentic' | 'agentic_plus' | null;
1627
+
1628
+ /**
1629
+ * Tier version when overriding tier. Required when tier is specified
1630
+ */
1631
+ version?:
1632
+ | '2025-12-11'
1633
+ | '2025-12-18'
1634
+ | '2025-12-31'
1635
+ | '2026-01-08'
1636
+ | '2026-01-09'
1637
+ | '2026-01-16'
1638
+ | '2026-01-21'
1639
+ | '2026-01-22'
1640
+ | '2026-01-24'
1641
+ | '2026-01-29'
1642
+ | '2026-01-30'
1643
+ | '2026-02-03'
1644
+ | '2026-02-18'
1645
+ | '2026-02-20'
1646
+ | '2026-02-24'
1647
+ | '2026-02-26'
1648
+ | '2026-03-02'
1649
+ | '2026-03-03'
1650
+ | '2026-03-04'
1651
+ | '2026-03-05'
1652
+ | '2026-03-09'
1653
+ | '2026-03-10'
1654
+ | '2026-03-11'
1655
+ | '2026-03-12'
1656
+ | '2026-03-17'
1657
+ | '2026-03-19'
1658
+ | '2026-03-20'
1659
+ | '2026-03-22'
1660
+ | '2026-03-23'
1661
+ | '2026-03-24'
1662
+ | '2026-03-25'
1663
+ | '2026-03-26'
1664
+ | '2026-03-27'
1665
+ | '2026-03-30'
1666
+ | '2026-03-31'
1667
+ | 'latest'
1668
+ | (string & {})
1669
+ | null;
1670
+ }
1671
+
1672
+ export namespace ParsingConf { // Nested option groups for auto mode parsing configuration: CropBox, Ignore, Presentation, SpatialText.
1673
+ /**
1674
+ * Crop box options for auto mode parsing configuration.
1675
+ */
1676
+ export interface CropBox {
1677
+ /**
1678
+ * Bottom boundary of the crop box, as a ratio (0-1).
1679
+ */
1680
+ bottom?: number | null;
1681
+
1682
+ /**
1683
+ * Left boundary of the crop box, as a ratio (0-1).
1684
+ */
1685
+ left?: number | null;
1686
+
1687
+ /**
1688
+ * Right boundary of the crop box, as a ratio (0-1).
1689
+ */
1690
+ right?: number | null;
1691
+
1692
+ /**
1693
+ * Top boundary of the crop box, as a ratio (0-1).
1694
+ */
1695
+ top?: number | null;
1696
+ }
1697
+
1698
+ /**
1699
+ * Ignore options for auto mode parsing configuration.
1700
+ */
1701
+ export interface Ignore {
1702
+ /**
1703
+ * Whether to ignore diagonal text in the document.
1704
+ */
1705
+ ignore_diagonal_text?: boolean | null;
1706
+
1707
+ /**
1708
+ * Whether to ignore hidden text in the document.
1709
+ */
1710
+ ignore_hidden_text?: boolean | null;
1711
+ }
1712
+
1713
+ /**
1714
+ * Presentation-specific options for auto mode parsing configuration.
1715
+ */
1716
+ export interface Presentation {
1717
+ /**
1718
+ * Whether to extract out-of-bounds content in presentation slides.
1719
+ */
1720
+ out_of_bounds_content?: boolean | null;
1721
+
1722
+ /**
1723
+ * Whether to skip extraction of embedded data for charts in presentation slides.
1724
+ */
1725
+ skip_embedded_data?: boolean | null;
1726
+ }
1727
+
1728
+ /**
1729
+ * Spatial text options for auto mode parsing configuration.
1730
+ */
1731
+ export interface SpatialText {
1732
+ /**
1733
+ * Whether to keep the column structure intact instead of unrolling columns.
1734
+ */
1735
+ do_not_unroll_columns?: boolean | null;
1736
+
1737
+ /**
1738
+ * Whether to preserve text alignment across page boundaries.
1739
+ */
1740
+ preserve_layout_alignment_across_pages?: boolean | null;
1741
+
1742
+ /**
1743
+ * Whether to include very small text in the spatial output.
1744
+ */
1745
+ preserve_very_small_text?: boolean | null;
1746
+ }
1747
+ }
1748
+ }
1749
+
1750
+ /**
1751
+ * Cost optimizer configuration for reducing parsing costs on simpler pages.
1752
+ *
1753
+ * When enabled, the parser analyzes each page and routes simpler pages to faster,
1754
+ * cheaper processing while preserving quality for complex pages. Only works with
1755
+ * the 'agentic' or 'agentic_plus' tiers.
1756
+ */
1757
+ export interface CostOptimizer {
1758
+ /**
1759
+ * Whether to enable cost-optimized parsing. Routes simpler pages to faster processing
1760
+ * while complex pages use full AI analysis. May reduce speed on some documents.
1761
+ * IMPORTANT: Only available with the 'agentic' or 'agentic_plus' tiers.
1762
+ */
1763
+ enable?: boolean | null;
1764
+ }
1765
+
1766
+ /**
1767
+ * Options for ignoring specific kinds of text: diagonal text, hidden text, and text in images.
1768
+ */
1769
+ export interface Ignore {
1770
+ /**
1771
+ * Skip text rotated at an angle (neither horizontal nor vertical). Useful for ignoring
1772
+ * watermarks or decorative angled text.
1773
+ */
1774
+ ignore_diagonal_text?: boolean | null;
1775
+
1776
+ /**
1777
+ * Skip text marked as hidden in the document structure. Some PDFs contain
1778
+ * invisible text layers used for accessibility or search indexing.
1779
+ */
1780
+ ignore_hidden_text?: boolean | null;
1781
+
1782
+ /**
1783
+ * Skip OCR text extraction from embedded images. Use when images contain
1784
+ * irrelevant text (watermarks, logos) that shouldn't be in the output.
1785
+ */
1786
+ ignore_text_in_image?: boolean | null;
1787
+ }
1788
+
1789
+ /**
1790
+ * OCR configuration, including language detection settings.
1791
+ */
1792
+ export interface OcrParameters {
1793
+ /**
1794
+ * Languages to use for OCR text recognition. Specify multiple languages if the
1795
+ * document contains mixed-language content. Order matters: put the primary language
1796
+ * first. Example: ['en', 'es'] for English with Spanish.
1797
+ */
1798
+ languages?: Array<ParsingAPI.ParsingLanguages> | null;
1799
+ }
1800
+ }
1801
+
1802
+ /**
1803
+ * Webhook configuration for receiving parsing job notifications.
1804
+ *
1805
+ * Webhooks are called when specified events occur during job processing. Configure
1806
+ * multiple webhook configurations to send to different endpoints.
1807
+ */
1808
+ export interface WebhookConfiguration {
1809
+ /**
1810
+ * Events that trigger this webhook. Options: 'parse.success' (job completed),
1811
+ * 'parse.failure' (job failed), 'parse.partial' (some pages failed). If not
1812
+ * specified, the webhook fires for all events.
1813
+ */
1814
+ webhook_events?: Array<string> | null; // NOTE(review): typed as free-form strings, so the event names listed above are not enforced by the type
1815
+
1816
+ /**
1817
+ * Custom HTTP headers to include in webhook requests. Use for authentication
1818
+ * tokens or custom routing. Example: {'Authorization': 'Bearer xyz'}
1819
+ */
1820
+ webhook_headers?: { [key: string]: unknown } | null; // NOTE(review): header values are typed `unknown`, not `string` — confirm what non-string values mean to the server
1821
+
1822
+ /**
1823
+ * HTTPS URL to receive webhook POST requests. Must be publicly accessible.
1824
+ */
1825
+ webhook_url?: string | null;
1826
+ }
1827
+ }
1828
+
1829
+ /**
1830
+ * Catch-all for configurations without a dedicated typed schema.
1831
+ *
1832
+ * Accepts arbitrary JSON fields alongside `product_type`.
1833
+ */
1834
+ export interface UntypedParameters {
1835
+ /**
1836
+ * Product type; fixed to the literal 'unknown' for this catch-all shape.
1837
+ */
1838
+ product_type: 'unknown';
1839
+
1840
+ [k: string]: unknown; // index signature: admits any additional fields, each typed `unknown`
1841
+ }
1842
+ }
1843
+
1844
+ export interface ExtractCreateParams { // Parameters for creating an extract job: `file_input` is the only required field; each field's comment says whether it is a body or query param.
1845
+ /**
1846
+ * Body param: File ID or parse job ID to extract from.
1847
+ */
1848
+ file_input: string;
1849
+
1850
+ /**
1851
+ * Query param
1852
+ */
1853
+ organization_id?: string | null;
1854
+
1855
+ /**
1856
+ * Query param
1857
+ */
1858
+ project_id?: string | null;
1859
+
1860
+ /**
1861
+ * Body param: Extract configuration combining parse and extract settings.
1862
+ */
1863
+ configuration?: ExtractConfiguration | null;
1864
+
1865
+ /**
1866
+ * Body param: ID of a saved configuration.
1867
+ */
1868
+ configuration_id?: string | null; // NOTE(review): how this interacts with an inline `configuration` when both are given is not stated here — confirm against the API reference
1869
+
1870
+ /**
1871
+ * Body param: Outbound webhook endpoints to notify on job status changes.
1872
+ */
1873
+ webhook_configurations?: Array<ExtractCreateParams.WebhookConfiguration> | null;
1874
+ }
1875
+
1876
+ export namespace ExtractCreateParams { // Shares its name with the ExtractCreateParams interface (declaration merging); holds that interface's nested types.
1877
+ /**
1878
+ * Configuration for a single outbound webhook endpoint.
1879
+ */
1880
+ export interface WebhookConfiguration {
1881
+ /**
1882
+ * Events to subscribe to, drawn from the extract.*, parse.* and classify.* families
1883
+ * (e.g. 'parse.success', 'extract.error'). If null, all events are delivered.
1884
+ */
1885
+ webhook_events?: Array<
1886
+ | 'extract.pending'
1887
+ | 'extract.success'
1888
+ | 'extract.error'
1889
+ | 'extract.partial_success'
1890
+ | 'extract.cancelled'
1891
+ | 'parse.pending'
1892
+ | 'parse.running'
1893
+ | 'parse.success'
1894
+ | 'parse.error'
1895
+ | 'parse.partial_success'
1896
+ | 'parse.cancelled'
1897
+ | 'classify.pending'
1898
+ | 'classify.success'
1899
+ | 'classify.error'
1900
+ | 'classify.partial_success'
1901
+ | 'classify.cancelled'
1902
+ | 'unmapped_event'
1903
+ > | null;
1904
+
1905
+ /**
1906
+ * Custom HTTP headers sent with each webhook request (e.g. auth tokens).
1907
+ */
1908
+ webhook_headers?: { [key: string]: string } | null;
1909
+
1910
+ /**
1911
+ * Response format sent to the webhook: 'string' (default) or 'json'.
1912
+ */
1913
+ webhook_output_format?: string | null; // NOTE(review): typed as a plain string, so the two documented values are not enforced by the type
1914
+
1915
+ /**
1916
+ * URL to receive webhook POST notifications.
1917
+ */
1918
+ webhook_url?: string | null;
1919
+ }
1920
+ }
1921
+
1922
+ export interface ExtractListParams extends PaginatedCursorParams { // Optional filter and expansion options for listing extract jobs, in addition to the inherited PaginatedCursorParams fields.
1923
+ /**
1924
+ * Filter by configuration ID.
1925
+ */
1926
+ configuration_id?: string | null;
1927
+
1928
+ /**
1929
+ * Include items created at or after this timestamp (inclusive).
1930
+ */
1931
+ created_at_on_or_after?: string | null; // NOTE(review): timestamp string format is not stated here (presumably ISO 8601) — confirm
1932
+
1933
+ /**
1934
+ * Include items created at or before this timestamp (inclusive).
1935
+ */
1936
+ created_at_on_or_before?: string | null;
1937
+
1938
+ /**
1939
+ * Filter by document input type (file_id or parse_job_id).
1940
+ */
1941
+ document_input_type?: string | null;
1942
+
1943
+ /**
1944
+ * @deprecated Use `file_input` instead.
1945
+ */
1946
+ document_input_value?: string | null;
1947
+
1948
+ /**
1949
+ * Additional fields to include: configuration, extract_metadata.
1950
+ */
1951
+ expand?: Array<string>;
1952
+
1953
+ /**
1954
+ * Filter by file input value.
1955
+ */
1956
+ file_input?: string | null;
1957
+
1958
+ /**
1959
+ * Filter by specific job IDs.
1960
+ */
1961
+ job_ids?: Array<string> | null;
1962
+
1963
+ organization_id?: string | null;
1964
+
1965
+ project_id?: string | null;
1966
+
1967
+ /**
1968
+ * Filter by job status.
1969
+ */
1970
+ status?: 'PENDING' | 'THROTTLED' | 'RUNNING' | 'COMPLETED' | 'FAILED' | 'CANCELLED' | null;
1971
+ }
1972
+
1973
+ export interface ExtractDeleteParams { // Optional organization/project identifiers for the delete call; both may be omitted or null.
1974
+ organization_id?: string | null;
1975
+
1976
+ project_id?: string | null;
1977
+ }
1978
+
1979
+ export interface ExtractGenerateSchemaParams { // Parameters for schema generation; every field is optional, and each field's comment says whether it is a body or query param.
1980
+ /**
1981
+ * Query param
1982
+ */
1983
+ organization_id?: string | null;
1984
+
1985
+ /**
1986
+ * Query param
1987
+ */
1988
+ project_id?: string | null;
1989
+
1990
+ /**
1991
+ * Body param: Optional schema to validate, refine, or extend.
1992
+ */
1993
+ data_schema?: {
1994
+ [key: string]: { [key: string]: unknown } | Array<unknown> | string | number | boolean | null; // each value is any JSON value: object, array, string, number, boolean, or null
1995
+ } | null;
1996
+
1997
+ /**
1998
+ * Body param: Optional file ID to analyze for schema generation.
1999
+ */
2000
+ file_id?: string | null;
2001
+
2002
+ /**
2003
+ * Body param: Name for the generated configuration (auto-generated if omitted).
2004
+ */
2005
+ name?: string | null;
2006
+
2007
+ /**
2008
+ * Body param: Natural language description of the data structure to extract.
2009
+ */
2010
+ prompt?: string | null;
2011
+ }
2012
+
2013
+ export interface ExtractGetParams { // Optional parameters for fetching a single extract job; all fields may be omitted.
2014
+ /**
2015
+ * Additional fields to include: configuration, extract_metadata.
2016
+ */
2017
+ expand?: Array<string>;
2018
+
2019
+ organization_id?: string | null;
2020
+
2021
+ project_id?: string | null;
2022
+ }
2023
+
2024
+ export interface ExtractValidateSchemaParams { // Parameters for schema validation; `data_schema` is required and non-nullable.
2025
+ /**
2026
+ * JSON Schema to validate for use with extract jobs.
2027
+ */
2028
+ data_schema: {
2029
+ [key: string]: { [key: string]: unknown } | Array<unknown> | string | number | boolean | null; // each value is any JSON value: object, array, string, number, boolean, or null
2030
+ };
2031
+ }
2032
+
2033
+ export declare namespace Extract { // Exposes the types below as members of the Extract namespace (e.g. `Extract.ExtractCreateParams`).
2034
+ export { // type-only re-exports: every entry uses the `type` modifier and keeps its original name
2035
+ type ExtractConfiguration as ExtractConfiguration,
2036
+ type ExtractJobMetadata as ExtractJobMetadata,
2037
+ type ExtractJobUsage as ExtractJobUsage,
2038
+ type ExtractV2Job as ExtractV2Job,
2039
+ type ExtractV2JobCreate as ExtractV2JobCreate,
2040
+ type ExtractV2JobQueryResponse as ExtractV2JobQueryResponse,
2041
+ type ExtractV2SchemaGenerateRequest as ExtractV2SchemaGenerateRequest,
2042
+ type ExtractV2SchemaValidateRequest as ExtractV2SchemaValidateRequest,
2043
+ type ExtractV2SchemaValidateResponse as ExtractV2SchemaValidateResponse,
2044
+ type ExtractedFieldMetadata as ExtractedFieldMetadata,
2045
+ type ExtractDeleteResponse as ExtractDeleteResponse,
2046
+ type ExtractGenerateSchemaResponse as ExtractGenerateSchemaResponse,
2047
+ type ExtractV2JobsPaginatedCursor as ExtractV2JobsPaginatedCursor,
2048
+ type ExtractCreateParams as ExtractCreateParams,
2049
+ type ExtractListParams as ExtractListParams,
2050
+ type ExtractDeleteParams as ExtractDeleteParams,
2051
+ type ExtractGenerateSchemaParams as ExtractGenerateSchemaParams,
2052
+ type ExtractGetParams as ExtractGetParams,
2053
+ type ExtractValidateSchemaParams as ExtractValidateSchemaParams,
2054
+ };
2055
+ }