chub-dev 0.1.0 → 0.1.2-beta.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (139) hide show
  1. package/README.md +55 -0
  2. package/bin/chub-mcp +2 -0
  3. package/dist/airtable/docs/database/javascript/DOC.md +1437 -0
  4. package/dist/airtable/docs/database/python/DOC.md +1735 -0
  5. package/dist/amplitude/docs/analytics/javascript/DOC.md +1282 -0
  6. package/dist/amplitude/docs/analytics/python/DOC.md +1199 -0
  7. package/dist/anthropic/docs/claude-api/javascript/DOC.md +503 -0
  8. package/dist/anthropic/docs/claude-api/python/DOC.md +389 -0
  9. package/dist/asana/docs/tasks/DOC.md +1396 -0
  10. package/dist/assemblyai/docs/transcription/DOC.md +1043 -0
  11. package/dist/atlassian/docs/confluence/javascript/DOC.md +1347 -0
  12. package/dist/atlassian/docs/confluence/python/DOC.md +1604 -0
  13. package/dist/auth0/docs/identity/javascript/DOC.md +968 -0
  14. package/dist/auth0/docs/identity/python/DOC.md +1199 -0
  15. package/dist/aws/docs/s3/javascript/DOC.md +1773 -0
  16. package/dist/aws/docs/s3/python/DOC.md +1807 -0
  17. package/dist/binance/docs/trading/javascript/DOC.md +1315 -0
  18. package/dist/binance/docs/trading/python/DOC.md +1454 -0
  19. package/dist/braintree/docs/gateway/javascript/DOC.md +1278 -0
  20. package/dist/braintree/docs/gateway/python/DOC.md +1179 -0
  21. package/dist/chromadb/docs/embeddings-db/javascript/DOC.md +1263 -0
  22. package/dist/chromadb/docs/embeddings-db/python/DOC.md +1707 -0
  23. package/dist/clerk/docs/auth/javascript/DOC.md +1220 -0
  24. package/dist/clerk/docs/auth/python/DOC.md +274 -0
  25. package/dist/cloudflare/docs/workers/javascript/DOC.md +918 -0
  26. package/dist/cloudflare/docs/workers/python/DOC.md +994 -0
  27. package/dist/cockroachdb/docs/distributed-db/DOC.md +1500 -0
  28. package/dist/cohere/docs/llm/DOC.md +1335 -0
  29. package/dist/datadog/docs/monitoring/javascript/DOC.md +1740 -0
  30. package/dist/datadog/docs/monitoring/python/DOC.md +1815 -0
  31. package/dist/deepgram/docs/speech/javascript/DOC.md +885 -0
  32. package/dist/deepgram/docs/speech/python/DOC.md +685 -0
  33. package/dist/deepl/docs/translation/javascript/DOC.md +887 -0
  34. package/dist/deepl/docs/translation/python/DOC.md +944 -0
  35. package/dist/deepseek/docs/llm/DOC.md +1220 -0
  36. package/dist/directus/docs/headless-cms/javascript/DOC.md +1128 -0
  37. package/dist/directus/docs/headless-cms/python/DOC.md +1276 -0
  38. package/dist/discord/docs/bot/javascript/DOC.md +1090 -0
  39. package/dist/discord/docs/bot/python/DOC.md +1130 -0
  40. package/dist/elasticsearch/docs/search/DOC.md +1634 -0
  41. package/dist/elevenlabs/docs/text-to-speech/javascript/DOC.md +336 -0
  42. package/dist/elevenlabs/docs/text-to-speech/python/DOC.md +552 -0
  43. package/dist/firebase/docs/auth/DOC.md +1015 -0
  44. package/dist/gemini/docs/genai/javascript/DOC.md +691 -0
  45. package/dist/gemini/docs/genai/python/DOC.md +555 -0
  46. package/dist/github/docs/octokit/DOC.md +1560 -0
  47. package/dist/google/docs/bigquery/javascript/DOC.md +1688 -0
  48. package/dist/google/docs/bigquery/python/DOC.md +1503 -0
  49. package/dist/hubspot/docs/crm/javascript/DOC.md +1805 -0
  50. package/dist/hubspot/docs/crm/python/DOC.md +2033 -0
  51. package/dist/huggingface/docs/transformers/DOC.md +948 -0
  52. package/dist/intercom/docs/messaging/javascript/DOC.md +1844 -0
  53. package/dist/intercom/docs/messaging/python/DOC.md +1797 -0
  54. package/dist/jira/docs/issues/javascript/DOC.md +1420 -0
  55. package/dist/jira/docs/issues/python/DOC.md +1492 -0
  56. package/dist/kafka/docs/streaming/javascript/DOC.md +1671 -0
  57. package/dist/kafka/docs/streaming/python/DOC.md +1464 -0
  58. package/dist/landingai-ade/docs/api/DOC.md +620 -0
  59. package/dist/landingai-ade/docs/sdk/python/DOC.md +489 -0
  60. package/dist/landingai-ade/docs/sdk/typescript/DOC.md +542 -0
  61. package/dist/landingai-ade/skills/SKILL.md +489 -0
  62. package/dist/launchdarkly/docs/feature-flags/javascript/DOC.md +1191 -0
  63. package/dist/launchdarkly/docs/feature-flags/python/DOC.md +1671 -0
  64. package/dist/linear/docs/tracker/DOC.md +1554 -0
  65. package/dist/livekit/docs/realtime/javascript/DOC.md +303 -0
  66. package/dist/livekit/docs/realtime/python/DOC.md +163 -0
  67. package/dist/mailchimp/docs/marketing/DOC.md +1420 -0
  68. package/dist/meilisearch/docs/search/DOC.md +1241 -0
  69. package/dist/microsoft/docs/onedrive/javascript/DOC.md +1421 -0
  70. package/dist/microsoft/docs/onedrive/python/DOC.md +1549 -0
  71. package/dist/mongodb/docs/atlas/DOC.md +2041 -0
  72. package/dist/notion/docs/workspace-api/javascript/DOC.md +1435 -0
  73. package/dist/notion/docs/workspace-api/python/DOC.md +1400 -0
  74. package/dist/okta/docs/identity/javascript/DOC.md +1171 -0
  75. package/dist/okta/docs/identity/python/DOC.md +1401 -0
  76. package/dist/openai/docs/chat/javascript/DOC.md +407 -0
  77. package/dist/openai/docs/chat/python/DOC.md +568 -0
  78. package/dist/paypal/docs/checkout/DOC.md +278 -0
  79. package/dist/pinecone/docs/sdk/javascript/DOC.md +984 -0
  80. package/dist/pinecone/docs/sdk/python/DOC.md +1395 -0
  81. package/dist/plaid/docs/banking/javascript/DOC.md +1163 -0
  82. package/dist/plaid/docs/banking/python/DOC.md +1203 -0
  83. package/dist/playwright-community/skills/login-flows/SKILL.md +108 -0
  84. package/dist/postmark/docs/transactional-email/DOC.md +1168 -0
  85. package/dist/prisma/docs/orm/javascript/DOC.md +1419 -0
  86. package/dist/prisma/docs/orm/python/DOC.md +1317 -0
  87. package/dist/qdrant/docs/vector-search/javascript/DOC.md +1221 -0
  88. package/dist/qdrant/docs/vector-search/python/DOC.md +1653 -0
  89. package/dist/rabbitmq/docs/message-queue/javascript/DOC.md +1193 -0
  90. package/dist/rabbitmq/docs/message-queue/python/DOC.md +1243 -0
  91. package/dist/razorpay/docs/payments/javascript/DOC.md +1219 -0
  92. package/dist/razorpay/docs/payments/python/DOC.md +1330 -0
  93. package/dist/redis/docs/key-value/javascript/DOC.md +1851 -0
  94. package/dist/redis/docs/key-value/python/DOC.md +2054 -0
  95. package/dist/registry.json +2817 -0
  96. package/dist/replicate/docs/model-hosting/DOC.md +1318 -0
  97. package/dist/resend/docs/email/DOC.md +1271 -0
  98. package/dist/salesforce/docs/crm/javascript/DOC.md +1241 -0
  99. package/dist/salesforce/docs/crm/python/DOC.md +1183 -0
  100. package/dist/search-index.json +1 -0
  101. package/dist/sendgrid/docs/email-api/javascript/DOC.md +371 -0
  102. package/dist/sendgrid/docs/email-api/python/DOC.md +656 -0
  103. package/dist/sentry/docs/error-tracking/javascript/DOC.md +1073 -0
  104. package/dist/sentry/docs/error-tracking/python/DOC.md +1309 -0
  105. package/dist/shopify/docs/storefront/DOC.md +457 -0
  106. package/dist/slack/docs/workspace/javascript/DOC.md +933 -0
  107. package/dist/slack/docs/workspace/python/DOC.md +271 -0
  108. package/dist/square/docs/payments/javascript/DOC.md +1855 -0
  109. package/dist/square/docs/payments/python/DOC.md +1728 -0
  110. package/dist/stripe/docs/api/DOC.md +1727 -0
  111. package/dist/stripe/docs/payments/DOC.md +1726 -0
  112. package/dist/stytch/docs/auth/javascript/DOC.md +1813 -0
  113. package/dist/stytch/docs/auth/python/DOC.md +1962 -0
  114. package/dist/supabase/docs/client/DOC.md +1606 -0
  115. package/dist/twilio/docs/messaging/python/DOC.md +469 -0
  116. package/dist/twilio/docs/messaging/typescript/DOC.md +946 -0
  117. package/dist/vercel/docs/platform/DOC.md +1940 -0
  118. package/dist/weaviate/docs/vector-db/javascript/DOC.md +1268 -0
  119. package/dist/weaviate/docs/vector-db/python/DOC.md +1388 -0
  120. package/dist/zendesk/docs/support/javascript/DOC.md +2150 -0
  121. package/dist/zendesk/docs/support/python/DOC.md +2297 -0
  122. package/package.json +22 -6
  123. package/skills/get-api-docs/SKILL.md +84 -0
  124. package/src/commands/annotate.js +83 -0
  125. package/src/commands/build.js +12 -1
  126. package/src/commands/feedback.js +150 -0
  127. package/src/commands/get.js +83 -42
  128. package/src/commands/search.js +7 -0
  129. package/src/index.js +43 -17
  130. package/src/lib/analytics.js +90 -0
  131. package/src/lib/annotations.js +57 -0
  132. package/src/lib/bm25.js +170 -0
  133. package/src/lib/cache.js +69 -6
  134. package/src/lib/config.js +8 -3
  135. package/src/lib/identity.js +99 -0
  136. package/src/lib/registry.js +103 -20
  137. package/src/lib/telemetry.js +86 -0
  138. package/src/mcp/server.js +177 -0
  139. package/src/mcp/tools.js +251 -0
@@ -0,0 +1,620 @@
1
+ ---
2
+ name: api
3
+ description: "REST API specification for LandingAI's Agentic Document Extraction (ADE). Covers all endpoints (Parse, Extract, Split, Parse Jobs), request parameters, response structures, data types, error codes, model versions, and curl examples."
4
+ metadata:
5
+ languages: "http"
6
+ versions: "v1"
7
+ updated-on: "2026-03-04"
8
+ source: maintainer
9
+ tags: "landingai,ade,api,document-extraction,parse,extract,split,parse-jobs,curl,rest"
10
+ ---
11
+
12
+ # LandingAI ADE API Specification
13
+
14
+ Complete API specification for LandingAI's Agentic Document Extraction (ADE).
15
+
16
+ ## Overview
17
+
18
+ ADE provides a REST API for document parsing, splitting, data extraction, and large file parse jobs. All SDKs and tools (Python, TypeScript) use this same underlying API.
19
+
20
+ **Core workflow**: Parse first → then Split and/or Extract from the parsed markdown. Extract and Split accept **markdown, not raw files**.
21
+
22
+ ## Base Configuration
23
+
24
+ | Region | Base URL |
25
+ |--------|----------|
26
+ | US (default) | `https://api.va.landing.ai` |
27
+ | EU | `https://api.va.eu-west-1.landing.ai` |
28
+
29
+ All endpoint paths below are relative to the base URL (e.g., `POST {base}/v1/ade/parse`).
30
+
31
+ **Authentication**: All requests require `Authorization: Bearer $VISION_AGENT_API_KEY`
32
+
33
+ **Content type**: Request bodies are `multipart/form-data`. With curl, always use `-F` (multipart form field), never `-d` (JSON or URL-encoded body).
34
+
35
+ ## SDK Quick Start
36
+
37
+ ```bash
38
+ # Python
39
+ pip install landingai-ade
40
+
41
+ # TypeScript / JavaScript
42
+ npm install landingai-ade
43
+ ```
44
+
45
+ ## Common Mistakes
46
+
47
+ | Mistake | Fix |
48
+ |---------|-----|
49
+ | Sending a PDF/image to `/extract` or `/split` | **Parse first** to get markdown, then extract/split from that |
50
+ | `Authorization: Basic` | Must be `Authorization: Bearer` |
51
+ | `-F "pdf=@..."` or `-F "file=@..."` | Field name is `document` (parse) or `markdown` (extract/split) |
52
+ | Missing `@` before file path in curl | `-F "document=@/path/to/file"` needs the `@` |
53
+ | Using `-d` (JSON body) instead of `-F` | Always use `-F` for multipart form data |
54
+ | Missing `schema` on extract | Required — define a JSON schema for the fields you want |
55
+ | Not using `jq -r` when extracting markdown | Plain `jq` wraps output in quotes with escapes; `jq -r` gives raw text |
56
+ | Sync parse on huge documents | Use `/v1/ade/parse/jobs` for files >50MB or >50 pages |
57
+
58
+ ---
59
+
60
+ ## API Endpoints
61
+
62
+ ### 1. Parse API
63
+
64
+ **Endpoint**: `POST /v1/ade/parse`
65
+
66
+ Converts documents to structured markdown with visual grounding.
67
+
68
+ #### Request Parameters
69
+
70
+ | Parameter | Type | Required | Description |
71
+ |-----------|------|----------|-------------|
72
+ | `document` | file | One required | Local file — PDF, images (JPG/PNG/TIFF/WEBP/GIF/BMP/PSD + more), Word (DOC/DOCX/ODT), PowerPoint (PPT/PPTX/ODP), spreadsheets (XLSX/CSV) |
73
+ | `document_url` | string | One required | Remote document URL |
74
+ | `model` | string | No | Model version (default: `dpt-2-latest`) |
75
+ | `split` | string | No | Split mode: `"page"` to split by pages |
76
+
77
+ #### Response Structure
78
+
79
+ ```
80
+ .markdown → string: full document as markdown
81
+ .chunks[] → {id, type, markdown, grounding: {page, box: {left, top, right, bottom}}}
82
+ .grounding → {id → {type, page, box, position?}} — bounding boxes + tableCell positions
83
+ .splits[] → {chunks[], class, identifier, markdown, pages[]} (only if split="page")
84
+ .metadata → {filename, org_id, page_count, duration_ms, credit_usage, version, job_id, failed_pages}
85
+ ```
86
+
87
+ <details>
88
+ <summary>Full JSON example</summary>
89
+
90
+ ```json
91
+ {
92
+ "markdown": "string",
93
+ "chunks": [
94
+ {
95
+ "id": "uuid",
96
+ "type": "text|table|marginalia|figure|scan_code|logo|card|attestation",
97
+ "markdown": "string",
98
+ "grounding": {
99
+ "page": 0,
100
+ "box": { "left": 0.1, "top": 0.2, "right": 0.9, "bottom": 0.3 }
101
+ }
102
+ }
103
+ ],
104
+ "grounding": {
105
+ "chunk-id": {
106
+ "type": "chunkText|chunkTable|chunkFigure|chunkLogo|chunkCard|chunkAttestation|chunkScanCode|chunkForm|chunkMarginalia|chunkTitle|chunkPageHeader|chunkPageFooter|chunkPageNumber|chunkKeyValue|table|tableCell",
107
+ "page": 0,
108
+ "box": { "left": 0.1, "top": 0.2, "right": 0.9, "bottom": 0.3 }
109
+ },
110
+ "0-1": { "type": "table", "page": 0, "box": {} },
111
+ "0-2": {
112
+ "type": "tableCell", "page": 0, "box": {},
113
+ "position": { "row": 0, "col": 0, "rowspan": 1, "colspan": 1, "chunk_id": "uuid" }
114
+ }
115
+ },
116
+ "splits": [
117
+ { "chunks": ["chunk-id-1"], "class": "page", "identifier": "0", "markdown": "string", "pages": [0] }
118
+ ],
119
+ "metadata": {
120
+ "filename": "document.pdf", "org_id": "org_abc123", "page_count": 5,
121
+ "duration_ms": 1234, "credit_usage": 3, "version": "dpt-2-latest",
122
+ "job_id": "job_abc123", "failed_pages": []
123
+ }
124
+ }
125
+ ```
126
+
127
+ </details>
128
+
129
+ ### 2. Extract API
130
+
131
+ **Endpoint**: `POST /v1/ade/extract`
132
+
133
+ Extracts structured data from markdown using JSON schemas. **Accepts markdown, not raw documents** — parse first if needed.
134
+
135
+ #### Request Parameters
136
+
137
+ | Parameter | Type | Required | Description |
138
+ |-----------|------|----------|-------------|
139
+ | `schema` | JSON string | Yes | JSON Schema defining extraction structure |
140
+ | `markdown` | string/file | One required | Markdown content or markdown file to extract from |
141
+ | `markdown_url` | string | One required | URL to markdown content |
142
+ | `model` | string | No | Model version (default: `extract-latest`) |
143
+
144
+ #### Response Structure
145
+
146
+ ```
147
+ .extraction → object: extracted key-value pairs matching schema
148
+ .extraction_metadata → {field → {references: [chunk_ids]}} for grounding
149
+ .metadata → {credit_usage, duration_ms, filename, job_id, org_id, version, fallback_model_version, schema_violation_error}
150
+ ```
151
+
152
+ ### 3. Split API
153
+
154
+ **Endpoint**: `POST /v1/ade/split`
155
+
156
+ Classifies and splits mixed documents by type. **Accepts markdown, not raw documents** — parse first if needed.
157
+
158
+ #### Request Parameters
159
+
160
+ | Parameter | Type | Required | Description |
161
+ |-----------|------|----------|-------------|
162
+ | `split_class` | JSON array | Yes | Array of classification objects, one per document type to detect (see "Split Class Structure" below) |
163
+ | `markdown` | string | One required | Markdown content to split |
164
+ | `markdownUrl` | string | One required | URL to markdown content |
165
+ | `model` | string | No | Model version (default: `split-latest`) |
166
+
167
+ #### Split Class Structure
168
+
169
+ ```json
170
+ {
171
+ "name": "Invoice", // Required: Classification name
172
+ "description": "Sales invoice", // Optional: Description for better classification
173
+ "identifier": "Invoice Number" // Optional: Field to group documents by
174
+ }
175
+ ```
176
+
177
+ #### Response Structure
178
+
179
+ ```
180
+ .splits[] → {chunks[], class, classification, identifier, markdowns[], pages[]}
181
+ .metadata → {credit_usage, duration_ms, filename, page_count, job_id, org_id, version}
182
+ ```
183
+
184
+ ### 4. Parse Jobs API (Async)
185
+
186
+ For large files (>50MB or >50 pages), use asynchronous processing.
187
+
188
+ #### Create Job
189
+
190
+ **Endpoint**: `POST /v1/ade/parse/jobs`
191
+
192
+ **Parameters**: Same as Parse API plus:
193
+
194
+ | Parameter | Type | Required | Description |
195
+ |-----------|------|----------|-------------|
196
+ | `output_save_url` | string | If ZDR | URL for zero data retention output |
197
+
198
+ **Response**: `{ "job_id": "cml1kaihb08dxcn01b3mlfy5b" }`
199
+
200
+ #### Get Job Status
201
+
202
+ **Endpoint**: `GET /v1/ade/parse/jobs/{job_id}`
203
+
204
+ ```
205
+ .job_id → string
206
+ .status → string: pending|processing|completed|failed|cancelled
207
+ .progress → number: 0.0 to 1.0
208
+ .failure_reason → string | null: error message if failed
209
+ .received_at → number: Unix timestamp
210
+ .data → ParseResponse | null: full result when completed (if output_save_url not used)
211
+ .output_url → string | null: presigned URL when result >1MB or output_save_url was set (expires 1hr)
212
+ .org_id → string
213
+ .version → string
214
+ .metadata → ParseMetadata | null
215
+ ```
216
+
217
+ #### List Jobs
218
+
219
+ **Endpoint**: `GET /v1/ade/parse/jobs`
220
+
221
+ **Query Parameters**: `status` (filter), `page` (0-indexed), `pageSize` (items per page)
222
+
223
+ ```
224
+ .jobs[] → {job_id, status, progress, failure_reason, received_at}
225
+ .has_more → boolean
226
+ .org_id → string
227
+ ```
228
+
229
+ ---
230
+
231
+ ## Data Types
232
+
233
+ ### Chunk Types
234
+ - `text` — Characters, paragraphs, headings, lists, form fields, checkboxes, code blocks
235
+ - `table` — Grid of rows and columns; includes spreadsheets and receipts
236
+ - `figure` — Visual/graphical non-text content — images, graphs, flowcharts, diagrams
237
+ - `marginalia` — Content in document margins — headers, footers, page numbers, handwritten notes
238
+ - `logo` — Logos (DPT-2 only)
239
+ - `card` — ID cards and driver's licenses (DPT-2 only)
240
+ - `attestation` — Signatures, stamps, and seals (DPT-2 only)
241
+ - `scan_code` — QR codes and barcodes (DPT-2 only)
242
+
243
+ ### Grounding Types
244
+
245
+ #### For Chunks (with "chunk" prefix)
246
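+ - `chunkText`, `chunkTable`, `chunkFigure`, `chunkMarginalia`, `chunkLogo`, `chunkCard`, `chunkAttestation`, `chunkScanCode`, `chunkForm`, `chunkTitle`, `chunkPageHeader`, `chunkPageFooter`, `chunkPageNumber`, `chunkKeyValue`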
247
+
248
+ #### For Structure Elements (no prefix)
249
+ - `table` — Actual table structure
250
+ - `tableCell` — Individual table cell with position
251
+
252
+ ### Bounding Box
253
+
254
+ All coordinates normalized 0–1: `{ left, top, right, bottom }`.
255
+
256
+ ### Table Cell Position
257
+
258
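+ `{ row, col, rowspan, colspan, chunk_id }` — `row` and `col` are zero-indexed; `rowspan` and `colspan` are the number of rows/columns the cell spans (1 for an unmerged cell).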
259
+
260
+ ### Table Chunk Formats
261
+
262
+ Table chunks render as HTML. The ID format and grounding availability differ by source document type.
263
+
264
+ #### PDF / Image / Document Tables
265
+
266
+ Element IDs use the format `{page_number}-{base62_sequential_number}` (page starts at 0, numbers increment per element within the page). Cells may include `rowspan`/`colspan` attributes. The `grounding` object contains bounding boxes and `tableCell` position entries for every cell.
267
+
268
+ ```html
269
+ <a id='chunk-uuid'></a>
270
+
271
+ <table id="0-1">
272
+ <tr><td id="0-2" colspan="2">Product Summary</td></tr>
273
+ <tr><td id="0-3">Product</td><td id="0-4">Revenue</td></tr>
274
+ <tr><td id="0-5">Hardware</td><td id="0-6">15,230</td></tr>
275
+ </table>
276
+ ```
277
+
278
+ #### Spreadsheet Tables (XLSX / CSV)
279
+
280
+ Element IDs use the format `{tab_name}-{cell_reference}` (e.g., `Sheet 1-A1`). The table element itself uses `{tab_name}-{start_cell}:{end_cell}` (e.g., `Sheet 1-A1:B4`). Embedded images and charts become `figure` chunks.
281
+
282
+ **`grounding` is `null`** for spreadsheet table chunks — cell positions are encoded in the IDs themselves.
283
+
284
+ ```html
285
+ <a id='Sheet 1-A1:B4-chunk'></a>
286
+
287
+ <table id='Sheet 1-A1:B4'>
288
+ <tr>
289
+ <td id='Sheet 1-A1'>Program</td>
290
+ <td id='Sheet 1-B1'>Interest Rate</td>
291
+ </tr>
292
+ <tr>
293
+ <td id='Sheet 1-A2'>15 Year Fixed-Rate Mortgage</td>
294
+ <td id='Sheet 1-B2'>0.05125</td>
295
+ </tr>
296
+ </table>
297
+ ```
298
+
299
+ ---
300
+
301
+ ## Error Responses
302
+
303
+ All errors follow this format:
304
+
305
+ ```json
306
+ {
307
+ "error": {
308
+ "message": "Human-readable error message",
309
+ "type": "error_type",
310
+ "details": { "field": "problem_field", "reason": "Specific reason" }
311
+ }
312
+ }
313
+ ```
314
+
315
+ ### HTTP Status Codes
316
+
317
+ | Status | Error Type | Description | Solution |
318
+ |--------|------------|-------------|----------|
319
+ | 400 | `validation_error` | Invalid parameters | Check request format |
320
+ | 401 | `authentication_error` | Invalid API key | Check VISION_AGENT_API_KEY |
321
+ | 413 | `payload_too_large` | File too large | Use Parse Jobs API |
322
+ | 422 | `unprocessable_entity` | Invalid file type or malformed schema | Validate file format and schema JSON |
323
+ | 429 | `rate_limit_error` | Too many requests | Implement backoff |
324
+ | 500 | `internal_error` | Server error | Retry with backoff |
325
+ | 504 | `timeout_error` | Request timeout | Use Parse Jobs API |
326
+
327
+ ## Model Versions
328
+
329
+ | Operation | Current Version | Description |
330
+ |-----------|----------------|-------------|
331
+ | Parse | `dpt-2-latest` | Document parsing and OCR |
332
+ | Extract | `extract-latest` | Schema-based extraction |
333
+ | Split | `split-latest` | Document classification |
334
+
335
+ ## Supported File Types
336
+
337
+ | Category | Formats | Notes |
338
+ |----------|---------|-------|
339
+ | **PDF** | PDF | Up to 100 pages; no password-protected files |
340
+ | **Images** | JPEG, JPG, PNG, APNG, BMP, DCX, DDS, DIB, GD, GIF, ICNS, JP2, PCX, PPM, PSD, TGA, TIF, TIFF, WEBP | |
341
+ | **Text Documents** | DOC, DOCX, ODT | Converted to PDF before parsing |
342
+ | **Presentations** | ODP, PPT, PPTX | Converted to PDF before parsing |
343
+ | **Spreadsheets** | CSV, XLSX | Up to 10 MB in Playground; no sheet/column/row limits |
344
+
345
+ > **Note:** Word, PowerPoint, and OpenDocument files are converted to PDF server-side before parsing.
346
+
347
+ ## Best Practices
348
+
349
+ ### File Size Handling
350
+ - < 50MB: Use synchronous Parse API
351
+ - \> 50MB: Use Parse Jobs API
352
+ - \> 100MB: Consider splitting document first
353
+
354
+ ### Rate Limiting
355
+ - Implement exponential backoff — start with 1s, double on each retry, max 5 retries
356
+
357
+ ### Cost Optimization
358
+ - Parse once, extract/split multiple times
359
+ - Use specific schemas (avoid extracting everything)
360
+ - Cache parsed results when possible
361
+
362
+ ---
363
+
364
+ # API (curl) Reference
365
+
366
+ Direct HTTP API implementation using curl and shell scripts.
367
+
368
+ ## Authentication
369
+
370
+ ```bash
371
+ export VISION_AGENT_API_KEY="v2_..."
372
+ BASE_URL="https://api.va.landing.ai" # or https://api.va.eu-west-1.landing.ai for EU
373
+ ```
374
+
375
+ ## Parse Examples
376
+
377
+ ### Basic Parse
378
+ ```bash
379
+ curl -s -X POST "$BASE_URL/v1/ade/parse" \
380
+ -H "Authorization: Bearer $VISION_AGENT_API_KEY" \
381
+ -F "document=@document.pdf" \
382
+ -F "model=dpt-2-latest"
383
+ ```
384
+
385
+ ### Parse with Page Splitting
386
+ ```bash
387
+ curl -s -X POST "$BASE_URL/v1/ade/parse" \
388
+ -H "Authorization: Bearer $VISION_AGENT_API_KEY" \
389
+ -F "document=@multi_page.pdf" \
390
+ -F "split=page"
391
+ ```
392
+
393
+ ### Parse from URL
394
+ ```bash
395
+ curl -s -X POST "$BASE_URL/v1/ade/parse" \
396
+ -H "Authorization: Bearer $VISION_AGENT_API_KEY" \
397
+ -F "document_url=https://example.com/document.pdf"
398
+ ```
399
+
400
+ ## Extract Examples
401
+
402
+ ```bash
403
+ SCHEMA='{
404
+ "type": "object",
405
+ "properties": {
406
+ "invoice_number": {"type": "string", "description": "Invoice number"},
407
+ "total_amount": {"type": "number", "description": "Total amount"},
408
+ "vendor_name": {"type": "string", "description": "Vendor name"}
409
+ }
410
+ }'
411
+
412
+ # Extract from a markdown file (parse first if you have a PDF)
413
+ curl -s -X POST "$BASE_URL/v1/ade/extract" \
414
+ -H "Authorization: Bearer $VISION_AGENT_API_KEY" \
415
+ -F "markdown=@parsed_invoice.md" \
416
+ -F "schema=$SCHEMA" \
417
+ -F "model=extract-latest"
418
+ ```
419
+
420
+ ### Parse Once, Extract Many
421
+ ```bash
422
+ # Parse once, save markdown
423
+ MARKDOWN=$(curl -s -X POST "$BASE_URL/v1/ade/parse" \
424
+ -H "Authorization: Bearer $VISION_AGENT_API_KEY" \
425
+ -F "document=@invoice.pdf" \
426
+ | jq -r '.markdown')
427
+
428
+ # Extract with different schemas
429
+ curl -s -X POST "$BASE_URL/v1/ade/extract" \
430
+ -H "Authorization: Bearer $VISION_AGENT_API_KEY" \
431
+ -F "markdown=$MARKDOWN" \
432
+ -F "schema=$SCHEMA"
433
+ ```
434
+
435
+ ## Split Examples
436
+
437
+ ```bash
438
+ SPLIT_CLASSES='[
439
+ {"name": "Invoice", "identifier": "Invoice Number"},
440
+ {"name": "Receipt", "identifier": "Receipt Number"},
441
+ {"name": "Purchase Order", "identifier": "PO Number"}
442
+ ]'
443
+
444
+ # Parse first, then split
445
+ MARKDOWN=$(curl -s -X POST "$BASE_URL/v1/ade/parse" \
446
+ -H "Authorization: Bearer $VISION_AGENT_API_KEY" \
447
+ -F "document=@mixed_documents.pdf" \
448
+ | jq -r '.markdown')
449
+
450
+ curl -s -X POST "$BASE_URL/v1/ade/split" \
451
+ -H "Authorization: Bearer $VISION_AGENT_API_KEY" \
452
+ -F "markdown=$MARKDOWN" \
453
+ -F "split_class=$SPLIT_CLASSES" \
454
+ -F "model=split-latest"
455
+ ```
456
+
457
+ ## Parse Jobs (Async, Large Files)
458
+
459
+ ```bash
460
+ #!/bin/bash
461
+
462
+ # Create job
463
+ JOB_ID=$(curl -s -X POST "$BASE_URL/v1/ade/parse/jobs" \
464
+ -H "Authorization: Bearer $VISION_AGENT_API_KEY" \
465
+ -F "document=@large_document.pdf" \
466
+ -F "model=dpt-2-latest" \
467
+ | jq -r '.job_id')
468
+
469
+ echo "Created job: $JOB_ID"
470
+
471
+ # Poll for completion
472
+ while true; do
473
+ STATUS=$(curl -s -X GET "$BASE_URL/v1/ade/parse/jobs/$JOB_ID" \
474
+ -H "Authorization: Bearer $VISION_AGENT_API_KEY")
475
+
476
+ STATE=$(echo "$STATUS" | jq -r '.status')
477
+ PROGRESS=$(echo "$STATUS" | jq -r '.progress')
478
+
479
+ echo "Status: $STATE, Progress: $(echo "$PROGRESS * 100" | bc)%"
480
+
481
+ if [ "$STATE" = "completed" ]; then
482
+ echo "$STATUS" | jq '.data' > "parse_result.json"
483
+ break
484
+ elif [ "$STATE" = "failed" ]; then
485
+ echo "Job failed: $(echo "$STATUS" | jq -r '.failure_reason')" >&2
486
+ exit 1
487
+ fi
488
+
489
+ sleep 5
490
+ done
491
+ ```
492
+
493
+ ## Complete Workflow: Parse → Split → Extract
494
+
495
+ ```bash
496
+ #!/bin/bash
497
+
498
+ # 1. Parse
499
+ MARKDOWN=$(curl -s -X POST "$BASE_URL/v1/ade/parse" \
500
+ -H "Authorization: Bearer $VISION_AGENT_API_KEY" \
501
+ -F "document=@mixed_invoices.pdf" \
502
+ | jq -r '.markdown')
503
+
504
+ # 2. Split
505
+ SPLIT_CLASSES='[
506
+ {"name": "Invoice", "identifier": "Invoice Number"},
507
+ {"name": "Credit Note", "identifier": "Credit Note Number"}
508
+ ]'
509
+
510
+ SPLITS=$(curl -s -X POST "$BASE_URL/v1/ade/split" \
511
+ -H "Authorization: Bearer $VISION_AGENT_API_KEY" \
512
+ -F "markdown=$MARKDOWN" \
513
+ -F "split_class=$SPLIT_CLASSES")
514
+
515
+ # 3. Extract from each split
516
+ SCHEMA='{"type": "object", "properties": {
517
+ "document_number": {"type": "string"},
518
+ "total": {"type": "number"},
519
+ "date": {"type": "string"}
520
+ }}'
521
+
522
+ echo "$SPLITS" | jq -c '.splits[]' | while read -r split; do
523
+ TYPE=$(echo "$split" | jq -r '.classification')
524
+ ID=$(echo "$split" | jq -r '.identifier')
525
+ MD=$(echo "$split" | jq -r '.markdowns[0]')
526
+
527
+ echo "Processing $TYPE: $ID"
528
+
529
+ curl -s -X POST "$BASE_URL/v1/ade/extract" \
530
+ -H "Authorization: Bearer $VISION_AGENT_API_KEY" \
531
+ -F "markdown=$MD" \
532
+ -F "schema=$SCHEMA" \
533
+ | jq '.extraction'
534
+ done
535
+ ```
536
+
537
+ ## Error Handling with Retry
538
+
539
+ ```bash
540
+ #!/bin/bash
541
+
542
+ MAX_RETRIES=3
543
+ RETRY_COUNT=0
544
+
545
+ while [ $RETRY_COUNT -lt $MAX_RETRIES ]; do
546
+ RESPONSE=$(curl -s -w "\n%{http_code}" -X POST "$BASE_URL/v1/ade/parse" \
547
+ -H "Authorization: Bearer $VISION_AGENT_API_KEY" \
548
+ -F "document=@document.pdf")
549
+
550
+ HTTP_CODE=$(echo "$RESPONSE" | tail -n 1)
551
+ BODY=$(echo "$RESPONSE" | sed '$d')
552
+
553
+ if [ "$HTTP_CODE" -eq 200 ]; then
554
+ echo "$BODY"
555
+ break
556
+ elif [ "$HTTP_CODE" -eq 429 ]; then
557
+ WAIT_TIME=$((2 ** RETRY_COUNT * 10))
558
+ echo "Rate limited. Waiting ${WAIT_TIME}s..." >&2
559
+ sleep $WAIT_TIME
560
+ RETRY_COUNT=$((RETRY_COUNT + 1))
561
+ elif [ "$HTTP_CODE" -eq 413 ] || [ "$HTTP_CODE" -eq 504 ]; then
562
+ echo "File too large or timeout — use parse jobs API" >&2
563
+ exit 1
564
+ else
565
+ echo "Error: HTTP $HTTP_CODE" >&2
566
+ echo "$BODY" | jq '.error' >&2
567
+ exit 1
568
+ fi
569
+ done
570
+ ```
571
+
572
+ ## jq Recipes
573
+
574
+ ```bash
575
+ # Extract just markdown
576
+ curl -s ... | jq -r '.markdown'
577
+
578
+ # Get all tables
579
+ curl -s ... | jq '.chunks[] | select(.type == "table")'
580
+
581
+ # Extract table cells with positions
582
+ curl -s ... | jq '.grounding | to_entries[] | select(.value.type == "tableCell")'
583
+
584
+ # Get chunks from specific page
585
+ curl -s ... | jq '.chunks[] | select(.grounding.page == 0)'
586
+
587
+ # Group chunks by type with counts
588
+ curl -s ... | jq '.chunks | group_by(.type) | map({type: .[0].type, count: length})'
589
+
590
+ # Get specific extracted field
591
+ curl -s ... | jq '.extraction.invoice_number'
592
+
593
+ # Process extracted line items
594
+ curl -s ... | jq '.extraction.line_items[] | {sku: .sku, total: (.quantity * .unit_price)}'
595
+ ```
596
+
597
+ ## Shell Functions for Reuse
598
+
599
+ ```bash
600
+ ade_parse() {
601
+ curl -s -X POST "$BASE_URL/v1/ade/parse" \
602
+ -H "Authorization: Bearer $VISION_AGENT_API_KEY" \
603
+ -F "document=@$1"
604
+ }
605
+
606
+ ade_extract() {
607
+ curl -s -X POST "$BASE_URL/v1/ade/extract" \
608
+ -H "Authorization: Bearer $VISION_AGENT_API_KEY" \
609
+ -F "markdown=$1" \
610
+ -F "schema=$2"
611
+ }
612
+ ```
613
+
614
+ ---
615
+
616
+ ## External Links
617
+
618
+ - [API Reference](https://docs.landing.ai/api-reference)
619
+ - [ADE Documentation](https://docs.landing.ai/ade)
620
+ - [Supported File Types](https://docs.landing.ai/ade/ade-file-types)