chub-dev 0.1.0 → 0.1.2-beta.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +55 -0
- package/bin/chub-mcp +2 -0
- package/dist/airtable/docs/database/javascript/DOC.md +1437 -0
- package/dist/airtable/docs/database/python/DOC.md +1735 -0
- package/dist/amplitude/docs/analytics/javascript/DOC.md +1282 -0
- package/dist/amplitude/docs/analytics/python/DOC.md +1199 -0
- package/dist/anthropic/docs/claude-api/javascript/DOC.md +503 -0
- package/dist/anthropic/docs/claude-api/python/DOC.md +389 -0
- package/dist/asana/docs/tasks/DOC.md +1396 -0
- package/dist/assemblyai/docs/transcription/DOC.md +1043 -0
- package/dist/atlassian/docs/confluence/javascript/DOC.md +1347 -0
- package/dist/atlassian/docs/confluence/python/DOC.md +1604 -0
- package/dist/auth0/docs/identity/javascript/DOC.md +968 -0
- package/dist/auth0/docs/identity/python/DOC.md +1199 -0
- package/dist/aws/docs/s3/javascript/DOC.md +1773 -0
- package/dist/aws/docs/s3/python/DOC.md +1807 -0
- package/dist/binance/docs/trading/javascript/DOC.md +1315 -0
- package/dist/binance/docs/trading/python/DOC.md +1454 -0
- package/dist/braintree/docs/gateway/javascript/DOC.md +1278 -0
- package/dist/braintree/docs/gateway/python/DOC.md +1179 -0
- package/dist/chromadb/docs/embeddings-db/javascript/DOC.md +1263 -0
- package/dist/chromadb/docs/embeddings-db/python/DOC.md +1707 -0
- package/dist/clerk/docs/auth/javascript/DOC.md +1220 -0
- package/dist/clerk/docs/auth/python/DOC.md +274 -0
- package/dist/cloudflare/docs/workers/javascript/DOC.md +918 -0
- package/dist/cloudflare/docs/workers/python/DOC.md +994 -0
- package/dist/cockroachdb/docs/distributed-db/DOC.md +1500 -0
- package/dist/cohere/docs/llm/DOC.md +1335 -0
- package/dist/datadog/docs/monitoring/javascript/DOC.md +1740 -0
- package/dist/datadog/docs/monitoring/python/DOC.md +1815 -0
- package/dist/deepgram/docs/speech/javascript/DOC.md +885 -0
- package/dist/deepgram/docs/speech/python/DOC.md +685 -0
- package/dist/deepl/docs/translation/javascript/DOC.md +887 -0
- package/dist/deepl/docs/translation/python/DOC.md +944 -0
- package/dist/deepseek/docs/llm/DOC.md +1220 -0
- package/dist/directus/docs/headless-cms/javascript/DOC.md +1128 -0
- package/dist/directus/docs/headless-cms/python/DOC.md +1276 -0
- package/dist/discord/docs/bot/javascript/DOC.md +1090 -0
- package/dist/discord/docs/bot/python/DOC.md +1130 -0
- package/dist/elasticsearch/docs/search/DOC.md +1634 -0
- package/dist/elevenlabs/docs/text-to-speech/javascript/DOC.md +336 -0
- package/dist/elevenlabs/docs/text-to-speech/python/DOC.md +552 -0
- package/dist/firebase/docs/auth/DOC.md +1015 -0
- package/dist/gemini/docs/genai/javascript/DOC.md +691 -0
- package/dist/gemini/docs/genai/python/DOC.md +555 -0
- package/dist/github/docs/octokit/DOC.md +1560 -0
- package/dist/google/docs/bigquery/javascript/DOC.md +1688 -0
- package/dist/google/docs/bigquery/python/DOC.md +1503 -0
- package/dist/hubspot/docs/crm/javascript/DOC.md +1805 -0
- package/dist/hubspot/docs/crm/python/DOC.md +2033 -0
- package/dist/huggingface/docs/transformers/DOC.md +948 -0
- package/dist/intercom/docs/messaging/javascript/DOC.md +1844 -0
- package/dist/intercom/docs/messaging/python/DOC.md +1797 -0
- package/dist/jira/docs/issues/javascript/DOC.md +1420 -0
- package/dist/jira/docs/issues/python/DOC.md +1492 -0
- package/dist/kafka/docs/streaming/javascript/DOC.md +1671 -0
- package/dist/kafka/docs/streaming/python/DOC.md +1464 -0
- package/dist/landingai-ade/docs/api/DOC.md +620 -0
- package/dist/landingai-ade/docs/sdk/python/DOC.md +489 -0
- package/dist/landingai-ade/docs/sdk/typescript/DOC.md +542 -0
- package/dist/landingai-ade/skills/SKILL.md +489 -0
- package/dist/launchdarkly/docs/feature-flags/javascript/DOC.md +1191 -0
- package/dist/launchdarkly/docs/feature-flags/python/DOC.md +1671 -0
- package/dist/linear/docs/tracker/DOC.md +1554 -0
- package/dist/livekit/docs/realtime/javascript/DOC.md +303 -0
- package/dist/livekit/docs/realtime/python/DOC.md +163 -0
- package/dist/mailchimp/docs/marketing/DOC.md +1420 -0
- package/dist/meilisearch/docs/search/DOC.md +1241 -0
- package/dist/microsoft/docs/onedrive/javascript/DOC.md +1421 -0
- package/dist/microsoft/docs/onedrive/python/DOC.md +1549 -0
- package/dist/mongodb/docs/atlas/DOC.md +2041 -0
- package/dist/notion/docs/workspace-api/javascript/DOC.md +1435 -0
- package/dist/notion/docs/workspace-api/python/DOC.md +1400 -0
- package/dist/okta/docs/identity/javascript/DOC.md +1171 -0
- package/dist/okta/docs/identity/python/DOC.md +1401 -0
- package/dist/openai/docs/chat/javascript/DOC.md +407 -0
- package/dist/openai/docs/chat/python/DOC.md +568 -0
- package/dist/paypal/docs/checkout/DOC.md +278 -0
- package/dist/pinecone/docs/sdk/javascript/DOC.md +984 -0
- package/dist/pinecone/docs/sdk/python/DOC.md +1395 -0
- package/dist/plaid/docs/banking/javascript/DOC.md +1163 -0
- package/dist/plaid/docs/banking/python/DOC.md +1203 -0
- package/dist/playwright-community/skills/login-flows/SKILL.md +108 -0
- package/dist/postmark/docs/transactional-email/DOC.md +1168 -0
- package/dist/prisma/docs/orm/javascript/DOC.md +1419 -0
- package/dist/prisma/docs/orm/python/DOC.md +1317 -0
- package/dist/qdrant/docs/vector-search/javascript/DOC.md +1221 -0
- package/dist/qdrant/docs/vector-search/python/DOC.md +1653 -0
- package/dist/rabbitmq/docs/message-queue/javascript/DOC.md +1193 -0
- package/dist/rabbitmq/docs/message-queue/python/DOC.md +1243 -0
- package/dist/razorpay/docs/payments/javascript/DOC.md +1219 -0
- package/dist/razorpay/docs/payments/python/DOC.md +1330 -0
- package/dist/redis/docs/key-value/javascript/DOC.md +1851 -0
- package/dist/redis/docs/key-value/python/DOC.md +2054 -0
- package/dist/registry.json +2817 -0
- package/dist/replicate/docs/model-hosting/DOC.md +1318 -0
- package/dist/resend/docs/email/DOC.md +1271 -0
- package/dist/salesforce/docs/crm/javascript/DOC.md +1241 -0
- package/dist/salesforce/docs/crm/python/DOC.md +1183 -0
- package/dist/search-index.json +1 -0
- package/dist/sendgrid/docs/email-api/javascript/DOC.md +371 -0
- package/dist/sendgrid/docs/email-api/python/DOC.md +656 -0
- package/dist/sentry/docs/error-tracking/javascript/DOC.md +1073 -0
- package/dist/sentry/docs/error-tracking/python/DOC.md +1309 -0
- package/dist/shopify/docs/storefront/DOC.md +457 -0
- package/dist/slack/docs/workspace/javascript/DOC.md +933 -0
- package/dist/slack/docs/workspace/python/DOC.md +271 -0
- package/dist/square/docs/payments/javascript/DOC.md +1855 -0
- package/dist/square/docs/payments/python/DOC.md +1728 -0
- package/dist/stripe/docs/api/DOC.md +1727 -0
- package/dist/stripe/docs/payments/DOC.md +1726 -0
- package/dist/stytch/docs/auth/javascript/DOC.md +1813 -0
- package/dist/stytch/docs/auth/python/DOC.md +1962 -0
- package/dist/supabase/docs/client/DOC.md +1606 -0
- package/dist/twilio/docs/messaging/python/DOC.md +469 -0
- package/dist/twilio/docs/messaging/typescript/DOC.md +946 -0
- package/dist/vercel/docs/platform/DOC.md +1940 -0
- package/dist/weaviate/docs/vector-db/javascript/DOC.md +1268 -0
- package/dist/weaviate/docs/vector-db/python/DOC.md +1388 -0
- package/dist/zendesk/docs/support/javascript/DOC.md +2150 -0
- package/dist/zendesk/docs/support/python/DOC.md +2297 -0
- package/package.json +22 -6
- package/skills/get-api-docs/SKILL.md +84 -0
- package/src/commands/annotate.js +83 -0
- package/src/commands/build.js +12 -1
- package/src/commands/feedback.js +150 -0
- package/src/commands/get.js +83 -42
- package/src/commands/search.js +7 -0
- package/src/index.js +43 -17
- package/src/lib/analytics.js +90 -0
- package/src/lib/annotations.js +57 -0
- package/src/lib/bm25.js +170 -0
- package/src/lib/cache.js +69 -6
- package/src/lib/config.js +8 -3
- package/src/lib/identity.js +99 -0
- package/src/lib/registry.js +103 -20
- package/src/lib/telemetry.js +86 -0
- package/src/mcp/server.js +177 -0
- package/src/mcp/tools.js +251 -0
|
@@ -0,0 +1,620 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: api
|
|
3
|
+
description: "REST API specification for LandingAI's Agentic Document Extraction (ADE). Covers all endpoints (Parse, Extract, Split, Parse Jobs), request parameters, response structures, data types, error codes, model versions, and curl examples."
|
|
4
|
+
metadata:
|
|
5
|
+
languages: "http"
|
|
6
|
+
versions: "v1"
|
|
7
|
+
updated-on: "2026-03-04"
|
|
8
|
+
source: maintainer
|
|
9
|
+
tags: "landingai,ade,api,document-extraction,parse,extract,split,parse-jobs,curl,rest"
|
|
10
|
+
---
|
|
11
|
+
|
|
12
|
+
# LandingAI ADE API Specification
|
|
13
|
+
|
|
14
|
+
Complete API specification for LandingAI's Agentic Document Extraction (ADE).
|
|
15
|
+
|
|
16
|
+
## Overview
|
|
17
|
+
|
|
18
|
+
ADE provides a REST API for document parsing, splitting, data extraction, and large file parse jobs. All SDKs and tools (Python, TypeScript) use this same underlying API.
|
|
19
|
+
|
|
20
|
+
**Core workflow**: Parse first → then Split and/or Extract from the parsed markdown. Extract and Split accept **markdown, not raw files**.
|
|
21
|
+
|
|
22
|
+
## Base Configuration
|
|
23
|
+
|
|
24
|
+
| Region | Base URL |
|
|
25
|
+
|--------|----------|
|
|
26
|
+
| US (default) | `https://api.va.landing.ai` |
|
|
27
|
+
| EU | `https://api.va.eu-west-1.landing.ai` |
|
|
28
|
+
|
|
29
|
+
All endpoint paths below are relative to the base URL (e.g., `POST {base}/v1/ade/parse`).
|
|
30
|
+
|
|
31
|
+
**Authentication**: All requests require `Authorization: Bearer $VISION_AGENT_API_KEY`
|
|
32
|
+
|
|
33
|
+
**Content type**: Requests must be sent as `multipart/form-data` — with curl, always use `-F`, never `-d` (which sends a URL-encoded or raw body such as JSON).
|
|
34
|
+
|
|
35
|
+
## SDK Quick Start
|
|
36
|
+
|
|
37
|
+
```bash
|
|
38
|
+
# Python
|
|
39
|
+
pip install landingai-ade
|
|
40
|
+
|
|
41
|
+
# TypeScript / JavaScript
|
|
42
|
+
npm install landingai-ade
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
## Common Mistakes
|
|
46
|
+
|
|
47
|
+
| Mistake | Fix |
|
|
48
|
+
|---------|-----|
|
|
49
|
+
| Sending a PDF/image to `/extract` or `/split` | **Parse first** to get markdown, then extract/split from that |
|
|
50
|
+
| `Authorization: Basic` | Must be `Authorization: Bearer` |
|
|
51
|
+
| `-F "pdf=@..."` or `-F "file=@..."` | Field name is `document` (parse) or `markdown` (extract/split) |
|
|
52
|
+
| Missing `@` before file path in curl | `-F "document=@/path/to/file"` needs the `@` |
|
|
53
|
+
| Using `-d` (JSON body) instead of `-F` | Always use `-F` for multipart form data |
|
|
54
|
+
| Missing `schema` on extract | Required — define a JSON schema for the fields you want |
|
|
55
|
+
| Not using `jq -r` when extracting markdown | Plain `jq` wraps output in quotes with escapes; `jq -r` gives raw text |
|
|
56
|
+
| Sync parse on huge documents | Use `/v1/ade/parse/jobs` for files >50MB or >50 pages |
|
|
57
|
+
|
|
58
|
+
---
|
|
59
|
+
|
|
60
|
+
## API Endpoints
|
|
61
|
+
|
|
62
|
+
### 1. Parse API
|
|
63
|
+
|
|
64
|
+
**Endpoint**: `POST /v1/ade/parse`
|
|
65
|
+
|
|
66
|
+
Converts documents to structured markdown with visual grounding.
|
|
67
|
+
|
|
68
|
+
#### Request Parameters
|
|
69
|
+
|
|
70
|
+
| Parameter | Type | Required | Description |
|
|
71
|
+
|-----------|------|----------|-------------|
|
|
72
|
+
| `document` | file | One required | Local file — PDF, images (JPG/PNG/TIFF/WEBP/GIF/BMP/PSD + more), Word (DOC/DOCX/ODT), PowerPoint (PPT/PPTX/ODP), spreadsheets (XLSX/CSV) |
|
|
73
|
+
| `document_url` | string | One required | Remote document URL |
|
|
74
|
+
| `model` | string | No | Model version (default: `dpt-2-latest`) |
|
|
75
|
+
| `split` | string | No | Split mode: `"page"` to split by pages |
|
|
76
|
+
|
|
77
|
+
#### Response Structure
|
|
78
|
+
|
|
79
|
+
```
|
|
80
|
+
.markdown → string: full document as markdown
|
|
81
|
+
.chunks[] → {id, type, markdown, grounding: {page, box: {left, top, right, bottom}}}
|
|
82
|
+
.grounding → {id → {type, page, box, position?}} — bounding boxes + tableCell positions
|
|
83
|
+
.splits[] → {chunks[], class, identifier, markdown, pages[]} (only if split="page")
|
|
84
|
+
.metadata → {filename, org_id, page_count, duration_ms, credit_usage, version, job_id, failed_pages}
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
<details>
|
|
88
|
+
<summary>Full JSON example</summary>
|
|
89
|
+
|
|
90
|
+
```json
|
|
91
|
+
{
|
|
92
|
+
"markdown": "string",
|
|
93
|
+
"chunks": [
|
|
94
|
+
{
|
|
95
|
+
"id": "uuid",
|
|
96
|
+
"type": "text|table|marginalia|figure|scan_code|logo|card|attestation",
|
|
97
|
+
"markdown": "string",
|
|
98
|
+
"grounding": {
|
|
99
|
+
"page": 0,
|
|
100
|
+
"box": { "left": 0.1, "top": 0.2, "right": 0.9, "bottom": 0.3 }
|
|
101
|
+
}
|
|
102
|
+
}
|
|
103
|
+
],
|
|
104
|
+
"grounding": {
|
|
105
|
+
"chunk-id": {
|
|
106
|
+
"type": "chunkText|chunkTable|chunkFigure|chunkLogo|chunkCard|chunkAttestation|chunkScanCode|chunkForm|chunkMarginalia|chunkTitle|chunkPageHeader|chunkPageFooter|chunkPageNumber|chunkKeyValue|table|tableCell",
|
|
107
|
+
"page": 0,
|
|
108
|
+
"box": { "left": 0.1, "top": 0.2, "right": 0.9, "bottom": 0.3 }
|
|
109
|
+
},
|
|
110
|
+
"0-1": { "type": "table", "page": 0, "box": {} },
|
|
111
|
+
"0-2": {
|
|
112
|
+
"type": "tableCell", "page": 0, "box": {},
|
|
113
|
+
"position": { "row": 0, "col": 0, "rowspan": 1, "colspan": 1, "chunk_id": "uuid" }
|
|
114
|
+
}
|
|
115
|
+
},
|
|
116
|
+
"splits": [
|
|
117
|
+
{ "chunks": ["chunk-id-1"], "class": "page", "identifier": "0", "markdown": "string", "pages": [0] }
|
|
118
|
+
],
|
|
119
|
+
"metadata": {
|
|
120
|
+
"filename": "document.pdf", "org_id": "org_abc123", "page_count": 5,
|
|
121
|
+
"duration_ms": 1234, "credit_usage": 3, "version": "dpt-2-latest",
|
|
122
|
+
"job_id": "job_abc123", "failed_pages": []
|
|
123
|
+
}
|
|
124
|
+
}
|
|
125
|
+
```
|
|
126
|
+
|
|
127
|
+
</details>
|
|
128
|
+
|
|
129
|
+
### 2. Extract API
|
|
130
|
+
|
|
131
|
+
**Endpoint**: `POST /v1/ade/extract`
|
|
132
|
+
|
|
133
|
+
Extracts structured data from markdown using JSON schemas. **Accepts markdown, not raw documents** — parse first if needed.
|
|
134
|
+
|
|
135
|
+
#### Request Parameters
|
|
136
|
+
|
|
137
|
+
| Parameter | Type | Required | Description |
|
|
138
|
+
|-----------|------|----------|-------------|
|
|
139
|
+
| `schema` | JSON string | Yes | JSON Schema defining extraction structure |
|
|
140
|
+
| `markdown` | string/file | One required | Markdown content or markdown file to extract from |
|
|
141
|
+
| `markdown_url` | string | One required | URL to markdown content |
|
|
142
|
+
| `model` | string | No | Model version (default: `extract-latest`) |
|
|
143
|
+
|
|
144
|
+
#### Response Structure
|
|
145
|
+
|
|
146
|
+
```
|
|
147
|
+
.extraction → object: extracted key-value pairs matching schema
|
|
148
|
+
.extraction_metadata → {field → {references: [chunk_ids]}} for grounding
|
|
149
|
+
.metadata → {credit_usage, duration_ms, filename, job_id, org_id, version, fallback_model_version, schema_violation_error}
|
|
150
|
+
```
|
|
151
|
+
|
|
152
|
+
### 3. Split API
|
|
153
|
+
|
|
154
|
+
**Endpoint**: `POST /v1/ade/split`
|
|
155
|
+
|
|
156
|
+
Classifies and splits mixed documents by type. **Accepts markdown, not raw documents** — parse first if needed.
|
|
157
|
+
|
|
158
|
+
#### Request Parameters
|
|
159
|
+
|
|
160
|
+
| Parameter | Type | Required | Description |
|
|
161
|
+
|-----------|------|----------|-------------|
|
|
162
|
+
| `split_class` | JSON array | Yes | Array of classification objects, one per document class (see Split Class Structure below) |
|
|
163
|
+
| `markdown` | string | One required | Markdown content to split |
|
|
164
|
+
| `markdownUrl` | string | One required | URL to markdown content |
|
|
165
|
+
| `model` | string | No | Model version (default: `split-latest`) |
|
|
166
|
+
|
|
167
|
+
#### Split Class Structure
|
|
168
|
+
|
|
169
|
+
```json
|
|
170
|
+
{
|
|
171
|
+
"name": "Invoice", // Required: Classification name
|
|
172
|
+
"description": "Sales invoice", // Optional: Description for better classification
|
|
173
|
+
"identifier": "Invoice Number" // Optional: Field to group documents by
|
|
174
|
+
}
|
|
175
|
+
```
|
|
176
|
+
|
|
177
|
+
#### Response Structure
|
|
178
|
+
|
|
179
|
+
```
|
|
180
|
+
.splits[] → {chunks[], class, classification, identifier, markdowns[], pages[]}
|
|
181
|
+
.metadata → {credit_usage, duration_ms, filename, page_count, job_id, org_id, version}
|
|
182
|
+
```
|
|
183
|
+
|
|
184
|
+
### 4. Parse Jobs API (Async)
|
|
185
|
+
|
|
186
|
+
For large files (>50MB or >50 pages), use asynchronous processing.
|
|
187
|
+
|
|
188
|
+
#### Create Job
|
|
189
|
+
|
|
190
|
+
**Endpoint**: `POST /v1/ade/parse/jobs`
|
|
191
|
+
|
|
192
|
+
**Parameters**: Same as Parse API plus:
|
|
193
|
+
|
|
194
|
+
| Parameter | Type | Required | Description |
|
|
195
|
+
|-----------|------|----------|-------------|
|
|
196
|
+
| `output_save_url` | string | Required with ZDR | URL to save the output to when zero data retention (ZDR) is enabled |
|
|
197
|
+
|
|
198
|
+
**Response**: `{ "job_id": "cml1kaihb08dxcn01b3mlfy5b" }`
|
|
199
|
+
|
|
200
|
+
#### Get Job Status
|
|
201
|
+
|
|
202
|
+
**Endpoint**: `GET /v1/ade/parse/jobs/{job_id}`
|
|
203
|
+
|
|
204
|
+
```
|
|
205
|
+
.job_id → string
|
|
206
|
+
.status → string: pending|processing|completed|failed|cancelled
|
|
207
|
+
.progress → number: 0.0 to 1.0
|
|
208
|
+
.failure_reason → string | null: error message if failed
|
|
209
|
+
.received_at → number: Unix timestamp
|
|
210
|
+
.data → ParseResponse | null: full result when completed (if output_save_url not used)
|
|
211
|
+
.output_url → string | null: presigned URL when result >1MB or output_save_url was set (expires 1hr)
|
|
212
|
+
.org_id → string
|
|
213
|
+
.version → string
|
|
214
|
+
.metadata → ParseMetadata | null
|
|
215
|
+
```
|
|
216
|
+
|
|
217
|
+
#### List Jobs
|
|
218
|
+
|
|
219
|
+
**Endpoint**: `GET /v1/ade/parse/jobs`
|
|
220
|
+
|
|
221
|
+
**Query Parameters**: `status` (filter), `page` (0-indexed), `pageSize` (items per page)
|
|
222
|
+
|
|
223
|
+
```
|
|
224
|
+
.jobs[] → {job_id, status, progress, failure_reason, received_at}
|
|
225
|
+
.has_more → boolean
|
|
226
|
+
.org_id → string
|
|
227
|
+
```
|
|
228
|
+
|
|
229
|
+
---
|
|
230
|
+
|
|
231
|
+
## Data Types
|
|
232
|
+
|
|
233
|
+
### Chunk Types
|
|
234
|
+
- `text` — Characters, paragraphs, headings, lists, form fields, checkboxes, code blocks
|
|
235
|
+
- `table` — Grid of rows and columns; includes spreadsheets and receipts
|
|
236
|
+
- `figure` — Visual/graphical non-text content — images, graphs, flowcharts, diagrams
|
|
237
|
+
- `marginalia` — Content in document margins — headers, footers, page numbers, handwritten notes
|
|
238
|
+
- `logo` — Logos (DPT-2 only)
|
|
239
|
+
- `card` — ID cards and driver's licenses (DPT-2 only)
|
|
240
|
+
- `attestation` — Signatures, stamps, and seals (DPT-2 only)
|
|
241
|
+
- `scan_code` — QR codes and barcodes (DPT-2 only)
|
|
242
|
+
|
|
243
|
+
### Grounding Types
|
|
244
|
+
|
|
245
|
+
#### For Chunks (with "chunk" prefix)
|
|
246
|
+
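- `chunkText`, `chunkTable`, `chunkFigure`, `chunkMarginalia`, `chunkLogo`, `chunkCard`, `chunkAttestation`, `chunkScanCode`, `chunkForm`, `chunkTitle`, `chunkPageHeader`, `chunkPageFooter`, `chunkPageNumber`, `chunkKeyValue`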
|
|
247
|
+
|
|
248
|
+
#### For Structure Elements (no prefix)
|
|
249
|
+
- `table` — Actual table structure
|
|
250
|
+
- `tableCell` — Individual table cell with position
|
|
251
|
+
|
|
252
|
+
### Bounding Box
|
|
253
|
+
|
|
254
|
+
All coordinates normalized 0–1: `{ left, top, right, bottom }`.
|
|
255
|
+
|
|
256
|
+
### Table Cell Position
|
|
257
|
+
|
|
258
|
+
`{ row, col, rowspan, colspan, chunk_id }` — `row` and `col` are zero-indexed; `rowspan` and `colspan` are span counts (1 for an unmerged cell).
|
|
259
|
+
|
|
260
|
+
### Table Chunk Formats
|
|
261
|
+
|
|
262
|
+
Table chunks render as HTML. The ID format and grounding availability differ by source document type.
|
|
263
|
+
|
|
264
|
+
#### PDF / Image / Document Tables
|
|
265
|
+
|
|
266
|
+
Element IDs use the format `{page_number}-{base62_sequential_number}` (page starts at 0, numbers increment per element within the page). Cells may include `rowspan`/`colspan` attributes. The `grounding` object contains bounding boxes and `tableCell` position entries for every cell.
|
|
267
|
+
|
|
268
|
+
```html
|
|
269
|
+
<a id='chunk-uuid'></a>
|
|
270
|
+
|
|
271
|
+
<table id="0-1">
|
|
272
|
+
<tr><td id="0-2" colspan="2">Product Summary</td></tr>
|
|
273
|
+
<tr><td id="0-3">Product</td><td id="0-4">Revenue</td></tr>
|
|
274
|
+
<tr><td id="0-5">Hardware</td><td id="0-6">15,230</td></tr>
|
|
275
|
+
</table>
|
|
276
|
+
```
|
|
277
|
+
|
|
278
|
+
#### Spreadsheet Tables (XLSX / CSV)
|
|
279
|
+
|
|
280
|
+
Element IDs use the format `{tab_name}-{cell_reference}` (e.g., `Sheet 1-A1`). The table element itself uses `{tab_name}-{start_cell}:{end_cell}` (e.g., `Sheet 1-A1:B4`). Embedded images and charts become `figure` chunks.
|
|
281
|
+
|
|
282
|
+
**`grounding` is `null`** for spreadsheet table chunks — cell positions are encoded in the IDs themselves.
|
|
283
|
+
|
|
284
|
+
```html
|
|
285
|
+
<a id='Sheet 1-A1:B4-chunk'></a>
|
|
286
|
+
|
|
287
|
+
<table id='Sheet 1-A1:B4'>
|
|
288
|
+
<tr>
|
|
289
|
+
<td id='Sheet 1-A1'>Program</td>
|
|
290
|
+
<td id='Sheet 1-B1'>Interest Rate</td>
|
|
291
|
+
</tr>
|
|
292
|
+
<tr>
|
|
293
|
+
<td id='Sheet 1-A2'>15 Year Fixed-Rate Mortgage</td>
|
|
294
|
+
<td id='Sheet 1-B2'>0.05125</td>
|
|
295
|
+
</tr>
|
|
296
|
+
</table>
|
|
297
|
+
```
|
|
298
|
+
|
|
299
|
+
---
|
|
300
|
+
|
|
301
|
+
## Error Responses
|
|
302
|
+
|
|
303
|
+
All errors follow this format:
|
|
304
|
+
|
|
305
|
+
```json
|
|
306
|
+
{
|
|
307
|
+
"error": {
|
|
308
|
+
"message": "Human-readable error message",
|
|
309
|
+
"type": "error_type",
|
|
310
|
+
"details": { "field": "problem_field", "reason": "Specific reason" }
|
|
311
|
+
}
|
|
312
|
+
}
|
|
313
|
+
```
|
|
314
|
+
|
|
315
|
+
### HTTP Status Codes
|
|
316
|
+
|
|
317
|
+
| Status | Error Type | Description | Solution |
|
|
318
|
+
|--------|------------|-------------|----------|
|
|
319
|
+
| 400 | `validation_error` | Invalid parameters | Check request format |
|
|
320
|
+
| 401 | `authentication_error` | Invalid API key | Check VISION_AGENT_API_KEY |
|
|
321
|
+
| 413 | `payload_too_large` | File too large | Use Parse Jobs API |
|
|
322
|
+
| 422 | `unprocessable_entity` | Invalid file type or malformed schema | Validate file format and schema JSON |
|
|
323
|
+
| 429 | `rate_limit_error` | Too many requests | Implement backoff |
|
|
324
|
+
| 500 | `internal_error` | Server error | Retry with backoff |
|
|
325
|
+
| 504 | `timeout_error` | Request timeout | Use Parse Jobs API |
|
|
326
|
+
|
|
327
|
+
## Model Versions
|
|
328
|
+
|
|
329
|
+
| Operation | Current Version | Description |
|
|
330
|
+
|-----------|----------------|-------------|
|
|
331
|
+
| Parse | `dpt-2-latest` | Document parsing and OCR |
|
|
332
|
+
| Extract | `extract-latest` | Schema-based extraction |
|
|
333
|
+
| Split | `split-latest` | Document classification |
|
|
334
|
+
|
|
335
|
+
## Supported File Types
|
|
336
|
+
|
|
337
|
+
| Category | Formats | Notes |
|
|
338
|
+
|----------|---------|-------|
|
|
339
|
+
| **PDF** | PDF | Up to 100 pages; no password-protected files |
|
|
340
|
+
| **Images** | JPEG, JPG, PNG, APNG, BMP, DCX, DDS, DIB, GD, GIF, ICNS, JP2, PCX, PPM, PSD, TGA, TIF, TIFF, WEBP | |
|
|
341
|
+
| **Text Documents** | DOC, DOCX, ODT | Converted to PDF before parsing |
|
|
342
|
+
| **Presentations** | ODP, PPT, PPTX | Converted to PDF before parsing |
|
|
343
|
+
| **Spreadsheets** | CSV, XLSX | Up to 10 MB in Playground; no sheet/column/row limits |
|
|
344
|
+
|
|
345
|
+
> **Note:** Word, PowerPoint, and OpenDocument files are converted to PDF server-side before parsing.
|
|
346
|
+
|
|
347
|
+
## Best Practices
|
|
348
|
+
|
|
349
|
+
### File Size Handling
|
|
350
|
+
- < 50MB: Use synchronous Parse API
|
|
351
|
+
- \> 50MB: Use Parse Jobs API
|
|
352
|
+
- \> 100MB: Consider splitting document first
|
|
353
|
+
|
|
354
|
+
### Rate Limiting
|
|
355
|
+
- Implement exponential backoff — start with 1s, double on each retry, max 5 retries
|
|
356
|
+
|
|
357
|
+
### Cost Optimization
|
|
358
|
+
- Parse once, extract/split multiple times
|
|
359
|
+
- Use specific schemas (avoid extracting everything)
|
|
360
|
+
- Cache parsed results when possible
|
|
361
|
+
|
|
362
|
+
---
|
|
363
|
+
|
|
364
|
+
# API (curl) Reference
|
|
365
|
+
|
|
366
|
+
Direct HTTP API implementation using curl and shell scripts.
|
|
367
|
+
|
|
368
|
+
## Authentication
|
|
369
|
+
|
|
370
|
+
```bash
|
|
371
|
+
export VISION_AGENT_API_KEY="v2_..."
|
|
372
|
+
BASE_URL="https://api.va.landing.ai" # or https://api.va.eu-west-1.landing.ai for EU
|
|
373
|
+
```
|
|
374
|
+
|
|
375
|
+
## Parse Examples
|
|
376
|
+
|
|
377
|
+
### Basic Parse
|
|
378
|
+
```bash
|
|
379
|
+
curl -s -X POST "$BASE_URL/v1/ade/parse" \
|
|
380
|
+
-H "Authorization: Bearer $VISION_AGENT_API_KEY" \
|
|
381
|
+
-F "document=@document.pdf" \
|
|
382
|
+
-F "model=dpt-2-latest"
|
|
383
|
+
```
|
|
384
|
+
|
|
385
|
+
### Parse with Page Splitting
|
|
386
|
+
```bash
|
|
387
|
+
curl -s -X POST "$BASE_URL/v1/ade/parse" \
|
|
388
|
+
-H "Authorization: Bearer $VISION_AGENT_API_KEY" \
|
|
389
|
+
-F "document=@multi_page.pdf" \
|
|
390
|
+
-F "split=page"
|
|
391
|
+
```
|
|
392
|
+
|
|
393
|
+
### Parse from URL
|
|
394
|
+
```bash
|
|
395
|
+
curl -s -X POST "$BASE_URL/v1/ade/parse" \
|
|
396
|
+
-H "Authorization: Bearer $VISION_AGENT_API_KEY" \
|
|
397
|
+
-F "document_url=https://example.com/document.pdf"
|
|
398
|
+
```
|
|
399
|
+
|
|
400
|
+
## Extract Examples
|
|
401
|
+
|
|
402
|
+
```bash
|
|
403
|
+
SCHEMA='{
|
|
404
|
+
"type": "object",
|
|
405
|
+
"properties": {
|
|
406
|
+
"invoice_number": {"type": "string", "description": "Invoice number"},
|
|
407
|
+
"total_amount": {"type": "number", "description": "Total amount"},
|
|
408
|
+
"vendor_name": {"type": "string", "description": "Vendor name"}
|
|
409
|
+
}
|
|
410
|
+
}'
|
|
411
|
+
|
|
412
|
+
# Extract from a markdown file (parse first if you have a PDF)
|
|
413
|
+
curl -s -X POST "$BASE_URL/v1/ade/extract" \
|
|
414
|
+
-H "Authorization: Bearer $VISION_AGENT_API_KEY" \
|
|
415
|
+
-F "markdown=@parsed_invoice.md" \
|
|
416
|
+
-F "schema=$SCHEMA" \
|
|
417
|
+
-F "model=extract-latest"
|
|
418
|
+
```
|
|
419
|
+
|
|
420
|
+
### Parse Once, Extract Many
|
|
421
|
+
```bash
|
|
422
|
+
# Parse once, save markdown
|
|
423
|
+
MARKDOWN=$(curl -s -X POST "$BASE_URL/v1/ade/parse" \
|
|
424
|
+
-H "Authorization: Bearer $VISION_AGENT_API_KEY" \
|
|
425
|
+
-F "document=@invoice.pdf" \
|
|
426
|
+
| jq -r '.markdown')
|
|
427
|
+
|
|
428
|
+
# Extract with different schemas
|
|
429
|
+
curl -s -X POST "$BASE_URL/v1/ade/extract" \
|
|
430
|
+
-H "Authorization: Bearer $VISION_AGENT_API_KEY" \
|
|
431
|
+
-F "markdown=$MARKDOWN" \
|
|
432
|
+
-F "schema=$SCHEMA"
|
|
433
|
+
```
|
|
434
|
+
|
|
435
|
+
## Split Examples
|
|
436
|
+
|
|
437
|
+
```bash
|
|
438
|
+
SPLIT_CLASSES='[
|
|
439
|
+
{"name": "Invoice", "identifier": "Invoice Number"},
|
|
440
|
+
{"name": "Receipt", "identifier": "Receipt Number"},
|
|
441
|
+
{"name": "Purchase Order", "identifier": "PO Number"}
|
|
442
|
+
]'
|
|
443
|
+
|
|
444
|
+
# Parse first, then split
|
|
445
|
+
MARKDOWN=$(curl -s -X POST "$BASE_URL/v1/ade/parse" \
|
|
446
|
+
-H "Authorization: Bearer $VISION_AGENT_API_KEY" \
|
|
447
|
+
-F "document=@mixed_documents.pdf" \
|
|
448
|
+
| jq -r '.markdown')
|
|
449
|
+
|
|
450
|
+
curl -s -X POST "$BASE_URL/v1/ade/split" \
|
|
451
|
+
-H "Authorization: Bearer $VISION_AGENT_API_KEY" \
|
|
452
|
+
-F "markdown=$MARKDOWN" \
|
|
453
|
+
-F "split_class=$SPLIT_CLASSES" \
|
|
454
|
+
-F "model=split-latest"
|
|
455
|
+
```
|
|
456
|
+
|
|
457
|
+
## Parse Jobs (Async, Large Files)
|
|
458
|
+
|
|
459
|
+
```bash
|
|
460
|
+
#!/bin/bash
|
|
461
|
+
|
|
462
|
+
# Create job
|
|
463
|
+
JOB_ID=$(curl -s -X POST "$BASE_URL/v1/ade/parse/jobs" \
|
|
464
|
+
-H "Authorization: Bearer $VISION_AGENT_API_KEY" \
|
|
465
|
+
-F "document=@large_document.pdf" \
|
|
466
|
+
-F "model=dpt-2-latest" \
|
|
467
|
+
| jq -r '.job_id')
|
|
468
|
+
|
|
469
|
+
echo "Created job: $JOB_ID"
|
|
470
|
+
|
|
471
|
+
# Poll for completion
|
|
472
|
+
while true; do
|
|
473
|
+
STATUS=$(curl -s -X GET "$BASE_URL/v1/ade/parse/jobs/$JOB_ID" \
|
|
474
|
+
-H "Authorization: Bearer $VISION_AGENT_API_KEY")
|
|
475
|
+
|
|
476
|
+
STATE=$(echo "$STATUS" | jq -r '.status')
|
|
477
|
+
PROGRESS=$(echo "$STATUS" | jq -r '.progress')
|
|
478
|
+
|
|
479
|
+
echo "Status: $STATE, Progress: $(echo "$PROGRESS * 100" | bc)%"
|
|
480
|
+
|
|
481
|
+
if [ "$STATE" = "completed" ]; then
|
|
482
|
+
echo "$STATUS" | jq '.data' > "parse_result.json"
|
|
483
|
+
break
|
|
484
|
+
elif [ "$STATE" = "failed" ]; then
|
|
485
|
+
echo "Job failed: $(echo "$STATUS" | jq -r '.failure_reason')" >&2
|
|
486
|
+
exit 1
|
|
487
|
+
fi
|
|
488
|
+
|
|
489
|
+
sleep 5
|
|
490
|
+
done
|
|
491
|
+
```
|
|
492
|
+
|
|
493
|
+
## Complete Workflow: Parse → Split → Extract
|
|
494
|
+
|
|
495
|
+
```bash
|
|
496
|
+
#!/bin/bash
|
|
497
|
+
|
|
498
|
+
# 1. Parse
|
|
499
|
+
MARKDOWN=$(curl -s -X POST "$BASE_URL/v1/ade/parse" \
|
|
500
|
+
-H "Authorization: Bearer $VISION_AGENT_API_KEY" \
|
|
501
|
+
-F "document=@mixed_invoices.pdf" \
|
|
502
|
+
| jq -r '.markdown')
|
|
503
|
+
|
|
504
|
+
# 2. Split
|
|
505
|
+
SPLIT_CLASSES='[
|
|
506
|
+
{"name": "Invoice", "identifier": "Invoice Number"},
|
|
507
|
+
{"name": "Credit Note", "identifier": "Credit Note Number"}
|
|
508
|
+
]'
|
|
509
|
+
|
|
510
|
+
SPLITS=$(curl -s -X POST "$BASE_URL/v1/ade/split" \
|
|
511
|
+
-H "Authorization: Bearer $VISION_AGENT_API_KEY" \
|
|
512
|
+
-F "markdown=$MARKDOWN" \
|
|
513
|
+
-F "split_class=$SPLIT_CLASSES")
|
|
514
|
+
|
|
515
|
+
# 3. Extract from each split
|
|
516
|
+
SCHEMA='{"type": "object", "properties": {
|
|
517
|
+
"document_number": {"type": "string"},
|
|
518
|
+
"total": {"type": "number"},
|
|
519
|
+
"date": {"type": "string"}
|
|
520
|
+
}}'
|
|
521
|
+
|
|
522
|
+
echo "$SPLITS" | jq -c '.splits[]' | while read -r split; do
|
|
523
|
+
TYPE=$(echo "$split" | jq -r '.classification')
|
|
524
|
+
ID=$(echo "$split" | jq -r '.identifier')
|
|
525
|
+
MD=$(echo "$split" | jq -r '.markdowns[0]')
|
|
526
|
+
|
|
527
|
+
echo "Processing $TYPE: $ID"
|
|
528
|
+
|
|
529
|
+
curl -s -X POST "$BASE_URL/v1/ade/extract" \
|
|
530
|
+
-H "Authorization: Bearer $VISION_AGENT_API_KEY" \
|
|
531
|
+
-F "markdown=$MD" \
|
|
532
|
+
-F "schema=$SCHEMA" \
|
|
533
|
+
| jq '.extraction'
|
|
534
|
+
done
|
|
535
|
+
```
|
|
536
|
+
|
|
537
|
+
## Error Handling with Retry
|
|
538
|
+
|
|
539
|
+
```bash
|
|
540
|
+
#!/bin/bash
|
|
541
|
+
|
|
542
|
+
MAX_RETRIES=3
|
|
543
|
+
RETRY_COUNT=0
|
|
544
|
+
|
|
545
|
+
while [ $RETRY_COUNT -lt $MAX_RETRIES ]; do
|
|
546
|
+
RESPONSE=$(curl -s -w "\n%{http_code}" -X POST "$BASE_URL/v1/ade/parse" \
|
|
547
|
+
-H "Authorization: Bearer $VISION_AGENT_API_KEY" \
|
|
548
|
+
-F "document=@document.pdf")
|
|
549
|
+
|
|
550
|
+
HTTP_CODE=$(echo "$RESPONSE" | tail -n 1)
|
|
551
|
+
BODY=$(echo "$RESPONSE" | sed '$d')
|
|
552
|
+
|
|
553
|
+
if [ "$HTTP_CODE" -eq 200 ]; then
|
|
554
|
+
echo "$BODY"
|
|
555
|
+
break
|
|
556
|
+
elif [ "$HTTP_CODE" -eq 429 ]; then
|
|
557
|
+
WAIT_TIME=$((2 ** RETRY_COUNT * 10))
|
|
558
|
+
echo "Rate limited. Waiting ${WAIT_TIME}s..." >&2
|
|
559
|
+
sleep $WAIT_TIME
|
|
560
|
+
RETRY_COUNT=$((RETRY_COUNT + 1))
|
|
561
|
+
elif [ "$HTTP_CODE" -eq 413 ] || [ "$HTTP_CODE" -eq 504 ]; then
|
|
562
|
+
echo "File too large or timeout — use parse jobs API" >&2
|
|
563
|
+
exit 1
|
|
564
|
+
else
|
|
565
|
+
echo "Error: HTTP $HTTP_CODE" >&2
|
|
566
|
+
echo "$BODY" | jq '.error' >&2
|
|
567
|
+
exit 1
|
|
568
|
+
fi
|
|
569
|
+
done
|
|
570
|
+
```
|
|
571
|
+
|
|
572
|
+
## jq Recipes
|
|
573
|
+
|
|
574
|
+
```bash
|
|
575
|
+
# Extract just markdown
|
|
576
|
+
curl -s ... | jq -r '.markdown'
|
|
577
|
+
|
|
578
|
+
# Get all tables
|
|
579
|
+
curl -s ... | jq '.chunks[] | select(.type == "table")'
|
|
580
|
+
|
|
581
|
+
# Extract table cells with positions
|
|
582
|
+
curl -s ... | jq '.grounding | to_entries[] | select(.value.type == "tableCell")'
|
|
583
|
+
|
|
584
|
+
# Get chunks from specific page
|
|
585
|
+
curl -s ... | jq '.chunks[] | select(.grounding.page == 0)'
|
|
586
|
+
|
|
587
|
+
# Group chunks by type with counts
|
|
588
|
+
curl -s ... | jq '.chunks | group_by(.type) | map({type: .[0].type, count: length})'
|
|
589
|
+
|
|
590
|
+
# Get specific extracted field
|
|
591
|
+
curl -s ... | jq '.extraction.invoice_number'
|
|
592
|
+
|
|
593
|
+
# Process extracted line items
|
|
594
|
+
curl -s ... | jq '.extraction.line_items[] | {sku: .sku, total: (.quantity * .unit_price)}'
|
|
595
|
+
```
|
|
596
|
+
|
|
597
|
+
## Shell Functions for Reuse
|
|
598
|
+
|
|
599
|
+
```bash
|
|
600
|
+
ade_parse() {
|
|
601
|
+
curl -s -X POST "$BASE_URL/v1/ade/parse" \
|
|
602
|
+
-H "Authorization: Bearer $VISION_AGENT_API_KEY" \
|
|
603
|
+
-F "document=@$1"
|
|
604
|
+
}
|
|
605
|
+
|
|
606
|
+
ade_extract() {
|
|
607
|
+
curl -s -X POST "$BASE_URL/v1/ade/extract" \
|
|
608
|
+
-H "Authorization: Bearer $VISION_AGENT_API_KEY" \
|
|
609
|
+
-F "markdown=$1" \
|
|
610
|
+
-F "schema=$2"
|
|
611
|
+
}
|
|
612
|
+
```
|
|
613
|
+
|
|
614
|
+
---
|
|
615
|
+
|
|
616
|
+
## External Links
|
|
617
|
+
|
|
618
|
+
- [API Reference](https://docs.landing.ai/api-reference)
|
|
619
|
+
- [ADE Documentation](https://docs.landing.ai/ade)
|
|
620
|
+
- [Supported File Types](https://docs.landing.ai/ade/ade-file-types)
|