docslight 0.1.4__tar.gz → 0.1.5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docslight-0.1.5/PKG-INFO +474 -0
- docslight-0.1.5/README.md +440 -0
- {docslight-0.1.4 → docslight-0.1.5}/docslight/__init__.py +1 -1
- {docslight-0.1.4 → docslight-0.1.5}/docslight/cli.py +74 -74
- {docslight-0.1.4 → docslight-0.1.5}/docslight/cloud/client.py +362 -362
- {docslight-0.1.4 → docslight-0.1.5}/docslight/config.py +1 -1
- {docslight-0.1.4 → docslight-0.1.5}/docslight/local/llm_extractor.py +43 -43
- {docslight-0.1.4 → docslight-0.1.5}/docslight/local/loaders.py +1 -1
- {docslight-0.1.4 → docslight-0.1.5}/docslight/local/paddle_parser.py +26 -26
- {docslight-0.1.4 → docslight-0.1.5}/docslight/local/pipeline.py +141 -141
- {docslight-0.1.4 → docslight-0.1.5}/docslight/preview.py +2 -2
- {docslight-0.1.4 → docslight-0.1.5}/docslight/providers/openai_compatible.py +3 -3
- {docslight-0.1.4 → docslight-0.1.5}/docslight/schemas/fields.py +2 -2
- {docslight-0.1.4 → docslight-0.1.5}/docslight/web_app.py +58 -58
- docslight-0.1.5/docslight.egg-info/PKG-INFO +474 -0
- {docslight-0.1.4 → docslight-0.1.5}/docslight.egg-info/SOURCES.txt +13 -1
- {docslight-0.1.4 → docslight-0.1.5}/docslight.egg-info/requires.txt +11 -26
- {docslight-0.1.4 → docslight-0.1.5}/pyproject.toml +29 -44
- docslight-0.1.5/tests/test_cli.py +450 -0
- docslight-0.1.5/tests/test_cli_entrypoint.py +15 -0
- docslight-0.1.5/tests/test_client.py +255 -0
- docslight-0.1.5/tests/test_cloud_client.py +771 -0
- docslight-0.1.5/tests/test_config_result.py +231 -0
- docslight-0.1.5/tests/test_examples.py +20 -0
- docslight-0.1.5/tests/test_local_llm.py +401 -0
- docslight-0.1.5/tests/test_local_loader_parser.py +300 -0
- docslight-0.1.5/tests/test_local_office_loader.py +108 -0
- docslight-0.1.5/tests/test_local_pipeline.py +825 -0
- docslight-0.1.5/tests/test_schema_helpers.py +117 -0
- docslight-0.1.5/tests/test_web_app.py +442 -0
- docslight-0.1.4/PKG-INFO +0 -277
- docslight-0.1.4/README.md +0 -230
- docslight-0.1.4/docslight.egg-info/PKG-INFO +0 -277
- {docslight-0.1.4 → docslight-0.1.5}/LICENSE +0 -0
- {docslight-0.1.4 → docslight-0.1.5}/docslight/client.py +0 -0
- {docslight-0.1.4 → docslight-0.1.5}/docslight/cloud/__init__.py +0 -0
- {docslight-0.1.4 → docslight-0.1.5}/docslight/exceptions.py +0 -0
- {docslight-0.1.4 → docslight-0.1.5}/docslight/local/__init__.py +0 -0
- {docslight-0.1.4 → docslight-0.1.5}/docslight/local/layout_blocks.py +0 -0
- {docslight-0.1.4 → docslight-0.1.5}/docslight/local/markdown.py +0 -0
- {docslight-0.1.4 → docslight-0.1.5}/docslight/local/office_loader.py +0 -0
- {docslight-0.1.4 → docslight-0.1.5}/docslight/providers/__init__.py +0 -0
- {docslight-0.1.4 → docslight-0.1.5}/docslight/providers/ollama.py +0 -0
- {docslight-0.1.4 → docslight-0.1.5}/docslight/result.py +0 -0
- {docslight-0.1.4 → docslight-0.1.5}/docslight/schemas/__init__.py +0 -0
- {docslight-0.1.4 → docslight-0.1.5}/docslight/standard_json.py +0 -0
- {docslight-0.1.4 → docslight-0.1.5}/docslight.egg-info/dependency_links.txt +0 -0
- {docslight-0.1.4 → docslight-0.1.5}/docslight.egg-info/entry_points.txt +0 -0
- {docslight-0.1.4 → docslight-0.1.5}/docslight.egg-info/top_level.txt +0 -0
- {docslight-0.1.4 → docslight-0.1.5}/setup.cfg +0 -0
docslight-0.1.5/PKG-INFO
ADDED
|
@@ -0,0 +1,474 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: docslight
|
|
3
|
+
Version: 0.1.5
|
|
4
|
+
Summary: Lightweight ComPDF document parsing and extraction SDK
|
|
5
|
+
Author-email: ComPDF AI <support@compdf.com>
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Requires-Python: <=3.13,>=3.10
|
|
8
|
+
Description-Content-Type: text/markdown
|
|
9
|
+
License-File: LICENSE
|
|
10
|
+
Requires-Dist: requests>=2.31.0
|
|
11
|
+
Requires-Dist: pydantic>=2.5.0
|
|
12
|
+
Requires-Dist: typing-extensions>=4.8.0; python_version < "3.11"
|
|
13
|
+
Requires-Dist: tomli>=2.0.1; python_version < "3.11"
|
|
14
|
+
Requires-Dist: flask>=3.1.3
|
|
15
|
+
Requires-Dist: werkzeug>=3.1.8
|
|
16
|
+
Requires-Dist: Pillow>=10.0.0
|
|
17
|
+
Requires-Dist: PyMuPDF>=1.23.0
|
|
18
|
+
Requires-Dist: numpy>=1.24.0
|
|
19
|
+
Requires-Dist: paddlepaddle>=3.3.0
|
|
20
|
+
Requires-Dist: paddleocr[doc-parser]>=3.3.0
|
|
21
|
+
Requires-Dist: python-docx>=1.1.0
|
|
22
|
+
Requires-Dist: python-pptx>=0.6.23
|
|
23
|
+
Requires-Dist: openpyxl>=3.1.0
|
|
24
|
+
Requires-Dist: openai>=1.0.0
|
|
25
|
+
Provides-Extra: dev
|
|
26
|
+
Requires-Dist: pytest>=7.4.0; extra == "dev"
|
|
27
|
+
Requires-Dist: pytest-cov>=4.1.0; extra == "dev"
|
|
28
|
+
Requires-Dist: ruff>=0.1.0; extra == "dev"
|
|
29
|
+
Requires-Dist: mypy>=1.7.0; extra == "dev"
|
|
30
|
+
Requires-Dist: build>=1.0.0; extra == "dev"
|
|
31
|
+
Requires-Dist: twine>=4.0.0; extra == "dev"
|
|
32
|
+
Requires-Dist: playwright>=1.40.0; extra == "dev"
|
|
33
|
+
Dynamic: license-file
|
|
34
|
+
|
|
35
|
+
<div align="center">
|
|
36
|
+
<h1 align="center">DocSlight</h1>
|
|
37
|
+
<p align="center">Lightweight Python SDK & CLI for document parsing and structured extraction</p>
|
|
38
|
+
<p align="center">
|
|
39
|
+
<a href="https://pypi.org/project/docslight/"><img src="https://img.shields.io/pypi/v/docslight" alt="PyPI"></a>
|
|
40
|
+
<a href="https://pypi.org/project/docslight/"><img src="https://img.shields.io/pypi/pyversions/docslight" alt="Python versions"></a>
|
|
41
|
+
<a href="LICENSE"><img src="https://img.shields.io/github/license/kdanmobile/docslight" alt="License"></a>
|
|
42
|
+
</p>
|
|
43
|
+
</div>
|
|
44
|
+
|
|
45
|
+
## What is DocSlight?
|
|
46
|
+
|
|
47
|
+
A lightweight Python library that turns PDFs, images, and Office documents into clean Markdown or structured JSON — with one line of code. Works with ComPDF Cloud (recommended) or fully offline with local parsers.
|
|
48
|
+
|
|
49
|
+
```python
|
|
50
|
+
from docslight import DocSlight
|
|
51
|
+
|
|
52
|
+
client = DocSlight(api_key="<COMPDF_API_KEY>")
|
|
53
|
+
result = client.parse("<DOCUMENT_PATH>")
|
|
54
|
+
print(result.to_markdown())
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
## Quick Start
|
|
58
|
+
|
|
59
|
+
```bash
|
|
60
|
+
pip install docslight
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
Cloud parse with the CLI:
|
|
64
|
+
|
|
65
|
+
```bash
|
|
66
|
+
docslight parse "<DOCUMENT_PATH>" --mode cloud --api-key "<COMPDF_API_KEY>" --output "<OUTPUT.md>"
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
Cloud extract with the CLI:
|
|
70
|
+
|
|
71
|
+
```bash
|
|
72
|
+
docslight extract "<DOCUMENT_PATH>" --mode cloud --api-key "<COMPDF_API_KEY>" --fields "invoice_number,total_amount" --output "<OUTPUT.json>"
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
Local parse and extract:
|
|
76
|
+
|
|
77
|
+
```bash
|
|
78
|
+
docslight parse "<DOCUMENT_PATH>" --mode local --output "<OUTPUT.md>"
|
|
79
|
+
docslight extract "<DOCUMENT_PATH>" --mode local --fields "invoice_number,total_amount" --local-llm-provider ollama --local-llm-model "<OLLAMA_MODEL>" --output "<OUTPUT.json>"
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
Launch the API server:
|
|
83
|
+
|
|
84
|
+
```bash
|
|
85
|
+
docslight web
|
|
86
|
+
# Health: http://127.0.0.1:8000/api/health
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
Placeholders used above:
|
|
90
|
+
|
|
91
|
+
| Placeholder | Description |
|
|
92
|
+
|-------------|-------------|
|
|
93
|
+
| `<DOCUMENT_PATH>` | Input file path. Use an absolute or relative path, and quote paths that contain spaces. |
|
|
94
|
+
| `<COMPDF_API_KEY>` | ComPDF Cloud API key. You can also set `DOCSLIGHT_API_KEY` instead of passing `--api-key`. |
|
|
95
|
+
| `<OUTPUT.md>` | Markdown output path for parse results. |
|
|
96
|
+
| `<OUTPUT.json>` | JSON output path for extract results. |
|
|
97
|
+
| `<OUTPUT.zip>` | ZIP output path for raw parse archives. |
|
|
98
|
+
| `<OLLAMA_MODEL>` | Model name available in your Ollama service, for example `llama3.1` or `qwen2.5:7b`. |
|
|
99
|
+
|
|
100
|
+
## Features
|
|
101
|
+
|
|
102
|
+
- **Dual mode** — ComPDF Cloud for production-grade results, or local CPU parsing for offline evaluation
|
|
103
|
+
- **Parse → Markdown** — Convert PDF, DOCX, PPTX, XLSX, and images (PNG, JPG, TIFF, BMP, WebP) to clean Markdown
|
|
104
|
+
- **Extract → JSON** — Pull structured data by field list, JSON Schema, or structured template (key-value + table extraction)
|
|
105
|
+
- **CLI first** — Full-featured command-line interface, script-friendly
|
|
106
|
+
- **API server** — Local Flask backend exposing parse, extract, preview, health, and system-info endpoints
|
|
107
|
+
- **Batch processing** — `parse_batch()` / `extract_batch()` for multiple files
|
|
108
|
+
- **Local LLM extraction** — Ollama or any OpenAI-compatible provider for offline extraction
|
|
109
|
+
- **Document types** — Classify and route documents by type for cloud extraction
|
|
110
|
+
- **Error-safe** — Typed result objects, structured error hierarchy, no credential leaks
|
|
111
|
+
|
|
112
|
+
## Install
|
|
113
|
+
|
|
114
|
+
| Scenario | Command |
|
|
115
|
+
|----------|---------|
|
|
116
|
+
| Runtime package (cloud, local, web, CLI, SDK) | `pip install docslight` |
|
|
117
|
+
| Development tools (tests, lint, build, upload) | `pip install -e ".[dev]"` |
|
|
118
|
+
|
|
119
|
+
> Local CPU parsing is experimental. Validate accuracy and latency on your own documents before production use.
|
|
120
|
+
|
|
121
|
+
## SDK Usage
|
|
122
|
+
|
|
123
|
+
### Cloud — Parse
|
|
124
|
+
|
|
125
|
+
```python
|
|
126
|
+
from docslight import DocSlight
|
|
127
|
+
|
|
128
|
+
client = DocSlight(
|
|
129
|
+
mode="cloud",
|
|
130
|
+
api_key="<COMPDF_API_KEY>",
|
|
131
|
+
base_url="https://api-server.compdf.com",
|
|
132
|
+
)
|
|
133
|
+
|
|
134
|
+
result = client.parse("<DOCUMENT_PATH>")
|
|
135
|
+
print(result.to_markdown()) # Clean markdown
|
|
136
|
+
print(result.to_json()) # Full result with pages + metadata
|
|
137
|
+
|
|
138
|
+
if result.raw_archive:
|
|
139
|
+
with open("<OUTPUT.zip>", "wb") as file:
|
|
140
|
+
file.write(result.raw_archive)
|
|
141
|
+
```
|
|
142
|
+
|
|
143
|
+
Parameters:
|
|
144
|
+
|
|
145
|
+
| Parameter | Required | Description |
|
|
146
|
+
|-----------|----------|-------------|
|
|
147
|
+
| `mode="cloud"` | No | Use ComPDF Cloud. This is the default if `mode` is omitted. |
|
|
148
|
+
| `api_key` | Yes for cloud | ComPDF Cloud API key. Can also come from `DOCSLIGHT_API_KEY`. |
|
|
149
|
+
| `base_url` | No | Cloud API base URL. Defaults to `https://api-server.compdf.com`. |
|
|
150
|
+
| `<DOCUMENT_PATH>` | Yes | PDF, image, DOCX, PPTX, or XLSX input file. |
|
|
151
|
+
| `<OUTPUT.zip>` | No | Optional path if you want to save the raw parse ZIP archive returned by cloud parse. |
|
|
152
|
+
|
|
153
|
+
### Cloud — Extract
|
|
154
|
+
|
|
155
|
+
```python
|
|
156
|
+
result = client.extract(
|
|
157
|
+
"<DOCUMENT_PATH>",
|
|
158
|
+
fields=["invoice_number", "invoice_date", "total_amount"],
|
|
159
|
+
)
|
|
160
|
+
print(result.to_json())
|
|
161
|
+
```
|
|
162
|
+
|
|
163
|
+
With a JSON Schema:
|
|
164
|
+
|
|
165
|
+
```python
|
|
166
|
+
schema = {
|
|
167
|
+
"type": "object",
|
|
168
|
+
"properties": {
|
|
169
|
+
"invoice_number": {"type": "string"},
|
|
170
|
+
"total_amount": {"type": "number"},
|
|
171
|
+
},
|
|
172
|
+
"required": ["invoice_number"],
|
|
173
|
+
}
|
|
174
|
+
result = client.extract("<DOCUMENT_PATH>", schema=schema)
|
|
175
|
+
```
|
|
176
|
+
|
|
177
|
+
With document type classification:
|
|
178
|
+
|
|
179
|
+
```python
|
|
180
|
+
result = client.extract(
|
|
181
|
+
"<DOCUMENT_PATH>",
|
|
182
|
+
fields=["invoice_number"],
|
|
183
|
+
document_types=["invoice"],
|
|
184
|
+
)
|
|
185
|
+
```
|
|
186
|
+
|
|
187
|
+
Cloud extract parameters:
|
|
188
|
+
|
|
189
|
+
| Parameter | Required | Description |
|
|
190
|
+
|-----------|----------|-------------|
|
|
191
|
+
| `fields` | Optional | Field list to extract. Accepts a `list[str]`, a comma-separated string, or a structured object with `keys` and `tableHeaders`. |
|
|
192
|
+
| `schema` | Optional | JSON Schema describing the desired output. If both `fields` and `schema` are provided, schema is used as the structured output contract. |
|
|
193
|
+
| `document_types` | Optional | List of document type labels for cloud-side classification/routing. |
|
|
194
|
+
|
|
195
|
+
### Local — Parse (Offline)
|
|
196
|
+
|
|
197
|
+
```python
|
|
198
|
+
client = DocSlight(mode="local")
|
|
199
|
+
result = client.parse("<DOCUMENT_PATH>")
|
|
200
|
+
print(result.to_markdown())
|
|
201
|
+
```
|
|
202
|
+
|
|
203
|
+
Local parse uses the runtime dependencies installed with:
|
|
204
|
+
|
|
205
|
+
```bash
|
|
206
|
+
pip install docslight
|
|
207
|
+
```
|
|
208
|
+
|
|
209
|
+
### Local — Extract with Ollama
|
|
210
|
+
|
|
211
|
+
```python
|
|
212
|
+
client = DocSlight(
|
|
213
|
+
mode="local",
|
|
214
|
+
local_llm={
|
|
215
|
+
"provider": "ollama",
|
|
216
|
+
"model": "<OLLAMA_MODEL>",
|
|
217
|
+
"base_url": "http://localhost:11434",
|
|
218
|
+
},
|
|
219
|
+
)
|
|
220
|
+
result = client.extract(
|
|
221
|
+
"<DOCUMENT_PATH>",
|
|
222
|
+
fields=["invoice_number", "invoice_date"],
|
|
223
|
+
)
|
|
224
|
+
print(result.to_json())
|
|
225
|
+
```
|
|
226
|
+
|
|
227
|
+
### Local — Extract with OpenAI-Compatible API
|
|
228
|
+
|
|
229
|
+
```python
|
|
230
|
+
client = DocSlight(
|
|
231
|
+
mode="local",
|
|
232
|
+
local_llm={
|
|
233
|
+
"provider": "openai-compatible",
|
|
234
|
+
"base_url": "<OPENAI_COMPATIBLE_BASE_URL>",
|
|
235
|
+
"model": "<MODEL_NAME>",
|
|
236
|
+
"api_key": "<LOCAL_LLM_API_KEY>",
|
|
237
|
+
"extra_body": {"enable_thinking": False}, # e.g., DashScope qwen3
|
|
238
|
+
},
|
|
239
|
+
)
|
|
240
|
+
result = client.extract("<DOCUMENT_PATH>", fields=["invoice_number"])
|
|
241
|
+
print(result.to_json())
|
|
242
|
+
```
|
|
243
|
+
|
|
244
|
+
Local LLM parameters:
|
|
245
|
+
|
|
246
|
+
| Parameter | Required | Description |
|
|
247
|
+
|-----------|----------|-------------|
|
|
248
|
+
| `provider` | No | `ollama`, `openai`, or `openai-compatible`. Defaults to `ollama` when other local LLM fields are set. |
|
|
249
|
+
| `model` | Yes for local extract | Local or OpenAI-compatible model name. |
|
|
250
|
+
| `base_url` | Required for `openai-compatible` | LLM endpoint base URL. Ollama defaults to `http://localhost:11434`. |
|
|
251
|
+
| `api_key` | Usually required for `openai-compatible` | API key for the LLM endpoint. Ollama can use the default placeholder key. |
|
|
252
|
+
| `extra_body` | No | Extra provider-specific request body, such as `{"enable_thinking": False}`. |
|
|
253
|
+
|
|
254
|
+
### Batch Processing
|
|
255
|
+
|
|
256
|
+
```python
|
|
257
|
+
results = client.parse_batch(["<DOCUMENT_1>", "<DOCUMENT_2>", "<DOCUMENT_3>"])
|
|
258
|
+
for r in results:
|
|
259
|
+
print(r.to_markdown()[:200])
|
|
260
|
+
```
|
|
261
|
+
|
|
262
|
+
## CLI Usage
|
|
263
|
+
|
|
264
|
+
### Cloud CLI
|
|
265
|
+
|
|
266
|
+
Parse to Markdown:
|
|
267
|
+
|
|
268
|
+
```bash
|
|
269
|
+
docslight parse "<DOCUMENT_PATH>" --mode cloud --api-key "<COMPDF_API_KEY>" --output "<OUTPUT.md>"
|
|
270
|
+
```
|
|
271
|
+
|
|
272
|
+
Save the raw parse ZIP archive:
|
|
273
|
+
|
|
274
|
+
```bash
|
|
275
|
+
docslight parse "<DOCUMENT_PATH>" --mode cloud --api-key "<COMPDF_API_KEY>" --format zip --output "<OUTPUT.zip>"
|
|
276
|
+
```
|
|
277
|
+
|
|
278
|
+
Extract fields to JSON:
|
|
279
|
+
|
|
280
|
+
```bash
|
|
281
|
+
docslight extract "<DOCUMENT_PATH>" --mode cloud --api-key "<COMPDF_API_KEY>" --fields "invoice_number,total_amount" --output "<OUTPUT.json>"
|
|
282
|
+
```
|
|
283
|
+
|
|
284
|
+
Extract with a JSON Schema file:
|
|
285
|
+
|
|
286
|
+
```bash
|
|
287
|
+
docslight extract "<DOCUMENT_PATH>" --mode cloud --api-key "<COMPDF_API_KEY>" --schema "<SCHEMA_PATH.json>" --output "<OUTPUT.json>"
|
|
288
|
+
```
|
|
289
|
+
|
|
290
|
+
### Local CLI
|
|
291
|
+
|
|
292
|
+
Parse locally:
|
|
293
|
+
|
|
294
|
+
```bash
|
|
295
|
+
docslight parse "<DOCUMENT_PATH>" --mode local --output "<OUTPUT.md>"
|
|
296
|
+
```
|
|
297
|
+
|
|
298
|
+
Extract locally with Ollama:
|
|
299
|
+
|
|
300
|
+
```bash
|
|
301
|
+
docslight extract "<DOCUMENT_PATH>" --mode local --fields "invoice_number,total_amount" --local-llm-provider ollama --local-llm-model "<OLLAMA_MODEL>" --local-llm-base-url "http://localhost:11434" --output "<OUTPUT.json>"
|
|
302
|
+
```
|
|
303
|
+
|
|
304
|
+
Extract locally with an OpenAI-compatible endpoint:
|
|
305
|
+
|
|
306
|
+
```bash
|
|
307
|
+
docslight extract "<DOCUMENT_PATH>" --mode local --fields "invoice_number,total_amount" --local-llm-provider openai-compatible --local-llm-model "<MODEL_NAME>" --local-llm-base-url "<OPENAI_COMPATIBLE_BASE_URL>" --local-llm-api-key "<LOCAL_LLM_API_KEY>" --output "<OUTPUT.json>"
|
|
308
|
+
```
|
|
309
|
+
|
|
310
|
+
CLI parameters:
|
|
311
|
+
|
|
312
|
+
| Option | Required | Description |
|
|
313
|
+
|--------|----------|-------------|
|
|
314
|
+
| `<DOCUMENT_PATH>` | Yes | Input file path. Quote paths that contain spaces. |
|
|
315
|
+
| `--mode` | No | `cloud` or `local`. If omitted, the mode is taken from the config file or environment, falling back to `cloud`. |
|
|
316
|
+
| `--api-key` | Yes for cloud unless env is set | ComPDF Cloud API key. Can also be supplied via the `DOCSLIGHT_API_KEY` environment variable. |
|
|
317
|
+
| `--base-url` | No | Cloud API base URL. Defaults to `https://api-server.compdf.com`. |
|
|
318
|
+
| `--output`, `-o` | No | Output path. If omitted, text/JSON is written to stdout. Binary ZIP output should use `--output`. |
|
|
319
|
+
| `--format` | No | Parse output format: `markdown`, `json`, `standard-json`, or `zip`. If output ends in `.zip`, `zip` is inferred. |
|
|
320
|
+
| `--fields` | Optional for extract | Comma-separated field names, for example `"invoice_number,total_amount"`, or a JSON object string. |
|
|
321
|
+
| `--schema` | Optional for extract | Path to a JSON Schema file. |
|
|
322
|
+
| `--document-types` | Optional for cloud extract | Path to a JSON file containing a list, for example `["invoice"]`. |
|
|
323
|
+
| `--local-llm-provider` | No | `ollama`, `openai`, or `openai-compatible`. Defaults to `ollama` if other local LLM args are supplied. |
|
|
324
|
+
| `--local-llm-model` | Yes for local extract | Local LLM model name. |
|
|
325
|
+
| `--local-llm-base-url` | Required for OpenAI-compatible local extract | LLM service base URL. Ollama defaults to `http://localhost:11434`. |
|
|
326
|
+
| `--local-llm-api-key` | Usually required for OpenAI-compatible local extract | API key for the local or private LLM service. |
|
|
327
|
+
| `--host` | No, `docslight web` only | API server host. Defaults to `127.0.0.1`. |
|
|
328
|
+
| `--port` | No, `docslight web` only | API server port. Defaults to `8000`. |
|
|
329
|
+
| `--debug` | No, `docslight web` only | Enable Flask debug logging. |
|
|
330
|
+
|
|
331
|
+
## API Server
|
|
332
|
+
|
|
333
|
+
DocSlight includes a local Flask API server for document processing. Frontend assets are not bundled in this package.
|
|
334
|
+
|
|
335
|
+
Install and start:
|
|
336
|
+
|
|
337
|
+
```bash
|
|
338
|
+
pip install docslight
|
|
339
|
+
docslight web --host 127.0.0.1 --port 8000
|
|
340
|
+
```
|
|
341
|
+
|
|
342
|
+
You can also run the same server module directly:
|
|
343
|
+
|
|
344
|
+
```bash
|
|
345
|
+
python -m docslight.web_app --host 127.0.0.1 --port 8000 --debug
|
|
346
|
+
```
|
|
347
|
+
|
|
348
|
+
### API Health
|
|
349
|
+
|
|
350
|
+
```powershell
|
|
351
|
+
curl.exe "http://127.0.0.1:8000/api/health"
|
|
352
|
+
curl.exe "http://127.0.0.1:8000/api/system-info"
|
|
353
|
+
```
|
|
354
|
+
|
|
355
|
+
### Cloud Parse API
|
|
356
|
+
|
|
357
|
+
`POST /api/parse` returns a ZIP file when the cloud parser returns a parse archive.
|
|
358
|
+
|
|
359
|
+
```powershell
|
|
360
|
+
curl.exe -X POST "http://127.0.0.1:8000/api/parse" `
|
|
361
|
+
-F "file=@<DOCUMENT_PATH>" `
|
|
362
|
+
-F "mode=cloud" `
|
|
363
|
+
-F "api_key=<COMPDF_API_KEY>" `
|
|
364
|
+
-F "base_url=https://api-server.compdf.com" `
|
|
365
|
+
--output "<OUTPUT.zip>"
|
|
366
|
+
```
|
|
367
|
+
|
|
368
|
+
### Cloud Extract API
|
|
369
|
+
|
|
370
|
+
```powershell
|
|
371
|
+
curl.exe -X POST "http://127.0.0.1:8000/api/extract" `
|
|
372
|
+
-F "file=@<DOCUMENT_PATH>" `
|
|
373
|
+
-F "mode=cloud" `
|
|
374
|
+
-F "api_key=<COMPDF_API_KEY>" `
|
|
375
|
+
-F "base_url=https://api-server.compdf.com" `
|
|
376
|
+
-F 'fields={"name":"Invoice","keys":{"invoice_number":{"prompt":null,"mapping":null},"total_amount":{"prompt":null,"mapping":null}}}' `
|
|
377
|
+
-F "cloud_extract_mode=vlm" `
|
|
378
|
+
--output "<OUTPUT.json>"
|
|
379
|
+
```
|
|
380
|
+
|
|
381
|
+
### Local Parse API
|
|
382
|
+
|
|
383
|
+
```powershell
|
|
384
|
+
curl.exe -X POST "http://127.0.0.1:8000/api/parse" `
|
|
385
|
+
-F "file=@<DOCUMENT_PATH>" `
|
|
386
|
+
-F "mode=local" `
|
|
387
|
+
--output "<OUTPUT.zip>"
|
|
388
|
+
```
|
|
389
|
+
|
|
390
|
+
### Local Extract API with Ollama
|
|
391
|
+
|
|
392
|
+
```powershell
|
|
393
|
+
curl.exe -X POST "http://127.0.0.1:8000/api/extract" `
|
|
394
|
+
-F "file=@<DOCUMENT_PATH>" `
|
|
395
|
+
-F "mode=local" `
|
|
396
|
+
-F 'fields={"name":"Invoice","keys":{"invoice_number":{"prompt":null,"mapping":null},"total_amount":{"prompt":null,"mapping":null}}}' `
|
|
397
|
+
-F "local_llm_provider=ollama" `
|
|
398
|
+
-F "local_llm_model=<OLLAMA_MODEL>" `
|
|
399
|
+
-F "local_llm_base_url=http://localhost:11434" `
|
|
400
|
+
--output "<OUTPUT.json>"
|
|
401
|
+
```
|
|
402
|
+
|
|
403
|
+
### Local Extract API with OpenAI-Compatible LLM
|
|
404
|
+
|
|
405
|
+
```powershell
|
|
406
|
+
curl.exe -X POST "http://127.0.0.1:8000/api/extract" `
|
|
407
|
+
-F "file=@<DOCUMENT_PATH>" `
|
|
408
|
+
-F "mode=local" `
|
|
409
|
+
-F "fields=invoice_number,total_amount" `
|
|
410
|
+
-F "local_llm_provider=openai-compatible" `
|
|
411
|
+
-F "local_llm_model=<MODEL_NAME>" `
|
|
412
|
+
-F "local_llm_base_url=<OPENAI_COMPATIBLE_BASE_URL>" `
|
|
413
|
+
-F "local_llm_api_key=<LOCAL_LLM_API_KEY>" `
|
|
414
|
+
--output "<OUTPUT.json>"
|
|
415
|
+
```
|
|
416
|
+
|
|
417
|
+
API form fields:
|
|
418
|
+
|
|
419
|
+
| Field | Required | Description |
|
|
420
|
+
|-------|----------|-------------|
|
|
421
|
+
| `file` | Yes | Uploaded document. In curl, use `file=@<DOCUMENT_PATH>`. |
|
|
422
|
+
| `mode` | No | `cloud` or `local`. Defaults to `cloud`. |
|
|
423
|
+
| `api_key` | Yes for cloud unless env is set | ComPDF Cloud API key. |
|
|
424
|
+
| `base_url` | No | Cloud API base URL. Defaults to `https://api-server.compdf.com`. |
|
|
425
|
+
| `fields` | Optional for extract | Comma-separated field names or a structured JSON object with `name`, `keys`, and optional `tableHeaders`. |
|
|
426
|
+
| `schema` | Optional for extract | JSON Schema string. |
|
|
427
|
+
| `document_types` | Optional for cloud extract | JSON list string, for example `["invoice"]`. |
|
|
428
|
+
| `cloud_extract_mode` | No, cloud extract only | Cloud extraction mode. Defaults to `vlm`. |
|
|
429
|
+
| `enable_grounding` | No, cloud extract only | Boolean value used when `cloud_extract_mode=integrate`. Accepted values include `true`, `false`, `1`, and `0`. |
|
|
430
|
+
| `local_llm_provider` | Required for local extract | `ollama`, `openai`, or `openai-compatible`. |
|
|
431
|
+
| `local_llm_model` | Yes for local extract | Local LLM model name. |
|
|
432
|
+
| `local_llm_base_url` | Required for OpenAI-compatible local extract | LLM endpoint base URL. |
|
|
433
|
+
| `local_llm_api_key` | Usually required for OpenAI-compatible local extract | LLM endpoint API key. |
|
|
434
|
+
|
|
435
|
+
Response behavior:
|
|
436
|
+
|
|
437
|
+
| Endpoint | Cloud | Local |
|
|
438
|
+
|----------|-------|-------|
|
|
439
|
+
| `POST /api/parse` | Returns the parse ZIP archive when available. | Returns the parse ZIP archive when available. |
|
|
440
|
+
| `POST /api/extract` | Returns JSON with top-level `success`, `results`, and `metadata`. | Returns JSON with top-level `success`, `results`, and `metadata`. |
|
|
441
|
+
| `POST /api/preview` | Returns preview JSON for PDF/images; Office preview is not supported. | Same behavior. |
|
|
442
|
+
|
|
443
|
+
## Environment Variables
|
|
444
|
+
|
|
445
|
+
| Variable | Description |
|
|
446
|
+
|----------|-------------|
|
|
447
|
+
| `DOCSLIGHT_API_KEY` | API key for cloud mode |
|
|
448
|
+
| `DOCSLIGHT_MODE` | Processing mode: `cloud` or `local` (default: `cloud`) |
|
|
449
|
+
| `DOCSLIGHT_BASE_URL` | Cloud API base URL (default: `https://api-server.compdf.com`) |
|
|
450
|
+
| `DOCSLIGHT_TIMEOUT` | Request timeout in seconds |
|
|
451
|
+
| `DOCSLIGHT_LOCAL_PARSER` | Local parser selector, usually left unset |
|
|
452
|
+
|
|
453
|
+
## Supported Inputs
|
|
454
|
+
|
|
455
|
+
| Mode | Formats |
|
|
456
|
+
|------|---------|
|
|
457
|
+
| Cloud | PDF, images (PNG/JPG/TIFF/BMP/WebP), DOCX, PPTX, XLSX, and more via ComPDF Cloud API |
|
|
458
|
+
| Local | PDF, images (PNG/JPG/TIFF/BMP/WebP), DOCX, PPTX, XLSX |
|
|
459
|
+
|
|
460
|
+
> Legacy Office formats (`.doc`, `.ppt`, `.xls`) must be converted to DOCX/PPTX/XLSX for local processing.
|
|
461
|
+
|
|
462
|
+
## Development
|
|
463
|
+
|
|
464
|
+
```bash
|
|
465
|
+
pip install -e ".[dev]"
|
|
466
|
+
ruff check .
|
|
467
|
+
mypy docslight
|
|
468
|
+
pytest
|
|
469
|
+
python -m build
|
|
470
|
+
```
|
|
471
|
+
|
|
472
|
+
## License
|
|
473
|
+
|
|
474
|
+
MIT License. See `LICENSE`.
|