docslight-lite 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docslight_lite-0.1.0/LICENSE +21 -0
- docslight_lite-0.1.0/PKG-INFO +277 -0
- docslight_lite-0.1.0/README.md +229 -0
- docslight_lite-0.1.0/docslight/__init__.py +41 -0
- docslight_lite-0.1.0/docslight/cli.py +215 -0
- docslight_lite-0.1.0/docslight/client.py +92 -0
- docslight_lite-0.1.0/docslight/cloud/__init__.py +5 -0
- docslight_lite-0.1.0/docslight/cloud/client.py +622 -0
- docslight_lite-0.1.0/docslight/config.py +117 -0
- docslight_lite-0.1.0/docslight/exceptions.py +65 -0
- docslight_lite-0.1.0/docslight/local/__init__.py +31 -0
- docslight_lite-0.1.0/docslight/local/layout_blocks.py +80 -0
- docslight_lite-0.1.0/docslight/local/llm_extractor.py +252 -0
- docslight_lite-0.1.0/docslight/local/loaders.py +95 -0
- docslight_lite-0.1.0/docslight/local/markdown.py +18 -0
- docslight_lite-0.1.0/docslight/local/office_loader.py +128 -0
- docslight_lite-0.1.0/docslight/local/paddle_parser.py +173 -0
- docslight_lite-0.1.0/docslight/local/pipeline.py +213 -0
- docslight_lite-0.1.0/docslight/preview.py +46 -0
- docslight_lite-0.1.0/docslight/providers/__init__.py +6 -0
- docslight_lite-0.1.0/docslight/providers/ollama.py +30 -0
- docslight_lite-0.1.0/docslight/providers/openai_compatible.py +64 -0
- docslight_lite-0.1.0/docslight/result.py +89 -0
- docslight_lite-0.1.0/docslight/schemas/__init__.py +5 -0
- docslight_lite-0.1.0/docslight/schemas/fields.py +190 -0
- docslight_lite-0.1.0/docslight/standard_json.py +367 -0
- docslight_lite-0.1.0/docslight/static/app/common.js +668 -0
- docslight_lite-0.1.0/docslight/static/app/docslight-extract.json +307 -0
- docslight_lite-0.1.0/docslight/static/app/extract.js +394 -0
- docslight_lite-0.1.0/docslight/static/app/i18n.js +405 -0
- docslight_lite-0.1.0/docslight/static/app/parse.js +161 -0
- docslight_lite-0.1.0/docslight/static/styles.css +878 -0
- docslight_lite-0.1.0/docslight/templates/base.html +36 -0
- docslight_lite-0.1.0/docslight/templates/extract.html +123 -0
- docslight_lite-0.1.0/docslight/templates/parse.html +81 -0
- docslight_lite-0.1.0/docslight/web_app.py +372 -0
- docslight_lite-0.1.0/docslight_lite.egg-info/PKG-INFO +277 -0
- docslight_lite-0.1.0/docslight_lite.egg-info/SOURCES.txt +42 -0
- docslight_lite-0.1.0/docslight_lite.egg-info/dependency_links.txt +1 -0
- docslight_lite-0.1.0/docslight_lite.egg-info/entry_points.txt +2 -0
- docslight_lite-0.1.0/docslight_lite.egg-info/requires.txt +44 -0
- docslight_lite-0.1.0/docslight_lite.egg-info/top_level.txt +1 -0
- docslight_lite-0.1.0/pyproject.toml +87 -0
- docslight_lite-0.1.0/setup.cfg +4 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 ComPDF AI
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,277 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: docslight-lite
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Lightweight ComPDF document parsing and extraction SDK
|
|
5
|
+
Author-email: ComPDF AI <support@compdf.com>
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Requires-Python: >=3.10
|
|
8
|
+
Description-Content-Type: text/markdown
|
|
9
|
+
License-File: LICENSE
|
|
10
|
+
Requires-Dist: requests>=2.31.0
|
|
11
|
+
Requires-Dist: pydantic>=2.5.0
|
|
12
|
+
Requires-Dist: typing-extensions>=4.8.0; python_version < "3.11"
|
|
13
|
+
Requires-Dist: tomli>=2.0.1; python_version < "3.11"
|
|
14
|
+
Requires-Dist: flask>=3.1.3
|
|
15
|
+
Requires-Dist: werkzeug>=3.1.8
|
|
16
|
+
Provides-Extra: local
|
|
17
|
+
Requires-Dist: Pillow>=10.0.0; extra == "local"
|
|
18
|
+
Requires-Dist: PyMuPDF>=1.23.0; extra == "local"
|
|
19
|
+
Requires-Dist: numpy>=1.24.0; extra == "local"
|
|
20
|
+
Requires-Dist: paddlepaddle>=3.3.0; extra == "local"
|
|
21
|
+
Requires-Dist: paddleocr[doc-parser]>=3.3.0; extra == "local"
|
|
22
|
+
Requires-Dist: python-docx>=1.1.0; extra == "local"
|
|
23
|
+
Requires-Dist: python-pptx>=0.6.23; extra == "local"
|
|
24
|
+
Requires-Dist: openpyxl>=3.1.0; extra == "local"
|
|
25
|
+
Provides-Extra: local-llm
|
|
26
|
+
Requires-Dist: openai>=1.0.0; extra == "local-llm"
|
|
27
|
+
Provides-Extra: web
|
|
28
|
+
Requires-Dist: Flask>=3.0.0; extra == "web"
|
|
29
|
+
Requires-Dist: Werkzeug>=3.0.0; extra == "web"
|
|
30
|
+
Provides-Extra: web-test
|
|
31
|
+
Requires-Dist: playwright>=1.40.0; extra == "web-test"
|
|
32
|
+
Provides-Extra: dev
|
|
33
|
+
Requires-Dist: pytest>=7.4.0; extra == "dev"
|
|
34
|
+
Requires-Dist: pytest-cov>=4.1.0; extra == "dev"
|
|
35
|
+
Requires-Dist: ruff>=0.1.0; extra == "dev"
|
|
36
|
+
Requires-Dist: mypy>=1.7.0; extra == "dev"
|
|
37
|
+
Requires-Dist: build>=1.0.0; extra == "dev"
|
|
38
|
+
Requires-Dist: twine>=4.0.0; extra == "dev"
|
|
39
|
+
Requires-Dist: Pillow>=10.0.0; extra == "dev"
|
|
40
|
+
Requires-Dist: PyMuPDF>=1.23.0; extra == "dev"
|
|
41
|
+
Requires-Dist: numpy>=1.24.0; extra == "dev"
|
|
42
|
+
Requires-Dist: python-docx>=1.1.0; extra == "dev"
|
|
43
|
+
Requires-Dist: python-pptx>=0.6.23; extra == "dev"
|
|
44
|
+
Requires-Dist: openpyxl>=3.1.0; extra == "dev"
|
|
45
|
+
Requires-Dist: Flask>=3.0.0; extra == "dev"
|
|
46
|
+
Requires-Dist: Werkzeug>=3.0.0; extra == "dev"
|
|
47
|
+
Dynamic: license-file
|
|
48
|
+
|
|
49
|
+
<div align="center">
|
|
50
|
+
<h1 align="center">DocSlight Lite</h1>
|
|
51
|
+
<p align="center">Lightweight Python SDK & CLI for document parsing and structured extraction</p>
|
|
52
|
+
<p align="center">
|
|
53
|
+
<a href="https://pypi.org/project/docslight-lite/"><img src="https://img.shields.io/pypi/v/docslight-lite" alt="PyPI"></a>
|
|
54
|
+
<a href="https://pypi.org/project/docslight-lite/"><img src="https://img.shields.io/pypi/pyversions/docslight-lite" alt="Python versions"></a>
|
|
55
|
+
<a href="LICENSE"><img src="https://img.shields.io/github/license/kdanmobile/docslight-lite" alt="License"></a>
|
|
56
|
+
</p>
|
|
57
|
+
</div>
|
|
58
|
+
|
|
59
|
+
## What is DocSlight Lite?
|
|
60
|
+
|
|
61
|
+
A lightweight Python library that turns PDFs, images, and Office documents into clean Markdown or structured JSON — with one line of code. Works with ComPDF Cloud (recommended) or fully offline with local parsers.
|
|
62
|
+
|
|
63
|
+
```python
|
|
64
|
+
from docslight import DocSlight
|
|
65
|
+
|
|
66
|
+
client = DocSlight(api_key="your-api-key")
|
|
67
|
+
result = client.parse("invoice.pdf")
|
|
68
|
+
print(result.to_markdown())
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
## Quick Start
|
|
72
|
+
|
|
73
|
+
```bash
|
|
74
|
+
pip install docslight-lite
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
Parse any document:
|
|
78
|
+
|
|
79
|
+
```bash
|
|
80
|
+
docslight parse invoice.pdf --output invoice.md
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
Extract specific fields:
|
|
84
|
+
|
|
85
|
+
```bash
|
|
86
|
+
docslight extract invoice.pdf --fields invoice_number,total_amount
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
Launch the local Web UI workbench:
|
|
90
|
+
|
|
91
|
+
```bash
|
|
92
|
+
pip install "docslight-lite[web]"
|
|
93
|
+
docslight web
|
|
94
|
+
# Open http://127.0.0.1:8000
|
|
95
|
+
|
|
96
|
+
# Or run the same Web UI directly as a module
|
|
97
|
+
python -m docslight.web_app --host 127.0.0.1 --port 8000 --debug
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
## Features
|
|
101
|
+
|
|
102
|
+
- **Dual mode** — ComPDF Cloud for production-grade results, or local CPU parsing for offline evaluation
|
|
103
|
+
- **Parse → Markdown** — Convert PDF, DOCX, PPTX, XLSX, and images (PNG, JPG, TIFF, BMP, WebP) to clean Markdown
|
|
104
|
+
- **Extract → JSON** — Pull structured data by field list, JSON Schema, or structured template (key-value + table extraction)
|
|
105
|
+
- **CLI first** — Full-featured command-line interface, script-friendly
|
|
106
|
+
- **Web UI** — Local Flask workbench with drag-and-drop, live preview with bbox highlights, and a Fields Builder UI
|
|
107
|
+
- **Batch processing** — `parse_batch()` / `extract_batch()` for multiple files
|
|
108
|
+
- **Local LLM extraction** — Ollama or any OpenAI-compatible provider for offline extraction
|
|
109
|
+
- **Document types** — Classify and route documents by type for cloud extraction
|
|
110
|
+
- **Error-safe** — Typed result objects, structured error hierarchy, no credential leaks
|
|
111
|
+
|
|
112
|
+
## Install
|
|
113
|
+
|
|
114
|
+
| Scenario | Command |
|
|
115
|
+
|----------|---------|
|
|
116
|
+
| Core SDK & CLI | `pip install docslight-lite` |
|
|
117
|
+
| + Local parsing (OCR, Office) | `pip install "docslight-lite[local]"` |
|
|
118
|
+
| + Local LLM extraction | `pip install "docslight-lite[local,local-llm]"` |
|
|
119
|
+
| + Web UI workbench | `pip install "docslight-lite[web]"` |
|
|
120
|
+
|
|
121
|
+
> Local CPU parsing is experimental. Validate accuracy and latency on your own documents before production use.
|
|
122
|
+
|
|
123
|
+
## SDK Usage
|
|
124
|
+
|
|
125
|
+
### Cloud — Parse
|
|
126
|
+
|
|
127
|
+
```python
|
|
128
|
+
from docslight import DocSlight
|
|
129
|
+
|
|
130
|
+
client = DocSlight(mode="cloud", api_key="your-api-key")
|
|
131
|
+
|
|
132
|
+
result = client.parse("invoice.pdf")
|
|
133
|
+
print(result.to_markdown()) # Clean markdown
|
|
134
|
+
print(result.to_json()) # Full result with pages + metadata
|
|
135
|
+
```
|
|
136
|
+
|
|
137
|
+
### Cloud — Extract
|
|
138
|
+
|
|
139
|
+
```python
|
|
140
|
+
result = client.extract(
|
|
141
|
+
"invoice.pdf",
|
|
142
|
+
fields=["invoice_number", "invoice_date", "total_amount"],
|
|
143
|
+
)
|
|
144
|
+
print(result.to_json())
|
|
145
|
+
```
|
|
146
|
+
|
|
147
|
+
With a JSON Schema:
|
|
148
|
+
|
|
149
|
+
```python
|
|
150
|
+
schema = {
|
|
151
|
+
"type": "object",
|
|
152
|
+
"properties": {
|
|
153
|
+
"invoice_number": {"type": "string"},
|
|
154
|
+
"total_amount": {"type": "number"},
|
|
155
|
+
},
|
|
156
|
+
"required": ["invoice_number"],
|
|
157
|
+
}
|
|
158
|
+
result = client.extract("invoice.pdf", schema=schema)
|
|
159
|
+
```
|
|
160
|
+
|
|
161
|
+
With document type classification:
|
|
162
|
+
|
|
163
|
+
```python
|
|
164
|
+
result = client.extract(
|
|
165
|
+
"invoice.pdf",
|
|
166
|
+
fields=["invoice_number"],
|
|
167
|
+
document_types=["invoice"],
|
|
168
|
+
)
|
|
169
|
+
```
|
|
170
|
+
|
|
171
|
+
### Local — Parse (Offline)
|
|
172
|
+
|
|
173
|
+
```python
|
|
174
|
+
client = DocSlight(mode="local")
|
|
175
|
+
result = client.parse("invoice.pdf")
|
|
176
|
+
print(result.to_markdown())
|
|
177
|
+
```
|
|
178
|
+
|
|
179
|
+
### Local — Extract with Ollama
|
|
180
|
+
|
|
181
|
+
```python
|
|
182
|
+
client = DocSlight(
|
|
183
|
+
mode="local",
|
|
184
|
+
local_llm={"provider": "ollama", "model": "llama3.1"},
|
|
185
|
+
)
|
|
186
|
+
result = client.extract(
|
|
187
|
+
"invoice.pdf",
|
|
188
|
+
fields=["invoice_number", "invoice_date"],
|
|
189
|
+
)
|
|
190
|
+
```
|
|
191
|
+
|
|
192
|
+
### Local — Extract with OpenAI-Compatible API
|
|
193
|
+
|
|
194
|
+
```python
|
|
195
|
+
client = DocSlight(
|
|
196
|
+
mode="local",
|
|
197
|
+
local_llm={
|
|
198
|
+
"provider": "openai-compatible",
|
|
199
|
+
"base_url": "https://your-endpoint/v1",
|
|
200
|
+
"model": "your-model",
|
|
201
|
+
"api_key": "your-api-key",
|
|
202
|
+
"extra_body": {"enable_thinking": False}, # e.g., DashScope qwen3
|
|
203
|
+
},
|
|
204
|
+
)
|
|
205
|
+
```
|
|
206
|
+
|
|
207
|
+
### Batch Processing
|
|
208
|
+
|
|
209
|
+
```python
|
|
210
|
+
results = client.parse_batch(["doc1.pdf", "doc2.pdf", "doc3.pdf"])
|
|
211
|
+
for r in results:
|
|
212
|
+
print(r.to_markdown()[:200])
|
|
213
|
+
```
|
|
214
|
+
|
|
215
|
+
## CLI Usage
|
|
216
|
+
|
|
217
|
+
```bash
|
|
218
|
+
# Parse
|
|
219
|
+
docslight parse invoice.pdf --mode cloud -o invoice.md
|
|
220
|
+
docslight parse invoice.pdf --mode local -o invoice.md
|
|
221
|
+
|
|
222
|
+
# Extract
|
|
223
|
+
docslight extract invoice.pdf --mode cloud --fields invoice_number,total_amount
|
|
224
|
+
docslight extract invoice.pdf --mode local --fields invoice_number --local-llm-provider ollama --local-llm-model llama3.1
|
|
225
|
+
|
|
226
|
+
# Extract with schema
|
|
227
|
+
docslight extract invoice.pdf --schema schema.json
|
|
228
|
+
|
|
229
|
+
# Web UI
|
|
230
|
+
docslight web --host 127.0.0.1 --port 8000
|
|
231
|
+
```
|
|
232
|
+
|
|
233
|
+
## Web UI Workbench
|
|
234
|
+
|
|
235
|
+
DocSlight Workbench is a local Flask app for visual document processing.
|
|
236
|
+
|
|
237
|
+
```bash
|
|
238
|
+
pip install "docslight-lite[web]"
|
|
239
|
+
docslight web
|
|
240
|
+
python -m docslight.web_app  # alternative to "docslight web": runs the same Web UI as a module
|
|
241
|
+
```
|
|
242
|
+
|
|
243
|
+
- **Parse & Extract tabs** — Switch between parsing and extraction workflows
|
|
244
|
+
- **Drag-and-drop upload** — PDF, images, DOCX, PPTX, XLSX
|
|
245
|
+
- **Live preview** — PDF page rendering with bbox highlight overlays
|
|
246
|
+
- **Fields Builder** — Structured UI for building key-value and table extraction templates
|
|
247
|
+
- **Download results** — One-click download of Markdown or JSON output
|
|
248
|
+
|
|
249
|
+
## Environment Variables
|
|
250
|
+
|
|
251
|
+
| Variable | Description |
|
|
252
|
+
|----------|-------------|
|
|
253
|
+
| `DOCSLIGHT_API_KEY` | API key for cloud mode |
|
|
254
|
+
| `DOCSLIGHT_MODE` | Processing mode: `cloud` or `local` (default: `cloud`) |
|
|
255
|
+
|
|
256
|
+
## Supported Inputs
|
|
257
|
+
|
|
258
|
+
| Mode | Formats |
|
|
259
|
+
|------|---------|
|
|
260
|
+
| Cloud | PDF, images (PNG/JPG/TIFF/BMP/WebP), DOCX, PPTX, XLSX, and more via ComPDF Cloud API |
|
|
261
|
+
| Local | PDF, images (PNG/JPG/TIFF/BMP/WebP), DOCX, PPTX, XLSX |
|
|
262
|
+
|
|
263
|
+
> Legacy Office formats (`.doc`, `.ppt`, `.xls`) must be converted to DOCX/PPTX/XLSX for local processing.
|
|
264
|
+
|
|
265
|
+
## Development
|
|
266
|
+
|
|
267
|
+
```bash
|
|
268
|
+
pip install -e ".[dev]"
|
|
269
|
+
ruff check .
|
|
270
|
+
mypy docslight
|
|
271
|
+
pytest
|
|
272
|
+
python -m build
|
|
273
|
+
```
|
|
274
|
+
|
|
275
|
+
## License
|
|
276
|
+
|
|
277
|
+
MIT License. See `LICENSE`.
|
|
@@ -0,0 +1,229 @@
|
|
|
1
|
+
<div align="center">
|
|
2
|
+
<h1 align="center">DocSlight Lite</h1>
|
|
3
|
+
<p align="center">Lightweight Python SDK & CLI for document parsing and structured extraction</p>
|
|
4
|
+
<p align="center">
|
|
5
|
+
<a href="https://pypi.org/project/docslight-lite/"><img src="https://img.shields.io/pypi/v/docslight-lite" alt="PyPI"></a>
|
|
6
|
+
<a href="https://pypi.org/project/docslight-lite/"><img src="https://img.shields.io/pypi/pyversions/docslight-lite" alt="Python versions"></a>
|
|
7
|
+
<a href="LICENSE"><img src="https://img.shields.io/github/license/kdanmobile/docslight-lite" alt="License"></a>
|
|
8
|
+
</p>
|
|
9
|
+
</div>
|
|
10
|
+
|
|
11
|
+
## What is DocSlight Lite?
|
|
12
|
+
|
|
13
|
+
A lightweight Python library that turns PDFs, images, and Office documents into clean Markdown or structured JSON — with one line of code. Works with ComPDF Cloud (recommended) or fully offline with local parsers.
|
|
14
|
+
|
|
15
|
+
```python
|
|
16
|
+
from docslight import DocSlight
|
|
17
|
+
|
|
18
|
+
client = DocSlight(api_key="your-api-key")
|
|
19
|
+
result = client.parse("invoice.pdf")
|
|
20
|
+
print(result.to_markdown())
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
## Quick Start
|
|
24
|
+
|
|
25
|
+
```bash
|
|
26
|
+
pip install docslight-lite
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
Parse any document:
|
|
30
|
+
|
|
31
|
+
```bash
|
|
32
|
+
docslight parse invoice.pdf --output invoice.md
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
Extract specific fields:
|
|
36
|
+
|
|
37
|
+
```bash
|
|
38
|
+
docslight extract invoice.pdf --fields invoice_number,total_amount
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
Launch the local Web UI workbench:
|
|
42
|
+
|
|
43
|
+
```bash
|
|
44
|
+
pip install "docslight-lite[web]"
|
|
45
|
+
docslight web
|
|
46
|
+
# Open http://127.0.0.1:8000
|
|
47
|
+
|
|
48
|
+
# Or run the same Web UI directly as a module
|
|
49
|
+
python -m docslight.web_app --host 127.0.0.1 --port 8000 --debug
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
## Features
|
|
53
|
+
|
|
54
|
+
- **Dual mode** — ComPDF Cloud for production-grade results, or local CPU parsing for offline evaluation
|
|
55
|
+
- **Parse → Markdown** — Convert PDF, DOCX, PPTX, XLSX, and images (PNG, JPG, TIFF, BMP, WebP) to clean Markdown
|
|
56
|
+
- **Extract → JSON** — Pull structured data by field list, JSON Schema, or structured template (key-value + table extraction)
|
|
57
|
+
- **CLI first** — Full-featured command-line interface, script-friendly
|
|
58
|
+
- **Web UI** — Local Flask workbench with drag-and-drop, live preview with bbox highlights, and a Fields Builder UI
|
|
59
|
+
- **Batch processing** — `parse_batch()` / `extract_batch()` for multiple files
|
|
60
|
+
- **Local LLM extraction** — Ollama or any OpenAI-compatible provider for offline extraction
|
|
61
|
+
- **Document types** — Classify and route documents by type for cloud extraction
|
|
62
|
+
- **Error-safe** — Typed result objects, structured error hierarchy, no credential leaks
|
|
63
|
+
|
|
64
|
+
## Install
|
|
65
|
+
|
|
66
|
+
| Scenario | Command |
|
|
67
|
+
|----------|---------|
|
|
68
|
+
| Core SDK & CLI | `pip install docslight-lite` |
|
|
69
|
+
| + Local parsing (OCR, Office) | `pip install "docslight-lite[local]"` |
|
|
70
|
+
| + Local LLM extraction | `pip install "docslight-lite[local,local-llm]"` |
|
|
71
|
+
| + Web UI workbench | `pip install "docslight-lite[web]"` |
|
|
72
|
+
|
|
73
|
+
> Local CPU parsing is experimental. Validate accuracy and latency on your own documents before production use.
|
|
74
|
+
|
|
75
|
+
## SDK Usage
|
|
76
|
+
|
|
77
|
+
### Cloud — Parse
|
|
78
|
+
|
|
79
|
+
```python
|
|
80
|
+
from docslight import DocSlight
|
|
81
|
+
|
|
82
|
+
client = DocSlight(mode="cloud", api_key="your-api-key")
|
|
83
|
+
|
|
84
|
+
result = client.parse("invoice.pdf")
|
|
85
|
+
print(result.to_markdown()) # Clean markdown
|
|
86
|
+
print(result.to_json()) # Full result with pages + metadata
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
### Cloud — Extract
|
|
90
|
+
|
|
91
|
+
```python
|
|
92
|
+
result = client.extract(
|
|
93
|
+
"invoice.pdf",
|
|
94
|
+
fields=["invoice_number", "invoice_date", "total_amount"],
|
|
95
|
+
)
|
|
96
|
+
print(result.to_json())
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
With a JSON Schema:
|
|
100
|
+
|
|
101
|
+
```python
|
|
102
|
+
schema = {
|
|
103
|
+
"type": "object",
|
|
104
|
+
"properties": {
|
|
105
|
+
"invoice_number": {"type": "string"},
|
|
106
|
+
"total_amount": {"type": "number"},
|
|
107
|
+
},
|
|
108
|
+
"required": ["invoice_number"],
|
|
109
|
+
}
|
|
110
|
+
result = client.extract("invoice.pdf", schema=schema)
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
With document type classification:
|
|
114
|
+
|
|
115
|
+
```python
|
|
116
|
+
result = client.extract(
|
|
117
|
+
"invoice.pdf",
|
|
118
|
+
fields=["invoice_number"],
|
|
119
|
+
document_types=["invoice"],
|
|
120
|
+
)
|
|
121
|
+
```
|
|
122
|
+
|
|
123
|
+
### Local — Parse (Offline)
|
|
124
|
+
|
|
125
|
+
```python
|
|
126
|
+
client = DocSlight(mode="local")
|
|
127
|
+
result = client.parse("invoice.pdf")
|
|
128
|
+
print(result.to_markdown())
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
### Local — Extract with Ollama
|
|
132
|
+
|
|
133
|
+
```python
|
|
134
|
+
client = DocSlight(
|
|
135
|
+
mode="local",
|
|
136
|
+
local_llm={"provider": "ollama", "model": "llama3.1"},
|
|
137
|
+
)
|
|
138
|
+
result = client.extract(
|
|
139
|
+
"invoice.pdf",
|
|
140
|
+
fields=["invoice_number", "invoice_date"],
|
|
141
|
+
)
|
|
142
|
+
```
|
|
143
|
+
|
|
144
|
+
### Local — Extract with OpenAI-Compatible API
|
|
145
|
+
|
|
146
|
+
```python
|
|
147
|
+
client = DocSlight(
|
|
148
|
+
mode="local",
|
|
149
|
+
local_llm={
|
|
150
|
+
"provider": "openai-compatible",
|
|
151
|
+
"base_url": "https://your-endpoint/v1",
|
|
152
|
+
"model": "your-model",
|
|
153
|
+
"api_key": "your-api-key",
|
|
154
|
+
"extra_body": {"enable_thinking": False}, # e.g., DashScope qwen3
|
|
155
|
+
},
|
|
156
|
+
)
|
|
157
|
+
```
|
|
158
|
+
|
|
159
|
+
### Batch Processing
|
|
160
|
+
|
|
161
|
+
```python
|
|
162
|
+
results = client.parse_batch(["doc1.pdf", "doc2.pdf", "doc3.pdf"])
|
|
163
|
+
for r in results:
|
|
164
|
+
print(r.to_markdown()[:200])
|
|
165
|
+
```
|
|
166
|
+
|
|
167
|
+
## CLI Usage
|
|
168
|
+
|
|
169
|
+
```bash
|
|
170
|
+
# Parse
|
|
171
|
+
docslight parse invoice.pdf --mode cloud -o invoice.md
|
|
172
|
+
docslight parse invoice.pdf --mode local -o invoice.md
|
|
173
|
+
|
|
174
|
+
# Extract
|
|
175
|
+
docslight extract invoice.pdf --mode cloud --fields invoice_number,total_amount
|
|
176
|
+
docslight extract invoice.pdf --mode local --fields invoice_number --local-llm-provider ollama --local-llm-model llama3.1
|
|
177
|
+
|
|
178
|
+
# Extract with schema
|
|
179
|
+
docslight extract invoice.pdf --schema schema.json
|
|
180
|
+
|
|
181
|
+
# Web UI
|
|
182
|
+
docslight web --host 127.0.0.1 --port 8000
|
|
183
|
+
```
|
|
184
|
+
|
|
185
|
+
## Web UI Workbench
|
|
186
|
+
|
|
187
|
+
DocSlight Workbench is a local Flask app for visual document processing.
|
|
188
|
+
|
|
189
|
+
```bash
|
|
190
|
+
pip install "docslight-lite[web]"
|
|
191
|
+
docslight web
|
|
192
|
+
python -m docslight.web_app  # alternative to "docslight web": runs the same Web UI as a module
|
|
193
|
+
```
|
|
194
|
+
|
|
195
|
+
- **Parse & Extract tabs** — Switch between parsing and extraction workflows
|
|
196
|
+
- **Drag-and-drop upload** — PDF, images, DOCX, PPTX, XLSX
|
|
197
|
+
- **Live preview** — PDF page rendering with bbox highlight overlays
|
|
198
|
+
- **Fields Builder** — Structured UI for building key-value and table extraction templates
|
|
199
|
+
- **Download results** — One-click download of Markdown or JSON output
|
|
200
|
+
|
|
201
|
+
## Environment Variables
|
|
202
|
+
|
|
203
|
+
| Variable | Description |
|
|
204
|
+
|----------|-------------|
|
|
205
|
+
| `DOCSLIGHT_API_KEY` | API key for cloud mode |
|
|
206
|
+
| `DOCSLIGHT_MODE` | Processing mode: `cloud` or `local` (default: `cloud`) |
|
|
207
|
+
|
|
208
|
+
## Supported Inputs
|
|
209
|
+
|
|
210
|
+
| Mode | Formats |
|
|
211
|
+
|------|---------|
|
|
212
|
+
| Cloud | PDF, images (PNG/JPG/TIFF/BMP/WebP), DOCX, PPTX, XLSX, and more via ComPDF Cloud API |
|
|
213
|
+
| Local | PDF, images (PNG/JPG/TIFF/BMP/WebP), DOCX, PPTX, XLSX |
|
|
214
|
+
|
|
215
|
+
> Legacy Office formats (`.doc`, `.ppt`, `.xls`) must be converted to DOCX/PPTX/XLSX for local processing.
|
|
216
|
+
|
|
217
|
+
## Development
|
|
218
|
+
|
|
219
|
+
```bash
|
|
220
|
+
pip install -e ".[dev]"
|
|
221
|
+
ruff check .
|
|
222
|
+
mypy docslight
|
|
223
|
+
pytest
|
|
224
|
+
python -m build
|
|
225
|
+
```
|
|
226
|
+
|
|
227
|
+
## License
|
|
228
|
+
|
|
229
|
+
MIT License. See `LICENSE`.
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
"""Lightweight ComPDF document parsing and extraction SDK."""
|
|
2
|
+
|
|
3
|
+
from docslight.client import DocSlight
|
|
4
|
+
from docslight.config import (
|
|
5
|
+
DEFAULT_BASE_URL,
|
|
6
|
+
DEFAULT_CONFIG_PATH,
|
|
7
|
+
VALID_MODES,
|
|
8
|
+
DocSlightConfig,
|
|
9
|
+
)
|
|
10
|
+
from docslight.exceptions import (
|
|
11
|
+
AuthenticationError,
|
|
12
|
+
CloudAPIError,
|
|
13
|
+
ConfigurationError,
|
|
14
|
+
DependencyMissingError,
|
|
15
|
+
DocSlightError,
|
|
16
|
+
LocalProcessingError,
|
|
17
|
+
RateLimitError,
|
|
18
|
+
UnsupportedFormatError,
|
|
19
|
+
)
|
|
20
|
+
from docslight.result import ExtractResult, ParseResult
|
|
21
|
+
|
|
22
|
+
__version__ = "0.1.0"
|
|
23
|
+
|
|
24
|
+
__all__ = [
|
|
25
|
+
"AuthenticationError",
|
|
26
|
+
"CloudAPIError",
|
|
27
|
+
"ConfigurationError",
|
|
28
|
+
"DEFAULT_BASE_URL",
|
|
29
|
+
"DEFAULT_CONFIG_PATH",
|
|
30
|
+
"DependencyMissingError",
|
|
31
|
+
"DocSlight",
|
|
32
|
+
"DocSlightConfig",
|
|
33
|
+
"DocSlightError",
|
|
34
|
+
"ExtractResult",
|
|
35
|
+
"LocalProcessingError",
|
|
36
|
+
"ParseResult",
|
|
37
|
+
"RateLimitError",
|
|
38
|
+
"UnsupportedFormatError",
|
|
39
|
+
"VALID_MODES",
|
|
40
|
+
"__version__",
|
|
41
|
+
]
|