docslight 0.1.2__tar.gz → 0.1.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {docslight-0.1.2 → docslight-0.1.4}/PKG-INFO +27 -30
- {docslight-0.1.2 → docslight-0.1.4}/README.md +38 -40
- {docslight-0.1.2 → docslight-0.1.4}/docslight/__init__.py +1 -1
- {docslight-0.1.2 → docslight-0.1.4}/docslight/cli.py +10 -10
- {docslight-0.1.2 → docslight-0.1.4}/docslight/cloud/client.py +1 -1
- {docslight-0.1.2 → docslight-0.1.4}/docslight/preview.py +1 -1
- {docslight-0.1.2 → docslight-0.1.4}/docslight/providers/openai_compatible.py +3 -3
- {docslight-0.1.2 → docslight-0.1.4}/docslight/schemas/fields.py +2 -2
- {docslight-0.1.2 → docslight-0.1.4}/docslight/web_app.py +32 -31
- {docslight-0.1.2 → docslight-0.1.4}/docslight.egg-info/PKG-INFO +27 -30
- {docslight-0.1.2 → docslight-0.1.4}/docslight.egg-info/SOURCES.txt +1 -10
- {docslight-0.1.2 → docslight-0.1.4}/docslight.egg-info/requires.txt +8 -11
- {docslight-0.1.2 → docslight-0.1.4}/pyproject.toml +40 -46
- docslight-0.1.2/docslight/static/app/common.js +0 -668
- docslight-0.1.2/docslight/static/app/docslight-extract.json +0 -307
- docslight-0.1.2/docslight/static/app/extract.js +0 -394
- docslight-0.1.2/docslight/static/app/i18n.js +0 -405
- docslight-0.1.2/docslight/static/app/parse.js +0 -161
- docslight-0.1.2/docslight/static/styles.css +0 -878
- docslight-0.1.2/docslight/templates/base.html +0 -36
- docslight-0.1.2/docslight/templates/extract.html +0 -123
- docslight-0.1.2/docslight/templates/parse.html +0 -81
- {docslight-0.1.2 → docslight-0.1.4}/LICENSE +0 -0
- {docslight-0.1.2 → docslight-0.1.4}/docslight/client.py +0 -0
- {docslight-0.1.2 → docslight-0.1.4}/docslight/cloud/__init__.py +0 -0
- {docslight-0.1.2 → docslight-0.1.4}/docslight/config.py +0 -0
- {docslight-0.1.2 → docslight-0.1.4}/docslight/exceptions.py +0 -0
- {docslight-0.1.2 → docslight-0.1.4}/docslight/local/__init__.py +0 -0
- {docslight-0.1.2 → docslight-0.1.4}/docslight/local/layout_blocks.py +0 -0
- {docslight-0.1.2 → docslight-0.1.4}/docslight/local/llm_extractor.py +0 -0
- {docslight-0.1.2 → docslight-0.1.4}/docslight/local/loaders.py +0 -0
- {docslight-0.1.2 → docslight-0.1.4}/docslight/local/markdown.py +0 -0
- {docslight-0.1.2 → docslight-0.1.4}/docslight/local/office_loader.py +0 -0
- {docslight-0.1.2 → docslight-0.1.4}/docslight/local/paddle_parser.py +0 -0
- {docslight-0.1.2 → docslight-0.1.4}/docslight/local/pipeline.py +0 -0
- {docslight-0.1.2 → docslight-0.1.4}/docslight/providers/__init__.py +0 -0
- {docslight-0.1.2 → docslight-0.1.4}/docslight/providers/ollama.py +0 -0
- {docslight-0.1.2 → docslight-0.1.4}/docslight/result.py +0 -0
- {docslight-0.1.2 → docslight-0.1.4}/docslight/schemas/__init__.py +0 -0
- {docslight-0.1.2 → docslight-0.1.4}/docslight/standard_json.py +0 -0
- {docslight-0.1.2 → docslight-0.1.4}/docslight.egg-info/dependency_links.txt +0 -0
- {docslight-0.1.2 → docslight-0.1.4}/docslight.egg-info/entry_points.txt +0 -0
- {docslight-0.1.2 → docslight-0.1.4}/docslight.egg-info/top_level.txt +0 -0
- {docslight-0.1.2 → docslight-0.1.4}/setup.cfg +0 -0
|
@@ -1,18 +1,16 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: docslight
|
|
3
|
-
Version: 0.1.2
|
|
3
|
+
Version: 0.1.4
|
|
4
4
|
Summary: Lightweight ComPDF document parsing and extraction SDK
|
|
5
5
|
Author-email: ComPDF AI <support@compdf.com>
|
|
6
6
|
License-Expression: MIT
|
|
7
|
-
Requires-Python:
|
|
7
|
+
Requires-Python: <=3.13,>=3.10
|
|
8
8
|
Description-Content-Type: text/markdown
|
|
9
9
|
License-File: LICENSE
|
|
10
10
|
Requires-Dist: requests>=2.31.0
|
|
11
11
|
Requires-Dist: pydantic>=2.5.0
|
|
12
12
|
Requires-Dist: typing-extensions>=4.8.0; python_version < "3.11"
|
|
13
13
|
Requires-Dist: tomli>=2.0.1; python_version < "3.11"
|
|
14
|
-
Requires-Dist: flask>=3.1.3
|
|
15
|
-
Requires-Dist: werkzeug>=3.1.8
|
|
16
14
|
Provides-Extra: local
|
|
17
15
|
Requires-Dist: Pillow>=10.0.0; extra == "local"
|
|
18
16
|
Requires-Dist: PyMuPDF>=1.23.0; extra == "local"
|
|
@@ -22,13 +20,10 @@ Requires-Dist: paddleocr[doc-parser]>=3.3.0; extra == "local"
|
|
|
22
20
|
Requires-Dist: python-docx>=1.1.0; extra == "local"
|
|
23
21
|
Requires-Dist: python-pptx>=0.6.23; extra == "local"
|
|
24
22
|
Requires-Dist: openpyxl>=3.1.0; extra == "local"
|
|
25
|
-
|
|
26
|
-
Requires-Dist: openai>=1.0.0; extra == "local-llm"
|
|
23
|
+
Requires-Dist: openai>=1.0.0; extra == "local"
|
|
27
24
|
Provides-Extra: web
|
|
28
|
-
Requires-Dist:
|
|
29
|
-
Requires-Dist:
|
|
30
|
-
Provides-Extra: web-test
|
|
31
|
-
Requires-Dist: playwright>=1.40.0; extra == "web-test"
|
|
25
|
+
Requires-Dist: flask>=3.1.3; extra == "web"
|
|
26
|
+
Requires-Dist: werkzeug>=3.1.8; extra == "web"
|
|
32
27
|
Provides-Extra: dev
|
|
33
28
|
Requires-Dist: pytest>=7.4.0; extra == "dev"
|
|
34
29
|
Requires-Dist: pytest-cov>=4.1.0; extra == "dev"
|
|
@@ -36,14 +31,18 @@ Requires-Dist: ruff>=0.1.0; extra == "dev"
|
|
|
36
31
|
Requires-Dist: mypy>=1.7.0; extra == "dev"
|
|
37
32
|
Requires-Dist: build>=1.0.0; extra == "dev"
|
|
38
33
|
Requires-Dist: twine>=4.0.0; extra == "dev"
|
|
34
|
+
Requires-Dist: playwright>=1.40.0; extra == "dev"
|
|
39
35
|
Requires-Dist: Pillow>=10.0.0; extra == "dev"
|
|
40
36
|
Requires-Dist: PyMuPDF>=1.23.0; extra == "dev"
|
|
41
37
|
Requires-Dist: numpy>=1.24.0; extra == "dev"
|
|
38
|
+
Requires-Dist: paddlepaddle>=3.3.0; extra == "dev"
|
|
39
|
+
Requires-Dist: paddleocr[doc-parser]>=3.3.0; extra == "dev"
|
|
42
40
|
Requires-Dist: python-docx>=1.1.0; extra == "dev"
|
|
43
41
|
Requires-Dist: python-pptx>=0.6.23; extra == "dev"
|
|
44
42
|
Requires-Dist: openpyxl>=3.1.0; extra == "dev"
|
|
45
|
-
Requires-Dist:
|
|
46
|
-
Requires-Dist:
|
|
43
|
+
Requires-Dist: openai>=1.0.0; extra == "dev"
|
|
44
|
+
Requires-Dist: flask>=3.1.3; extra == "dev"
|
|
45
|
+
Requires-Dist: werkzeug>=3.1.8; extra == "dev"
|
|
47
46
|
Dynamic: license-file
|
|
48
47
|
|
|
49
48
|
<p align="center">
|
|
@@ -87,14 +86,13 @@ Extract specific fields:
|
|
|
87
86
|
docslight extract invoice.pdf --fields invoice_number,total_amount
|
|
88
87
|
```
|
|
89
88
|
|
|
90
|
-
Launch the local
|
|
89
|
+
Launch the local API server:
|
|
91
90
|
|
|
92
91
|
```bash
|
|
93
|
-
pip install "docslight[web]"
|
|
94
92
|
docslight web
|
|
95
|
-
#
|
|
93
|
+
# Health: http://127.0.0.1:8000/api/health
|
|
96
94
|
|
|
97
|
-
# Or run the same
|
|
95
|
+
# Or run the same API server directly as a module
|
|
98
96
|
python -m docslight.web_app --host 0.0.0.0 --port 8000 --debug
|
|
99
97
|
```
|
|
100
98
|
|
|
@@ -104,7 +102,7 @@ python -m docslight.web_app --host 0.0.0.0 --port 8000 --debug
|
|
|
104
102
|
- **Parse → Markdown** — Convert PDF, DOCX, PPTX, XLSX, and images (PNG, JPG, TIFF, BMP, WebP) to clean Markdown
|
|
105
103
|
- **Extract → JSON** — Pull structured data by field list, JSON Schema, or structured template (key-value + table extraction)
|
|
106
104
|
- **CLI first** — Full-featured command-line interface, script-friendly
|
|
107
|
-
- **
|
|
105
|
+
- **API server** — Local Flask backend exposing parse, extract, preview, health, and system-info endpoints
|
|
108
106
|
- **Batch processing** — `parse_batch()` / `extract_batch()` for multiple files
|
|
109
107
|
- **Local LLM extraction** — Ollama or any OpenAI-compatible provider for offline extraction
|
|
110
108
|
- **Document types** — Classify and route documents by type for cloud extraction
|
|
@@ -116,8 +114,8 @@ python -m docslight.web_app --host 0.0.0.0 --port 8000 --debug
|
|
|
116
114
|
|----------|---------|
|
|
117
115
|
| Core SDK & CLI | `pip install docslight` |
|
|
118
116
|
| + Local parsing (OCR, Office) | `pip install "docslight[local]"` |
|
|
119
|
-
| +
|
|
120
|
-
| +
|
|
117
|
+
| + API server | `pip install "docslight[web]"` |
|
|
118
|
+
| + Local parsing and API server | `pip install "docslight[local,web]"` |
|
|
121
119
|
|
|
122
120
|
> Local CPU parsing is experimental. Validate accuracy and latency on your own documents before production use.
|
|
123
121
|
|
|
@@ -209,7 +207,7 @@ client = DocSlight(
|
|
|
209
207
|
|
|
210
208
|
```python
|
|
211
209
|
results = client.parse_batch(["doc1.pdf", "doc2.pdf", "doc3.pdf"])
|
|
212
|
-
for r in results:
|
|
210
|
+
for r in results:
|
|
213
211
|
print(r.to_markdown()[:200])
|
|
214
212
|
```
|
|
215
213
|
|
|
@@ -224,30 +222,29 @@ docslight parse invoice.pdf --mode local -o invoice.zip
|
|
|
224
222
|
# Extract
|
|
225
223
|
docslight extract invoice.pdf --mode cloud --fields invoice_number,total_amount
|
|
226
224
|
docslight extract invoice.pdf --mode local --fields invoice_number --local-llm-provider ollama --local-llm-model llama3.1
|
|
227
|
-
docslight extract "D:\pdf\invoice\1.pdf" --mode local --fields invoice_number --local-llm-provider ollama
|
|
225
|
+
docslight extract "D:\pdf\invoice\1.pdf" --mode local --fields invoice_number --local-llm-provider ollama --local-llm-model llama3.1
|
|
228
226
|
|
|
229
227
|
# Extract with schema
|
|
230
228
|
docslight extract invoice.pdf --schema schema.json
|
|
231
229
|
|
|
232
|
-
#
|
|
230
|
+
# API server
|
|
233
231
|
docslight web --host 127.0.0.1 --port 8000
|
|
234
232
|
```
|
|
235
233
|
|
|
236
|
-
##
|
|
234
|
+
## API Server
|
|
237
235
|
|
|
238
|
-
DocSlight
|
|
236
|
+
DocSlight includes a local Flask API server for document processing. Frontend assets are not bundled in this package.
|
|
239
237
|
|
|
240
238
|
```bash
|
|
241
|
-
pip install "docslight[web]"
|
|
242
239
|
docslight web
|
|
243
240
|
python -m docslight.web_app
|
|
244
241
|
```
|
|
245
242
|
|
|
246
|
-
-
|
|
247
|
-
-
|
|
248
|
-
-
|
|
249
|
-
-
|
|
250
|
-
-
|
|
243
|
+
- `GET /api/health`
|
|
244
|
+
- `GET /api/system-info`
|
|
245
|
+
- `POST /api/parse`
|
|
246
|
+
- `POST /api/extract`
|
|
247
|
+
- `POST /api/preview`
|
|
251
248
|
|
|
252
249
|
## Environment Variables
|
|
253
250
|
|
|
@@ -39,16 +39,15 @@ Extract specific fields:
|
|
|
39
39
|
docslight extract invoice.pdf --fields invoice_number,total_amount
|
|
40
40
|
```
|
|
41
41
|
|
|
42
|
-
Launch the local
|
|
43
|
-
|
|
44
|
-
```bash
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
```
|
|
42
|
+
Launch the local API server:
|
|
43
|
+
|
|
44
|
+
```bash
|
|
45
|
+
docslight web
|
|
46
|
+
# Health: http://127.0.0.1:8000/api/health
|
|
47
|
+
|
|
48
|
+
# Or run the same API server directly as a module
|
|
49
|
+
python -m docslight.web_app --host 0.0.0.0 --port 8000 --debug
|
|
50
|
+
```
|
|
52
51
|
|
|
53
52
|
## Features
|
|
54
53
|
|
|
@@ -56,7 +55,7 @@ python -m docslight.web_app --host 0.0.0.0 --port 8000 --debug
|
|
|
56
55
|
- **Parse → Markdown** — Convert PDF, DOCX, PPTX, XLSX, and images (PNG, JPG, TIFF, BMP, WebP) to clean Markdown
|
|
57
56
|
- **Extract → JSON** — Pull structured data by field list, JSON Schema, or structured template (key-value + table extraction)
|
|
58
57
|
- **CLI first** — Full-featured command-line interface, script-friendly
|
|
59
|
-
- **
|
|
58
|
+
- **API server** — Local Flask backend exposing parse, extract, preview, health, and system-info endpoints
|
|
60
59
|
- **Batch processing** — `parse_batch()` / `extract_batch()` for multiple files
|
|
61
60
|
- **Local LLM extraction** — Ollama or any OpenAI-compatible provider for offline extraction
|
|
62
61
|
- **Document types** — Classify and route documents by type for cloud extraction
|
|
@@ -66,10 +65,10 @@ python -m docslight.web_app --host 0.0.0.0 --port 8000 --debug
|
|
|
66
65
|
|
|
67
66
|
| Scenario | Command |
|
|
68
67
|
|----------|---------|
|
|
69
|
-
| Core SDK & CLI | `pip install docslight` |
|
|
70
|
-
| + Local parsing (OCR, Office) | `pip install "docslight[local]"` |
|
|
71
|
-
| +
|
|
72
|
-
| +
|
|
68
|
+
| Core SDK & CLI | `pip install docslight` |
|
|
69
|
+
| + Local parsing (OCR, Office) | `pip install "docslight[local]"` |
|
|
70
|
+
| + API server | `pip install "docslight[web]"` |
|
|
71
|
+
| + Local parsing and API server | `pip install "docslight[local,web]"` |
|
|
73
72
|
|
|
74
73
|
> Local CPU parsing is experimental. Validate accuracy and latency on your own documents before production use.
|
|
75
74
|
|
|
@@ -160,9 +159,9 @@ client = DocSlight(
|
|
|
160
159
|
### Batch Processing
|
|
161
160
|
|
|
162
161
|
```python
|
|
163
|
-
results = client.parse_batch(["doc1.pdf", "doc2.pdf", "doc3.pdf"])
|
|
164
|
-
for r in results:
|
|
165
|
-
print(r.to_markdown()[:200])
|
|
162
|
+
results = client.parse_batch(["doc1.pdf", "doc2.pdf", "doc3.pdf"])
|
|
163
|
+
for r in results:
|
|
164
|
+
print(r.to_markdown()[:200])
|
|
166
165
|
```
|
|
167
166
|
|
|
168
167
|
## CLI Usage
|
|
@@ -174,32 +173,31 @@ docslight parse invoice.pdf --mode cloud --format zip -o invoice.zip
|
|
|
174
173
|
docslight parse invoice.pdf --mode local -o invoice.zip
|
|
175
174
|
|
|
176
175
|
# Extract
|
|
177
|
-
docslight extract invoice.pdf --mode cloud --fields invoice_number,total_amount
|
|
178
|
-
docslight extract invoice.pdf --mode local --fields invoice_number --local-llm-provider ollama --local-llm-model llama3.1
|
|
179
|
-
docslight extract "D:\pdf\invoice\1.pdf" --mode local --fields invoice_number --local-llm-provider ollama
|
|
176
|
+
docslight extract invoice.pdf --mode cloud --fields invoice_number,total_amount
|
|
177
|
+
docslight extract invoice.pdf --mode local --fields invoice_number --local-llm-provider ollama --local-llm-model llama3.1
|
|
178
|
+
docslight extract "D:\pdf\invoice\1.pdf" --mode local --fields invoice_number --local-llm-provider ollama --local-llm-model llama3.1
|
|
180
179
|
|
|
181
180
|
# Extract with schema
|
|
182
181
|
docslight extract invoice.pdf --schema schema.json
|
|
183
182
|
|
|
184
|
-
#
|
|
185
|
-
docslight web --host 127.0.0.1 --port 8000
|
|
186
|
-
```
|
|
187
|
-
|
|
188
|
-
##
|
|
189
|
-
|
|
190
|
-
DocSlight
|
|
191
|
-
|
|
192
|
-
```bash
|
|
193
|
-
|
|
194
|
-
docslight
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
-
|
|
199
|
-
-
|
|
200
|
-
-
|
|
201
|
-
-
|
|
202
|
-
- **Download results** — One-click download of Markdown or JSON output
|
|
183
|
+
# API server
|
|
184
|
+
docslight web --host 127.0.0.1 --port 8000
|
|
185
|
+
```
|
|
186
|
+
|
|
187
|
+
## API Server
|
|
188
|
+
|
|
189
|
+
DocSlight includes a local Flask API server for document processing. Frontend assets are not bundled in this package.
|
|
190
|
+
|
|
191
|
+
```bash
|
|
192
|
+
docslight web
|
|
193
|
+
python -m docslight.web_app
|
|
194
|
+
```
|
|
195
|
+
|
|
196
|
+
- `GET /api/health`
|
|
197
|
+
- `GET /api/system-info`
|
|
198
|
+
- `POST /api/parse`
|
|
199
|
+
- `POST /api/extract`
|
|
200
|
+
- `POST /api/preview`
|
|
203
201
|
|
|
204
202
|
## Environment Variables
|
|
205
203
|
|
|
@@ -108,7 +108,7 @@ def _to_pretty_json(data: Any) -> str:
|
|
|
108
108
|
return json.dumps(data, ensure_ascii=False, indent=2)
|
|
109
109
|
|
|
110
110
|
|
|
111
|
-
def run_web_app(host: str, port: int, debug: bool) -> None:
|
|
111
|
+
def run_web_app(host: str, port: int, debug: bool) -> None:
|
|
112
112
|
"""Run the optional Flask web application."""
|
|
113
113
|
if importlib.util.find_spec("docslight.web_app") is None:
|
|
114
114
|
raise CLIUsageError(WEB_EXTRA_ERROR)
|
|
@@ -119,8 +119,8 @@ def run_web_app(host: str, port: int, debug: bool) -> None:
|
|
|
119
119
|
if exc.name in {"flask", "werkzeug"}:
|
|
120
120
|
raise CLIUsageError(WEB_EXTRA_ERROR) from exc
|
|
121
121
|
raise
|
|
122
|
-
_run_web_app = web_app.run_web_app
|
|
123
|
-
_run_web_app(host, port, debug)
|
|
122
|
+
_run_web_app = web_app.run_web_app
|
|
123
|
+
_run_web_app(host, port, debug)
|
|
124
124
|
|
|
125
125
|
|
|
126
126
|
def _print_cli_error(error: Exception) -> int:
|
|
@@ -165,10 +165,10 @@ def build_parser() -> argparse.ArgumentParser:
|
|
|
165
165
|
extract_parser.set_defaults(func=_run_extract)
|
|
166
166
|
|
|
167
167
|
web_parser = subparsers.add_parser("web", help="Run the web application")
|
|
168
|
-
web_parser.add_argument("--host", default="127.0.0.1")
|
|
169
|
-
web_parser.add_argument("--port", type=int, default=8000)
|
|
170
|
-
web_parser.add_argument("--debug", action="store_true")
|
|
171
|
-
web_parser.set_defaults(func=_run_web)
|
|
168
|
+
web_parser.add_argument("--host", default="127.0.0.1")
|
|
169
|
+
web_parser.add_argument("--port", type=int, default=8000)
|
|
170
|
+
web_parser.add_argument("--debug", action="store_true")
|
|
171
|
+
web_parser.set_defaults(func=_run_web)
|
|
172
172
|
|
|
173
173
|
return parser
|
|
174
174
|
|
|
@@ -227,9 +227,9 @@ def _run_extract(args: argparse.Namespace) -> int:
|
|
|
227
227
|
return 0
|
|
228
228
|
|
|
229
229
|
|
|
230
|
-
def _run_web(args: argparse.Namespace) -> int:
|
|
231
|
-
run_web_app(args.host, args.port, args.debug)
|
|
232
|
-
return 0
|
|
230
|
+
def _run_web(args: argparse.Namespace) -> int:
|
|
231
|
+
run_web_app(args.host, args.port, args.debug)
|
|
232
|
+
return 0
|
|
233
233
|
|
|
234
234
|
|
|
235
235
|
def main(argv: Sequence[str] | None = None) -> int:
|
|
@@ -219,7 +219,7 @@ class CloudClient:
|
|
|
219
219
|
return compacted
|
|
220
220
|
|
|
221
221
|
def _headers(self) -> dict[str, str]:
|
|
222
|
-
headers = {"User-Agent": "docslight/0.1.2"}
|
|
222
|
+
headers = {"User-Agent": "docslight/0.1.4"}
|
|
223
223
|
if self.api_key:
|
|
224
224
|
headers["Authorization"] = f"Bearer {self.api_key}"
|
|
225
225
|
headers["x-api-key"] = self.api_key
|
|
@@ -6,9 +6,9 @@ from typing import Any
|
|
|
6
6
|
|
|
7
7
|
from docslight.exceptions import DependencyMissingError, LocalProcessingError
|
|
8
8
|
|
|
9
|
-
INSTALL_LOCAL_LLM_MESSAGE = (
|
|
10
|
-
"Install local
|
|
11
|
-
)
|
|
9
|
+
INSTALL_LOCAL_LLM_MESSAGE = (
|
|
10
|
+
"Install local dependencies with: pip install 'docslight[local]'"
|
|
11
|
+
)
|
|
12
12
|
NO_TEXT_CONTENT_MESSAGE = "OpenAI-compatible provider returned no text content"
|
|
13
13
|
REQUEST_FAILED_MESSAGE = "OpenAI-compatible provider request failed"
|
|
14
14
|
|
|
@@ -11,8 +11,8 @@ NormalizedFields = list[str] | StructuredFields | None
|
|
|
11
11
|
ExtractSchema = dict[str, Any]
|
|
12
12
|
|
|
13
13
|
|
|
14
|
-
def normalize_fields(fields: list[str] | str | StructuredFields | None) -> NormalizedFields:
|
|
15
|
-
"""Normalize extraction fields from SDK, CLI, or
|
|
14
|
+
def normalize_fields(fields: list[str] | str | StructuredFields | None) -> NormalizedFields:
|
|
15
|
+
"""Normalize extraction fields from SDK, CLI, or API inputs."""
|
|
16
16
|
if fields is None:
|
|
17
17
|
return None
|
|
18
18
|
if isinstance(fields, str):
|
|
@@ -2,19 +2,19 @@
|
|
|
2
2
|
|
|
3
3
|
from __future__ import annotations
|
|
4
4
|
|
|
5
|
-
import argparse
|
|
6
|
-
import base64
|
|
7
|
-
import json
|
|
8
|
-
import logging
|
|
9
|
-
import sys
|
|
10
|
-
import tempfile
|
|
5
|
+
import argparse
|
|
6
|
+
import base64
|
|
7
|
+
import json
|
|
8
|
+
import logging
|
|
9
|
+
import sys
|
|
10
|
+
import tempfile
|
|
11
11
|
from collections.abc import Callable
|
|
12
12
|
from io import BytesIO
|
|
13
13
|
from json import JSONDecodeError
|
|
14
14
|
from pathlib import Path
|
|
15
15
|
from typing import Any, cast
|
|
16
16
|
|
|
17
|
-
from flask import Flask, Response, jsonify, render_template, request, send_file
|
|
17
|
+
from flask import Flask, Response, jsonify, request, send_file
|
|
18
18
|
from werkzeug.datastructures import FileStorage
|
|
19
19
|
from werkzeug.utils import secure_filename
|
|
20
20
|
|
|
@@ -58,21 +58,18 @@ OFFICE_PREVIEW_UNSUPPORTED_MESSAGE = (
|
|
|
58
58
|
LOG_FORMAT = "%(levelname)s:%(name)s:%(message)s"
|
|
59
59
|
|
|
60
60
|
|
|
61
|
-
def create_app(docslight_factory: Callable[..., Any] = DocSlight) -> Flask:
|
|
62
|
-
"""Create the local DocSlight Flask application."""
|
|
63
|
-
app = Flask(__name__)
|
|
64
|
-
|
|
65
|
-
@app.get("/")
|
|
66
|
-
def index() -> Any:
|
|
67
|
-
return
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
@app.get("/extract")
|
|
74
|
-
def extract_page() -> str:
|
|
75
|
-
return render_template("extract.html", active_page="extract")
|
|
61
|
+
def create_app(docslight_factory: Callable[..., Any] = DocSlight) -> Flask:
|
|
62
|
+
"""Create the local DocSlight Flask application."""
|
|
63
|
+
app = Flask(__name__)
|
|
64
|
+
|
|
65
|
+
@app.get("/")
|
|
66
|
+
def index() -> Any:
|
|
67
|
+
return jsonify(
|
|
68
|
+
{
|
|
69
|
+
"status": "healthy",
|
|
70
|
+
"service": "docslight-web",
|
|
71
|
+
}
|
|
72
|
+
)
|
|
76
73
|
|
|
77
74
|
@app.get("/api/health")
|
|
78
75
|
def health() -> Any:
|
|
@@ -160,7 +157,11 @@ def create_app(docslight_factory: Callable[..., Any] = DocSlight) -> Flask:
|
|
|
160
157
|
return app
|
|
161
158
|
|
|
162
159
|
|
|
163
|
-
def run_web_app(
|
|
160
|
+
def run_web_app(
|
|
161
|
+
host: str = "127.0.0.1",
|
|
162
|
+
port: int = 8000,
|
|
163
|
+
debug: bool = False,
|
|
164
|
+
) -> None:
|
|
164
165
|
"""Run the local DocSlight web application."""
|
|
165
166
|
_configure_web_logging(debug)
|
|
166
167
|
create_app().run(host=host, port=port, debug=debug)
|
|
@@ -183,17 +184,17 @@ def build_parser() -> argparse.ArgumentParser:
|
|
|
183
184
|
prog="python -m docslight.web_app",
|
|
184
185
|
description="Run the DocSlight web application.",
|
|
185
186
|
)
|
|
186
|
-
parser.add_argument("--host", default="127.0.0.1")
|
|
187
|
-
parser.add_argument("--port", type=int, default=8000)
|
|
188
|
-
parser.add_argument("--debug", action="store_true")
|
|
189
|
-
return parser
|
|
187
|
+
parser.add_argument("--host", default="127.0.0.1")
|
|
188
|
+
parser.add_argument("--port", type=int, default=8000)
|
|
189
|
+
parser.add_argument("--debug", action="store_true")
|
|
190
|
+
return parser
|
|
190
191
|
|
|
191
192
|
|
|
192
193
|
def main(argv: list[str] | None = None) -> int:
|
|
193
|
-
"""Run the standalone DocSlight web application entrypoint."""
|
|
194
|
-
args = build_parser().parse_args(argv)
|
|
195
|
-
run_web_app(args.host, args.port, args.debug)
|
|
196
|
-
return 0
|
|
194
|
+
"""Run the standalone DocSlight web application entrypoint."""
|
|
195
|
+
args = build_parser().parse_args(argv)
|
|
196
|
+
run_web_app(args.host, args.port, args.debug)
|
|
197
|
+
return 0
|
|
197
198
|
|
|
198
199
|
|
|
199
200
|
def local_llm_from_form(form: Any) -> dict[str, str] | None:
|
|
@@ -1,18 +1,16 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: docslight
|
|
3
|
-
Version: 0.1.2
|
|
3
|
+
Version: 0.1.4
|
|
4
4
|
Summary: Lightweight ComPDF document parsing and extraction SDK
|
|
5
5
|
Author-email: ComPDF AI <support@compdf.com>
|
|
6
6
|
License-Expression: MIT
|
|
7
|
-
Requires-Python:
|
|
7
|
+
Requires-Python: <=3.13,>=3.10
|
|
8
8
|
Description-Content-Type: text/markdown
|
|
9
9
|
License-File: LICENSE
|
|
10
10
|
Requires-Dist: requests>=2.31.0
|
|
11
11
|
Requires-Dist: pydantic>=2.5.0
|
|
12
12
|
Requires-Dist: typing-extensions>=4.8.0; python_version < "3.11"
|
|
13
13
|
Requires-Dist: tomli>=2.0.1; python_version < "3.11"
|
|
14
|
-
Requires-Dist: flask>=3.1.3
|
|
15
|
-
Requires-Dist: werkzeug>=3.1.8
|
|
16
14
|
Provides-Extra: local
|
|
17
15
|
Requires-Dist: Pillow>=10.0.0; extra == "local"
|
|
18
16
|
Requires-Dist: PyMuPDF>=1.23.0; extra == "local"
|
|
@@ -22,13 +20,10 @@ Requires-Dist: paddleocr[doc-parser]>=3.3.0; extra == "local"
|
|
|
22
20
|
Requires-Dist: python-docx>=1.1.0; extra == "local"
|
|
23
21
|
Requires-Dist: python-pptx>=0.6.23; extra == "local"
|
|
24
22
|
Requires-Dist: openpyxl>=3.1.0; extra == "local"
|
|
25
|
-
|
|
26
|
-
Requires-Dist: openai>=1.0.0; extra == "local-llm"
|
|
23
|
+
Requires-Dist: openai>=1.0.0; extra == "local"
|
|
27
24
|
Provides-Extra: web
|
|
28
|
-
Requires-Dist:
|
|
29
|
-
Requires-Dist:
|
|
30
|
-
Provides-Extra: web-test
|
|
31
|
-
Requires-Dist: playwright>=1.40.0; extra == "web-test"
|
|
25
|
+
Requires-Dist: flask>=3.1.3; extra == "web"
|
|
26
|
+
Requires-Dist: werkzeug>=3.1.8; extra == "web"
|
|
32
27
|
Provides-Extra: dev
|
|
33
28
|
Requires-Dist: pytest>=7.4.0; extra == "dev"
|
|
34
29
|
Requires-Dist: pytest-cov>=4.1.0; extra == "dev"
|
|
@@ -36,14 +31,18 @@ Requires-Dist: ruff>=0.1.0; extra == "dev"
|
|
|
36
31
|
Requires-Dist: mypy>=1.7.0; extra == "dev"
|
|
37
32
|
Requires-Dist: build>=1.0.0; extra == "dev"
|
|
38
33
|
Requires-Dist: twine>=4.0.0; extra == "dev"
|
|
34
|
+
Requires-Dist: playwright>=1.40.0; extra == "dev"
|
|
39
35
|
Requires-Dist: Pillow>=10.0.0; extra == "dev"
|
|
40
36
|
Requires-Dist: PyMuPDF>=1.23.0; extra == "dev"
|
|
41
37
|
Requires-Dist: numpy>=1.24.0; extra == "dev"
|
|
38
|
+
Requires-Dist: paddlepaddle>=3.3.0; extra == "dev"
|
|
39
|
+
Requires-Dist: paddleocr[doc-parser]>=3.3.0; extra == "dev"
|
|
42
40
|
Requires-Dist: python-docx>=1.1.0; extra == "dev"
|
|
43
41
|
Requires-Dist: python-pptx>=0.6.23; extra == "dev"
|
|
44
42
|
Requires-Dist: openpyxl>=3.1.0; extra == "dev"
|
|
45
|
-
Requires-Dist:
|
|
46
|
-
Requires-Dist:
|
|
43
|
+
Requires-Dist: openai>=1.0.0; extra == "dev"
|
|
44
|
+
Requires-Dist: flask>=3.1.3; extra == "dev"
|
|
45
|
+
Requires-Dist: werkzeug>=3.1.8; extra == "dev"
|
|
47
46
|
Dynamic: license-file
|
|
48
47
|
|
|
49
48
|
<p align="center">
|
|
@@ -87,14 +86,13 @@ Extract specific fields:
|
|
|
87
86
|
docslight extract invoice.pdf --fields invoice_number,total_amount
|
|
88
87
|
```
|
|
89
88
|
|
|
90
|
-
Launch the local
|
|
89
|
+
Launch the local API server:
|
|
91
90
|
|
|
92
91
|
```bash
|
|
93
|
-
pip install "docslight[web]"
|
|
94
92
|
docslight web
|
|
95
|
-
#
|
|
93
|
+
# Health: http://127.0.0.1:8000/api/health
|
|
96
94
|
|
|
97
|
-
# Or run the same
|
|
95
|
+
# Or run the same API server directly as a module
|
|
98
96
|
python -m docslight.web_app --host 0.0.0.0 --port 8000 --debug
|
|
99
97
|
```
|
|
100
98
|
|
|
@@ -104,7 +102,7 @@ python -m docslight.web_app --host 0.0.0.0 --port 8000 --debug
|
|
|
104
102
|
- **Parse → Markdown** — Convert PDF, DOCX, PPTX, XLSX, and images (PNG, JPG, TIFF, BMP, WebP) to clean Markdown
|
|
105
103
|
- **Extract → JSON** — Pull structured data by field list, JSON Schema, or structured template (key-value + table extraction)
|
|
106
104
|
- **CLI first** — Full-featured command-line interface, script-friendly
|
|
107
|
-
- **
|
|
105
|
+
- **API server** — Local Flask backend exposing parse, extract, preview, health, and system-info endpoints
|
|
108
106
|
- **Batch processing** — `parse_batch()` / `extract_batch()` for multiple files
|
|
109
107
|
- **Local LLM extraction** — Ollama or any OpenAI-compatible provider for offline extraction
|
|
110
108
|
- **Document types** — Classify and route documents by type for cloud extraction
|
|
@@ -116,8 +114,8 @@ python -m docslight.web_app --host 0.0.0.0 --port 8000 --debug
|
|
|
116
114
|
|----------|---------|
|
|
117
115
|
| Core SDK & CLI | `pip install docslight` |
|
|
118
116
|
| + Local parsing (OCR, Office) | `pip install "docslight[local]"` |
|
|
119
|
-
| +
|
|
120
|
-
| +
|
|
117
|
+
| + API server | `pip install "docslight[web]"` |
|
|
118
|
+
| + Local parsing and API server | `pip install "docslight[local,web]"` |
|
|
121
119
|
|
|
122
120
|
> Local CPU parsing is experimental. Validate accuracy and latency on your own documents before production use.
|
|
123
121
|
|
|
@@ -209,7 +207,7 @@ client = DocSlight(
|
|
|
209
207
|
|
|
210
208
|
```python
|
|
211
209
|
results = client.parse_batch(["doc1.pdf", "doc2.pdf", "doc3.pdf"])
|
|
212
|
-
for r in results:
|
|
210
|
+
for r in results:
|
|
213
211
|
print(r.to_markdown()[:200])
|
|
214
212
|
```
|
|
215
213
|
|
|
@@ -224,30 +222,29 @@ docslight parse invoice.pdf --mode local -o invoice.zip
|
|
|
224
222
|
# Extract
|
|
225
223
|
docslight extract invoice.pdf --mode cloud --fields invoice_number,total_amount
|
|
226
224
|
docslight extract invoice.pdf --mode local --fields invoice_number --local-llm-provider ollama --local-llm-model llama3.1
|
|
227
|
-
docslight extract "D:\pdf\invoice\1.pdf" --mode local --fields invoice_number --local-llm-provider ollama
|
|
225
|
+
docslight extract "D:\pdf\invoice\1.pdf" --mode local --fields invoice_number --local-llm-provider ollama --local-llm-model llama3.1
|
|
228
226
|
|
|
229
227
|
# Extract with schema
|
|
230
228
|
docslight extract invoice.pdf --schema schema.json
|
|
231
229
|
|
|
232
|
-
#
|
|
230
|
+
# API server
|
|
233
231
|
docslight web --host 127.0.0.1 --port 8000
|
|
234
232
|
```
|
|
235
233
|
|
|
236
|
-
##
|
|
234
|
+
## API Server
|
|
237
235
|
|
|
238
|
-
DocSlight
|
|
236
|
+
DocSlight includes a local Flask API server for document processing. Frontend assets are not bundled in this package.
|
|
239
237
|
|
|
240
238
|
```bash
|
|
241
|
-
pip install "docslight[web]"
|
|
242
239
|
docslight web
|
|
243
240
|
python -m docslight.web_app
|
|
244
241
|
```
|
|
245
242
|
|
|
246
|
-
-
|
|
247
|
-
-
|
|
248
|
-
-
|
|
249
|
-
-
|
|
250
|
-
-
|
|
243
|
+
- `GET /api/health`
|
|
244
|
+
- `GET /api/system-info`
|
|
245
|
+
- `POST /api/parse`
|
|
246
|
+
- `POST /api/extract`
|
|
247
|
+
- `POST /api/preview`
|
|
251
248
|
|
|
252
249
|
## Environment Variables
|
|
253
250
|
|
|
@@ -30,13 +30,4 @@ docslight/providers/__init__.py
|
|
|
30
30
|
docslight/providers/ollama.py
|
|
31
31
|
docslight/providers/openai_compatible.py
|
|
32
32
|
docslight/schemas/__init__.py
|
|
33
|
-
docslight/schemas/fields.py
|
|
34
|
-
docslight/static/styles.css
|
|
35
|
-
docslight/static/app/common.js
|
|
36
|
-
docslight/static/app/docslight-extract.json
|
|
37
|
-
docslight/static/app/extract.js
|
|
38
|
-
docslight/static/app/i18n.js
|
|
39
|
-
docslight/static/app/parse.js
|
|
40
|
-
docslight/templates/base.html
|
|
41
|
-
docslight/templates/extract.html
|
|
42
|
-
docslight/templates/parse.html
|
|
33
|
+
docslight/schemas/fields.py
|