docslight 0.1.2__tar.gz → 0.1.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {docslight-0.1.2 → docslight-0.1.3}/PKG-INFO +15 -17
- {docslight-0.1.2 → docslight-0.1.3}/README.md +32 -34
- {docslight-0.1.2 → docslight-0.1.3}/docslight/cli.py +10 -10
- {docslight-0.1.2 → docslight-0.1.3}/docslight/preview.py +1 -1
- {docslight-0.1.2 → docslight-0.1.3}/docslight/schemas/fields.py +2 -2
- {docslight-0.1.2 → docslight-0.1.3}/docslight/web_app.py +32 -31
- {docslight-0.1.2 → docslight-0.1.3}/docslight.egg-info/PKG-INFO +15 -17
- {docslight-0.1.2 → docslight-0.1.3}/docslight.egg-info/SOURCES.txt +12 -9
- {docslight-0.1.2 → docslight-0.1.3}/pyproject.toml +4 -7
- docslight-0.1.3/tests/test_cli.py +450 -0
- docslight-0.1.3/tests/test_cli_entrypoint.py +15 -0
- docslight-0.1.3/tests/test_client.py +255 -0
- docslight-0.1.3/tests/test_cloud_client.py +771 -0
- docslight-0.1.3/tests/test_config_result.py +231 -0
- docslight-0.1.3/tests/test_examples.py +20 -0
- docslight-0.1.3/tests/test_local_llm.py +401 -0
- docslight-0.1.3/tests/test_local_loader_parser.py +300 -0
- docslight-0.1.3/tests/test_local_office_loader.py +108 -0
- docslight-0.1.3/tests/test_local_pipeline.py +825 -0
- docslight-0.1.3/tests/test_schema_helpers.py +117 -0
- docslight-0.1.3/tests/test_web_app.py +442 -0
- docslight-0.1.2/docslight/static/app/common.js +0 -668
- docslight-0.1.2/docslight/static/app/docslight-extract.json +0 -307
- docslight-0.1.2/docslight/static/app/extract.js +0 -394
- docslight-0.1.2/docslight/static/app/i18n.js +0 -405
- docslight-0.1.2/docslight/static/app/parse.js +0 -161
- docslight-0.1.2/docslight/static/styles.css +0 -878
- docslight-0.1.2/docslight/templates/base.html +0 -36
- docslight-0.1.2/docslight/templates/extract.html +0 -123
- docslight-0.1.2/docslight/templates/parse.html +0 -81
- {docslight-0.1.2 → docslight-0.1.3}/LICENSE +0 -0
- {docslight-0.1.2 → docslight-0.1.3}/docslight/__init__.py +0 -0
- {docslight-0.1.2 → docslight-0.1.3}/docslight/client.py +0 -0
- {docslight-0.1.2 → docslight-0.1.3}/docslight/cloud/__init__.py +0 -0
- {docslight-0.1.2 → docslight-0.1.3}/docslight/cloud/client.py +0 -0
- {docslight-0.1.2 → docslight-0.1.3}/docslight/config.py +0 -0
- {docslight-0.1.2 → docslight-0.1.3}/docslight/exceptions.py +0 -0
- {docslight-0.1.2 → docslight-0.1.3}/docslight/local/__init__.py +0 -0
- {docslight-0.1.2 → docslight-0.1.3}/docslight/local/layout_blocks.py +0 -0
- {docslight-0.1.2 → docslight-0.1.3}/docslight/local/llm_extractor.py +0 -0
- {docslight-0.1.2 → docslight-0.1.3}/docslight/local/loaders.py +0 -0
- {docslight-0.1.2 → docslight-0.1.3}/docslight/local/markdown.py +0 -0
- {docslight-0.1.2 → docslight-0.1.3}/docslight/local/office_loader.py +0 -0
- {docslight-0.1.2 → docslight-0.1.3}/docslight/local/paddle_parser.py +0 -0
- {docslight-0.1.2 → docslight-0.1.3}/docslight/local/pipeline.py +0 -0
- {docslight-0.1.2 → docslight-0.1.3}/docslight/providers/__init__.py +0 -0
- {docslight-0.1.2 → docslight-0.1.3}/docslight/providers/ollama.py +0 -0
- {docslight-0.1.2 → docslight-0.1.3}/docslight/providers/openai_compatible.py +0 -0
- {docslight-0.1.2 → docslight-0.1.3}/docslight/result.py +0 -0
- {docslight-0.1.2 → docslight-0.1.3}/docslight/schemas/__init__.py +0 -0
- {docslight-0.1.2 → docslight-0.1.3}/docslight/standard_json.py +0 -0
- {docslight-0.1.2 → docslight-0.1.3}/docslight.egg-info/dependency_links.txt +0 -0
- {docslight-0.1.2 → docslight-0.1.3}/docslight.egg-info/entry_points.txt +0 -0
- {docslight-0.1.2 → docslight-0.1.3}/docslight.egg-info/requires.txt +0 -0
- {docslight-0.1.2 → docslight-0.1.3}/docslight.egg-info/top_level.txt +0 -0
- {docslight-0.1.2 → docslight-0.1.3}/setup.cfg +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: docslight
|
|
3
|
-
Version: 0.1.2
|
|
3
|
+
Version: 0.1.3
|
|
4
4
|
Summary: Lightweight ComPDF document parsing and extraction SDK
|
|
5
5
|
Author-email: ComPDF AI <support@compdf.com>
|
|
6
6
|
License-Expression: MIT
|
|
@@ -87,14 +87,13 @@ Extract specific fields:
|
|
|
87
87
|
docslight extract invoice.pdf --fields invoice_number,total_amount
|
|
88
88
|
```
|
|
89
89
|
|
|
90
|
-
Launch the local
|
|
90
|
+
Launch the local API server:
|
|
91
91
|
|
|
92
92
|
```bash
|
|
93
|
-
pip install "docslight[web]"
|
|
94
93
|
docslight web
|
|
95
|
-
#
|
|
94
|
+
# Health: http://127.0.0.1:8000/api/health
|
|
96
95
|
|
|
97
|
-
# Or run the same
|
|
96
|
+
# Or run the same API server directly as a module
|
|
98
97
|
python -m docslight.web_app --host 0.0.0.0 --port 8000 --debug
|
|
99
98
|
```
|
|
100
99
|
|
|
@@ -104,7 +103,7 @@ python -m docslight.web_app --host 0.0.0.0 --port 8000 --debug
|
|
|
104
103
|
- **Parse → Markdown** — Convert PDF, DOCX, PPTX, XLSX, and images (PNG, JPG, TIFF, BMP, WebP) to clean Markdown
|
|
105
104
|
- **Extract → JSON** — Pull structured data by field list, JSON Schema, or structured template (key-value + table extraction)
|
|
106
105
|
- **CLI first** — Full-featured command-line interface, script-friendly
|
|
107
|
-
- **
|
|
106
|
+
- **API server** — Local Flask backend exposing parse, extract, preview, health, and system-info endpoints
|
|
108
107
|
- **Batch processing** — `parse_batch()` / `extract_batch()` for multiple files
|
|
109
108
|
- **Local LLM extraction** — Ollama or any OpenAI-compatible provider for offline extraction
|
|
110
109
|
- **Document types** — Classify and route documents by type for cloud extraction
|
|
@@ -117,7 +116,7 @@ python -m docslight.web_app --host 0.0.0.0 --port 8000 --debug
|
|
|
117
116
|
| Core SDK & CLI | `pip install docslight` |
|
|
118
117
|
| + Local parsing (OCR, Office) | `pip install "docslight[local]"` |
|
|
119
118
|
| + Local LLM extraction | `pip install "docslight[local,local-llm]"` |
|
|
120
|
-
| +
|
|
119
|
+
| + API server | `pip install "docslight[web]"` |
|
|
121
120
|
|
|
122
121
|
> Local CPU parsing is experimental. Validate accuracy and latency on your own documents before production use.
|
|
123
122
|
|
|
@@ -209,7 +208,7 @@ client = DocSlight(
|
|
|
209
208
|
|
|
210
209
|
```python
|
|
211
210
|
results = client.parse_batch(["doc1.pdf", "doc2.pdf", "doc3.pdf"])
|
|
212
|
-
for r in results:
|
|
211
|
+
for r in results:
|
|
213
212
|
print(r.to_markdown()[:200])
|
|
214
213
|
```
|
|
215
214
|
|
|
@@ -229,25 +228,24 @@ docslight extract "D:\pdf\invoice\1.pdf" --mode local --fields invoice_number --
|
|
|
229
228
|
# Extract with schema
|
|
230
229
|
docslight extract invoice.pdf --schema schema.json
|
|
231
230
|
|
|
232
|
-
#
|
|
231
|
+
# API server
|
|
233
232
|
docslight web --host 127.0.0.1 --port 8000
|
|
234
233
|
```
|
|
235
234
|
|
|
236
|
-
##
|
|
235
|
+
## API Server
|
|
237
236
|
|
|
238
|
-
DocSlight
|
|
237
|
+
DocSlight includes a local Flask API server for document processing. Frontend assets are not bundled in this package.
|
|
239
238
|
|
|
240
239
|
```bash
|
|
241
|
-
pip install "docslight[web]"
|
|
242
240
|
docslight web
|
|
243
241
|
python -m docslight.web_app
|
|
244
242
|
```
|
|
245
243
|
|
|
246
|
-
-
|
|
247
|
-
-
|
|
248
|
-
-
|
|
249
|
-
-
|
|
250
|
-
-
|
|
244
|
+
- `GET /api/health`
|
|
245
|
+
- `GET /api/system-info`
|
|
246
|
+
- `POST /api/parse`
|
|
247
|
+
- `POST /api/extract`
|
|
248
|
+
- `POST /api/preview`
|
|
251
249
|
|
|
252
250
|
## Environment Variables
|
|
253
251
|
|
|
@@ -39,16 +39,15 @@ Extract specific fields:
|
|
|
39
39
|
docslight extract invoice.pdf --fields invoice_number,total_amount
|
|
40
40
|
```
|
|
41
41
|
|
|
42
|
-
Launch the local
|
|
43
|
-
|
|
44
|
-
```bash
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
```
|
|
42
|
+
Launch the local API server:
|
|
43
|
+
|
|
44
|
+
```bash
|
|
45
|
+
docslight web
|
|
46
|
+
# Health: http://127.0.0.1:8000/api/health
|
|
47
|
+
|
|
48
|
+
# Or run the same API server directly as a module
|
|
49
|
+
python -m docslight.web_app --host 0.0.0.0 --port 8000 --debug
|
|
50
|
+
```
|
|
52
51
|
|
|
53
52
|
## Features
|
|
54
53
|
|
|
@@ -56,7 +55,7 @@ python -m docslight.web_app --host 0.0.0.0 --port 8000 --debug
|
|
|
56
55
|
- **Parse → Markdown** — Convert PDF, DOCX, PPTX, XLSX, and images (PNG, JPG, TIFF, BMP, WebP) to clean Markdown
|
|
57
56
|
- **Extract → JSON** — Pull structured data by field list, JSON Schema, or structured template (key-value + table extraction)
|
|
58
57
|
- **CLI first** — Full-featured command-line interface, script-friendly
|
|
59
|
-
- **
|
|
58
|
+
- **API server** — Local Flask backend exposing parse, extract, preview, health, and system-info endpoints
|
|
60
59
|
- **Batch processing** — `parse_batch()` / `extract_batch()` for multiple files
|
|
61
60
|
- **Local LLM extraction** — Ollama or any OpenAI-compatible provider for offline extraction
|
|
62
61
|
- **Document types** — Classify and route documents by type for cloud extraction
|
|
@@ -69,7 +68,7 @@ python -m docslight.web_app --host 0.0.0.0 --port 8000 --debug
|
|
|
69
68
|
| Core SDK & CLI | `pip install docslight` |
|
|
70
69
|
| + Local parsing (OCR, Office) | `pip install "docslight[local]"` |
|
|
71
70
|
| + Local LLM extraction | `pip install "docslight[local,local-llm]"` |
|
|
72
|
-
| +
|
|
71
|
+
| + API server | `pip install "docslight[web]"` |
|
|
73
72
|
|
|
74
73
|
> Local CPU parsing is experimental. Validate accuracy and latency on your own documents before production use.
|
|
75
74
|
|
|
@@ -160,9 +159,9 @@ client = DocSlight(
|
|
|
160
159
|
### Batch Processing
|
|
161
160
|
|
|
162
161
|
```python
|
|
163
|
-
results = client.parse_batch(["doc1.pdf", "doc2.pdf", "doc3.pdf"])
|
|
164
|
-
for r in results:
|
|
165
|
-
print(r.to_markdown()[:200])
|
|
162
|
+
results = client.parse_batch(["doc1.pdf", "doc2.pdf", "doc3.pdf"])
|
|
163
|
+
for r in results:
|
|
164
|
+
print(r.to_markdown()[:200])
|
|
166
165
|
```
|
|
167
166
|
|
|
168
167
|
## CLI Usage
|
|
@@ -181,25 +180,24 @@ docslight extract "D:\pdf\invoice\1.pdf" --mode local --fields invoice_number --
|
|
|
181
180
|
# Extract with schema
|
|
182
181
|
docslight extract invoice.pdf --schema schema.json
|
|
183
182
|
|
|
184
|
-
#
|
|
185
|
-
docslight web --host 127.0.0.1 --port 8000
|
|
186
|
-
```
|
|
187
|
-
|
|
188
|
-
##
|
|
189
|
-
|
|
190
|
-
DocSlight
|
|
191
|
-
|
|
192
|
-
```bash
|
|
193
|
-
|
|
194
|
-
docslight
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
-
|
|
199
|
-
-
|
|
200
|
-
-
|
|
201
|
-
-
|
|
202
|
-
- **Download results** — One-click download of Markdown or JSON output
|
|
183
|
+
# API server
|
|
184
|
+
docslight web --host 127.0.0.1 --port 8000
|
|
185
|
+
```
|
|
186
|
+
|
|
187
|
+
## API Server
|
|
188
|
+
|
|
189
|
+
DocSlight includes a local Flask API server for document processing. Frontend assets are not bundled in this package.
|
|
190
|
+
|
|
191
|
+
```bash
|
|
192
|
+
docslight web
|
|
193
|
+
python -m docslight.web_app
|
|
194
|
+
```
|
|
195
|
+
|
|
196
|
+
- `GET /api/health`
|
|
197
|
+
- `GET /api/system-info`
|
|
198
|
+
- `POST /api/parse`
|
|
199
|
+
- `POST /api/extract`
|
|
200
|
+
- `POST /api/preview`
|
|
203
201
|
|
|
204
202
|
## Environment Variables
|
|
205
203
|
|
|
@@ -108,7 +108,7 @@ def _to_pretty_json(data: Any) -> str:
|
|
|
108
108
|
return json.dumps(data, ensure_ascii=False, indent=2)
|
|
109
109
|
|
|
110
110
|
|
|
111
|
-
def run_web_app(host: str, port: int, debug: bool) -> None:
|
|
111
|
+
def run_web_app(host: str, port: int, debug: bool) -> None:
|
|
112
112
|
"""Run the optional Flask web application."""
|
|
113
113
|
if importlib.util.find_spec("docslight.web_app") is None:
|
|
114
114
|
raise CLIUsageError(WEB_EXTRA_ERROR)
|
|
@@ -119,8 +119,8 @@ def run_web_app(host: str, port: int, debug: bool) -> None:
|
|
|
119
119
|
if exc.name in {"flask", "werkzeug"}:
|
|
120
120
|
raise CLIUsageError(WEB_EXTRA_ERROR) from exc
|
|
121
121
|
raise
|
|
122
|
-
_run_web_app = web_app.run_web_app
|
|
123
|
-
_run_web_app(host, port, debug)
|
|
122
|
+
_run_web_app = web_app.run_web_app
|
|
123
|
+
_run_web_app(host, port, debug)
|
|
124
124
|
|
|
125
125
|
|
|
126
126
|
def _print_cli_error(error: Exception) -> int:
|
|
@@ -165,10 +165,10 @@ def build_parser() -> argparse.ArgumentParser:
|
|
|
165
165
|
extract_parser.set_defaults(func=_run_extract)
|
|
166
166
|
|
|
167
167
|
web_parser = subparsers.add_parser("web", help="Run the web application")
|
|
168
|
-
web_parser.add_argument("--host", default="127.0.0.1")
|
|
169
|
-
web_parser.add_argument("--port", type=int, default=8000)
|
|
170
|
-
web_parser.add_argument("--debug", action="store_true")
|
|
171
|
-
web_parser.set_defaults(func=_run_web)
|
|
168
|
+
web_parser.add_argument("--host", default="127.0.0.1")
|
|
169
|
+
web_parser.add_argument("--port", type=int, default=8000)
|
|
170
|
+
web_parser.add_argument("--debug", action="store_true")
|
|
171
|
+
web_parser.set_defaults(func=_run_web)
|
|
172
172
|
|
|
173
173
|
return parser
|
|
174
174
|
|
|
@@ -227,9 +227,9 @@ def _run_extract(args: argparse.Namespace) -> int:
|
|
|
227
227
|
return 0
|
|
228
228
|
|
|
229
229
|
|
|
230
|
-
def _run_web(args: argparse.Namespace) -> int:
|
|
231
|
-
run_web_app(args.host, args.port, args.debug)
|
|
232
|
-
return 0
|
|
230
|
+
def _run_web(args: argparse.Namespace) -> int:
|
|
231
|
+
run_web_app(args.host, args.port, args.debug)
|
|
232
|
+
return 0
|
|
233
233
|
|
|
234
234
|
|
|
235
235
|
def main(argv: Sequence[str] | None = None) -> int:
|
|
@@ -11,8 +11,8 @@ NormalizedFields = list[str] | StructuredFields | None
|
|
|
11
11
|
ExtractSchema = dict[str, Any]
|
|
12
12
|
|
|
13
13
|
|
|
14
|
-
def normalize_fields(fields: list[str] | str | StructuredFields | None) -> NormalizedFields:
|
|
15
|
-
"""Normalize extraction fields from SDK, CLI, or
|
|
14
|
+
def normalize_fields(fields: list[str] | str | StructuredFields | None) -> NormalizedFields:
|
|
15
|
+
"""Normalize extraction fields from SDK, CLI, or API inputs."""
|
|
16
16
|
if fields is None:
|
|
17
17
|
return None
|
|
18
18
|
if isinstance(fields, str):
|
|
@@ -2,19 +2,19 @@
|
|
|
2
2
|
|
|
3
3
|
from __future__ import annotations
|
|
4
4
|
|
|
5
|
-
import argparse
|
|
6
|
-
import base64
|
|
7
|
-
import json
|
|
8
|
-
import logging
|
|
9
|
-
import sys
|
|
10
|
-
import tempfile
|
|
5
|
+
import argparse
|
|
6
|
+
import base64
|
|
7
|
+
import json
|
|
8
|
+
import logging
|
|
9
|
+
import sys
|
|
10
|
+
import tempfile
|
|
11
11
|
from collections.abc import Callable
|
|
12
12
|
from io import BytesIO
|
|
13
13
|
from json import JSONDecodeError
|
|
14
14
|
from pathlib import Path
|
|
15
15
|
from typing import Any, cast
|
|
16
16
|
|
|
17
|
-
from flask import Flask, Response, jsonify, render_template, request, send_file
|
|
17
|
+
from flask import Flask, Response, jsonify, request, send_file
|
|
18
18
|
from werkzeug.datastructures import FileStorage
|
|
19
19
|
from werkzeug.utils import secure_filename
|
|
20
20
|
|
|
@@ -58,21 +58,18 @@ OFFICE_PREVIEW_UNSUPPORTED_MESSAGE = (
|
|
|
58
58
|
LOG_FORMAT = "%(levelname)s:%(name)s:%(message)s"
|
|
59
59
|
|
|
60
60
|
|
|
61
|
-
def create_app(docslight_factory: Callable[..., Any] = DocSlight) -> Flask:
|
|
62
|
-
"""Create the local DocSlight Flask application."""
|
|
63
|
-
app = Flask(__name__)
|
|
64
|
-
|
|
65
|
-
@app.get("/")
|
|
66
|
-
def index() -> Any:
|
|
67
|
-
return
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
@app.get("/extract")
|
|
74
|
-
def extract_page() -> str:
|
|
75
|
-
return render_template("extract.html", active_page="extract")
|
|
61
|
+
def create_app(docslight_factory: Callable[..., Any] = DocSlight) -> Flask:
|
|
62
|
+
"""Create the local DocSlight Flask application."""
|
|
63
|
+
app = Flask(__name__)
|
|
64
|
+
|
|
65
|
+
@app.get("/")
|
|
66
|
+
def index() -> Any:
|
|
67
|
+
return jsonify(
|
|
68
|
+
{
|
|
69
|
+
"status": "healthy",
|
|
70
|
+
"service": "docslight-web",
|
|
71
|
+
}
|
|
72
|
+
)
|
|
76
73
|
|
|
77
74
|
@app.get("/api/health")
|
|
78
75
|
def health() -> Any:
|
|
@@ -160,7 +157,11 @@ def create_app(docslight_factory: Callable[..., Any] = DocSlight) -> Flask:
|
|
|
160
157
|
return app
|
|
161
158
|
|
|
162
159
|
|
|
163
|
-
def run_web_app(
|
|
160
|
+
def run_web_app(
|
|
161
|
+
host: str = "127.0.0.1",
|
|
162
|
+
port: int = 8000,
|
|
163
|
+
debug: bool = False,
|
|
164
|
+
) -> None:
|
|
164
165
|
"""Run the local DocSlight web application."""
|
|
165
166
|
_configure_web_logging(debug)
|
|
166
167
|
create_app().run(host=host, port=port, debug=debug)
|
|
@@ -183,17 +184,17 @@ def build_parser() -> argparse.ArgumentParser:
|
|
|
183
184
|
prog="python -m docslight.web_app",
|
|
184
185
|
description="Run the DocSlight web application.",
|
|
185
186
|
)
|
|
186
|
-
parser.add_argument("--host", default="127.0.0.1")
|
|
187
|
-
parser.add_argument("--port", type=int, default=8000)
|
|
188
|
-
parser.add_argument("--debug", action="store_true")
|
|
189
|
-
return parser
|
|
187
|
+
parser.add_argument("--host", default="127.0.0.1")
|
|
188
|
+
parser.add_argument("--port", type=int, default=8000)
|
|
189
|
+
parser.add_argument("--debug", action="store_true")
|
|
190
|
+
return parser
|
|
190
191
|
|
|
191
192
|
|
|
192
193
|
def main(argv: list[str] | None = None) -> int:
|
|
193
|
-
"""Run the standalone DocSlight web application entrypoint."""
|
|
194
|
-
args = build_parser().parse_args(argv)
|
|
195
|
-
run_web_app(args.host, args.port, args.debug)
|
|
196
|
-
return 0
|
|
194
|
+
"""Run the standalone DocSlight web application entrypoint."""
|
|
195
|
+
args = build_parser().parse_args(argv)
|
|
196
|
+
run_web_app(args.host, args.port, args.debug)
|
|
197
|
+
return 0
|
|
197
198
|
|
|
198
199
|
|
|
199
200
|
def local_llm_from_form(form: Any) -> dict[str, str] | None:
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: docslight
|
|
3
|
-
Version: 0.1.2
|
|
3
|
+
Version: 0.1.3
|
|
4
4
|
Summary: Lightweight ComPDF document parsing and extraction SDK
|
|
5
5
|
Author-email: ComPDF AI <support@compdf.com>
|
|
6
6
|
License-Expression: MIT
|
|
@@ -87,14 +87,13 @@ Extract specific fields:
|
|
|
87
87
|
docslight extract invoice.pdf --fields invoice_number,total_amount
|
|
88
88
|
```
|
|
89
89
|
|
|
90
|
-
Launch the local
|
|
90
|
+
Launch the local API server:
|
|
91
91
|
|
|
92
92
|
```bash
|
|
93
|
-
pip install "docslight[web]"
|
|
94
93
|
docslight web
|
|
95
|
-
#
|
|
94
|
+
# Health: http://127.0.0.1:8000/api/health
|
|
96
95
|
|
|
97
|
-
# Or run the same
|
|
96
|
+
# Or run the same API server directly as a module
|
|
98
97
|
python -m docslight.web_app --host 0.0.0.0 --port 8000 --debug
|
|
99
98
|
```
|
|
100
99
|
|
|
@@ -104,7 +103,7 @@ python -m docslight.web_app --host 0.0.0.0 --port 8000 --debug
|
|
|
104
103
|
- **Parse → Markdown** — Convert PDF, DOCX, PPTX, XLSX, and images (PNG, JPG, TIFF, BMP, WebP) to clean Markdown
|
|
105
104
|
- **Extract → JSON** — Pull structured data by field list, JSON Schema, or structured template (key-value + table extraction)
|
|
106
105
|
- **CLI first** — Full-featured command-line interface, script-friendly
|
|
107
|
-
- **
|
|
106
|
+
- **API server** — Local Flask backend exposing parse, extract, preview, health, and system-info endpoints
|
|
108
107
|
- **Batch processing** — `parse_batch()` / `extract_batch()` for multiple files
|
|
109
108
|
- **Local LLM extraction** — Ollama or any OpenAI-compatible provider for offline extraction
|
|
110
109
|
- **Document types** — Classify and route documents by type for cloud extraction
|
|
@@ -117,7 +116,7 @@ python -m docslight.web_app --host 0.0.0.0 --port 8000 --debug
|
|
|
117
116
|
| Core SDK & CLI | `pip install docslight` |
|
|
118
117
|
| + Local parsing (OCR, Office) | `pip install "docslight[local]"` |
|
|
119
118
|
| + Local LLM extraction | `pip install "docslight[local,local-llm]"` |
|
|
120
|
-
| +
|
|
119
|
+
| + API server | `pip install "docslight[web]"` |
|
|
121
120
|
|
|
122
121
|
> Local CPU parsing is experimental. Validate accuracy and latency on your own documents before production use.
|
|
123
122
|
|
|
@@ -209,7 +208,7 @@ client = DocSlight(
|
|
|
209
208
|
|
|
210
209
|
```python
|
|
211
210
|
results = client.parse_batch(["doc1.pdf", "doc2.pdf", "doc3.pdf"])
|
|
212
|
-
for r in results:
|
|
211
|
+
for r in results:
|
|
213
212
|
print(r.to_markdown()[:200])
|
|
214
213
|
```
|
|
215
214
|
|
|
@@ -229,25 +228,24 @@ docslight extract "D:\pdf\invoice\1.pdf" --mode local --fields invoice_number --
|
|
|
229
228
|
# Extract with schema
|
|
230
229
|
docslight extract invoice.pdf --schema schema.json
|
|
231
230
|
|
|
232
|
-
#
|
|
231
|
+
# API server
|
|
233
232
|
docslight web --host 127.0.0.1 --port 8000
|
|
234
233
|
```
|
|
235
234
|
|
|
236
|
-
##
|
|
235
|
+
## API Server
|
|
237
236
|
|
|
238
|
-
DocSlight
|
|
237
|
+
DocSlight includes a local Flask API server for document processing. Frontend assets are not bundled in this package.
|
|
239
238
|
|
|
240
239
|
```bash
|
|
241
|
-
pip install "docslight[web]"
|
|
242
240
|
docslight web
|
|
243
241
|
python -m docslight.web_app
|
|
244
242
|
```
|
|
245
243
|
|
|
246
|
-
-
|
|
247
|
-
-
|
|
248
|
-
-
|
|
249
|
-
-
|
|
250
|
-
-
|
|
244
|
+
- `GET /api/health`
|
|
245
|
+
- `GET /api/system-info`
|
|
246
|
+
- `POST /api/parse`
|
|
247
|
+
- `POST /api/extract`
|
|
248
|
+
- `POST /api/preview`
|
|
251
249
|
|
|
252
250
|
## Environment Variables
|
|
253
251
|
|
|
@@ -31,12 +31,15 @@ docslight/providers/ollama.py
|
|
|
31
31
|
docslight/providers/openai_compatible.py
|
|
32
32
|
docslight/schemas/__init__.py
|
|
33
33
|
docslight/schemas/fields.py
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
34
|
+
tests/test_cli.py
|
|
35
|
+
tests/test_cli_entrypoint.py
|
|
36
|
+
tests/test_client.py
|
|
37
|
+
tests/test_cloud_client.py
|
|
38
|
+
tests/test_config_result.py
|
|
39
|
+
tests/test_examples.py
|
|
40
|
+
tests/test_local_llm.py
|
|
41
|
+
tests/test_local_loader_parser.py
|
|
42
|
+
tests/test_local_office_loader.py
|
|
43
|
+
tests/test_local_pipeline.py
|
|
44
|
+
tests/test_schema_helpers.py
|
|
45
|
+
tests/test_web_app.py
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "docslight"
|
|
7
|
-
version = "0.1.2"
|
|
7
|
+
version = "0.1.3"
|
|
8
8
|
description = "Lightweight ComPDF document parsing and extraction SDK"
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
requires-python = ">=3.10"
|
|
@@ -65,12 +65,9 @@ docslight = "docslight.cli:main"
|
|
|
65
65
|
[tool.setuptools.packages.find]
|
|
66
66
|
include = ["docslight*"]
|
|
67
67
|
|
|
68
|
-
[tool.
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
[tool.ruff]
|
|
72
|
-
line-length = 88
|
|
73
|
-
target-version = "py310"
|
|
68
|
+
[tool.ruff]
|
|
69
|
+
line-length = 88
|
|
70
|
+
target-version = "py310"
|
|
74
71
|
|
|
75
72
|
[tool.ruff.lint]
|
|
76
73
|
select = ["E", "F", "I", "UP", "B"]
|