docslight 0.1.1__tar.gz → 0.1.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {docslight-0.1.1 → docslight-0.1.2}/PKG-INFO +7 -4
- {docslight-0.1.1 → docslight-0.1.2}/README.md +18 -15
- {docslight-0.1.1 → docslight-0.1.2}/docslight/__init__.py +1 -1
- {docslight-0.1.1 → docslight-0.1.2}/docslight/cli.py +53 -23
- {docslight-0.1.1 → docslight-0.1.2}/docslight/cloud/client.py +6 -6
- {docslight-0.1.1 → docslight-0.1.2}/docslight.egg-info/PKG-INFO +7 -4
- {docslight-0.1.1 → docslight-0.1.2}/pyproject.toml +1 -1
- {docslight-0.1.1 → docslight-0.1.2}/LICENSE +0 -0
- {docslight-0.1.1 → docslight-0.1.2}/docslight/client.py +0 -0
- {docslight-0.1.1 → docslight-0.1.2}/docslight/cloud/__init__.py +0 -0
- {docslight-0.1.1 → docslight-0.1.2}/docslight/config.py +0 -0
- {docslight-0.1.1 → docslight-0.1.2}/docslight/exceptions.py +0 -0
- {docslight-0.1.1 → docslight-0.1.2}/docslight/local/__init__.py +0 -0
- {docslight-0.1.1 → docslight-0.1.2}/docslight/local/layout_blocks.py +0 -0
- {docslight-0.1.1 → docslight-0.1.2}/docslight/local/llm_extractor.py +0 -0
- {docslight-0.1.1 → docslight-0.1.2}/docslight/local/loaders.py +0 -0
- {docslight-0.1.1 → docslight-0.1.2}/docslight/local/markdown.py +0 -0
- {docslight-0.1.1 → docslight-0.1.2}/docslight/local/office_loader.py +0 -0
- {docslight-0.1.1 → docslight-0.1.2}/docslight/local/paddle_parser.py +0 -0
- {docslight-0.1.1 → docslight-0.1.2}/docslight/local/pipeline.py +0 -0
- {docslight-0.1.1 → docslight-0.1.2}/docslight/preview.py +0 -0
- {docslight-0.1.1 → docslight-0.1.2}/docslight/providers/__init__.py +0 -0
- {docslight-0.1.1 → docslight-0.1.2}/docslight/providers/ollama.py +0 -0
- {docslight-0.1.1 → docslight-0.1.2}/docslight/providers/openai_compatible.py +0 -0
- {docslight-0.1.1 → docslight-0.1.2}/docslight/result.py +0 -0
- {docslight-0.1.1 → docslight-0.1.2}/docslight/schemas/__init__.py +0 -0
- {docslight-0.1.1 → docslight-0.1.2}/docslight/schemas/fields.py +0 -0
- {docslight-0.1.1 → docslight-0.1.2}/docslight/standard_json.py +0 -0
- {docslight-0.1.1 → docslight-0.1.2}/docslight/static/app/common.js +0 -0
- {docslight-0.1.1 → docslight-0.1.2}/docslight/static/app/docslight-extract.json +0 -0
- {docslight-0.1.1 → docslight-0.1.2}/docslight/static/app/extract.js +0 -0
- {docslight-0.1.1 → docslight-0.1.2}/docslight/static/app/i18n.js +0 -0
- {docslight-0.1.1 → docslight-0.1.2}/docslight/static/app/parse.js +0 -0
- {docslight-0.1.1 → docslight-0.1.2}/docslight/static/styles.css +0 -0
- {docslight-0.1.1 → docslight-0.1.2}/docslight/templates/base.html +0 -0
- {docslight-0.1.1 → docslight-0.1.2}/docslight/templates/extract.html +0 -0
- {docslight-0.1.1 → docslight-0.1.2}/docslight/templates/parse.html +0 -0
- {docslight-0.1.1 → docslight-0.1.2}/docslight/web_app.py +0 -0
- {docslight-0.1.1 → docslight-0.1.2}/docslight.egg-info/SOURCES.txt +0 -0
- {docslight-0.1.1 → docslight-0.1.2}/docslight.egg-info/dependency_links.txt +0 -0
- {docslight-0.1.1 → docslight-0.1.2}/docslight.egg-info/entry_points.txt +0 -0
- {docslight-0.1.1 → docslight-0.1.2}/docslight.egg-info/requires.txt +0 -0
- {docslight-0.1.1 → docslight-0.1.2}/docslight.egg-info/top_level.txt +0 -0
- {docslight-0.1.1 → docslight-0.1.2}/setup.cfg +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: docslight
|
|
3
|
-
Version: 0.1.1
|
|
3
|
+
Version: 0.1.2
|
|
4
4
|
Summary: Lightweight ComPDF document parsing and extraction SDK
|
|
5
5
|
Author-email: ComPDF AI <support@compdf.com>
|
|
6
6
|
License-Expression: MIT
|
|
@@ -78,6 +78,7 @@ Parse any document:
|
|
|
78
78
|
|
|
79
79
|
```bash
|
|
80
80
|
docslight parse invoice.pdf --output invoice.md
|
|
81
|
+
docslight parse invoice.pdf --format zip --output invoice.zip
|
|
81
82
|
```
|
|
82
83
|
|
|
83
84
|
Extract specific fields:
|
|
@@ -208,7 +209,7 @@ client = DocSlight(
|
|
|
208
209
|
|
|
209
210
|
```python
|
|
210
211
|
results = client.parse_batch(["doc1.pdf", "doc2.pdf", "doc3.pdf"])
|
|
211
|
-
for r in results:
|
|
212
|
+
for r in results:[release.ps1](scripts/release.ps1)
|
|
212
213
|
print(r.to_markdown()[:200])
|
|
213
214
|
```
|
|
214
215
|
|
|
@@ -216,12 +217,14 @@ for r in results:
|
|
|
216
217
|
|
|
217
218
|
```bash
|
|
218
219
|
# Parse
|
|
219
|
-
docslight parse invoice.pdf --mode cloud -o invoice.md
|
|
220
|
-
docslight parse invoice.pdf --mode local -o invoice.md
|
|
220
|
+
docslight parse invoice.pdf --mode cloud -o invoice.zip
|
|
221
|
+
docslight parse invoice.pdf --mode cloud --format zip -o invoice.zip
|
|
222
|
+
docslight parse invoice.pdf --mode local -o invoice.zip
|
|
221
223
|
|
|
222
224
|
# Extract
|
|
223
225
|
docslight extract invoice.pdf --mode cloud --fields invoice_number,total_amount
|
|
224
226
|
docslight extract invoice.pdf --mode local --fields invoice_number --local-llm-provider ollama --local-llm-model llama3.1
|
|
227
|
+
docslight extract "D:\pdf\invoice\1.pdf" --mode local --fields invoice_number --local-llm-provider ollama
|
|
225
228
|
|
|
226
229
|
# Extract with schema
|
|
227
230
|
docslight extract invoice.pdf --schema schema.json
|
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
<p align="center">
|
|
2
|
-
<h1 align="center">DocSlight</h1>
|
|
2
|
+
<h1 align="center">DocSlight</h1>
|
|
3
3
|
<p align="center">Lightweight Python SDK & CLI for document parsing and structured extraction</p>
|
|
4
4
|
<p align="center">
|
|
5
|
-
<a href="https://pypi.org/project/docslight/"><img src="https://img.shields.io/pypi/v/docslight" alt="PyPI"></a>
|
|
6
|
-
<a href="https://pypi.org/project/docslight/"><img src="https://img.shields.io/pypi/pyversions/docslight" alt="Python versions"></a>
|
|
7
|
-
<a href="LICENSE"><img src="https://img.shields.io/github/license/kdanmobile/docslight" alt="License"></a>
|
|
5
|
+
<a href="https://pypi.org/project/docslight/"><img src="https://img.shields.io/pypi/v/docslight" alt="PyPI"></a>
|
|
6
|
+
<a href="https://pypi.org/project/docslight/"><img src="https://img.shields.io/pypi/pyversions/docslight" alt="Python versions"></a>
|
|
7
|
+
<a href="LICENSE"><img src="https://img.shields.io/github/license/kdanmobile/docslight" alt="License"></a>
|
|
8
8
|
</p>
|
|
9
9
|
</p>
|
|
10
10
|
|
|
11
|
-
## What is DocSlight?
|
|
11
|
+
## What is DocSlight?
|
|
12
12
|
|
|
13
13
|
A lightweight Python library that turns PDFs, images, and Office documents into clean Markdown or structured JSON — with one line of code. Works with ComPDF Cloud (recommended) or fully offline with local parsers.
|
|
14
14
|
|
|
@@ -23,13 +23,14 @@ print(result.to_markdown())
|
|
|
23
23
|
## Quick Start
|
|
24
24
|
|
|
25
25
|
```bash
|
|
26
|
-
pip install docslight
|
|
26
|
+
pip install docslight
|
|
27
27
|
```
|
|
28
28
|
|
|
29
29
|
Parse any document:
|
|
30
30
|
|
|
31
31
|
```bash
|
|
32
32
|
docslight parse invoice.pdf --output invoice.md
|
|
33
|
+
docslight parse invoice.pdf --format zip --output invoice.zip
|
|
33
34
|
```
|
|
34
35
|
|
|
35
36
|
Extract specific fields:
|
|
@@ -41,7 +42,7 @@ docslight extract invoice.pdf --fields invoice_number,total_amount
|
|
|
41
42
|
Launch the local Web UI workbench:
|
|
42
43
|
|
|
43
44
|
```bash
|
|
44
|
-
pip install "docslight[web]"
|
|
45
|
+
pip install "docslight[web]"
|
|
45
46
|
docslight web
|
|
46
47
|
# Open http://127.0.0.1:8000
|
|
47
48
|
|
|
@@ -65,10 +66,10 @@ python -m docslight.web_app --host 0.0.0.0 --port 8000 --debug
|
|
|
65
66
|
|
|
66
67
|
| Scenario | Command |
|
|
67
68
|
|----------|---------|
|
|
68
|
-
| Core SDK & CLI | `pip install docslight` |
|
|
69
|
-
| + Local parsing (OCR, Office) | `pip install "docslight[local]"` |
|
|
70
|
-
| + Local LLM extraction | `pip install "docslight[local,local-llm]"` |
|
|
71
|
-
| + Web UI workbench | `pip install "docslight[web]"` |
|
|
69
|
+
| Core SDK & CLI | `pip install docslight` |
|
|
70
|
+
| + Local parsing (OCR, Office) | `pip install "docslight[local]"` |
|
|
71
|
+
| + Local LLM extraction | `pip install "docslight[local,local-llm]"` |
|
|
72
|
+
| + Web UI workbench | `pip install "docslight[web]"` |
|
|
72
73
|
|
|
73
74
|
> Local CPU parsing is experimental. Validate accuracy and latency on your own documents before production use.
|
|
74
75
|
|
|
@@ -160,7 +161,7 @@ client = DocSlight(
|
|
|
160
161
|
|
|
161
162
|
```python
|
|
162
163
|
results = client.parse_batch(["doc1.pdf", "doc2.pdf", "doc3.pdf"])
|
|
163
|
-
for r in results:
|
|
164
|
+
for r in results:[release.ps1](scripts/release.ps1)
|
|
164
165
|
print(r.to_markdown()[:200])
|
|
165
166
|
```
|
|
166
167
|
|
|
@@ -168,12 +169,14 @@ for r in results:
|
|
|
168
169
|
|
|
169
170
|
```bash
|
|
170
171
|
# Parse
|
|
171
|
-
docslight parse invoice.pdf --mode cloud -o invoice.md
|
|
172
|
-
docslight parse invoice.pdf --mode local -o invoice.md
|
|
172
|
+
docslight parse invoice.pdf --mode cloud -o invoice.zip
|
|
173
|
+
docslight parse invoice.pdf --mode cloud --format zip -o invoice.zip
|
|
174
|
+
docslight parse invoice.pdf --mode local -o invoice.zip
|
|
173
175
|
|
|
174
176
|
# Extract
|
|
175
177
|
docslight extract invoice.pdf --mode cloud --fields invoice_number,total_amount
|
|
176
178
|
docslight extract invoice.pdf --mode local --fields invoice_number --local-llm-provider ollama --local-llm-model llama3.1
|
|
179
|
+
docslight extract "D:\pdf\invoice\1.pdf" --mode local --fields invoice_number --local-llm-provider ollama
|
|
177
180
|
|
|
178
181
|
# Extract with schema
|
|
179
182
|
docslight extract invoice.pdf --schema schema.json
|
|
@@ -187,7 +190,7 @@ docslight web --host 127.0.0.1 --port 8000
|
|
|
187
190
|
DocSlight Workbench is a local Flask app for visual document processing.
|
|
188
191
|
|
|
189
192
|
```bash
|
|
190
|
-
pip install "docslight[web]"
|
|
193
|
+
pip install "docslight[web]"
|
|
191
194
|
docslight web
|
|
192
195
|
python -m docslight.web_app
|
|
193
196
|
```
|
|
@@ -81,13 +81,27 @@ def _client_from_args(args: argparse.Namespace) -> DocSlight:
|
|
|
81
81
|
)
|
|
82
82
|
|
|
83
83
|
|
|
84
|
-
def _write_output(content: str, output_path: str | None) -> None:
|
|
85
|
-
if output_path is None:
|
|
86
|
-
sys.stdout.write(content)
|
|
87
|
-
if not content.endswith("\n"):
|
|
88
|
-
sys.stdout.write("\n")
|
|
89
|
-
return
|
|
90
|
-
Path(output_path).write_text(content, encoding="utf-8")
|
|
84
|
+
def _write_output(content: str, output_path: str | None) -> None:
|
|
85
|
+
if output_path is None:
|
|
86
|
+
sys.stdout.write(content)
|
|
87
|
+
if not content.endswith("\n"):
|
|
88
|
+
sys.stdout.write("\n")
|
|
89
|
+
return
|
|
90
|
+
path = Path(output_path)
|
|
91
|
+
path.write_text(content, encoding="utf-8")
|
|
92
|
+
sys.stderr.write(f"Wrote {path.resolve()}\n")
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def _write_binary_output(content: bytes, output_path: str | None) -> None:
|
|
96
|
+
if output_path is None:
|
|
97
|
+
output_buffer = getattr(sys.stdout, "buffer", None)
|
|
98
|
+
if output_buffer is None:
|
|
99
|
+
raise CLIUsageError("binary output requires --output")
|
|
100
|
+
output_buffer.write(content)
|
|
101
|
+
return
|
|
102
|
+
path = Path(output_path)
|
|
103
|
+
path.write_bytes(content)
|
|
104
|
+
sys.stderr.write(f"Wrote {path.resolve()}\n")
|
|
91
105
|
|
|
92
106
|
|
|
93
107
|
def _to_pretty_json(data: Any) -> str:
|
|
@@ -125,11 +139,11 @@ def build_parser() -> argparse.ArgumentParser:
|
|
|
125
139
|
parse_parser = subparsers.add_parser("parse", help="Parse a document")
|
|
126
140
|
parse_parser.add_argument("input")
|
|
127
141
|
parse_parser.add_argument("--output", "-o")
|
|
128
|
-
parse_parser.add_argument(
|
|
129
|
-
"--format",
|
|
130
|
-
choices=("markdown", "json", "standard-json"),
|
|
131
|
-
default="markdown",
|
|
132
|
-
)
|
|
142
|
+
parse_parser.add_argument(
|
|
143
|
+
"--format",
|
|
144
|
+
choices=("markdown", "json", "standard-json", "zip"),
|
|
145
|
+
default=None,
|
|
146
|
+
)
|
|
133
147
|
_add_common_options(parse_parser)
|
|
134
148
|
parse_parser.set_defaults(func=_run_parse)
|
|
135
149
|
|
|
@@ -159,17 +173,33 @@ def build_parser() -> argparse.ArgumentParser:
|
|
|
159
173
|
return parser
|
|
160
174
|
|
|
161
175
|
|
|
162
|
-
def _run_parse(args: argparse.Namespace) -> int:
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
176
|
+
def _run_parse(args: argparse.Namespace) -> int:
|
|
177
|
+
parse_format = _resolve_parse_format(args.format, args.output)
|
|
178
|
+
parse_output = "json" if parse_format == "standard-json" else "markdown"
|
|
179
|
+
result = _client_from_args(args).parse(args.input, output=parse_output)
|
|
180
|
+
if parse_format == "zip":
|
|
181
|
+
raw_archive = getattr(result, "raw_archive", None)
|
|
182
|
+
if not isinstance(raw_archive, bytes):
|
|
183
|
+
raise CLIUsageError("parse result did not include a ZIP archive")
|
|
184
|
+
_write_binary_output(raw_archive, args.output)
|
|
185
|
+
elif parse_format == "json":
|
|
186
|
+
content = _to_pretty_json(result.to_json())
|
|
187
|
+
_write_output(content, args.output)
|
|
188
|
+
elif parse_format == "standard-json":
|
|
189
|
+
content = _to_pretty_json(result.to_standard_json())
|
|
190
|
+
_write_output(content, args.output)
|
|
191
|
+
else:
|
|
192
|
+
content = result.to_markdown()
|
|
193
|
+
_write_output(content, args.output)
|
|
194
|
+
return 0
|
|
195
|
+
|
|
196
|
+
|
|
197
|
+
def _resolve_parse_format(parse_format: str | None, output_path: str | None) -> str:
|
|
198
|
+
if parse_format is not None:
|
|
199
|
+
return parse_format
|
|
200
|
+
if output_path is not None and Path(output_path).suffix.lower() == ".zip":
|
|
201
|
+
return "zip"
|
|
202
|
+
return "markdown"
|
|
173
203
|
|
|
174
204
|
|
|
175
205
|
def _run_convert_parse_json(args: argparse.Namespace) -> int:
|
|
@@ -171,11 +171,11 @@ class CloudClient:
|
|
|
171
171
|
return _read_downloaded_result_payload(content)
|
|
172
172
|
return self._response_json(response), None
|
|
173
173
|
|
|
174
|
-
def _prepare_options(self, operation: str, options: dict[str, Any]) -> dict[str, Any]:
|
|
175
|
-
if operation != "extract"
|
|
176
|
-
return options
|
|
177
|
-
|
|
178
|
-
prepared = dict(options)
|
|
174
|
+
def _prepare_options(self, operation: str, options: dict[str, Any]) -> dict[str, Any]:
|
|
175
|
+
if operation != "extract":
|
|
176
|
+
return options
|
|
177
|
+
|
|
178
|
+
prepared = dict(options)
|
|
179
179
|
if "extract_fields" in prepared:
|
|
180
180
|
return prepared
|
|
181
181
|
|
|
@@ -219,7 +219,7 @@ class CloudClient:
|
|
|
219
219
|
return compacted
|
|
220
220
|
|
|
221
221
|
def _headers(self) -> dict[str, str]:
|
|
222
|
-
headers = {"User-Agent": "docslight/0.1.1"}
|
|
222
|
+
headers = {"User-Agent": "docslight/0.1.2"}
|
|
223
223
|
if self.api_key:
|
|
224
224
|
headers["Authorization"] = f"Bearer {self.api_key}"
|
|
225
225
|
headers["x-api-key"] = self.api_key
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: docslight
|
|
3
|
-
Version: 0.1.1
|
|
3
|
+
Version: 0.1.2
|
|
4
4
|
Summary: Lightweight ComPDF document parsing and extraction SDK
|
|
5
5
|
Author-email: ComPDF AI <support@compdf.com>
|
|
6
6
|
License-Expression: MIT
|
|
@@ -78,6 +78,7 @@ Parse any document:
|
|
|
78
78
|
|
|
79
79
|
```bash
|
|
80
80
|
docslight parse invoice.pdf --output invoice.md
|
|
81
|
+
docslight parse invoice.pdf --format zip --output invoice.zip
|
|
81
82
|
```
|
|
82
83
|
|
|
83
84
|
Extract specific fields:
|
|
@@ -208,7 +209,7 @@ client = DocSlight(
|
|
|
208
209
|
|
|
209
210
|
```python
|
|
210
211
|
results = client.parse_batch(["doc1.pdf", "doc2.pdf", "doc3.pdf"])
|
|
211
|
-
for r in results:
|
|
212
|
+
for r in results:[release.ps1](scripts/release.ps1)
|
|
212
213
|
print(r.to_markdown()[:200])
|
|
213
214
|
```
|
|
214
215
|
|
|
@@ -216,12 +217,14 @@ for r in results:
|
|
|
216
217
|
|
|
217
218
|
```bash
|
|
218
219
|
# Parse
|
|
219
|
-
docslight parse invoice.pdf --mode cloud -o invoice.md
|
|
220
|
-
docslight parse invoice.pdf --mode local -o invoice.md
|
|
220
|
+
docslight parse invoice.pdf --mode cloud -o invoice.zip
|
|
221
|
+
docslight parse invoice.pdf --mode cloud --format zip -o invoice.zip
|
|
222
|
+
docslight parse invoice.pdf --mode local -o invoice.zip
|
|
221
223
|
|
|
222
224
|
# Extract
|
|
223
225
|
docslight extract invoice.pdf --mode cloud --fields invoice_number,total_amount
|
|
224
226
|
docslight extract invoice.pdf --mode local --fields invoice_number --local-llm-provider ollama --local-llm-model llama3.1
|
|
227
|
+
docslight extract "D:\pdf\invoice\1.pdf" --mode local --fields invoice_number --local-llm-provider ollama
|
|
225
228
|
|
|
226
229
|
# Extract with schema
|
|
227
230
|
docslight extract invoice.pdf --schema schema.json
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|