docslight 0.1.1__tar.gz → 0.1.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. {docslight-0.1.1 → docslight-0.1.2}/PKG-INFO +7 -4
  2. {docslight-0.1.1 → docslight-0.1.2}/README.md +18 -15
  3. {docslight-0.1.1 → docslight-0.1.2}/docslight/__init__.py +1 -1
  4. {docslight-0.1.1 → docslight-0.1.2}/docslight/cli.py +53 -23
  5. {docslight-0.1.1 → docslight-0.1.2}/docslight/cloud/client.py +6 -6
  6. {docslight-0.1.1 → docslight-0.1.2}/docslight.egg-info/PKG-INFO +7 -4
  7. {docslight-0.1.1 → docslight-0.1.2}/pyproject.toml +1 -1
  8. {docslight-0.1.1 → docslight-0.1.2}/LICENSE +0 -0
  9. {docslight-0.1.1 → docslight-0.1.2}/docslight/client.py +0 -0
  10. {docslight-0.1.1 → docslight-0.1.2}/docslight/cloud/__init__.py +0 -0
  11. {docslight-0.1.1 → docslight-0.1.2}/docslight/config.py +0 -0
  12. {docslight-0.1.1 → docslight-0.1.2}/docslight/exceptions.py +0 -0
  13. {docslight-0.1.1 → docslight-0.1.2}/docslight/local/__init__.py +0 -0
  14. {docslight-0.1.1 → docslight-0.1.2}/docslight/local/layout_blocks.py +0 -0
  15. {docslight-0.1.1 → docslight-0.1.2}/docslight/local/llm_extractor.py +0 -0
  16. {docslight-0.1.1 → docslight-0.1.2}/docslight/local/loaders.py +0 -0
  17. {docslight-0.1.1 → docslight-0.1.2}/docslight/local/markdown.py +0 -0
  18. {docslight-0.1.1 → docslight-0.1.2}/docslight/local/office_loader.py +0 -0
  19. {docslight-0.1.1 → docslight-0.1.2}/docslight/local/paddle_parser.py +0 -0
  20. {docslight-0.1.1 → docslight-0.1.2}/docslight/local/pipeline.py +0 -0
  21. {docslight-0.1.1 → docslight-0.1.2}/docslight/preview.py +0 -0
  22. {docslight-0.1.1 → docslight-0.1.2}/docslight/providers/__init__.py +0 -0
  23. {docslight-0.1.1 → docslight-0.1.2}/docslight/providers/ollama.py +0 -0
  24. {docslight-0.1.1 → docslight-0.1.2}/docslight/providers/openai_compatible.py +0 -0
  25. {docslight-0.1.1 → docslight-0.1.2}/docslight/result.py +0 -0
  26. {docslight-0.1.1 → docslight-0.1.2}/docslight/schemas/__init__.py +0 -0
  27. {docslight-0.1.1 → docslight-0.1.2}/docslight/schemas/fields.py +0 -0
  28. {docslight-0.1.1 → docslight-0.1.2}/docslight/standard_json.py +0 -0
  29. {docslight-0.1.1 → docslight-0.1.2}/docslight/static/app/common.js +0 -0
  30. {docslight-0.1.1 → docslight-0.1.2}/docslight/static/app/docslight-extract.json +0 -0
  31. {docslight-0.1.1 → docslight-0.1.2}/docslight/static/app/extract.js +0 -0
  32. {docslight-0.1.1 → docslight-0.1.2}/docslight/static/app/i18n.js +0 -0
  33. {docslight-0.1.1 → docslight-0.1.2}/docslight/static/app/parse.js +0 -0
  34. {docslight-0.1.1 → docslight-0.1.2}/docslight/static/styles.css +0 -0
  35. {docslight-0.1.1 → docslight-0.1.2}/docslight/templates/base.html +0 -0
  36. {docslight-0.1.1 → docslight-0.1.2}/docslight/templates/extract.html +0 -0
  37. {docslight-0.1.1 → docslight-0.1.2}/docslight/templates/parse.html +0 -0
  38. {docslight-0.1.1 → docslight-0.1.2}/docslight/web_app.py +0 -0
  39. {docslight-0.1.1 → docslight-0.1.2}/docslight.egg-info/SOURCES.txt +0 -0
  40. {docslight-0.1.1 → docslight-0.1.2}/docslight.egg-info/dependency_links.txt +0 -0
  41. {docslight-0.1.1 → docslight-0.1.2}/docslight.egg-info/entry_points.txt +0 -0
  42. {docslight-0.1.1 → docslight-0.1.2}/docslight.egg-info/requires.txt +0 -0
  43. {docslight-0.1.1 → docslight-0.1.2}/docslight.egg-info/top_level.txt +0 -0
  44. {docslight-0.1.1 → docslight-0.1.2}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docslight
3
- Version: 0.1.1
3
+ Version: 0.1.2
4
4
  Summary: Lightweight ComPDF document parsing and extraction SDK
5
5
  Author-email: ComPDF AI <support@compdf.com>
6
6
  License-Expression: MIT
@@ -78,6 +78,7 @@ Parse any document:
78
78
 
79
79
  ```bash
80
80
  docslight parse invoice.pdf --output invoice.md
81
+ docslight parse invoice.pdf --format zip --output invoice.zip
81
82
  ```
82
83
 
83
84
  Extract specific fields:
@@ -208,7 +209,7 @@ client = DocSlight(
208
209
 
209
210
  ```python
210
211
  results = client.parse_batch(["doc1.pdf", "doc2.pdf", "doc3.pdf"])
211
- for r in results:
212
+ for r in results:
212
213
  print(r.to_markdown()[:200])
213
214
  ```
214
215
 
@@ -216,12 +217,14 @@ for r in results:
216
217
 
217
218
  ```bash
218
219
  # Parse
219
- docslight parse invoice.pdf --mode cloud -o invoice.md
220
- docslight parse invoice.pdf --mode local -o invoice.md
220
+ docslight parse invoice.pdf --mode cloud -o invoice.zip
221
+ docslight parse invoice.pdf --mode cloud --format zip -o invoice.zip
222
+ docslight parse invoice.pdf --mode local -o invoice.zip
221
223
 
222
224
  # Extract
223
225
  docslight extract invoice.pdf --mode cloud --fields invoice_number,total_amount
224
226
  docslight extract invoice.pdf --mode local --fields invoice_number --local-llm-provider ollama --local-llm-model llama3.1
227
+ docslight extract "D:\pdf\invoice\1.pdf" --mode local --fields invoice_number --local-llm-provider ollama
225
228
 
226
229
  # Extract with schema
227
230
  docslight extract invoice.pdf --schema schema.json
@@ -1,14 +1,14 @@
1
1
  <p align="center">
2
- <h1 align="center">DocSlight</h1>
2
+ <h1 align="center">DocSlight</h1>
3
3
  <p align="center">Lightweight Python SDK & CLI for document parsing and structured extraction</p>
4
4
  <p align="center">
5
- <a href="https://pypi.org/project/docslight/"><img src="https://img.shields.io/pypi/v/docslight" alt="PyPI"></a>
6
- <a href="https://pypi.org/project/docslight/"><img src="https://img.shields.io/pypi/pyversions/docslight" alt="Python versions"></a>
7
- <a href="LICENSE"><img src="https://img.shields.io/github/license/kdanmobile/docslight" alt="License"></a>
5
+ <a href="https://pypi.org/project/docslight/"><img src="https://img.shields.io/pypi/v/docslight" alt="PyPI"></a>
6
+ <a href="https://pypi.org/project/docslight/"><img src="https://img.shields.io/pypi/pyversions/docslight" alt="Python versions"></a>
7
+ <a href="LICENSE"><img src="https://img.shields.io/github/license/kdanmobile/docslight" alt="License"></a>
8
8
  </p>
9
9
  </p>
10
10
 
11
- ## What is DocSlight?
11
+ ## What is DocSlight?
12
12
 
13
13
  A lightweight Python library that turns PDFs, images, and Office documents into clean Markdown or structured JSON — with one line of code. Works with ComPDF Cloud (recommended) or fully offline with local parsers.
14
14
 
@@ -23,13 +23,14 @@ print(result.to_markdown())
23
23
  ## Quick Start
24
24
 
25
25
  ```bash
26
- pip install docslight
26
+ pip install docslight
27
27
  ```
28
28
 
29
29
  Parse any document:
30
30
 
31
31
  ```bash
32
32
  docslight parse invoice.pdf --output invoice.md
33
+ docslight parse invoice.pdf --format zip --output invoice.zip
33
34
  ```
34
35
 
35
36
  Extract specific fields:
@@ -41,7 +42,7 @@ docslight extract invoice.pdf --fields invoice_number,total_amount
41
42
  Launch the local Web UI workbench:
42
43
 
43
44
  ```bash
44
- pip install "docslight[web]"
45
+ pip install "docslight[web]"
45
46
  docslight web
46
47
  # Open http://127.0.0.1:8000
47
48
 
@@ -65,10 +66,10 @@ python -m docslight.web_app --host 0.0.0.0 --port 8000 --debug
65
66
 
66
67
  | Scenario | Command |
67
68
  |----------|---------|
68
- | Core SDK & CLI | `pip install docslight` |
69
- | + Local parsing (OCR, Office) | `pip install "docslight[local]"` |
70
- | + Local LLM extraction | `pip install "docslight[local,local-llm]"` |
71
- | + Web UI workbench | `pip install "docslight[web]"` |
69
+ | Core SDK & CLI | `pip install docslight` |
70
+ | + Local parsing (OCR, Office) | `pip install "docslight[local]"` |
71
+ | + Local LLM extraction | `pip install "docslight[local,local-llm]"` |
72
+ | + Web UI workbench | `pip install "docslight[web]"` |
72
73
 
73
74
  > Local CPU parsing is experimental. Validate accuracy and latency on your own documents before production use.
74
75
 
@@ -160,7 +161,7 @@ client = DocSlight(
160
161
 
161
162
  ```python
162
163
  results = client.parse_batch(["doc1.pdf", "doc2.pdf", "doc3.pdf"])
163
- for r in results:
164
+ for r in results:
164
165
  print(r.to_markdown()[:200])
165
166
  ```
166
167
 
@@ -168,12 +169,14 @@ for r in results:
168
169
 
169
170
  ```bash
170
171
  # Parse
171
- docslight parse invoice.pdf --mode cloud -o invoice.md
172
- docslight parse invoice.pdf --mode local -o invoice.md
172
+ docslight parse invoice.pdf --mode cloud -o invoice.zip
173
+ docslight parse invoice.pdf --mode cloud --format zip -o invoice.zip
174
+ docslight parse invoice.pdf --mode local -o invoice.zip
173
175
 
174
176
  # Extract
175
177
  docslight extract invoice.pdf --mode cloud --fields invoice_number,total_amount
176
178
  docslight extract invoice.pdf --mode local --fields invoice_number --local-llm-provider ollama --local-llm-model llama3.1
179
+ docslight extract "D:\pdf\invoice\1.pdf" --mode local --fields invoice_number --local-llm-provider ollama
177
180
 
178
181
  # Extract with schema
179
182
  docslight extract invoice.pdf --schema schema.json
@@ -187,7 +190,7 @@ docslight web --host 127.0.0.1 --port 8000
187
190
  DocSlight Workbench is a local Flask app for visual document processing.
188
191
 
189
192
  ```bash
190
- pip install "docslight[web]"
193
+ pip install "docslight[web]"
191
194
  docslight web
192
195
  python -m docslight.web_app
193
196
  ```
@@ -19,7 +19,7 @@ from docslight.exceptions import (
19
19
  )
20
20
  from docslight.result import ExtractResult, ParseResult
21
21
 
22
- __version__ = "0.1.1"
22
+ __version__ = "0.1.2"
23
23
 
24
24
  __all__ = [
25
25
  "AuthenticationError",
@@ -81,13 +81,27 @@ def _client_from_args(args: argparse.Namespace) -> DocSlight:
81
81
  )
82
82
 
83
83
 
84
- def _write_output(content: str, output_path: str | None) -> None:
85
- if output_path is None:
86
- sys.stdout.write(content)
87
- if not content.endswith("\n"):
88
- sys.stdout.write("\n")
89
- return
90
- Path(output_path).write_text(content, encoding="utf-8")
84
+ def _write_output(content: str, output_path: str | None) -> None:
85
+ if output_path is None:
86
+ sys.stdout.write(content)
87
+ if not content.endswith("\n"):
88
+ sys.stdout.write("\n")
89
+ return
90
+ path = Path(output_path)
91
+ path.write_text(content, encoding="utf-8")
92
+ sys.stderr.write(f"Wrote {path.resolve()}\n")
93
+
94
+
95
+ def _write_binary_output(content: bytes, output_path: str | None) -> None:
96
+ if output_path is None:
97
+ output_buffer = getattr(sys.stdout, "buffer", None)
98
+ if output_buffer is None:
99
+ raise CLIUsageError("binary output requires --output")
100
+ output_buffer.write(content)
101
+ return
102
+ path = Path(output_path)
103
+ path.write_bytes(content)
104
+ sys.stderr.write(f"Wrote {path.resolve()}\n")
91
105
 
92
106
 
93
107
  def _to_pretty_json(data: Any) -> str:
@@ -125,11 +139,11 @@ def build_parser() -> argparse.ArgumentParser:
125
139
  parse_parser = subparsers.add_parser("parse", help="Parse a document")
126
140
  parse_parser.add_argument("input")
127
141
  parse_parser.add_argument("--output", "-o")
128
- parse_parser.add_argument(
129
- "--format",
130
- choices=("markdown", "json", "standard-json"),
131
- default="markdown",
132
- )
142
+ parse_parser.add_argument(
143
+ "--format",
144
+ choices=("markdown", "json", "standard-json", "zip"),
145
+ default=None,
146
+ )
133
147
  _add_common_options(parse_parser)
134
148
  parse_parser.set_defaults(func=_run_parse)
135
149
 
@@ -159,17 +173,33 @@ def build_parser() -> argparse.ArgumentParser:
159
173
  return parser
160
174
 
161
175
 
162
- def _run_parse(args: argparse.Namespace) -> int:
163
- parse_output = "json" if args.format == "standard-json" else args.format
164
- result = _client_from_args(args).parse(args.input, output=parse_output)
165
- if args.format == "json":
166
- content = _to_pretty_json(result.to_json())
167
- elif args.format == "standard-json":
168
- content = _to_pretty_json(result.to_standard_json())
169
- else:
170
- content = result.to_markdown()
171
- _write_output(content, args.output)
172
- return 0
176
+ def _run_parse(args: argparse.Namespace) -> int:
177
+ parse_format = _resolve_parse_format(args.format, args.output)
178
+ parse_output = "json" if parse_format == "standard-json" else "markdown"
179
+ result = _client_from_args(args).parse(args.input, output=parse_output)
180
+ if parse_format == "zip":
181
+ raw_archive = getattr(result, "raw_archive", None)
182
+ if not isinstance(raw_archive, bytes):
183
+ raise CLIUsageError("parse result did not include a ZIP archive")
184
+ _write_binary_output(raw_archive, args.output)
185
+ elif parse_format == "json":
186
+ content = _to_pretty_json(result.to_json())
187
+ _write_output(content, args.output)
188
+ elif parse_format == "standard-json":
189
+ content = _to_pretty_json(result.to_standard_json())
190
+ _write_output(content, args.output)
191
+ else:
192
+ content = result.to_markdown()
193
+ _write_output(content, args.output)
194
+ return 0
195
+
196
+
197
+ def _resolve_parse_format(parse_format: str | None, output_path: str | None) -> str:
198
+ if parse_format is not None:
199
+ return parse_format
200
+ if output_path is not None and Path(output_path).suffix.lower() == ".zip":
201
+ return "zip"
202
+ return "markdown"
173
203
 
174
204
 
175
205
  def _run_convert_parse_json(args: argparse.Namespace) -> int:
@@ -171,11 +171,11 @@ class CloudClient:
171
171
  return _read_downloaded_result_payload(content)
172
172
  return self._response_json(response), None
173
173
 
174
- def _prepare_options(self, operation: str, options: dict[str, Any]) -> dict[str, Any]:
175
- if operation != "extract" or not self._uses_custom_operation_urls():
176
- return options
177
-
178
- prepared = dict(options)
174
+ def _prepare_options(self, operation: str, options: dict[str, Any]) -> dict[str, Any]:
175
+ if operation != "extract":
176
+ return options
177
+
178
+ prepared = dict(options)
179
179
  if "extract_fields" in prepared:
180
180
  return prepared
181
181
 
@@ -219,7 +219,7 @@ class CloudClient:
219
219
  return compacted
220
220
 
221
221
  def _headers(self) -> dict[str, str]:
222
- headers = {"User-Agent": "docslight/0.1.1"}
222
+ headers = {"User-Agent": "docslight/0.1.2"}
223
223
  if self.api_key:
224
224
  headers["Authorization"] = f"Bearer {self.api_key}"
225
225
  headers["x-api-key"] = self.api_key
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docslight
3
- Version: 0.1.1
3
+ Version: 0.1.2
4
4
  Summary: Lightweight ComPDF document parsing and extraction SDK
5
5
  Author-email: ComPDF AI <support@compdf.com>
6
6
  License-Expression: MIT
@@ -78,6 +78,7 @@ Parse any document:
78
78
 
79
79
  ```bash
80
80
  docslight parse invoice.pdf --output invoice.md
81
+ docslight parse invoice.pdf --format zip --output invoice.zip
81
82
  ```
82
83
 
83
84
  Extract specific fields:
@@ -208,7 +209,7 @@ client = DocSlight(
208
209
 
209
210
  ```python
210
211
  results = client.parse_batch(["doc1.pdf", "doc2.pdf", "doc3.pdf"])
211
- for r in results:
212
+ for r in results:
212
213
  print(r.to_markdown()[:200])
213
214
  ```
214
215
 
@@ -216,12 +217,14 @@ for r in results:
216
217
 
217
218
  ```bash
218
219
  # Parse
219
- docslight parse invoice.pdf --mode cloud -o invoice.md
220
- docslight parse invoice.pdf --mode local -o invoice.md
220
+ docslight parse invoice.pdf --mode cloud -o invoice.zip
221
+ docslight parse invoice.pdf --mode cloud --format zip -o invoice.zip
222
+ docslight parse invoice.pdf --mode local -o invoice.zip
221
223
 
222
224
  # Extract
223
225
  docslight extract invoice.pdf --mode cloud --fields invoice_number,total_amount
224
226
  docslight extract invoice.pdf --mode local --fields invoice_number --local-llm-provider ollama --local-llm-model llama3.1
227
+ docslight extract "D:\pdf\invoice\1.pdf" --mode local --fields invoice_number --local-llm-provider ollama
225
228
 
226
229
  # Extract with schema
227
230
  docslight extract invoice.pdf --schema schema.json
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "docslight"
7
- version = "0.1.1"
7
+ version = "0.1.2"
8
8
  description = "Lightweight ComPDF document parsing and extraction SDK"
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.10"
File without changes
File without changes
File without changes
File without changes
File without changes