docslight 0.1.2__tar.gz → 0.1.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. {docslight-0.1.2 → docslight-0.1.4}/PKG-INFO +27 -30
  2. {docslight-0.1.2 → docslight-0.1.4}/README.md +38 -40
  3. {docslight-0.1.2 → docslight-0.1.4}/docslight/__init__.py +1 -1
  4. {docslight-0.1.2 → docslight-0.1.4}/docslight/cli.py +10 -10
  5. {docslight-0.1.2 → docslight-0.1.4}/docslight/cloud/client.py +1 -1
  6. {docslight-0.1.2 → docslight-0.1.4}/docslight/preview.py +1 -1
  7. {docslight-0.1.2 → docslight-0.1.4}/docslight/providers/openai_compatible.py +3 -3
  8. {docslight-0.1.2 → docslight-0.1.4}/docslight/schemas/fields.py +2 -2
  9. {docslight-0.1.2 → docslight-0.1.4}/docslight/web_app.py +32 -31
  10. {docslight-0.1.2 → docslight-0.1.4}/docslight.egg-info/PKG-INFO +27 -30
  11. {docslight-0.1.2 → docslight-0.1.4}/docslight.egg-info/SOURCES.txt +1 -10
  12. {docslight-0.1.2 → docslight-0.1.4}/docslight.egg-info/requires.txt +8 -11
  13. {docslight-0.1.2 → docslight-0.1.4}/pyproject.toml +40 -46
  14. docslight-0.1.2/docslight/static/app/common.js +0 -668
  15. docslight-0.1.2/docslight/static/app/docslight-extract.json +0 -307
  16. docslight-0.1.2/docslight/static/app/extract.js +0 -394
  17. docslight-0.1.2/docslight/static/app/i18n.js +0 -405
  18. docslight-0.1.2/docslight/static/app/parse.js +0 -161
  19. docslight-0.1.2/docslight/static/styles.css +0 -878
  20. docslight-0.1.2/docslight/templates/base.html +0 -36
  21. docslight-0.1.2/docslight/templates/extract.html +0 -123
  22. docslight-0.1.2/docslight/templates/parse.html +0 -81
  23. {docslight-0.1.2 → docslight-0.1.4}/LICENSE +0 -0
  24. {docslight-0.1.2 → docslight-0.1.4}/docslight/client.py +0 -0
  25. {docslight-0.1.2 → docslight-0.1.4}/docslight/cloud/__init__.py +0 -0
  26. {docslight-0.1.2 → docslight-0.1.4}/docslight/config.py +0 -0
  27. {docslight-0.1.2 → docslight-0.1.4}/docslight/exceptions.py +0 -0
  28. {docslight-0.1.2 → docslight-0.1.4}/docslight/local/__init__.py +0 -0
  29. {docslight-0.1.2 → docslight-0.1.4}/docslight/local/layout_blocks.py +0 -0
  30. {docslight-0.1.2 → docslight-0.1.4}/docslight/local/llm_extractor.py +0 -0
  31. {docslight-0.1.2 → docslight-0.1.4}/docslight/local/loaders.py +0 -0
  32. {docslight-0.1.2 → docslight-0.1.4}/docslight/local/markdown.py +0 -0
  33. {docslight-0.1.2 → docslight-0.1.4}/docslight/local/office_loader.py +0 -0
  34. {docslight-0.1.2 → docslight-0.1.4}/docslight/local/paddle_parser.py +0 -0
  35. {docslight-0.1.2 → docslight-0.1.4}/docslight/local/pipeline.py +0 -0
  36. {docslight-0.1.2 → docslight-0.1.4}/docslight/providers/__init__.py +0 -0
  37. {docslight-0.1.2 → docslight-0.1.4}/docslight/providers/ollama.py +0 -0
  38. {docslight-0.1.2 → docslight-0.1.4}/docslight/result.py +0 -0
  39. {docslight-0.1.2 → docslight-0.1.4}/docslight/schemas/__init__.py +0 -0
  40. {docslight-0.1.2 → docslight-0.1.4}/docslight/standard_json.py +0 -0
  41. {docslight-0.1.2 → docslight-0.1.4}/docslight.egg-info/dependency_links.txt +0 -0
  42. {docslight-0.1.2 → docslight-0.1.4}/docslight.egg-info/entry_points.txt +0 -0
  43. {docslight-0.1.2 → docslight-0.1.4}/docslight.egg-info/top_level.txt +0 -0
  44. {docslight-0.1.2 → docslight-0.1.4}/setup.cfg +0 -0
@@ -1,18 +1,16 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docslight
3
- Version: 0.1.2
3
+ Version: 0.1.4
4
4
  Summary: Lightweight ComPDF document parsing and extraction SDK
5
5
  Author-email: ComPDF AI <support@compdf.com>
6
6
  License-Expression: MIT
7
- Requires-Python: >=3.10
7
+ Requires-Python: <=3.13,>=3.10
8
8
  Description-Content-Type: text/markdown
9
9
  License-File: LICENSE
10
10
  Requires-Dist: requests>=2.31.0
11
11
  Requires-Dist: pydantic>=2.5.0
12
12
  Requires-Dist: typing-extensions>=4.8.0; python_version < "3.11"
13
13
  Requires-Dist: tomli>=2.0.1; python_version < "3.11"
14
- Requires-Dist: flask>=3.1.3
15
- Requires-Dist: werkzeug>=3.1.8
16
14
  Provides-Extra: local
17
15
  Requires-Dist: Pillow>=10.0.0; extra == "local"
18
16
  Requires-Dist: PyMuPDF>=1.23.0; extra == "local"
@@ -22,13 +20,10 @@ Requires-Dist: paddleocr[doc-parser]>=3.3.0; extra == "local"
22
20
  Requires-Dist: python-docx>=1.1.0; extra == "local"
23
21
  Requires-Dist: python-pptx>=0.6.23; extra == "local"
24
22
  Requires-Dist: openpyxl>=3.1.0; extra == "local"
25
- Provides-Extra: local-llm
26
- Requires-Dist: openai>=1.0.0; extra == "local-llm"
23
+ Requires-Dist: openai>=1.0.0; extra == "local"
27
24
  Provides-Extra: web
28
- Requires-Dist: Flask>=3.0.0; extra == "web"
29
- Requires-Dist: Werkzeug>=3.0.0; extra == "web"
30
- Provides-Extra: web-test
31
- Requires-Dist: playwright>=1.40.0; extra == "web-test"
25
+ Requires-Dist: flask>=3.1.3; extra == "web"
26
+ Requires-Dist: werkzeug>=3.1.8; extra == "web"
32
27
  Provides-Extra: dev
33
28
  Requires-Dist: pytest>=7.4.0; extra == "dev"
34
29
  Requires-Dist: pytest-cov>=4.1.0; extra == "dev"
@@ -36,14 +31,18 @@ Requires-Dist: ruff>=0.1.0; extra == "dev"
36
31
  Requires-Dist: mypy>=1.7.0; extra == "dev"
37
32
  Requires-Dist: build>=1.0.0; extra == "dev"
38
33
  Requires-Dist: twine>=4.0.0; extra == "dev"
34
+ Requires-Dist: playwright>=1.40.0; extra == "dev"
39
35
  Requires-Dist: Pillow>=10.0.0; extra == "dev"
40
36
  Requires-Dist: PyMuPDF>=1.23.0; extra == "dev"
41
37
  Requires-Dist: numpy>=1.24.0; extra == "dev"
38
+ Requires-Dist: paddlepaddle>=3.3.0; extra == "dev"
39
+ Requires-Dist: paddleocr[doc-parser]>=3.3.0; extra == "dev"
42
40
  Requires-Dist: python-docx>=1.1.0; extra == "dev"
43
41
  Requires-Dist: python-pptx>=0.6.23; extra == "dev"
44
42
  Requires-Dist: openpyxl>=3.1.0; extra == "dev"
45
- Requires-Dist: Flask>=3.0.0; extra == "dev"
46
- Requires-Dist: Werkzeug>=3.0.0; extra == "dev"
43
+ Requires-Dist: openai>=1.0.0; extra == "dev"
44
+ Requires-Dist: flask>=3.1.3; extra == "dev"
45
+ Requires-Dist: werkzeug>=3.1.8; extra == "dev"
47
46
  Dynamic: license-file
48
47
 
49
48
  <p align="center">
@@ -87,14 +86,13 @@ Extract specific fields:
87
86
  docslight extract invoice.pdf --fields invoice_number,total_amount
88
87
  ```
89
88
 
90
- Launch the local Web UI workbench:
89
+ Launch the local API server:
91
90
 
92
91
  ```bash
93
- pip install "docslight[web]"
94
92
  docslight web
95
- # Open http://127.0.0.1:8000
93
+ # Health: http://127.0.0.1:8000/api/health
96
94
 
97
- # Or run the same Web UI directly as a module
95
+ # Or run the same API server directly as a module
98
96
  python -m docslight.web_app --host 0.0.0.0 --port 8000 --debug
99
97
  ```
100
98
 
@@ -104,7 +102,7 @@ python -m docslight.web_app --host 0.0.0.0 --port 8000 --debug
104
102
  - **Parse → Markdown** — Convert PDF, DOCX, PPTX, XLSX, and images (PNG, JPG, TIFF, BMP, WebP) to clean Markdown
105
103
  - **Extract → JSON** — Pull structured data by field list, JSON Schema, or structured template (key-value + table extraction)
106
104
  - **CLI first** — Full-featured command-line interface, script-friendly
107
- - **Web UI** — Local Flask workbench with drag-and-drop, live preview with bbox highlights, and a Fields Builder UI
105
+ - **API server** — Local Flask backend exposing parse, extract, preview, health, and system-info endpoints
108
106
  - **Batch processing** — `parse_batch()` / `extract_batch()` for multiple files
109
107
  - **Local LLM extraction** — Ollama or any OpenAI-compatible provider for offline extraction
110
108
  - **Document types** — Classify and route documents by type for cloud extraction
@@ -116,8 +114,8 @@ python -m docslight.web_app --host 0.0.0.0 --port 8000 --debug
116
114
  |----------|---------|
117
115
  | Core SDK & CLI | `pip install docslight` |
118
116
  | + Local parsing (OCR, Office) | `pip install "docslight[local]"` |
119
- | + Local LLM extraction | `pip install "docslight[local,local-llm]"` |
120
- | + Web UI workbench | `pip install "docslight[web]"` |
117
+ | + API server | `pip install "docslight[web]"` |
118
+ | + Local parsing and API server | `pip install "docslight[local,web]"` |
121
119
 
122
120
  > Local CPU parsing is experimental. Validate accuracy and latency on your own documents before production use.
123
121
 
@@ -209,7 +207,7 @@ client = DocSlight(
209
207
 
210
208
  ```python
211
209
  results = client.parse_batch(["doc1.pdf", "doc2.pdf", "doc3.pdf"])
212
- for r in results:[release.ps1](scripts/release.ps1)
210
+ for r in results:
213
211
  print(r.to_markdown()[:200])
214
212
  ```
215
213
 
@@ -224,30 +222,29 @@ docslight parse invoice.pdf --mode local -o invoice.zip
224
222
  # Extract
225
223
  docslight extract invoice.pdf --mode cloud --fields invoice_number,total_amount
226
224
  docslight extract invoice.pdf --mode local --fields invoice_number --local-llm-provider ollama --local-llm-model llama3.1
227
- docslight extract "D:\pdf\invoice\1.pdf" --mode local --fields invoice_number --local-llm-provider ollama
225
+ docslight extract "D:\pdf\invoice\1.pdf" --mode local --fields invoice_number --local-llm-provider ollama --local-llm-model llama3.1
228
226
 
229
227
  # Extract with schema
230
228
  docslight extract invoice.pdf --schema schema.json
231
229
 
232
- # Web UI
230
+ # API server
233
231
  docslight web --host 127.0.0.1 --port 8000
234
232
  ```
235
233
 
236
- ## Web UI Workbench
234
+ ## API Server
237
235
 
238
- DocSlight Workbench is a local Flask app for visual document processing.
236
+ DocSlight includes a local Flask API server for document processing. Frontend assets are not bundled in this package.
239
237
 
240
238
  ```bash
241
- pip install "docslight[web]"
242
239
  docslight web
243
240
  python -m docslight.web_app
244
241
  ```
245
242
 
246
- - **Parse & Extract tabs** — Switch between parsing and extraction workflows
247
- - **Drag-and-drop upload** — PDF, images, DOCX, PPTX, XLSX
248
- - **Live preview** — PDF page rendering with bbox highlight overlays
249
- - **Fields Builder** — Structured UI for building key-value and table extraction templates
250
- - **Download results** — One-click download of Markdown or JSON output
243
+ - `GET /api/health`
244
+ - `GET /api/system-info`
245
+ - `POST /api/parse`
246
+ - `POST /api/extract`
247
+ - `POST /api/preview`
251
248
 
252
249
  ## Environment Variables
253
250
 
@@ -39,16 +39,15 @@ Extract specific fields:
39
39
  docslight extract invoice.pdf --fields invoice_number,total_amount
40
40
  ```
41
41
 
42
- Launch the local Web UI workbench:
43
-
44
- ```bash
45
- pip install "docslight[web]"
46
- docslight web
47
- # Open http://127.0.0.1:8000
48
-
49
- # Or run the same Web UI directly as a module
50
- python -m docslight.web_app --host 0.0.0.0 --port 8000 --debug
51
- ```
42
+ Launch the local API server:
43
+
44
+ ```bash
45
+ docslight web
46
+ # Health: http://127.0.0.1:8000/api/health
47
+
48
+ # Or run the same API server directly as a module
49
+ python -m docslight.web_app --host 0.0.0.0 --port 8000 --debug
50
+ ```
52
51
 
53
52
  ## Features
54
53
 
@@ -56,7 +55,7 @@ python -m docslight.web_app --host 0.0.0.0 --port 8000 --debug
56
55
  - **Parse → Markdown** — Convert PDF, DOCX, PPTX, XLSX, and images (PNG, JPG, TIFF, BMP, WebP) to clean Markdown
57
56
  - **Extract → JSON** — Pull structured data by field list, JSON Schema, or structured template (key-value + table extraction)
58
57
  - **CLI first** — Full-featured command-line interface, script-friendly
59
- - **Web UI** — Local Flask workbench with drag-and-drop, live preview with bbox highlights, and a Fields Builder UI
58
+ - **API server** — Local Flask backend exposing parse, extract, preview, health, and system-info endpoints
60
59
  - **Batch processing** — `parse_batch()` / `extract_batch()` for multiple files
61
60
  - **Local LLM extraction** — Ollama or any OpenAI-compatible provider for offline extraction
62
61
  - **Document types** — Classify and route documents by type for cloud extraction
@@ -66,10 +65,10 @@ python -m docslight.web_app --host 0.0.0.0 --port 8000 --debug
66
65
 
67
66
  | Scenario | Command |
68
67
  |----------|---------|
69
- | Core SDK & CLI | `pip install docslight` |
70
- | + Local parsing (OCR, Office) | `pip install "docslight[local]"` |
71
- | + Local LLM extraction | `pip install "docslight[local,local-llm]"` |
72
- | + Web UI workbench | `pip install "docslight[web]"` |
68
+ | Core SDK & CLI | `pip install docslight` |
69
+ | + Local parsing (OCR, Office) | `pip install "docslight[local]"` |
70
+ | + API server | `pip install "docslight[web]"` |
71
+ | + Local parsing and API server | `pip install "docslight[local,web]"` |
73
72
 
74
73
  > Local CPU parsing is experimental. Validate accuracy and latency on your own documents before production use.
75
74
 
@@ -160,9 +159,9 @@ client = DocSlight(
160
159
  ### Batch Processing
161
160
 
162
161
  ```python
163
- results = client.parse_batch(["doc1.pdf", "doc2.pdf", "doc3.pdf"])
164
- for r in results:[release.ps1](scripts/release.ps1)
165
- print(r.to_markdown()[:200])
162
+ results = client.parse_batch(["doc1.pdf", "doc2.pdf", "doc3.pdf"])
163
+ for r in results:
164
+ print(r.to_markdown()[:200])
166
165
  ```
167
166
 
168
167
  ## CLI Usage
@@ -174,32 +173,31 @@ docslight parse invoice.pdf --mode cloud --format zip -o invoice.zip
174
173
  docslight parse invoice.pdf --mode local -o invoice.zip
175
174
 
176
175
  # Extract
177
- docslight extract invoice.pdf --mode cloud --fields invoice_number,total_amount
178
- docslight extract invoice.pdf --mode local --fields invoice_number --local-llm-provider ollama --local-llm-model llama3.1
179
- docslight extract "D:\pdf\invoice\1.pdf" --mode local --fields invoice_number --local-llm-provider ollama
176
+ docslight extract invoice.pdf --mode cloud --fields invoice_number,total_amount
177
+ docslight extract invoice.pdf --mode local --fields invoice_number --local-llm-provider ollama --local-llm-model llama3.1
178
+ docslight extract "D:\pdf\invoice\1.pdf" --mode local --fields invoice_number --local-llm-provider ollama --local-llm-model llama3.1
180
179
 
181
180
  # Extract with schema
182
181
  docslight extract invoice.pdf --schema schema.json
183
182
 
184
- # Web UI
185
- docslight web --host 127.0.0.1 --port 8000
186
- ```
187
-
188
- ## Web UI Workbench
189
-
190
- DocSlight Workbench is a local Flask app for visual document processing.
191
-
192
- ```bash
193
- pip install "docslight[web]"
194
- docslight web
195
- python -m docslight.web_app
196
- ```
197
-
198
- - **Parse & Extract tabs** — Switch between parsing and extraction workflows
199
- - **Drag-and-drop upload** — PDF, images, DOCX, PPTX, XLSX
200
- - **Live preview** — PDF page rendering with bbox highlight overlays
201
- - **Fields Builder** — Structured UI for building key-value and table extraction templates
202
- - **Download results** — One-click download of Markdown or JSON output
183
+ # API server
184
+ docslight web --host 127.0.0.1 --port 8000
185
+ ```
186
+
187
+ ## API Server
188
+
189
+ DocSlight includes a local Flask API server for document processing. Frontend assets are not bundled in this package.
190
+
191
+ ```bash
192
+ docslight web
193
+ python -m docslight.web_app
194
+ ```
195
+
196
+ - `GET /api/health`
197
+ - `GET /api/system-info`
198
+ - `POST /api/parse`
199
+ - `POST /api/extract`
200
+ - `POST /api/preview`
203
201
 
204
202
  ## Environment Variables
205
203
 
@@ -19,7 +19,7 @@ from docslight.exceptions import (
19
19
  )
20
20
  from docslight.result import ExtractResult, ParseResult
21
21
 
22
- __version__ = "0.1.2"
22
+ __version__ = "0.1.4"
23
23
 
24
24
  __all__ = [
25
25
  "AuthenticationError",
@@ -108,7 +108,7 @@ def _to_pretty_json(data: Any) -> str:
108
108
  return json.dumps(data, ensure_ascii=False, indent=2)
109
109
 
110
110
 
111
- def run_web_app(host: str, port: int, debug: bool) -> None:
111
+ def run_web_app(host: str, port: int, debug: bool) -> None:
112
112
  """Run the optional Flask web application."""
113
113
  if importlib.util.find_spec("docslight.web_app") is None:
114
114
  raise CLIUsageError(WEB_EXTRA_ERROR)
@@ -119,8 +119,8 @@ def run_web_app(host: str, port: int, debug: bool) -> None:
119
119
  if exc.name in {"flask", "werkzeug"}:
120
120
  raise CLIUsageError(WEB_EXTRA_ERROR) from exc
121
121
  raise
122
- _run_web_app = web_app.run_web_app
123
- _run_web_app(host, port, debug)
122
+ _run_web_app = web_app.run_web_app
123
+ _run_web_app(host, port, debug)
124
124
 
125
125
 
126
126
  def _print_cli_error(error: Exception) -> int:
@@ -165,10 +165,10 @@ def build_parser() -> argparse.ArgumentParser:
165
165
  extract_parser.set_defaults(func=_run_extract)
166
166
 
167
167
  web_parser = subparsers.add_parser("web", help="Run the web application")
168
- web_parser.add_argument("--host", default="127.0.0.1")
169
- web_parser.add_argument("--port", type=int, default=8000)
170
- web_parser.add_argument("--debug", action="store_true")
171
- web_parser.set_defaults(func=_run_web)
168
+ web_parser.add_argument("--host", default="127.0.0.1")
169
+ web_parser.add_argument("--port", type=int, default=8000)
170
+ web_parser.add_argument("--debug", action="store_true")
171
+ web_parser.set_defaults(func=_run_web)
172
172
 
173
173
  return parser
174
174
 
@@ -227,9 +227,9 @@ def _run_extract(args: argparse.Namespace) -> int:
227
227
  return 0
228
228
 
229
229
 
230
- def _run_web(args: argparse.Namespace) -> int:
231
- run_web_app(args.host, args.port, args.debug)
232
- return 0
230
+ def _run_web(args: argparse.Namespace) -> int:
231
+ run_web_app(args.host, args.port, args.debug)
232
+ return 0
233
233
 
234
234
 
235
235
  def main(argv: Sequence[str] | None = None) -> int:
@@ -219,7 +219,7 @@ class CloudClient:
219
219
  return compacted
220
220
 
221
221
  def _headers(self) -> dict[str, str]:
222
- headers = {"User-Agent": "docslight/0.1.2"}
222
+ headers = {"User-Agent": "docslight/0.1.4"}
223
223
  if self.api_key:
224
224
  headers["Authorization"] = f"Bearer {self.api_key}"
225
225
  headers["x-api-key"] = self.api_key
@@ -1,4 +1,4 @@
1
- """Preview rendering helpers for the local Web UI."""
1
+ """Preview rendering helpers for the local API server."""
2
2
 
3
3
  from __future__ import annotations
4
4
 
@@ -6,9 +6,9 @@ from typing import Any
6
6
 
7
7
  from docslight.exceptions import DependencyMissingError, LocalProcessingError
8
8
 
9
- INSTALL_LOCAL_LLM_MESSAGE = (
10
- "Install local LLM dependencies with: pip install 'docslight[local-llm]'"
11
- )
9
+ INSTALL_LOCAL_LLM_MESSAGE = (
10
+ "Install local dependencies with: pip install 'docslight[local]'"
11
+ )
12
12
  NO_TEXT_CONTENT_MESSAGE = "OpenAI-compatible provider returned no text content"
13
13
  REQUEST_FAILED_MESSAGE = "OpenAI-compatible provider request failed"
14
14
 
@@ -11,8 +11,8 @@ NormalizedFields = list[str] | StructuredFields | None
11
11
  ExtractSchema = dict[str, Any]
12
12
 
13
13
 
14
- def normalize_fields(fields: list[str] | str | StructuredFields | None) -> NormalizedFields:
15
- """Normalize extraction fields from SDK, CLI, or Web UI inputs."""
14
+ def normalize_fields(fields: list[str] | str | StructuredFields | None) -> NormalizedFields:
15
+ """Normalize extraction fields from SDK, CLI, or API inputs."""
16
16
  if fields is None:
17
17
  return None
18
18
  if isinstance(fields, str):
@@ -2,19 +2,19 @@
2
2
 
3
3
  from __future__ import annotations
4
4
 
5
- import argparse
6
- import base64
7
- import json
8
- import logging
9
- import sys
10
- import tempfile
5
+ import argparse
6
+ import base64
7
+ import json
8
+ import logging
9
+ import sys
10
+ import tempfile
11
11
  from collections.abc import Callable
12
12
  from io import BytesIO
13
13
  from json import JSONDecodeError
14
14
  from pathlib import Path
15
15
  from typing import Any, cast
16
16
 
17
- from flask import Flask, Response, jsonify, redirect, render_template, request, send_file, url_for
17
+ from flask import Flask, Response, jsonify, request, send_file
18
18
  from werkzeug.datastructures import FileStorage
19
19
  from werkzeug.utils import secure_filename
20
20
 
@@ -58,21 +58,18 @@ OFFICE_PREVIEW_UNSUPPORTED_MESSAGE = (
58
58
  LOG_FORMAT = "%(levelname)s:%(name)s:%(message)s"
59
59
 
60
60
 
61
- def create_app(docslight_factory: Callable[..., Any] = DocSlight) -> Flask:
62
- """Create the local DocSlight Flask application."""
63
- app = Flask(__name__)
64
-
65
- @app.get("/")
66
- def index() -> Any:
67
- return redirect(url_for("parse_page"))
68
-
69
- @app.get("/parse")
70
- def parse_page() -> str:
71
- return render_template("parse.html", active_page="parse")
72
-
73
- @app.get("/extract")
74
- def extract_page() -> str:
75
- return render_template("extract.html", active_page="extract")
61
+ def create_app(docslight_factory: Callable[..., Any] = DocSlight) -> Flask:
62
+ """Create the local DocSlight Flask application."""
63
+ app = Flask(__name__)
64
+
65
+ @app.get("/")
66
+ def index() -> Any:
67
+ return jsonify(
68
+ {
69
+ "status": "healthy",
70
+ "service": "docslight-web",
71
+ }
72
+ )
76
73
 
77
74
  @app.get("/api/health")
78
75
  def health() -> Any:
@@ -160,7 +157,11 @@ def create_app(docslight_factory: Callable[..., Any] = DocSlight) -> Flask:
160
157
  return app
161
158
 
162
159
 
163
- def run_web_app(host: str = "127.0.0.1", port: int = 8000, debug: bool = False) -> None:
160
+ def run_web_app(
161
+ host: str = "127.0.0.1",
162
+ port: int = 8000,
163
+ debug: bool = False,
164
+ ) -> None:
164
165
  """Run the local DocSlight web application."""
165
166
  _configure_web_logging(debug)
166
167
  create_app().run(host=host, port=port, debug=debug)
@@ -183,17 +184,17 @@ def build_parser() -> argparse.ArgumentParser:
183
184
  prog="python -m docslight.web_app",
184
185
  description="Run the DocSlight web application.",
185
186
  )
186
- parser.add_argument("--host", default="127.0.0.1")
187
- parser.add_argument("--port", type=int, default=8000)
188
- parser.add_argument("--debug", action="store_true")
189
- return parser
187
+ parser.add_argument("--host", default="127.0.0.1")
188
+ parser.add_argument("--port", type=int, default=8000)
189
+ parser.add_argument("--debug", action="store_true")
190
+ return parser
190
191
 
191
192
 
192
193
  def main(argv: list[str] | None = None) -> int:
193
- """Run the standalone DocSlight web application entrypoint."""
194
- args = build_parser().parse_args(argv)
195
- run_web_app(args.host, args.port, args.debug)
196
- return 0
194
+ """Run the standalone DocSlight web application entrypoint."""
195
+ args = build_parser().parse_args(argv)
196
+ run_web_app(args.host, args.port, args.debug)
197
+ return 0
197
198
 
198
199
 
199
200
  def local_llm_from_form(form: Any) -> dict[str, str] | None:
@@ -1,18 +1,16 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docslight
3
- Version: 0.1.2
3
+ Version: 0.1.4
4
4
  Summary: Lightweight ComPDF document parsing and extraction SDK
5
5
  Author-email: ComPDF AI <support@compdf.com>
6
6
  License-Expression: MIT
7
- Requires-Python: >=3.10
7
+ Requires-Python: <=3.13,>=3.10
8
8
  Description-Content-Type: text/markdown
9
9
  License-File: LICENSE
10
10
  Requires-Dist: requests>=2.31.0
11
11
  Requires-Dist: pydantic>=2.5.0
12
12
  Requires-Dist: typing-extensions>=4.8.0; python_version < "3.11"
13
13
  Requires-Dist: tomli>=2.0.1; python_version < "3.11"
14
- Requires-Dist: flask>=3.1.3
15
- Requires-Dist: werkzeug>=3.1.8
16
14
  Provides-Extra: local
17
15
  Requires-Dist: Pillow>=10.0.0; extra == "local"
18
16
  Requires-Dist: PyMuPDF>=1.23.0; extra == "local"
@@ -22,13 +20,10 @@ Requires-Dist: paddleocr[doc-parser]>=3.3.0; extra == "local"
22
20
  Requires-Dist: python-docx>=1.1.0; extra == "local"
23
21
  Requires-Dist: python-pptx>=0.6.23; extra == "local"
24
22
  Requires-Dist: openpyxl>=3.1.0; extra == "local"
25
- Provides-Extra: local-llm
26
- Requires-Dist: openai>=1.0.0; extra == "local-llm"
23
+ Requires-Dist: openai>=1.0.0; extra == "local"
27
24
  Provides-Extra: web
28
- Requires-Dist: Flask>=3.0.0; extra == "web"
29
- Requires-Dist: Werkzeug>=3.0.0; extra == "web"
30
- Provides-Extra: web-test
31
- Requires-Dist: playwright>=1.40.0; extra == "web-test"
25
+ Requires-Dist: flask>=3.1.3; extra == "web"
26
+ Requires-Dist: werkzeug>=3.1.8; extra == "web"
32
27
  Provides-Extra: dev
33
28
  Requires-Dist: pytest>=7.4.0; extra == "dev"
34
29
  Requires-Dist: pytest-cov>=4.1.0; extra == "dev"
@@ -36,14 +31,18 @@ Requires-Dist: ruff>=0.1.0; extra == "dev"
36
31
  Requires-Dist: mypy>=1.7.0; extra == "dev"
37
32
  Requires-Dist: build>=1.0.0; extra == "dev"
38
33
  Requires-Dist: twine>=4.0.0; extra == "dev"
34
+ Requires-Dist: playwright>=1.40.0; extra == "dev"
39
35
  Requires-Dist: Pillow>=10.0.0; extra == "dev"
40
36
  Requires-Dist: PyMuPDF>=1.23.0; extra == "dev"
41
37
  Requires-Dist: numpy>=1.24.0; extra == "dev"
38
+ Requires-Dist: paddlepaddle>=3.3.0; extra == "dev"
39
+ Requires-Dist: paddleocr[doc-parser]>=3.3.0; extra == "dev"
42
40
  Requires-Dist: python-docx>=1.1.0; extra == "dev"
43
41
  Requires-Dist: python-pptx>=0.6.23; extra == "dev"
44
42
  Requires-Dist: openpyxl>=3.1.0; extra == "dev"
45
- Requires-Dist: Flask>=3.0.0; extra == "dev"
46
- Requires-Dist: Werkzeug>=3.0.0; extra == "dev"
43
+ Requires-Dist: openai>=1.0.0; extra == "dev"
44
+ Requires-Dist: flask>=3.1.3; extra == "dev"
45
+ Requires-Dist: werkzeug>=3.1.8; extra == "dev"
47
46
  Dynamic: license-file
48
47
 
49
48
  <p align="center">
@@ -87,14 +86,13 @@ Extract specific fields:
87
86
  docslight extract invoice.pdf --fields invoice_number,total_amount
88
87
  ```
89
88
 
90
- Launch the local Web UI workbench:
89
+ Launch the local API server:
91
90
 
92
91
  ```bash
93
- pip install "docslight[web]"
94
92
  docslight web
95
- # Open http://127.0.0.1:8000
93
+ # Health: http://127.0.0.1:8000/api/health
96
94
 
97
- # Or run the same Web UI directly as a module
95
+ # Or run the same API server directly as a module
98
96
  python -m docslight.web_app --host 0.0.0.0 --port 8000 --debug
99
97
  ```
100
98
 
@@ -104,7 +102,7 @@ python -m docslight.web_app --host 0.0.0.0 --port 8000 --debug
104
102
  - **Parse → Markdown** — Convert PDF, DOCX, PPTX, XLSX, and images (PNG, JPG, TIFF, BMP, WebP) to clean Markdown
105
103
  - **Extract → JSON** — Pull structured data by field list, JSON Schema, or structured template (key-value + table extraction)
106
104
  - **CLI first** — Full-featured command-line interface, script-friendly
107
- - **Web UI** — Local Flask workbench with drag-and-drop, live preview with bbox highlights, and a Fields Builder UI
105
+ - **API server** — Local Flask backend exposing parse, extract, preview, health, and system-info endpoints
108
106
  - **Batch processing** — `parse_batch()` / `extract_batch()` for multiple files
109
107
  - **Local LLM extraction** — Ollama or any OpenAI-compatible provider for offline extraction
110
108
  - **Document types** — Classify and route documents by type for cloud extraction
@@ -116,8 +114,8 @@ python -m docslight.web_app --host 0.0.0.0 --port 8000 --debug
116
114
  |----------|---------|
117
115
  | Core SDK & CLI | `pip install docslight` |
118
116
  | + Local parsing (OCR, Office) | `pip install "docslight[local]"` |
119
- | + Local LLM extraction | `pip install "docslight[local,local-llm]"` |
120
- | + Web UI workbench | `pip install "docslight[web]"` |
117
+ | + API server | `pip install "docslight[web]"` |
118
+ | + Local parsing and API server | `pip install "docslight[local,web]"` |
121
119
 
122
120
  > Local CPU parsing is experimental. Validate accuracy and latency on your own documents before production use.
123
121
 
@@ -209,7 +207,7 @@ client = DocSlight(
209
207
 
210
208
  ```python
211
209
  results = client.parse_batch(["doc1.pdf", "doc2.pdf", "doc3.pdf"])
212
- for r in results:[release.ps1](scripts/release.ps1)
210
+ for r in results:
213
211
  print(r.to_markdown()[:200])
214
212
  ```
215
213
 
@@ -224,30 +222,29 @@ docslight parse invoice.pdf --mode local -o invoice.zip
224
222
  # Extract
225
223
  docslight extract invoice.pdf --mode cloud --fields invoice_number,total_amount
226
224
  docslight extract invoice.pdf --mode local --fields invoice_number --local-llm-provider ollama --local-llm-model llama3.1
227
- docslight extract "D:\pdf\invoice\1.pdf" --mode local --fields invoice_number --local-llm-provider ollama
225
+ docslight extract "D:\pdf\invoice\1.pdf" --mode local --fields invoice_number --local-llm-provider ollama --local-llm-model llama3.1
228
226
 
229
227
  # Extract with schema
230
228
  docslight extract invoice.pdf --schema schema.json
231
229
 
232
- # Web UI
230
+ # API server
233
231
  docslight web --host 127.0.0.1 --port 8000
234
232
  ```
235
233
 
236
- ## Web UI Workbench
234
+ ## API Server
237
235
 
238
- DocSlight Workbench is a local Flask app for visual document processing.
236
+ DocSlight includes a local Flask API server for document processing. Frontend assets are not bundled in this package.
239
237
 
240
238
  ```bash
241
- pip install "docslight[web]"
242
239
  docslight web
243
240
  python -m docslight.web_app
244
241
  ```
245
242
 
246
- - **Parse & Extract tabs** — Switch between parsing and extraction workflows
247
- - **Drag-and-drop upload** — PDF, images, DOCX, PPTX, XLSX
248
- - **Live preview** — PDF page rendering with bbox highlight overlays
249
- - **Fields Builder** — Structured UI for building key-value and table extraction templates
250
- - **Download results** — One-click download of Markdown or JSON output
243
+ - `GET /api/health`
244
+ - `GET /api/system-info`
245
+ - `POST /api/parse`
246
+ - `POST /api/extract`
247
+ - `POST /api/preview`
251
248
 
252
249
  ## Environment Variables
253
250
 
@@ -30,13 +30,4 @@ docslight/providers/__init__.py
30
30
  docslight/providers/ollama.py
31
31
  docslight/providers/openai_compatible.py
32
32
  docslight/schemas/__init__.py
33
- docslight/schemas/fields.py
34
- docslight/static/styles.css
35
- docslight/static/app/common.js
36
- docslight/static/app/docslight-extract.json
37
- docslight/static/app/extract.js
38
- docslight/static/app/i18n.js
39
- docslight/static/app/parse.js
40
- docslight/templates/base.html
41
- docslight/templates/extract.html
42
- docslight/templates/parse.html
33
+ docslight/schemas/fields.py