docslight-lite 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docslight/__init__.py +41 -0
- docslight/cli.py +215 -0
- docslight/client.py +92 -0
- docslight/cloud/__init__.py +5 -0
- docslight/cloud/client.py +622 -0
- docslight/config.py +117 -0
- docslight/exceptions.py +65 -0
- docslight/local/__init__.py +31 -0
- docslight/local/layout_blocks.py +80 -0
- docslight/local/llm_extractor.py +252 -0
- docslight/local/loaders.py +95 -0
- docslight/local/markdown.py +18 -0
- docslight/local/office_loader.py +128 -0
- docslight/local/paddle_parser.py +173 -0
- docslight/local/pipeline.py +213 -0
- docslight/preview.py +46 -0
- docslight/providers/__init__.py +6 -0
- docslight/providers/ollama.py +30 -0
- docslight/providers/openai_compatible.py +64 -0
- docslight/result.py +89 -0
- docslight/schemas/__init__.py +5 -0
- docslight/schemas/fields.py +190 -0
- docslight/standard_json.py +367 -0
- docslight/static/app/common.js +668 -0
- docslight/static/app/docslight-extract.json +307 -0
- docslight/static/app/extract.js +394 -0
- docslight/static/app/i18n.js +405 -0
- docslight/static/app/parse.js +161 -0
- docslight/static/styles.css +878 -0
- docslight/templates/base.html +36 -0
- docslight/templates/extract.html +123 -0
- docslight/templates/parse.html +81 -0
- docslight/web_app.py +372 -0
- docslight_lite-0.1.0.dist-info/METADATA +277 -0
- docslight_lite-0.1.0.dist-info/RECORD +39 -0
- docslight_lite-0.1.0.dist-info/WHEEL +5 -0
- docslight_lite-0.1.0.dist-info/entry_points.txt +2 -0
- docslight_lite-0.1.0.dist-info/licenses/LICENSE +21 -0
- docslight_lite-0.1.0.dist-info/top_level.txt +1 -0
docslight/__init__.py
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
"""Lightweight ComPDF document parsing and extraction SDK."""
|
|
2
|
+
|
|
3
|
+
from docslight.client import DocSlight
|
|
4
|
+
from docslight.config import (
|
|
5
|
+
DEFAULT_BASE_URL,
|
|
6
|
+
DEFAULT_CONFIG_PATH,
|
|
7
|
+
VALID_MODES,
|
|
8
|
+
DocSlightConfig,
|
|
9
|
+
)
|
|
10
|
+
from docslight.exceptions import (
|
|
11
|
+
AuthenticationError,
|
|
12
|
+
CloudAPIError,
|
|
13
|
+
ConfigurationError,
|
|
14
|
+
DependencyMissingError,
|
|
15
|
+
DocSlightError,
|
|
16
|
+
LocalProcessingError,
|
|
17
|
+
RateLimitError,
|
|
18
|
+
UnsupportedFormatError,
|
|
19
|
+
)
|
|
20
|
+
from docslight.result import ExtractResult, ParseResult
|
|
21
|
+
|
|
22
|
+
__version__ = "0.1.0"
|
|
23
|
+
|
|
24
|
+
__all__ = [
|
|
25
|
+
"AuthenticationError",
|
|
26
|
+
"CloudAPIError",
|
|
27
|
+
"ConfigurationError",
|
|
28
|
+
"DEFAULT_BASE_URL",
|
|
29
|
+
"DEFAULT_CONFIG_PATH",
|
|
30
|
+
"DependencyMissingError",
|
|
31
|
+
"DocSlight",
|
|
32
|
+
"DocSlightConfig",
|
|
33
|
+
"DocSlightError",
|
|
34
|
+
"ExtractResult",
|
|
35
|
+
"LocalProcessingError",
|
|
36
|
+
"ParseResult",
|
|
37
|
+
"RateLimitError",
|
|
38
|
+
"UnsupportedFormatError",
|
|
39
|
+
"VALID_MODES",
|
|
40
|
+
"__version__",
|
|
41
|
+
]
|
docslight/cli.py
ADDED
|
@@ -0,0 +1,215 @@
|
|
|
1
|
+
"""Command line interface for docslight."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import argparse
|
|
6
|
+
import importlib
|
|
7
|
+
import importlib.util
|
|
8
|
+
import json
|
|
9
|
+
import sys
|
|
10
|
+
from collections.abc import Sequence
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
from typing import Any, cast
|
|
13
|
+
|
|
14
|
+
from docslight import DocSlight
|
|
15
|
+
from docslight.schemas import normalize_fields
|
|
16
|
+
from docslight.standard_json import convert_parse_payload
|
|
17
|
+
|
|
18
|
+
WEB_EXTRA_ERROR = "Install docslight-lite[web] to use the web command."
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class CLIUsageError(Exception):
    """Signal an anticipated command line usage problem.

    ``main`` catches this exception and reports it on stderr instead of
    letting it propagate.
    """
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def _add_common_options(parser: argparse.ArgumentParser) -> None:
    """Register the client options shared by several subcommands.

    Args:
        parser: Parser (or subparser) that receives the options.
    """
    # ``--mode`` is the only option restricted to a fixed set of values.
    parser.add_argument("--mode", choices=("cloud", "local"))
    for flag in (
        "--api-key",
        "--base-url",
        "--local-parser",
        "--local-llm-provider",
        "--local-llm-model",
        "--local-llm-base-url",
        "--local-llm-api-key",
    ):
        parser.add_argument(flag)
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def _load_json_file(path: str | None, label: str) -> Any:
    """Load and decode a UTF-8 JSON file named on the command line.

    Args:
        path: Location of the JSON file, or ``None`` when the option was
            not supplied.
        label: Human-readable option name used in error messages.

    Returns:
        The decoded JSON value, or ``None`` when ``path`` is ``None``.

    Raises:
        CLIUsageError: If the file is missing, cannot be opened or read,
            is not valid UTF-8, or does not contain valid JSON.
    """
    if path is None:
        return None
    try:
        with Path(path).open(encoding="utf-8") as file:
            return json.load(file)
    except FileNotFoundError as exc:
        raise CLIUsageError(f"{label} file not found: {path}") from exc
    except json.JSONDecodeError as exc:
        raise CLIUsageError(f"Invalid JSON in {label} file {path}: {exc.msg}") from exc
    except UnicodeDecodeError as exc:
        raise CLIUsageError(f"Cannot read {label} file {path}: {exc}") from exc
    except OSError as exc:
        # Covers the remaining I/O failures (path is a directory, permission
        # denied, ...) so they are reported as usage errors, not tracebacks.
        raise CLIUsageError(f"Cannot read {label} file {path}: {exc}") from exc
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def _load_document_types(path: str | None) -> list[Any] | None:
    """Read the document-types JSON file and require it to hold a list.

    Args:
        path: Location of the JSON file, or ``None`` when not supplied.

    Returns:
        The decoded list, or ``None`` when ``path`` is ``None``.

    Raises:
        CLIUsageError: If the decoded JSON value is not a list, or the
            file could not be loaded.
    """
    loaded = _load_json_file(path, "document-types")
    if loaded is not None and not isinstance(loaded, list):
        raise CLIUsageError("document-types JSON must be a list")
    return loaded
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def _local_llm_from_args(args: argparse.Namespace) -> dict[str, str] | None:
    """Collect the ``--local-llm-*`` options into a local LLM settings dict.

    Empty values are treated the same as options that were not supplied,
    both when deciding whether any option was given and when building the
    result, so an empty provider falls back to the default as well.

    Args:
        args: Parsed command line namespace carrying the
            ``local_llm_provider``, ``local_llm_model``,
            ``local_llm_base_url`` and ``local_llm_api_key`` attributes.

    Returns:
        ``None`` when no local LLM option was supplied; otherwise a dict of
        the supplied options with ``provider`` defaulting to ``"ollama"``.
    """
    supplied = {
        key: value
        for key, value in (
            ("provider", args.local_llm_provider),
            ("model", args.local_llm_model),
            ("base_url", args.local_llm_base_url),
            ("api_key", args.local_llm_api_key),
        )
        if value
    }
    if not supplied:
        return None
    # The default goes first so an explicit provider overrides it while the
    # key keeps its leading position in the result.
    return {"provider": "ollama", **supplied}
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def _client_from_args(args: argparse.Namespace) -> DocSlight:
    """Build a ``DocSlight`` client from the shared command line options.

    Args:
        args: Parsed namespace produced by a parser that registered the
            common options.

    Returns:
        A client configured with the mode, credentials, local parser and
        local LLM settings taken from ``args``.
    """
    client_kwargs = {
        "mode": args.mode,
        "api_key": args.api_key,
        "base_url": args.base_url,
        "local_parser": args.local_parser,
        "local_llm": _local_llm_from_args(args),
    }
    return DocSlight(**client_kwargs)
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def _write_output(content: str, output_path: str | None) -> None:
    """Emit ``content`` to a file, or to stdout when no path is given.

    Args:
        content: Text to emit.
        output_path: Destination file, or ``None`` to write to stdout.

    File output is written unchanged as UTF-8. Stdout output is followed
    by a newline when ``content`` does not already end with one.
    """
    if output_path is not None:
        Path(output_path).write_text(content, encoding="utf-8")
        return
    sys.stdout.write(content)
    if content[-1:] != "\n":
        sys.stdout.write("\n")
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def _to_pretty_json(data: Any) -> str:
    """Serialize ``data`` as JSON indented by two spaces.

    Non-ASCII characters are written as-is rather than escaped.
    """
    dump_options = {"ensure_ascii": False, "indent": 2}
    return json.dumps(data, **dump_options)
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def run_web_app(host: str, port: int, debug: bool) -> None:
    """Run the optional Flask web application.

    Args:
        host: Forwarded to the web module's ``run_web_app``.
        port: Forwarded to the web module's ``run_web_app``.
        debug: Forwarded to the web module's ``run_web_app``.

    Raises:
        CLIUsageError: If the web module cannot be located, or importing it
            fails because ``flask`` or ``werkzeug`` is missing.
        ModuleNotFoundError: If importing the web module fails because of
            any other missing module.
    """
    module_name = "docslight.web_app"
    if importlib.util.find_spec(module_name) is None:
        raise CLIUsageError(WEB_EXTRA_ERROR)

    try:
        web_module = importlib.import_module(module_name)
    except ModuleNotFoundError as exc:
        # Only the web extra's own dependencies become a usage error;
        # any other missing module is re-raised unchanged.
        if exc.name not in {"flask", "werkzeug"}:
            raise
        raise CLIUsageError(WEB_EXTRA_ERROR) from exc
    web_module.run_web_app(host, port, debug)
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
def _print_cli_error(error: Exception) -> int:
    """Report ``error`` on stderr and return the exit status 2."""
    message = f"docslight: error: {error}\n"
    sys.stderr.write(message)
    exit_status = 2
    return exit_status
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
def build_parser() -> argparse.ArgumentParser:
    """Build the docslight command line parser.

    Registers the ``parse``, ``convert-parse-json``, ``extract`` and ``web``
    subcommands. Each subcommand stores its handler as the ``func`` default.

    Returns:
        The fully configured top-level parser.
    """
    root = argparse.ArgumentParser(
        prog="docslight",
        description="Lightweight ComPDF document parsing and extraction SDK.",
    )
    commands = root.add_subparsers(dest="command", required=True)

    # parse: document in, markdown / JSON out.
    parse_cmd = commands.add_parser("parse", help="Parse a document")
    parse_cmd.add_argument("input")
    parse_cmd.add_argument("--output", "-o")
    parse_cmd.add_argument(
        "--format",
        choices=("markdown", "json", "standard-json"),
        default="markdown",
    )
    _add_common_options(parse_cmd)
    parse_cmd.set_defaults(func=_run_parse)

    # convert-parse-json: takes only an input and an output path.
    convert_cmd = commands.add_parser(
        "convert-parse-json",
        help="Convert local parse JSON to the standard parse JSON schema",
    )
    convert_cmd.add_argument("input")
    convert_cmd.add_argument("--output", "-o")
    convert_cmd.set_defaults(func=_run_convert_parse_json)

    # extract: structured data from a document.
    extract_cmd = commands.add_parser("extract", help="Extract structured data")
    extract_cmd.add_argument("input")
    extract_cmd.add_argument("--output", "-o")
    for flag in ("--fields", "--schema", "--document-types"):
        extract_cmd.add_argument(flag)
    _add_common_options(extract_cmd)
    extract_cmd.set_defaults(func=_run_extract)

    # web: optional web application.
    web_cmd = commands.add_parser("web", help="Run the web application")
    web_cmd.add_argument("--host", default="127.0.0.1")
    web_cmd.add_argument("--port", type=int, default=8000)
    web_cmd.add_argument("--debug", action="store_true")
    web_cmd.set_defaults(func=_run_web)

    return root
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
def _run_parse(args: argparse.Namespace) -> int:
    """Handle the ``parse`` subcommand.

    Args:
        args: Parsed namespace with ``input``, ``output``, ``format`` and
            the common client options.

    Returns:
        ``0`` once the rendered result has been written.
    """
    wants_standard = args.format == "standard-json"
    # The client is asked for plain "json" when standard JSON is requested;
    # the standard form is then produced from the result object.
    requested_output = "json" if wants_standard else args.format
    result = _client_from_args(args).parse(args.input, output=requested_output)
    if wants_standard:
        rendered = _to_pretty_json(result.to_standard_json())
    elif args.format == "json":
        rendered = _to_pretty_json(result.to_json())
    else:
        rendered = result.to_markdown()
    _write_output(rendered, args.output)
    return 0
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
def _run_convert_parse_json(args: argparse.Namespace) -> int:
    """Handle the ``convert-parse-json`` subcommand.

    Args:
        args: Parsed namespace with ``input`` and ``output``.

    Returns:
        ``0`` once the converted JSON has been written.

    Raises:
        CLIUsageError: If the input JSON is not an object, or the file
            could not be loaded.
    """
    payload = _load_json_file(args.input, "parse-json")
    if not isinstance(payload, dict):
        raise CLIUsageError("parse-json JSON must be an object")
    converted = convert_parse_payload(payload)
    rendered = _to_pretty_json(converted)
    _write_output(rendered, args.output)
    return 0
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
def _run_extract(args: argparse.Namespace) -> int:
    """Handle the ``extract`` subcommand.

    Args:
        args: Parsed namespace with ``input``, ``output``, ``fields``,
            ``schema``, ``document_types`` and the common client options.

    Returns:
        ``0`` once the extraction result has been written as JSON.
    """
    # Dict entries are evaluated in order, so the fields are normalized
    # before the schema and document-types files are loaded.
    candidates: dict[str, Any] = {
        "fields": normalize_fields(args.fields),
        "schema": _load_json_file(args.schema, "schema"),
        "document_types": _load_document_types(args.document_types),
    }
    # Only options that were actually supplied are forwarded to the client.
    extract_options: dict[str, Any] = {
        name: value for name, value in candidates.items() if value is not None
    }

    result = _client_from_args(args).extract(args.input, **extract_options)
    _write_output(_to_pretty_json(result.to_json()), args.output)
    return 0
|
|
197
|
+
|
|
198
|
+
|
|
199
|
+
def _run_web(args: argparse.Namespace) -> int:
    """Handle the ``web`` subcommand and return ``0`` when it finishes."""
    host, port, debug = args.host, args.port, args.debug
    run_web_app(host, port, debug)
    return 0
|
|
202
|
+
|
|
203
|
+
|
|
204
|
+
def main(argv: Sequence[str] | None = None) -> int:
    """Run the docslight command line interface.

    Args:
        argv: Argument list to parse; ``None`` is passed through to
            ``argparse``.

    Returns:
        The status returned by the selected subcommand handler, or the
        status from ``_print_cli_error`` when a ``CLIUsageError`` is raised.
    """
    args = build_parser().parse_args(argv)
    try:
        status = args.func(args)
    except CLIUsageError as exc:
        return _print_cli_error(exc)
    return cast(int, status)
|
|
212
|
+
|
|
213
|
+
|
|
214
|
+
if __name__ == "__main__":
    # Use the CLI's return value as the process exit status.
    raise SystemExit(main())
|
docslight/client.py
ADDED
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
"""Public SDK router for document parsing and extraction."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Any, cast
|
|
7
|
+
|
|
8
|
+
from docslight.cloud import CloudClient
|
|
9
|
+
from docslight.config import DocSlightConfig
|
|
10
|
+
from docslight.result import ExtractResult, ParseResult
|
|
11
|
+
from docslight.schemas import build_extract_schema, normalize_fields
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class DocSlight:
    """Route SDK calls to the cloud client or the local pipeline.

    The backend is selected from ``self.config.mode`` and created on first
    use, unless a ready-made one was handed to the constructor.
    """

    def __init__(
        self,
        mode: str | None = None,
        api_key: str | None = None,
        base_url: str | None = None,
        timeout: float | None = None,
        local_parser: str | None = None,
        local_llm: dict[str, Any] | None = None,
        cloud_client: Any = None,
        local_pipeline: Any = None,
    ) -> None:
        """Resolve the configuration and keep any injected backends.

        Args:
            mode: Forwarded to ``DocSlightConfig.from_sources``.
            api_key: Forwarded to ``DocSlightConfig.from_sources``.
            base_url: Forwarded to ``DocSlightConfig.from_sources``.
            timeout: Forwarded to ``DocSlightConfig.from_sources``.
            local_parser: Forwarded to ``DocSlightConfig.from_sources``.
            local_llm: Forwarded to ``DocSlightConfig.from_sources``.
            cloud_client: Optional pre-built cloud backend to use instead
                of creating a ``CloudClient``.
            local_pipeline: Optional pre-built local backend to use instead
                of creating a ``LocalPipeline``.
        """
        overrides = {
            "mode": mode,
            "api_key": api_key,
            "base_url": base_url,
            "timeout": timeout,
            "local_parser": local_parser,
            "local_llm": local_llm,
        }
        self.config = DocSlightConfig.from_sources(**overrides)
        self._cloud_client = cloud_client
        self._local_pipeline = local_pipeline

    def parse(self, path: Path | str, output: str = "markdown", **options: Any) -> ParseResult:
        """Parse a document into markdown by default.

        Args:
            path: Document to parse.
            output: Requested output kind; forwarded to the backend only
                when it differs from ``"markdown"``.
            **options: Extra keyword arguments forwarded to the backend.

        Returns:
            Whatever the active backend's ``parse`` returns.
        """
        if output == "markdown":
            forwarded = dict(options)
        else:
            forwarded = {**options, "output": output}
        backend = self._processor()
        return cast(ParseResult, backend.parse(path, **forwarded))

    def extract(
        self,
        path: Path | str,
        fields: list[str] | str | dict[str, Any] | None = None,
        schema: dict[str, Any] | None = None,
        document_types: list[str] | None = None,
        **options: Any,
    ) -> ExtractResult:
        """Extract structured data from a document.

        Args:
            path: Document to extract from.
            fields: Field specification, normalized with
                ``normalize_fields`` before use.
            schema: Explicit schema; when omitted, a schema built from the
                normalized fields is used if one could be built.
            document_types: Optional document types forwarded as given.
            **options: Extra keyword arguments forwarded to the backend.

        Returns:
            Whatever the active backend's ``extract`` returns.
        """
        field_spec = normalize_fields(fields)
        fallback_schema = build_extract_schema(field_spec)
        # An explicit schema always wins over the one derived from fields.
        effective_schema = fallback_schema if schema is None else schema

        call_kwargs = dict(options)
        candidates = (
            ("fields", field_spec),
            ("schema", effective_schema),
            ("document_types", document_types),
        )
        for key, value in candidates:
            if value is not None:
                call_kwargs[key] = value
        backend = self._processor()
        return cast(ExtractResult, backend.extract(path, **call_kwargs))

    def parse_batch(self, paths: list[Path | str], **options: Any) -> list[ParseResult]:
        """Parse documents one after another, in the order given."""
        results = []
        for item in paths:
            results.append(self.parse(item, **options))
        return results

    def extract_batch(self, paths: list[Path | str], **options: Any) -> list[ExtractResult]:
        """Extract data from documents one after another, in the order given."""
        results = []
        for item in paths:
            results.append(self.extract(item, **options))
        return results

    def _processor(self) -> Any:
        """Return the backend for the configured mode, creating it if needed."""
        if self.config.mode != "local":
            if self._cloud_client is None:
                settings = self.config
                self._cloud_client = CloudClient(
                    settings.api_key,
                    settings.base_url,
                    settings.timeout,
                )
            return self._cloud_client

        if self._local_pipeline is None:
            self._local_pipeline = self._build_local_pipeline()
        return self._local_pipeline

    def _build_local_pipeline(self) -> Any:
        """Create the local pipeline from the resolved configuration."""
        # Imported here, not at module level, so the local pipeline module
        # is loaded only when a local pipeline is actually built.
        from docslight.local.pipeline import LocalPipeline

        pipeline = LocalPipeline.from_config(self.config)
        return pipeline
|