docslight-lite 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39) hide show
  1. docslight/__init__.py +41 -0
  2. docslight/cli.py +215 -0
  3. docslight/client.py +92 -0
  4. docslight/cloud/__init__.py +5 -0
  5. docslight/cloud/client.py +622 -0
  6. docslight/config.py +117 -0
  7. docslight/exceptions.py +65 -0
  8. docslight/local/__init__.py +31 -0
  9. docslight/local/layout_blocks.py +80 -0
  10. docslight/local/llm_extractor.py +252 -0
  11. docslight/local/loaders.py +95 -0
  12. docslight/local/markdown.py +18 -0
  13. docslight/local/office_loader.py +128 -0
  14. docslight/local/paddle_parser.py +173 -0
  15. docslight/local/pipeline.py +213 -0
  16. docslight/preview.py +46 -0
  17. docslight/providers/__init__.py +6 -0
  18. docslight/providers/ollama.py +30 -0
  19. docslight/providers/openai_compatible.py +64 -0
  20. docslight/result.py +89 -0
  21. docslight/schemas/__init__.py +5 -0
  22. docslight/schemas/fields.py +190 -0
  23. docslight/standard_json.py +367 -0
  24. docslight/static/app/common.js +668 -0
  25. docslight/static/app/docslight-extract.json +307 -0
  26. docslight/static/app/extract.js +394 -0
  27. docslight/static/app/i18n.js +405 -0
  28. docslight/static/app/parse.js +161 -0
  29. docslight/static/styles.css +878 -0
  30. docslight/templates/base.html +36 -0
  31. docslight/templates/extract.html +123 -0
  32. docslight/templates/parse.html +81 -0
  33. docslight/web_app.py +372 -0
  34. docslight_lite-0.1.0.dist-info/METADATA +277 -0
  35. docslight_lite-0.1.0.dist-info/RECORD +39 -0
  36. docslight_lite-0.1.0.dist-info/WHEEL +5 -0
  37. docslight_lite-0.1.0.dist-info/entry_points.txt +2 -0
  38. docslight_lite-0.1.0.dist-info/licenses/LICENSE +21 -0
  39. docslight_lite-0.1.0.dist-info/top_level.txt +1 -0
docslight/__init__.py ADDED
@@ -0,0 +1,41 @@
1
+ """Lightweight ComPDF document parsing and extraction SDK."""
2
+
3
+ from docslight.client import DocSlight
4
+ from docslight.config import (
5
+ DEFAULT_BASE_URL,
6
+ DEFAULT_CONFIG_PATH,
7
+ VALID_MODES,
8
+ DocSlightConfig,
9
+ )
10
+ from docslight.exceptions import (
11
+ AuthenticationError,
12
+ CloudAPIError,
13
+ ConfigurationError,
14
+ DependencyMissingError,
15
+ DocSlightError,
16
+ LocalProcessingError,
17
+ RateLimitError,
18
+ UnsupportedFormatError,
19
+ )
20
+ from docslight.result import ExtractResult, ParseResult
21
+
22
+ __version__ = "0.1.0"
23
+
24
+ __all__ = [
25
+ "AuthenticationError",
26
+ "CloudAPIError",
27
+ "ConfigurationError",
28
+ "DEFAULT_BASE_URL",
29
+ "DEFAULT_CONFIG_PATH",
30
+ "DependencyMissingError",
31
+ "DocSlight",
32
+ "DocSlightConfig",
33
+ "DocSlightError",
34
+ "ExtractResult",
35
+ "LocalProcessingError",
36
+ "ParseResult",
37
+ "RateLimitError",
38
+ "UnsupportedFormatError",
39
+ "VALID_MODES",
40
+ "__version__",
41
+ ]
docslight/cli.py ADDED
@@ -0,0 +1,215 @@
1
+ """Command line interface for docslight."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import argparse
6
+ import importlib
7
+ import importlib.util
8
+ import json
9
+ import sys
10
+ from collections.abc import Sequence
11
+ from pathlib import Path
12
+ from typing import Any, cast
13
+
14
+ from docslight import DocSlight
15
+ from docslight.schemas import normalize_fields
16
+ from docslight.standard_json import convert_parse_payload
17
+
18
+ WEB_EXTRA_ERROR = "Install docslight-lite[web] to use the web command."
19
+
20
+
21
class CLIUsageError(Exception):
    """Signal a mistake in how the command line was used.

    Raised for anticipated user errors (for example a missing or malformed
    input file) so they can be reported as a short message instead of a
    traceback.
    """
23
+
24
+
25
+ def _add_common_options(parser: argparse.ArgumentParser) -> None:
26
+ parser.add_argument("--mode", choices=("cloud", "local"))
27
+ parser.add_argument("--api-key")
28
+ parser.add_argument("--base-url")
29
+ parser.add_argument("--local-parser")
30
+ parser.add_argument("--local-llm-provider")
31
+ parser.add_argument("--local-llm-model")
32
+ parser.add_argument("--local-llm-base-url")
33
+ parser.add_argument("--local-llm-api-key")
34
+
35
+
36
+ def _load_json_file(path: str | None, label: str) -> Any:
37
+ if path is None:
38
+ return None
39
+ try:
40
+ with Path(path).open(encoding="utf-8") as file:
41
+ return json.load(file)
42
+ except FileNotFoundError as exc:
43
+ raise CLIUsageError(f"{label} file not found: {path}") from exc
44
+ except json.JSONDecodeError as exc:
45
+ raise CLIUsageError(f"Invalid JSON in {label} file {path}: {exc.msg}") from exc
46
+ except UnicodeDecodeError as exc:
47
+ raise CLIUsageError(f"Cannot read {label} file {path}: {exc}") from exc
48
+
49
+
50
def _load_document_types(path: str | None) -> list[Any] | None:
    """Load the ``--document-types`` JSON file and require a top-level list.

    Returns ``None`` when no path was given (or the file decodes to JSON
    ``null``); raises ``CLIUsageError`` for any other non-list value.
    """
    loaded = _load_json_file(path, "document-types")
    if loaded is None or isinstance(loaded, list):
        return loaded
    raise CLIUsageError("document-types JSON must be a list")
57
+
58
+
59
+ def _local_llm_from_args(args: argparse.Namespace) -> dict[str, str] | None:
60
+ values = {
61
+ "provider": args.local_llm_provider,
62
+ "model": args.local_llm_model,
63
+ "base_url": args.local_llm_base_url,
64
+ "api_key": args.local_llm_api_key,
65
+ }
66
+ if not any(values.values()):
67
+ return None
68
+ if values["provider"] is None:
69
+ values["provider"] = "ollama"
70
+ return {key: value for key, value in values.items() if value is not None}
71
+
72
+
73
def _client_from_args(args: argparse.Namespace) -> DocSlight:
    """Construct a ``DocSlight`` client from the shared command line flags."""
    client_kwargs = {
        "mode": args.mode,
        "api_key": args.api_key,
        "base_url": args.base_url,
        "local_parser": args.local_parser,
        "local_llm": _local_llm_from_args(args),
    }
    return DocSlight(**client_kwargs)
81
+
82
+
83
+ def _write_output(content: str, output_path: str | None) -> None:
84
+ if output_path is None:
85
+ sys.stdout.write(content)
86
+ if not content.endswith("\n"):
87
+ sys.stdout.write("\n")
88
+ return
89
+ Path(output_path).write_text(content, encoding="utf-8")
90
+
91
+
92
+ def _to_pretty_json(data: Any) -> str:
93
+ return json.dumps(data, ensure_ascii=False, indent=2)
94
+
95
+
96
def run_web_app(host: str, port: int, debug: bool) -> None:
    """Start the optional web application on *host*:*port*.

    Raises:
        CLIUsageError: If ``docslight.web_app`` cannot be located, or if
            importing it fails because ``flask`` or ``werkzeug`` is missing.
        ModuleNotFoundError: If importing it fails on any other module.
    """
    spec = importlib.util.find_spec("docslight.web_app")
    if spec is None:
        raise CLIUsageError(WEB_EXTRA_ERROR)

    try:
        module = importlib.import_module("docslight.web_app")
    except ModuleNotFoundError as exc:
        # Only a missing web dependency is a usage problem; anything else
        # is re-raised unchanged.
        if exc.name not in {"flask", "werkzeug"}:
            raise
        raise CLIUsageError(WEB_EXTRA_ERROR) from exc

    module.run_web_app(host, port, debug)
109
+
110
+
111
+ def _print_cli_error(error: Exception) -> int:
112
+ sys.stderr.write(f"docslight: error: {error}\n")
113
+ return 2
114
+
115
+
116
def build_parser() -> argparse.ArgumentParser:
    """Assemble the ``docslight`` argument parser and its four subcommands.

    A subcommand is mandatory. Each subcommand stores its handler under
    ``func`` so the caller can dispatch with ``args.func(args)``.
    """

    def add_input_and_output(command: argparse.ArgumentParser) -> None:
        # Positional ``input`` plus the optional ``--output``/``-o`` flag.
        command.add_argument("input")
        command.add_argument("--output", "-o")

    root = argparse.ArgumentParser(
        description="Lightweight ComPDF document parsing and extraction SDK.",
        prog="docslight",
    )
    commands = root.add_subparsers(dest="command", required=True)

    # parse
    parse_cmd = commands.add_parser("parse", help="Parse a document")
    add_input_and_output(parse_cmd)
    parse_cmd.add_argument(
        "--format",
        default="markdown",
        choices=("markdown", "json", "standard-json"),
    )
    _add_common_options(parse_cmd)
    parse_cmd.set_defaults(func=_run_parse)

    # convert-parse-json
    convert_cmd = commands.add_parser(
        "convert-parse-json",
        help="Convert local parse JSON to the standard parse JSON schema",
    )
    add_input_and_output(convert_cmd)
    convert_cmd.set_defaults(func=_run_convert_parse_json)

    # extract
    extract_cmd = commands.add_parser("extract", help="Extract structured data")
    add_input_and_output(extract_cmd)
    for flag in ("--fields", "--schema", "--document-types"):
        extract_cmd.add_argument(flag)
    _add_common_options(extract_cmd)
    extract_cmd.set_defaults(func=_run_extract)

    # web
    web_cmd = commands.add_parser("web", help="Run the web application")
    web_cmd.add_argument("--host", default="127.0.0.1")
    web_cmd.add_argument("--port", type=int, default=8000)
    web_cmd.add_argument("--debug", action="store_true")
    web_cmd.set_defaults(func=_run_web)

    return root
159
+
160
+
161
def _run_parse(args: argparse.Namespace) -> int:
    """Handle the ``parse`` subcommand and return exit status ``0``.

    ``standard-json`` is requested from the client as plain ``json`` and
    converted afterwards through ``to_standard_json``.
    """
    requested = args.format
    backend_output = "json" if requested == "standard-json" else requested
    result = _client_from_args(args).parse(args.input, output=backend_output)

    if requested == "standard-json":
        rendered = _to_pretty_json(result.to_standard_json())
    elif requested == "json":
        rendered = _to_pretty_json(result.to_json())
    else:
        rendered = result.to_markdown()

    _write_output(rendered, args.output)
    return 0
172
+
173
+
174
def _run_convert_parse_json(args: argparse.Namespace) -> int:
    """Handle ``convert-parse-json`` and return exit status ``0``.

    Raises:
        CLIUsageError: If the input file does not decode to a JSON object.
    """
    payload = _load_json_file(args.input, "parse-json")
    if isinstance(payload, dict):
        converted = convert_parse_payload(payload)
        _write_output(_to_pretty_json(converted), args.output)
        return 0
    raise CLIUsageError("parse-json JSON must be an object")
180
+
181
+
182
def _run_extract(args: argparse.Namespace) -> int:
    """Handle the ``extract`` subcommand and return exit status ``0``.

    Only options that resolve to a non-``None`` value are forwarded to the
    client's ``extract`` call.
    """
    # Evaluated left to right, so fields are normalized before the schema
    # and document-types files are read.
    candidates = (
        ("fields", normalize_fields(args.fields)),
        ("schema", _load_json_file(args.schema, "schema")),
        ("document_types", _load_document_types(args.document_types)),
    )
    extract_options: dict[str, Any] = {
        name: value for name, value in candidates if value is not None
    }

    client = _client_from_args(args)
    result = client.extract(args.input, **extract_options)
    _write_output(_to_pretty_json(result.to_json()), args.output)
    return 0
197
+
198
+
199
def _run_web(args: argparse.Namespace) -> int:
    """Handle the ``web`` subcommand; returns ``0`` once the app call returns."""
    host, port, debug = args.host, args.port, args.debug
    run_web_app(host, port, debug)
    return 0
202
+
203
+
204
def main(argv: Sequence[str] | None = None) -> int:
    """Parse *argv*, run the selected subcommand, and return its exit status.

    A ``CLIUsageError`` raised by the handler is printed to stderr and turned
    into the status returned by ``_print_cli_error``; other exceptions
    propagate.
    """
    namespace = build_parser().parse_args(argv)
    try:
        status = namespace.func(namespace)
    except CLIUsageError as exc:
        return _print_cli_error(exc)
    return cast(int, status)
212
+
213
+
214
+ if __name__ == "__main__":
215
+ sys.exit(main())
docslight/client.py ADDED
@@ -0,0 +1,92 @@
1
+ """Public SDK router for document parsing and extraction."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from pathlib import Path
6
+ from typing import Any, cast
7
+
8
+ from docslight.cloud import CloudClient
9
+ from docslight.config import DocSlightConfig
10
+ from docslight.result import ExtractResult, ParseResult
11
+ from docslight.schemas import build_extract_schema, normalize_fields
12
+
13
+
14
+ class DocSlight:
15
+ """Route SDK calls to cloud or local document processors."""
16
+
17
+ def __init__(
18
+ self,
19
+ mode: str | None = None,
20
+ api_key: str | None = None,
21
+ base_url: str | None = None,
22
+ timeout: float | None = None,
23
+ local_parser: str | None = None,
24
+ local_llm: dict[str, Any] | None = None,
25
+ cloud_client: Any = None,
26
+ local_pipeline: Any = None,
27
+ ) -> None:
28
+ self.config = DocSlightConfig.from_sources(
29
+ mode=mode,
30
+ api_key=api_key,
31
+ base_url=base_url,
32
+ timeout=timeout,
33
+ local_parser=local_parser,
34
+ local_llm=local_llm,
35
+ )
36
+ self._cloud_client = cloud_client
37
+ self._local_pipeline = local_pipeline
38
+
39
+ def parse(self, path: Path | str, output: str = "markdown", **options: Any) -> ParseResult:
40
+ """Parse a document into markdown by default."""
41
+ if output != "markdown":
42
+ options["output"] = output
43
+ return cast(ParseResult, self._processor().parse(path, **options))
44
+
45
+ def extract(
46
+ self,
47
+ path: Path | str,
48
+ fields: list[str] | str | dict[str, Any] | None = None,
49
+ schema: dict[str, Any] | None = None,
50
+ document_types: list[str] | None = None,
51
+ **options: Any,
52
+ ) -> ExtractResult:
53
+ """Extract structured data from a document."""
54
+ normalized_fields = normalize_fields(fields)
55
+ extract_kwargs = dict(options)
56
+ if normalized_fields is not None:
57
+ extract_kwargs["fields"] = normalized_fields
58
+ derived_schema = build_extract_schema(normalized_fields)
59
+ if schema is None and derived_schema is not None:
60
+ extract_kwargs["schema"] = derived_schema
61
+ if schema is not None:
62
+ extract_kwargs["schema"] = schema
63
+ if document_types is not None:
64
+ extract_kwargs["document_types"] = document_types
65
+ return cast(ExtractResult, self._processor().extract(path, **extract_kwargs))
66
+
67
+ def parse_batch(self, paths: list[Path | str], **options: Any) -> list[ParseResult]:
68
+ """Parse documents sequentially."""
69
+ return [self.parse(path, **options) for path in paths]
70
+
71
+ def extract_batch(self, paths: list[Path | str], **options: Any) -> list[ExtractResult]:
72
+ """Extract data from documents sequentially."""
73
+ return [self.extract(path, **options) for path in paths]
74
+
75
+ def _processor(self) -> Any:
76
+ if self.config.mode == "local":
77
+ if self._local_pipeline is None:
78
+ self._local_pipeline = self._build_local_pipeline()
79
+ return self._local_pipeline
80
+
81
+ if self._cloud_client is None:
82
+ self._cloud_client = CloudClient(
83
+ self.config.api_key,
84
+ self.config.base_url,
85
+ self.config.timeout,
86
+ )
87
+ return self._cloud_client
88
+
89
+ def _build_local_pipeline(self) -> Any:
90
+ from docslight.local.pipeline import LocalPipeline
91
+
92
+ return LocalPipeline.from_config(self.config)
docslight/cloud/__init__.py ADDED
@@ -0,0 +1,5 @@
1
+ """Cloud document parsing integrations."""
2
+
3
+ from docslight.cloud.client import CloudClient
4
+
5
+ __all__ = ["CloudClient"]