docslight-lite 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39) hide show
  1. docslight/__init__.py +41 -0
  2. docslight/cli.py +215 -0
  3. docslight/client.py +92 -0
  4. docslight/cloud/__init__.py +5 -0
  5. docslight/cloud/client.py +622 -0
  6. docslight/config.py +117 -0
  7. docslight/exceptions.py +65 -0
  8. docslight/local/__init__.py +31 -0
  9. docslight/local/layout_blocks.py +80 -0
  10. docslight/local/llm_extractor.py +252 -0
  11. docslight/local/loaders.py +95 -0
  12. docslight/local/markdown.py +18 -0
  13. docslight/local/office_loader.py +128 -0
  14. docslight/local/paddle_parser.py +173 -0
  15. docslight/local/pipeline.py +213 -0
  16. docslight/preview.py +46 -0
  17. docslight/providers/__init__.py +6 -0
  18. docslight/providers/ollama.py +30 -0
  19. docslight/providers/openai_compatible.py +64 -0
  20. docslight/result.py +89 -0
  21. docslight/schemas/__init__.py +5 -0
  22. docslight/schemas/fields.py +190 -0
  23. docslight/standard_json.py +367 -0
  24. docslight/static/app/common.js +668 -0
  25. docslight/static/app/docslight-extract.json +307 -0
  26. docslight/static/app/extract.js +394 -0
  27. docslight/static/app/i18n.js +405 -0
  28. docslight/static/app/parse.js +161 -0
  29. docslight/static/styles.css +878 -0
  30. docslight/templates/base.html +36 -0
  31. docslight/templates/extract.html +123 -0
  32. docslight/templates/parse.html +81 -0
  33. docslight/web_app.py +372 -0
  34. docslight_lite-0.1.0.dist-info/METADATA +277 -0
  35. docslight_lite-0.1.0.dist-info/RECORD +39 -0
  36. docslight_lite-0.1.0.dist-info/WHEEL +5 -0
  37. docslight_lite-0.1.0.dist-info/entry_points.txt +2 -0
  38. docslight_lite-0.1.0.dist-info/licenses/LICENSE +21 -0
  39. docslight_lite-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,128 @@
1
+ """Office document loading as Markdown."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from pathlib import Path
6
+ from typing import Any
7
+
8
+ from docslight.exceptions import DependencyMissingError, UnsupportedFormatError
9
+ from docslight.local.loaders import LOCAL_DEPS_MESSAGE, LoadedTextDocument
10
+
11
# Pre-OOXML binary Office suffixes; these are rejected with a "convert first" error.
LEGACY_OFFICE_EXTENSIONS = {
    ".doc",
    ".ppt",
    ".xls",
}
12
+
13
+
14
def load_workbook(path: Path, **kwargs: Any) -> Any:
    """Open an XLSX workbook through openpyxl, importing it only when needed.

    Args:
        path: Location of the workbook on disk.
        **kwargs: Forwarded unchanged to ``openpyxl.load_workbook``.

    Returns:
        Whatever ``openpyxl.load_workbook`` returns for the given arguments.

    Raises:
        DependencyMissingError: If openpyxl is not installed.
    """
    try:
        import openpyxl
    except ModuleNotFoundError as exc:  # pragma: no cover - depends on environment
        raise DependencyMissingError(LOCAL_DEPS_MESSAGE) from exc
    opener = openpyxl.load_workbook
    return opener(path, **kwargs)
21
+
22
+
23
class OfficeMarkdownLoader:
    """Load modern Office files (DOCX, PPTX, XLSX) into basic Markdown.

    Third-party readers are imported lazily inside each loader so the class
    can be constructed without the optional local dependencies installed.
    """

    def load(self, path: Path | str) -> LoadedTextDocument:
        """Load a DOCX, PPTX, or XLSX file into Markdown.

        Args:
            path: Location of the Office file; the suffix selects the loader.

        Returns:
            The Markdown text plus format-specific metadata.

        Raises:
            UnsupportedFormatError: For legacy (.doc/.ppt/.xls) or unknown suffixes.
            DependencyMissingError: If the reader library for the format is missing.
        """
        source_path = Path(path)
        suffix = source_path.suffix.lower()
        if suffix in LEGACY_OFFICE_EXTENSIONS:
            raise UnsupportedFormatError("Legacy Office files must convert to DOCX, PPTX, or XLSX")
        if suffix == ".docx":
            return self._load_docx(source_path)
        if suffix == ".pptx":
            return self._load_pptx(source_path)
        if suffix == ".xlsx":
            return self._load_xlsx(source_path)
        raise UnsupportedFormatError(f"Unsupported Office format: {suffix or source_path.name}")

    def _load_docx(self, path: Path) -> LoadedTextDocument:
        """Convert a DOCX file: non-blank paragraphs first, then every table."""
        try:
            from docx import Document
        except ModuleNotFoundError as exc:  # pragma: no cover - depends on environment
            raise DependencyMissingError(LOCAL_DEPS_MESSAGE) from exc

        document = Document(str(path))
        parts = [paragraph.text for paragraph in document.paragraphs if paragraph.text.strip()]
        for table in document.tables:
            rows: list[list[str]] = []
            for row in table.rows:
                cells = [cell.text.strip() for cell in row.cells]
                # Skip rows whose cells are all empty after stripping.
                if any(cells):
                    rows.append(cells)
            if rows:
                parts.append(self._rows_to_markdown(rows))
        return LoadedTextDocument(
            markdown="\n\n".join(parts),
            metadata={
                "document_type": "docx",
                # The loader does not paginate DOCX content; it always reports 1.
                "page_count": 1,
                "paragraph_count": len(document.paragraphs),
                "table_count": len(document.tables),
            },
        )

    def _load_pptx(self, path: Path) -> LoadedTextDocument:
        """Convert a PPTX file into one ``# Slide N`` section per slide."""
        try:
            from pptx import Presentation
        except ModuleNotFoundError as exc:  # pragma: no cover - depends on environment
            raise DependencyMissingError(LOCAL_DEPS_MESSAGE) from exc

        presentation = Presentation(str(path))
        slide_parts: list[str] = []
        for index, slide in enumerate(presentation.slides, start=1):
            texts: list[str] = []
            for shape in slide.shapes:
                # Shapes without a ``text`` attribute contribute nothing.
                text = getattr(shape, "text", "")
                if text.strip():
                    texts.append(text.strip())
            slide_parts.append("\n\n".join([f"# Slide {index}", *texts]))
        return LoadedTextDocument(
            markdown="\n\n".join(slide_parts),
            metadata={
                "document_type": "pptx",
                "page_count": len(presentation.slides),
                "slide_count": len(presentation.slides),
            },
        )

    def _load_xlsx(self, path: Path) -> LoadedTextDocument:
        """Convert an XLSX workbook into one Markdown table per worksheet."""
        workbook = load_workbook(path, data_only=True, read_only=True)
        sheets: list[str] = []
        try:
            for worksheet in workbook.worksheets:
                rows = [self._format_row(row) for row in worksheet.iter_rows(values_only=True)]
                # ``_format_row`` returns [] for blank rows; drop them.
                rows = [row for row in rows if row]
                sheets.append(
                    "\n".join([f"## Sheet: {worksheet.title}", self._rows_to_markdown(rows)])
                )
            return LoadedTextDocument(
                markdown="\n\n".join(sheets),
                metadata={
                    "document_type": "xlsx",
                    "page_count": len(workbook.worksheets),
                    "sheet_count": len(workbook.worksheets),
                    "sheet_names": [worksheet.title for worksheet in workbook.worksheets],
                },
            )
        finally:
            # The workbook is always closed, whether or not conversion succeeded.
            workbook.close()

    def _format_row(self, row: tuple[Any, ...]) -> list[str]:
        """Stringify one row of cell values; return ``[]`` when every cell is empty."""
        values = ["" if value is None else str(value) for value in row]
        if not any(values):
            return []
        return values

    def _rows_to_markdown(self, rows: list[list[str]]) -> str:
        """Render rows as a Markdown table, using the first row as the header.

        Rows may have different lengths; every row is padded with empty cells
        to the width of the widest row so that the header, the separator, and
        the body all have the same number of columns.
        """
        if not rows:
            return ""
        width = max(len(row) for row in rows)
        padded_rows = [
            [self._escape_table_cell(cell) for cell in row] + [""] * (width - len(row))
            for row in rows
        ]
        header, *body = padded_rows
        separator = ["---"] * width
        markdown_rows = [header, separator, *body]
        return "\n".join("| " + " | ".join(row) + " |" for row in markdown_rows)

    def _escape_table_cell(self, value: str) -> str:
        """Make a cell value safe for a single-line Markdown table cell."""
        # Handle CRLF first so a Windows line break collapses to one space, not two.
        flattened = value.replace("\r\n", " ").replace("\n", " ").replace("\r", " ")
        return flattened.replace("|", "\\|")
@@ -0,0 +1,173 @@
1
+ """PaddleOCR PP-StructureV3 parser adapter for local files."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import os
6
+ import sys
7
+ from collections.abc import Iterable
8
+ from dataclasses import dataclass
9
+ from pathlib import Path
10
+ from typing import Any
11
+
12
+ from docslight.exceptions import DependencyMissingError, LocalProcessingError
13
+ from docslight.local.loaders import LOCAL_DEPS_MESSAGE
14
+ from docslight.result import ParseResult
15
+
16
# Default keyword arguments for the PP-StructureV3 pipeline constructor.
# Every optional stage is switched off and the mobile OCRv5 detection and
# recognition models are selected.
# NOTE: "layout_detection_model_name" ("PP-DocLayout-M") is intentionally not
# set here; the pipeline's own default layout model applies.
DEFAULT_PPSTRUCTUREV3_OPTIONS: dict[str, Any] = dict(
    use_doc_orientation_classify=False,
    use_doc_unwarping=False,
    use_textline_orientation=False,
    use_formula_recognition=False,
    use_chart_recognition=False,
    use_region_detection=False,
    text_recognition_model_name="PP-OCRv5_mobile_rec",
    text_detection_model_name="PP-OCRv5_mobile_det",
)
28
+
29
+
30
@dataclass(frozen=True)
class OCRLine:
    """A single recognised text line with optional geometry and score."""

    text: str
    bbox: list[Any] | None
    confidence: float | None

    def to_json(self) -> dict[str, Any]:
        """Return the line as a plain dict with text, bbox and confidence keys."""
        payload: dict[str, Any] = {}
        payload["text"] = self.text
        payload["bbox"] = self.bbox
        payload["confidence"] = self.confidence
        return payload
45
+
46
+
47
@dataclass(frozen=True)
class OCRPage:
    """The OCR lines recognised on one page, tagged with its page number."""

    page_number: int
    lines: list[OCRLine]

    def to_json(self) -> dict[str, Any]:
        """Return the page as a plain dict, serialising each line via ``to_json``."""
        serialised_lines = []
        for entry in self.lines:
            serialised_lines.append(entry.to_json())
        return {"page_number": self.page_number, "lines": serialised_lines}
60
+
61
+
62
class PaddleOCRParser:
    """Run PP-StructureV3 over local PDF and image files.

    The pipeline is created lazily on first use unless one is injected, so
    constructing the parser does not import PaddleOCR.
    """

    def __init__(self, pipeline: Any | None = None, **pipeline_options: Any) -> None:
        """Store pipeline options, layering caller overrides on the defaults.

        Args:
            pipeline: Optional ready-made pipeline object; when given, no
                PaddleOCR import or device detection happens.
            **pipeline_options: Overrides for ``DEFAULT_PPSTRUCTUREV3_OPTIONS``.
        """
        self.pipeline_options = {**DEFAULT_PPSTRUCTUREV3_OPTIONS, **pipeline_options}
        self._pipeline: Any = pipeline
        self._device_label: str | None = None

    def parse(self, path: Path | str) -> ParseResult:
        """Parse a local PDF or image into structured Markdown and page JSON.

        Raises:
            DependencyMissingError: If PaddleOCR is not installed.
            LocalProcessingError: If prediction fails or returns an unexpected shape.
        """
        pipeline = self._load_pipeline()
        raw_results = self._predict(pipeline, Path(path))
        markdown_pages, pages = self._normalize_results(raw_results)
        return ParseResult(
            markdown=self._build_markdown(pipeline, markdown_pages),
            pages=pages,
        )

    def _load_pipeline(self) -> Any:
        """Return the cached pipeline, building a PPStructureV3 one on first use."""
        if self._pipeline is not None:
            return self._pipeline
        os.environ.setdefault("PADDLE_PDX_ENABLE_MKLDNN_BYDEFAULT", "0")
        try:
            from paddleocr import PPStructureV3
        except ImportError as exc:  # pragma: no cover - depends on env
            # ModuleNotFoundError is a subclass of ImportError, so this covers both.
            raise DependencyMissingError(LOCAL_DEPS_MESSAGE) from exc
        # Only probe for a device when the caller did not pick one explicitly.
        if "device" not in self.pipeline_options:
            self.pipeline_options["device"] = _detect_device()
        self._device_label = (
            "GPU" if str(self.pipeline_options["device"]).startswith("gpu") else "CPU"
        )
        print(
            f"DocSlight local PP-StructureV3 inference device: {self._device_label}",
            file=sys.stderr,
        )
        self._pipeline = PPStructureV3(**self.pipeline_options)
        return self._pipeline

    def _predict(self, pipeline: Any, path: Path) -> list[Any]:
        """Run ``pipeline.predict`` and return its results as a list.

        Lazy iterables (for example generators) are materialised inside the
        guarded region so that errors raised while iterating are reported as
        ``LocalProcessingError`` as well.
        """
        try:
            raw_results = pipeline.predict(input=str(path))
            if (
                raw_results is not None
                and not isinstance(raw_results, (list, str, bytes, dict))
                and isinstance(raw_results, Iterable)
            ):
                raw_results = list(raw_results)
        except LocalProcessingError:
            raise
        except Exception as exc:  # noqa: BLE001
            raise LocalProcessingError(f"Local PaddleOCR parsing failed: {exc}") from exc
        if raw_results is None:
            return []
        if isinstance(raw_results, list):
            return raw_results
        # Strings, bytes, dicts and non-iterables are not a valid result set.
        raise LocalProcessingError("Unexpected PP-StructureV3 result format")

    def _normalize_results(self, results: list[Any]) -> tuple[list[Any], list[dict[str, Any]]]:
        """Split raw results into per-page markdown objects and page payload dicts."""
        markdown_pages: list[Any] = []
        pages: list[dict[str, Any]] = []
        for result in results:
            if result is None:
                continue
            if not hasattr(result, "markdown") or not hasattr(result, "json"):
                raise LocalProcessingError("Unexpected PP-StructureV3 result format")
            markdown_pages.append(result.markdown)
            pages.append(self._normalize_page_payload(result.json))
        return markdown_pages, pages

    def _normalize_page_payload(self, payload: Any) -> dict[str, Any]:
        """Return the page dict, calling ``payload`` if callable and unwrapping ``"res"``."""
        if callable(payload):
            payload = payload()
        if not isinstance(payload, dict):
            raise LocalProcessingError("Unexpected PP-StructureV3 result format")
        data = payload.get("res", payload)
        if not isinstance(data, dict):
            raise LocalProcessingError("Unexpected PP-StructureV3 result format")
        return data

    def _build_markdown(self, pipeline: Any, markdown_pages: list[Any]) -> str:
        """Combine per-page markdown into one string.

        A single page is converted directly; multiple pages are merged by the
        pipeline's ``concatenate_markdown_pages`` before conversion.
        """
        if not markdown_pages:
            return ""
        if len(markdown_pages) == 1:
            return self._markdown_to_text(markdown_pages[0])
        combined_markdown = pipeline.concatenate_markdown_pages(markdown_pages)
        return self._markdown_to_text(combined_markdown)

    def _markdown_to_text(self, markdown: Any) -> str:
        """Extract the text from a markdown value (``None``, str, or dict)."""
        if markdown is None:
            return ""
        if isinstance(markdown, str):
            return markdown
        if isinstance(markdown, dict):
            # Prefer "markdown_texts", fall back to "markdown".
            text = markdown.get("markdown_texts", markdown.get("markdown", ""))
            if isinstance(text, str):
                return text
        raise LocalProcessingError("Unexpected PP-StructureV3 result format")
154
+
155
+
156
+ def _detect_device() -> str:
157
+ try:
158
+ import paddle
159
+ except ModuleNotFoundError:
160
+ return "cpu"
161
+
162
+ try:
163
+ device = getattr(paddle, "device", None)
164
+ cuda = getattr(device, "cuda", None) if device is not None else None
165
+ if cuda is None or cuda.device_count() <= 0:
166
+ return "cpu"
167
+ if device is not None and device.is_compiled_with_cuda():
168
+ return "gpu"
169
+ if hasattr(paddle, "is_compiled_with_cuda") and paddle.is_compiled_with_cuda():
170
+ return "gpu"
171
+ except Exception:
172
+ return "cpu"
173
+ return "cpu"
@@ -0,0 +1,213 @@
1
+ """Local document parsing pipeline."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import importlib
6
+ import inspect
7
+ import io
8
+ import json
9
+ import zipfile
10
+ from dataclasses import replace
11
+ from pathlib import Path
12
+ from typing import Any, cast
13
+
14
+ from docslight.config import DocSlightConfig
15
+ from docslight.exceptions import ConfigurationError, UnsupportedFormatError
16
+ from docslight.local.layout_blocks import build_layout_blocks
17
+ from docslight.local.loaders import OFFICE_EXTENSIONS, RASTER_EXTENSIONS, FileLoader
18
+ from docslight.local.markdown import MarkdownBuilder
19
+ from docslight.local.office_loader import LEGACY_OFFICE_EXTENSIONS, OfficeMarkdownLoader
20
+ from docslight.local.paddle_parser import PaddleOCRParser
21
+ from docslight.result import ExtractResult, ParseResult
22
+
23
# Names that LocalPipeline.extract always passes to the LLM extractor itself;
# they are filtered out of caller-supplied options before forwarding.
FIXED_LLM_PARAMETERS = {
    "markdown",
    "fields",
    "schema",
    "document_types",
}
24
+
25
+
26
def _cloud_extract_result(data: dict[str, Any]) -> dict[str, Any]:
    """Regroup flat extraction output into a ``{"Page_N": {...}}`` mapping.

    Data that is already keyed entirely by ``Page_*`` is returned unchanged.
    Otherwise each field is placed under the page derived from its value, and
    each table under the page derived from its ``_table_bboxes`` entry. An
    empty result becomes ``{"Page_1": {}}``.
    """
    if _is_page_grouped_result(data):
        return data

    raw_bboxes = data.get("_table_bboxes")
    table_bboxes = raw_bboxes if isinstance(raw_bboxes, dict) else {}
    pages: dict[str, dict[str, Any]] = {}
    for name, entry in data.items():
        if name == "_table_bboxes":
            # Helper metadata only; never copied into the output.
            continue
        if name != "tables" or not isinstance(entry, dict):
            pages.setdefault(_page_key(entry), {})[name] = _cloud_extract_value(entry)
            continue
        for table_name, rows in entry.items():
            bucket = pages.setdefault(_page_key(table_bboxes.get(table_name)), {})
            page_tables = bucket.setdefault("tables", {})
            # Guard against a non-dict already stored under "tables" on this page.
            if isinstance(page_tables, dict):
                page_tables[table_name] = rows
    return pages if pages else {"Page_1": {}}
46
+
47
+
48
+ def _is_page_grouped_result(data: dict[str, Any]) -> bool:
49
+ return bool(data) and all(isinstance(key, str) and key.startswith("Page_") for key in data)
50
+
51
+
52
+ def _page_key(value: Any) -> str:
53
+ if isinstance(value, dict):
54
+ page_id = value.get("page_id") or value.get("page")
55
+ if page_id is not None:
56
+ return f"Page_{page_id}"
57
+ bboxes = value.get("bboxes") or value.get("bbox")
58
+ if isinstance(bboxes, list):
59
+ candidates = bboxes if bboxes and isinstance(bboxes[0], dict) else [value]
60
+ for candidate in candidates:
61
+ if isinstance(candidate, dict):
62
+ page_id = candidate.get("page_id") or candidate.get("page")
63
+ if page_id is not None:
64
+ return f"Page_{page_id}"
65
+ return "Page_1"
66
+
67
+
68
+ def _cloud_extract_value(value: Any) -> Any:
69
+ if isinstance(value, dict) and "value" in value:
70
+ return "" if value["value"] is None else value["value"]
71
+ return "" if value is None else value
72
+
73
+
74
def _with_local_parse_outputs(result: ParseResult) -> ParseResult:
    """Return a copy of ``result`` with ``raw_response`` and ``raw_archive`` filled in.

    ``raw_response`` is the result's standard JSON; ``raw_archive`` is a ZIP
    built from the markdown and that same JSON.
    """
    standard_payload = result.to_standard_json()
    archive_bytes = _build_parse_archive(result.markdown, standard_payload)
    return replace(result, raw_response=standard_payload, raw_archive=archive_bytes)
81
+
82
+
83
+ def _build_parse_archive(markdown: str, payload: dict[str, Any]) -> bytes:
84
+ buffer = io.BytesIO()
85
+ with zipfile.ZipFile(buffer, "w", compression=zipfile.ZIP_DEFLATED) as archive:
86
+ archive.writestr("result.md", markdown)
87
+ archive.writestr(
88
+ "result.json",
89
+ json.dumps(payload, ensure_ascii=False, indent=2),
90
+ )
91
+ return buffer.getvalue()
92
+
93
+
94
def _build_llm_extractor(config: dict[str, Any]) -> Any:
    """Build the local LLM extractor, importing its module only on demand.

    The ``docslight.local.llm_extractor`` module is resolved at call time, so
    importing this pipeline module does not require it.
    """
    extractor_module = importlib.import_module("docslight.local.llm_extractor")
    extractor_cls = extractor_module.LocalLLMExtractor
    provider = extractor_module.provider_from_config(config)
    return extractor_cls(provider)
98
+
99
+
100
class LocalPipeline:
    """Parse and extract documents using local components only."""

    def __init__(
        self,
        loader: Any | None = None,
        parser: Any | None = None,
        office_loader: Any | None = None,
        markdown_builder: MarkdownBuilder | None = None,
        llm_extractor: Any | None = None,
    ) -> None:
        """Wire up collaborators, creating a default for each one not supplied.

        ``llm_extractor`` has no default; without it ``extract`` is unavailable.
        """
        self.loader = loader if loader else FileLoader()
        self.parser = parser if parser else PaddleOCRParser()
        self.office_loader = office_loader if office_loader else OfficeMarkdownLoader()
        self.markdown_builder = markdown_builder if markdown_builder else MarkdownBuilder()
        self.llm_extractor = llm_extractor

    @classmethod
    def from_config(cls, config: DocSlightConfig) -> LocalPipeline:
        """Build a local pipeline from SDK configuration.

        Raises:
            ConfigurationError: If ``local_llm`` is configured but the extractor
                module itself cannot be found.
        """
        if not config.local_llm:
            return cls(llm_extractor=None)
        try:
            extractor = _build_llm_extractor(config.local_llm)
        except ModuleNotFoundError as exc:
            # Only a missing extractor module is a configuration problem;
            # any other missing module propagates unchanged.
            if exc.name != "docslight.local.llm_extractor":
                raise
            raise ConfigurationError("local_llm support is not available") from exc
        return cls(llm_extractor=extractor)

    def parse(self, path: Path | str, **options: Any) -> ParseResult:
        """Parse a local document into Markdown.

        Office files go through the office loader and raster formats through
        the OCR parser; ``options`` are merged into the result metadata.

        Raises:
            UnsupportedFormatError: For legacy Office or unrecognised suffixes.
        """
        source = Path(path)
        extension = source.suffix.lower()
        if extension in LEGACY_OFFICE_EXTENSIONS:
            raise UnsupportedFormatError("Legacy Office files must convert to DOCX, PPTX, or XLSX")
        if extension in OFFICE_EXTENSIONS:
            return self._parse_office(source, options)
        if extension not in RASTER_EXTENSIONS:
            raise UnsupportedFormatError(f"Unsupported local format: {extension or source.name}")
        return self._parse_raster(source, extension, options)

    def extract(
        self,
        path: Path | str,
        fields: list[str] | dict[str, Any] | None = None,
        schema: dict[str, Any] | None = None,
        document_types: list[str] | None = None,
        **options: Any,
    ) -> ExtractResult:
        """Extract structured data from a local document.

        The document is parsed first; its Markdown, plus layout blocks when
        any are available, is handed to the configured LLM extractor.

        Raises:
            ConfigurationError: If no LLM extractor was configured.
        """
        if self.llm_extractor is None:
            raise ConfigurationError("local_llm must be configured")
        extractor = self.llm_extractor

        parsed = self.parse(path, **options)
        forwarded = dict(options)
        blocks = build_layout_blocks(parsed.pages)
        if blocks:
            forwarded["layout_blocks"] = blocks
        accepted_options = self._supported_llm_options(extractor, forwarded)

        raw_result = extractor.extract(
            parsed.markdown,
            fields=fields,
            schema=schema,
            document_types=document_types,
            **accepted_options,
        )
        extracted = cast(ExtractResult, raw_result)
        page_grouped = _cloud_extract_result(extracted.data)
        # Parse metadata wins over extractor metadata on key collisions.
        combined_metadata = {**extracted.metadata, **parsed.metadata}
        return replace(extracted, data=page_grouped, metadata=combined_metadata)

    def _parse_raster(self, path: Path, suffix: str, options: dict[str, Any]) -> ParseResult:
        """Parse a PDF or image through the OCR parser and attach metadata."""
        parsed = self.parser.parse(path)
        base = {
            "engine": "ppstructurev3-local",
            "mode": "local",
            "document_type": "pdf" if suffix == ".pdf" else "image",
            "page_count": len(parsed.pages),
        }
        extras = {**parsed.metadata, **options}
        enriched = replace(parsed, metadata=self._merge_metadata(base, extras))
        return _with_local_parse_outputs(enriched)

    def _parse_office(self, path: Path, options: dict[str, Any]) -> ParseResult:
        """Parse an Office file through the office loader and attach metadata."""
        document = self.office_loader.load(path)
        base = {"mode": "local", **document.metadata}
        parse_result = ParseResult(
            markdown=document.markdown,
            pages=[],
            metadata=self._merge_metadata(base, options),
        )
        return _with_local_parse_outputs(parse_result)

    def _merge_metadata(
        self,
        base_metadata: dict[str, Any],
        options: dict[str, Any],
    ) -> dict[str, Any]:
        """Return a copy of ``base_metadata`` plus any option keys it lacks."""
        merged = dict(base_metadata)
        for key, value in options.items():
            # Existing base keys are never overwritten by options.
            merged.setdefault(key, value)
        return merged

    def _supported_llm_options(self, llm_extractor: Any, options: dict[str, Any]) -> dict[str, Any]:
        """Filter ``options`` down to what ``llm_extractor.extract`` can accept.

        Fixed parameters are always dropped. If ``extract`` declares ``**kwargs``
        everything else is kept; otherwise only explicitly named parameters are.
        """
        forwardable = {
            key: value for key, value in options.items() if key not in FIXED_LLM_PARAMETERS
        }
        accepted = inspect.signature(llm_extractor.extract).parameters
        for parameter in accepted.values():
            if parameter.kind == inspect.Parameter.VAR_KEYWORD:
                return forwardable
        return {key: value for key, value in forwardable.items() if key in accepted}
docslight/preview.py ADDED
@@ -0,0 +1,46 @@
1
+ """Preview rendering helpers for the local Web UI."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import base64
6
+ from pathlib import Path
7
+ from typing import Any
8
+
9
+ from docslight.exceptions import DependencyMissingError, LocalProcessingError
10
+
11
# Raised (via DependencyMissingError) when the PDF rendering library is absent.
PDF_PREVIEW_DEPENDENCY_MESSAGE = "Install docslight-lite[local] to enable PDF preview rendering."
14
+
15
+
16
def render_pdf_preview(path: Path, max_pages: int | None = None) -> dict[str, Any]:
    """Render PDF pages to PNG data URLs for browser overlay highlighting.

    Args:
        path: PDF file to render.
        max_pages: Upper bound on rendered pages; ``None`` renders every page.

    Returns:
        ``{"kind": "pdf", "pages": [...]}`` where each page entry carries a
        1-based ``page_id``, 0-based ``page_index``, the page ``width`` and
        ``height``, and a base64 PNG data URL under ``image``.

    Raises:
        DependencyMissingError: If the ``fitz`` module is not installed.
        LocalProcessingError: If opening or rendering the PDF fails.
    """
    try:
        import fitz  # type: ignore[import-not-found]
    except ModuleNotFoundError as exc:
        raise DependencyMissingError(PDF_PREVIEW_DEPENDENCY_MESSAGE) from exc

    rendered: list[dict[str, Any]] = []
    try:
        with fitz.open(path) as document:
            total = len(document)
            limit = total if max_pages is None else min(total, max_pages)
            for index in range(limit):
                page = document.load_page(index)
                # Identity matrix: render at the page's native scale, no alpha.
                pixmap = page.get_pixmap(matrix=fitz.Matrix(1, 1), alpha=False)
                encoded = base64.b64encode(pixmap.tobytes("png")).decode("ascii")
                bounds = page.rect
                entry = {
                    "page_id": index + 1,
                    "page_index": index,
                    "width": float(bounds.width),
                    "height": float(bounds.height),
                    "image": f"data:image/png;base64,{encoded}",
                }
                rendered.append(entry)
    except DependencyMissingError:
        raise
    except Exception as exc:
        raise LocalProcessingError("PDF preview rendering failed") from exc

    return {"kind": "pdf", "pages": rendered}
@@ -0,0 +1,6 @@
1
+ """Provider integrations for document parsing workflows."""
2
+
3
+ from docslight.providers.ollama import OllamaProvider
4
+ from docslight.providers.openai_compatible import OpenAICompatibleProvider
5
+
6
+ __all__ = ["OllamaProvider", "OpenAICompatibleProvider"]
@@ -0,0 +1,30 @@
1
+ """Ollama local chat completion provider."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from docslight.providers.openai_compatible import OpenAICompatibleProvider
6
+
7
+
8
class OllamaProvider(OpenAICompatibleProvider):
    """OpenAI-compatible provider preconfigured for a local Ollama server."""

    def __init__(
        self,
        model: str,
        base_url: str = "http://localhost:11434",
        api_key: str = "ollama",
        timeout: float = 120.0,
        extra_body: dict[str, object] | None = None,
    ) -> None:
        """Configure the provider, pointing it at Ollama's ``/v1`` endpoint.

        Args:
            model: Model name sent with each chat completion request.
            base_url: Ollama server URL; ``/v1`` is appended when absent.
            api_key: Key passed to the client; defaults to ``"ollama"``.
            timeout: Request timeout in seconds.
            extra_body: Optional extra request fields, forwarded to the parent
                provider so Ollama users get the same passthrough it offers.
        """
        super().__init__(
            model=model,
            base_url=_normalize_ollama_base_url(base_url),
            api_key=api_key,
            timeout=timeout,
            extra_body=extra_body,
        )
24
+
25
+
26
+ def _normalize_ollama_base_url(base_url: str) -> str:
27
+ normalized = base_url.rstrip("/")
28
+ if normalized.endswith("/v1"):
29
+ return normalized
30
+ return f"{normalized}/v1"
@@ -0,0 +1,64 @@
1
+ """OpenAI-compatible chat completion provider."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any
6
+
7
+ from docslight.exceptions import DependencyMissingError, LocalProcessingError
8
+
9
# Error messages raised by the OpenAI-compatible provider.
INSTALL_LOCAL_LLM_MESSAGE = "Install local LLM dependencies with: pip install 'docslight-lite[local-llm]'"
NO_TEXT_CONTENT_MESSAGE = "OpenAI-compatible provider returned no text content"
REQUEST_FAILED_MESSAGE = "OpenAI-compatible provider request failed"
14
+
15
+
16
class OpenAICompatibleProvider:
    """Provider for OpenAI-compatible chat completion APIs."""

    def __init__(
        self,
        model: str,
        base_url: str,
        api_key: str = "",
        timeout: float = 120.0,
        extra_body: dict[str, Any] | None = None,
    ) -> None:
        """Store connection settings; no client is created until ``complete``.

        Args:
            model: Model name sent with each request.
            base_url: Base URL of the OpenAI-compatible endpoint.
            api_key: API key; may be empty for servers that need none.
            timeout: Request timeout in seconds.
            extra_body: Optional extra fields added to each request body.
        """
        self.model = model
        self.base_url = base_url
        self.api_key = api_key
        self.timeout = timeout
        self.extra_body = extra_body or {}

    def complete(self, messages: list[dict[str, str]]) -> str:
        """Return chat completion content from an OpenAI-compatible endpoint.

        Args:
            messages: Chat messages in the OpenAI ``role``/``content`` format.

        Returns:
            The text content of the first choice.

        Raises:
            DependencyMissingError: If the ``openai`` package is not installed.
            LocalProcessingError: If the request fails or yields no text content.
        """
        try:
            from openai import OpenAI
        except ImportError as exc:
            raise DependencyMissingError(INSTALL_LOCAL_LLM_MESSAGE) from exc

        request_kwargs: dict[str, Any] = {
            "model": self.model,
            "messages": messages,
            "temperature": 0,
        }
        if self.extra_body:
            request_kwargs["extra_body"] = self.extra_body

        # NOTE(review): an empty key is believed to produce an
        # "Authorization: Bearer " header that the HTTP layer rejects; a
        # placeholder keeps key-less local servers usable. Confirm against the
        # pinned openai/httpx versions.
        client_key = self.api_key or "not-needed"
        # The context manager closes the client's HTTP connections on exit, so
        # per-call clients do not accumulate open connections.
        with OpenAI(
            api_key=client_key,
            base_url=self.base_url,
            timeout=self.timeout,
        ) as client:
            try:
                response = client.chat.completions.create(**request_kwargs)
            except Exception as exc:
                raise LocalProcessingError(REQUEST_FAILED_MESSAGE) from exc

        try:
            content: Any = response.choices[0].message.content
        except (AttributeError, IndexError, TypeError) as exc:
            raise LocalProcessingError(NO_TEXT_CONTENT_MESSAGE) from exc
        if not isinstance(content, str):
            raise LocalProcessingError(NO_TEXT_CONTENT_MESSAGE)
        return content