docslight-lite 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docslight/__init__.py +41 -0
- docslight/cli.py +215 -0
- docslight/client.py +92 -0
- docslight/cloud/__init__.py +5 -0
- docslight/cloud/client.py +622 -0
- docslight/config.py +117 -0
- docslight/exceptions.py +65 -0
- docslight/local/__init__.py +31 -0
- docslight/local/layout_blocks.py +80 -0
- docslight/local/llm_extractor.py +252 -0
- docslight/local/loaders.py +95 -0
- docslight/local/markdown.py +18 -0
- docslight/local/office_loader.py +128 -0
- docslight/local/paddle_parser.py +173 -0
- docslight/local/pipeline.py +213 -0
- docslight/preview.py +46 -0
- docslight/providers/__init__.py +6 -0
- docslight/providers/ollama.py +30 -0
- docslight/providers/openai_compatible.py +64 -0
- docslight/result.py +89 -0
- docslight/schemas/__init__.py +5 -0
- docslight/schemas/fields.py +190 -0
- docslight/standard_json.py +367 -0
- docslight/static/app/common.js +668 -0
- docslight/static/app/docslight-extract.json +307 -0
- docslight/static/app/extract.js +394 -0
- docslight/static/app/i18n.js +405 -0
- docslight/static/app/parse.js +161 -0
- docslight/static/styles.css +878 -0
- docslight/templates/base.html +36 -0
- docslight/templates/extract.html +123 -0
- docslight/templates/parse.html +81 -0
- docslight/web_app.py +372 -0
- docslight_lite-0.1.0.dist-info/METADATA +277 -0
- docslight_lite-0.1.0.dist-info/RECORD +39 -0
- docslight_lite-0.1.0.dist-info/WHEEL +5 -0
- docslight_lite-0.1.0.dist-info/entry_points.txt +2 -0
- docslight_lite-0.1.0.dist-info/licenses/LICENSE +21 -0
- docslight_lite-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,128 @@
|
|
|
1
|
+
"""Office document loading as Markdown."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
from docslight.exceptions import DependencyMissingError, UnsupportedFormatError
|
|
9
|
+
from docslight.local.loaders import LOCAL_DEPS_MESSAGE, LoadedTextDocument
|
|
10
|
+
|
|
11
|
+
# Legacy binary Office suffixes. OfficeMarkdownLoader.load() rejects these with
# UnsupportedFormatError instead of trying to read them.
LEGACY_OFFICE_EXTENSIONS = {".doc", ".ppt", ".xls"}
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def load_workbook(path: Path, **kwargs: Any) -> Any:
    """Open a workbook through openpyxl, importing the library on demand.

    Args:
        path: Location of the workbook file.
        **kwargs: Forwarded unchanged to ``openpyxl.load_workbook``.

    Returns:
        Whatever ``openpyxl.load_workbook`` returns for ``path``.

    Raises:
        DependencyMissingError: If the ``openpyxl`` module cannot be found.
    """
    try:
        from openpyxl import load_workbook as _open_workbook
    except ModuleNotFoundError as exc:  # pragma: no cover - depends on environment
        raise DependencyMissingError(LOCAL_DEPS_MESSAGE) from exc
    else:
        return _open_workbook(path, **kwargs)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class OfficeMarkdownLoader:
    """Load modern Office files (DOCX, PPTX, XLSX) into basic Markdown."""

    def load(self, path: Path | str) -> LoadedTextDocument:
        """Load a DOCX, PPTX, or XLSX file into Markdown.

        The loader is chosen from the lower-cased file suffix.

        Args:
            path: File to read.

        Returns:
            The Markdown text plus format-specific metadata.

        Raises:
            UnsupportedFormatError: For legacy Office suffixes and for any
                suffix other than ``.docx``, ``.pptx`` or ``.xlsx``.
            DependencyMissingError: If the library for the format is missing.
        """
        source_path = Path(path)
        suffix = source_path.suffix.lower()
        if suffix in LEGACY_OFFICE_EXTENSIONS:
            raise UnsupportedFormatError("Legacy Office files must convert to DOCX, PPTX, or XLSX")
        if suffix == ".docx":
            return self._load_docx(source_path)
        if suffix == ".pptx":
            return self._load_pptx(source_path)
        if suffix == ".xlsx":
            return self._load_xlsx(source_path)
        # Files without a suffix are reported by name so the message is not empty.
        raise UnsupportedFormatError(f"Unsupported Office format: {suffix or source_path.name}")

    def _load_docx(self, path: Path) -> LoadedTextDocument:
        """Convert a DOCX file: non-blank paragraphs first, then each table."""
        try:
            from docx import Document
        except ModuleNotFoundError as exc:  # pragma: no cover - depends on environment
            raise DependencyMissingError(LOCAL_DEPS_MESSAGE) from exc

        document = Document(str(path))
        parts = [paragraph.text for paragraph in document.paragraphs if paragraph.text.strip()]
        # Tables are appended after all paragraphs; rows with no text are dropped.
        for table in document.tables:
            rows: list[list[str]] = []
            for row in table.rows:
                cells = [cell.text.strip() for cell in row.cells]
                if any(cells):
                    rows.append(cells)
            if rows:
                parts.append(self._rows_to_markdown(rows))
        return LoadedTextDocument(
            markdown="\n\n".join(parts),
            metadata={
                "document_type": "docx",
                # A DOCX is always reported as a single page.
                "page_count": 1,
                # Counts every paragraph, including blank ones skipped above.
                "paragraph_count": len(document.paragraphs),
                "table_count": len(document.tables),
            },
        )

    def _load_pptx(self, path: Path) -> LoadedTextDocument:
        """Convert a PPTX file into one ``# Slide N`` section per slide."""
        try:
            from pptx import Presentation
        except ModuleNotFoundError as exc:  # pragma: no cover - depends on environment
            raise DependencyMissingError(LOCAL_DEPS_MESSAGE) from exc

        presentation = Presentation(str(path))
        slide_parts: list[str] = []
        for index, slide in enumerate(presentation.slides, start=1):
            texts: list[str] = []
            for shape in slide.shapes:
                # Shapes without a ``text`` attribute contribute nothing.
                text = getattr(shape, "text", "")
                if text.strip():
                    texts.append(text.strip())
            slide_parts.append("\n\n".join([f"# Slide {index}", *texts]))
        return LoadedTextDocument(
            markdown="\n\n".join(slide_parts),
            metadata={
                "document_type": "pptx",
                "page_count": len(presentation.slides),
                "slide_count": len(presentation.slides),
            },
        )

    def _load_xlsx(self, path: Path) -> LoadedTextDocument:
        """Convert an XLSX file into one ``## Sheet: <title>`` table per sheet."""
        workbook = load_workbook(path, data_only=True, read_only=True)
        sheets: list[str] = []
        try:
            for worksheet in workbook.worksheets:
                rows = [self._format_row(row) for row in worksheet.iter_rows(values_only=True)]
                # _format_row returns [] for fully empty rows; drop those.
                rows = [row for row in rows if row]
                sheets.append(
                    "\n".join([f"## Sheet: {worksheet.title}", self._rows_to_markdown(rows)])
                )
            return LoadedTextDocument(
                markdown="\n\n".join(sheets),
                metadata={
                    "document_type": "xlsx",
                    "page_count": len(workbook.worksheets),
                    "sheet_count": len(workbook.worksheets),
                    "sheet_names": [worksheet.title for worksheet in workbook.worksheets],
                },
            )
        finally:
            # Close the workbook even when reading a sheet fails.
            workbook.close()

    def _format_row(self, row: tuple[Any, ...]) -> list[str]:
        """Stringify a row of cell values; return ``[]`` when every cell is empty."""
        values = ["" if value is None else str(value) for value in row]
        if not any(values):
            return []
        return values

    def _rows_to_markdown(self, rows: list[list[str]]) -> str:
        """Render rows as a Markdown table, using the first row as the header.

        Rows shorter than the widest row are padded with empty cells so that
        header, separator and body all have the same number of columns.
        Without this, a short first row would give a narrow header and the
        extra cells of wider rows would be dropped by Markdown renderers.

        Returns:
            The table text, or ``""`` when ``rows`` is empty.
        """
        if not rows:
            return ""
        width = max(len(row) for row in rows)
        escaped_rows = [
            [self._escape_table_cell(cell) for cell in row] + [""] * (width - len(row))
            for row in rows
        ]
        header, *body = escaped_rows
        separator = ["---"] * width
        markdown_rows = [header, separator, *body]
        return "\n".join("| " + " | ".join(row) + " |" for row in markdown_rows)

    def _escape_table_cell(self, value: str) -> str:
        """Make a cell safe for a one-line table row.

        Line breaks become spaces and ``|`` is backslash-escaped.
        """
        return value.replace("\n", " ").replace("\r", " ").replace("|", "\\|")
|
|
@@ -0,0 +1,173 @@
|
|
|
1
|
+
"""PaddleOCR PP-StructureV3 parser adapter for local files."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import os
|
|
6
|
+
import sys
|
|
7
|
+
from collections.abc import Iterable
|
|
8
|
+
from dataclasses import dataclass
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
from typing import Any
|
|
11
|
+
|
|
12
|
+
from docslight.exceptions import DependencyMissingError, LocalProcessingError
|
|
13
|
+
from docslight.local.loaders import LOCAL_DEPS_MESSAGE
|
|
14
|
+
from docslight.result import ParseResult
|
|
15
|
+
|
|
16
|
+
# Baseline keyword arguments for PPStructureV3. PaddleOCRParser merges caller
# options over these, so any key can be overridden per parser instance.
# Disabled option kept for reference: layout_detection_model_name="PP-DocLayout-M".
DEFAULT_PPSTRUCTUREV3_OPTIONS: dict[str, Any] = dict(
    use_doc_orientation_classify=False,
    use_doc_unwarping=False,
    use_textline_orientation=False,
    use_formula_recognition=False,
    use_chart_recognition=False,
    use_region_detection=False,
    text_recognition_model_name="PP-OCRv5_mobile_rec",
    text_detection_model_name="PP-OCRv5_mobile_det",
)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
@dataclass(frozen=True)
class OCRLine:
    """A single recognized line of text (immutable)."""

    # Recognized text content.
    text: str
    # Bounding box for the line, or None when not available.
    bbox: list[Any] | None
    # Recognition confidence, or None when not available.
    confidence: float | None

    def to_json(self) -> dict[str, Any]:
        """Return the line as a plain dict with text, bbox and confidence keys."""
        return dict(text=self.text, bbox=self.bbox, confidence=self.confidence)
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
@dataclass(frozen=True)
class OCRPage:
    """The OCR lines recognized on one page (immutable)."""

    # Number identifying the page within its document.
    page_number: int
    # Lines recognized on the page.
    lines: list[OCRLine]

    def to_json(self) -> dict[str, Any]:
        """Return the page as a plain dict, serializing each line via ``to_json``."""
        serialized_lines = [entry.to_json() for entry in self.lines]
        return dict(page_number=self.page_number, lines=serialized_lines)
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
class PaddleOCRParser:
    """Run PP-StructureV3 over local PDF and image files."""

    def __init__(self, pipeline: Any | None = None, **pipeline_options: Any) -> None:
        """Store an optional ready-made pipeline and the merged pipeline options.

        Args:
            pipeline: Pre-built pipeline object. When given, no PPStructureV3
                instance is created.
            **pipeline_options: Overrides layered on top of
                ``DEFAULT_PPSTRUCTUREV3_OPTIONS``.
        """
        self.pipeline_options = {**DEFAULT_PPSTRUCTUREV3_OPTIONS, **pipeline_options}
        self._pipeline: Any = pipeline
        # Set to "GPU" or "CPU" the first time the pipeline is built lazily.
        self._device_label: str | None = None

    def parse(self, path: Path | str) -> ParseResult:
        """Parse a local PDF or image into structured Markdown and page JSON.

        Raises:
            DependencyMissingError: If PaddleOCR cannot be imported.
            LocalProcessingError: If prediction fails or a result has an
                unexpected shape.
        """
        pipeline = self._load_pipeline()
        raw_results = self._predict(pipeline, Path(path))
        markdown_pages, pages = self._normalize_results(raw_results)
        return ParseResult(
            markdown=self._build_markdown(pipeline, markdown_pages),
            pages=pages,
        )

    def _load_pipeline(self) -> Any:
        """Return the cached pipeline, creating a PPStructureV3 on first use."""
        if self._pipeline is not None:
            return self._pipeline
        # Only sets the variable when the user has not already chosen a value.
        os.environ.setdefault("PADDLE_PDX_ENABLE_MKLDNN_BYDEFAULT", "0")
        try:
            from paddleocr import PPStructureV3
        except ImportError as exc:  # pragma: no cover - depends on env
            # ImportError also covers ModuleNotFoundError (its subclass).
            raise DependencyMissingError(LOCAL_DEPS_MESSAGE) from exc
        # Probe the hardware only when the caller did not choose a device.
        # A plain setdefault(..., _detect_device()) would run the probe
        # (importing paddle and querying CUDA) on every call.
        if "device" not in self.pipeline_options:
            self.pipeline_options["device"] = _detect_device()
        self._device_label = (
            "GPU" if str(self.pipeline_options["device"]).startswith("gpu") else "CPU"
        )
        print(
            f"DocSlight local PP-StructureV3 inference device: {self._device_label}",
            file=sys.stderr,
        )
        self._pipeline = PPStructureV3(**self.pipeline_options)
        return self._pipeline

    def _predict(self, pipeline: Any, path: Path) -> list[Any]:
        """Run ``pipeline.predict`` and return its results as a list.

        ``None`` becomes an empty list and other non-string, non-dict
        iterables are materialized.

        Raises:
            LocalProcessingError: If prediction raises or the result is not
                list-like.
        """
        try:
            raw_results = pipeline.predict(input=str(path))
        except LocalProcessingError:
            raise
        except Exception as exc:  # noqa: BLE001
            raise LocalProcessingError(f"Local PaddleOCR parsing failed: {exc}") from exc
        if raw_results is None:
            return []
        if isinstance(raw_results, list):
            return raw_results
        if isinstance(raw_results, Iterable) and not isinstance(raw_results, (str, bytes, dict)):
            return list(raw_results)
        raise LocalProcessingError("Unexpected PP-StructureV3 result format")

    def _normalize_results(self, results: list[Any]) -> tuple[list[Any], list[dict[str, Any]]]:
        """Split results into per-page markdown objects and page dicts.

        ``None`` entries are skipped. Every other entry must expose both
        ``markdown`` and ``json`` attributes.
        """
        markdown_pages: list[Any] = []
        pages: list[dict[str, Any]] = []
        for result in results:
            if result is None:
                continue
            if not hasattr(result, "markdown") or not hasattr(result, "json"):
                raise LocalProcessingError("Unexpected PP-StructureV3 result format")
            markdown_pages.append(result.markdown)
            pages.append(self._normalize_page_payload(result.json))
        return markdown_pages, pages

    def _normalize_page_payload(self, payload: Any) -> dict[str, Any]:
        """Return the page dict from a ``json`` attribute.

        A callable payload is called first. If the resulting dict has a
        ``"res"`` key, its value is used; otherwise the dict itself.
        """
        if callable(payload):
            payload = payload()
        if not isinstance(payload, dict):
            raise LocalProcessingError("Unexpected PP-StructureV3 result format")
        data = payload.get("res", payload)
        if not isinstance(data, dict):
            raise LocalProcessingError("Unexpected PP-StructureV3 result format")
        return data

    def _build_markdown(self, pipeline: Any, markdown_pages: list[Any]) -> str:
        """Combine per-page markdown into one string.

        A single page is converted directly. Several pages are first merged
        with ``pipeline.concatenate_markdown_pages``.
        """
        if not markdown_pages:
            return ""
        if len(markdown_pages) == 1:
            return self._markdown_to_text(markdown_pages[0])
        combined_markdown = pipeline.concatenate_markdown_pages(markdown_pages)
        return self._markdown_to_text(combined_markdown)

    def _markdown_to_text(self, markdown: Any) -> str:
        """Extract text from ``None``, a string, or a markdown dict.

        Dicts are read from ``"markdown_texts"``, falling back to
        ``"markdown"``.

        Raises:
            LocalProcessingError: If no string can be obtained.
        """
        if markdown is None:
            return ""
        if isinstance(markdown, str):
            return markdown
        if isinstance(markdown, dict):
            text = markdown.get("markdown_texts", markdown.get("markdown", ""))
            if isinstance(text, str):
                return text
        raise LocalProcessingError("Unexpected PP-StructureV3 result format")
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
def _detect_device() -> str:
|
|
157
|
+
try:
|
|
158
|
+
import paddle
|
|
159
|
+
except ModuleNotFoundError:
|
|
160
|
+
return "cpu"
|
|
161
|
+
|
|
162
|
+
try:
|
|
163
|
+
device = getattr(paddle, "device", None)
|
|
164
|
+
cuda = getattr(device, "cuda", None) if device is not None else None
|
|
165
|
+
if cuda is None or cuda.device_count() <= 0:
|
|
166
|
+
return "cpu"
|
|
167
|
+
if device is not None and device.is_compiled_with_cuda():
|
|
168
|
+
return "gpu"
|
|
169
|
+
if hasattr(paddle, "is_compiled_with_cuda") and paddle.is_compiled_with_cuda():
|
|
170
|
+
return "gpu"
|
|
171
|
+
except Exception:
|
|
172
|
+
return "cpu"
|
|
173
|
+
return "cpu"
|
|
@@ -0,0 +1,213 @@
|
|
|
1
|
+
"""Local document parsing pipeline."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import importlib
|
|
6
|
+
import inspect
|
|
7
|
+
import io
|
|
8
|
+
import json
|
|
9
|
+
import zipfile
|
|
10
|
+
from dataclasses import replace
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
from typing import Any, cast
|
|
13
|
+
|
|
14
|
+
from docslight.config import DocSlightConfig
|
|
15
|
+
from docslight.exceptions import ConfigurationError, UnsupportedFormatError
|
|
16
|
+
from docslight.local.layout_blocks import build_layout_blocks
|
|
17
|
+
from docslight.local.loaders import OFFICE_EXTENSIONS, RASTER_EXTENSIONS, FileLoader
|
|
18
|
+
from docslight.local.markdown import MarkdownBuilder
|
|
19
|
+
from docslight.local.office_loader import LEGACY_OFFICE_EXTENSIONS, OfficeMarkdownLoader
|
|
20
|
+
from docslight.local.paddle_parser import PaddleOCRParser
|
|
21
|
+
from docslight.result import ExtractResult, ParseResult
|
|
22
|
+
|
|
23
|
+
# Argument names that LocalPipeline.extract() passes to the extractor itself.
# _supported_llm_options() strips them from caller-supplied options so they
# cannot be passed twice.
FIXED_LLM_PARAMETERS = {"markdown", "fields", "schema", "document_types"}
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def _cloud_extract_result(data: dict[str, Any]) -> dict[str, Any]:
    """Regroup a flat extraction payload under ``Page_N`` keys.

    Data that is already page-grouped is returned unchanged. Otherwise each
    field is filed under the page derived from its value. Entries of a
    ``"tables"`` dict are filed under the page derived from the matching
    ``"_table_bboxes"`` entry. The ``"_table_bboxes"`` key itself is dropped.

    Returns:
        A dict keyed by page, or ``{"Page_1": {}}`` when nothing was grouped.
    """
    if _is_page_grouped_result(data):
        return data

    raw_bboxes = data.get("_table_bboxes")
    table_bboxes = raw_bboxes if isinstance(raw_bboxes, dict) else {}
    pages: dict[str, dict[str, Any]] = {}
    for name, payload in data.items():
        if name == "_table_bboxes":
            continue
        if name != "tables" or not isinstance(payload, dict):
            # Ordinary field: file the unwrapped value under its own page.
            target_page = _page_key(payload)
            pages.setdefault(target_page, {})[name] = _cloud_extract_value(payload)
            continue
        for table_name, rows in payload.items():
            target_page = _page_key(table_bboxes.get(table_name))
            bucket = pages.setdefault(target_page, {}).setdefault("tables", {})
            # Skip when a non-dict already occupies the page's "tables" slot.
            if isinstance(bucket, dict):
                bucket[table_name] = rows
    return pages if pages else {"Page_1": {}}
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def _is_page_grouped_result(data: dict[str, Any]) -> bool:
|
|
49
|
+
return bool(data) and all(isinstance(key, str) and key.startswith("Page_") for key in data)
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def _page_key(value: Any) -> str:
|
|
53
|
+
if isinstance(value, dict):
|
|
54
|
+
page_id = value.get("page_id") or value.get("page")
|
|
55
|
+
if page_id is not None:
|
|
56
|
+
return f"Page_{page_id}"
|
|
57
|
+
bboxes = value.get("bboxes") or value.get("bbox")
|
|
58
|
+
if isinstance(bboxes, list):
|
|
59
|
+
candidates = bboxes if bboxes and isinstance(bboxes[0], dict) else [value]
|
|
60
|
+
for candidate in candidates:
|
|
61
|
+
if isinstance(candidate, dict):
|
|
62
|
+
page_id = candidate.get("page_id") or candidate.get("page")
|
|
63
|
+
if page_id is not None:
|
|
64
|
+
return f"Page_{page_id}"
|
|
65
|
+
return "Page_1"
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def _cloud_extract_value(value: Any) -> Any:
|
|
69
|
+
if isinstance(value, dict) and "value" in value:
|
|
70
|
+
return "" if value["value"] is None else value["value"]
|
|
71
|
+
return "" if value is None else value
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def _with_local_parse_outputs(result: ParseResult) -> ParseResult:
    """Return a copy of ``result`` with its raw response and archive filled in.

    The raw response is ``result.to_standard_json()``. The archive is a ZIP
    built from the Markdown and that same payload.
    """
    payload = result.to_standard_json()
    archive_bytes = _build_parse_archive(result.markdown, payload)
    return replace(result, raw_response=payload, raw_archive=archive_bytes)
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def _build_parse_archive(markdown: str, payload: dict[str, Any]) -> bytes:
|
|
84
|
+
buffer = io.BytesIO()
|
|
85
|
+
with zipfile.ZipFile(buffer, "w", compression=zipfile.ZIP_DEFLATED) as archive:
|
|
86
|
+
archive.writestr("result.md", markdown)
|
|
87
|
+
archive.writestr(
|
|
88
|
+
"result.json",
|
|
89
|
+
json.dumps(payload, ensure_ascii=False, indent=2),
|
|
90
|
+
)
|
|
91
|
+
return buffer.getvalue()
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def _build_llm_extractor(config: dict[str, Any]) -> Any:
|
|
95
|
+
"""Build the local LLM extractor while keeping Task 8 dependency lazy."""
|
|
96
|
+
module = importlib.import_module("docslight.local.llm_extractor")
|
|
97
|
+
return module.LocalLLMExtractor(module.provider_from_config(config))
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
class LocalPipeline:
    """Parse and extract documents using local components."""

    def __init__(
        self,
        loader: Any | None = None,
        parser: Any | None = None,
        office_loader: Any | None = None,
        markdown_builder: MarkdownBuilder | None = None,
        llm_extractor: Any | None = None,
    ) -> None:
        """Wire up collaborators, building defaults for those left as ``None``.

        Args:
            loader: File loader; defaults to ``FileLoader()``.
            parser: PDF/image parser; defaults to ``PaddleOCRParser()``.
            office_loader: Office loader; defaults to ``OfficeMarkdownLoader()``.
            markdown_builder: Markdown builder; defaults to ``MarkdownBuilder()``.
            llm_extractor: Extractor used by ``extract``; no default is built.
        """
        # Compare against None rather than relying on truthiness, so that an
        # injected collaborator that happens to be falsy (for example one
        # defining __len__ or __bool__) is not silently replaced.
        self.loader = FileLoader() if loader is None else loader
        self.parser = PaddleOCRParser() if parser is None else parser
        self.office_loader = OfficeMarkdownLoader() if office_loader is None else office_loader
        self.markdown_builder = MarkdownBuilder() if markdown_builder is None else markdown_builder
        self.llm_extractor = llm_extractor

    @classmethod
    def from_config(cls, config: DocSlightConfig) -> LocalPipeline:
        """Build a local pipeline from SDK configuration.

        An LLM extractor is created only when ``config.local_llm`` is set.

        Raises:
            ConfigurationError: If the local LLM extractor module is missing.
            ModuleNotFoundError: If some other module needed by the extractor
                is missing (re-raised unchanged).
        """
        llm_extractor = None
        if config.local_llm:
            try:
                llm_extractor = _build_llm_extractor(config.local_llm)
            except ModuleNotFoundError as exc:
                # Translate only the absence of our own extractor module.
                if exc.name == "docslight.local.llm_extractor":
                    raise ConfigurationError("local_llm support is not available") from exc
                raise
        return cls(llm_extractor=llm_extractor)

    def parse(self, path: Path | str, **options: Any) -> ParseResult:
        """Parse a local document into Markdown.

        Office suffixes go to the office loader and raster suffixes (PDF and
        images) go to the OCR parser. ``options`` are recorded in the result
        metadata without overriding computed keys.

        Raises:
            UnsupportedFormatError: For legacy Office files and unknown suffixes.
        """
        source_path = Path(path)
        suffix = source_path.suffix.lower()
        if suffix in LEGACY_OFFICE_EXTENSIONS:
            raise UnsupportedFormatError("Legacy Office files must convert to DOCX, PPTX, or XLSX")
        if suffix in OFFICE_EXTENSIONS:
            return self._parse_office(source_path, options)
        if suffix in RASTER_EXTENSIONS:
            return self._parse_raster(source_path, suffix, options)
        raise UnsupportedFormatError(f"Unsupported local format: {suffix or source_path.name}")

    def extract(
        self,
        path: Path | str,
        fields: list[str] | dict[str, Any] | None = None,
        schema: dict[str, Any] | None = None,
        document_types: list[str] | None = None,
        **options: Any,
    ) -> ExtractResult:
        """Extract structured data from a local document.

        The document is parsed first, then its Markdown is handed to the LLM
        extractor. ``options`` go to ``parse`` and, after filtering, to the
        extractor. Layout blocks built from the parsed pages are offered as a
        ``layout_blocks`` option when non-empty.

        Returns:
            The extractor's result with data regrouped by page and metadata
            merged with the parse metadata (parse values win on conflict).

        Raises:
            ConfigurationError: If no LLM extractor is configured.
        """
        llm_extractor = self.llm_extractor
        if llm_extractor is None:
            raise ConfigurationError("local_llm must be configured")
        parsed = self.parse(path, **options)
        candidate_options = dict(options)
        layout_blocks = build_layout_blocks(parsed.pages)
        if layout_blocks:
            candidate_options["layout_blocks"] = layout_blocks
        llm_options = self._supported_llm_options(llm_extractor, candidate_options)
        extracted = cast(ExtractResult, llm_extractor.extract(
            parsed.markdown,
            fields=fields,
            schema=schema,
            document_types=document_types,
            **llm_options,
        ))
        return replace(
            extracted,
            data=_cloud_extract_result(extracted.data),
            metadata={**extracted.metadata, **parsed.metadata},
        )

    def _parse_raster(self, path: Path, suffix: str, options: dict[str, Any]) -> ParseResult:
        """Parse a PDF or image with the OCR parser and attach local outputs."""
        parsed = self.parser.parse(path)
        document_type = "pdf" if suffix == ".pdf" else "image"
        # Computed keys take precedence; caller options override parser
        # metadata only among the remaining keys.
        metadata = self._merge_metadata(
            {
                "engine": "ppstructurev3-local",
                "mode": "local",
                "document_type": document_type,
                "page_count": len(parsed.pages),
            },
            {**parsed.metadata, **options},
        )
        result = replace(parsed, metadata=metadata)
        return _with_local_parse_outputs(result)

    def _parse_office(self, path: Path, options: dict[str, Any]) -> ParseResult:
        """Load an Office file as Markdown; the result has no page payloads."""
        document = self.office_loader.load(path)
        metadata = self._merge_metadata({"mode": "local", **document.metadata}, options)
        result = ParseResult(markdown=document.markdown, pages=[], metadata=metadata)
        return _with_local_parse_outputs(result)

    def _merge_metadata(
        self,
        base_metadata: dict[str, Any],
        options: dict[str, Any],
    ) -> dict[str, Any]:
        """Return ``base_metadata`` plus any ``options`` keys it does not define."""
        metadata = dict(base_metadata)
        for key, value in options.items():
            if key not in metadata:
                metadata[key] = value
        return metadata

    def _supported_llm_options(self, llm_extractor: Any, options: dict[str, Any]) -> dict[str, Any]:
        """Filter ``options`` down to what ``llm_extractor.extract`` accepts.

        Names in ``FIXED_LLM_PARAMETERS`` are always removed. If ``extract``
        takes ``**kwargs`` everything else passes. Otherwise only options
        matching a named parameter are kept.
        """
        candidate_options = {
            key: value for key, value in options.items() if key not in FIXED_LLM_PARAMETERS
        }
        signature = inspect.signature(llm_extractor.extract)
        parameters = signature.parameters
        if any(parameter.kind == inspect.Parameter.VAR_KEYWORD for parameter in parameters.values()):
            return candidate_options
        return {key: value for key, value in candidate_options.items() if key in parameters}
|
docslight/preview.py
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
"""Preview rendering helpers for the local Web UI."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import base64
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Any
|
|
8
|
+
|
|
9
|
+
from docslight.exceptions import DependencyMissingError, LocalProcessingError
|
|
10
|
+
|
|
11
|
+
# Shown when the optional PDF rendering dependency cannot be imported.
PDF_PREVIEW_DEPENDENCY_MESSAGE = "Install docslight-lite[local] to enable PDF preview rendering."
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def render_pdf_preview(path: Path, max_pages: int | None = None) -> dict[str, Any]:
    """Render PDF pages to PNG data URLs for browser overlay highlighting.

    Args:
        path: PDF file to render.
        max_pages: Upper bound on the number of pages rendered; ``None``
            renders every page.

    Returns:
        ``{"kind": "pdf", "pages": [...]}`` where each page entry holds a
        1-based ``page_id``, 0-based ``page_index``, the page ``width`` and
        ``height`` as floats, and the PNG as a base64 data URL in ``image``.

    Raises:
        DependencyMissingError: If the ``fitz`` module is not installed.
        LocalProcessingError: If opening or rendering the PDF fails.
    """
    try:
        import fitz  # type: ignore[import-not-found]
    except ModuleNotFoundError as exc:
        raise DependencyMissingError(PDF_PREVIEW_DEPENDENCY_MESSAGE) from exc

    def _render_page(document: Any, page_index: int) -> dict[str, Any]:
        # Identity matrix and no alpha channel, as in the original render call.
        page = document.load_page(page_index)
        pixmap = page.get_pixmap(matrix=fitz.Matrix(1, 1), alpha=False)
        encoded = base64.b64encode(pixmap.tobytes("png")).decode("ascii")
        bounds = page.rect
        return {
            "page_id": page_index + 1,
            "page_index": page_index,
            "width": float(bounds.width),
            "height": float(bounds.height),
            "image": f"data:image/png;base64,{encoded}",
        }

    rendered: list[dict[str, Any]] = []
    try:
        with fitz.open(path) as document:
            total = len(document)
            limit = total if max_pages is None else min(total, max_pages)
            for page_index in range(limit):
                rendered.append(_render_page(document, page_index))
    except DependencyMissingError:
        raise
    except Exception as exc:
        raise LocalProcessingError("PDF preview rendering failed") from exc

    return {"kind": "pdf", "pages": rendered}
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
"""Ollama local chat completion provider."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from docslight.providers.openai_compatible import OpenAICompatibleProvider
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class OllamaProvider(OpenAICompatibleProvider):
    """OpenAI-compatible provider preconfigured for an Ollama server."""

    def __init__(
        self,
        model: str,
        base_url: str = "http://localhost:11434",
        api_key: str = "ollama",
        timeout: float = 120.0,
    ) -> None:
        """Configure the provider for Ollama.

        Args:
            model: Model name sent with each request.
            base_url: Server address; ``/v1`` is appended when not present.
            api_key: API key value passed to the parent provider.
            timeout: Request timeout passed to the parent provider.
        """
        endpoint = _normalize_ollama_base_url(base_url)
        super().__init__(model=model, base_url=endpoint, api_key=api_key, timeout=timeout)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def _normalize_ollama_base_url(base_url: str) -> str:
|
|
27
|
+
normalized = base_url.rstrip("/")
|
|
28
|
+
if normalized.endswith("/v1"):
|
|
29
|
+
return normalized
|
|
30
|
+
return f"{normalized}/v1"
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
"""OpenAI-compatible chat completion provider."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
from docslight.exceptions import DependencyMissingError, LocalProcessingError
|
|
8
|
+
|
|
9
|
+
# Raised as DependencyMissingError text when the ``openai`` package is missing.
INSTALL_LOCAL_LLM_MESSAGE = "Install local LLM dependencies with: pip install 'docslight-lite[local-llm]'"
# Raised when a response has no usable string content.
NO_TEXT_CONTENT_MESSAGE = "OpenAI-compatible provider returned no text content"
# Raised when the completion request itself fails.
REQUEST_FAILED_MESSAGE = "OpenAI-compatible provider request failed"
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class OpenAICompatibleProvider:
    """Chat-completion provider for any OpenAI-compatible HTTP API."""

    def __init__(
        self,
        model: str,
        base_url: str,
        api_key: str = "",
        timeout: float = 120.0,
        extra_body: dict[str, Any] | None = None,
    ) -> None:
        """Remember the connection settings; no client is created here.

        Args:
            model: Model name sent with each request.
            base_url: Endpoint base URL handed to the OpenAI client.
            api_key: API key handed to the OpenAI client.
            timeout: Timeout handed to the OpenAI client.
            extra_body: Extra request body fields; ``None`` means none.
        """
        self.model = model
        self.base_url = base_url
        self.api_key = api_key
        self.timeout = timeout
        self.extra_body = extra_body if extra_body else {}

    def complete(self, messages: list[dict[str, str]]) -> str:
        """Send ``messages`` and return the first choice's text content.

        Requests use temperature 0. ``extra_body`` is included only when
        non-empty.

        Raises:
            DependencyMissingError: If the ``openai`` package is missing.
            LocalProcessingError: If the request fails or the response
                carries no string content.
        """
        try:
            from openai import OpenAI
        except ImportError as exc:
            raise DependencyMissingError(INSTALL_LOCAL_LLM_MESSAGE) from exc

        client = OpenAI(
            api_key=self.api_key,
            base_url=self.base_url,
            timeout=self.timeout,
        )
        payload: dict[str, Any] = dict(model=self.model, messages=messages, temperature=0)
        if self.extra_body:
            payload["extra_body"] = self.extra_body

        try:
            response = client.chat.completions.create(**payload)
        except Exception as exc:
            raise LocalProcessingError(REQUEST_FAILED_MESSAGE) from exc

        try:
            content: Any = response.choices[0].message.content
        except (AttributeError, IndexError, TypeError) as exc:
            raise LocalProcessingError(NO_TEXT_CONTENT_MESSAGE) from exc

        if isinstance(content, str):
            return content
        raise LocalProcessingError(NO_TEXT_CONTENT_MESSAGE)
|