docslight-lite 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docslight/__init__.py +41 -0
- docslight/cli.py +215 -0
- docslight/client.py +92 -0
- docslight/cloud/__init__.py +5 -0
- docslight/cloud/client.py +622 -0
- docslight/config.py +117 -0
- docslight/exceptions.py +65 -0
- docslight/local/__init__.py +31 -0
- docslight/local/layout_blocks.py +80 -0
- docslight/local/llm_extractor.py +252 -0
- docslight/local/loaders.py +95 -0
- docslight/local/markdown.py +18 -0
- docslight/local/office_loader.py +128 -0
- docslight/local/paddle_parser.py +173 -0
- docslight/local/pipeline.py +213 -0
- docslight/preview.py +46 -0
- docslight/providers/__init__.py +6 -0
- docslight/providers/ollama.py +30 -0
- docslight/providers/openai_compatible.py +64 -0
- docslight/result.py +89 -0
- docslight/schemas/__init__.py +5 -0
- docslight/schemas/fields.py +190 -0
- docslight/standard_json.py +367 -0
- docslight/static/app/common.js +668 -0
- docslight/static/app/docslight-extract.json +307 -0
- docslight/static/app/extract.js +394 -0
- docslight/static/app/i18n.js +405 -0
- docslight/static/app/parse.js +161 -0
- docslight/static/styles.css +878 -0
- docslight/templates/base.html +36 -0
- docslight/templates/extract.html +123 -0
- docslight/templates/parse.html +81 -0
- docslight/web_app.py +372 -0
- docslight_lite-0.1.0.dist-info/METADATA +277 -0
- docslight_lite-0.1.0.dist-info/RECORD +39 -0
- docslight_lite-0.1.0.dist-info/WHEEL +5 -0
- docslight_lite-0.1.0.dist-info/entry_points.txt +2 -0
- docslight_lite-0.1.0.dist-info/licenses/LICENSE +21 -0
- docslight_lite-0.1.0.dist-info/top_level.txt +1 -0
docslight/result.py
ADDED
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
"""Result objects returned by docslight operations."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass, field
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
EXTRACT_DIMENSION_KEYS = ("source_width", "source_height")
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
@dataclass(frozen=True)
class ParseResult:
    """Immutable container for the output of a parse operation."""

    markdown: str
    pages: list[dict[str, Any]] = field(default_factory=list)
    metadata: dict[str, Any] = field(default_factory=dict)
    raw_response: Any = None
    raw_archive: bytes | None = None

    def to_markdown(self) -> str:
        """Return the markdown text held by this result."""
        return self.markdown

    def to_json(self) -> dict[str, Any]:
        """Return the markdown, pages and metadata as a plain dict.

        The raw response and raw archive are not included.
        """
        exported = ("markdown", "pages", "metadata")
        return {name: getattr(self, name) for name in exported}

    def to_standard_json(self) -> dict[str, Any]:
        """Return the ComPDF-style standard parse JSON representation."""
        # Imported at call time, exactly as the original method did.
        from docslight.standard_json import build_standard_parse_json

        parts = {
            "markdown": self.markdown,
            "pages": self.pages,
            "metadata": self.metadata,
        }
        return build_standard_parse_json(**parts)
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
@dataclass(frozen=True)
class ExtractResult:
    """Immutable container for the output of a structured extraction."""

    data: dict[str, Any]
    metadata: dict[str, Any] = field(default_factory=dict)
    raw_response: Any = None

    def __post_init__(self) -> None:
        """Replace ``data`` and ``metadata`` with their normalized forms."""
        normalized = normalize_extract_payload(self.data, self.metadata)
        # The dataclass is frozen, so attributes are written through
        # object.__setattr__ rather than plain assignment.
        for attribute, value in zip(("data", "metadata"), normalized):
            object.__setattr__(self, attribute, value)

    def to_json(self) -> dict[str, Any]:
        """Return a ``{"results": ..., "metadata": ...}`` dict."""
        return dict(results=self.data, metadata=self.metadata)
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def normalize_extract_payload(
    data: dict[str, Any] | None,
    metadata: dict[str, Any] | None = None,
) -> tuple[dict[str, Any], dict[str, Any]]:
    """Split an extract payload into ``(results, metadata)`` dicts.

    Metadata embedded in ``data`` is used as the base and is overridden by
    the explicit ``metadata`` argument. Results come from ``data["results"]``
    when that is a dict, otherwise from every top-level entry except
    ``"message"`` and ``"metadata"``. A nested ``"fields"`` dict is flattened
    into the results, with same-named outer entries taking precedence.
    Dimension keys are moved from the results into the metadata unless the
    metadata already has them, in which case they stay in the results.
    """
    payload = dict(data) if isinstance(data, dict) else {}

    embedded_metadata = payload.get("metadata")
    merged_metadata = dict(embedded_metadata) if isinstance(embedded_metadata, dict) else {}
    if isinstance(metadata, dict):
        merged_metadata.update(metadata)

    nested_results = payload.get("results")
    if isinstance(nested_results, dict):
        results = dict(nested_results)
    else:
        results = {
            name: entry
            for name, entry in payload.items()
            if name not in {"message", "metadata"}
        }

    # A non-dict "fields" value is removed and discarded.
    nested_fields = results.pop("fields", None)
    if isinstance(nested_fields, dict):
        flattened = dict(nested_fields)
        flattened.update(results)
        results = flattened

    for dimension in EXTRACT_DIMENSION_KEYS:
        if dimension not in results or dimension in merged_metadata:
            continue
        merged_metadata[dimension] = results.pop(dimension)

    return results, merged_metadata
|
|
@@ -0,0 +1,190 @@
|
|
|
1
|
+
"""Helpers for extraction field schemas."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
from docslight.exceptions import ConfigurationError
|
|
8
|
+
|
|
9
|
+
StructuredFields = dict[str, Any]
|
|
10
|
+
NormalizedFields = list[str] | StructuredFields | None
|
|
11
|
+
ExtractSchema = dict[str, Any]
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def normalize_fields(fields: list[str] | str | StructuredFields | None) -> NormalizedFields:
    """Normalize extraction fields given as a string, list, object, or None.

    A string is split on commas; list and string entries are stripped and
    empty ones dropped. An empty outcome is reported as ``None``. A dict is
    handed to the structured-fields normalizer.

    Raises:
        ConfigurationError: if ``fields`` has another type or a list entry is
            not a string.
    """
    if fields is None:
        return None
    if isinstance(fields, dict):
        return _normalize_structured_fields(fields)

    type_error = "fields must be a list of strings, comma-separated string, object, or None"
    if isinstance(fields, str):
        candidates = fields.split(",")
    elif isinstance(fields, list):
        candidates = fields
    else:
        raise ConfigurationError(type_error)

    names = []
    for candidate in candidates:
        if not isinstance(candidate, str):
            raise ConfigurationError(type_error)
        cleaned = candidate.strip()
        if cleaned:
            names.append(cleaned)
    return names if names else None
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def build_extract_schema(fields: NormalizedFields) -> ExtractSchema | None:
    """Build a JSON schema describing the expected extract output.

    A list of names yields an open object schema with one loosely typed
    property per name. A dict yields one ``{value, bboxes}`` object property
    per entry of ``fields["keys"]`` plus, when ``fields["tableHeaders"]`` is
    non-empty, a ``tables`` property and a ``_table_bboxes`` property. Any
    other input returns ``None``.
    """

    def any_json_value() -> dict[str, Any]:
        # Fresh dict per call so no two properties share an object.
        return {"type": ["string", "number", "boolean", "null", "object", "array"]}

    def bbox_list() -> dict[str, Any]:
        # Array of {page_id, bbox} objects; bbox has exactly four numbers.
        return {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "page_id": {"type": "number"},
                    "bbox": {
                        "type": "array",
                        "items": {"type": "number"},
                        "minItems": 4,
                        "maxItems": 4,
                    },
                },
                "required": ["page_id", "bbox"],
                "additionalProperties": True,
            },
        }

    if isinstance(fields, list):
        return {
            "type": "object",
            "properties": {name: any_json_value() for name in fields},
            "additionalProperties": True,
        }
    if not isinstance(fields, dict):
        return None

    key_fields = fields.get("keys", {})
    table_fields = fields.get("tableHeaders", {})
    properties: dict[str, Any] = {}
    required: list[str] = []

    for key_name in key_fields:
        properties[key_name] = {
            "type": "object",
            "properties": {
                "value": {"type": ["string", "number", "boolean", "null"]},
                "bboxes": bbox_list(),
            },
            "required": ["value"],
            "additionalProperties": True,
        }
        required.append(key_name)

    table_properties: dict[str, Any] = {}
    for table_name, column_names in table_fields.items():
        table_properties[table_name] = {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {column: any_json_value() for column in column_names},
                "additionalProperties": True,
            },
        }
        # NOTE(review): table names are required at the top level although
        # their schemas are nested under "tables" - confirm this is intended.
        required.append(table_name)

    if table_properties:
        properties["tables"] = {
            "type": "object",
            "properties": table_properties,
            "additionalProperties": False,
        }
        properties["_table_bboxes"] = {
            "type": "object",
            "additionalProperties": bbox_list(),
        }

    return {
        "type": "object",
        "properties": properties,
        "required": required,
        "additionalProperties": True,
    }
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
def _normalize_structured_fields(fields: dict[str, Any]) -> dict[str, Any]:
    """Validate a structured fields object and return its normalized copy.

    The result always carries the stripped ``name`` and includes ``keys``
    and/or ``tableHeaders`` only when the respective group is non-empty.

    Raises:
        ConfigurationError: if ``name`` is missing or blank, or if both
            groups end up empty.
    """
    raw_name = fields.get("name")
    if not (isinstance(raw_name, str) and raw_name.strip()):
        raise ConfigurationError("fields.name is required")

    result: dict[str, Any] = {"name": raw_name.strip()}
    groups = (
        ("keys", _normalize_field_group(fields.get("keys", {}), "fields.keys")),
        ("tableHeaders", _normalize_tables(fields.get("tableHeaders", {}))),
    )
    for group_name, group in groups:
        if group:
            result[group_name] = group

    # Only "name" present means neither group contributed an entry.
    if len(result) == 1:
        raise ConfigurationError("fields must include at least one keys or tableHeaders entry")
    return result
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
def _normalize_tables(value: Any) -> dict[str, dict[str, Any]]:
    """Normalize the ``tableHeaders`` mapping of table name to columns.

    ``None`` and ``""`` count as "no tables". Table names are stripped and
    tables whose normalized column group is empty are left out.

    Raises:
        ConfigurationError: if ``value`` is not a dict or a table name is not
            a non-empty string.
    """
    if value in (None, ""):
        return {}
    if not isinstance(value, dict):
        raise ConfigurationError("fields.tableHeaders must be an object")

    tables: dict[str, dict[str, Any]] = {}
    for raw_name, raw_columns in value.items():
        if not (isinstance(raw_name, str) and raw_name.strip()):
            raise ConfigurationError("fields.tableHeaders table names must be non-empty strings")
        # The error label uses the table name as given, before stripping.
        columns = _normalize_field_group(raw_columns, f"fields.tableHeaders.{raw_name}")
        if not columns:
            continue
        tables[raw_name.strip()] = columns
    return tables
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
def _normalize_field_group(value: Any, label: str) -> dict[str, dict[str, Any]]:
    """Normalize a mapping of field name to field definition.

    ``None`` and ``""`` count as an empty group. Each definition is reduced
    to its ``prompt`` and ``mapping`` entries and stored under the stripped
    field name. ``label`` prefixes every error message.

    Raises:
        ConfigurationError: if ``value`` is not a dict, a field name is not a
            non-empty string, a definition is not a dict, or ``prompt`` is
            neither a string nor ``None``.
    """
    if value in (None, ""):
        return {}
    if not isinstance(value, dict):
        raise ConfigurationError(f"{label} must be an object")

    group: dict[str, dict[str, Any]] = {}
    for raw_name, definition in value.items():
        if not (isinstance(raw_name, str) and raw_name.strip()):
            raise ConfigurationError(f"{label} field names must be non-empty strings")
        if not isinstance(definition, dict):
            raise ConfigurationError(f"{label}.{raw_name} must be an object")
        prompt = definition.get("prompt")
        if not (prompt is None or isinstance(prompt, str)):
            raise ConfigurationError(f"{label}.{raw_name}.prompt must be a string or null")
        group[raw_name.strip()] = dict(prompt=prompt, mapping=definition.get("mapping"))
    return group
|
|
@@ -0,0 +1,367 @@
|
|
|
1
|
+
"""Convert local parse payloads into the ComPDF-style standard JSON shape."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import math
|
|
6
|
+
from html.parser import HTMLParser
|
|
7
|
+
from numbers import Real
|
|
8
|
+
from typing import Any
|
|
9
|
+
|
|
10
|
+
IGNORED_STATUS_LABELS = {
|
|
11
|
+
"aside_text",
|
|
12
|
+
"footer",
|
|
13
|
+
"footer_image",
|
|
14
|
+
"footnote",
|
|
15
|
+
"header",
|
|
16
|
+
"header_image",
|
|
17
|
+
"number",
|
|
18
|
+
}
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def convert_parse_payload(payload: dict[str, Any]) -> dict[str, Any]:
    """Convert a serialized ParseResult payload into standard parse JSON.

    A payload that already looks like the standard shape is returned as is.
    Otherwise ``markdown``, ``pages`` and ``metadata`` are read from the
    payload, each replaced by an empty value when it has the wrong type.
    """
    if _looks_like_standard_payload(payload):
        return payload

    def typed_entry(key: str, expected: type, fallback: Any) -> Any:
        candidate = payload.get(key)
        return candidate if isinstance(candidate, expected) else fallback

    return build_standard_parse_json(
        markdown=typed_entry("markdown", str, ""),
        pages=typed_entry("pages", list, []),
        metadata=typed_entry("metadata", dict, {}),
    )
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def build_standard_parse_json(
    *,
    markdown: str,
    pages: list[dict[str, Any]],
    metadata: dict[str, Any] | None = None,
) -> dict[str, Any]:
    """Assemble the standard parse JSON document from ParseResult parts.

    Every dict entry of ``pages`` is converted; non-dict entries are skipped.
    Paragraph ids continue across pages, starting at 1.
    """
    meta = metadata or {}
    converted_pages: list[dict[str, Any]] = []
    paragraphs: list[dict[str, Any]] = []
    page_metrics: list[dict[str, Any]] = []

    for position, raw_page in enumerate(pages):
        if isinstance(raw_page, dict):
            converted, page_paragraphs, page_metric = _convert_page(
                raw_page,
                position,
                first_paragraph_id=len(paragraphs) + 1,
            )
            converted_pages.append(converted)
            paragraphs.extend(page_paragraphs)
            page_metrics.append(page_metric)

    page_total = len(converted_pages)
    succeeded = len([entry for entry in converted_pages if entry.get("status") == "Success"])

    result = {
        "pages": converted_pages,
        "detail": paragraphs,
        "total_count": page_total,
        "valid_page_number": succeeded,
        "total_page_number": page_total,
        "catalog": {},
        "excel_base64": "",
        "success_count": succeeded,
        "markdown": markdown,
    }
    return {
        "result": result,
        "x_request_id": _string_value(meta.get("x_request_id")),
        "file_type": _file_type(meta),
        "metrics": page_metrics,
        "message": "Success",
        "code": 200,
        "image_process": [],
    }
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def _looks_like_standard_payload(payload: dict[str, Any]) -> bool:
    """Return True when ``payload["result"]`` is a dict with a list under ``"pages"``."""
    result = payload.get("result")
    if not isinstance(result, dict):
        return False
    return isinstance(result.get("pages"), list)
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def _convert_page(
    page: dict[str, Any],
    fallback_page_index: int,
    *,
    first_paragraph_id: int,
) -> tuple[dict[str, Any], list[dict[str, Any]], dict[str, Any]]:
    """Convert one page into its standard page, detail rows and metric entry.

    ``fallback_page_index`` is used when the page has no integer
    ``page_index``. ``first_paragraph_id`` is the paragraph id given to the
    first block of this page; later blocks count up from it.
    """
    resolved_index = _int_value(page.get("page_index"), fallback_page_index)
    page_id = _page_id(page, resolved_index)
    page_width = _number_or_zero(page.get("width"), page.get("page_width"))
    page_height = _number_or_zero(page.get("height"), page.get("page_height"))
    page_angle = _page_angle(page)
    elapsed = _number_or_zero(page.get("durations"), page.get("duration"))
    detected_boxes = _layout_boxes(page)

    structured_items: list[dict[str, Any]] = []
    content_items: list[dict[str, Any]] = []
    paragraphs: list[dict[str, Any]] = []

    for position, block in enumerate(_blocks(page)):
        block_id = _int_value(block.get("block_id"), position)

        # First non-empty string among the label keys wins; default "text".
        label = "text"
        for label_key in ("block_label", "block_type", "type"):
            candidate = _string_value(block.get(label_key))
            if candidate:
                label = candidate
                break

        text = _string_value(block.get("block_content")) or _string_value(block.get("text"))
        # The same quad list object is stored in all three output items.
        quad = _bbox_to_quad(block.get("block_bbox") or block.get("bbox") or block.get("pos"))
        is_table = label == "table"
        status = 1 if label not in IGNORED_STATUS_LABELS else 0
        outline_level = _int_value(block.get("outline_level"), -1)

        structured_item: dict[str, Any] = {
            "pos": quad,
            "id": block_id,
            "content": [block_id],
            "text": text,
            "type": label,
            "outline_level": outline_level,
        }
        if is_table:
            row_count, col_count = _table_dimensions(text)
            structured_item["rows"] = row_count
            structured_item["cols"] = col_count
            structured_item["sub_type"] = "bordered"
        structured_items.append(structured_item)

        content_item: dict[str, Any] = {
            "id": block_id,
            "status": status,
            "pos": quad,
            "type": _content_type(label),
            "text": text,
        }
        block_score = _score_for_block(block, label, detected_boxes)
        if not is_table:
            # Table content items carry neither a score nor an angle.
            content_item["score"] = block_score
            content_item["angle"] = page_angle
        content_items.append(content_item)

        paragraph: dict[str, Any] = {
            "paragraph_id": first_paragraph_id + len(paragraphs),
            "page_id": page_id,
            "status": status,
            "type": _detail_type(label),
            "position": quad,
            "outline_level": outline_level,
            "sub_type": "bordered" if is_table else label,
            "content": 0,
            "text": text,
        }
        if is_table:
            paragraph["caption_id"] = block_id
        else:
            paragraph["tags"] = []
        paragraphs.append(paragraph)

    image_id = _string_value(page.get("image_id"))
    standard_page = {
        "angle": page_angle,
        "page_id": page_id,
        "image_id": image_id,
        "height": page_height,
        "width": page_width,
        "durations": elapsed,
        "structured": structured_items,
        "status": "Success",
        "content": content_items,
    }
    metric = {
        "angle": page_angle,
        "status": "Success",
        "dpi": _int_value(page.get("dpi"), 144),
        "page_id": page_id,
        "image_id": image_id,
        "duration": elapsed,
        "page_image_height": page_height,
        "page_image_width": page_width,
    }
    return standard_page, paragraphs, metric
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
def _blocks(page: dict[str, Any]) -> list[dict[str, Any]]:
    """Return the page's block dicts.

    ``parsing_res_list`` is preferred over ``structured``; the first of the
    two that is a list is used (even when empty) and its non-dict entries
    are dropped. With neither present as a list the result is empty.
    """
    for source_key in ("parsing_res_list", "structured"):
        candidate = page.get(source_key)
        if isinstance(candidate, list):
            return [entry for entry in candidate if isinstance(entry, dict)]
    return []
|
|
187
|
+
|
|
188
|
+
|
|
189
|
+
def _layout_boxes(page: dict[str, Any]) -> list[dict[str, Any]]:
    """Return the dict entries of ``page["layout_det_res"]["boxes"]``.

    An empty list is returned when the layout result is not a dict or its
    ``boxes`` entry is not a list.
    """
    layout = page.get("layout_det_res")
    if not isinstance(layout, dict):
        return []
    boxes = layout.get("boxes")
    if not isinstance(boxes, list):
        return []
    return [entry for entry in boxes if isinstance(entry, dict)]
|
|
195
|
+
|
|
196
|
+
|
|
197
|
+
def _score_for_block(
    block: dict[str, Any],
    label: str,
    layout_boxes: list[dict[str, Any]],
) -> float:
    """Pick a confidence score for ``block``.

    The block's own ``score`` (or 0.0) is the starting value. Among layout
    boxes with the same label, the one with the largest positive IoU against
    the block's box supplies the score when it has a usable one. Without a
    usable block box the starting value is returned.
    """
    own_score = _float_value(block.get("score"), 0.0)
    block_box = _bbox4(block.get("block_bbox") or block.get("bbox") or block.get("pos"))
    if block_box is None:
        return own_score

    chosen_score = own_score
    highest_iou = 0.0
    for layout_box in layout_boxes:
        if layout_box.get("label") != label:
            continue
        layout_coords = _bbox4(layout_box.get("coordinate"))
        if layout_coords is None:
            continue
        current_iou = _iou(block_box, layout_coords)
        # Strict comparison: a zero-overlap box never replaces the score.
        if current_iou > highest_iou:
            highest_iou = current_iou
            chosen_score = _float_value(layout_box.get("score"), chosen_score)
    return chosen_score
|
|
219
|
+
|
|
220
|
+
|
|
221
|
+
def _page_id(page: dict[str, Any], page_index: int) -> int:
    """Return the page's id.

    ``page_id`` is preferred over ``page_number``; only true integers
    (not bools) are accepted. Otherwise ``page_index + 1`` is returned.
    """
    candidates = (page.get("page_id"), page.get("page_number"))
    explicit = next(
        (item for item in candidates if isinstance(item, int) and not isinstance(item, bool)),
        None,
    )
    return page_index + 1 if explicit is None else explicit
|
|
227
|
+
|
|
228
|
+
|
|
229
|
+
def _page_angle(page: dict[str, Any]) -> int:
    """Return the page rotation angle as a non-negative integer.

    The page's own ``angle`` is tried first, then the ``angle`` of a dict
    under ``doc_preprocessor_res``. The first finite, non-bool real number
    decides the result: it is truncated to int, and a negative value gives 0
    without consulting the next candidate. With no usable value the angle
    is 0.
    """
    preprocessor = page.get("doc_preprocessor_res")
    candidates = (
        page.get("angle"),
        preprocessor.get("angle") if isinstance(preprocessor, dict) else None,
    )
    for candidate in candidates:
        if isinstance(candidate, bool) or not isinstance(candidate, Real):
            continue
        if not math.isfinite(candidate):
            continue
        if candidate >= 0:
            return int(candidate)
        return 0
    return 0
|
|
239
|
+
|
|
240
|
+
|
|
241
|
+
def _file_type(metadata: dict[str, Any]) -> str:
    """Return the file type recorded in ``metadata``.

    A non-empty string ``file_type`` is returned unchanged; otherwise the
    string ``document_type`` is returned upper-cased, or ``""`` if absent.
    """
    # Upper-casing an empty string yields "", which covers the missing case.
    return (
        _string_value(metadata.get("file_type"))
        or _string_value(metadata.get("document_type")).upper()
    )
|
|
247
|
+
|
|
248
|
+
|
|
249
|
+
def _content_type(label: str) -> str:
    """Map labels ending in ``_image`` to ``"image"``; keep other labels unchanged."""
    if label.endswith("_image"):
        return "image"
    return label
|
|
251
|
+
|
|
252
|
+
|
|
253
|
+
def _detail_type(label: str) -> str:
    """Classify a block label as ``"table"``, ``"image"`` or ``"paragraph"``.

    Only the exact label ``"table"`` is a table and only labels ending in
    ``_image`` are images; everything else is a paragraph.
    """
    if label == "table":
        return "table"
    return "image" if label.endswith("_image") else "paragraph"
|
|
259
|
+
|
|
260
|
+
|
|
261
|
+
def _bbox_to_quad(value: Any) -> list[int]:
    """Expand a bounding box into an eight-number corner list.

    The corners are emitted as (x1, y1), (x2, y1), (x2, y2), (x1, y2) with
    every coordinate rounded to an integer. An unusable box gives ``[]``.
    """
    box = _bbox4(value)
    if box is None:
        return []
    left, top, right, bottom = map(_rounded_int, box)
    return [left, top, right, top, right, bottom, left, bottom]
|
|
267
|
+
|
|
268
|
+
|
|
269
|
+
def _bbox4(value: Any) -> list[float] | None:
    """Reduce a coordinate list to ``[x_min, y_min, x_max, y_max]`` floats.

    A list of eight or more items is read as four (x, y) points from its
    first eight entries and collapsed to their extremes. A list of four to
    seven items contributes its first four entries unchanged. Anything else,
    or any non-finite or non-numeric coordinate, gives ``None``.
    """
    if not isinstance(value, list) or len(value) < 4:
        return None

    used = 8 if len(value) >= 8 else 4
    coords = [_finite_float(item) for item in value[:used]]
    if any(coord is None for coord in coords):
        return None
    if used == 4:
        return coords

    xs, ys = coords[0::2], coords[1::2]
    return [min(xs), min(ys), max(xs), max(ys)]
|
|
285
|
+
|
|
286
|
+
|
|
287
|
+
def _iou(left: list[float], right: list[float]) -> float:
    """Return the intersection-over-union of two ``[x1, y1, x2, y2]`` boxes.

    Inverted or empty extents count as zero length, and a zero union gives
    0.0.
    """

    def extent(low: float, high: float) -> float:
        return max(0.0, high - low)

    l_x1, l_y1, l_x2, l_y2 = left
    r_x1, r_y1, r_x2, r_y2 = right

    overlap_w = extent(max(l_x1, r_x1), min(l_x2, r_x2))
    overlap_h = extent(max(l_y1, r_y1), min(l_y2, r_y2))
    intersection = overlap_w * overlap_h

    left_area = extent(l_x1, l_x2) * extent(l_y1, l_y2)
    right_area = extent(r_x1, r_x2) * extent(r_y1, r_y2)
    union = left_area + right_area - intersection
    if not union:
        return 0.0
    return intersection / union
|
|
299
|
+
|
|
300
|
+
|
|
301
|
+
def _table_dimensions(html: str) -> tuple[int, int]:
    """Return ``(rows, cols)`` for the table markup in ``html``.

    ``rows`` counts ``<tr>`` start tags; ``cols`` is the widest row, where
    each ``<td>``/``<th>`` contributes its ``colspan`` (default 1).
    """
    parser = _TableDimensionParser()
    parser.feed(html)
    parser.close()
    return parser.rows, parser.cols


class _TableDimensionParser(HTMLParser):
    """Count table rows and the maximum column width while parsing HTML.

    HTMLParser does not synthesize omitted end tags, so a row is treated as
    finished when ``</tr>`` is seen, when the next ``<tr>`` starts, or when
    the parser is closed. This keeps the column count correct for markup
    that leaves out ``</tr>`` or is cut off mid-row.
    """

    def __init__(self) -> None:
        super().__init__()
        self.rows = 0
        self.cols = 0
        self._current_cols = 0
        self._in_row = False

    def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
        """Start a new row on ``<tr>``; add a cell's colspan on ``<td>``/``<th>``."""
        if tag == "tr":
            # Commit a previous row that was never closed explicitly.
            self._finish_row()
            self._in_row = True
            self.rows += 1
        elif self._in_row and tag in {"td", "th"}:
            colspan = dict(attrs).get("colspan")
            self._current_cols += _positive_int(colspan, 1)

    def handle_endtag(self, tag: str) -> None:
        """Commit the current row's width on ``</tr>``."""
        if tag == "tr":
            self._finish_row()

    def close(self) -> None:
        """Flush buffered input, then commit a row left open at end of input."""
        super().close()
        self._finish_row()

    def _finish_row(self) -> None:
        """Fold the current row's width into ``cols`` and reset row state."""
        self.cols = max(self.cols, self._current_cols)
        self._in_row = False
        self._current_cols = 0


def _positive_int(value: Any, default: int) -> int:
    """Return ``int(value)`` when it is a positive integer, else ``default``.

    Values that cannot be converted (wrong type, malformed text, infinite
    floats) and values that are zero or negative all yield ``default``.
    """
    try:
        parsed = int(value)
    except (TypeError, ValueError, OverflowError):
        # OverflowError covers int(float("inf")).
        return default
    return parsed if parsed > 0 else default
|
|
338
|
+
|
|
339
|
+
|
|
340
|
+
def _int_value(value: Any, default: int) -> int:
    """Return ``value`` when it is an int (bools excluded), else ``default``."""
    if isinstance(value, bool) or not isinstance(value, int):
        return default
    return value
|
|
342
|
+
|
|
343
|
+
|
|
344
|
+
def _number_or_zero(*values: Any) -> int | float:
    """Return the first finite, non-bool real number among ``values``.

    Whole-valued numbers come back as int, others as float. With no usable
    candidate the result is 0.
    """
    for candidate in values:
        if isinstance(candidate, bool) or not isinstance(candidate, Real):
            continue
        if not math.isfinite(candidate):
            continue
        as_float = float(candidate)
        return int(candidate) if as_float.is_integer() else as_float
    return 0
|
|
349
|
+
|
|
350
|
+
|
|
351
|
+
def _float_value(value: Any, default: float) -> float:
    """Return ``value`` as a finite float, or ``default`` when it is not one."""
    candidate = _finite_float(value)
    if candidate is None:
        return default
    return candidate
|
|
354
|
+
|
|
355
|
+
|
|
356
|
+
def _finite_float(value: Any) -> float | None:
    """Return ``float(value)`` for a finite, non-bool real number, else ``None``."""
    if isinstance(value, bool) or not isinstance(value, Real):
        return None
    return float(value) if math.isfinite(value) else None
|
|
360
|
+
|
|
361
|
+
|
|
362
|
+
def _rounded_int(value: float) -> int:
    """Round ``value`` with the built-in ``round`` and return it as an int."""
    nearest = round(value)
    return int(nearest)
|
|
364
|
+
|
|
365
|
+
|
|
366
|
+
def _string_value(value: Any) -> str:
    """Return ``value`` when it is a string, otherwise the empty string."""
    if isinstance(value, str):
        return value
    return ""
|