docslight-lite 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39) hide show
  1. docslight/__init__.py +41 -0
  2. docslight/cli.py +215 -0
  3. docslight/client.py +92 -0
  4. docslight/cloud/__init__.py +5 -0
  5. docslight/cloud/client.py +622 -0
  6. docslight/config.py +117 -0
  7. docslight/exceptions.py +65 -0
  8. docslight/local/__init__.py +31 -0
  9. docslight/local/layout_blocks.py +80 -0
  10. docslight/local/llm_extractor.py +252 -0
  11. docslight/local/loaders.py +95 -0
  12. docslight/local/markdown.py +18 -0
  13. docslight/local/office_loader.py +128 -0
  14. docslight/local/paddle_parser.py +173 -0
  15. docslight/local/pipeline.py +213 -0
  16. docslight/preview.py +46 -0
  17. docslight/providers/__init__.py +6 -0
  18. docslight/providers/ollama.py +30 -0
  19. docslight/providers/openai_compatible.py +64 -0
  20. docslight/result.py +89 -0
  21. docslight/schemas/__init__.py +5 -0
  22. docslight/schemas/fields.py +190 -0
  23. docslight/standard_json.py +367 -0
  24. docslight/static/app/common.js +668 -0
  25. docslight/static/app/docslight-extract.json +307 -0
  26. docslight/static/app/extract.js +394 -0
  27. docslight/static/app/i18n.js +405 -0
  28. docslight/static/app/parse.js +161 -0
  29. docslight/static/styles.css +878 -0
  30. docslight/templates/base.html +36 -0
  31. docslight/templates/extract.html +123 -0
  32. docslight/templates/parse.html +81 -0
  33. docslight/web_app.py +372 -0
  34. docslight_lite-0.1.0.dist-info/METADATA +277 -0
  35. docslight_lite-0.1.0.dist-info/RECORD +39 -0
  36. docslight_lite-0.1.0.dist-info/WHEEL +5 -0
  37. docslight_lite-0.1.0.dist-info/entry_points.txt +2 -0
  38. docslight_lite-0.1.0.dist-info/licenses/LICENSE +21 -0
  39. docslight_lite-0.1.0.dist-info/top_level.txt +1 -0
docslight/result.py ADDED
@@ -0,0 +1,89 @@
1
+ """Result objects returned by docslight operations."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass, field
6
+ from typing import Any
7
+
8
# Result keys that normalize_extract_payload moves out of the extract results
# and into the metadata dict when the metadata does not already define them.
EXTRACT_DIMENSION_KEYS = ("source_width", "source_height")
9
+
10
+
11
@dataclass(frozen=True)
class ParseResult:
    """Immutable holder for the outcome of a document parse.

    Carries the rendered markdown, the per-page payloads, a metadata dict and,
    optionally, the raw provider response and raw archive bytes.
    """

    markdown: str
    pages: list[dict[str, Any]] = field(default_factory=list)
    metadata: dict[str, Any] = field(default_factory=dict)
    raw_response: Any = None
    raw_archive: bytes | None = None

    def to_markdown(self) -> str:
        """Give back the markdown text stored on this result."""
        return self.markdown

    def to_json(self) -> dict[str, Any]:
        """Build a plain dict with the markdown, pages and metadata.

        ``raw_response`` and ``raw_archive`` are intentionally left out.
        """
        return dict(
            markdown=self.markdown,
            pages=self.pages,
            metadata=self.metadata,
        )

    def to_standard_json(self) -> dict[str, Any]:
        """Render this result in the ComPDF-style standard parse JSON shape."""
        # The converter is imported at call time rather than at module import.
        from docslight.standard_json import build_standard_parse_json

        converted = build_standard_parse_json(
            markdown=self.markdown,
            pages=self.pages,
            metadata=self.metadata,
        )
        return converted
42
+
43
+
44
@dataclass(frozen=True)
class ExtractResult:
    """Immutable holder for a structured extraction outcome.

    After construction, ``data`` holds the normalized results mapping and
    ``metadata`` the merged metadata, both as produced by
    ``normalize_extract_payload``.
    """

    data: dict[str, Any]
    metadata: dict[str, Any] = field(default_factory=dict)
    raw_response: Any = None

    def __post_init__(self) -> None:
        """Replace ``data``/``metadata`` with their normalized counterparts."""
        normalized = normalize_extract_payload(self.data, self.metadata)
        # The dataclass is frozen, so regular attribute assignment is blocked;
        # object.__setattr__ is the way to finish initialization.
        for attr_name, attr_value in zip(("data", "metadata"), normalized):
            object.__setattr__(self, attr_name, attr_value)

    def to_json(self) -> dict[str, Any]:
        """Build a plain dict with ``results`` and ``metadata`` entries."""
        return dict(results=self.data, metadata=self.metadata)
64
+
65
+
66
def normalize_extract_payload(
    data: dict[str, Any] | None,
    metadata: dict[str, Any] | None = None,
) -> tuple[dict[str, Any], dict[str, Any]]:
    """Split an extract payload into a ``(results, metadata)`` pair.

    Args:
        data: Raw payload; anything that is not a dict is treated as empty.
        metadata: Extra metadata; when it is a dict its entries override the
            ones embedded in ``data["metadata"]``.

    Returns:
        A tuple of fresh dicts: the flattened results and the merged metadata.
    """
    payload = {} if not isinstance(data, dict) else dict(data)

    embedded_metadata = payload.get("metadata")
    merged_metadata = dict(embedded_metadata) if isinstance(embedded_metadata, dict) else {}
    if isinstance(metadata, dict):
        merged_metadata.update(metadata)

    nested_results = payload.get("results")
    if isinstance(nested_results, dict):
        results = dict(nested_results)
    else:
        # No nested results mapping: every top-level entry except the
        # envelope keys counts as a result.
        results = {}
        for name, value in payload.items():
            if name not in {"message", "metadata"}:
                results[name] = value

    # A nested "fields" mapping is flattened into the results; entries already
    # present in the results take precedence over the ones from "fields".
    field_map = results.pop("fields", None)
    if isinstance(field_map, dict):
        flattened = dict(field_map)
        flattened.update(results)
        results = flattened

    for dimension_key in EXTRACT_DIMENSION_KEYS:
        if dimension_key not in results or dimension_key in merged_metadata:
            continue
        merged_metadata[dimension_key] = results.pop(dimension_key)

    return results, merged_metadata
@@ -0,0 +1,5 @@
1
+ """Shared schemas for document parsing workflows."""
2
+
3
+ from docslight.schemas.fields import build_extract_schema, normalize_fields
4
+
5
+ __all__ = ["build_extract_schema", "normalize_fields"]
@@ -0,0 +1,190 @@
1
+ """Helpers for extraction field schemas."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any
6
+
7
+ from docslight.exceptions import ConfigurationError
8
+
9
# Structured field definition passed as a mapping; _normalize_structured_fields
# reads its "name", "keys" and "tableHeaders" entries.
StructuredFields = dict[str, Any]
# Return type of normalize_fields: a list of names, a structured mapping, or None.
NormalizedFields = list[str] | StructuredFields | None
# Return type of build_extract_schema: a JSON-schema-shaped dict.
ExtractSchema = dict[str, Any]
12
+
13
+
14
def normalize_fields(fields: list[str] | str | StructuredFields | None) -> NormalizedFields:
    """Bring a user-supplied field specification into canonical form.

    Args:
        fields: ``None``, a comma-separated string, a list of strings, or a
            structured mapping.

    Returns:
        ``None`` when nothing usable was given, a list of stripped non-empty
        names for string/list input, or the normalized structured mapping.

    Raises:
        ConfigurationError: if ``fields`` has an unsupported type or a list
            contains a non-string entry.
    """
    if fields is None:
        return None
    if isinstance(fields, dict):
        return _normalize_structured_fields(fields)

    type_error = "fields must be a list of strings, comma-separated string, object, or None"
    if isinstance(fields, str):
        candidates = fields.split(",")
    elif isinstance(fields, list):
        candidates = fields
    else:
        raise ConfigurationError(type_error)

    cleaned = []
    for entry in candidates:
        if not isinstance(entry, str):
            raise ConfigurationError(type_error)
        name = entry.strip()
        if name:
            cleaned.append(name)
    # An empty selection is reported as None rather than [].
    return cleaned or None
38
+
39
+
40
def _any_value_schema() -> dict[str, Any]:
    """Return a schema fragment that accepts any JSON value type."""
    return {"type": ["string", "number", "boolean", "null", "object", "array"]}


def _bbox_list_schema() -> dict[str, Any]:
    """Return the schema for a list of ``{page_id, bbox}`` location objects."""
    return {
        "type": "array",
        "items": {
            "type": "object",
            "properties": {
                "page_id": {"type": "number"},
                "bbox": {
                    "type": "array",
                    "items": {"type": "number"},
                    "minItems": 4,
                    "maxItems": 4,
                },
            },
            "required": ["page_id", "bbox"],
            "additionalProperties": True,
        },
    }


def build_extract_schema(fields: NormalizedFields) -> ExtractSchema | None:
    """Build a stable JSON schema for extract outputs.

    Args:
        fields: Normalized fields: a list of field names, a structured mapping
            with optional ``keys`` / ``tableHeaders`` entries, or ``None``.

    Returns:
        ``None`` when ``fields`` is neither a list nor a dict. For a list, an
        open object schema with one any-typed property per name. For a
        structured mapping, an object schema with one ``{value, bboxes}``
        property per key and, when tables are declared, a ``tables`` object
        plus a ``_table_bboxes`` object.
    """
    if isinstance(fields, list):
        return {
            "type": "object",
            "properties": {name: _any_value_schema() for name in fields},
            "additionalProperties": True,
        }
    if not isinstance(fields, dict):
        return None

    properties: dict[str, Any] = {}
    required: list[str] = []
    keys = fields.get("keys") or {}
    tables = fields.get("tableHeaders") or {}

    for field_name in keys:
        properties[field_name] = {
            "type": "object",
            "properties": {
                "value": {"type": ["string", "number", "boolean", "null"]},
                "bboxes": _bbox_list_schema(),
            },
            "required": ["value"],
            "additionalProperties": True,
        }
        required.append(field_name)

    table_properties: dict[str, Any] = {
        table_name: {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {column_name: _any_value_schema() for column_name in columns},
                "additionalProperties": True,
            },
        }
        for table_name, columns in tables.items()
    }

    if table_properties:
        # Tables live under the "tables" property, so their names are required
        # there; the top level requires "tables" itself. Listing the table
        # names in the top-level "required" would demand properties that the
        # top-level object never declares.
        properties["tables"] = {
            "type": "object",
            "properties": table_properties,
            "required": list(table_properties),
            "additionalProperties": False,
        }
        required.append("tables")
        properties["_table_bboxes"] = {
            "type": "object",
            "additionalProperties": _bbox_list_schema(),
        }

    return {
        "type": "object",
        "properties": properties,
        "required": required,
        "additionalProperties": True,
    }
135
+
136
+
137
def _normalize_structured_fields(fields: dict[str, Any]) -> dict[str, Any]:
    """Validate and normalize a structured field definition.

    Returns a dict with the stripped ``name`` plus ``keys`` and/or
    ``tableHeaders`` entries (each included only when non-empty).

    Raises:
        ConfigurationError: if ``name`` is missing/blank, or neither keys nor
            table headers are defined.
    """
    raw_name = fields.get("name")
    if not (isinstance(raw_name, str) and raw_name.strip()):
        raise ConfigurationError("fields.name is required")

    key_group = _normalize_field_group(fields.get("keys", {}), "fields.keys")
    table_group = _normalize_tables(fields.get("tableHeaders", {}))
    if not key_group and not table_group:
        raise ConfigurationError("fields must include at least one keys or tableHeaders entry")

    result: dict[str, Any] = {"name": raw_name.strip()}
    if key_group:
        result["keys"] = key_group
    if table_group:
        result["tableHeaders"] = table_group
    return result
152
+
153
+
154
+ def _normalize_tables(value: Any) -> dict[str, dict[str, Any]]:
155
+ if value in (None, ""):
156
+ return {}
157
+ if not isinstance(value, dict):
158
+ raise ConfigurationError("fields.tableHeaders must be an object")
159
+ normalized: dict[str, dict[str, Any]] = {}
160
+ for table_name, columns in value.items():
161
+ if not isinstance(table_name, str) or not table_name.strip():
162
+ raise ConfigurationError("fields.tableHeaders table names must be non-empty strings")
163
+ table_columns = _normalize_field_group(
164
+ columns,
165
+ f"fields.tableHeaders.{table_name}",
166
+ )
167
+ if table_columns:
168
+ normalized[table_name.strip()] = table_columns
169
+ return normalized
170
+
171
+
172
+ def _normalize_field_group(value: Any, label: str) -> dict[str, dict[str, Any]]:
173
+ if value in (None, ""):
174
+ return {}
175
+ if not isinstance(value, dict):
176
+ raise ConfigurationError(f"{label} must be an object")
177
+ normalized: dict[str, dict[str, Any]] = {}
178
+ for field_name, field_def in value.items():
179
+ if not isinstance(field_name, str) or not field_name.strip():
180
+ raise ConfigurationError(f"{label} field names must be non-empty strings")
181
+ if not isinstance(field_def, dict):
182
+ raise ConfigurationError(f"{label}.{field_name} must be an object")
183
+ prompt = field_def.get("prompt")
184
+ if prompt is not None and not isinstance(prompt, str):
185
+ raise ConfigurationError(f"{label}.{field_name}.prompt must be a string or null")
186
+ normalized[field_name.strip()] = {
187
+ "prompt": prompt,
188
+ "mapping": field_def.get("mapping"),
189
+ }
190
+ return normalized
@@ -0,0 +1,367 @@
1
+ """Convert local parse payloads into the ComPDF-style standard JSON shape."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import math
6
+ from html.parser import HTMLParser
7
+ from numbers import Real
8
+ from typing import Any
9
+
10
# Block labels that _convert_page marks with status 0; every other label
# gets status 1.
IGNORED_STATUS_LABELS = {
    "aside_text",
    "footer",
    "footer_image",
    "footnote",
    "header",
    "header_image",
    "number",
}
19
+
20
+
21
def convert_parse_payload(payload: dict[str, Any]) -> dict[str, Any]:
    """Turn a serialized parse payload into the standard parse JSON shape.

    A payload that already looks like standard JSON is returned unchanged
    (the same object). Otherwise its ``markdown``, ``pages`` and ``metadata``
    entries are used, each replaced by an empty default when of the wrong type.
    """
    if _looks_like_standard_payload(payload):
        return payload

    raw_metadata = payload.get("metadata")
    raw_pages = payload.get("pages")
    raw_markdown = payload.get("markdown")
    return build_standard_parse_json(
        markdown=raw_markdown if isinstance(raw_markdown, str) else "",
        pages=raw_pages if isinstance(raw_pages, list) else [],
        metadata=raw_metadata if isinstance(raw_metadata, dict) else {},
    )
29
+
30
+
31
def build_standard_parse_json(
    *,
    markdown: str,
    pages: list[dict[str, Any]],
    metadata: dict[str, Any] | None = None,
) -> dict[str, Any]:
    """Assemble the standard parse JSON document from parse result parts.

    Args:
        markdown: Markdown text, copied verbatim into ``result.markdown``.
        pages: Page payloads; entries that are not dicts are skipped.
        metadata: Optional metadata used for ``x_request_id`` and ``file_type``.

    Returns:
        A dict with ``result``, ``x_request_id``, ``file_type``, ``metrics``,
        ``message``, ``code`` and ``image_process`` entries.
    """
    meta = metadata or {}
    converted_pages: list[dict[str, Any]] = []
    all_detail: list[dict[str, Any]] = []
    page_metrics: list[dict[str, Any]] = []

    # Paragraph ids are 1-based and run continuously across pages.
    next_paragraph_id = 1
    for index, raw_page in enumerate(pages):
        if not isinstance(raw_page, dict):
            continue
        page_out, page_detail, page_metric = _convert_page(
            raw_page,
            index,
            first_paragraph_id=next_paragraph_id,
        )
        converted_pages.append(page_out)
        all_detail.extend(page_detail)
        page_metrics.append(page_metric)
        next_paragraph_id += len(page_detail)

    page_total = len(converted_pages)
    succeeded = len([entry for entry in converted_pages if entry.get("status") == "Success"])

    result_section = {
        "pages": converted_pages,
        "detail": all_detail,
        "total_count": page_total,
        "valid_page_number": succeeded,
        "total_page_number": page_total,
        "catalog": {},
        "excel_base64": "",
        "success_count": succeeded,
        "markdown": markdown,
    }
    return {
        "result": result_section,
        "x_request_id": _string_value(meta.get("x_request_id")),
        "file_type": _file_type(meta),
        "metrics": page_metrics,
        "message": "Success",
        "code": 200,
        "image_process": [],
    }
75
+
76
+
77
+ def _looks_like_standard_payload(payload: dict[str, Any]) -> bool:
78
+ result = payload.get("result")
79
+ return isinstance(result, dict) and isinstance(result.get("pages"), list)
80
+
81
+
82
def _convert_page(
    page: dict[str, Any],
    fallback_page_index: int,
    *,
    first_paragraph_id: int,
) -> tuple[dict[str, Any], list[dict[str, Any]], dict[str, Any]]:
    """Convert one page payload into its standard-JSON pieces.

    Args:
        page: Raw page dict.
        fallback_page_index: Index used when the page has no integer
            ``page_index``.
        first_paragraph_id: Paragraph id assigned to the first block of this
            page; following blocks count up from it.

    Returns:
        ``(standard_page, detail_rows, metric)``.
    """
    resolved_index = _int_value(page.get("page_index"), fallback_page_index)
    page_id = _page_id(page, resolved_index)
    page_width = _number_or_zero(page.get("width"), page.get("page_width"))
    page_height = _number_or_zero(page.get("height"), page.get("page_height"))
    page_angle = _page_angle(page)
    elapsed = _number_or_zero(page.get("durations"), page.get("duration"))
    image_id = _string_value(page.get("image_id"))
    detected_boxes = _layout_boxes(page)

    structured_rows: list[dict[str, Any]] = []
    content_rows: list[dict[str, Any]] = []
    detail_rows: list[dict[str, Any]] = []

    for position, block in enumerate(_blocks(page)):
        block_id = _int_value(block.get("block_id"), position)

        # First non-empty string among the label keys wins; default is "text".
        label = "text"
        for label_key in ("block_label", "block_type", "type"):
            named = _string_value(block.get(label_key))
            if named:
                label = named
                break

        text = _string_value(block.get("block_content")) or _string_value(block.get("text"))
        quad = _bbox_to_quad(block.get("block_bbox") or block.get("bbox") or block.get("pos"))
        is_table = label == "table"
        status = 1
        if label in IGNORED_STATUS_LABELS:
            status = 0
        outline_level = _int_value(block.get("outline_level"), -1)

        structured_entry: dict[str, Any] = {
            "pos": quad,
            "id": block_id,
            "content": [block_id],
            "text": text,
            "type": label,
            "outline_level": outline_level,
        }
        if is_table:
            row_count, col_count = _table_dimensions(text)
            structured_entry["rows"] = row_count
            structured_entry["cols"] = col_count
            structured_entry["sub_type"] = "bordered"
        structured_rows.append(structured_entry)

        content_entry: dict[str, Any] = {
            "id": block_id,
            "status": status,
            "pos": quad,
            "type": _content_type(label),
            "text": text,
        }
        block_score = _score_for_block(block, label, detected_boxes)
        if not is_table:
            # Table entries carry no score.
            content_entry["score"] = block_score
        content_entry["angle"] = page_angle
        content_rows.append(content_entry)

        detail_entry: dict[str, Any] = {
            # One detail row is produced per block, so the block position is
            # also the offset from the page's first paragraph id.
            "paragraph_id": first_paragraph_id + position,
            "page_id": page_id,
            "status": status,
            "type": _detail_type(label),
            "position": quad,
            "outline_level": outline_level,
            "sub_type": "bordered" if is_table else label,
            "content": 0,
            "text": text,
        }
        if is_table:
            detail_entry["caption_id"] = block_id
        else:
            detail_entry["tags"] = []
        detail_rows.append(detail_entry)

    standard_page = {
        "angle": page_angle,
        "page_id": page_id,
        "image_id": image_id,
        "height": page_height,
        "width": page_width,
        "durations": elapsed,
        "structured": structured_rows,
        "status": "Success",
        "content": content_rows,
    }
    metric = {
        "angle": page_angle,
        "status": "Success",
        "dpi": _int_value(page.get("dpi"), 144),
        "page_id": page_id,
        "image_id": image_id,
        "duration": elapsed,
        "page_image_height": page_height,
        "page_image_width": page_width,
    }
    return standard_page, detail_rows, metric
177
+
178
+
179
+ def _blocks(page: dict[str, Any]) -> list[dict[str, Any]]:
180
+ blocks = page.get("parsing_res_list")
181
+ if isinstance(blocks, list):
182
+ return [block for block in blocks if isinstance(block, dict)]
183
+ structured = page.get("structured")
184
+ if isinstance(structured, list):
185
+ return [block for block in structured if isinstance(block, dict)]
186
+ return []
187
+
188
+
189
+ def _layout_boxes(page: dict[str, Any]) -> list[dict[str, Any]]:
190
+ layout = page.get("layout_det_res")
191
+ boxes = layout.get("boxes") if isinstance(layout, dict) else None
192
+ if not isinstance(boxes, list):
193
+ return []
194
+ return [box for box in boxes if isinstance(box, dict)]
195
+
196
+
197
def _score_for_block(
    block: dict[str, Any],
    label: str,
    layout_boxes: list[dict[str, Any]],
) -> float:
    """Pick a confidence score for ``block``.

    Starts from the block's own ``score`` (0.0 when absent or invalid). When
    the block has a usable bbox, the score of the same-label layout box with
    the largest positive IoU replaces it, if that box has a valid score.
    """
    own_score = _float_value(block.get("score"), 0.0)
    block_box = _bbox4(block.get("block_bbox") or block.get("bbox") or block.get("pos"))
    if block_box is None:
        return own_score

    chosen_score = own_score
    top_overlap = 0.0
    same_label_boxes = (box for box in layout_boxes if box.get("label") == label)
    for layout_box in same_label_boxes:
        layout_bbox = _bbox4(layout_box.get("coordinate"))
        if layout_bbox is None:
            continue
        overlap = _iou(block_box, layout_bbox)
        if overlap > top_overlap:
            top_overlap = overlap
            chosen_score = _float_value(layout_box.get("score"), chosen_score)
    return chosen_score
219
+
220
+
221
+ def _page_id(page: dict[str, Any], page_index: int) -> int:
222
+ for key in ("page_id", "page_number"):
223
+ value = page.get(key)
224
+ if isinstance(value, int) and not isinstance(value, bool):
225
+ return value
226
+ return page_index + 1
227
+
228
+
229
+ def _page_angle(page: dict[str, Any]) -> int:
230
+ for value in (
231
+ page.get("angle"),
232
+ page.get("doc_preprocessor_res", {}).get("angle")
233
+ if isinstance(page.get("doc_preprocessor_res"), dict)
234
+ else None,
235
+ ):
236
+ if isinstance(value, Real) and not isinstance(value, bool) and math.isfinite(value):
237
+ return int(value) if value >= 0 else 0
238
+ return 0
239
+
240
+
241
def _file_type(metadata: dict[str, Any]) -> str:
    """Resolve the file type string from metadata.

    A non-empty string ``file_type`` is returned as is; otherwise the
    upper-cased string ``document_type``; otherwise ``""``.
    """
    explicit_type = _string_value(metadata.get("file_type"))
    if not explicit_type:
        # Upper-casing an empty string is still "", covering the no-value case.
        return _string_value(metadata.get("document_type")).upper()
    return explicit_type
247
+
248
+
249
+ def _content_type(label: str) -> str:
250
+ return "image" if label.endswith("_image") else label
251
+
252
+
253
+ def _detail_type(label: str) -> str:
254
+ if label == "table":
255
+ return "table"
256
+ if label.endswith("_image"):
257
+ return "image"
258
+ return "paragraph"
259
+
260
+
261
def _bbox_to_quad(value: Any) -> list[int]:
    """Expand a bounding box into an 8-number corner list.

    Corners are emitted as x,y pairs in the order top-left, top-right,
    bottom-right, bottom-left, each coordinate rounded to int. An unusable
    box gives ``[]``.
    """
    box = _bbox4(value)
    if box is None:
        return []
    left, top, right, bottom = map(_rounded_int, box)
    return [left, top, right, top, right, bottom, left, bottom]
267
+
268
+
269
def _bbox4(value: Any) -> list[float] | None:
    """Reduce a coordinate list to ``[x1, y1, x2, y2]`` floats.

    Lists of 8 or more numbers are read as four x,y points (first 8 items)
    and collapsed to their min/max extent; lists of 4 to 7 numbers use the
    first four items directly. Anything else, or any non-finite/non-numeric
    coordinate among the items used, gives ``None``.
    """
    if not isinstance(value, list) or len(value) < 4:
        return None

    used_count = 8 if len(value) >= 8 else 4
    coords = [_finite_float(raw) for raw in value[:used_count]]
    if None in coords:
        return None
    if used_count == 4:
        return coords

    xs = coords[0::2]
    ys = coords[1::2]
    return [min(xs), min(ys), max(xs), max(ys)]
285
+
286
+
287
+ def _iou(left: list[float], right: list[float]) -> float:
288
+ left_x1, left_y1, left_x2, left_y2 = left
289
+ right_x1, right_y1, right_x2, right_y2 = right
290
+ x1 = max(left_x1, right_x1)
291
+ y1 = max(left_y1, right_y1)
292
+ x2 = min(left_x2, right_x2)
293
+ y2 = min(left_y2, right_y2)
294
+ intersection = max(0.0, x2 - x1) * max(0.0, y2 - y1)
295
+ left_area = max(0.0, left_x2 - left_x1) * max(0.0, left_y2 - left_y1)
296
+ right_area = max(0.0, right_x2 - right_x1) * max(0.0, right_y2 - right_y1)
297
+ union = left_area + right_area - intersection
298
+ return intersection / union if union else 0.0
299
+
300
+
301
+ def _table_dimensions(html: str) -> tuple[int, int]:
302
+ parser = _TableDimensionParser()
303
+ parser.feed(html)
304
+ parser.close()
305
+ return parser.rows, parser.cols
306
+
307
+
308
+ class _TableDimensionParser(HTMLParser):
309
+ def __init__(self) -> None:
310
+ super().__init__()
311
+ self.rows = 0
312
+ self.cols = 0
313
+ self._current_cols = 0
314
+ self._in_row = False
315
+
316
+ def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
317
+ if tag == "tr":
318
+ self._in_row = True
319
+ self._current_cols = 0
320
+ self.rows += 1
321
+ elif self._in_row and tag in {"td", "th"}:
322
+ colspan = dict(attrs).get("colspan")
323
+ self._current_cols += _positive_int(colspan, 1)
324
+
325
+ def handle_endtag(self, tag: str) -> None:
326
+ if tag == "tr":
327
+ self.cols = max(self.cols, self._current_cols)
328
+ self._in_row = False
329
+ self._current_cols = 0
330
+
331
+
332
+ def _positive_int(value: Any, default: int) -> int:
333
+ try:
334
+ parsed = int(value)
335
+ except (TypeError, ValueError):
336
+ return default
337
+ return parsed if parsed > 0 else default
338
+
339
+
340
+ def _int_value(value: Any, default: int) -> int:
341
+ return value if isinstance(value, int) and not isinstance(value, bool) else default
342
+
343
+
344
+ def _number_or_zero(*values: Any) -> int | float:
345
+ for value in values:
346
+ if isinstance(value, Real) and not isinstance(value, bool) and math.isfinite(value):
347
+ return int(value) if float(value).is_integer() else float(value)
348
+ return 0
349
+
350
+
351
def _float_value(value: Any, default: float) -> float:
    """Convert ``value`` via ``_finite_float``; use ``default`` when it gives None."""
    number = _finite_float(value)
    if number is None:
        return default
    return number
354
+
355
+
356
+ def _finite_float(value: Any) -> float | None:
357
+ if isinstance(value, Real) and not isinstance(value, bool) and math.isfinite(value):
358
+ return float(value)
359
+ return None
360
+
361
+
362
+ def _rounded_int(value: float) -> int:
363
+ return int(round(value))
364
+
365
+
366
+ def _string_value(value: Any) -> str:
367
+ return value if isinstance(value, str) else ""