docslight-lite 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39) hide show
  1. docslight/__init__.py +41 -0
  2. docslight/cli.py +215 -0
  3. docslight/client.py +92 -0
  4. docslight/cloud/__init__.py +5 -0
  5. docslight/cloud/client.py +622 -0
  6. docslight/config.py +117 -0
  7. docslight/exceptions.py +65 -0
  8. docslight/local/__init__.py +31 -0
  9. docslight/local/layout_blocks.py +80 -0
  10. docslight/local/llm_extractor.py +252 -0
  11. docslight/local/loaders.py +95 -0
  12. docslight/local/markdown.py +18 -0
  13. docslight/local/office_loader.py +128 -0
  14. docslight/local/paddle_parser.py +173 -0
  15. docslight/local/pipeline.py +213 -0
  16. docslight/preview.py +46 -0
  17. docslight/providers/__init__.py +6 -0
  18. docslight/providers/ollama.py +30 -0
  19. docslight/providers/openai_compatible.py +64 -0
  20. docslight/result.py +89 -0
  21. docslight/schemas/__init__.py +5 -0
  22. docslight/schemas/fields.py +190 -0
  23. docslight/standard_json.py +367 -0
  24. docslight/static/app/common.js +668 -0
  25. docslight/static/app/docslight-extract.json +307 -0
  26. docslight/static/app/extract.js +394 -0
  27. docslight/static/app/i18n.js +405 -0
  28. docslight/static/app/parse.js +161 -0
  29. docslight/static/styles.css +878 -0
  30. docslight/templates/base.html +36 -0
  31. docslight/templates/extract.html +123 -0
  32. docslight/templates/parse.html +81 -0
  33. docslight/web_app.py +372 -0
  34. docslight_lite-0.1.0.dist-info/METADATA +277 -0
  35. docslight_lite-0.1.0.dist-info/RECORD +39 -0
  36. docslight_lite-0.1.0.dist-info/WHEEL +5 -0
  37. docslight_lite-0.1.0.dist-info/entry_points.txt +2 -0
  38. docslight_lite-0.1.0.dist-info/licenses/LICENSE +21 -0
  39. docslight_lite-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,622 @@
1
+ """ComPDF Cloud API client."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import io
6
+ import json
7
+ import zipfile
8
+ from pathlib import Path
9
+ from typing import Any
10
+ from urllib.parse import urlparse, urlunparse
11
+
12
+ import requests
13
+
14
+ from docslight.exceptions import AuthenticationError, CloudAPIError, RateLimitError
15
+ from docslight.result import ExtractResult, ParseResult
16
+ from docslight.schemas import normalize_fields
17
+
18
# Substrings that make a server-supplied error message unsafe to surface;
# they are matched against the lower-cased message.
UNSAFE_ERROR_MESSAGE_MARKERS = (
    "<",
    ">",
    "\n",
    "\r",
    "traceback",
    "bearer ",
    "sk-",
)
# Server-supplied error messages longer than this are replaced by a fallback.
MAX_ERROR_MESSAGE_LENGTH = 200
# String forms of the envelope ``code`` values that mean success.
SUCCESS_API_CODES = {"200", "0"}
# Keys that carry result content and are therefore excluded from metadata.
RESULT_PAYLOAD_KEYS = {
    "data",
    "markdown",
    "metadata",
    "pages",
    "results",
}
22
+
23
+
24
class CloudClient:
    """HTTP client for the ComPDF Cloud parse, extract and health endpoints.

    The client can be used as a context manager. A session supplied by the
    caller is never closed by the client; only a session the client created
    itself is closed by :meth:`close`.
    """

    def __init__(
        self,
        api_key: str | None = None,
        base_url: str = "https://api.compdf.com",
        timeout: float = 120.0,
        session: requests.Session | None = None,
    ) -> None:
        """Store connection settings and choose the HTTP session.

        Args:
            api_key: Key sent in the ``Authorization`` and ``x-api-key``
                headers; no credential headers are sent when it is falsy.
            base_url: API root, or a URL whose path ends in ``/parse``,
                ``/extract`` or ``/health`` to target custom per-operation
                endpoints. Trailing slashes are removed.
            timeout: Timeout passed to every HTTP request.
            session: Existing session to reuse; when omitted a new session
                is created and owned by this client.
        """
        self.api_key = api_key
        self.base_url = base_url.rstrip("/")
        self.timeout = timeout
        if session is None:
            self._owns_session = True
            self.session = requests.Session()
        else:
            self._owns_session = False
            self.session = session

    def __enter__(self) -> CloudClient:
        """Enter the context manager and hand back this client."""
        return self

    def __exit__(self, *exc_info: object) -> None:
        """Leave the context manager, releasing owned resources."""
        self.close()

    def close(self) -> None:
        """Close the HTTP session, but only if this client created it."""
        if not self._owns_session:
            return
        self.session.close()

    def parse(self, path: str | Path, **options: Any) -> ParseResult:
        """Upload a document and return its parsed markdown.

        Args:
            path: Location of the document to upload.
            **options: Extra form options for the request. A boolean
                ``download_result`` option (default ``True``) is consumed
                here instead of being sent; it controls whether a result
                referenced by a download URL is fetched.

        Returns:
            A ``ParseResult`` holding the markdown, pages, merged metadata,
            the result payload and the raw ZIP bytes when one was received.
        """
        form_options = dict(options)
        fetch_result = _pop_bool_option(form_options, "download_result", True)
        response_payload, direct_archive = self._post_file("parse", path, form_options)
        process_payload, envelope_metadata = _unwrap_process_payload(response_payload)
        result_payload, downloaded_archive = self._result_payload(
            process_payload,
            fetch_result,
        )
        page_list = result_payload.get("pages")
        if not isinstance(page_list, list):
            page_list = []
        merged_metadata = _merge_metadata(result_payload, envelope_metadata)
        return ParseResult(
            markdown=_parse_markdown(result_payload),
            pages=page_list,
            metadata=merged_metadata,
            raw_response=result_payload,
            raw_archive=direct_archive or downloaded_archive,
        )

    def extract(
        self,
        path: str | Path,
        fields: Any = None,
        schema: Any = None,
        document_types: Any = None,
        **options: Any,
    ) -> ExtractResult:
        """Upload a document and return the structured data extracted from it.

        Args:
            path: Location of the document to upload.
            fields: Field specification, sent as ``extractFields``.
            schema: Schema specification, sent as ``schema``.
            document_types: Sent as ``document_types``.
            **options: Extra form options; they override the three values
                above on key collision. A boolean ``download_result`` option
                (default ``True``) is consumed here instead of being sent.

        Returns:
            An ``ExtractResult`` with the extracted data, merged metadata
            and the decoded response payload.
        """
        fetch_result = _pop_bool_option(options, "download_result", True)
        form_options: dict[str, Any] = {
            "extractFields": fields,
            "schema": schema,
            "document_types": document_types,
        }
        form_options.update(options)
        response_payload, _direct_archive = self._post_file("extract", path, form_options)
        process_payload, envelope_metadata = _unwrap_process_payload(response_payload)
        result_payload, _downloaded_archive = self._result_payload(
            process_payload,
            fetch_result,
        )
        extracted = _extract_data(result_payload)
        if not isinstance(extracted, dict):
            extracted = {}
        return ExtractResult(
            data=extracted,
            metadata=_merge_metadata(result_payload, envelope_metadata),
            raw_response=response_payload,
        )

    def health(self) -> dict[str, Any]:
        """Query the health endpoint and return its decoded JSON payload.

        Raises:
            CloudAPIError: If the request fails or the API reports an error.
        """
        try:
            response = self.session.get(
                self._endpoint_url("health"),
                headers=self._headers(),
                timeout=self.timeout,
            )
        except requests.RequestException as exc:
            raise CloudAPIError(f"Cloud API request failed: {exc}") from exc

        body = self._response_json(response, allow_invalid_error=True)
        self._raise_for_error(response, body)
        return body

    def _post_file(
        self,
        operation: str,
        path: str | Path,
        options: dict[str, Any],
    ) -> tuple[dict[str, Any], bytes | None]:
        """Upload the file at ``path`` to ``operation`` and decode the reply.

        Returns:
            The decoded payload and, for ZIP replies, the raw archive bytes.
        """
        source = Path(path)
        form_fields = self._compact_options(self._prepare_options(operation, options))
        try:
            with source.open("rb") as handle:
                response = self.session.post(
                    self._endpoint_url(operation),
                    files={"file": (source.name, handle)},
                    data=form_fields,
                    headers=self._headers(),
                    timeout=self.timeout,
                )
        except requests.RequestException as exc:
            raise CloudAPIError(f"Cloud API request failed: {exc}") from exc

        return self._decode_response(response, operation)

    def _result_payload(
        self,
        process_payload: dict[str, Any],
        download_result: bool,
    ) -> tuple[dict[str, Any], bytes | None]:
        """Return inline result content, or fetch it from the download URL.

        An empty payload is returned when there is no inline content and
        either downloading is disabled or no usable URL is present.
        """
        if _has_result_content(process_payload):
            return process_payload, None
        if not download_result:
            return {}, None
        url = process_payload.get("downloadUrl") or process_payload.get("download_url")
        if isinstance(url, str) and url:
            return self._download_result_payload(url)
        return {}, None

    def _download_result_payload(self, url: str) -> tuple[dict[str, Any], bytes | None]:
        """Fetch a result file from ``url`` and decode it."""
        try:
            # Only the timeout is passed: the client's credential headers
            # are not attached to the download request.
            response = self.session.get(url, timeout=self.timeout)
        except requests.RequestException as exc:
            raise CloudAPIError(f"Cloud API result download failed: {exc}") from exc

        if response.status_code >= 400:
            error_payload = self._response_json(response, allow_invalid_error=True)
            self._raise_for_error(response, error_payload)

        body = getattr(response, "content", b"")
        if not (isinstance(body, bytes) and body):
            return self._response_json(response), None
        return _read_downloaded_result_payload(body)

    def _prepare_options(self, operation: str, options: dict[str, Any]) -> dict[str, Any]:
        """Reshape extract options for custom endpoints.

        For any other operation, or with the default endpoints, ``options``
        is returned unchanged. Otherwise a copy is returned in which the
        field/schema inputs are collapsed into an ``extract_fields`` entry
        when one can be built.
        """
        if operation != "extract" or not self._uses_custom_operation_urls():
            return options

        prepared = dict(options)
        if "extract_fields" in prepared:
            return prepared

        # Both spellings are always removed; ``fields`` wins when it is set.
        fields = prepared.pop("fields", None)
        legacy_fields = prepared.pop("extractFields", None)
        if fields is None:
            fields = legacy_fields
        schema = prepared.pop("schema", None)
        document_types = prepared.pop("document_types", None)

        extract_fields = _to_extract_fields_payload(fields, schema)
        if extract_fields is not None:
            prepared["extract_fields"] = extract_fields
        elif schema is not None:
            prepared["schema"] = schema

        if document_types is not None:
            prepared["document_types"] = document_types

        return prepared

    def _compact_options(self, options: dict[str, Any]) -> dict[str, str]:
        """Turn options into string form fields.

        ``None`` values are dropped, strings pass through unchanged and
        every other value is encoded as compact JSON.

        Raises:
            CloudAPIError: If a value cannot be JSON-encoded.
        """
        encoded: dict[str, str] = {}
        for name, raw in options.items():
            if raw is None:
                continue
            if isinstance(raw, str):
                encoded[name] = raw
                continue
            try:
                encoded[name] = json.dumps(raw, ensure_ascii=False, separators=(",", ":"))
            except TypeError as exc:
                raise CloudAPIError(
                    f"Cloud API option '{name}' is not JSON serializable"
                ) from exc
        return encoded

    def _headers(self) -> dict[str, str]:
        """Build the request headers, adding credentials when a key is set."""
        headers = {"User-Agent": "docslight-lite/0.1.0"}
        if self.api_key:
            headers.update(
                {
                    "Authorization": f"Bearer {self.api_key}",
                    "x-api-key": self.api_key,
                }
            )
        return headers

    def _decode_response(
        self,
        response: requests.Response,
        operation: str,
    ) -> tuple[dict[str, Any], bytes | None]:
        """Decode a ZIP or JSON reply and raise if it reports an error.

        Returns:
            The decoded payload and the raw ZIP bytes (``None`` for JSON).
        """
        if _looks_like_zip_response(response):
            archive_payload = _parse_zip_payload(response.content)
            self._raise_for_error(response, archive_payload)
            return archive_payload, response.content

        json_payload = self._response_json(response, allow_invalid_error=True)
        self._raise_for_error(response, json_payload)
        if operation == "extract":
            json_payload = _normalize_extract_response(json_payload)
        return json_payload, None

    def _endpoint_url(self, operation: str) -> str:
        """Return the URL for ``operation``.

        Raises:
            CloudAPIError: If ``operation`` is not parse, extract or health.
        """
        operations = {"parse", "extract", "health"}
        if operation not in operations:
            raise CloudAPIError(f"Unsupported cloud operation: {operation}")

        if not self._uses_custom_operation_urls():
            default_paths = {
                "health": "/v1/health",
                "parse": "/server/v2/process/idp/documentParsing",
                "extract": "/server/v2/process/idp/documentExtract",
            }
            return f"{self.base_url}{default_paths[operation]}"

        parsed = urlparse(self.base_url)
        segments = [part for part in parsed.path.split("/") if part]
        if not segments:
            new_path = f"/{operation}"
        elif segments[-1] in operations:
            # Swap the trailing operation name for the requested one.
            new_path = "/" + "/".join([*segments[:-1], operation])
        else:
            new_path = f"{parsed.path.rstrip('/')}/{operation}"
        return urlunparse(parsed._replace(path=new_path))

    def _uses_custom_operation_urls(self) -> bool:
        """Tell whether ``base_url`` already ends in an operation name."""
        base_path = urlparse(self.base_url).path.rstrip("/")
        return base_path.endswith(("/parse", "/extract", "/health"))

    def _response_json(
        self,
        response: requests.Response,
        allow_invalid_error: bool = False,
    ) -> dict[str, Any]:
        """Decode the response body as a JSON object.

        Args:
            response: Response to decode.
            allow_invalid_error: When true, an undecodable or non-object
                body on a response with status >= 400 yields ``{}``.

        Raises:
            CloudAPIError: If the body is not JSON or not a JSON object and
                the leniency above does not apply.
        """
        try:
            decoded = response.json()
        except ValueError as exc:
            if allow_invalid_error and response.status_code >= 400:
                return {}
            raise CloudAPIError("Cloud API returned invalid JSON") from exc
        if isinstance(decoded, dict):
            return decoded
        if allow_invalid_error and response.status_code >= 400:
            return {}
        raise CloudAPIError("Cloud API returned non-object JSON")

    def _raise_for_error(
        self,
        response: requests.Response,
        payload: dict[str, Any],
    ) -> None:
        """Raise when the HTTP status or the payload's API code signals failure.

        Raises:
            AuthenticationError: For error code 401.
            RateLimitError: For error code 429.
            CloudAPIError: For every other error code.
        """
        http_status = response.status_code
        api_status = _api_status_code(payload)
        http_ok = http_status < 400
        if http_ok and api_status is None:
            return

        # A payload-level code takes precedence over the HTTP status.
        error_status_code = api_status or http_status
        if http_ok and str(error_status_code) in SUCCESS_API_CODES:
            return

        message = _safe_error_message(payload, error_status_code)
        request_id = response.headers.get("x-request-id") or response.headers.get(
            "X-Request-ID"
        )
        if error_status_code == 401:
            error_type = AuthenticationError
        elif error_status_code == 429:
            error_type = RateLimitError
        else:
            error_type = CloudAPIError
        raise error_type(message, status_code=error_status_code, request_id=request_id)
313
+
314
+
315
+ def _safe_error_message(payload: dict[str, Any], status_code: int) -> str:
316
+ fallback = f"Cloud API error {status_code}"
317
+ raw_message = payload.get("message") or payload.get("msg") or payload.get("error")
318
+ if not isinstance(raw_message, str):
319
+ return fallback
320
+ message = raw_message.strip()
321
+ if not message or len(message) > MAX_ERROR_MESSAGE_LENGTH:
322
+ return fallback
323
+ lowered = message.lower()
324
+ if any(marker in lowered for marker in UNSAFE_ERROR_MESSAGE_MARKERS):
325
+ return fallback
326
+ return message
327
+
328
+
329
+ def _unwrap_process_payload(
330
+ payload: dict[str, Any],
331
+ ) -> tuple[dict[str, Any], dict[str, Any]]:
332
+ if "code" not in payload:
333
+ return payload, {}
334
+ data = payload.get("data")
335
+ process_payload = data if isinstance(data, dict) else {}
336
+ envelope_metadata: dict[str, Any] = {}
337
+ code = payload.get("code")
338
+ msg = payload.get("msg")
339
+ if code is not None:
340
+ envelope_metadata["api_code"] = code
341
+ if isinstance(msg, str) and msg:
342
+ envelope_metadata["api_message"] = msg
343
+ envelope_metadata.update(_process_metadata(process_payload))
344
+ return process_payload, envelope_metadata
345
+
346
+
347
+ def _process_metadata(process_payload: dict[str, Any]) -> dict[str, Any]:
348
+ return {
349
+ key: value
350
+ for key, value in process_payload.items()
351
+ if key not in RESULT_PAYLOAD_KEYS
352
+ }
353
+
354
+
355
+ def _has_result_content(payload: dict[str, Any]) -> bool:
356
+ if isinstance(payload.get("markdown"), str):
357
+ return True
358
+ if isinstance(payload.get("pages"), list):
359
+ return True
360
+ if isinstance(payload.get("results"), dict):
361
+ return True
362
+ if "code" in payload:
363
+ return False
364
+ if isinstance(payload.get("data"), dict):
365
+ return True
366
+ return False
367
+
368
+
369
+ def _extract_data(payload: dict[str, Any]) -> dict[str, Any]:
370
+ data = payload.get("data")
371
+ if isinstance(data, dict):
372
+ return data
373
+ results = payload.get("results")
374
+ if isinstance(results, dict):
375
+ return results
376
+ return payload
377
+
378
+
379
+ def _parse_markdown(payload: dict[str, Any]) -> str:
380
+ markdown = payload.get("markdown")
381
+ if isinstance(markdown, str):
382
+ return markdown
383
+ markdown_texts = payload.get("markdown_texts")
384
+ if isinstance(markdown_texts, str):
385
+ return markdown_texts
386
+ pages = payload.get("pages")
387
+ if not isinstance(pages, list):
388
+ return ""
389
+ page_markdowns = [_page_markdown(page) for page in pages if isinstance(page, dict)]
390
+ return "\n\n".join(markdown for markdown in page_markdowns if markdown)
391
+
392
+
393
+ def _page_markdown(page: dict[str, Any]) -> str:
394
+ markdown = page.get("markdown")
395
+ if isinstance(markdown, str):
396
+ return markdown
397
+ markdown_texts = page.get("markdown_texts")
398
+ if isinstance(markdown_texts, str):
399
+ return markdown_texts
400
+ blocks = page.get("parsing_res_list")
401
+ if not isinstance(blocks, list):
402
+ return ""
403
+ parts = []
404
+ for block in blocks:
405
+ if not isinstance(block, dict):
406
+ continue
407
+ text = block.get("block_content") or block.get("block_text") or block.get("text")
408
+ if isinstance(text, str) and text.strip():
409
+ parts.append(text.strip())
410
+ return "\n\n".join(parts)
411
+
412
+
413
+ def _merge_metadata(
414
+ result_payload: dict[str, Any],
415
+ envelope_metadata: dict[str, Any],
416
+ ) -> dict[str, Any]:
417
+ metadata = {}
418
+ payload_metadata = result_payload.get("metadata")
419
+ if isinstance(payload_metadata, dict):
420
+ metadata.update(payload_metadata)
421
+ metadata.update(envelope_metadata)
422
+ return metadata
423
+
424
+
425
def _read_downloaded_result_payload(content: bytes) -> tuple[dict[str, Any], bytes | None]:
    """Decode a downloaded result file as JSON, falling back to ZIP.

    Returns:
        The decoded payload and, when the content was handled as a ZIP
        archive, the original bytes; ``None`` in that slot for JSON.
    """
    decoded = _read_json_payload(content)
    if decoded is None:
        return _parse_zip_payload(content), content
    return decoded, None
430
+
431
+
432
+ def _read_json_payload(content: bytes) -> dict[str, Any] | None:
433
+ stripped = content.lstrip()
434
+ if not stripped or stripped[:1] not in {b"{", b"["}:
435
+ return None
436
+ try:
437
+ payload = json.loads(content.decode("utf-8-sig"))
438
+ except ValueError:
439
+ return None
440
+ if not isinstance(payload, dict):
441
+ raise CloudAPIError("Cloud API result download returned non-object JSON")
442
+ return _unwrap_result_file_payload(payload)
443
+
444
+
445
+ def _unwrap_result_file_payload(payload: dict[str, Any]) -> dict[str, Any]:
446
+ data = payload.get("data")
447
+ if "code" in payload and isinstance(data, dict):
448
+ return data
449
+ if isinstance(data, dict) and any(
450
+ key in data for key in ("markdown", "metadata", "pages", "results")
451
+ ):
452
+ result_payload = dict(data)
453
+ metadata = payload.get("metadata")
454
+ if isinstance(metadata, dict) and "metadata" not in result_payload:
455
+ result_payload["metadata"] = metadata
456
+ return result_payload
457
+ return payload
458
+
459
+
460
+ def _pop_bool_option(options: dict[str, Any], key: str, default: bool) -> bool:
461
+ value = options.pop(key, default)
462
+ return value if isinstance(value, bool) else default
463
+
464
+
465
+ def _api_status_code(payload: dict[str, Any]) -> int | None:
466
+ code = payload.get("code")
467
+ if code is None and payload.get("success") is False:
468
+ return 400
469
+ if code is None:
470
+ return None
471
+ if str(code) in SUCCESS_API_CODES:
472
+ return None
473
+ try:
474
+ return int(str(code))
475
+ except ValueError:
476
+ return 400
477
+
478
+
479
+ def _looks_like_zip_response(response: requests.Response) -> bool:
480
+ content_type = response.headers.get("content-type", "").lower()
481
+ return "application/zip" in content_type or "application/x-zip-compressed" in content_type
482
+
483
+
484
def _parse_zip_payload(content: bytes) -> dict[str, Any]:
    """Build a result payload from a ZIP archive returned by the API.

    The first ``.json`` entry that decodes to a non-empty object supplies
    the structured result (taken from its ``result`` object when present),
    and the first non-empty ``.md`` entry supplies the markdown. When the
    archive has no markdown entry, the markdown is derived from the payload.

    Args:
        content: Raw bytes of the ZIP archive.

    Returns:
        The result object with ``pages`` normalised, ``metadata`` extended
        with ``response_format`` and ``archive_entries``, and ``markdown``
        filled in.

    Raises:
        CloudAPIError: If the bytes are not a readable ZIP archive, or a
            ``.json``/``.md`` entry is corrupt or holds malformed JSON.
    """
    invalid_zip_message = "Cloud API returned an invalid ZIP response"
    try:
        archive = zipfile.ZipFile(io.BytesIO(content))
    except zipfile.BadZipFile as exc:
        raise CloudAPIError(invalid_zip_message) from exc

    json_payload: dict[str, Any] = {}
    markdown = ""

    # The context manager closes the archive once the entries are read.
    with archive:
        entries = archive.namelist()
        for name in entries:
            if name.endswith(".json") and not json_payload:
                try:
                    with archive.open(name) as file_obj:
                        loaded = json.load(file_obj)
                except (ValueError, zipfile.BadZipFile) as exc:
                    # Malformed JSON or a corrupt member is reported as an
                    # API error rather than leaking a raw ValueError.
                    raise CloudAPIError(invalid_zip_message) from exc
                if isinstance(loaded, dict):
                    json_payload = loaded
            elif name.endswith(".md") and not markdown:
                try:
                    with archive.open(name) as file_obj:
                        markdown = file_obj.read().decode("utf-8", errors="replace")
                except zipfile.BadZipFile as exc:
                    raise CloudAPIError(invalid_zip_message) from exc

    nested_result = json_payload.get("result")
    result = nested_result if isinstance(nested_result, dict) else json_payload
    result = _unwrap_result_file_payload(result)
    pages = _zip_parse_pages(result.get("pages"))
    metadata = result.get("metadata") if isinstance(result.get("metadata"), dict) else {}
    metadata = {
        **metadata,
        "response_format": "zip",
        "archive_entries": entries,
    }
    payload = {
        **result,
        "pages": pages,
        "metadata": metadata,
    }
    return {
        **payload,
        "markdown": markdown or _parse_markdown(payload),
    }
521
+
522
+
523
+ def _zip_parse_pages(value: Any) -> list[dict[str, Any]]:
524
+ if not isinstance(value, list):
525
+ return []
526
+ if any(
527
+ isinstance(page, dict) and isinstance(page.get("parsing_res_list"), list)
528
+ for page in value
529
+ ):
530
+ return [page for page in value if isinstance(page, dict)]
531
+ if any(
532
+ isinstance(page, dict) and isinstance(page.get("structured"), list)
533
+ for page in value
534
+ ):
535
+ return _normalize_parse_pages(value)
536
+ return [page for page in value if isinstance(page, dict)]
537
+
538
+
539
+ def _normalize_parse_pages(value: Any) -> list[dict[str, Any]]:
540
+ if not isinstance(value, list):
541
+ return []
542
+
543
+ pages: list[dict[str, Any]] = []
544
+ for index, page in enumerate(value):
545
+ if not isinstance(page, dict):
546
+ continue
547
+ blocks = []
548
+ for item in page.get("structured", []):
549
+ if not isinstance(item, dict):
550
+ continue
551
+ bbox = _quad_to_bbox(item.get("pos"))
552
+ block = {
553
+ "block_content": item.get("text") or "",
554
+ "block_type": item.get("type"),
555
+ }
556
+ if bbox is not None:
557
+ block["block_bbox"] = bbox
558
+ blocks.append(block)
559
+ pages.append(
560
+ {
561
+ "page_id": page.get("page_id") or page.get("page") or index + 1,
562
+ "page_index": index,
563
+ "width": page.get("width"),
564
+ "height": page.get("height"),
565
+ "parsing_res_list": blocks,
566
+ }
567
+ )
568
+ return pages
569
+
570
+
571
+ def _quad_to_bbox(value: Any) -> list[float] | None:
572
+ if not isinstance(value, list) or len(value) < 8:
573
+ return None
574
+ numbers = [float(item) for item in value[:8]]
575
+ xs = numbers[0::2]
576
+ ys = numbers[1::2]
577
+ return [min(xs), min(ys), max(xs), max(ys)]
578
+
579
+
580
def _to_extract_fields_payload(fields: Any, schema: Any) -> dict[str, Any] | None:
    """Build the ``extract_fields`` payload from fields or a schema.

    ``fields`` is passed through ``normalize_fields`` first. An object
    result is returned unchanged, and a list result becomes a template
    named ``"Document"`` with one empty entry per name. Failing that, a
    schema object with a ``properties`` object becomes a template named
    after its ``title`` (or ``"Document"``). ``None`` is returned when
    neither input is usable.
    """
    normalized = normalize_fields(fields)
    if isinstance(normalized, dict):
        return normalized

    if isinstance(normalized, list):
        names: Any = normalized
        template_name = "Document"
    elif isinstance(schema, dict) and isinstance(schema.get("properties"), dict):
        names = schema["properties"]
        template_name = schema.get("title") or "Document"
    else:
        return None

    return {
        "keys": {name: {} for name in names},
        "tableHeaders": {},
        "name": template_name,
    }
599
+
600
+
601
+ def _normalize_extract_response(payload: dict[str, Any]) -> dict[str, Any]:
602
+ results = payload.get("results")
603
+ if not isinstance(results, dict):
604
+ return payload
605
+
606
+ normalized_payload = {
607
+ **payload,
608
+ "data": results,
609
+ }
610
+ page_keys = [key for key in results if isinstance(key, str) and key.startswith("Page_")]
611
+ if len(page_keys) == 1 and isinstance(results[page_keys[0]], dict):
612
+ flattened = results[page_keys[0]]
613
+ return {
614
+ **normalized_payload,
615
+ "results": flattened,
616
+ "data": flattened,
617
+ "metadata": {
618
+ **(payload.get("metadata") if isinstance(payload.get("metadata"), dict) else {}),
619
+ "page_key": page_keys[0],
620
+ },
621
+ }
622
+ return normalized_payload