docslight 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,626 @@
1
+ """ComPDF Cloud API client."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import io
6
+ import json
7
+ import logging
8
+ import zipfile
9
+ from pathlib import Path
10
+ from typing import Any
11
+ from urllib.parse import urlparse, urlunparse
12
+
13
+ import requests
14
+
15
+ from docslight.exceptions import AuthenticationError, CloudAPIError, RateLimitError
16
+ from docslight.result import ExtractResult, ParseResult
17
+ from docslight.schemas import normalize_fields
18
+
19
# Substrings that disqualify a server-supplied error message from being shown
# to the caller; they are matched against the lower-cased message text.
UNSAFE_ERROR_MESSAGE_MARKERS = ("<", ">", "\n", "\r", "traceback", "bearer ", "sk-")
# Server-supplied error messages longer than this are replaced by a generic one.
MAX_ERROR_MESSAGE_LENGTH = 200
# String forms of the API-level ``code`` values that are treated as success.
SUCCESS_API_CODES = {"0", "200"}
# Keys that hold result content and are therefore left out of envelope metadata.
RESULT_PAYLOAD_KEYS = {"data", "markdown", "metadata", "pages", "results"}
logger = logging.getLogger(__name__)
24
+
25
+
26
class CloudClient:
    """Client for ComPDF Cloud document parsing and extraction APIs.

    The client can be used as a context manager; leaving the ``with`` block
    closes the HTTP session only if the client created that session itself.
    """

    def __init__(
        self,
        api_key: str | None = None,
        base_url: str = "https://api.compdf.com",
        timeout: float = 120.0,
        session: requests.Session | None = None,
    ) -> None:
        """Configure the client.

        Args:
            api_key: Optional key. When set it is sent on API calls as both an
                ``Authorization: Bearer`` header and an ``x-api-key`` header.
            base_url: API root; trailing slashes are stripped. A URL whose path
                ends in ``/parse``, ``/extract`` or ``/health`` switches the
                client to "custom operation URL" mode (see ``_endpoint_url``).
            timeout: Timeout passed to every HTTP request.
            session: Optional pre-built session. A caller-supplied session is
                never closed by this client.
        """
        self.api_key = api_key
        self.base_url = base_url.rstrip("/")
        self.timeout = timeout
        self._owns_session = session is None
        self.session = session if session is not None else requests.Session()

    def __enter__(self) -> CloudClient:
        """Return this client for use as a context manager."""
        return self

    def __exit__(self, *exc_info: object) -> None:
        """Close owned resources when leaving a context manager."""
        self.close()

    def close(self) -> None:
        """Close the internally-owned HTTP session."""
        if self._owns_session:
            self.session.close()

    def parse(self, path: str | Path, **options: Any) -> ParseResult:
        """Parse a document into markdown.

        Args:
            path: File to upload.
            **options: Extra form options sent with the upload. The boolean
                ``download_result`` option (default ``True``) is consumed
                locally and controls whether a returned download URL is
                followed; it is not sent to the API.

        Returns:
            A ``ParseResult`` holding the markdown, the page list, merged
            metadata, the result payload and, for ZIP responses, the raw
            archive bytes.

        Raises:
            CloudAPIError: On transport failures, malformed responses or API
                errors (``AuthenticationError`` / ``RateLimitError`` for error
                codes 401 / 429).
        """
        request_options = dict(options)
        download_result = _pop_bool_option(request_options, "download_result", True)
        payload, direct_archive = self._post_file("parse", path, request_options)
        process_payload, envelope_metadata = _unwrap_process_payload(payload)
        result_payload, downloaded_archive = self._result_payload(
            process_payload,
            download_result,
        )
        # At most one of the two is set: either the POST answered with a ZIP
        # directly, or the ZIP came from following the download URL.
        raw_archive = direct_archive or downloaded_archive
        pages = result_payload.get("pages")
        metadata = _merge_metadata(result_payload, envelope_metadata)
        markdown = _parse_markdown(result_payload)
        return ParseResult(
            markdown=markdown,
            pages=pages if isinstance(pages, list) else [],
            metadata=metadata,
            raw_response=result_payload,
            raw_archive=raw_archive,
        )

    def extract(
        self,
        path: str | Path,
        fields: Any = None,
        schema: Any = None,
        document_types: Any = None,
        **options: Any,
    ) -> ExtractResult:
        """Extract structured data from a document.

        Args:
            path: File to upload.
            fields: Field specification, sent as the ``extractFields`` option
                (or converted to ``extract_fields`` in custom-URL mode).
            schema: Optional schema option.
            document_types: Optional ``document_types`` option.
            **options: Extra form options; ``download_result`` is consumed
                locally exactly as in ``parse``. Explicit options override the
                named arguments when keys collide.

        Returns:
            An ``ExtractResult`` with the extracted data, merged metadata and
            the decoded POST response as ``raw_response``.

        Raises:
            CloudAPIError: On transport failures, malformed responses or API
                errors.
        """
        download_result = _pop_bool_option(options, "download_result", True)
        request_options = {
            "extractFields": fields,
            "schema": schema,
            "document_types": document_types,
            **options,
        }
        payload, _direct_archive = self._post_file("extract", path, request_options)
        process_payload, envelope_metadata = _unwrap_process_payload(payload)
        result_payload, _downloaded_archive = self._result_payload(
            process_payload,
            download_result,
        )
        data = _extract_data(result_payload)
        metadata = _merge_metadata(result_payload, envelope_metadata)
        return ExtractResult(
            data=data if isinstance(data, dict) else {},
            metadata=metadata,
            raw_response=payload,
        )

    def health(self) -> dict[str, Any]:
        """Return the Cloud API health payload.

        Raises:
            CloudAPIError: If the request fails, the body is not a JSON object
                on a non-error status, or the API reports an error.
        """
        try:
            response = self.session.get(
                self._endpoint_url("health"),
                headers=self._headers(),
                timeout=self.timeout,
            )
        except requests.RequestException as exc:
            raise CloudAPIError(f"Cloud API request failed: {exc}") from exc

        payload = self._response_json(response, allow_invalid_error=True)
        self._raise_for_error(response, payload)
        return payload

    def _post_file(
        self,
        operation: str,
        path: str | Path,
        options: dict[str, Any],
    ) -> tuple[dict[str, Any], bytes | None]:
        """Upload ``path`` to the endpoint for ``operation`` and decode the reply.

        Returns:
            ``(payload, archive_bytes)``; ``archive_bytes`` is ``None`` unless
            the response was a ZIP archive.

        Raises:
            CloudAPIError: If the HTTP request fails or the response is an
                error. Errors opening the local file are not wrapped.
        """
        file_path = Path(path)
        prepared_options = self._prepare_options(operation, options)
        endpoint_url = self._endpoint_url(operation)
        logger.info("Calling ComPDF Cloud %s endpoint: POST %s", operation, endpoint_url)
        try:
            with file_path.open("rb") as file_obj:
                response = self.session.post(
                    endpoint_url,
                    files={"file": (file_path.name, file_obj)},
                    data=self._compact_options(prepared_options),
                    headers=self._headers(),
                    timeout=self.timeout,
                )
        except requests.RequestException as exc:
            raise CloudAPIError(f"Cloud API request failed: {exc}") from exc

        return self._decode_response(response, operation)

    def _result_payload(
        self,
        process_payload: dict[str, Any],
        download_result: bool,
    ) -> tuple[dict[str, Any], bytes | None]:
        """Resolve the payload that actually carries the result content.

        Inline content wins. Otherwise a ``downloadUrl`` / ``download_url``
        entry is followed when ``download_result`` is true. If neither applies
        an empty payload is returned.
        """
        if _has_result_content(process_payload):
            return process_payload, None
        download_url = process_payload.get("downloadUrl") or process_payload.get("download_url")
        if download_result and isinstance(download_url, str) and download_url:
            return self._download_result_payload(download_url)
        return {}, None

    def _download_result_payload(self, url: str) -> tuple[dict[str, Any], bytes | None]:
        """Fetch a result file from ``url`` and decode it as JSON or ZIP.

        The download request is sent without the client's API headers.

        Raises:
            CloudAPIError: If the download fails, returns an error status, or
                the body cannot be decoded.
        """
        try:
            response = self.session.get(url, timeout=self.timeout)
        except requests.RequestException as exc:
            raise CloudAPIError(f"Cloud API result download failed: {exc}") from exc

        if response.status_code >= 400:
            payload = self._response_json(response, allow_invalid_error=True)
            self._raise_for_error(response, payload)

        content = getattr(response, "content", b"")
        if isinstance(content, bytes) and content:
            return _read_downloaded_result_payload(content)
        return self._response_json(response), None

    def _prepare_options(self, operation: str, options: dict[str, Any]) -> dict[str, Any]:
        """Translate extract options for custom operation URLs.

        For any other operation, or when the default endpoints are in use, the
        options are returned unchanged. In custom-URL mode ``fields`` /
        ``extractFields`` / ``schema`` are folded into a single
        ``extract_fields`` payload unless the caller already supplied one.
        """
        if operation != "extract" or not self._uses_custom_operation_urls():
            return options

        prepared = dict(options)
        if "extract_fields" in prepared:
            return prepared

        # ``fields`` takes precedence; ``extractFields`` is dropped either way.
        fields = prepared.pop("fields", None)
        if fields is None:
            fields = prepared.pop("extractFields", None)
        else:
            prepared.pop("extractFields", None)
        schema = prepared.pop("schema", None)
        document_types = prepared.pop("document_types", None)
        extract_fields = _to_extract_fields_payload(fields, schema)

        if extract_fields is not None:
            prepared["extract_fields"] = extract_fields
        elif schema is not None:
            prepared["schema"] = schema

        if document_types is not None:
            prepared["document_types"] = document_types

        return prepared

    def _compact_options(self, options: dict[str, Any]) -> dict[str, str]:
        """Convert options to string form values, dropping ``None`` entries.

        Strings pass through unchanged; every other value is serialized as
        compact JSON.

        Raises:
            CloudAPIError: If a value cannot be serialized to JSON.
        """
        compacted: dict[str, str] = {}
        for key, value in options.items():
            if value is None:
                continue
            if isinstance(value, str):
                compacted[key] = value
            else:
                try:
                    compacted[key] = json.dumps(
                        value,
                        ensure_ascii=False,
                        separators=(",", ":"),
                    )
                # json.dumps raises TypeError for unsupported types and
                # ValueError for circular references; both mean the option
                # cannot be sent.
                except (TypeError, ValueError) as exc:
                    raise CloudAPIError(
                        f"Cloud API option '{key}' is not JSON serializable"
                    ) from exc
        return compacted

    def _headers(self) -> dict[str, str]:
        """Build request headers, adding both auth header styles when a key is set."""
        headers = {"User-Agent": "docslight/0.1.0"}
        if self.api_key:
            headers["Authorization"] = f"Bearer {self.api_key}"
            headers["x-api-key"] = self.api_key
        return headers

    def _decode_response(
        self,
        response: requests.Response,
        operation: str,
    ) -> tuple[dict[str, Any], bytes | None]:
        """Decode a POST response into ``(payload, archive_bytes)``.

        ZIP responses are unpacked and returned together with the raw bytes.
        JSON responses are validated, and extract responses are normalized.

        Raises:
            CloudAPIError: If the response is malformed or reports an error.
        """
        if _looks_like_zip_response(response):
            payload = _parse_zip_payload(response.content)
            self._raise_for_error(response, payload)
            return payload, response.content

        payload = self._response_json(response, allow_invalid_error=True)
        self._raise_for_error(response, payload)
        if operation == "extract":
            payload = _normalize_extract_response(payload)
        return payload, None

    def _endpoint_url(self, operation: str) -> str:
        """Return the URL for ``operation`` (``parse``, ``extract`` or ``health``).

        With a plain base URL the fixed ComPDF paths are appended. When the
        base URL already ends in an operation name, that last path segment is
        replaced by ``operation``.

        Raises:
            CloudAPIError: If ``operation`` is not supported.
        """
        if operation not in {"parse", "extract", "health"}:
            raise CloudAPIError(f"Unsupported cloud operation: {operation}")

        if not self._uses_custom_operation_urls():
            if operation == "health":
                return f"{self.base_url}/v1/health"
            if operation == "parse":
                return f"{self.base_url}/server/v2/process/idp/documentParsing"
            return f"{self.base_url}/server/v2/process/idp/documentExtract"

        parsed = urlparse(self.base_url)
        segments = [segment for segment in parsed.path.split("/") if segment]
        if not segments:
            path = f"/{operation}"
        elif segments[-1] in {"parse", "extract", "health"}:
            segments[-1] = operation
            path = "/" + "/".join(segments)
        else:
            path = parsed.path.rstrip("/") + f"/{operation}"
        return urlunparse(parsed._replace(path=path))

    def _uses_custom_operation_urls(self) -> bool:
        """Return whether the base URL path already ends in an operation name."""
        path = urlparse(self.base_url).path.rstrip("/")
        return path.endswith(("/parse", "/extract", "/health"))

    def _response_json(
        self,
        response: requests.Response,
        allow_invalid_error: bool = False,
    ) -> dict[str, Any]:
        """Return the response body as a JSON object.

        Args:
            response: HTTP response to decode.
            allow_invalid_error: When true, an error response (status >= 400)
                with an undecodable or non-object body yields ``{}`` so the
                caller can still raise a status-based error.

        Raises:
            CloudAPIError: If the body is not valid JSON or not a JSON object
                and the leniency above does not apply.
        """
        try:
            payload = response.json()
        except ValueError as exc:
            if allow_invalid_error and response.status_code >= 400:
                return {}
            raise CloudAPIError("Cloud API returned invalid JSON") from exc
        if not isinstance(payload, dict):
            if allow_invalid_error and response.status_code >= 400:
                return {}
            raise CloudAPIError("Cloud API returned non-object JSON")
        return payload

    def _raise_for_error(
        self,
        response: requests.Response,
        payload: dict[str, Any],
    ) -> None:
        """Raise the matching exception if the HTTP status or API code is an error.

        The API-level code from the payload, when it signals an error, takes
        precedence over the HTTP status for the reported status code.

        Raises:
            AuthenticationError: For error code 401.
            RateLimitError: For error code 429.
            CloudAPIError: For every other error.
        """
        status_code = response.status_code
        api_status_code = _api_status_code(payload)
        if status_code < 400 and api_status_code is None:
            return

        # A falsy API code (e.g. one that parsed to 0) falls back to the HTTP
        # status, which is then re-checked against the success codes.
        error_status_code = api_status_code or status_code
        if status_code < 400 and str(error_status_code) in SUCCESS_API_CODES:
            return

        message = _safe_error_message(payload, error_status_code)
        request_id = response.headers.get("x-request-id") or response.headers.get(
            "X-Request-ID"
        )
        if error_status_code == 401:
            raise AuthenticationError(
                message,
                status_code=error_status_code,
                request_id=request_id,
            )
        if error_status_code == 429:
            raise RateLimitError(
                message,
                status_code=error_status_code,
                request_id=request_id,
            )
        raise CloudAPIError(message, status_code=error_status_code, request_id=request_id)
317
+
318
+
319
+ def _safe_error_message(payload: dict[str, Any], status_code: int) -> str:
320
+ fallback = f"Cloud API error {status_code}"
321
+ raw_message = payload.get("message") or payload.get("msg") or payload.get("error")
322
+ if not isinstance(raw_message, str):
323
+ return fallback
324
+ message = raw_message.strip()
325
+ if not message or len(message) > MAX_ERROR_MESSAGE_LENGTH:
326
+ return fallback
327
+ lowered = message.lower()
328
+ if any(marker in lowered for marker in UNSAFE_ERROR_MESSAGE_MARKERS):
329
+ return fallback
330
+ return message
331
+
332
+
333
+ def _unwrap_process_payload(
334
+ payload: dict[str, Any],
335
+ ) -> tuple[dict[str, Any], dict[str, Any]]:
336
+ if "code" not in payload:
337
+ return payload, {}
338
+ data = payload.get("data")
339
+ process_payload = data if isinstance(data, dict) else {}
340
+ envelope_metadata: dict[str, Any] = {}
341
+ code = payload.get("code")
342
+ msg = payload.get("msg")
343
+ if code is not None:
344
+ envelope_metadata["api_code"] = code
345
+ if isinstance(msg, str) and msg:
346
+ envelope_metadata["api_message"] = msg
347
+ envelope_metadata.update(_process_metadata(process_payload))
348
+ return process_payload, envelope_metadata
349
+
350
+
351
+ def _process_metadata(process_payload: dict[str, Any]) -> dict[str, Any]:
352
+ return {
353
+ key: value
354
+ for key, value in process_payload.items()
355
+ if key not in RESULT_PAYLOAD_KEYS
356
+ }
357
+
358
+
359
+ def _has_result_content(payload: dict[str, Any]) -> bool:
360
+ if isinstance(payload.get("markdown"), str):
361
+ return True
362
+ if isinstance(payload.get("pages"), list):
363
+ return True
364
+ if isinstance(payload.get("results"), dict):
365
+ return True
366
+ if "code" in payload:
367
+ return False
368
+ if isinstance(payload.get("data"), dict):
369
+ return True
370
+ return False
371
+
372
+
373
+ def _extract_data(payload: dict[str, Any]) -> dict[str, Any]:
374
+ data = payload.get("data")
375
+ if isinstance(data, dict):
376
+ return data
377
+ results = payload.get("results")
378
+ if isinstance(results, dict):
379
+ return results
380
+ return payload
381
+
382
+
383
+ def _parse_markdown(payload: dict[str, Any]) -> str:
384
+ markdown = payload.get("markdown")
385
+ if isinstance(markdown, str):
386
+ return markdown
387
+ markdown_texts = payload.get("markdown_texts")
388
+ if isinstance(markdown_texts, str):
389
+ return markdown_texts
390
+ pages = payload.get("pages")
391
+ if not isinstance(pages, list):
392
+ return ""
393
+ page_markdowns = [_page_markdown(page) for page in pages if isinstance(page, dict)]
394
+ return "\n\n".join(markdown for markdown in page_markdowns if markdown)
395
+
396
+
397
+ def _page_markdown(page: dict[str, Any]) -> str:
398
+ markdown = page.get("markdown")
399
+ if isinstance(markdown, str):
400
+ return markdown
401
+ markdown_texts = page.get("markdown_texts")
402
+ if isinstance(markdown_texts, str):
403
+ return markdown_texts
404
+ blocks = page.get("parsing_res_list")
405
+ if not isinstance(blocks, list):
406
+ return ""
407
+ parts = []
408
+ for block in blocks:
409
+ if not isinstance(block, dict):
410
+ continue
411
+ text = block.get("block_content") or block.get("block_text") or block.get("text")
412
+ if isinstance(text, str) and text.strip():
413
+ parts.append(text.strip())
414
+ return "\n\n".join(parts)
415
+
416
+
417
+ def _merge_metadata(
418
+ result_payload: dict[str, Any],
419
+ envelope_metadata: dict[str, Any],
420
+ ) -> dict[str, Any]:
421
+ metadata = {}
422
+ payload_metadata = result_payload.get("metadata")
423
+ if isinstance(payload_metadata, dict):
424
+ metadata.update(payload_metadata)
425
+ metadata.update(envelope_metadata)
426
+ return metadata
427
+
428
+
429
def _read_downloaded_result_payload(content: bytes) -> tuple[dict[str, Any], bytes | None]:
    """Decode a downloaded result file as JSON, falling back to ZIP.

    Returns:
        ``(payload, None)`` for JSON content, or ``(payload, content)`` when
        the bytes were parsed as a ZIP archive.
    """
    parsed = _read_json_payload(content)
    if parsed is None:
        return _parse_zip_payload(content), content
    return parsed, None
434
+
435
+
436
+ def _read_json_payload(content: bytes) -> dict[str, Any] | None:
437
+ stripped = content.lstrip()
438
+ if not stripped or stripped[:1] not in {b"{", b"["}:
439
+ return None
440
+ try:
441
+ payload = json.loads(content.decode("utf-8-sig"))
442
+ except ValueError:
443
+ return None
444
+ if not isinstance(payload, dict):
445
+ raise CloudAPIError("Cloud API result download returned non-object JSON")
446
+ return _unwrap_result_file_payload(payload)
447
+
448
+
449
+ def _unwrap_result_file_payload(payload: dict[str, Any]) -> dict[str, Any]:
450
+ data = payload.get("data")
451
+ if "code" in payload and isinstance(data, dict):
452
+ return data
453
+ if isinstance(data, dict) and any(
454
+ key in data for key in ("markdown", "metadata", "pages", "results")
455
+ ):
456
+ result_payload = dict(data)
457
+ metadata = payload.get("metadata")
458
+ if isinstance(metadata, dict) and "metadata" not in result_payload:
459
+ result_payload["metadata"] = metadata
460
+ return result_payload
461
+ return payload
462
+
463
+
464
+ def _pop_bool_option(options: dict[str, Any], key: str, default: bool) -> bool:
465
+ value = options.pop(key, default)
466
+ return value if isinstance(value, bool) else default
467
+
468
+
469
+ def _api_status_code(payload: dict[str, Any]) -> int | None:
470
+ code = payload.get("code")
471
+ if code is None and payload.get("success") is False:
472
+ return 400
473
+ if code is None:
474
+ return None
475
+ if str(code) in SUCCESS_API_CODES:
476
+ return None
477
+ try:
478
+ return int(str(code))
479
+ except ValueError:
480
+ return 400
481
+
482
+
483
+ def _looks_like_zip_response(response: requests.Response) -> bool:
484
+ content_type = response.headers.get("content-type", "").lower()
485
+ return "application/zip" in content_type or "application/x-zip-compressed" in content_type
486
+
487
+
488
def _parse_zip_payload(content: bytes) -> dict[str, Any]:
    """Build a result payload from a ZIP archive returned by the API.

    The first ``.json`` member that decodes to a non-empty JSON object
    supplies the structured result, and the first non-empty ``.md`` member
    supplies the markdown. If the archive has no markdown file, markdown is
    derived from the structured result instead.

    Args:
        content: Raw bytes of the ZIP archive.

    Returns:
        The result payload with normalized ``pages``, a ``metadata`` dict
        extended by ``response_format`` and ``archive_entries``, and
        ``markdown``.

    Raises:
        CloudAPIError: If the archive is not a valid ZIP file or a ``.json``
            member does not contain valid JSON.
    """
    json_payload: dict[str, Any] = {}
    markdown = ""

    try:
        # The context manager closes the archive even when a member fails.
        with zipfile.ZipFile(io.BytesIO(content)) as archive:
            entries = archive.namelist()
            for name in entries:
                if name.endswith(".json") and not json_payload:
                    with archive.open(name) as file_obj:
                        try:
                            loaded = json.load(file_obj)
                        except ValueError as exc:
                            # Covers JSONDecodeError and UnicodeDecodeError.
                            raise CloudAPIError(
                                "Cloud API ZIP response contained invalid JSON"
                            ) from exc
                    if isinstance(loaded, dict):
                        json_payload = loaded
                elif name.endswith(".md") and not markdown:
                    with archive.open(name) as file_obj:
                        markdown = file_obj.read().decode("utf-8", errors="replace")
    except zipfile.BadZipFile as exc:
        # Raised for an unreadable archive as well as a corrupt member.
        raise CloudAPIError("Cloud API returned an invalid ZIP response") from exc

    nested_result = json_payload.get("result")
    result = nested_result if isinstance(nested_result, dict) else json_payload
    result = _unwrap_result_file_payload(result)
    pages = _zip_parse_pages(result.get("pages"))
    existing_metadata = result.get("metadata")
    metadata = {
        **(existing_metadata if isinstance(existing_metadata, dict) else {}),
        "response_format": "zip",
        "archive_entries": entries,
    }
    payload = {
        **result,
        "pages": pages,
        "metadata": metadata,
    }
    return {
        **payload,
        "markdown": markdown or _parse_markdown(payload),
    }
525
+
526
+
527
+ def _zip_parse_pages(value: Any) -> list[dict[str, Any]]:
528
+ if not isinstance(value, list):
529
+ return []
530
+ if any(
531
+ isinstance(page, dict) and isinstance(page.get("parsing_res_list"), list)
532
+ for page in value
533
+ ):
534
+ return [page for page in value if isinstance(page, dict)]
535
+ if any(
536
+ isinstance(page, dict) and isinstance(page.get("structured"), list)
537
+ for page in value
538
+ ):
539
+ return _normalize_parse_pages(value)
540
+ return [page for page in value if isinstance(page, dict)]
541
+
542
+
543
+ def _normalize_parse_pages(value: Any) -> list[dict[str, Any]]:
544
+ if not isinstance(value, list):
545
+ return []
546
+
547
+ pages: list[dict[str, Any]] = []
548
+ for index, page in enumerate(value):
549
+ if not isinstance(page, dict):
550
+ continue
551
+ blocks = []
552
+ for item in page.get("structured", []):
553
+ if not isinstance(item, dict):
554
+ continue
555
+ bbox = _quad_to_bbox(item.get("pos"))
556
+ block = {
557
+ "block_content": item.get("text") or "",
558
+ "block_type": item.get("type"),
559
+ }
560
+ if bbox is not None:
561
+ block["block_bbox"] = bbox
562
+ blocks.append(block)
563
+ pages.append(
564
+ {
565
+ "page_id": page.get("page_id") or page.get("page") or index + 1,
566
+ "page_index": index,
567
+ "width": page.get("width"),
568
+ "height": page.get("height"),
569
+ "parsing_res_list": blocks,
570
+ }
571
+ )
572
+ return pages
573
+
574
+
575
+ def _quad_to_bbox(value: Any) -> list[float] | None:
576
+ if not isinstance(value, list) or len(value) < 8:
577
+ return None
578
+ numbers = [float(item) for item in value[:8]]
579
+ xs = numbers[0::2]
580
+ ys = numbers[1::2]
581
+ return [min(xs), min(ys), max(xs), max(ys)]
582
+
583
+
584
def _to_extract_fields_payload(fields: Any, schema: Any) -> dict[str, Any] | None:
    """Build the ``extract_fields`` payload from ``fields`` or ``schema``.

    ``fields`` is passed through ``normalize_fields`` first. A dict result is
    returned as-is. A list result becomes a template named ``"Document"``
    keyed by the listed names. Otherwise a dict ``schema`` with a dict
    ``properties`` entry becomes a template keyed by the property names and
    named after the schema ``title`` (default ``"Document"``). ``None`` is
    returned when none of these apply.
    """
    normalized = normalize_fields(fields)
    if isinstance(normalized, dict):
        return normalized

    if isinstance(normalized, list):
        names, title = normalized, "Document"
    elif isinstance(schema, dict) and isinstance(schema.get("properties"), dict):
        names, title = schema["properties"], schema.get("title") or "Document"
    else:
        return None

    return {
        "keys": {name: {} for name in names},
        "tableHeaders": {},
        "name": title,
    }
603
+
604
+
605
+ def _normalize_extract_response(payload: dict[str, Any]) -> dict[str, Any]:
606
+ results = payload.get("results")
607
+ if not isinstance(results, dict):
608
+ return payload
609
+
610
+ normalized_payload = {
611
+ **payload,
612
+ "data": results,
613
+ }
614
+ page_keys = [key for key in results if isinstance(key, str) and key.startswith("Page_")]
615
+ if len(page_keys) == 1 and isinstance(results[page_keys[0]], dict):
616
+ flattened = results[page_keys[0]]
617
+ return {
618
+ **normalized_payload,
619
+ "results": flattened,
620
+ "data": flattened,
621
+ "metadata": {
622
+ **(payload.get("metadata") if isinstance(payload.get("metadata"), dict) else {}),
623
+ "page_key": page_keys[0],
624
+ },
625
+ }
626
+ return normalized_payload