pdfdancer-client-python 0.2.16__py3-none-any.whl → 0.2.18__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pdfdancer-client-python might be problematic. Click here for more details.

pdfdancer/pdfdancer_v1.py CHANGED
@@ -5,16 +5,117 @@ A Python client that closely mirrors the Java Client class structure and functio
5
5
  Provides session-based PDF manipulation operations with strict validation.
6
6
  """
7
7
 
8
+ import gzip
8
9
  import json
9
10
  import os
11
+ import time
12
+ from datetime import datetime, timezone
10
13
  from pathlib import Path
11
14
  from typing import List, Optional, Union, BinaryIO, Mapping, Any
12
15
 
13
- import requests
16
+ import httpx
14
17
  from dotenv import load_dotenv
15
18
 
19
+ from .fingerprint import Fingerprint
20
+
16
21
  load_dotenv()
17
22
 
23
def _env_flag(name: str) -> bool:
    """
    Read a boolean switch from the environment.

    Environment values are always strings, so the raw value cannot be used as a
    boolean directly: "false" and "0" are non-empty strings and therefore truthy.

    Args:
        name: Name of the environment variable to read

    Returns:
        False when the variable is unset, empty, or one of "0", "false", "no",
        "off" (case-insensitive, surrounding whitespace ignored); True otherwise
    """
    raw_value = os.environ.get(name, "")
    return raw_value.strip().lower() not in {"", "0", "false", "no", "off"}


# Global variable to disable SSL certificate verification
# Set PDFDANCER_CLIENT_DISABLE_SSL_VERIFY to a truthy value (e.g. "1" or "true") to skip
# SSL verification (useful for testing with self-signed certificates)
# WARNING: Only use in development/testing environments
DISABLE_SSL_VERIFY = _env_flag("PDFDANCER_CLIENT_DISABLE_SSL_VERIFY")

# When enabled via PDFDANCER_CLIENT_DEBUG, request/response sizes and timing headers
# are printed to stdout
DEBUG = _env_flag("PDFDANCER_CLIENT_DEBUG")

# Default tolerance (in points) used for spatial matching in the *_at selectors
DEFAULT_TOLERANCE = 0.01
30
+
31
+
32
def _generate_timestamp(now: Optional[datetime] = None) -> str:
    """
    Generate a timestamp string in the format expected by the API.
    Format: YYYY-MM-DDTHH:MM:SS.ffffffZ (with microseconds)

    Args:
        now: Moment to format. Defaults to the current time. Timezone-aware values
            are converted to UTC so the trailing 'Z' is always accurate; naive
            values are interpreted as local time by ``datetime.astimezone``.

    Returns:
        Timestamp string with UTC timezone
    """
    if now is None:
        moment = datetime.now(timezone.utc)
    else:
        moment = now.astimezone(timezone.utc)
    return moment.strftime('%Y-%m-%dT%H:%M:%S.%fZ')
41
+
42
+
43
def _parse_timestamp(timestamp_str: str) -> datetime:
    """
    Parse an API timestamp string into a timezone-aware UTC datetime.

    The fractional-seconds part is normalized to exactly six digits before
    parsing: longer fractions (e.g. 9-digit nanoseconds) are truncated because
    Python's datetime only supports microsecond precision, and shorter ones are
    right-padded with zeros because ``datetime.fromisoformat`` only accepts
    arbitrary fraction lengths from Python 3.11 onwards.

    Args:
        timestamp_str: Timestamp string in format YYYY-MM-DDTHH:MM:SS[.fraction]Z

    Returns:
        datetime object with UTC timezone

    Raises:
        ValueError: If the string is not a valid ISO date-time
    """
    # Remove the 'Z' suffix
    ts = timestamp_str.rstrip('Z')

    if '.' in ts:
        date_part, frac_part = ts.rsplit('.', 1)
        if frac_part.isdigit():
            # Pure-digit fraction: force exactly 6 digits (truncate or zero-pad)
            frac_part = frac_part[:6].ljust(6, '0')
        elif len(frac_part) > 6:
            # Anything else keeps the previous behavior: truncate only
            frac_part = frac_part[:6]
        ts = f"{date_part}.{frac_part}"

    return datetime.fromisoformat(ts).replace(tzinfo=timezone.utc)
67
+
68
+
69
def _log_generated_at_header(response: httpx.Response, method: str, path: str) -> None:
    """
    Print timing diagnostics derived from the X-Received-At / X-Generated-At
    response headers. Does nothing unless DEBUG is enabled.

    Header values are expected to look like:
    - 2025-10-24T08:49:39.161945Z (microseconds - 6 digits)
    - 2025-10-24T08:58:45.468131265Z (nanoseconds - 9 digits)

    Args:
        response: The HTTP response whose headers are inspected
        method: HTTP method used (only echoed in the output)
        path: API path (only echoed in the output)
    """
    if not DEBUG:
        return

    generated_raw = response.headers.get('X-Generated-At')
    received_raw = response.headers.get('X-Received-At')
    if not (generated_raw or received_raw):
        # Neither header present: nothing to report
        return

    try:
        fragments = []
        now = datetime.now(timezone.utc)

        # X-Received-At: elapsed time since the timestamp in that header
        received_dt = None
        if received_raw:
            received_dt = _parse_timestamp(received_raw)
            since_received = (now - received_dt).total_seconds()
            fragments.append(f"X-Received-At: {received_raw}, time since received: {since_received:.3f}s")

        # X-Generated-At: elapsed time since the timestamp in that header
        generated_dt = None
        if generated_raw:
            generated_dt = _parse_timestamp(generated_raw)
            since_generated = (now - generated_dt).total_seconds()
            fragments.append(f"X-Generated-At: {generated_raw}, time since generated: {since_generated:.3f}s")

        # With both headers available, report generated minus received as processing time
        if received_dt is not None and generated_dt is not None:
            elapsed = (generated_dt - received_dt).total_seconds()
            fragments.append(f"processing time: {elapsed:.3f}s")

        if fragments:
            print(f"{time.time()}|{method} {path} - {', '.join(fragments)}")

    except (ValueError, AttributeError) as e:
        # A malformed header must never break the request flow; just report it
        print(f"{time.time()}|{method} {path} - Header parse error: {e}")
117
+
118
+
18
119
  from . import ParagraphBuilder
19
120
  from .exceptions import (
20
121
  PdfDancerException,
@@ -28,7 +129,8 @@ from .models import (
28
129
  ObjectRef, Position, ObjectType, Font, Image, Paragraph, FormFieldRef, TextObjectRef, PageRef,
29
130
  FindRequest, DeleteRequest, MoveRequest, PageMoveRequest, AddRequest, ModifyRequest, ModifyTextRequest,
30
131
  ChangeFormFieldRequest, CommandResult,
31
- ShapeType, PositionMode, PageSize, Orientation
132
+ ShapeType, PositionMode, PageSize, Orientation,
133
+ PageSnapshot, DocumentSnapshot, FontRecommendation, FontType
32
134
  )
33
135
  from .paragraph_builder import ParagraphPageBuilder
34
136
  from .types import PathObject, ParagraphObject, TextLineObject, ImageObject, FormObject, FormFieldObject
@@ -52,9 +154,10 @@ class PageClient:
52
154
  else:
53
155
  self.orientation = orientation
54
156
 
55
- def select_paths_at(self, x: float, y: float) -> List[PathObject]:
157
+ def select_paths_at(self, x: float, y: float, tolerance: float = DEFAULT_TOLERANCE) -> List[PathObject]:
158
+ position = Position.at_page_coordinates(self.page_index, x, y)
56
159
  # noinspection PyProtectedMember
57
- return self.root._to_path_objects(self.root._find_paths(Position.at_page_coordinates(self.page_index, x, y)))
160
+ return self.root._to_path_objects(self.root._find_paths(position, tolerance))
58
161
 
59
162
  def select_paragraphs(self) -> List[ParagraphObject]:
60
163
  # noinspection PyProtectedMember
@@ -78,10 +181,10 @@ class PageClient:
78
181
  # noinspection PyProtectedMember
79
182
  return self.root._to_textline_objects(self.root._find_text_lines(position))
80
183
 
81
- def select_paragraphs_at(self, x: float, y: float) -> List[ParagraphObject]:
184
+ def select_paragraphs_at(self, x: float, y: float, tolerance: float = DEFAULT_TOLERANCE) -> List[ParagraphObject]:
82
185
  position = Position.at_page_coordinates(self.page_index, x, y)
83
186
  # noinspection PyProtectedMember
84
- return self.root._to_paragraph_objects(self.root._find_paragraphs(position))
187
+ return self.root._to_paragraph_objects(self.root._find_paragraphs(position, tolerance))
85
188
 
86
189
  def select_text_lines(self) -> List[TextLineObject]:
87
190
  position = Position.at_page(self.page_index)
@@ -94,29 +197,29 @@ class PageClient:
94
197
  # noinspection PyProtectedMember
95
198
  return self.root._to_textline_objects(self.root._find_text_lines(position))
96
199
 
97
- def select_text_lines_at(self, x, y) -> List[TextLineObject]:
200
+ def select_text_lines_at(self, x, y, tolerance: float = DEFAULT_TOLERANCE) -> List[TextLineObject]:
98
201
  position = Position.at_page_coordinates(self.page_index, x, y)
99
202
  # noinspection PyProtectedMember
100
- return self.root._to_textline_objects(self.root._find_text_lines(position))
203
+ return self.root._to_textline_objects(self.root._find_text_lines(position, tolerance))
101
204
 
102
205
  def select_images(self) -> List[ImageObject]:
103
206
  # noinspection PyProtectedMember
104
207
  return self.root._to_image_objects(self.root._find_images(Position.at_page(self.page_index)))
105
208
 
106
- def select_images_at(self, x: float, y: float) -> List[ImageObject]:
209
+ def select_images_at(self, x: float, y: float, tolerance: float = DEFAULT_TOLERANCE) -> List[ImageObject]:
107
210
  position = Position.at_page_coordinates(self.page_index, x, y)
108
211
  # noinspection PyProtectedMember
109
- return self.root._to_image_objects(self.root._find_images(position))
212
+ return self.root._to_image_objects(self.root._find_images(position, tolerance))
110
213
 
111
214
  def select_forms(self) -> List[FormObject]:
112
215
  position = Position.at_page(self.page_index)
113
216
  # noinspection PyProtectedMember
114
217
  return self.root._to_form_objects(self.root._find_form_x_objects(position))
115
218
 
116
- def select_forms_at(self, x: float, y: float) -> List[FormObject]:
219
+ def select_forms_at(self, x: float, y: float, tolerance: float = DEFAULT_TOLERANCE) -> List[FormObject]:
117
220
  position = Position.at_page_coordinates(self.page_index, x, y)
118
221
  # noinspection PyProtectedMember
119
- return self.root._to_form_objects(self.root._find_form_x_objects(position))
222
+ return self.root._to_form_objects(self.root._find_form_x_objects(position, tolerance))
120
223
 
121
224
  def select_form_fields(self) -> List[FormFieldObject]:
122
225
  position = Position.at_page(self.page_index)
@@ -129,10 +232,10 @@ class PageClient:
129
232
  # noinspection PyProtectedMember
130
233
  return self.root._to_form_field_objects(self.root._find_form_fields(pos))
131
234
 
132
- def select_form_fields_at(self, x: float, y: float) -> List[FormFieldObject]:
235
+ def select_form_fields_at(self, x: float, y: float, tolerance: float = DEFAULT_TOLERANCE) -> List[FormFieldObject]:
133
236
  position = Position.at_page_coordinates(self.page_index, x, y)
134
237
  # noinspection PyProtectedMember
135
- return self.root._to_form_field_objects(self.root._find_form_fields(position))
238
+ return self.root._to_form_field_objects(self.root._find_form_fields(position, tolerance))
136
239
 
137
240
  @classmethod
138
241
  def from_ref(cls, root: 'PDFDancer', page_ref: PageRef) -> 'PageClient':
@@ -170,6 +273,18 @@ class PageClient:
170
273
  def new_paragraph(self):
171
274
  return ParagraphPageBuilder(self.root, self.page_index)
172
275
 
276
+ def new_path(self):
277
+ from .path_builder import PathBuilder
278
+ return PathBuilder(self.root, self.page_index)
279
+
280
+ def new_line(self):
281
+ from .path_builder import LineBuilder
282
+ return LineBuilder(self.root, self.page_index)
283
+
284
+ def new_bezier(self):
285
+ from .path_builder import BezierBuilder
286
+ return BezierBuilder(self.root, self.page_index)
287
+
173
288
  def select_paths(self):
174
289
  # noinspection PyProtectedMember
175
290
  return self.root._to_path_objects(self.root._find_paths(Position.at_page(self.page_index)))
@@ -221,9 +336,15 @@ class PDFDancer:
221
336
  """
222
337
  Create a client session, falling back to environment variables when needed.
223
338
 
339
+ Authentication:
340
+ - If token is provided, uses it
341
+ - Otherwise, checks PDFDANCER_TOKEN environment variable
342
+ - If no token is found, automatically obtains an anonymous token
343
+
224
344
  Args:
225
345
  pdf_data: PDF payload supplied directly or via filesystem handles.
226
- token: Override for the API token; falls back to `PDFDANCER_TOKEN` environement variable.
346
+ token: Override for the API token; falls back to `PDFDANCER_TOKEN` environment variable,
347
+ then to anonymous token if not set.
227
348
  base_url: Override for the API base URL; falls back to `PDFDANCER_BASE_URL`
228
349
  or defaults to `https://api.pdfdancer.com`.
229
350
  timeout: HTTP read timeout in seconds.
@@ -234,6 +355,10 @@ class PDFDancer:
234
355
  resolved_token = cls._resolve_token(token)
235
356
  resolved_base_url = cls._resolve_base_url(base_url)
236
357
 
358
+ # If no token found, obtain anonymous token
359
+ if resolved_token is None:
360
+ resolved_token = cls._obtain_anonymous_token(resolved_base_url, timeout)
361
+
237
362
  return PDFDancer(resolved_token, pdf_data, resolved_base_url, timeout)
238
363
 
239
364
  @classmethod
@@ -244,18 +369,66 @@ class PDFDancer:
244
369
  resolved_base_url = "https://api.pdfdancer.com"
245
370
  return resolved_base_url
246
371
 
372
+ @classmethod
373
+ def _obtain_anonymous_token(cls, base_url: str, timeout: float = 30.0) -> str:
374
+ """
375
+ Obtain an anonymous token from the /keys/anon endpoint.
376
+
377
+ Args:
378
+ base_url: Base URL of the PDFDancer API server
379
+ timeout: HTTP read timeout in seconds
380
+
381
+ Returns:
382
+ Anonymous token string
383
+
384
+ Raises:
385
+ HttpClientException: If token request fails
386
+ """
387
+ try:
388
+ # Create temporary client without authentication
389
+ temp_client = httpx.Client(
390
+ http2=True,
391
+ verify=not DISABLE_SSL_VERIFY
392
+ )
393
+
394
+ headers = {
395
+ 'X-Fingerprint': Fingerprint.generate()
396
+ }
397
+
398
+ response = temp_client.post(
399
+ cls._cleanup_url_path(base_url, "/keys/anon"),
400
+ headers=headers,
401
+ timeout=timeout if timeout > 0 else None
402
+ )
403
+
404
+ response.raise_for_status()
405
+ token_data = response.json()
406
+
407
+ # Extract token from response (matches Java AnonTokenResponse structure)
408
+ if isinstance(token_data, dict) and 'token' in token_data:
409
+ return token_data['token']
410
+ else:
411
+ raise HttpClientException("Invalid anonymous token response format")
412
+
413
+ except httpx.HTTPStatusError as e:
414
+ raise HttpClientException(f"Failed to obtain anonymous token: HTTP {e.response.status_code}",
415
+ response=e.response, cause=e) from None
416
+ except httpx.RequestError as e:
417
+ raise HttpClientException(f"Failed to obtain anonymous token: {str(e)}",
418
+ response=None, cause=e) from None
419
+ finally:
420
+ temp_client.close()
421
+
247
422
  @classmethod
248
423
  def _resolve_token(cls, token: Optional[str]) -> Optional[str]:
424
+ """
425
+ Resolve token from argument or environment variable.
426
+ Returns None if no token is found (allowing fallback to anonymous token).
427
+ """
249
428
  resolved_token = token.strip() if token and token.strip() else None
250
429
  if resolved_token is None:
251
430
  env_token = os.getenv("PDFDANCER_TOKEN")
252
431
  resolved_token = env_token.strip() if env_token and env_token.strip() else None
253
-
254
- if resolved_token is None:
255
- raise ValidationException(
256
- "Missing PDFDancer API token. Pass a token via the `token` argument "
257
- "or set the PDFDANCER_TOKEN environment variable."
258
- )
259
432
  return resolved_token
260
433
 
261
434
  @classmethod
@@ -269,8 +442,14 @@ class PDFDancer:
269
442
  """
270
443
  Create a new blank PDF document with optional configuration.
271
444
 
445
+ Authentication:
446
+ - If token is provided, uses it
447
+ - Otherwise, checks PDFDANCER_TOKEN environment variable
448
+ - If no token is found, automatically obtains an anonymous token
449
+
272
450
  Args:
273
- token: Override for the API token; falls back to `PDFDANCER_TOKEN` environment variable.
451
+ token: Override for the API token; falls back to `PDFDANCER_TOKEN` environment variable,
452
+ then to anonymous token if not set.
274
453
  base_url: Override for the API base URL; falls back to `PDFDANCER_BASE_URL`
275
454
  or defaults to `https://api.pdfdancer.com`.
276
455
  timeout: HTTP read timeout in seconds.
@@ -285,6 +464,10 @@ class PDFDancer:
285
464
  resolved_token = cls._resolve_token(token)
286
465
  resolved_base_url = cls._resolve_base_url(base_url)
287
466
 
467
+ # If no token found, obtain anonymous token
468
+ if resolved_token is None:
469
+ resolved_token = cls._obtain_anonymous_token(resolved_base_url, timeout)
470
+
288
471
  # Create a new instance that will call _create_blank_pdf_session
289
472
  instance = object.__new__(cls)
290
473
 
@@ -296,11 +479,12 @@ class PDFDancer:
296
479
  instance._base_url = resolved_base_url.rstrip('/')
297
480
  instance._read_timeout = timeout
298
481
 
299
- # Create HTTP session for connection reuse
300
- instance._session = requests.Session()
301
- instance._session.headers.update({
302
- 'Authorization': f'Bearer {instance._token}'
303
- })
482
+ # Create HTTP client for connection reuse with HTTP/2 support
483
+ instance._client = httpx.Client(
484
+ http2=True,
485
+ headers={'Authorization': f'Bearer {instance._token}'},
486
+ verify=not DISABLE_SSL_VERIFY
487
+ )
304
488
 
305
489
  # Create blank PDF session
306
490
  instance._session_id = instance._create_blank_pdf_session(
@@ -312,6 +496,10 @@ class PDFDancer:
312
496
  # Set pdf_bytes to None since we don't have the PDF bytes yet
313
497
  instance._pdf_bytes = None
314
498
 
499
+ # Initialize snapshot caches (lazy-loaded)
500
+ instance._document_snapshot = None
501
+ instance._page_snapshots = {}
502
+
315
503
  return instance
316
504
 
317
505
  def __init__(self, token: str, pdf_data: Union[bytes, Path, str, BinaryIO],
@@ -343,15 +531,20 @@ class PDFDancer:
343
531
  # Process PDF data with validation
344
532
  self._pdf_bytes = self._process_pdf_data(pdf_data)
345
533
 
346
- # Create HTTP session for connection reuse
347
- self._session = requests.Session()
348
- self._session.headers.update({
349
- 'Authorization': f'Bearer {self._token}'
350
- })
534
+ # Create HTTP client for connection reuse with HTTP/2 support
535
+ self._client = httpx.Client(
536
+ http2=True,
537
+ headers={'Authorization': f'Bearer {self._token}'},
538
+ verify=not DISABLE_SSL_VERIFY
539
+ )
351
540
 
352
541
  # Create session - equivalent to Java constructor behavior
353
542
  self._session_id = self._create_session()
354
543
 
544
+ # Initialize snapshot caches (lazy-loaded)
545
+ self._document_snapshot: Optional[DocumentSnapshot] = None
546
+ self._page_snapshots: dict[int, PageSnapshot] = {}
547
+
355
548
  @staticmethod
356
549
  def _process_pdf_data(pdf_data: Union[bytes, Path, str, BinaryIO]) -> bytes:
357
550
  """
@@ -393,7 +586,7 @@ class PDFDancer:
393
586
  except (IOError, OSError) as e:
394
587
  raise PdfDancerException(f"Failed to read PDF data: {e}", cause=e)
395
588
 
396
- def _extract_error_message(self, response: Optional[requests.Response]) -> str:
589
+ def _extract_error_message(self, response: Optional[httpx.Response]) -> str:
397
590
  """
398
591
  Extract meaningful error messages from API response.
399
592
  Parses JSON error responses with _embedded.errors structure.
@@ -429,7 +622,7 @@ class PDFDancer:
429
622
  # If JSON parsing fails, return response content or status
430
623
  return response.text or f"HTTP {response.status_code}"
431
624
 
432
- def _handle_authentication_error(self, response: Optional[requests.Response]) -> None:
625
+ def _handle_authentication_error(self, response: Optional[httpx.Response]) -> None:
433
626
  """
434
627
  Translate authentication failures into a clear, actionable validation error.
435
628
  """
@@ -466,16 +659,54 @@ class PDFDancer:
466
659
  Creates a new PDF processing session by uploading the PDF data.
467
660
  """
468
661
  try:
469
- files = {
470
- 'pdf': ('document.pdf', self._pdf_bytes, 'application/pdf')
662
+ # Build multipart body manually to avoid base64 encoding and enable compression
663
+ # httpx by default may add Content-Transfer-Encoding: base64 which the server rejects
664
+ import uuid
665
+
666
+ boundary = uuid.uuid4().hex
667
+
668
+ # Build multipart body with binary (not base64) encoding
669
+ body_parts = []
670
+ body_parts.append(f'--{boundary}\r\n'.encode('utf-8'))
671
+ body_parts.append(b'Content-Disposition: form-data; name="pdf"; filename="document.pdf"\r\n')
672
+ body_parts.append(b'Content-Type: application/pdf\r\n')
673
+ body_parts.append(b'\r\n') # End of headers, no Content-Transfer-Encoding
674
+ body_parts.append(self._pdf_bytes)
675
+ body_parts.append(b'\r\n')
676
+ body_parts.append(f'--{boundary}--\r\n'.encode('utf-8'))
677
+
678
+ uncompressed_body = b''.join(body_parts)
679
+
680
+ # Compress entire request body using gzip
681
+ compressed_body = gzip.compress(uncompressed_body)
682
+
683
+ original_size = len(uncompressed_body)
684
+ compressed_size = len(compressed_body)
685
+ compression_ratio = (1 - compressed_size / original_size) * 100 if original_size > 0 else 0
686
+
687
+ if DEBUG:
688
+ print(f"{time.time()}|POST /session/create - original size: {original_size} bytes, "
689
+ f"compressed size: {compressed_size} bytes, "
690
+ f"compression: {compression_ratio:.1f}%")
691
+
692
+ headers = {
693
+ 'X-Generated-At': _generate_timestamp(),
694
+ 'Content-Type': f'multipart/form-data; boundary={boundary}',
695
+ 'Content-Encoding': 'gzip'
471
696
  }
472
697
 
473
- response = self._session.post(
698
+ response = self._client.post(
474
699
  self._cleanup_url_path(self._base_url, "/session/create"),
475
- files=files,
700
+ content=compressed_body,
701
+ headers=headers,
476
702
  timeout=self._read_timeout if self._read_timeout > 0 else None
477
703
  )
478
704
 
705
+ response_size = len(response.content)
706
+ if DEBUG:
707
+ print(f"{time.time()}|POST /session/create - response size: {response_size} bytes")
708
+
709
+ _log_generated_at_header(response, "POST", "/session/create")
479
710
  self._handle_authentication_error(response)
480
711
  response.raise_for_status()
481
712
  session_id = response.text.strip()
@@ -485,11 +716,14 @@ class PDFDancer:
485
716
 
486
717
  return session_id
487
718
 
488
- except requests.exceptions.RequestException as e:
489
- self._handle_authentication_error(getattr(e, 'response', None))
490
- error_message = self._extract_error_message(getattr(e, 'response', None))
719
+ except httpx.HTTPStatusError as e:
720
+ self._handle_authentication_error(e.response)
721
+ error_message = self._extract_error_message(e.response)
491
722
  raise HttpClientException(f"Failed to create session: {error_message}",
492
- response=getattr(e, 'response', None), cause=e) from None
723
+ response=e.response, cause=e) from None
724
+ except httpx.RequestError as e:
725
+ raise HttpClientException(f"Failed to create session: {str(e)}",
726
+ response=None, cause=e) from None
493
727
 
494
728
  def _create_blank_pdf_session(self,
495
729
  page_size: Optional[Union[PageSize, str, Mapping[str, Any]]] = None,
@@ -538,14 +772,27 @@ class PDFDancer:
538
772
  raise ValidationException(f"Initial page count must be at least 1, got {initial_page_count}")
539
773
  request_data['initialPageCount'] = initial_page_count
540
774
 
541
- headers = {'Content-Type': 'application/json'}
542
- response = self._session.post(
775
+ request_body = json.dumps(request_data)
776
+ request_size = len(request_body.encode('utf-8'))
777
+ if DEBUG:
778
+ print(f"{time.time()}|POST /session/new - request size: {request_size} bytes")
779
+
780
+ headers = {
781
+ 'Content-Type': 'application/json',
782
+ 'X-Generated-At': _generate_timestamp()
783
+ }
784
+ response = self._client.post(
543
785
  self._cleanup_url_path(self._base_url, "/session/new"),
544
786
  json=request_data,
545
787
  headers=headers,
546
788
  timeout=self._read_timeout if self._read_timeout > 0 else None
547
789
  )
548
790
 
791
+ response_size = len(response.content)
792
+ if DEBUG:
793
+ print(f"{time.time()}|POST /session/new - response size: {response_size} bytes")
794
+
795
+ _log_generated_at_header(response, "POST", "/session/new")
549
796
  self._handle_authentication_error(response)
550
797
  response.raise_for_status()
551
798
  session_id = response.text.strip()
@@ -555,24 +802,36 @@ class PDFDancer:
555
802
 
556
803
  return session_id
557
804
 
558
- except requests.exceptions.RequestException as e:
559
- self._handle_authentication_error(getattr(e, 'response', None))
560
- error_message = self._extract_error_message(getattr(e, 'response', None))
805
+ except httpx.HTTPStatusError as e:
806
+ self._handle_authentication_error(e.response)
807
+ error_message = self._extract_error_message(e.response)
561
808
  raise HttpClientException(f"Failed to create blank PDF session: {error_message}",
562
- response=getattr(e, 'response', None), cause=e) from None
809
+ response=e.response, cause=e) from None
810
+ except httpx.RequestError as e:
811
+ raise HttpClientException(f"Failed to create blank PDF session: {str(e)}",
812
+ response=None, cause=e) from None
563
813
 
564
814
  def _make_request(self, method: str, path: str, data: Optional[dict] = None,
565
- params: Optional[dict] = None) -> requests.Response:
815
+ params: Optional[dict] = None) -> httpx.Response:
566
816
  """
567
817
  Make HTTP request with session headers and error handling.
568
818
  """
569
819
  headers = {
570
820
  'X-Session-Id': self._session_id,
571
- 'Content-Type': 'application/json'
821
+ 'Content-Type': 'application/json',
822
+ 'X-Generated-At': _generate_timestamp(),
823
+ 'X-Fingerprint': Fingerprint.generate()
572
824
  }
573
825
 
574
826
  try:
575
- response = self._session.request(
827
+ request_size = 0
828
+ if data is not None:
829
+ request_body = json.dumps(data)
830
+ request_size = len(request_body.encode('utf-8'))
831
+ if DEBUG:
832
+ print(f"{time.time()}|{method} {path} - request size: {request_size} bytes")
833
+
834
+ response = self._client.request(
576
835
  method=method,
577
836
  url=self._cleanup_url_path(self._base_url, path),
578
837
  json=data,
@@ -581,6 +840,12 @@ class PDFDancer:
581
840
  timeout=self._read_timeout if self._read_timeout > 0 else None
582
841
  )
583
842
 
843
+ response_size = len(response.content)
844
+ if DEBUG:
845
+ print(f"{time.time()}|{method} {path} - response size: {response_size} bytes")
846
+
847
+ _log_generated_at_header(response, method, path)
848
+
584
849
  # Handle FontNotFoundException
585
850
  if response.status_code == 404:
586
851
  try:
@@ -594,31 +859,46 @@ class PDFDancer:
594
859
  response.raise_for_status()
595
860
  return response
596
861
 
597
- except requests.exceptions.RequestException as e:
598
- self._handle_authentication_error(getattr(e, 'response', None))
599
- error_message = self._extract_error_message(getattr(e, 'response', None))
600
- raise HttpClientException(f"API request failed: {error_message}", response=getattr(e, 'response', None),
862
+ except httpx.HTTPStatusError as e:
863
+ self._handle_authentication_error(e.response)
864
+ error_message = self._extract_error_message(e.response)
865
+ raise HttpClientException(f"API request failed: {error_message}", response=e.response,
866
+ cause=e) from None
867
+ except httpx.RequestError as e:
868
+ raise HttpClientException(f"API request failed: {str(e)}", response=None,
601
869
  cause=e) from None
602
870
 
603
- def _find(self, object_type: Optional[ObjectType] = None, position: Optional[Position] = None) -> List[ObjectRef]:
871
+ def _find(self, object_type: Optional[ObjectType] = None, position: Optional[Position] = None,
872
+ tolerance: float = DEFAULT_TOLERANCE) -> List[ObjectRef]:
604
873
  """
605
874
  Searches for PDF objects matching the specified criteria.
606
- This method provides flexible search capabilities across all PDF content,
607
- allowing filtering by object type and position constraints.
875
+ Uses snapshot cache for all queries except paths at specific coordinates.
608
876
 
609
877
  Args:
610
878
  object_type: The type of objects to find (None for all types)
611
879
  position: Positional constraints for the search (None for all positions)
880
+ tolerance: Tolerance in points for spatial matching (default: DEFAULT_TOLERANCE)
612
881
 
613
882
  Returns:
614
883
  List of object references matching the search criteria
615
884
  """
616
- request_data = FindRequest(object_type, position).to_dict()
617
- response = self._make_request('POST', '/pdf/find', data=request_data)
618
-
619
- # Parse response into ObjectRef objects
620
- objects_data = response.json()
621
- return [self._parse_object_ref(obj_data) for obj_data in objects_data]
885
+ # Special case: PATH queries with bounding_rect need API (full vector data)
886
+ if object_type == ObjectType.PATH and position and position.bounding_rect:
887
+ request_data = FindRequest(object_type, position).to_dict()
888
+ response = self._make_request('POST', '/pdf/find', data=request_data)
889
+ objects_data = response.json()
890
+ return [self._parse_object_ref(obj_data) for obj_data in objects_data]
891
+
892
+ # Use snapshot for all other queries
893
+ if position and position.page_index is not None:
894
+ snapshot = self._get_or_fetch_page_snapshot(position.page_index)
895
+ return self._filter_snapshot_elements(snapshot.elements, object_type, position, tolerance)
896
+ else:
897
+ snapshot = self._get_or_fetch_document_snapshot()
898
+ all_elements = []
899
+ for page_snap in snapshot.pages:
900
+ all_elements.extend(page_snap.elements)
901
+ return self._filter_snapshot_elements(all_elements, object_type, position, tolerance)
622
902
 
623
903
  def select_paragraphs(self) -> List[TextObjectRef]:
624
904
  """
@@ -626,21 +906,39 @@ class PDFDancer:
626
906
  """
627
907
  return self._find_paragraphs(None)
628
908
 
629
- def _find_paragraphs(self, position: Optional[Position] = None) -> List[TextObjectRef]:
909
+ def _find_paragraphs(self, position: Optional[Position] = None, tolerance: float = DEFAULT_TOLERANCE) -> List[
910
+ TextObjectRef]:
630
911
  """
631
912
  Searches for paragraph objects returning TextObjectRef with hierarchical structure.
913
+ Uses snapshot cache for all queries.
632
914
  """
633
- request_data = FindRequest(ObjectType.PARAGRAPH, position).to_dict()
634
- response = self._make_request('POST', '/pdf/find', data=request_data)
635
-
636
- objects_data = response.json()
637
- return [self._parse_text_object_ref(obj_data) for obj_data in objects_data]
915
+ # Use snapshot for all queries (including spatial)
916
+ if position and position.page_index is not None:
917
+ snapshot = self._get_or_fetch_page_snapshot(position.page_index)
918
+ return self._filter_snapshot_elements(snapshot.elements, ObjectType.PARAGRAPH, position, tolerance)
919
+ else:
920
+ snapshot = self._get_or_fetch_document_snapshot()
921
+ all_elements = []
922
+ for page_snap in snapshot.pages:
923
+ all_elements.extend(page_snap.elements)
924
+ return self._filter_snapshot_elements(all_elements, ObjectType.PARAGRAPH, position, tolerance)
638
925
 
639
- def _find_images(self, position: Optional[Position] = None) -> List[ObjectRef]:
926
+ def _find_images(self, position: Optional[Position] = None, tolerance: float = DEFAULT_TOLERANCE) -> List[
927
+ ObjectRef]:
640
928
  """
641
929
  Searches for image objects at the specified position.
930
+ Uses snapshot cache for all queries.
642
931
  """
643
- return self._find(ObjectType.IMAGE, position)
932
+ # Use snapshot for all queries (including spatial)
933
+ if position and position.page_index is not None:
934
+ snapshot = self._get_or_fetch_page_snapshot(position.page_index)
935
+ return self._filter_snapshot_elements(snapshot.elements, ObjectType.IMAGE, position, tolerance)
936
+ else:
937
+ snapshot = self._get_or_fetch_document_snapshot()
938
+ all_elements = []
939
+ for page_snap in snapshot.pages:
940
+ all_elements.extend(page_snap.elements)
941
+ return self._filter_snapshot_elements(all_elements, ObjectType.IMAGE, position, tolerance)
644
942
 
645
943
  def select_images(self) -> List[ImageObject]:
646
944
  """
@@ -654,11 +952,22 @@ class PDFDancer:
654
952
  """
655
953
  return self._to_form_objects(self._find(ObjectType.FORM_X_OBJECT, None))
656
954
 
657
- def _find_form_x_objects(self, position: Optional[Position] = None) -> List[ObjectRef]:
955
+ def _find_form_x_objects(self, position: Optional[Position] = None, tolerance: float = DEFAULT_TOLERANCE) -> List[
956
+ ObjectRef]:
658
957
  """
659
- Searches for form field objects at the specified position.
958
+ Searches for form X objects at the specified position.
959
+ Uses snapshot cache for all queries.
660
960
  """
661
- return self._find(ObjectType.FORM_X_OBJECT, position)
961
+ # Use snapshot for all queries (including spatial)
962
+ if position and position.page_index is not None:
963
+ snapshot = self._get_or_fetch_page_snapshot(position.page_index)
964
+ return self._filter_snapshot_elements(snapshot.elements, ObjectType.FORM_X_OBJECT, position, tolerance)
965
+ else:
966
+ snapshot = self._get_or_fetch_document_snapshot()
967
+ all_elements = []
968
+ for page_snap in snapshot.pages:
969
+ all_elements.extend(page_snap.elements)
970
+ return self._filter_snapshot_elements(all_elements, ObjectType.FORM_X_OBJECT, position, tolerance)
662
971
 
663
972
  def select_form_fields(self) -> List[FormFieldObject]:
664
973
  """
@@ -672,17 +981,23 @@ class PDFDancer:
672
981
  """
673
982
  return self._to_form_field_objects(self._find_form_fields(Position.by_name(field_name)))
674
983
 
675
- def _find_form_fields(self, position: Optional[Position] = None) -> List[FormFieldRef]:
984
+ def _find_form_fields(self, position: Optional[Position] = None, tolerance: float = DEFAULT_TOLERANCE) -> List[
985
+ FormFieldRef]:
676
986
  """
677
987
  Searches for form fields at the specified position.
678
988
  Returns FormFieldRef objects with name and value properties.
989
+ Uses snapshot cache for all queries (including name and spatial filtering).
679
990
  """
680
- request_data = FindRequest(ObjectType.FORM_FIELD, position).to_dict()
681
- response = self._make_request('POST', '/pdf/find', data=request_data)
682
-
683
- # Parse response into ObjectRef objects
684
- objects_data = response.json()
685
- return [self._parse_form_field_ref(obj_data) for obj_data in objects_data]
991
+ # Use snapshot for all queries (including name and spatial)
992
+ if position and position.page_index is not None:
993
+ snapshot = self._get_or_fetch_page_snapshot(position.page_index)
994
+ return self._filter_snapshot_elements(snapshot.elements, ObjectType.FORM_FIELD, position, tolerance)
995
+ else:
996
+ snapshot = self._get_or_fetch_document_snapshot()
997
+ all_elements = []
998
+ for page_snap in snapshot.pages:
999
+ all_elements.extend(page_snap.elements)
1000
+ return self._filter_snapshot_elements(all_elements, ObjectType.FORM_FIELD, position, tolerance)
686
1001
 
687
1002
  def _change_form_field(self, form_field_ref: FormFieldRef, new_value: str) -> bool:
688
1003
  """
@@ -691,9 +1006,12 @@ class PDFDancer:
691
1006
  if form_field_ref is None:
692
1007
  raise ValidationException("Form field reference cannot be null")
693
1008
 
694
- request_data = ChangeFormFieldRequest(form_field_ref, new_value).to_dict()
695
- response = self._make_request('PUT', '/pdf/modify/formField', data=request_data)
696
- return response.json()
1009
+ try:
1010
+ request_data = ChangeFormFieldRequest(form_field_ref, new_value).to_dict()
1011
+ response = self._make_request('PUT', '/pdf/modify/formField', data=request_data)
1012
+ return response.json()
1013
+ finally:
1014
+ self._invalidate_snapshots()
697
1015
 
698
1016
  def select_paths(self) -> List[ObjectRef]:
699
1017
  """
@@ -701,21 +1019,45 @@ class PDFDancer:
701
1019
  """
702
1020
  return self._find(ObjectType.PATH, None)
703
1021
 
704
- def _find_paths(self, position: Optional[Position] = None) -> List[ObjectRef]:
1022
+ def _find_paths(self, position: Optional[Position] = None, tolerance: float = DEFAULT_TOLERANCE) -> List[ObjectRef]:
705
1023
  """
706
1024
  Searches for vector path objects at the specified position.
707
- """
708
- return self._find(ObjectType.PATH, position)
1025
+ Note: Spatial queries (with bounding_rect) fall back to API since snapshots
1026
+ don't include full vector path data needed for precise intersection tests.
1027
+ """
1028
+ # Special case: paths at specific coordinates need full vector data
1029
+ # which is not available in snapshots, so pass through to API
1030
+ if position and position.bounding_rect:
1031
+ return self._find(ObjectType.PATH, position, tolerance)
1032
+
1033
+ # For simple page-level "all paths" queries, use snapshot
1034
+ if position and position.page_index is not None:
1035
+ snapshot = self._get_or_fetch_page_snapshot(position.page_index)
1036
+ return self._filter_snapshot_elements(snapshot.elements, ObjectType.PATH, position, tolerance)
1037
+ else:
1038
+ # Document-level query - use document snapshot
1039
+ snapshot = self._get_or_fetch_document_snapshot()
1040
+ all_elements = []
1041
+ for page_snap in snapshot.pages:
1042
+ all_elements.extend(page_snap.elements)
1043
+ return self._filter_snapshot_elements(all_elements, ObjectType.PATH, position, tolerance)
709
1044
 
710
- def _find_text_lines(self, position: Optional[Position] = None) -> List[TextObjectRef]:
1045
+ def _find_text_lines(self, position: Optional[Position] = None, tolerance: float = DEFAULT_TOLERANCE) -> List[
1046
+ TextObjectRef]:
711
1047
  """
712
1048
  Searches for text line objects returning TextObjectRef with hierarchical structure.
1049
+ Uses snapshot cache for all queries.
713
1050
  """
714
- request_data = FindRequest(ObjectType.TEXT_LINE, position).to_dict()
715
- response = self._make_request('POST', '/pdf/find', data=request_data)
716
-
717
- objects_data = response.json()
718
- return [self._parse_text_object_ref(obj_data) for obj_data in objects_data]
1051
+ # Use snapshot for all queries (including spatial)
1052
+ if position and position.page_index is not None:
1053
+ snapshot = self._get_or_fetch_page_snapshot(position.page_index)
1054
+ return self._filter_snapshot_elements(snapshot.elements, ObjectType.TEXT_LINE, position, tolerance)
1055
+ else:
1056
+ snapshot = self._get_or_fetch_document_snapshot()
1057
+ all_elements = []
1058
+ for page_snap in snapshot.pages:
1059
+ all_elements.extend(page_snap.elements)
1060
+ return self._filter_snapshot_elements(all_elements, ObjectType.TEXT_LINE, position, tolerance)
719
1061
 
720
1062
  def select_text_lines(self) -> List[TextLineObject]:
721
1063
  """
@@ -725,7 +1067,7 @@ class PDFDancer:
725
1067
 
726
1068
  def page(self, page_index: int) -> PageClient:
727
1069
  """
728
- Get a specific page by index, fetching page properties from the server.
1070
+ Get a specific page by index, using snapshot cache when available.
729
1071
 
730
1072
  Args:
731
1073
  page_index: The 0-based page index
@@ -733,11 +1075,16 @@ class PDFDancer:
733
1075
  Returns:
734
1076
  PageClient with page properties populated
735
1077
  """
1078
+ # Try to get page ref from snapshot first (avoids API call)
1079
+ page_snapshot = self._get_or_fetch_page_snapshot(page_index)
1080
+ if page_snapshot and page_snapshot.page_ref:
1081
+ return PageClient.from_ref(self, page_snapshot.page_ref)
1082
+
1083
+ # Fallback to API if snapshot doesn't have page ref
736
1084
  page_ref = self._get_page(page_index)
737
1085
  if page_ref:
738
1086
  return PageClient.from_ref(self, page_ref)
739
1087
  else:
740
- # Fallback to basic PageClient if page not found
741
1088
  return PageClient(page_index, self)
742
1089
 
743
1090
  # Page Operations
@@ -747,11 +1094,11 @@ class PDFDancer:
747
1094
 
748
1095
  def _get_pages(self) -> List[PageRef]:
749
1096
  """
750
- Retrieves references to all pages in the PDF document.
1097
+ Retrieves references to all pages in the PDF document using snapshot cache.
751
1098
  """
752
- response = self._make_request('POST', '/pdf/page/find')
753
- pages_data = response.json()
754
- return [self._parse_page_ref(page_data) for page_data in pages_data]
1099
+ # Use document snapshot which includes all pages (avoids API call)
1100
+ doc_snapshot = self._get_or_fetch_document_snapshot()
1101
+ return [page_snap.page_ref for page_snap in doc_snapshot.pages]
755
1102
 
756
1103
  def _get_page(self, page_index: int) -> Optional[PageRef]:
757
1104
  """
@@ -791,7 +1138,13 @@ class PDFDancer:
791
1138
  request_data = page_ref.to_dict()
792
1139
 
793
1140
  response = self._make_request('DELETE', '/pdf/page/delete', data=request_data)
794
- return response.json()
1141
+ result = response.json()
1142
+
1143
+ # Invalidate snapshot caches after mutation
1144
+ if result:
1145
+ self._invalidate_snapshots()
1146
+
1147
+ return result
795
1148
 
796
1149
  def move_page(self, from_page_index: int, to_page_index: int) -> bool:
797
1150
  """Move a page to a different index within the document."""
@@ -810,6 +1163,11 @@ class PDFDancer:
810
1163
  request_data = PageMoveRequest(from_page_index, to_page_index).to_dict()
811
1164
  response = self._make_request('PUT', '/pdf/page/move', data=request_data)
812
1165
  result = response.json()
1166
+
1167
+ # Invalidate snapshot caches after mutation
1168
+ if result:
1169
+ self._invalidate_snapshots()
1170
+
813
1171
  return bool(result)
814
1172
 
815
1173
  # Manipulation Operations
@@ -829,7 +1187,13 @@ class PDFDancer:
829
1187
 
830
1188
  request_data = DeleteRequest(object_ref).to_dict()
831
1189
  response = self._make_request('DELETE', '/pdf/delete', data=request_data)
832
- return response.json()
1190
+ result = response.json()
1191
+
1192
+ # Invalidate snapshot caches after mutation
1193
+ if result:
1194
+ self._invalidate_snapshots()
1195
+
1196
+ return result
833
1197
 
834
1198
  def _move(self, object_ref: ObjectRef, position: Position) -> bool:
835
1199
  """
@@ -849,7 +1213,13 @@ class PDFDancer:
849
1213
 
850
1214
  request_data = MoveRequest(object_ref, position).to_dict()
851
1215
  response = self._make_request('PUT', '/pdf/move', data=request_data)
852
- return response.json()
1216
+ result = response.json()
1217
+
1218
+ # Invalidate snapshot caches after mutation
1219
+ if result:
1220
+ self._invalidate_snapshots()
1221
+
1222
+ return result
853
1223
 
854
1224
  # Add Operations
855
1225
 
@@ -896,24 +1266,58 @@ class PDFDancer:
896
1266
 
897
1267
  return self._add_object(paragraph)
898
1268
 
1269
+ def _add_path(self, path: 'Path') -> bool:
1270
+ """
1271
+ Internal method to add a path to the document after validation.
1272
+ """
1273
+ from .models import Path as PathModel
1274
+
1275
+ if path is None:
1276
+ raise ValidationException("Path cannot be null")
1277
+ if path.get_position() is None:
1278
+ raise ValidationException("Path position is null")
1279
+ if path.get_position().page_index is None:
1280
+ raise ValidationException("Path position page index is null")
1281
+ if path.get_position().page_index < 0:
1282
+ raise ValidationException("Path position page index is less than 0")
1283
+ if not path.get_path_segments() or len(path.get_path_segments()) == 0:
1284
+ raise ValidationException("Path must have at least one segment")
1285
+
1286
+ return self._add_object(path)
1287
+
899
1288
  def _add_object(self, pdf_object) -> bool:
900
1289
  """
901
1290
  Internal method to add any PDF object.
902
1291
  """
903
1292
  request_data = AddRequest(pdf_object).to_dict()
904
1293
  response = self._make_request('POST', '/pdf/add', data=request_data)
905
- return response.json()
1294
+ result = response.json()
1295
+
1296
+ # Invalidate snapshot caches after mutation
1297
+ if result:
1298
+ self._invalidate_snapshots()
1299
+
1300
+ return result
906
1301
 
907
1302
  def new_paragraph(self) -> ParagraphBuilder:
908
1303
  return ParagraphBuilder(self)
909
1304
 
910
1305
  def new_page(self):
911
1306
  response = self._make_request('POST', '/pdf/page/add', data=None)
912
- return self._parse_page_ref(response.json())
1307
+ result = self._parse_page_ref(response.json())
1308
+
1309
+ # Invalidate snapshot caches after adding page
1310
+ self._invalidate_snapshots()
1311
+
1312
+ return result
913
1313
 
914
1314
  def new_image(self) -> ImageBuilder:
915
1315
  return ImageBuilder(self)
916
1316
 
1317
+ def new_path(self) -> 'PathBuilder':
1318
+ from .path_builder import PathBuilder
1319
+ return PathBuilder(self)
1320
+
917
1321
  # Modify Operations
918
1322
  def _modify_paragraph(self, object_ref: ObjectRef, new_paragraph: Union[Paragraph, str]) -> CommandResult:
919
1323
  """
@@ -935,12 +1339,16 @@ class PDFDancer:
935
1339
  # Text modification - returns CommandResult
936
1340
  request_data = ModifyTextRequest(object_ref, new_paragraph).to_dict()
937
1341
  response = self._make_request('PUT', '/pdf/text/paragraph', data=request_data)
938
- return CommandResult.from_dict(response.json())
1342
+ result = CommandResult.from_dict(response.json())
939
1343
  else:
940
1344
  # Object modification
941
1345
  request_data = ModifyRequest(object_ref, new_paragraph).to_dict()
942
1346
  response = self._make_request('PUT', '/pdf/modify', data=request_data)
943
- return CommandResult.from_dict(response.json())
1347
+ result = CommandResult.from_dict(response.json())
1348
+
1349
+ # Invalidate snapshot caches after mutation
1350
+ self._invalidate_snapshots()
1351
+ return result
944
1352
 
945
1353
  def _modify_text_line(self, object_ref: ObjectRef, new_text: str) -> CommandResult:
946
1354
  """
@@ -960,7 +1368,11 @@ class PDFDancer:
960
1368
 
961
1369
  request_data = ModifyTextRequest(object_ref, new_text).to_dict()
962
1370
  response = self._make_request('PUT', '/pdf/text/line', data=request_data)
963
- return CommandResult.from_dict(response.json())
1371
+ result = CommandResult.from_dict(response.json())
1372
+
1373
+ # Invalidate snapshot caches after mutation
1374
+ self._invalidate_snapshots()
1375
+ return result
964
1376
 
965
1377
  # Font Operations
966
1378
 
@@ -1040,26 +1452,224 @@ class PDFDancer:
1040
1452
  'ttfFile': (filename, font_data, 'font/ttf')
1041
1453
  }
1042
1454
 
1043
- headers = {'X-Session-Id': self._session_id}
1044
- response = self._session.post(
1455
+ request_size = len(font_data)
1456
+ if DEBUG:
1457
+ print(f"{time.time()}|POST /font/register - request size: {request_size} bytes")
1458
+
1459
+ headers = {
1460
+ 'X-Session-Id': self._session_id,
1461
+ 'X-Generated-At': _generate_timestamp()
1462
+ }
1463
+ response = self._client.post(
1045
1464
  self._cleanup_url_path(self._base_url, "/font/register"),
1046
1465
  files=files,
1047
1466
  headers=headers,
1048
1467
  timeout=30
1049
1468
  )
1050
1469
 
1470
+ response_size = len(response.content)
1471
+ if DEBUG:
1472
+ print(f"{time.time()}|POST /font/register - response size: {response_size} bytes")
1473
+
1474
+ _log_generated_at_header(response, "POST", "/font/register")
1051
1475
  response.raise_for_status()
1052
1476
  return response.text.strip()
1053
1477
 
1054
1478
  except (IOError, OSError) as e:
1055
1479
  raise PdfDancerException(f"Failed to read font file: {e}", cause=e)
1056
- except requests.exceptions.RequestException as e:
1057
- error_message = self._extract_error_message(getattr(e, 'response', None))
1480
+ except httpx.HTTPStatusError as e:
1481
+ error_message = self._extract_error_message(e.response)
1058
1482
  raise HttpClientException(f"Font registration failed: {error_message}",
1059
- response=getattr(e, 'response', None), cause=e) from None
1483
+ response=e.response, cause=e) from None
1484
+ except httpx.RequestError as e:
1485
+ raise HttpClientException(f"Font registration failed: {str(e)}",
1486
+ response=None, cause=e) from None
1060
1487
 
1061
1488
  # Document Operations
1062
1489
 
1490
+ # Snapshot Operations
1491
+
1492
+ def get_document_snapshot(self, types: Optional[str] = None) -> DocumentSnapshot:
1493
+ """
1494
+ Retrieve a snapshot of the entire document with all pages and elements.
1495
+
1496
+ Args:
1497
+ types: Optional comma-separated string of object types to filter (e.g., "PARAGRAPH,IMAGE")
1498
+
1499
+ Returns:
1500
+ DocumentSnapshot containing page count, fonts, and all page snapshots
1501
+ """
1502
+ params = {}
1503
+ if types:
1504
+ params['types'] = types
1505
+
1506
+ response = self._make_request('GET', '/pdf/document/snapshot', params=params)
1507
+ data = response.json()
1508
+
1509
+ return self._parse_document_snapshot(data)
1510
+
1511
+ def get_page_snapshot(self, page_index: int, types: Optional[str] = None) -> PageSnapshot:
1512
+ """
1513
+ Retrieve a snapshot of a specific page with all its elements.
1514
+
1515
+ Args:
1516
+ page_index: The index of the page to snapshot (0-based)
1517
+ types: Optional comma-separated string of object types to filter (e.g., "PARAGRAPH,IMAGE")
1518
+
1519
+ Returns:
1520
+ PageSnapshot containing page reference and all elements on that page
1521
+ """
1522
+ if page_index < 0:
1523
+ raise ValidationException(f"Page index must be >= 0, got {page_index}")
1524
+
1525
+ params = {}
1526
+ if types:
1527
+ params['types'] = types
1528
+
1529
+ response = self._make_request('GET', f'/pdf/page/{page_index}/snapshot', params=params)
1530
+ data = response.json()
1531
+
1532
+ return self._parse_page_snapshot(data)
1533
+
1534
+ def _get_or_fetch_document_snapshot(self) -> DocumentSnapshot:
1535
+ """
1536
+ Get document snapshot from cache or fetch if not cached.
1537
+ This is used internally by select_* methods for optimization.
1538
+ Also caches individual page snapshots from the document snapshot.
1539
+ """
1540
+ if self._document_snapshot is None:
1541
+ self._document_snapshot = self.get_document_snapshot()
1542
+ # Cache individual page snapshots from document snapshot
1543
+ for i, page_snapshot in enumerate(self._document_snapshot.pages):
1544
+ if i not in self._page_snapshots:
1545
+ self._page_snapshots[i] = page_snapshot
1546
+ return self._document_snapshot
1547
+
1548
+ def _get_or_fetch_page_snapshot(self, page_index: int) -> PageSnapshot:
1549
+ """
1550
+ Get page snapshot from cache or fetch if not cached.
1551
+ This is used internally by select_* methods for optimization.
1552
+ If document snapshot exists, uses page from it instead of making separate API call.
1553
+ """
1554
+ # Check if already cached
1555
+ if page_index in self._page_snapshots:
1556
+ return self._page_snapshots[page_index]
1557
+
1558
+ # If document snapshot exists, get page from it (no API call needed)
1559
+ if self._document_snapshot is not None:
1560
+ if 0 <= page_index < len(self._document_snapshot.pages):
1561
+ page_snapshot = self._document_snapshot.pages[page_index]
1562
+ self._page_snapshots[page_index] = page_snapshot
1563
+ return page_snapshot
1564
+
1565
+ # Otherwise fetch page snapshot individually
1566
+ self._page_snapshots[page_index] = self.get_page_snapshot(page_index)
1567
+ return self._page_snapshots[page_index]
1568
+
1569
+ def _invalidate_snapshots(self) -> None:
1570
+ """
1571
+ Clear all snapshot caches.
1572
+ Called after mutations (delete, move, modify) to ensure fresh data on next select.
1573
+ """
1574
+ self._document_snapshot = None
1575
+ self._page_snapshots.clear()
1576
+
1577
+ def _filter_snapshot_elements(self, elements: List, object_type: ObjectType,
1578
+ position: Optional[Position] = None, tolerance: float = DEFAULT_TOLERANCE) -> List:
1579
+ """
1580
+ Filter snapshot elements client-side based on object type and position criteria.
1581
+
1582
+ Args:
1583
+ elements: List of elements from snapshot (ObjectRef, TextObjectRef, etc.)
1584
+ object_type: Type to filter for
1585
+ position: Optional position filter with text matching, bounding rect, etc.
1586
+ tolerance: Tolerance in points for spatial matching (default: 10.0)
1587
+
1588
+ Returns:
1589
+ Filtered list of elements matching the criteria
1590
+ """
1591
+ import re
1592
+
1593
+ # Filter by object type (handle form field subtypes)
1594
+ if object_type == ObjectType.FORM_FIELD:
1595
+ # Form fields include TEXT_FIELD, CHECK_BOX, RADIO_BUTTON, BUTTON, DROPDOWN
1596
+ form_field_types = {ObjectType.FORM_FIELD, ObjectType.TEXT_FIELD,
1597
+ ObjectType.CHECK_BOX, ObjectType.RADIO_BUTTON,
1598
+ ObjectType.BUTTON, ObjectType.DROPDOWN}
1599
+ filtered = [e for e in elements if e.type in form_field_types]
1600
+ else:
1601
+ filtered = [e for e in elements if e.type == object_type]
1602
+
1603
+ if position is None:
1604
+ return filtered
1605
+
1606
+ # Apply position filters
1607
+ result = filtered
1608
+
1609
+ # Text starts with filter (case-insensitive to match API behavior)
1610
+ if position.text_starts_with:
1611
+ search_text = position.text_starts_with.lower()
1612
+ result = [
1613
+ e for e in result
1614
+ if isinstance(e, TextObjectRef) and e.text and e.text.lower().startswith(search_text)
1615
+ ]
1616
+
1617
+ # Regex pattern filter
1618
+ if position.text_pattern:
1619
+ pattern = re.compile(position.text_pattern)
1620
+ result = [
1621
+ e for e in result
1622
+ if isinstance(e, TextObjectRef) and e.text and pattern.search(e.text)
1623
+ ]
1624
+
1625
+ # Bounding rect filter (spatial queries like at(x, y))
1626
+ if position.bounding_rect:
1627
+ rect = position.bounding_rect
1628
+ result = [
1629
+ e for e in result
1630
+ if e.position and e.position.bounding_rect and
1631
+ self._rects_intersect(e.position.bounding_rect, rect, tolerance)
1632
+ ]
1633
+
1634
+ # Name filter (for form fields)
1635
+ if position.name:
1636
+ from .models import FormFieldRef
1637
+ result = [
1638
+ e for e in result
1639
+ if isinstance(e, FormFieldRef) and e.name == position.name
1640
+ ]
1641
+
1642
+ return result
1643
+
1644
+ @staticmethod
1645
+ def _rects_intersect(rect1, rect2, tolerance: float = DEFAULT_TOLERANCE) -> bool:
1646
+ """
1647
+ Check if two bounding rectangles intersect or are very close.
1648
+ Handles point queries (width/height = 0) with tolerance.
1649
+
1650
+ Args:
1651
+ rect1: First bounding rectangle
1652
+ rect2: Second bounding rectangle
1653
+ tolerance: Tolerance in points for position matching (default: 10.0)
1654
+ """
1655
+ # Get effective bounds with tolerance
1656
+ r1_left = rect1.x - tolerance
1657
+ r1_right = rect1.x + rect1.width + tolerance
1658
+ r1_top = rect1.y - tolerance
1659
+ r1_bottom = rect1.y + rect1.height + tolerance
1660
+
1661
+ r2_left = rect2.x - tolerance
1662
+ r2_right = rect2.x + rect2.width + tolerance
1663
+ r2_top = rect2.y - tolerance
1664
+ r2_bottom = rect2.y + rect2.height + tolerance
1665
+
1666
+ # Check if rectangles overlap
1667
+ if r1_right < r2_left or r2_right < r1_left:
1668
+ return False
1669
+ if r1_bottom < r2_top or r2_bottom < r1_top:
1670
+ return False
1671
+ return True
1672
+
1063
1673
  def get_bytes(self) -> bytes:
1064
1674
  """
1065
1675
  Downloads the current state of the PDF document with all modifications applied.
@@ -1250,6 +1860,175 @@ class PDFDancer:
1250
1860
  orientation=orientation
1251
1861
  )
1252
1862
 
1863
+ def _parse_path_segment(self, segment_data: dict) -> 'PathSegment':
1864
+ """Parse JSON data into PathSegment instance (Line or Bezier)."""
1865
+ from .models import Line, Bezier, PathSegment, Point, Color
1866
+
1867
+ segment_type = segment_data.get('segmentType', segment_data.get('type', '')).upper()
1868
+
1869
+ # Parse common properties
1870
+ stroke_color = None
1871
+ stroke_color_data = segment_data.get('strokeColor')
1872
+ if isinstance(stroke_color_data, dict):
1873
+ r = stroke_color_data.get('red', 0)
1874
+ g = stroke_color_data.get('green', 0)
1875
+ b = stroke_color_data.get('blue', 0)
1876
+ a = stroke_color_data.get('alpha', 255)
1877
+ if all(isinstance(v, int) for v in [r, g, b]):
1878
+ stroke_color = Color(r, g, b, a)
1879
+
1880
+ fill_color = None
1881
+ fill_color_data = segment_data.get('fillColor')
1882
+ if isinstance(fill_color_data, dict):
1883
+ r = fill_color_data.get('red', 0)
1884
+ g = fill_color_data.get('green', 0)
1885
+ b = fill_color_data.get('blue', 0)
1886
+ a = fill_color_data.get('alpha', 255)
1887
+ if all(isinstance(v, int) for v in [r, g, b]):
1888
+ fill_color = Color(r, g, b, a)
1889
+
1890
+ stroke_width = segment_data.get('strokeWidth')
1891
+ dash_array = segment_data.get('dashArray')
1892
+ dash_phase = segment_data.get('dashPhase')
1893
+
1894
+ # Parse specific segment type
1895
+ if segment_type == 'LINE':
1896
+ p0_data = segment_data.get('p0', {})
1897
+ p1_data = segment_data.get('p1', {})
1898
+
1899
+ p0 = Point(p0_data.get('x', 0.0), p0_data.get('y', 0.0)) if p0_data else None
1900
+ p1 = Point(p1_data.get('x', 0.0), p1_data.get('y', 0.0)) if p1_data else None
1901
+
1902
+ return Line(
1903
+ stroke_color=stroke_color,
1904
+ fill_color=fill_color,
1905
+ stroke_width=stroke_width,
1906
+ dash_array=dash_array,
1907
+ dash_phase=dash_phase,
1908
+ p0=p0,
1909
+ p1=p1
1910
+ )
1911
+ elif segment_type == 'BEZIER':
1912
+ p0_data = segment_data.get('p0', {})
1913
+ p1_data = segment_data.get('p1', {})
1914
+ p2_data = segment_data.get('p2', {})
1915
+ p3_data = segment_data.get('p3', {})
1916
+
1917
+ p0 = Point(p0_data.get('x', 0.0), p0_data.get('y', 0.0)) if p0_data else None
1918
+ p1 = Point(p1_data.get('x', 0.0), p1_data.get('y', 0.0)) if p1_data else None
1919
+ p2 = Point(p2_data.get('x', 0.0), p2_data.get('y', 0.0)) if p2_data else None
1920
+ p3 = Point(p3_data.get('x', 0.0), p3_data.get('y', 0.0)) if p3_data else None
1921
+
1922
+ return Bezier(
1923
+ stroke_color=stroke_color,
1924
+ fill_color=fill_color,
1925
+ stroke_width=stroke_width,
1926
+ dash_array=dash_array,
1927
+ dash_phase=dash_phase,
1928
+ p0=p0,
1929
+ p1=p1,
1930
+ p2=p2,
1931
+ p3=p3
1932
+ )
1933
+ else:
1934
+ # Fallback to base PathSegment for unknown types
1935
+ return PathSegment(
1936
+ stroke_color=stroke_color,
1937
+ fill_color=fill_color,
1938
+ stroke_width=stroke_width,
1939
+ dash_array=dash_array,
1940
+ dash_phase=dash_phase
1941
+ )
1942
+
1943
+ def _parse_path(self, obj_data: dict) -> 'Path':
1944
+ """Parse JSON data into Path instance with path segments."""
1945
+ from .models import Path
1946
+
1947
+ position_data = obj_data.get('position', {})
1948
+ position = self._parse_position(position_data) if position_data else None
1949
+
1950
+ # Parse path segments
1951
+ path_segments = []
1952
+ segments_data = obj_data.get('pathSegments', [])
1953
+ if isinstance(segments_data, list):
1954
+ for segment_data in segments_data:
1955
+ if isinstance(segment_data, dict):
1956
+ path_segments.append(self._parse_path_segment(segment_data))
1957
+
1958
+ even_odd_fill = obj_data.get('evenOddFill')
1959
+
1960
+ return Path(
1961
+ position=position,
1962
+ path_segments=path_segments if path_segments else None,
1963
+ even_odd_fill=even_odd_fill
1964
+ )
1965
+
1966
+ def _parse_font_recommendation(self, data: dict) -> FontRecommendation:
1967
+ """Parse JSON data into FontRecommendation instance."""
1968
+ font_type_str = data.get('fontType', 'SYSTEM')
1969
+ font_type = FontType(font_type_str)
1970
+
1971
+ return FontRecommendation(
1972
+ font_name=data.get('fontName', ''),
1973
+ font_type=font_type,
1974
+ similarity_score=data.get('similarityScore', 0.0)
1975
+ )
1976
+
1977
+ def _parse_page_snapshot(self, data: dict) -> PageSnapshot:
1978
+ """Parse JSON data into PageSnapshot instance with proper type handling."""
1979
+ page_ref = self._parse_page_ref(data.get('pageRef', {}))
1980
+
1981
+ # Parse elements using appropriate parser based on type
1982
+ elements = []
1983
+ for elem_data in data.get('elements', []):
1984
+ elem_type_str = elem_data.get('type')
1985
+ if not elem_type_str:
1986
+ continue
1987
+
1988
+ try:
1989
+ # Normalize type string (API returns "CHECKBOX" but enum is "CHECK_BOX")
1990
+ if elem_type_str == "CHECKBOX":
1991
+ elem_type_str = "CHECK_BOX"
1992
+ # Deep copy to avoid modifying original
1993
+ import copy
1994
+ elem_data = copy.deepcopy(elem_data)
1995
+ elem_data['type'] = elem_type_str # Update type in data
1996
+
1997
+ elem_type = ObjectType(elem_type_str)
1998
+
1999
+ # Use appropriate parser based on element type
2000
+ if elem_type in (ObjectType.PARAGRAPH, ObjectType.TEXT_LINE):
2001
+ # Parse as TextObjectRef to capture text, font, color, children
2002
+ elements.append(self._parse_text_object_ref(elem_data))
2003
+ elif elem_type in (ObjectType.FORM_FIELD, ObjectType.TEXT_FIELD,
2004
+ ObjectType.CHECK_BOX, ObjectType.RADIO_BUTTON,
2005
+ ObjectType.BUTTON, ObjectType.DROPDOWN):
2006
+ # Parse as FormFieldRef to capture name and value
2007
+ elements.append(self._parse_form_field_ref(elem_data))
2008
+ else:
2009
+ # Parse as basic ObjectRef
2010
+ elements.append(self._parse_object_ref(elem_data))
2011
+ except (ValueError, KeyError):
2012
+ # Skip elements with invalid types
2013
+ continue
2014
+
2015
+ return PageSnapshot(
2016
+ page_ref=page_ref,
2017
+ elements=elements
2018
+ )
2019
+
2020
+ def _parse_document_snapshot(self, data: dict) -> DocumentSnapshot:
2021
+ """Parse JSON data into DocumentSnapshot instance."""
2022
+ page_count = data.get('pageCount', 0)
2023
+ fonts = [self._parse_font_recommendation(font_data) for font_data in data.get('fonts', [])]
2024
+ pages = [self._parse_page_snapshot(page_data) for page_data in data.get('pages', [])]
2025
+
2026
+ return DocumentSnapshot(
2027
+ page_count=page_count,
2028
+ fonts=fonts,
2029
+ pages=pages
2030
+ )
2031
+
1253
2032
  # Builder Pattern Support
1254
2033
 
1255
2034
  def _paragraph_builder(self) -> 'ParagraphBuilder':
@@ -1268,9 +2047,17 @@ class PDFDancer:
1268
2047
 
1269
2048
  def __exit__(self, exc_type, exc_val, exc_tb):
1270
2049
  """Context manager exit - cleanup if needed."""
2050
+ # Close the HTTP client to free resources
2051
+ if hasattr(self, '_client'):
2052
+ self._client.close()
1271
2053
  # TODO Could add session cleanup here if API supports it. Cleanup on the server
1272
2054
  pass
1273
2055
 
2056
+ def close(self):
2057
+ """Close the HTTP client and free resources."""
2058
+ if hasattr(self, '_client'):
2059
+ self._client.close()
2060
+
1274
2061
  def _to_path_objects(self, refs: List[ObjectRef]) -> List[PathObject]:
1275
2062
  return [PathObject(self, ref.internal_id, ref.type, ref.position) for ref in refs]
1276
2063