pdfdancer-client-python 0.2.17__py3-none-any.whl → 0.2.19__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pdfdancer/pdfdancer_v1.py CHANGED
@@ -5,23 +5,116 @@ A Python client that closely mirrors the Java Client class structure and functio
5
5
  Provides session-based PDF manipulation operations with strict validation.
6
6
  """
7
7
 
8
+ import gzip
8
9
  import json
9
10
  import os
10
11
  import time
12
+ from datetime import datetime, timezone
11
13
  from pathlib import Path
12
14
  from typing import List, Optional, Union, BinaryIO, Mapping, Any
13
15
 
14
- import requests
16
+ import httpx
15
17
  from dotenv import load_dotenv
16
18
 
19
+ from .fingerprint import Fingerprint
20
+
17
21
  load_dotenv()
18
22
 
19
23
  # Global variable to disable SSL certificate verification
20
24
  # Set to True to skip SSL verification (useful for testing with self-signed certificates)
21
25
  # WARNING: Only use in development/testing environments
22
- DISABLE_SSL_VERIFY = False
26
def _env_flag(name: str, default: bool = False) -> bool:
    """
    Read a boolean switch from an environment variable.

    Environment values are always strings, so using os.environ.get() directly
    makes "false" or "0" truthy. This helper treats the common "off" spellings
    (empty, "0", "false", "no", "off"; case-insensitive) as False and any other
    value as True.

    Args:
        name: Name of the environment variable
        default: Value returned when the variable is not set at all

    Returns:
        The flag as a real bool
    """
    raw = os.environ.get(name)
    if raw is None:
        return default
    return raw.strip().lower() not in ("", "0", "false", "no", "off")


# Parsed as a real boolean so that PDFDANCER_CLIENT_DISABLE_SSL_VERIFY=false (or 0)
# keeps certificate verification ON instead of silently disabling it.
DISABLE_SSL_VERIFY = _env_flag("PDFDANCER_CLIENT_DISABLE_SSL_VERIFY")

# When enabled, request/response sizes and timing headers are printed to stdout.
DEBUG = _env_flag("PDFDANCER_CLIENT_DEBUG")

# Default tolerance (in points) passed to the coordinate-based select_*_at lookups.
DEFAULT_TOLERANCE = 0.01
30
+
31
+
32
+ def _generate_timestamp() -> str:
33
+ """
34
+ Generate a timestamp string in the format expected by the API.
35
+ Format: YYYY-MM-DDTHH:MM:SS.ffffffZ (with microseconds)
36
+
37
+ Returns:
38
+ Timestamp string with UTC timezone
39
+ """
40
+ return datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%S.%fZ')
41
+
42
+
43
+ def _parse_timestamp(timestamp_str: str) -> datetime:
44
+ """
45
+ Parse timestamp string, handling both microseconds and nanoseconds precision.
46
+
47
+ Args:
48
+ timestamp_str: Timestamp string in format YYYY-MM-DDTHH:MM:SS.fffffffZ
49
+ (with 6 or 9 fractional digits)
50
+
51
+ Returns:
52
+ datetime object with UTC timezone
53
+ """
54
+ # Remove the 'Z' suffix
55
+ ts = timestamp_str.rstrip('Z')
56
+
57
+ # Handle nanoseconds (9 digits) by truncating to microseconds (6 digits)
58
+ # Python's datetime only supports microseconds precision
59
+ if '.' in ts:
60
+ date_part, frac_part = ts.rsplit('.', 1)
61
+ if len(frac_part) > 6:
62
+ # Truncate to 6 digits (microseconds)
63
+ frac_part = frac_part[:6]
64
+ ts = f"{date_part}.{frac_part}"
65
+
66
+ return datetime.fromisoformat(ts).replace(tzinfo=timezone.utc)
67
+
68
+
69
+ def _log_generated_at_header(response: httpx.Response, method: str, path: str) -> None:
70
+ """
71
+ Check for X-Generated-At and X-Received-At headers and log timing information if DEBUG=True.
72
+
73
+ Expected timestamp formats:
74
+ - 2025-10-24T08:49:39.161945Z (microseconds - 6 digits)
75
+ - 2025-10-24T08:58:45.468131265Z (nanoseconds - 9 digits)
76
+
77
+ Args:
78
+ response: The HTTP response object
79
+ method: HTTP method used
80
+ path: API path
81
+ """
82
+ if not DEBUG:
83
+ return
84
+
85
+ generated_at = response.headers.get('X-Generated-At')
86
+ received_at = response.headers.get('X-Received-At')
87
+
88
+ if generated_at or received_at:
89
+ try:
90
+ log_parts = []
91
+ current_time = datetime.now(timezone.utc)
92
+
93
+ # Parse and log X-Received-At
94
+ received_time = None
95
+ if received_at:
96
+ received_time = _parse_timestamp(received_at)
97
+ time_since_received = (current_time - received_time).total_seconds()
98
+ log_parts.append(f"X-Received-At: {received_at}, time since received: {time_since_received:.3f}s")
99
+
100
+ # Parse and log X-Generated-At
101
+ generated_time = None
102
+ if generated_at:
103
+ generated_time = _parse_timestamp(generated_at)
104
+ time_since_generated = (current_time - generated_time).total_seconds()
105
+ log_parts.append(f"X-Generated-At: {generated_at}, time since generated: {time_since_generated:.3f}s")
106
+
107
+ # Calculate processing time (X-Generated-At - X-Received-At)
108
+ if received_time and generated_time:
109
+ processing_time = (generated_time - received_time).total_seconds()
110
+ log_parts.append(f"processing time: {processing_time:.3f}s")
111
+
112
+ if log_parts:
113
+ print(f"{time.time()}|{method} {path} - {', '.join(log_parts)}")
114
+
115
+ except (ValueError, AttributeError) as e:
116
+ print(f"{time.time()}|{method} {path} - Header parse error: {e}")
23
117
 
24
- DEBUG = False
25
118
 
26
119
  from . import ParagraphBuilder
27
120
  from .exceptions import (
@@ -36,7 +129,8 @@ from .models import (
36
129
  ObjectRef, Position, ObjectType, Font, Image, Paragraph, FormFieldRef, TextObjectRef, PageRef,
37
130
  FindRequest, DeleteRequest, MoveRequest, PageMoveRequest, AddRequest, ModifyRequest, ModifyTextRequest,
38
131
  ChangeFormFieldRequest, CommandResult,
39
- ShapeType, PositionMode, PageSize, Orientation
132
+ ShapeType, PositionMode, PageSize, Orientation,
133
+ PageSnapshot, DocumentSnapshot, FontRecommendation, FontType
40
134
  )
41
135
  from .paragraph_builder import ParagraphPageBuilder
42
136
  from .types import PathObject, ParagraphObject, TextLineObject, ImageObject, FormObject, FormFieldObject
@@ -60,9 +154,10 @@ class PageClient:
60
154
  else:
61
155
  self.orientation = orientation
62
156
 
63
- def select_paths_at(self, x: float, y: float) -> List[PathObject]:
157
+ def select_paths_at(self, x: float, y: float, tolerance: float = DEFAULT_TOLERANCE) -> List[PathObject]:
158
+ position = Position.at_page_coordinates(self.page_index, x, y)
64
159
  # noinspection PyProtectedMember
65
- return self.root._to_path_objects(self.root._find_paths(Position.at_page_coordinates(self.page_index, x, y)))
160
+ return self.root._to_path_objects(self.root._find_paths(position, tolerance))
66
161
 
67
162
  def select_paragraphs(self) -> List[ParagraphObject]:
68
163
  # noinspection PyProtectedMember
@@ -86,10 +181,10 @@ class PageClient:
86
181
  # noinspection PyProtectedMember
87
182
  return self.root._to_textline_objects(self.root._find_text_lines(position))
88
183
 
89
- def select_paragraphs_at(self, x: float, y: float) -> List[ParagraphObject]:
184
+ def select_paragraphs_at(self, x: float, y: float, tolerance: float = DEFAULT_TOLERANCE) -> List[ParagraphObject]:
90
185
  position = Position.at_page_coordinates(self.page_index, x, y)
91
186
  # noinspection PyProtectedMember
92
- return self.root._to_paragraph_objects(self.root._find_paragraphs(position))
187
+ return self.root._to_paragraph_objects(self.root._find_paragraphs(position, tolerance))
93
188
 
94
189
  def select_text_lines(self) -> List[TextLineObject]:
95
190
  position = Position.at_page(self.page_index)
@@ -102,29 +197,29 @@ class PageClient:
102
197
  # noinspection PyProtectedMember
103
198
  return self.root._to_textline_objects(self.root._find_text_lines(position))
104
199
 
105
- def select_text_lines_at(self, x, y) -> List[TextLineObject]:
200
+ def select_text_lines_at(self, x, y, tolerance: float = DEFAULT_TOLERANCE) -> List[TextLineObject]:
106
201
  position = Position.at_page_coordinates(self.page_index, x, y)
107
202
  # noinspection PyProtectedMember
108
- return self.root._to_textline_objects(self.root._find_text_lines(position))
203
+ return self.root._to_textline_objects(self.root._find_text_lines(position, tolerance))
109
204
 
110
205
  def select_images(self) -> List[ImageObject]:
111
206
  # noinspection PyProtectedMember
112
207
  return self.root._to_image_objects(self.root._find_images(Position.at_page(self.page_index)))
113
208
 
114
- def select_images_at(self, x: float, y: float) -> List[ImageObject]:
209
+ def select_images_at(self, x: float, y: float, tolerance: float = DEFAULT_TOLERANCE) -> List[ImageObject]:
115
210
  position = Position.at_page_coordinates(self.page_index, x, y)
116
211
  # noinspection PyProtectedMember
117
- return self.root._to_image_objects(self.root._find_images(position))
212
+ return self.root._to_image_objects(self.root._find_images(position, tolerance))
118
213
 
119
214
  def select_forms(self) -> List[FormObject]:
120
215
  position = Position.at_page(self.page_index)
121
216
  # noinspection PyProtectedMember
122
217
  return self.root._to_form_objects(self.root._find_form_x_objects(position))
123
218
 
124
- def select_forms_at(self, x: float, y: float) -> List[FormObject]:
219
+ def select_forms_at(self, x: float, y: float, tolerance: float = DEFAULT_TOLERANCE) -> List[FormObject]:
125
220
  position = Position.at_page_coordinates(self.page_index, x, y)
126
221
  # noinspection PyProtectedMember
127
- return self.root._to_form_objects(self.root._find_form_x_objects(position))
222
+ return self.root._to_form_objects(self.root._find_form_x_objects(position, tolerance))
128
223
 
129
224
  def select_form_fields(self) -> List[FormFieldObject]:
130
225
  position = Position.at_page(self.page_index)
@@ -137,10 +232,10 @@ class PageClient:
137
232
  # noinspection PyProtectedMember
138
233
  return self.root._to_form_field_objects(self.root._find_form_fields(pos))
139
234
 
140
- def select_form_fields_at(self, x: float, y: float) -> List[FormFieldObject]:
235
+ def select_form_fields_at(self, x: float, y: float, tolerance: float = DEFAULT_TOLERANCE) -> List[FormFieldObject]:
141
236
  position = Position.at_page_coordinates(self.page_index, x, y)
142
237
  # noinspection PyProtectedMember
143
- return self.root._to_form_field_objects(self.root._find_form_fields(position))
238
+ return self.root._to_form_field_objects(self.root._find_form_fields(position, tolerance))
144
239
 
145
240
  @classmethod
146
241
  def from_ref(cls, root: 'PDFDancer', page_ref: PageRef) -> 'PageClient':
@@ -178,6 +273,18 @@ class PageClient:
178
273
  def new_paragraph(self):
179
274
  return ParagraphPageBuilder(self.root, self.page_index)
180
275
 
276
+ def new_path(self):
277
+ from .path_builder import PathBuilder
278
+ return PathBuilder(self.root, self.page_index)
279
+
280
+ def new_line(self):
281
+ from .path_builder import LineBuilder
282
+ return LineBuilder(self.root, self.page_index)
283
+
284
+ def new_bezier(self):
285
+ from .path_builder import BezierBuilder
286
+ return BezierBuilder(self.root, self.page_index)
287
+
181
288
  def select_paths(self):
182
289
  # noinspection PyProtectedMember
183
290
  return self.root._to_path_objects(self.root._find_paths(Position.at_page(self.page_index)))
@@ -229,9 +336,15 @@ class PDFDancer:
229
336
  """
230
337
  Create a client session, falling back to environment variables when needed.
231
338
 
339
+ Authentication:
340
+ - If token is provided, uses it
341
+ - Otherwise, checks PDFDANCER_TOKEN environment variable
342
+ - If no token is found, automatically obtains an anonymous token
343
+
232
344
  Args:
233
345
  pdf_data: PDF payload supplied directly or via filesystem handles.
234
- token: Override for the API token; falls back to `PDFDANCER_TOKEN` environement variable.
346
+ token: Override for the API token; falls back to `PDFDANCER_TOKEN` environment variable,
347
+ then to anonymous token if not set.
235
348
  base_url: Override for the API base URL; falls back to `PDFDANCER_BASE_URL`
236
349
  or defaults to `https://api.pdfdancer.com`.
237
350
  timeout: HTTP read timeout in seconds.
@@ -242,6 +355,10 @@ class PDFDancer:
242
355
  resolved_token = cls._resolve_token(token)
243
356
  resolved_base_url = cls._resolve_base_url(base_url)
244
357
 
358
+ # If no token found, obtain anonymous token
359
+ if resolved_token is None:
360
+ resolved_token = cls._obtain_anonymous_token(resolved_base_url, timeout)
361
+
245
362
  return PDFDancer(resolved_token, pdf_data, resolved_base_url, timeout)
246
363
 
247
364
  @classmethod
@@ -252,18 +369,66 @@ class PDFDancer:
252
369
  resolved_base_url = "https://api.pdfdancer.com"
253
370
  return resolved_base_url
254
371
 
372
@classmethod
def _obtain_anonymous_token(cls, base_url: str, timeout: float = 30.0) -> str:
    """
    Obtain an anonymous token from the /keys/anon endpoint.

    A short-lived, unauthenticated HTTP client is used for the single request
    and is always closed afterwards.

    Args:
        base_url: Base URL of the PDFDancer API server
        timeout: HTTP read timeout in seconds; values <= 0 disable the timeout

    Returns:
        Anonymous token string

    Raises:
        HttpClientException: If the request fails, the server returns an error
            status, or the response is not a JSON object containing 'token'
    """
    try:
        # Use the client as a context manager rather than try/finally + close():
        # if constructing the client itself fails, a finally block referencing
        # the not-yet-bound variable would raise UnboundLocalError and hide the
        # real error.
        with httpx.Client(http2=True, verify=not DISABLE_SSL_VERIFY) as temp_client:
            response = temp_client.post(
                cls._cleanup_url_path(base_url, "/keys/anon"),
                headers={'X-Fingerprint': Fingerprint.generate()},
                timeout=timeout if timeout > 0 else None
            )
            response.raise_for_status()

            try:
                token_data = response.json()
            except ValueError as e:
                # Body was not valid JSON; report it like any other malformed response
                raise HttpClientException("Invalid anonymous token response format",
                                          response=response, cause=e) from None

        # Extract token from response (matches Java AnonTokenResponse structure)
        if isinstance(token_data, dict) and 'token' in token_data:
            return token_data['token']
        raise HttpClientException("Invalid anonymous token response format")

    except httpx.HTTPStatusError as e:
        raise HttpClientException(f"Failed to obtain anonymous token: HTTP {e.response.status_code}",
                                  response=e.response, cause=e) from None
    except httpx.RequestError as e:
        raise HttpClientException(f"Failed to obtain anonymous token: {str(e)}",
                                  response=None, cause=e) from None
421
+
255
422
  @classmethod
256
423
  def _resolve_token(cls, token: Optional[str]) -> Optional[str]:
424
+ """
425
+ Resolve token from argument or environment variable.
426
+ Returns None if no token is found (allowing fallback to anonymous token).
427
+ """
257
428
  resolved_token = token.strip() if token and token.strip() else None
258
429
  if resolved_token is None:
259
430
  env_token = os.getenv("PDFDANCER_TOKEN")
260
431
  resolved_token = env_token.strip() if env_token and env_token.strip() else None
261
-
262
- if resolved_token is None:
263
- raise ValidationException(
264
- "Missing PDFDancer API token. Pass a token via the `token` argument "
265
- "or set the PDFDANCER_TOKEN environment variable."
266
- )
267
432
  return resolved_token
268
433
 
269
434
  @classmethod
@@ -277,8 +442,14 @@ class PDFDancer:
277
442
  """
278
443
  Create a new blank PDF document with optional configuration.
279
444
 
445
+ Authentication:
446
+ - If token is provided, uses it
447
+ - Otherwise, checks PDFDANCER_TOKEN environment variable
448
+ - If no token is found, automatically obtains an anonymous token
449
+
280
450
  Args:
281
- token: Override for the API token; falls back to `PDFDANCER_TOKEN` environment variable.
451
+ token: Override for the API token; falls back to `PDFDANCER_TOKEN` environment variable,
452
+ then to anonymous token if not set.
282
453
  base_url: Override for the API base URL; falls back to `PDFDANCER_BASE_URL`
283
454
  or defaults to `https://api.pdfdancer.com`.
284
455
  timeout: HTTP read timeout in seconds.
@@ -293,6 +464,10 @@ class PDFDancer:
293
464
  resolved_token = cls._resolve_token(token)
294
465
  resolved_base_url = cls._resolve_base_url(base_url)
295
466
 
467
+ # If no token found, obtain anonymous token
468
+ if resolved_token is None:
469
+ resolved_token = cls._obtain_anonymous_token(resolved_base_url, timeout)
470
+
296
471
  # Create a new instance that will call _create_blank_pdf_session
297
472
  instance = object.__new__(cls)
298
473
 
@@ -304,11 +479,12 @@ class PDFDancer:
304
479
  instance._base_url = resolved_base_url.rstrip('/')
305
480
  instance._read_timeout = timeout
306
481
 
307
- # Create HTTP session for connection reuse
308
- instance._session = requests.Session()
309
- instance._session.headers.update({
310
- 'Authorization': f'Bearer {instance._token}'
311
- })
482
+ # Create HTTP client for connection reuse with HTTP/2 support
483
+ instance._client = httpx.Client(
484
+ http2=True,
485
+ headers={'Authorization': f'Bearer {instance._token}'},
486
+ verify=not DISABLE_SSL_VERIFY
487
+ )
312
488
 
313
489
  # Create blank PDF session
314
490
  instance._session_id = instance._create_blank_pdf_session(
@@ -320,6 +496,10 @@ class PDFDancer:
320
496
  # Set pdf_bytes to None since we don't have the PDF bytes yet
321
497
  instance._pdf_bytes = None
322
498
 
499
+ # Initialize snapshot caches (lazy-loaded)
500
+ instance._document_snapshot = None
501
+ instance._page_snapshots = {}
502
+
323
503
  return instance
324
504
 
325
505
  def __init__(self, token: str, pdf_data: Union[bytes, Path, str, BinaryIO],
@@ -351,15 +531,20 @@ class PDFDancer:
351
531
  # Process PDF data with validation
352
532
  self._pdf_bytes = self._process_pdf_data(pdf_data)
353
533
 
354
- # Create HTTP session for connection reuse
355
- self._session = requests.Session()
356
- self._session.headers.update({
357
- 'Authorization': f'Bearer {self._token}'
358
- })
534
+ # Create HTTP client for connection reuse with HTTP/2 support
535
+ self._client = httpx.Client(
536
+ http2=True,
537
+ headers={'Authorization': f'Bearer {self._token}'},
538
+ verify=not DISABLE_SSL_VERIFY
539
+ )
359
540
 
360
541
  # Create session - equivalent to Java constructor behavior
361
542
  self._session_id = self._create_session()
362
543
 
544
+ # Initialize snapshot caches (lazy-loaded)
545
+ self._document_snapshot: Optional[DocumentSnapshot] = None
546
+ self._page_snapshots: dict[int, PageSnapshot] = {}
547
+
363
548
  @staticmethod
364
549
  def _process_pdf_data(pdf_data: Union[bytes, Path, str, BinaryIO]) -> bytes:
365
550
  """
@@ -401,7 +586,7 @@ class PDFDancer:
401
586
  except (IOError, OSError) as e:
402
587
  raise PdfDancerException(f"Failed to read PDF data: {e}", cause=e)
403
588
 
404
- def _extract_error_message(self, response: Optional[requests.Response]) -> str:
589
+ def _extract_error_message(self, response: Optional[httpx.Response]) -> str:
405
590
  """
406
591
  Extract meaningful error messages from API response.
407
592
  Parses JSON error responses with _embedded.errors structure.
@@ -437,7 +622,7 @@ class PDFDancer:
437
622
  # If JSON parsing fails, return response content or status
438
623
  return response.text or f"HTTP {response.status_code}"
439
624
 
440
- def _handle_authentication_error(self, response: Optional[requests.Response]) -> None:
625
+ def _handle_authentication_error(self, response: Optional[httpx.Response]) -> None:
441
626
  """
442
627
  Translate authentication failures into a clear, actionable validation error.
443
628
  """
@@ -474,25 +659,54 @@ class PDFDancer:
474
659
  Creates a new PDF processing session by uploading the PDF data.
475
660
  """
476
661
  try:
477
- files = {
478
- 'pdf': ('document.pdf', self._pdf_bytes, 'application/pdf')
479
- }
662
+ # Build multipart body manually to avoid base64 encoding and enable compression
663
+ # httpx by default may add Content-Transfer-Encoding: base64 which the server rejects
664
+ import uuid
665
+
666
+ boundary = uuid.uuid4().hex
667
+
668
+ # Build multipart body with binary (not base64) encoding
669
+ body_parts = []
670
+ body_parts.append(f'--{boundary}\r\n'.encode('utf-8'))
671
+ body_parts.append(b'Content-Disposition: form-data; name="pdf"; filename="document.pdf"\r\n')
672
+ body_parts.append(b'Content-Type: application/pdf\r\n')
673
+ body_parts.append(b'\r\n') # End of headers, no Content-Transfer-Encoding
674
+ body_parts.append(self._pdf_bytes)
675
+ body_parts.append(b'\r\n')
676
+ body_parts.append(f'--{boundary}--\r\n'.encode('utf-8'))
677
+
678
+ uncompressed_body = b''.join(body_parts)
679
+
680
+ # Compress entire request body using gzip
681
+ compressed_body = gzip.compress(uncompressed_body)
682
+
683
+ original_size = len(uncompressed_body)
684
+ compressed_size = len(compressed_body)
685
+ compression_ratio = (1 - compressed_size / original_size) * 100 if original_size > 0 else 0
480
686
 
481
- request_size = len(self._pdf_bytes)
482
687
  if DEBUG:
483
- print(f"{time.time()}|POST /session/create - request size: {request_size} bytes")
688
+ print(f"{time.time()}|POST /session/create - original size: {original_size} bytes, "
689
+ f"compressed size: {compressed_size} bytes, "
690
+ f"compression: {compression_ratio:.1f}%")
691
+
692
+ headers = {
693
+ 'X-Generated-At': _generate_timestamp(),
694
+ 'Content-Type': f'multipart/form-data; boundary={boundary}',
695
+ 'Content-Encoding': 'gzip'
696
+ }
484
697
 
485
- response = self._session.post(
698
+ response = self._client.post(
486
699
  self._cleanup_url_path(self._base_url, "/session/create"),
487
- files=files,
488
- timeout=self._read_timeout if self._read_timeout > 0 else None,
489
- verify=not DISABLE_SSL_VERIFY
700
+ content=compressed_body,
701
+ headers=headers,
702
+ timeout=self._read_timeout if self._read_timeout > 0 else None
490
703
  )
491
704
 
492
705
  response_size = len(response.content)
493
706
  if DEBUG:
494
707
  print(f"{time.time()}|POST /session/create - response size: {response_size} bytes")
495
708
 
709
+ _log_generated_at_header(response, "POST", "/session/create")
496
710
  self._handle_authentication_error(response)
497
711
  response.raise_for_status()
498
712
  session_id = response.text.strip()
@@ -502,11 +716,14 @@ class PDFDancer:
502
716
 
503
717
  return session_id
504
718
 
505
- except requests.exceptions.RequestException as e:
506
- self._handle_authentication_error(getattr(e, 'response', None))
507
- error_message = self._extract_error_message(getattr(e, 'response', None))
719
+ except httpx.HTTPStatusError as e:
720
+ self._handle_authentication_error(e.response)
721
+ error_message = self._extract_error_message(e.response)
508
722
  raise HttpClientException(f"Failed to create session: {error_message}",
509
- response=getattr(e, 'response', None), cause=e) from None
723
+ response=e.response, cause=e) from None
724
+ except httpx.RequestError as e:
725
+ raise HttpClientException(f"Failed to create session: {str(e)}",
726
+ response=None, cause=e) from None
510
727
 
511
728
  def _create_blank_pdf_session(self,
512
729
  page_size: Optional[Union[PageSize, str, Mapping[str, Any]]] = None,
@@ -560,19 +777,22 @@ class PDFDancer:
560
777
  if DEBUG:
561
778
  print(f"{time.time()}|POST /session/new - request size: {request_size} bytes")
562
779
 
563
- headers = {'Content-Type': 'application/json'}
564
- response = self._session.post(
780
+ headers = {
781
+ 'Content-Type': 'application/json',
782
+ 'X-Generated-At': _generate_timestamp()
783
+ }
784
+ response = self._client.post(
565
785
  self._cleanup_url_path(self._base_url, "/session/new"),
566
786
  json=request_data,
567
787
  headers=headers,
568
- timeout=self._read_timeout if self._read_timeout > 0 else None,
569
- verify=not DISABLE_SSL_VERIFY
788
+ timeout=self._read_timeout if self._read_timeout > 0 else None
570
789
  )
571
790
 
572
791
  response_size = len(response.content)
573
792
  if DEBUG:
574
793
  print(f"{time.time()}|POST /session/new - response size: {response_size} bytes")
575
794
 
795
+ _log_generated_at_header(response, "POST", "/session/new")
576
796
  self._handle_authentication_error(response)
577
797
  response.raise_for_status()
578
798
  session_id = response.text.strip()
@@ -582,20 +802,25 @@ class PDFDancer:
582
802
 
583
803
  return session_id
584
804
 
585
- except requests.exceptions.RequestException as e:
586
- self._handle_authentication_error(getattr(e, 'response', None))
587
- error_message = self._extract_error_message(getattr(e, 'response', None))
805
+ except httpx.HTTPStatusError as e:
806
+ self._handle_authentication_error(e.response)
807
+ error_message = self._extract_error_message(e.response)
588
808
  raise HttpClientException(f"Failed to create blank PDF session: {error_message}",
589
- response=getattr(e, 'response', None), cause=e) from None
809
+ response=e.response, cause=e) from None
810
+ except httpx.RequestError as e:
811
+ raise HttpClientException(f"Failed to create blank PDF session: {str(e)}",
812
+ response=None, cause=e) from None
590
813
 
591
814
  def _make_request(self, method: str, path: str, data: Optional[dict] = None,
592
- params: Optional[dict] = None) -> requests.Response:
815
+ params: Optional[dict] = None) -> httpx.Response:
593
816
  """
594
817
  Make HTTP request with session headers and error handling.
595
818
  """
596
819
  headers = {
597
820
  'X-Session-Id': self._session_id,
598
- 'Content-Type': 'application/json'
821
+ 'Content-Type': 'application/json',
822
+ 'X-Generated-At': _generate_timestamp(),
823
+ 'X-Fingerprint': Fingerprint.generate()
599
824
  }
600
825
 
601
826
  try:
@@ -606,20 +831,21 @@ class PDFDancer:
606
831
  if DEBUG:
607
832
  print(f"{time.time()}|{method} {path} - request size: {request_size} bytes")
608
833
 
609
- response = self._session.request(
834
+ response = self._client.request(
610
835
  method=method,
611
836
  url=self._cleanup_url_path(self._base_url, path),
612
837
  json=data,
613
838
  params=params,
614
839
  headers=headers,
615
- timeout=self._read_timeout if self._read_timeout > 0 else None,
616
- verify=not DISABLE_SSL_VERIFY
840
+ timeout=self._read_timeout if self._read_timeout > 0 else None
617
841
  )
618
842
 
619
843
  response_size = len(response.content)
620
844
  if DEBUG:
621
845
  print(f"{time.time()}|{method} {path} - response size: {response_size} bytes")
622
846
 
847
+ _log_generated_at_header(response, method, path)
848
+
623
849
  # Handle FontNotFoundException
624
850
  if response.status_code == 404:
625
851
  try:
@@ -633,53 +859,86 @@ class PDFDancer:
633
859
  response.raise_for_status()
634
860
  return response
635
861
 
636
- except requests.exceptions.RequestException as e:
637
- self._handle_authentication_error(getattr(e, 'response', None))
638
- error_message = self._extract_error_message(getattr(e, 'response', None))
639
- raise HttpClientException(f"API request failed: {error_message}", response=getattr(e, 'response', None),
862
+ except httpx.HTTPStatusError as e:
863
+ self._handle_authentication_error(e.response)
864
+ error_message = self._extract_error_message(e.response)
865
+ raise HttpClientException(f"API request failed: {error_message}", response=e.response,
866
+ cause=e) from None
867
+ except httpx.RequestError as e:
868
+ raise HttpClientException(f"API request failed: {str(e)}", response=None,
640
869
  cause=e) from None
641
870
 
642
- def _find(self, object_type: Optional[ObjectType] = None, position: Optional[Position] = None) -> List[ObjectRef]:
871
+ def _find(self, object_type: Optional[ObjectType] = None, position: Optional[Position] = None,
872
+ tolerance: float = DEFAULT_TOLERANCE) -> List[ObjectRef]:
643
873
  """
644
874
  Searches for PDF objects matching the specified criteria.
645
- This method provides flexible search capabilities across all PDF content,
646
- allowing filtering by object type and position constraints.
875
+ Uses snapshot cache for all queries except paths at specific coordinates.
647
876
 
648
877
  Args:
649
878
  object_type: The type of objects to find (None for all types)
650
879
  position: Positional constraints for the search (None for all positions)
880
+ tolerance: Tolerance in points for spatial matching (default: DEFAULT_TOLERANCE)
651
881
 
652
882
  Returns:
653
883
  List of object references matching the search criteria
654
884
  """
655
- request_data = FindRequest(object_type, position).to_dict()
656
- response = self._make_request('POST', '/pdf/find', data=request_data)
657
-
658
- # Parse response into ObjectRef objects
659
- objects_data = response.json()
660
- return [self._parse_object_ref(obj_data) for obj_data in objects_data]
885
+ # Special case: PATH queries with bounding_rect need API (full vector data)
886
+ if object_type == ObjectType.PATH and position and position.bounding_rect:
887
+ request_data = FindRequest(object_type, position).to_dict()
888
+ response = self._make_request('POST', '/pdf/find', data=request_data)
889
+ objects_data = response.json()
890
+ return [self._parse_object_ref(obj_data) for obj_data in objects_data]
891
+
892
+ # Use snapshot for all other queries
893
+ if position and position.page_index is not None:
894
+ snapshot = self._get_or_fetch_page_snapshot(position.page_index)
895
+ return self._filter_snapshot_elements(snapshot.elements, object_type, position, tolerance)
896
+ else:
897
+ snapshot = self._get_or_fetch_document_snapshot()
898
+ all_elements = []
899
+ for page_snap in snapshot.pages:
900
+ all_elements.extend(page_snap.elements)
901
+ return self._filter_snapshot_elements(all_elements, object_type, position, tolerance)
661
902
 
662
- def select_paragraphs(self) -> List[TextObjectRef]:
903
+ def select_paragraphs(self) -> List[ParagraphObject]:
663
904
  """
664
- Searches for paragraph objects returning TextObjectRef with hierarchical structure.
905
+ Searches for paragraph objects returning ParagraphObject instances.
665
906
  """
666
- return self._find_paragraphs(None)
907
+ return self._to_paragraph_objects(self._find_paragraphs(None))
667
908
 
668
- def _find_paragraphs(self, position: Optional[Position] = None) -> List[TextObjectRef]:
909
+ def _find_paragraphs(self, position: Optional[Position] = None, tolerance: float = DEFAULT_TOLERANCE) -> List[
910
+ TextObjectRef]:
669
911
  """
670
912
  Searches for paragraph objects returning TextObjectRef with hierarchical structure.
913
+ Uses snapshot cache for all queries.
671
914
  """
672
- request_data = FindRequest(ObjectType.PARAGRAPH, position).to_dict()
673
- response = self._make_request('POST', '/pdf/find', data=request_data)
674
-
675
- objects_data = response.json()
676
- return [self._parse_text_object_ref(obj_data) for obj_data in objects_data]
915
+ # Use snapshot for all queries (including spatial)
916
+ if position and position.page_index is not None:
917
+ snapshot = self._get_or_fetch_page_snapshot(position.page_index)
918
+ return self._filter_snapshot_elements(snapshot.elements, ObjectType.PARAGRAPH, position, tolerance)
919
+ else:
920
+ snapshot = self._get_or_fetch_document_snapshot()
921
+ all_elements = []
922
+ for page_snap in snapshot.pages:
923
+ all_elements.extend(page_snap.elements)
924
+ return self._filter_snapshot_elements(all_elements, ObjectType.PARAGRAPH, position, tolerance)
677
925
 
678
- def _find_images(self, position: Optional[Position] = None) -> List[ObjectRef]:
926
+ def _find_images(self, position: Optional[Position] = None, tolerance: float = DEFAULT_TOLERANCE) -> List[
927
+ ObjectRef]:
679
928
  """
680
929
  Searches for image objects at the specified position.
930
+ Uses snapshot cache for all queries.
681
931
  """
682
- return self._find(ObjectType.IMAGE, position)
932
+ # Use snapshot for all queries (including spatial)
933
+ if position and position.page_index is not None:
934
+ snapshot = self._get_or_fetch_page_snapshot(position.page_index)
935
+ return self._filter_snapshot_elements(snapshot.elements, ObjectType.IMAGE, position, tolerance)
936
+ else:
937
+ snapshot = self._get_or_fetch_document_snapshot()
938
+ all_elements = []
939
+ for page_snap in snapshot.pages:
940
+ all_elements.extend(page_snap.elements)
941
+ return self._filter_snapshot_elements(all_elements, ObjectType.IMAGE, position, tolerance)
683
942
 
684
943
  def select_images(self) -> List[ImageObject]:
685
944
  """
@@ -693,11 +952,22 @@ class PDFDancer:
693
952
  """
694
953
  return self._to_form_objects(self._find(ObjectType.FORM_X_OBJECT, None))
695
954
 
696
def _find_form_x_objects(self, position: Optional[Position] = None, tolerance: float = DEFAULT_TOLERANCE) -> List[ObjectRef]:
    """
    Look up form X objects, optionally narrowed by ``position``.

    Results are served from the snapshot cache: the single-page snapshot when
    ``position`` carries a page index, otherwise the whole-document snapshot.
    ``tolerance`` is forwarded unchanged to the snapshot filter.
    """
    wanted = ObjectType.FORM_X_OBJECT

    # No page restriction: gather the elements of every page in the document.
    if not position or position.page_index is None:
        document = self._get_or_fetch_document_snapshot()
        candidates = [element for page in document.pages for element in page.elements]
        return self._filter_snapshot_elements(candidates, wanted, position, tolerance)

    # Page-scoped query: a single page snapshot is enough.
    page = self._get_or_fetch_page_snapshot(position.page_index)
    return self._filter_snapshot_elements(page.elements, wanted, position, tolerance)
701
971
 
702
972
  def select_form_fields(self) -> List[FormFieldObject]:
703
973
  """
@@ -711,17 +981,23 @@ class PDFDancer:
711
981
  """
712
982
  return self._to_form_field_objects(self._find_form_fields(Position.by_name(field_name)))
713
983
 
714
def _find_form_fields(self, position: Optional[Position] = None, tolerance: float = DEFAULT_TOLERANCE) -> List[FormFieldRef]:
    """
    Look up form fields, optionally narrowed by ``position``.

    Returns FormFieldRef objects with name and value properties. Every query,
    including name-based and spatial ones, is answered from the snapshot cache:
    the single-page snapshot when ``position`` carries a page index, otherwise
    the whole-document snapshot.
    """
    wanted = ObjectType.FORM_FIELD

    # No page restriction: gather the elements of every page in the document.
    if not position or position.page_index is None:
        document = self._get_or_fetch_document_snapshot()
        candidates = [element for page in document.pages for element in page.elements]
        return self._filter_snapshot_elements(candidates, wanted, position, tolerance)

    # Page-scoped query: a single page snapshot is enough.
    page = self._get_or_fetch_page_snapshot(position.page_index)
    return self._filter_snapshot_elements(page.elements, wanted, position, tolerance)
725
1001
 
726
1002
  def _change_form_field(self, form_field_ref: FormFieldRef, new_value: str) -> bool:
727
1003
  """
@@ -730,9 +1006,12 @@ class PDFDancer:
730
1006
  if form_field_ref is None:
731
1007
  raise ValidationException("Form field reference cannot be null")
732
1008
 
733
- request_data = ChangeFormFieldRequest(form_field_ref, new_value).to_dict()
734
- response = self._make_request('PUT', '/pdf/modify/formField', data=request_data)
735
- return response.json()
1009
+ try:
1010
+ request_data = ChangeFormFieldRequest(form_field_ref, new_value).to_dict()
1011
+ response = self._make_request('PUT', '/pdf/modify/formField', data=request_data)
1012
+ return response.json()
1013
+ finally:
1014
+ self._invalidate_snapshots()
736
1015
 
737
1016
  def select_paths(self) -> List[ObjectRef]:
738
1017
  """
@@ -740,21 +1019,45 @@ class PDFDancer:
740
1019
  """
741
1020
  return self._find(ObjectType.PATH, None)
742
1021
 
743
def _find_paths(self, position: Optional[Position] = None, tolerance: float = DEFAULT_TOLERANCE) -> List[ObjectRef]:
    """
    Look up vector path objects, optionally narrowed by ``position``.

    Note: queries carrying a bounding rect are delegated to ``_find`` (the API)
    because, per the original author's note, snapshots lack the full vector data
    needed for precise intersection tests. All other queries use the snapshot cache.
    """
    if position:
        # Spatial query: pass through to the API-backed finder.
        if position.bounding_rect:
            return self._find(ObjectType.PATH, position, tolerance)

        # Plain page-level query: answer from that page's snapshot.
        if position.page_index is not None:
            page = self._get_or_fetch_page_snapshot(position.page_index)
            return self._filter_snapshot_elements(page.elements, ObjectType.PATH, position, tolerance)

    # Document-level query: flatten the elements of every page snapshot.
    document = self._get_or_fetch_document_snapshot()
    candidates = [element for page in document.pages for element in page.elements]
    return self._filter_snapshot_elements(candidates, ObjectType.PATH, position, tolerance)
748
1044
 
749
def _find_text_lines(self, position: Optional[Position] = None, tolerance: float = DEFAULT_TOLERANCE) -> List[TextObjectRef]:
    """
    Look up text line objects, returned as TextObjectRef with hierarchical structure.

    Results are served from the snapshot cache: the single-page snapshot when
    ``position`` carries a page index, otherwise the whole-document snapshot.
    ``tolerance`` is forwarded unchanged to the snapshot filter.
    """
    wanted = ObjectType.TEXT_LINE

    # No page restriction: gather the elements of every page in the document.
    if not position or position.page_index is None:
        document = self._get_or_fetch_document_snapshot()
        candidates = [element for page in document.pages for element in page.elements]
        return self._filter_snapshot_elements(candidates, wanted, position, tolerance)

    # Page-scoped query: a single page snapshot is enough.
    page = self._get_or_fetch_page_snapshot(position.page_index)
    return self._filter_snapshot_elements(page.elements, wanted, position, tolerance)
758
1061
 
759
1062
  def select_text_lines(self) -> List[TextLineObject]:
760
1063
  """
@@ -764,7 +1067,7 @@ class PDFDancer:
764
1067
 
765
1068
def page(self, page_index: int) -> PageClient:
    """
    Return a client for one page, preferring data from the snapshot cache.

    Args:
        page_index: The 0-based page index

    Returns:
        PageClient with page properties populated
    """
    # First choice: the page ref carried by the (cached or freshly fetched) snapshot.
    cached = self._get_or_fetch_page_snapshot(page_index)
    if cached and cached.page_ref:
        return PageClient.from_ref(self, cached.page_ref)

    # Snapshot had no page ref: fall back to the dedicated page lookup.
    fetched_ref = self._get_page(page_index)
    if not fetched_ref:
        return PageClient(page_index, self)
    return PageClient.from_ref(self, fetched_ref)
781
1089
 
782
1090
  # Page Operations
@@ -786,11 +1094,11 @@ class PDFDancer:
786
1094
 
787
1095
def _get_pages(self) -> List[PageRef]:
    """
    Collect the page reference of every page, read from the document snapshot cache.
    """
    page_refs = []
    # The document snapshot already lists all pages, so no page-specific request is made here.
    for page in self._get_or_fetch_document_snapshot().pages:
        page_refs.append(page.page_ref)
    return page_refs
794
1102
 
795
1103
  def _get_page(self, page_index: int) -> Optional[PageRef]:
796
1104
  """
@@ -830,7 +1138,13 @@ class PDFDancer:
830
1138
  request_data = page_ref.to_dict()
831
1139
 
832
1140
  response = self._make_request('DELETE', '/pdf/page/delete', data=request_data)
833
- return response.json()
1141
+ result = response.json()
1142
+
1143
+ # Invalidate snapshot caches after mutation
1144
+ if result:
1145
+ self._invalidate_snapshots()
1146
+
1147
+ return result
834
1148
 
835
1149
  def move_page(self, from_page_index: int, to_page_index: int) -> bool:
836
1150
  """Move a page to a different index within the document."""
@@ -849,6 +1163,11 @@ class PDFDancer:
849
1163
  request_data = PageMoveRequest(from_page_index, to_page_index).to_dict()
850
1164
  response = self._make_request('PUT', '/pdf/page/move', data=request_data)
851
1165
  result = response.json()
1166
+
1167
+ # Invalidate snapshot caches after mutation
1168
+ if result:
1169
+ self._invalidate_snapshots()
1170
+
852
1171
  return bool(result)
853
1172
 
854
1173
  # Manipulation Operations
@@ -868,7 +1187,13 @@ class PDFDancer:
868
1187
 
869
1188
  request_data = DeleteRequest(object_ref).to_dict()
870
1189
  response = self._make_request('DELETE', '/pdf/delete', data=request_data)
871
- return response.json()
1190
+ result = response.json()
1191
+
1192
+ # Invalidate snapshot caches after mutation
1193
+ if result:
1194
+ self._invalidate_snapshots()
1195
+
1196
+ return result
872
1197
 
873
1198
  def _move(self, object_ref: ObjectRef, position: Position) -> bool:
874
1199
  """
@@ -888,7 +1213,13 @@ class PDFDancer:
888
1213
 
889
1214
  request_data = MoveRequest(object_ref, position).to_dict()
890
1215
  response = self._make_request('PUT', '/pdf/move', data=request_data)
891
- return response.json()
1216
+ result = response.json()
1217
+
1218
+ # Invalidate snapshot caches after mutation
1219
+ if result:
1220
+ self._invalidate_snapshots()
1221
+
1222
+ return result
892
1223
 
893
1224
  # Add Operations
894
1225
 
@@ -935,24 +1266,58 @@ class PDFDancer:
935
1266
 
936
1267
  return self._add_object(paragraph)
937
1268
 
1269
def _add_path(self, path: 'Path') -> bool:
    """
    Validate a path and add it to the document.

    Args:
        path: Path object exposing ``get_position()`` and ``get_path_segments()``.

    Returns:
        The result of ``_add_object`` for the validated path.

    Raises:
        ValidationException: If the path is None, has no position, has a missing
            or negative page index, or has no segments.
    """
    if path is None:
        raise ValidationException("Path cannot be null")

    # Read the position once instead of calling the getter for every check.
    position = path.get_position()
    if position is None:
        raise ValidationException("Path position is null")
    if position.page_index is None:
        raise ValidationException("Path position page index is null")
    if position.page_index < 0:
        raise ValidationException("Path position page index is less than 0")

    # A falsy segment collection covers both None and an empty list.
    if not path.get_path_segments():
        raise ValidationException("Path must have at least one segment")

    return self._add_object(path)
1287
+
938
1288
def _add_object(self, pdf_object) -> bool:
    """
    Send any PDF object to the add endpoint and return the server's answer.

    The snapshot caches are dropped only when the server reports a truthy result.
    """
    payload = AddRequest(pdf_object).to_dict()
    outcome = self._make_request('POST', '/pdf/add', data=payload).json()

    # A successful add changes the document, so cached snapshots are stale.
    if outcome:
        self._invalidate_snapshots()
    return outcome
945
1301
 
946
1302
def new_paragraph(self) -> ParagraphBuilder:
    """Create a ParagraphBuilder constructed with this client instance."""
    return ParagraphBuilder(self)
948
1304
 
949
1305
def new_page(self):
    """
    Ask the server to add a page and return the parsed page reference.

    The snapshot caches are always cleared afterwards, since the page set changed.
    """
    raw_page = self._make_request('POST', '/pdf/page/add', data=None).json()
    page_ref = self._parse_page_ref(raw_page)

    # Cached snapshots no longer reflect the document's pages.
    self._invalidate_snapshots()
    return page_ref
952
1313
 
953
1314
def new_image(self) -> ImageBuilder:
    """Create an ImageBuilder constructed with this client instance."""
    return ImageBuilder(self)
955
1316
 
1317
def new_path(self) -> 'PathBuilder':
    """Create a PathBuilder constructed with this client instance."""
    # Imported at call time rather than at module top — presumably to avoid a
    # circular import between this module and path_builder; TODO confirm.
    from .path_builder import PathBuilder
    return PathBuilder(self)
1320
+
956
1321
  # Modify Operations
957
1322
  def _modify_paragraph(self, object_ref: ObjectRef, new_paragraph: Union[Paragraph, str]) -> CommandResult:
958
1323
  """
@@ -974,12 +1339,16 @@ class PDFDancer:
974
1339
  # Text modification - returns CommandResult
975
1340
  request_data = ModifyTextRequest(object_ref, new_paragraph).to_dict()
976
1341
  response = self._make_request('PUT', '/pdf/text/paragraph', data=request_data)
977
- return CommandResult.from_dict(response.json())
1342
+ result = CommandResult.from_dict(response.json())
978
1343
  else:
979
1344
  # Object modification
980
1345
  request_data = ModifyRequest(object_ref, new_paragraph).to_dict()
981
1346
  response = self._make_request('PUT', '/pdf/modify', data=request_data)
982
- return CommandResult.from_dict(response.json())
1347
+ result = CommandResult.from_dict(response.json())
1348
+
1349
+ # Invalidate snapshot caches after mutation
1350
+ self._invalidate_snapshots()
1351
+ return result
983
1352
 
984
1353
  def _modify_text_line(self, object_ref: ObjectRef, new_text: str) -> CommandResult:
985
1354
  """
@@ -999,7 +1368,11 @@ class PDFDancer:
999
1368
 
1000
1369
  request_data = ModifyTextRequest(object_ref, new_text).to_dict()
1001
1370
  response = self._make_request('PUT', '/pdf/text/line', data=request_data)
1002
- return CommandResult.from_dict(response.json())
1371
+ result = CommandResult.from_dict(response.json())
1372
+
1373
+ # Invalidate snapshot caches after mutation
1374
+ self._invalidate_snapshots()
1375
+ return result
1003
1376
 
1004
1377
  # Font Operations
1005
1378
 
@@ -1083,31 +1456,220 @@ class PDFDancer:
1083
1456
  if DEBUG:
1084
1457
  print(f"{time.time()}|POST /font/register - request size: {request_size} bytes")
1085
1458
 
1086
- headers = {'X-Session-Id': self._session_id}
1087
- response = self._session.post(
1459
+ headers = {
1460
+ 'X-Session-Id': self._session_id,
1461
+ 'X-Generated-At': _generate_timestamp()
1462
+ }
1463
+ response = self._client.post(
1088
1464
  self._cleanup_url_path(self._base_url, "/font/register"),
1089
1465
  files=files,
1090
1466
  headers=headers,
1091
- timeout=30,
1092
- verify=not DISABLE_SSL_VERIFY
1467
+ timeout=30
1093
1468
  )
1094
1469
 
1095
1470
  response_size = len(response.content)
1096
1471
  if DEBUG:
1097
1472
  print(f"{time.time()}|POST /font/register - response size: {response_size} bytes")
1098
1473
 
1474
+ _log_generated_at_header(response, "POST", "/font/register")
1099
1475
  response.raise_for_status()
1100
1476
  return response.text.strip()
1101
1477
 
1102
1478
  except (IOError, OSError) as e:
1103
1479
  raise PdfDancerException(f"Failed to read font file: {e}", cause=e)
1104
- except requests.exceptions.RequestException as e:
1105
- error_message = self._extract_error_message(getattr(e, 'response', None))
1480
+ except httpx.HTTPStatusError as e:
1481
+ error_message = self._extract_error_message(e.response)
1106
1482
  raise HttpClientException(f"Font registration failed: {error_message}",
1107
- response=getattr(e, 'response', None), cause=e) from None
1483
+ response=e.response, cause=e) from None
1484
+ except httpx.RequestError as e:
1485
+ raise HttpClientException(f"Font registration failed: {str(e)}",
1486
+ response=None, cause=e) from None
1108
1487
 
1109
1488
  # Document Operations
1110
1489
 
1490
+ # Snapshot Operations
1491
+
1492
def get_document_snapshot(self, types: Optional[str] = None) -> DocumentSnapshot:
    """
    Fetch a snapshot of the whole document, covering all pages and their elements.

    Args:
        types: Optional comma-separated string of object types to filter (e.g., "PARAGRAPH,IMAGE")

    Returns:
        DocumentSnapshot containing page count, fonts, and all page snapshots
    """
    # The type filter is sent only when the caller supplied a non-empty value.
    query = {'types': types} if types else {}
    payload = self._make_request('GET', '/pdf/document/snapshot', params=query).json()
    return self._parse_document_snapshot(payload)
1510
+
1511
def get_page_snapshot(self, page_index: int, types: Optional[str] = None) -> PageSnapshot:
    """
    Fetch a snapshot of one page together with all of its elements.

    Args:
        page_index: The index of the page to snapshot (0-based)
        types: Optional comma-separated string of object types to filter (e.g., "PARAGRAPH,IMAGE")

    Returns:
        PageSnapshot containing page reference and all elements on that page

    Raises:
        ValidationException: If ``page_index`` is negative.
    """
    if page_index < 0:
        raise ValidationException(f"Page index must be >= 0, got {page_index}")

    # The type filter is sent only when the caller supplied a non-empty value.
    query = {'types': types} if types else {}
    payload = self._make_request('GET', f'/pdf/page/{page_index}/snapshot', params=query).json()
    return self._parse_page_snapshot(payload)
1533
+
1534
def _get_or_fetch_document_snapshot(self) -> DocumentSnapshot:
    """
    Return the cached document snapshot, fetching and caching it on first use.

    On a fetch, each page snapshot of the document is also stored in the
    per-page cache unless that page index already has an entry.
    """
    if self._document_snapshot is not None:
        return self._document_snapshot

    self._document_snapshot = self.get_document_snapshot()
    # Seed the per-page cache without overwriting entries that are already there.
    for index, page in enumerate(self._document_snapshot.pages):
        if index not in self._page_snapshots:
            self._page_snapshots[index] = page
    return self._document_snapshot
1547
+
1548
def _get_or_fetch_page_snapshot(self, page_index: int) -> PageSnapshot:
    """
    Return the snapshot of one page, consulting the caches before the API.

    Lookup order: the per-page cache, then the cached document snapshot (when
    the index is within its page list), then an individual page fetch. Whatever
    is found is stored in the per-page cache.
    """
    cache = self._page_snapshots
    if page_index not in cache:
        document = self._document_snapshot
        if document is not None and 0 <= page_index < len(document.pages):
            # Reuse the page already contained in the document snapshot.
            cache[page_index] = document.pages[page_index]
        else:
            # Nothing cached covers this page: fetch it individually.
            cache[page_index] = self.get_page_snapshot(page_index)
    return cache[page_index]
1568
+
1569
def _invalidate_snapshots(self) -> None:
    """
    Drop every cached snapshot so the next query fetches fresh data.

    Invoked by the mutating operations of this client after they change the document.
    """
    # Empty the per-page cache in place, then forget the document snapshot.
    self._page_snapshots.clear()
    self._document_snapshot = None
1576
+
1577
def _filter_snapshot_elements(self, elements: List, object_type: ObjectType,
                              position: Optional[Position] = None, tolerance: float = DEFAULT_TOLERANCE) -> List:
    """
    Filter snapshot elements client-side by object type and position criteria.

    Args:
        elements: Elements taken from a snapshot (ObjectRef, TextObjectRef, etc.)
        object_type: Type to keep. ``ObjectType.FORM_FIELD`` also keeps the concrete
            form-field subtypes (TEXT_FIELD, CHECK_BOX, RADIO_BUTTON, BUTTON, DROPDOWN).
        position: Optional criteria. Each criterion that is set (text prefix, regex
            pattern, bounding rect, name) narrows the result further. ``page_index``
            is not evaluated here. With ``None`` only the type filter is applied.
        tolerance: Slack in points passed to ``_rects_intersect`` for the
            bounding-rect criterion. Defaults to ``DEFAULT_TOLERANCE``.

    Returns:
        A new list holding the elements that satisfy every active criterion.
    """
    import re

    # Filter by object type (a FORM_FIELD query accepts all form-field subtypes).
    if object_type == ObjectType.FORM_FIELD:
        form_field_types = {ObjectType.FORM_FIELD, ObjectType.TEXT_FIELD,
                            ObjectType.CHECK_BOX, ObjectType.RADIO_BUTTON,
                            ObjectType.BUTTON, ObjectType.DROPDOWN}
        result = [e for e in elements if e.type in form_field_types]
    else:
        result = [e for e in elements if e.type == object_type]

    if position is None:
        return result

    # Text prefix filter; compared case-insensitively (original note: matches API behavior).
    if position.text_starts_with:
        prefix = position.text_starts_with.lower()
        result = [
            e for e in result
            if isinstance(e, TextObjectRef) and e.text and e.text.lower().startswith(prefix)
        ]

    # Regex filter; the pattern is applied as written (case-sensitive unless it says otherwise).
    if position.text_pattern:
        pattern = re.compile(position.text_pattern)
        result = [
            e for e in result
            if isinstance(e, TextObjectRef) and e.text and pattern.search(e.text)
        ]

    # Bounding rect filter (spatial queries like at(x, y)); elements without a rect are dropped.
    if position.bounding_rect:
        rect = position.bounding_rect
        result = [
            e for e in result
            if e.position and e.position.bounding_rect and
            self._rects_intersect(e.position.bounding_rect, rect, tolerance)
        ]

    # Name filter (for form fields); only FormFieldRef elements can match.
    if position.name:
        from .models import FormFieldRef
        result = [
            e for e in result
            if isinstance(e, FormFieldRef) and e.name == position.name
        ]

    return result
1643
+
1644
@staticmethod
def _rects_intersect(rect1, rect2, tolerance: float = DEFAULT_TOLERANCE) -> bool:
    """
    Check whether two bounding rectangles intersect or are very close.

    Both rectangles are grown by ``tolerance`` on every side before the overlap
    test, so point queries (width/height = 0) can still match. Because the
    tolerance is added to both rectangles, rectangles separated by a gap of up
    to ``2 * tolerance`` are still reported as intersecting.

    Args:
        rect1: First bounding rectangle (reads ``x``, ``y``, ``width``, ``height``)
        rect2: Second bounding rectangle (reads ``x``, ``y``, ``width``, ``height``)
        tolerance: Slack in points for position matching. Defaults to
            ``DEFAULT_TOLERANCE``.

    Returns:
        True if the grown rectangles overlap or touch, False otherwise.
    """
    # Effective bounds with tolerance applied.
    r1_left = rect1.x - tolerance
    r1_right = rect1.x + rect1.width + tolerance
    r1_top = rect1.y - tolerance
    r1_bottom = rect1.y + rect1.height + tolerance

    r2_left = rect2.x - tolerance
    r2_right = rect2.x + rect2.width + tolerance
    r2_top = rect2.y - tolerance
    r2_bottom = rect2.y + rect2.height + tolerance

    # The rectangles are disjoint exactly when they are separated along one axis.
    separated_x = r1_right < r2_left or r2_right < r1_left
    separated_y = r1_bottom < r2_top or r2_bottom < r1_top
    return not (separated_x or separated_y)
1672
+
1111
1673
  def get_bytes(self) -> bytes:
1112
1674
  """
1113
1675
  Downloads the current state of the PDF document with all modifications applied.
@@ -1298,6 +1860,175 @@ class PDFDancer:
1298
1860
  orientation=orientation
1299
1861
  )
1300
1862
 
1863
def _parse_path_segment(self, segment_data: dict) -> 'PathSegment':
    """
    Parse JSON data into a PathSegment instance (Line or Bezier).

    The segment kind is read from ``segmentType``, falling back to ``type``;
    a missing, null or empty value yields a plain ``PathSegment``.

    Args:
        segment_data: JSON object describing one path segment.

    Returns:
        ``Line`` for ``LINE``, ``Bezier`` for ``BEZIER``, otherwise the base
        ``PathSegment``; all carry the shared stroke/fill/dash properties.
    """
    from .models import Line, Bezier, PathSegment, Point, Color

    def parse_color(raw):
        """Build a Color from a JSON colour object, or None if it is unusable."""
        if not isinstance(raw, dict):
            return None
        r = raw.get('red', 0)
        g = raw.get('green', 0)
        b = raw.get('blue', 0)
        a = raw.get('alpha', 255)
        # Only the RGB channels are type-checked; alpha is passed through as given.
        if all(isinstance(v, int) for v in (r, g, b)):
            return Color(r, g, b, a)
        return None

    def parse_point(key):
        """Build a Point from the JSON object under ``key``, or None if absent/empty."""
        raw = segment_data.get(key)
        return Point(raw.get('x', 0.0), raw.get('y', 0.0)) if raw else None

    # ``or`` chaining tolerates an explicit null, which would otherwise break ``.upper()``.
    segment_type = str(segment_data.get('segmentType') or segment_data.get('type') or '').upper()

    # Properties shared by every segment kind.
    common = {
        'stroke_color': parse_color(segment_data.get('strokeColor')),
        'fill_color': parse_color(segment_data.get('fillColor')),
        'stroke_width': segment_data.get('strokeWidth'),
        'dash_array': segment_data.get('dashArray'),
        'dash_phase': segment_data.get('dashPhase'),
    }

    if segment_type == 'LINE':
        return Line(**common, p0=parse_point('p0'), p1=parse_point('p1'))
    if segment_type == 'BEZIER':
        return Bezier(
            **common,
            p0=parse_point('p0'),
            p1=parse_point('p1'),
            p2=parse_point('p2'),
            p3=parse_point('p3'),
        )
    # Fallback to base PathSegment for unknown types.
    return PathSegment(**common)
1942
+
1943
def _parse_path(self, obj_data: dict) -> 'Path':
    """
    Build a Path from JSON data, including its parsed path segments.

    ``path_segments`` is None when no segment could be parsed; non-dict entries
    in ``pathSegments`` are ignored.
    """
    from .models import Path

    raw_position = obj_data.get('position', {})
    position = None
    if raw_position:
        position = self._parse_position(raw_position)

    # Only a list of dicts is treated as segment data.
    raw_segments = obj_data.get('pathSegments', [])
    segments = []
    if isinstance(raw_segments, list):
        segments = [
            self._parse_path_segment(item)
            for item in raw_segments
            if isinstance(item, dict)
        ]

    fill_rule = obj_data.get('evenOddFill')
    return Path(position=position, path_segments=segments or None, even_odd_fill=fill_rule)
1965
+
1966
def _parse_font_recommendation(self, data: dict) -> FontRecommendation:
    """
    Build a FontRecommendation from JSON data.

    Missing keys fall back to font type 'SYSTEM', an empty name and score 0.0.
    """
    resolved_type = FontType(data.get('fontType', 'SYSTEM'))
    name = data.get('fontName', '')
    score = data.get('similarityScore', 0.0)
    return FontRecommendation(font_name=name, font_type=resolved_type, similarity_score=score)
1976
+
1977
def _parse_page_snapshot(self, data: dict) -> PageSnapshot:
    """
    Parse JSON data into a PageSnapshot, choosing a parser per element type.

    Paragraphs and text lines become TextObjectRef, form-field types become
    FormFieldRef, everything else becomes a basic ObjectRef. Elements without a
    type, with an unknown type, or whose parsing raises ValueError/KeyError are
    skipped.

    Args:
        data: JSON object with an optional ``pageRef`` and ``elements`` list.

    Returns:
        PageSnapshot holding the parsed page reference and elements.
    """
    text_types = (ObjectType.PARAGRAPH, ObjectType.TEXT_LINE)
    form_field_types = (ObjectType.FORM_FIELD, ObjectType.TEXT_FIELD,
                        ObjectType.CHECK_BOX, ObjectType.RADIO_BUTTON,
                        ObjectType.BUTTON, ObjectType.DROPDOWN)

    page_ref = self._parse_page_ref(data.get('pageRef', {}))

    elements = []
    for elem_data in data.get('elements', []):
        elem_type_str = elem_data.get('type')
        if not elem_type_str:
            continue

        # Normalize type string (API returns "CHECKBOX" but enum is "CHECK_BOX").
        if elem_type_str == "CHECKBOX":
            elem_type_str = "CHECK_BOX"
            # Only the top-level 'type' key changes, so a shallow copy is enough
            # to leave the caller's dict untouched.
            elem_data = {**elem_data, 'type': elem_type_str}

        try:
            elem_type = ObjectType(elem_type_str)

            if elem_type in text_types:
                # TextObjectRef captures text, font, color, children.
                parse = self._parse_text_object_ref
            elif elem_type in form_field_types:
                # FormFieldRef captures name and value.
                parse = self._parse_form_field_ref
            else:
                parse = self._parse_object_ref
            elements.append(parse(elem_data))
        except (ValueError, KeyError):
            # Best effort: skip elements with invalid types or incomplete data.
            continue

    return PageSnapshot(
        page_ref=page_ref,
        elements=elements
    )
2019
+
2020
def _parse_document_snapshot(self, data: dict) -> DocumentSnapshot:
    """
    Build a DocumentSnapshot from JSON data.

    Missing keys fall back to a page count of 0 and empty font/page lists.
    """
    total_pages = data.get('pageCount', 0)

    font_list = []
    for raw_font in data.get('fonts', []):
        font_list.append(self._parse_font_recommendation(raw_font))

    page_list = []
    for raw_page in data.get('pages', []):
        page_list.append(self._parse_page_snapshot(raw_page))

    return DocumentSnapshot(page_count=total_pages, fonts=font_list, pages=page_list)
2031
+
1301
2032
  # Builder Pattern Support
1302
2033
 
1303
2034
  def _paragraph_builder(self) -> 'ParagraphBuilder':
@@ -1316,9 +2047,17 @@ class PDFDancer:
1316
2047
 
1317
2048
def __exit__(self, exc_type, exc_val, exc_tb):
    """Context manager exit: close the HTTP client if one was created."""
    # TODO Could add session cleanup here if API supports it. Cleanup on the server
    if not hasattr(self, '_client'):
        return None
    # Close the HTTP client to free resources.
    self._client.close()
    return None
1321
2055
 
2056
def close(self):
    """Close the HTTP client, if this instance has one, to free its resources."""
    if not hasattr(self, '_client'):
        return
    self._client.close()
2060
+
1322
2061
def _to_path_objects(self, refs: List[ObjectRef]) -> List[PathObject]:
    """Wrap each object reference in a PathObject bound to this client."""
    wrapped = []
    for ref in refs:
        wrapped.append(PathObject(self, ref.internal_id, ref.type, ref.position))
    return wrapped
1324
2063