pdfdancer-client-python 0.2.16__py3-none-any.whl → 0.2.18__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pdfdancer-client-python might be problematic. Click here for more details.
- pdfdancer/__init__.py +29 -2
- pdfdancer/exceptions.py +3 -3
- pdfdancer/fingerprint.py +121 -0
- pdfdancer/models.py +243 -27
- pdfdancer/paragraph_builder.py +2 -1
- pdfdancer/path_builder.py +557 -0
- pdfdancer/pdfdancer_v1.py +900 -113
- pdfdancer/types.py +3 -3
- pdfdancer_client_python-0.2.18.dist-info/METADATA +585 -0
- pdfdancer_client_python-0.2.18.dist-info/RECORD +15 -0
- pdfdancer_client_python-0.2.18.dist-info/licenses/LICENSE +202 -0
- pdfdancer_client_python-0.2.18.dist-info/licenses/NOTICE +8 -0
- pdfdancer_client_python-0.2.16.dist-info/METADATA +0 -190
- pdfdancer_client_python-0.2.16.dist-info/RECORD +0 -11
- {pdfdancer_client_python-0.2.16.dist-info → pdfdancer_client_python-0.2.18.dist-info}/WHEEL +0 -0
- {pdfdancer_client_python-0.2.16.dist-info → pdfdancer_client_python-0.2.18.dist-info}/top_level.txt +0 -0
pdfdancer/pdfdancer_v1.py
CHANGED
|
@@ -5,16 +5,117 @@ A Python client that closely mirrors the Java Client class structure and functio
|
|
|
5
5
|
Provides session-based PDF manipulation operations with strict validation.
|
|
6
6
|
"""
|
|
7
7
|
|
|
8
|
+
import gzip
|
|
8
9
|
import json
|
|
9
10
|
import os
|
|
11
|
+
import time
|
|
12
|
+
from datetime import datetime, timezone
|
|
10
13
|
from pathlib import Path
|
|
11
14
|
from typing import List, Optional, Union, BinaryIO, Mapping, Any
|
|
12
15
|
|
|
13
|
-
import
|
|
16
|
+
import httpx
|
|
14
17
|
from dotenv import load_dotenv
|
|
15
18
|
|
|
19
|
+
from .fingerprint import Fingerprint
|
|
20
|
+
|
|
16
21
|
load_dotenv()
|
|
17
22
|
|
|
23
|
+
# Global variable to disable SSL certificate verification
|
|
24
|
+
# Set to True to skip SSL verification (useful for testing with self-signed certificates)
|
|
25
|
+
# WARNING: Only use in development/testing environments
|
|
26
|
+
DISABLE_SSL_VERIFY = os.environ.get("PDFDANCER_CLIENT_DISABLE_SSL_VERIFY", False)
|
|
27
|
+
|
|
28
|
+
DEBUG = os.environ.get("PDFDANCER_CLIENT_DEBUG", False)
|
|
29
|
+
DEFAULT_TOLERANCE = 0.01
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def _generate_timestamp() -> str:
|
|
33
|
+
"""
|
|
34
|
+
Generate a timestamp string in the format expected by the API.
|
|
35
|
+
Format: YYYY-MM-DDTHH:MM:SS.ffffffZ (with microseconds)
|
|
36
|
+
|
|
37
|
+
Returns:
|
|
38
|
+
Timestamp string with UTC timezone
|
|
39
|
+
"""
|
|
40
|
+
return datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%S.%fZ')
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def _parse_timestamp(timestamp_str: str) -> datetime:
|
|
44
|
+
"""
|
|
45
|
+
Parse timestamp string, handling both microseconds and nanoseconds precision.
|
|
46
|
+
|
|
47
|
+
Args:
|
|
48
|
+
timestamp_str: Timestamp string in format YYYY-MM-DDTHH:MM:SS.fffffffZ
|
|
49
|
+
(with 6 or 9 fractional digits)
|
|
50
|
+
|
|
51
|
+
Returns:
|
|
52
|
+
datetime object with UTC timezone
|
|
53
|
+
"""
|
|
54
|
+
# Remove the 'Z' suffix
|
|
55
|
+
ts = timestamp_str.rstrip('Z')
|
|
56
|
+
|
|
57
|
+
# Handle nanoseconds (9 digits) by truncating to microseconds (6 digits)
|
|
58
|
+
# Python's datetime only supports microseconds precision
|
|
59
|
+
if '.' in ts:
|
|
60
|
+
date_part, frac_part = ts.rsplit('.', 1)
|
|
61
|
+
if len(frac_part) > 6:
|
|
62
|
+
# Truncate to 6 digits (microseconds)
|
|
63
|
+
frac_part = frac_part[:6]
|
|
64
|
+
ts = f"{date_part}.{frac_part}"
|
|
65
|
+
|
|
66
|
+
return datetime.fromisoformat(ts).replace(tzinfo=timezone.utc)
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def _log_generated_at_header(response: httpx.Response, method: str, path: str) -> None:
|
|
70
|
+
"""
|
|
71
|
+
Check for X-Generated-At and X-Received-At headers and log timing information if DEBUG=True.
|
|
72
|
+
|
|
73
|
+
Expected timestamp formats:
|
|
74
|
+
- 2025-10-24T08:49:39.161945Z (microseconds - 6 digits)
|
|
75
|
+
- 2025-10-24T08:58:45.468131265Z (nanoseconds - 9 digits)
|
|
76
|
+
|
|
77
|
+
Args:
|
|
78
|
+
response: The HTTP response object
|
|
79
|
+
method: HTTP method used
|
|
80
|
+
path: API path
|
|
81
|
+
"""
|
|
82
|
+
if not DEBUG:
|
|
83
|
+
return
|
|
84
|
+
|
|
85
|
+
generated_at = response.headers.get('X-Generated-At')
|
|
86
|
+
received_at = response.headers.get('X-Received-At')
|
|
87
|
+
|
|
88
|
+
if generated_at or received_at:
|
|
89
|
+
try:
|
|
90
|
+
log_parts = []
|
|
91
|
+
current_time = datetime.now(timezone.utc)
|
|
92
|
+
|
|
93
|
+
# Parse and log X-Received-At
|
|
94
|
+
received_time = None
|
|
95
|
+
if received_at:
|
|
96
|
+
received_time = _parse_timestamp(received_at)
|
|
97
|
+
time_since_received = (current_time - received_time).total_seconds()
|
|
98
|
+
log_parts.append(f"X-Received-At: {received_at}, time since received: {time_since_received:.3f}s")
|
|
99
|
+
|
|
100
|
+
# Parse and log X-Generated-At
|
|
101
|
+
generated_time = None
|
|
102
|
+
if generated_at:
|
|
103
|
+
generated_time = _parse_timestamp(generated_at)
|
|
104
|
+
time_since_generated = (current_time - generated_time).total_seconds()
|
|
105
|
+
log_parts.append(f"X-Generated-At: {generated_at}, time since generated: {time_since_generated:.3f}s")
|
|
106
|
+
|
|
107
|
+
# Calculate processing time (X-Generated-At - X-Received-At)
|
|
108
|
+
if received_time and generated_time:
|
|
109
|
+
processing_time = (generated_time - received_time).total_seconds()
|
|
110
|
+
log_parts.append(f"processing time: {processing_time:.3f}s")
|
|
111
|
+
|
|
112
|
+
if log_parts:
|
|
113
|
+
print(f"{time.time()}|{method} {path} - {', '.join(log_parts)}")
|
|
114
|
+
|
|
115
|
+
except (ValueError, AttributeError) as e:
|
|
116
|
+
print(f"{time.time()}|{method} {path} - Header parse error: {e}")
|
|
117
|
+
|
|
118
|
+
|
|
18
119
|
from . import ParagraphBuilder
|
|
19
120
|
from .exceptions import (
|
|
20
121
|
PdfDancerException,
|
|
@@ -28,7 +129,8 @@ from .models import (
|
|
|
28
129
|
ObjectRef, Position, ObjectType, Font, Image, Paragraph, FormFieldRef, TextObjectRef, PageRef,
|
|
29
130
|
FindRequest, DeleteRequest, MoveRequest, PageMoveRequest, AddRequest, ModifyRequest, ModifyTextRequest,
|
|
30
131
|
ChangeFormFieldRequest, CommandResult,
|
|
31
|
-
ShapeType, PositionMode, PageSize, Orientation
|
|
132
|
+
ShapeType, PositionMode, PageSize, Orientation,
|
|
133
|
+
PageSnapshot, DocumentSnapshot, FontRecommendation, FontType
|
|
32
134
|
)
|
|
33
135
|
from .paragraph_builder import ParagraphPageBuilder
|
|
34
136
|
from .types import PathObject, ParagraphObject, TextLineObject, ImageObject, FormObject, FormFieldObject
|
|
@@ -52,9 +154,10 @@ class PageClient:
|
|
|
52
154
|
else:
|
|
53
155
|
self.orientation = orientation
|
|
54
156
|
|
|
55
|
-
def select_paths_at(self, x: float, y: float) -> List[PathObject]:
|
|
157
|
+
def select_paths_at(self, x: float, y: float, tolerance: float = DEFAULT_TOLERANCE) -> List[PathObject]:
|
|
158
|
+
position = Position.at_page_coordinates(self.page_index, x, y)
|
|
56
159
|
# noinspection PyProtectedMember
|
|
57
|
-
return self.root._to_path_objects(self.root._find_paths(
|
|
160
|
+
return self.root._to_path_objects(self.root._find_paths(position, tolerance))
|
|
58
161
|
|
|
59
162
|
def select_paragraphs(self) -> List[ParagraphObject]:
|
|
60
163
|
# noinspection PyProtectedMember
|
|
@@ -78,10 +181,10 @@ class PageClient:
|
|
|
78
181
|
# noinspection PyProtectedMember
|
|
79
182
|
return self.root._to_textline_objects(self.root._find_text_lines(position))
|
|
80
183
|
|
|
81
|
-
def select_paragraphs_at(self, x: float, y: float) -> List[ParagraphObject]:
|
|
184
|
+
def select_paragraphs_at(self, x: float, y: float, tolerance: float = DEFAULT_TOLERANCE) -> List[ParagraphObject]:
|
|
82
185
|
position = Position.at_page_coordinates(self.page_index, x, y)
|
|
83
186
|
# noinspection PyProtectedMember
|
|
84
|
-
return self.root._to_paragraph_objects(self.root._find_paragraphs(position))
|
|
187
|
+
return self.root._to_paragraph_objects(self.root._find_paragraphs(position, tolerance))
|
|
85
188
|
|
|
86
189
|
def select_text_lines(self) -> List[TextLineObject]:
|
|
87
190
|
position = Position.at_page(self.page_index)
|
|
@@ -94,29 +197,29 @@ class PageClient:
|
|
|
94
197
|
# noinspection PyProtectedMember
|
|
95
198
|
return self.root._to_textline_objects(self.root._find_text_lines(position))
|
|
96
199
|
|
|
97
|
-
def select_text_lines_at(self, x, y) -> List[TextLineObject]:
|
|
200
|
+
def select_text_lines_at(self, x, y, tolerance: float = DEFAULT_TOLERANCE) -> List[TextLineObject]:
|
|
98
201
|
position = Position.at_page_coordinates(self.page_index, x, y)
|
|
99
202
|
# noinspection PyProtectedMember
|
|
100
|
-
return self.root._to_textline_objects(self.root._find_text_lines(position))
|
|
203
|
+
return self.root._to_textline_objects(self.root._find_text_lines(position, tolerance))
|
|
101
204
|
|
|
102
205
|
def select_images(self) -> List[ImageObject]:
|
|
103
206
|
# noinspection PyProtectedMember
|
|
104
207
|
return self.root._to_image_objects(self.root._find_images(Position.at_page(self.page_index)))
|
|
105
208
|
|
|
106
|
-
def select_images_at(self, x: float, y: float) -> List[ImageObject]:
|
|
209
|
+
def select_images_at(self, x: float, y: float, tolerance: float = DEFAULT_TOLERANCE) -> List[ImageObject]:
|
|
107
210
|
position = Position.at_page_coordinates(self.page_index, x, y)
|
|
108
211
|
# noinspection PyProtectedMember
|
|
109
|
-
return self.root._to_image_objects(self.root._find_images(position))
|
|
212
|
+
return self.root._to_image_objects(self.root._find_images(position, tolerance))
|
|
110
213
|
|
|
111
214
|
def select_forms(self) -> List[FormObject]:
|
|
112
215
|
position = Position.at_page(self.page_index)
|
|
113
216
|
# noinspection PyProtectedMember
|
|
114
217
|
return self.root._to_form_objects(self.root._find_form_x_objects(position))
|
|
115
218
|
|
|
116
|
-
def select_forms_at(self, x: float, y: float) -> List[FormObject]:
|
|
219
|
+
def select_forms_at(self, x: float, y: float, tolerance: float = DEFAULT_TOLERANCE) -> List[FormObject]:
|
|
117
220
|
position = Position.at_page_coordinates(self.page_index, x, y)
|
|
118
221
|
# noinspection PyProtectedMember
|
|
119
|
-
return self.root._to_form_objects(self.root._find_form_x_objects(position))
|
|
222
|
+
return self.root._to_form_objects(self.root._find_form_x_objects(position, tolerance))
|
|
120
223
|
|
|
121
224
|
def select_form_fields(self) -> List[FormFieldObject]:
|
|
122
225
|
position = Position.at_page(self.page_index)
|
|
@@ -129,10 +232,10 @@ class PageClient:
|
|
|
129
232
|
# noinspection PyProtectedMember
|
|
130
233
|
return self.root._to_form_field_objects(self.root._find_form_fields(pos))
|
|
131
234
|
|
|
132
|
-
def select_form_fields_at(self, x: float, y: float) -> List[FormFieldObject]:
|
|
235
|
+
def select_form_fields_at(self, x: float, y: float, tolerance: float = DEFAULT_TOLERANCE) -> List[FormFieldObject]:
|
|
133
236
|
position = Position.at_page_coordinates(self.page_index, x, y)
|
|
134
237
|
# noinspection PyProtectedMember
|
|
135
|
-
return self.root._to_form_field_objects(self.root._find_form_fields(position))
|
|
238
|
+
return self.root._to_form_field_objects(self.root._find_form_fields(position, tolerance))
|
|
136
239
|
|
|
137
240
|
@classmethod
|
|
138
241
|
def from_ref(cls, root: 'PDFDancer', page_ref: PageRef) -> 'PageClient':
|
|
@@ -170,6 +273,18 @@ class PageClient:
|
|
|
170
273
|
def new_paragraph(self):
|
|
171
274
|
return ParagraphPageBuilder(self.root, self.page_index)
|
|
172
275
|
|
|
276
|
+
def new_path(self):
|
|
277
|
+
from .path_builder import PathBuilder
|
|
278
|
+
return PathBuilder(self.root, self.page_index)
|
|
279
|
+
|
|
280
|
+
def new_line(self):
|
|
281
|
+
from .path_builder import LineBuilder
|
|
282
|
+
return LineBuilder(self.root, self.page_index)
|
|
283
|
+
|
|
284
|
+
def new_bezier(self):
|
|
285
|
+
from .path_builder import BezierBuilder
|
|
286
|
+
return BezierBuilder(self.root, self.page_index)
|
|
287
|
+
|
|
173
288
|
def select_paths(self):
|
|
174
289
|
# noinspection PyProtectedMember
|
|
175
290
|
return self.root._to_path_objects(self.root._find_paths(Position.at_page(self.page_index)))
|
|
@@ -221,9 +336,15 @@ class PDFDancer:
|
|
|
221
336
|
"""
|
|
222
337
|
Create a client session, falling back to environment variables when needed.
|
|
223
338
|
|
|
339
|
+
Authentication:
|
|
340
|
+
- If token is provided, uses it
|
|
341
|
+
- Otherwise, checks PDFDANCER_TOKEN environment variable
|
|
342
|
+
- If no token is found, automatically obtains an anonymous token
|
|
343
|
+
|
|
224
344
|
Args:
|
|
225
345
|
pdf_data: PDF payload supplied directly or via filesystem handles.
|
|
226
|
-
token: Override for the API token; falls back to `PDFDANCER_TOKEN`
|
|
346
|
+
token: Override for the API token; falls back to `PDFDANCER_TOKEN` environment variable,
|
|
347
|
+
then to anonymous token if not set.
|
|
227
348
|
base_url: Override for the API base URL; falls back to `PDFDANCER_BASE_URL`
|
|
228
349
|
or defaults to `https://api.pdfdancer.com`.
|
|
229
350
|
timeout: HTTP read timeout in seconds.
|
|
@@ -234,6 +355,10 @@ class PDFDancer:
|
|
|
234
355
|
resolved_token = cls._resolve_token(token)
|
|
235
356
|
resolved_base_url = cls._resolve_base_url(base_url)
|
|
236
357
|
|
|
358
|
+
# If no token found, obtain anonymous token
|
|
359
|
+
if resolved_token is None:
|
|
360
|
+
resolved_token = cls._obtain_anonymous_token(resolved_base_url, timeout)
|
|
361
|
+
|
|
237
362
|
return PDFDancer(resolved_token, pdf_data, resolved_base_url, timeout)
|
|
238
363
|
|
|
239
364
|
@classmethod
|
|
@@ -244,18 +369,66 @@ class PDFDancer:
|
|
|
244
369
|
resolved_base_url = "https://api.pdfdancer.com"
|
|
245
370
|
return resolved_base_url
|
|
246
371
|
|
|
372
|
+
@classmethod
|
|
373
|
+
def _obtain_anonymous_token(cls, base_url: str, timeout: float = 30.0) -> str:
|
|
374
|
+
"""
|
|
375
|
+
Obtain an anonymous token from the /keys/anon endpoint.
|
|
376
|
+
|
|
377
|
+
Args:
|
|
378
|
+
base_url: Base URL of the PDFDancer API server
|
|
379
|
+
timeout: HTTP read timeout in seconds
|
|
380
|
+
|
|
381
|
+
Returns:
|
|
382
|
+
Anonymous token string
|
|
383
|
+
|
|
384
|
+
Raises:
|
|
385
|
+
HttpClientException: If token request fails
|
|
386
|
+
"""
|
|
387
|
+
try:
|
|
388
|
+
# Create temporary client without authentication
|
|
389
|
+
temp_client = httpx.Client(
|
|
390
|
+
http2=True,
|
|
391
|
+
verify=not DISABLE_SSL_VERIFY
|
|
392
|
+
)
|
|
393
|
+
|
|
394
|
+
headers = {
|
|
395
|
+
'X-Fingerprint': Fingerprint.generate()
|
|
396
|
+
}
|
|
397
|
+
|
|
398
|
+
response = temp_client.post(
|
|
399
|
+
cls._cleanup_url_path(base_url, "/keys/anon"),
|
|
400
|
+
headers=headers,
|
|
401
|
+
timeout=timeout if timeout > 0 else None
|
|
402
|
+
)
|
|
403
|
+
|
|
404
|
+
response.raise_for_status()
|
|
405
|
+
token_data = response.json()
|
|
406
|
+
|
|
407
|
+
# Extract token from response (matches Java AnonTokenResponse structure)
|
|
408
|
+
if isinstance(token_data, dict) and 'token' in token_data:
|
|
409
|
+
return token_data['token']
|
|
410
|
+
else:
|
|
411
|
+
raise HttpClientException("Invalid anonymous token response format")
|
|
412
|
+
|
|
413
|
+
except httpx.HTTPStatusError as e:
|
|
414
|
+
raise HttpClientException(f"Failed to obtain anonymous token: HTTP {e.response.status_code}",
|
|
415
|
+
response=e.response, cause=e) from None
|
|
416
|
+
except httpx.RequestError as e:
|
|
417
|
+
raise HttpClientException(f"Failed to obtain anonymous token: {str(e)}",
|
|
418
|
+
response=None, cause=e) from None
|
|
419
|
+
finally:
|
|
420
|
+
temp_client.close()
|
|
421
|
+
|
|
247
422
|
@classmethod
|
|
248
423
|
def _resolve_token(cls, token: Optional[str]) -> Optional[str]:
|
|
424
|
+
"""
|
|
425
|
+
Resolve token from argument or environment variable.
|
|
426
|
+
Returns None if no token is found (allowing fallback to anonymous token).
|
|
427
|
+
"""
|
|
249
428
|
resolved_token = token.strip() if token and token.strip() else None
|
|
250
429
|
if resolved_token is None:
|
|
251
430
|
env_token = os.getenv("PDFDANCER_TOKEN")
|
|
252
431
|
resolved_token = env_token.strip() if env_token and env_token.strip() else None
|
|
253
|
-
|
|
254
|
-
if resolved_token is None:
|
|
255
|
-
raise ValidationException(
|
|
256
|
-
"Missing PDFDancer API token. Pass a token via the `token` argument "
|
|
257
|
-
"or set the PDFDANCER_TOKEN environment variable."
|
|
258
|
-
)
|
|
259
432
|
return resolved_token
|
|
260
433
|
|
|
261
434
|
@classmethod
|
|
@@ -269,8 +442,14 @@ class PDFDancer:
|
|
|
269
442
|
"""
|
|
270
443
|
Create a new blank PDF document with optional configuration.
|
|
271
444
|
|
|
445
|
+
Authentication:
|
|
446
|
+
- If token is provided, uses it
|
|
447
|
+
- Otherwise, checks PDFDANCER_TOKEN environment variable
|
|
448
|
+
- If no token is found, automatically obtains an anonymous token
|
|
449
|
+
|
|
272
450
|
Args:
|
|
273
|
-
token: Override for the API token; falls back to `PDFDANCER_TOKEN` environment variable
|
|
451
|
+
token: Override for the API token; falls back to `PDFDANCER_TOKEN` environment variable,
|
|
452
|
+
then to anonymous token if not set.
|
|
274
453
|
base_url: Override for the API base URL; falls back to `PDFDANCER_BASE_URL`
|
|
275
454
|
or defaults to `https://api.pdfdancer.com`.
|
|
276
455
|
timeout: HTTP read timeout in seconds.
|
|
@@ -285,6 +464,10 @@ class PDFDancer:
|
|
|
285
464
|
resolved_token = cls._resolve_token(token)
|
|
286
465
|
resolved_base_url = cls._resolve_base_url(base_url)
|
|
287
466
|
|
|
467
|
+
# If no token found, obtain anonymous token
|
|
468
|
+
if resolved_token is None:
|
|
469
|
+
resolved_token = cls._obtain_anonymous_token(resolved_base_url, timeout)
|
|
470
|
+
|
|
288
471
|
# Create a new instance that will call _create_blank_pdf_session
|
|
289
472
|
instance = object.__new__(cls)
|
|
290
473
|
|
|
@@ -296,11 +479,12 @@ class PDFDancer:
|
|
|
296
479
|
instance._base_url = resolved_base_url.rstrip('/')
|
|
297
480
|
instance._read_timeout = timeout
|
|
298
481
|
|
|
299
|
-
# Create HTTP
|
|
300
|
-
instance.
|
|
301
|
-
|
|
302
|
-
'Authorization': f'Bearer {instance._token}'
|
|
303
|
-
|
|
482
|
+
# Create HTTP client for connection reuse with HTTP/2 support
|
|
483
|
+
instance._client = httpx.Client(
|
|
484
|
+
http2=True,
|
|
485
|
+
headers={'Authorization': f'Bearer {instance._token}'},
|
|
486
|
+
verify=not DISABLE_SSL_VERIFY
|
|
487
|
+
)
|
|
304
488
|
|
|
305
489
|
# Create blank PDF session
|
|
306
490
|
instance._session_id = instance._create_blank_pdf_session(
|
|
@@ -312,6 +496,10 @@ class PDFDancer:
|
|
|
312
496
|
# Set pdf_bytes to None since we don't have the PDF bytes yet
|
|
313
497
|
instance._pdf_bytes = None
|
|
314
498
|
|
|
499
|
+
# Initialize snapshot caches (lazy-loaded)
|
|
500
|
+
instance._document_snapshot = None
|
|
501
|
+
instance._page_snapshots = {}
|
|
502
|
+
|
|
315
503
|
return instance
|
|
316
504
|
|
|
317
505
|
def __init__(self, token: str, pdf_data: Union[bytes, Path, str, BinaryIO],
|
|
@@ -343,15 +531,20 @@ class PDFDancer:
|
|
|
343
531
|
# Process PDF data with validation
|
|
344
532
|
self._pdf_bytes = self._process_pdf_data(pdf_data)
|
|
345
533
|
|
|
346
|
-
# Create HTTP
|
|
347
|
-
self.
|
|
348
|
-
|
|
349
|
-
'Authorization': f'Bearer {self._token}'
|
|
350
|
-
|
|
534
|
+
# Create HTTP client for connection reuse with HTTP/2 support
|
|
535
|
+
self._client = httpx.Client(
|
|
536
|
+
http2=True,
|
|
537
|
+
headers={'Authorization': f'Bearer {self._token}'},
|
|
538
|
+
verify=not DISABLE_SSL_VERIFY
|
|
539
|
+
)
|
|
351
540
|
|
|
352
541
|
# Create session - equivalent to Java constructor behavior
|
|
353
542
|
self._session_id = self._create_session()
|
|
354
543
|
|
|
544
|
+
# Initialize snapshot caches (lazy-loaded)
|
|
545
|
+
self._document_snapshot: Optional[DocumentSnapshot] = None
|
|
546
|
+
self._page_snapshots: dict[int, PageSnapshot] = {}
|
|
547
|
+
|
|
355
548
|
@staticmethod
|
|
356
549
|
def _process_pdf_data(pdf_data: Union[bytes, Path, str, BinaryIO]) -> bytes:
|
|
357
550
|
"""
|
|
@@ -393,7 +586,7 @@ class PDFDancer:
|
|
|
393
586
|
except (IOError, OSError) as e:
|
|
394
587
|
raise PdfDancerException(f"Failed to read PDF data: {e}", cause=e)
|
|
395
588
|
|
|
396
|
-
def _extract_error_message(self, response: Optional[
|
|
589
|
+
def _extract_error_message(self, response: Optional[httpx.Response]) -> str:
|
|
397
590
|
"""
|
|
398
591
|
Extract meaningful error messages from API response.
|
|
399
592
|
Parses JSON error responses with _embedded.errors structure.
|
|
@@ -429,7 +622,7 @@ class PDFDancer:
|
|
|
429
622
|
# If JSON parsing fails, return response content or status
|
|
430
623
|
return response.text or f"HTTP {response.status_code}"
|
|
431
624
|
|
|
432
|
-
def _handle_authentication_error(self, response: Optional[
|
|
625
|
+
def _handle_authentication_error(self, response: Optional[httpx.Response]) -> None:
|
|
433
626
|
"""
|
|
434
627
|
Translate authentication failures into a clear, actionable validation error.
|
|
435
628
|
"""
|
|
@@ -466,16 +659,54 @@ class PDFDancer:
|
|
|
466
659
|
Creates a new PDF processing session by uploading the PDF data.
|
|
467
660
|
"""
|
|
468
661
|
try:
|
|
469
|
-
|
|
470
|
-
|
|
662
|
+
# Build multipart body manually to avoid base64 encoding and enable compression
|
|
663
|
+
# httpx by default may add Content-Transfer-Encoding: base64 which the server rejects
|
|
664
|
+
import uuid
|
|
665
|
+
|
|
666
|
+
boundary = uuid.uuid4().hex
|
|
667
|
+
|
|
668
|
+
# Build multipart body with binary (not base64) encoding
|
|
669
|
+
body_parts = []
|
|
670
|
+
body_parts.append(f'--{boundary}\r\n'.encode('utf-8'))
|
|
671
|
+
body_parts.append(b'Content-Disposition: form-data; name="pdf"; filename="document.pdf"\r\n')
|
|
672
|
+
body_parts.append(b'Content-Type: application/pdf\r\n')
|
|
673
|
+
body_parts.append(b'\r\n') # End of headers, no Content-Transfer-Encoding
|
|
674
|
+
body_parts.append(self._pdf_bytes)
|
|
675
|
+
body_parts.append(b'\r\n')
|
|
676
|
+
body_parts.append(f'--{boundary}--\r\n'.encode('utf-8'))
|
|
677
|
+
|
|
678
|
+
uncompressed_body = b''.join(body_parts)
|
|
679
|
+
|
|
680
|
+
# Compress entire request body using gzip
|
|
681
|
+
compressed_body = gzip.compress(uncompressed_body)
|
|
682
|
+
|
|
683
|
+
original_size = len(uncompressed_body)
|
|
684
|
+
compressed_size = len(compressed_body)
|
|
685
|
+
compression_ratio = (1 - compressed_size / original_size) * 100 if original_size > 0 else 0
|
|
686
|
+
|
|
687
|
+
if DEBUG:
|
|
688
|
+
print(f"{time.time()}|POST /session/create - original size: {original_size} bytes, "
|
|
689
|
+
f"compressed size: {compressed_size} bytes, "
|
|
690
|
+
f"compression: {compression_ratio:.1f}%")
|
|
691
|
+
|
|
692
|
+
headers = {
|
|
693
|
+
'X-Generated-At': _generate_timestamp(),
|
|
694
|
+
'Content-Type': f'multipart/form-data; boundary={boundary}',
|
|
695
|
+
'Content-Encoding': 'gzip'
|
|
471
696
|
}
|
|
472
697
|
|
|
473
|
-
response = self.
|
|
698
|
+
response = self._client.post(
|
|
474
699
|
self._cleanup_url_path(self._base_url, "/session/create"),
|
|
475
|
-
|
|
700
|
+
content=compressed_body,
|
|
701
|
+
headers=headers,
|
|
476
702
|
timeout=self._read_timeout if self._read_timeout > 0 else None
|
|
477
703
|
)
|
|
478
704
|
|
|
705
|
+
response_size = len(response.content)
|
|
706
|
+
if DEBUG:
|
|
707
|
+
print(f"{time.time()}|POST /session/create - response size: {response_size} bytes")
|
|
708
|
+
|
|
709
|
+
_log_generated_at_header(response, "POST", "/session/create")
|
|
479
710
|
self._handle_authentication_error(response)
|
|
480
711
|
response.raise_for_status()
|
|
481
712
|
session_id = response.text.strip()
|
|
@@ -485,11 +716,14 @@ class PDFDancer:
|
|
|
485
716
|
|
|
486
717
|
return session_id
|
|
487
718
|
|
|
488
|
-
except
|
|
489
|
-
self._handle_authentication_error(
|
|
490
|
-
error_message = self._extract_error_message(
|
|
719
|
+
except httpx.HTTPStatusError as e:
|
|
720
|
+
self._handle_authentication_error(e.response)
|
|
721
|
+
error_message = self._extract_error_message(e.response)
|
|
491
722
|
raise HttpClientException(f"Failed to create session: {error_message}",
|
|
492
|
-
response=
|
|
723
|
+
response=e.response, cause=e) from None
|
|
724
|
+
except httpx.RequestError as e:
|
|
725
|
+
raise HttpClientException(f"Failed to create session: {str(e)}",
|
|
726
|
+
response=None, cause=e) from None
|
|
493
727
|
|
|
494
728
|
def _create_blank_pdf_session(self,
|
|
495
729
|
page_size: Optional[Union[PageSize, str, Mapping[str, Any]]] = None,
|
|
@@ -538,14 +772,27 @@ class PDFDancer:
|
|
|
538
772
|
raise ValidationException(f"Initial page count must be at least 1, got {initial_page_count}")
|
|
539
773
|
request_data['initialPageCount'] = initial_page_count
|
|
540
774
|
|
|
541
|
-
|
|
542
|
-
|
|
775
|
+
request_body = json.dumps(request_data)
|
|
776
|
+
request_size = len(request_body.encode('utf-8'))
|
|
777
|
+
if DEBUG:
|
|
778
|
+
print(f"{time.time()}|POST /session/new - request size: {request_size} bytes")
|
|
779
|
+
|
|
780
|
+
headers = {
|
|
781
|
+
'Content-Type': 'application/json',
|
|
782
|
+
'X-Generated-At': _generate_timestamp()
|
|
783
|
+
}
|
|
784
|
+
response = self._client.post(
|
|
543
785
|
self._cleanup_url_path(self._base_url, "/session/new"),
|
|
544
786
|
json=request_data,
|
|
545
787
|
headers=headers,
|
|
546
788
|
timeout=self._read_timeout if self._read_timeout > 0 else None
|
|
547
789
|
)
|
|
548
790
|
|
|
791
|
+
response_size = len(response.content)
|
|
792
|
+
if DEBUG:
|
|
793
|
+
print(f"{time.time()}|POST /session/new - response size: {response_size} bytes")
|
|
794
|
+
|
|
795
|
+
_log_generated_at_header(response, "POST", "/session/new")
|
|
549
796
|
self._handle_authentication_error(response)
|
|
550
797
|
response.raise_for_status()
|
|
551
798
|
session_id = response.text.strip()
|
|
@@ -555,24 +802,36 @@ class PDFDancer:
|
|
|
555
802
|
|
|
556
803
|
return session_id
|
|
557
804
|
|
|
558
|
-
except
|
|
559
|
-
self._handle_authentication_error(
|
|
560
|
-
error_message = self._extract_error_message(
|
|
805
|
+
except httpx.HTTPStatusError as e:
|
|
806
|
+
self._handle_authentication_error(e.response)
|
|
807
|
+
error_message = self._extract_error_message(e.response)
|
|
561
808
|
raise HttpClientException(f"Failed to create blank PDF session: {error_message}",
|
|
562
|
-
response=
|
|
809
|
+
response=e.response, cause=e) from None
|
|
810
|
+
except httpx.RequestError as e:
|
|
811
|
+
raise HttpClientException(f"Failed to create blank PDF session: {str(e)}",
|
|
812
|
+
response=None, cause=e) from None
|
|
563
813
|
|
|
564
814
|
def _make_request(self, method: str, path: str, data: Optional[dict] = None,
|
|
565
|
-
params: Optional[dict] = None) ->
|
|
815
|
+
params: Optional[dict] = None) -> httpx.Response:
|
|
566
816
|
"""
|
|
567
817
|
Make HTTP request with session headers and error handling.
|
|
568
818
|
"""
|
|
569
819
|
headers = {
|
|
570
820
|
'X-Session-Id': self._session_id,
|
|
571
|
-
'Content-Type': 'application/json'
|
|
821
|
+
'Content-Type': 'application/json',
|
|
822
|
+
'X-Generated-At': _generate_timestamp(),
|
|
823
|
+
'X-Fingerprint': Fingerprint.generate()
|
|
572
824
|
}
|
|
573
825
|
|
|
574
826
|
try:
|
|
575
|
-
|
|
827
|
+
request_size = 0
|
|
828
|
+
if data is not None:
|
|
829
|
+
request_body = json.dumps(data)
|
|
830
|
+
request_size = len(request_body.encode('utf-8'))
|
|
831
|
+
if DEBUG:
|
|
832
|
+
print(f"{time.time()}|{method} {path} - request size: {request_size} bytes")
|
|
833
|
+
|
|
834
|
+
response = self._client.request(
|
|
576
835
|
method=method,
|
|
577
836
|
url=self._cleanup_url_path(self._base_url, path),
|
|
578
837
|
json=data,
|
|
@@ -581,6 +840,12 @@ class PDFDancer:
|
|
|
581
840
|
timeout=self._read_timeout if self._read_timeout > 0 else None
|
|
582
841
|
)
|
|
583
842
|
|
|
843
|
+
response_size = len(response.content)
|
|
844
|
+
if DEBUG:
|
|
845
|
+
print(f"{time.time()}|{method} {path} - response size: {response_size} bytes")
|
|
846
|
+
|
|
847
|
+
_log_generated_at_header(response, method, path)
|
|
848
|
+
|
|
584
849
|
# Handle FontNotFoundException
|
|
585
850
|
if response.status_code == 404:
|
|
586
851
|
try:
|
|
@@ -594,31 +859,46 @@ class PDFDancer:
|
|
|
594
859
|
response.raise_for_status()
|
|
595
860
|
return response
|
|
596
861
|
|
|
597
|
-
except
|
|
598
|
-
self._handle_authentication_error(
|
|
599
|
-
error_message = self._extract_error_message(
|
|
600
|
-
raise HttpClientException(f"API request failed: {error_message}", response=
|
|
862
|
+
except httpx.HTTPStatusError as e:
|
|
863
|
+
self._handle_authentication_error(e.response)
|
|
864
|
+
error_message = self._extract_error_message(e.response)
|
|
865
|
+
raise HttpClientException(f"API request failed: {error_message}", response=e.response,
|
|
866
|
+
cause=e) from None
|
|
867
|
+
except httpx.RequestError as e:
|
|
868
|
+
raise HttpClientException(f"API request failed: {str(e)}", response=None,
|
|
601
869
|
cause=e) from None
|
|
602
870
|
|
|
603
|
-
def _find(self, object_type: Optional[ObjectType] = None, position: Optional[Position] = None
|
|
871
|
+
def _find(self, object_type: Optional[ObjectType] = None, position: Optional[Position] = None,
|
|
872
|
+
tolerance: float = DEFAULT_TOLERANCE) -> List[ObjectRef]:
|
|
604
873
|
"""
|
|
605
874
|
Searches for PDF objects matching the specified criteria.
|
|
606
|
-
|
|
607
|
-
allowing filtering by object type and position constraints.
|
|
875
|
+
Uses snapshot cache for all queries except paths at specific coordinates.
|
|
608
876
|
|
|
609
877
|
Args:
|
|
610
878
|
object_type: The type of objects to find (None for all types)
|
|
611
879
|
position: Positional constraints for the search (None for all positions)
|
|
880
|
+
tolerance: Tolerance in points for spatial matching (default: DEFAULT_TOLERANCE)
|
|
612
881
|
|
|
613
882
|
Returns:
|
|
614
883
|
List of object references matching the search criteria
|
|
615
884
|
"""
|
|
616
|
-
|
|
617
|
-
|
|
618
|
-
|
|
619
|
-
|
|
620
|
-
|
|
621
|
-
|
|
885
|
+
# Special case: PATH queries with bounding_rect need API (full vector data)
|
|
886
|
+
if object_type == ObjectType.PATH and position and position.bounding_rect:
|
|
887
|
+
request_data = FindRequest(object_type, position).to_dict()
|
|
888
|
+
response = self._make_request('POST', '/pdf/find', data=request_data)
|
|
889
|
+
objects_data = response.json()
|
|
890
|
+
return [self._parse_object_ref(obj_data) for obj_data in objects_data]
|
|
891
|
+
|
|
892
|
+
# Use snapshot for all other queries
|
|
893
|
+
if position and position.page_index is not None:
|
|
894
|
+
snapshot = self._get_or_fetch_page_snapshot(position.page_index)
|
|
895
|
+
return self._filter_snapshot_elements(snapshot.elements, object_type, position, tolerance)
|
|
896
|
+
else:
|
|
897
|
+
snapshot = self._get_or_fetch_document_snapshot()
|
|
898
|
+
all_elements = []
|
|
899
|
+
for page_snap in snapshot.pages:
|
|
900
|
+
all_elements.extend(page_snap.elements)
|
|
901
|
+
return self._filter_snapshot_elements(all_elements, object_type, position, tolerance)
|
|
622
902
|
|
|
623
903
|
def select_paragraphs(self) -> List[TextObjectRef]:
|
|
624
904
|
"""
|
|
@@ -626,21 +906,39 @@ class PDFDancer:
|
|
|
626
906
|
"""
|
|
627
907
|
return self._find_paragraphs(None)
|
|
628
908
|
|
|
629
|
-
def _find_paragraphs(self, position: Optional[Position] = None) -> List[
|
|
909
|
+
def _find_paragraphs(self, position: Optional[Position] = None, tolerance: float = DEFAULT_TOLERANCE) -> List[
|
|
910
|
+
TextObjectRef]:
|
|
630
911
|
"""
|
|
631
912
|
Searches for paragraph objects returning TextObjectRef with hierarchical structure.
|
|
913
|
+
Uses snapshot cache for all queries.
|
|
632
914
|
"""
|
|
633
|
-
|
|
634
|
-
|
|
635
|
-
|
|
636
|
-
|
|
637
|
-
|
|
915
|
+
# Use snapshot for all queries (including spatial)
|
|
916
|
+
if position and position.page_index is not None:
|
|
917
|
+
snapshot = self._get_or_fetch_page_snapshot(position.page_index)
|
|
918
|
+
return self._filter_snapshot_elements(snapshot.elements, ObjectType.PARAGRAPH, position, tolerance)
|
|
919
|
+
else:
|
|
920
|
+
snapshot = self._get_or_fetch_document_snapshot()
|
|
921
|
+
all_elements = []
|
|
922
|
+
for page_snap in snapshot.pages:
|
|
923
|
+
all_elements.extend(page_snap.elements)
|
|
924
|
+
return self._filter_snapshot_elements(all_elements, ObjectType.PARAGRAPH, position, tolerance)
|
|
638
925
|
|
|
639
|
-
def _find_images(self, position: Optional[Position] = None) -> List[
|
|
926
|
+
def _find_images(self, position: Optional[Position] = None, tolerance: float = DEFAULT_TOLERANCE) -> List[
|
|
927
|
+
ObjectRef]:
|
|
640
928
|
"""
|
|
641
929
|
Searches for image objects at the specified position.
|
|
930
|
+
Uses snapshot cache for all queries.
|
|
642
931
|
"""
|
|
643
|
-
|
|
932
|
+
# Use snapshot for all queries (including spatial)
|
|
933
|
+
if position and position.page_index is not None:
|
|
934
|
+
snapshot = self._get_or_fetch_page_snapshot(position.page_index)
|
|
935
|
+
return self._filter_snapshot_elements(snapshot.elements, ObjectType.IMAGE, position, tolerance)
|
|
936
|
+
else:
|
|
937
|
+
snapshot = self._get_or_fetch_document_snapshot()
|
|
938
|
+
all_elements = []
|
|
939
|
+
for page_snap in snapshot.pages:
|
|
940
|
+
all_elements.extend(page_snap.elements)
|
|
941
|
+
return self._filter_snapshot_elements(all_elements, ObjectType.IMAGE, position, tolerance)
|
|
644
942
|
|
|
645
943
|
def select_images(self) -> List[ImageObject]:
|
|
646
944
|
"""
|
|
@@ -654,11 +952,22 @@ class PDFDancer:
|
|
|
654
952
|
"""
|
|
655
953
|
return self._to_form_objects(self._find(ObjectType.FORM_X_OBJECT, None))
|
|
656
954
|
|
|
657
|
-
def _find_form_x_objects(self, position: Optional[Position] = None) -> List[
|
|
955
|
+
def _find_form_x_objects(self, position: Optional[Position] = None, tolerance: float = DEFAULT_TOLERANCE) -> List[
|
|
956
|
+
ObjectRef]:
|
|
658
957
|
"""
|
|
659
|
-
Searches for form
|
|
958
|
+
Searches for form X objects at the specified position.
|
|
959
|
+
Uses snapshot cache for all queries.
|
|
660
960
|
"""
|
|
661
|
-
|
|
961
|
+
# Use snapshot for all queries (including spatial)
|
|
962
|
+
if position and position.page_index is not None:
|
|
963
|
+
snapshot = self._get_or_fetch_page_snapshot(position.page_index)
|
|
964
|
+
return self._filter_snapshot_elements(snapshot.elements, ObjectType.FORM_X_OBJECT, position, tolerance)
|
|
965
|
+
else:
|
|
966
|
+
snapshot = self._get_or_fetch_document_snapshot()
|
|
967
|
+
all_elements = []
|
|
968
|
+
for page_snap in snapshot.pages:
|
|
969
|
+
all_elements.extend(page_snap.elements)
|
|
970
|
+
return self._filter_snapshot_elements(all_elements, ObjectType.FORM_X_OBJECT, position, tolerance)
|
|
662
971
|
|
|
663
972
|
def select_form_fields(self) -> List[FormFieldObject]:
|
|
664
973
|
"""
|
|
@@ -672,17 +981,23 @@ class PDFDancer:
|
|
|
672
981
|
"""
|
|
673
982
|
return self._to_form_field_objects(self._find_form_fields(Position.by_name(field_name)))
|
|
674
983
|
|
|
675
|
-
def _find_form_fields(self, position: Optional[Position] = None) -> List[
|
|
984
|
+
def _find_form_fields(self, position: Optional[Position] = None, tolerance: float = DEFAULT_TOLERANCE) -> List[
|
|
985
|
+
FormFieldRef]:
|
|
676
986
|
"""
|
|
677
987
|
Searches for form fields at the specified position.
|
|
678
988
|
Returns FormFieldRef objects with name and value properties.
|
|
989
|
+
Uses snapshot cache for all queries (including name and spatial filtering).
|
|
679
990
|
"""
|
|
680
|
-
|
|
681
|
-
|
|
682
|
-
|
|
683
|
-
|
|
684
|
-
|
|
685
|
-
|
|
991
|
+
# Use snapshot for all queries (including name and spatial)
|
|
992
|
+
if position and position.page_index is not None:
|
|
993
|
+
snapshot = self._get_or_fetch_page_snapshot(position.page_index)
|
|
994
|
+
return self._filter_snapshot_elements(snapshot.elements, ObjectType.FORM_FIELD, position, tolerance)
|
|
995
|
+
else:
|
|
996
|
+
snapshot = self._get_or_fetch_document_snapshot()
|
|
997
|
+
all_elements = []
|
|
998
|
+
for page_snap in snapshot.pages:
|
|
999
|
+
all_elements.extend(page_snap.elements)
|
|
1000
|
+
return self._filter_snapshot_elements(all_elements, ObjectType.FORM_FIELD, position, tolerance)
|
|
686
1001
|
|
|
687
1002
|
def _change_form_field(self, form_field_ref: FormFieldRef, new_value: str) -> bool:
|
|
688
1003
|
"""
|
|
@@ -691,9 +1006,12 @@ class PDFDancer:
|
|
|
691
1006
|
if form_field_ref is None:
|
|
692
1007
|
raise ValidationException("Form field reference cannot be null")
|
|
693
1008
|
|
|
694
|
-
|
|
695
|
-
|
|
696
|
-
|
|
1009
|
+
try:
|
|
1010
|
+
request_data = ChangeFormFieldRequest(form_field_ref, new_value).to_dict()
|
|
1011
|
+
response = self._make_request('PUT', '/pdf/modify/formField', data=request_data)
|
|
1012
|
+
return response.json()
|
|
1013
|
+
finally:
|
|
1014
|
+
self._invalidate_snapshots()
|
|
697
1015
|
|
|
698
1016
|
def select_paths(self) -> List[ObjectRef]:
|
|
699
1017
|
"""
|
|
@@ -701,21 +1019,45 @@ class PDFDancer:
|
|
|
701
1019
|
"""
|
|
702
1020
|
return self._find(ObjectType.PATH, None)
|
|
703
1021
|
|
|
704
|
-
def _find_paths(self, position: Optional[Position] = None) -> List[ObjectRef]:
|
|
1022
|
+
def _find_paths(self, position: Optional[Position] = None, tolerance: float = DEFAULT_TOLERANCE) -> List[ObjectRef]:
|
|
705
1023
|
"""
|
|
706
1024
|
Searches for vector path objects at the specified position.
|
|
707
|
-
|
|
708
|
-
|
|
1025
|
+
Note: Spatial queries (with bounding_rect) fall back to API since snapshots
|
|
1026
|
+
don't include full vector path data needed for precise intersection tests.
|
|
1027
|
+
"""
|
|
1028
|
+
# Special case: paths at specific coordinates need full vector data
|
|
1029
|
+
# which is not available in snapshots, so pass through to API
|
|
1030
|
+
if position and position.bounding_rect:
|
|
1031
|
+
return self._find(ObjectType.PATH, position, tolerance)
|
|
1032
|
+
|
|
1033
|
+
# For simple page-level "all paths" queries, use snapshot
|
|
1034
|
+
if position and position.page_index is not None:
|
|
1035
|
+
snapshot = self._get_or_fetch_page_snapshot(position.page_index)
|
|
1036
|
+
return self._filter_snapshot_elements(snapshot.elements, ObjectType.PATH, position, tolerance)
|
|
1037
|
+
else:
|
|
1038
|
+
# Document-level query - use document snapshot
|
|
1039
|
+
snapshot = self._get_or_fetch_document_snapshot()
|
|
1040
|
+
all_elements = []
|
|
1041
|
+
for page_snap in snapshot.pages:
|
|
1042
|
+
all_elements.extend(page_snap.elements)
|
|
1043
|
+
return self._filter_snapshot_elements(all_elements, ObjectType.PATH, position, tolerance)
|
|
709
1044
|
|
|
710
|
-
def _find_text_lines(self, position: Optional[Position] = None) -> List[
|
|
1045
|
+
def _find_text_lines(self, position: Optional[Position] = None, tolerance: float = DEFAULT_TOLERANCE) -> List[
|
|
1046
|
+
TextObjectRef]:
|
|
711
1047
|
"""
|
|
712
1048
|
Searches for text line objects returning TextObjectRef with hierarchical structure.
|
|
1049
|
+
Uses snapshot cache for all queries.
|
|
713
1050
|
"""
|
|
714
|
-
|
|
715
|
-
|
|
716
|
-
|
|
717
|
-
|
|
718
|
-
|
|
1051
|
+
# Use snapshot for all queries (including spatial)
|
|
1052
|
+
if position and position.page_index is not None:
|
|
1053
|
+
snapshot = self._get_or_fetch_page_snapshot(position.page_index)
|
|
1054
|
+
return self._filter_snapshot_elements(snapshot.elements, ObjectType.TEXT_LINE, position, tolerance)
|
|
1055
|
+
else:
|
|
1056
|
+
snapshot = self._get_or_fetch_document_snapshot()
|
|
1057
|
+
all_elements = []
|
|
1058
|
+
for page_snap in snapshot.pages:
|
|
1059
|
+
all_elements.extend(page_snap.elements)
|
|
1060
|
+
return self._filter_snapshot_elements(all_elements, ObjectType.TEXT_LINE, position, tolerance)
|
|
719
1061
|
|
|
720
1062
|
def select_text_lines(self) -> List[TextLineObject]:
|
|
721
1063
|
"""
|
|
@@ -725,7 +1067,7 @@ class PDFDancer:
|
|
|
725
1067
|
|
|
726
1068
|
def page(self, page_index: int) -> PageClient:
|
|
727
1069
|
"""
|
|
728
|
-
Get a specific page by index,
|
|
1070
|
+
Get a specific page by index, using snapshot cache when available.
|
|
729
1071
|
|
|
730
1072
|
Args:
|
|
731
1073
|
page_index: The 0-based page index
|
|
@@ -733,11 +1075,16 @@ class PDFDancer:
|
|
|
733
1075
|
Returns:
|
|
734
1076
|
PageClient with page properties populated
|
|
735
1077
|
"""
|
|
1078
|
+
# Try to get page ref from snapshot first (avoids API call)
|
|
1079
|
+
page_snapshot = self._get_or_fetch_page_snapshot(page_index)
|
|
1080
|
+
if page_snapshot and page_snapshot.page_ref:
|
|
1081
|
+
return PageClient.from_ref(self, page_snapshot.page_ref)
|
|
1082
|
+
|
|
1083
|
+
# Fallback to API if snapshot doesn't have page ref
|
|
736
1084
|
page_ref = self._get_page(page_index)
|
|
737
1085
|
if page_ref:
|
|
738
1086
|
return PageClient.from_ref(self, page_ref)
|
|
739
1087
|
else:
|
|
740
|
-
# Fallback to basic PageClient if page not found
|
|
741
1088
|
return PageClient(page_index, self)
|
|
742
1089
|
|
|
743
1090
|
# Page Operations
|
|
@@ -747,11 +1094,11 @@ class PDFDancer:
|
|
|
747
1094
|
|
|
748
1095
|
def _get_pages(self) -> List[PageRef]:
|
|
749
1096
|
"""
|
|
750
|
-
Retrieves references to all pages in the PDF document.
|
|
1097
|
+
Retrieves references to all pages in the PDF document using snapshot cache.
|
|
751
1098
|
"""
|
|
752
|
-
|
|
753
|
-
|
|
754
|
-
return [
|
|
1099
|
+
# Use document snapshot which includes all pages (avoids API call)
|
|
1100
|
+
doc_snapshot = self._get_or_fetch_document_snapshot()
|
|
1101
|
+
return [page_snap.page_ref for page_snap in doc_snapshot.pages]
|
|
755
1102
|
|
|
756
1103
|
def _get_page(self, page_index: int) -> Optional[PageRef]:
|
|
757
1104
|
"""
|
|
@@ -791,7 +1138,13 @@ class PDFDancer:
|
|
|
791
1138
|
request_data = page_ref.to_dict()
|
|
792
1139
|
|
|
793
1140
|
response = self._make_request('DELETE', '/pdf/page/delete', data=request_data)
|
|
794
|
-
|
|
1141
|
+
result = response.json()
|
|
1142
|
+
|
|
1143
|
+
# Invalidate snapshot caches after mutation
|
|
1144
|
+
if result:
|
|
1145
|
+
self._invalidate_snapshots()
|
|
1146
|
+
|
|
1147
|
+
return result
|
|
795
1148
|
|
|
796
1149
|
def move_page(self, from_page_index: int, to_page_index: int) -> bool:
|
|
797
1150
|
"""Move a page to a different index within the document."""
|
|
@@ -810,6 +1163,11 @@ class PDFDancer:
|
|
|
810
1163
|
request_data = PageMoveRequest(from_page_index, to_page_index).to_dict()
|
|
811
1164
|
response = self._make_request('PUT', '/pdf/page/move', data=request_data)
|
|
812
1165
|
result = response.json()
|
|
1166
|
+
|
|
1167
|
+
# Invalidate snapshot caches after mutation
|
|
1168
|
+
if result:
|
|
1169
|
+
self._invalidate_snapshots()
|
|
1170
|
+
|
|
813
1171
|
return bool(result)
|
|
814
1172
|
|
|
815
1173
|
# Manipulation Operations
|
|
@@ -829,7 +1187,13 @@ class PDFDancer:
|
|
|
829
1187
|
|
|
830
1188
|
request_data = DeleteRequest(object_ref).to_dict()
|
|
831
1189
|
response = self._make_request('DELETE', '/pdf/delete', data=request_data)
|
|
832
|
-
|
|
1190
|
+
result = response.json()
|
|
1191
|
+
|
|
1192
|
+
# Invalidate snapshot caches after mutation
|
|
1193
|
+
if result:
|
|
1194
|
+
self._invalidate_snapshots()
|
|
1195
|
+
|
|
1196
|
+
return result
|
|
833
1197
|
|
|
834
1198
|
def _move(self, object_ref: ObjectRef, position: Position) -> bool:
|
|
835
1199
|
"""
|
|
@@ -849,7 +1213,13 @@ class PDFDancer:
|
|
|
849
1213
|
|
|
850
1214
|
request_data = MoveRequest(object_ref, position).to_dict()
|
|
851
1215
|
response = self._make_request('PUT', '/pdf/move', data=request_data)
|
|
852
|
-
|
|
1216
|
+
result = response.json()
|
|
1217
|
+
|
|
1218
|
+
# Invalidate snapshot caches after mutation
|
|
1219
|
+
if result:
|
|
1220
|
+
self._invalidate_snapshots()
|
|
1221
|
+
|
|
1222
|
+
return result
|
|
853
1223
|
|
|
854
1224
|
# Add Operations
|
|
855
1225
|
|
|
@@ -896,24 +1266,58 @@ class PDFDancer:
|
|
|
896
1266
|
|
|
897
1267
|
return self._add_object(paragraph)
|
|
898
1268
|
|
|
1269
|
+
def _add_path(self, path: 'Path') -> bool:
|
|
1270
|
+
"""
|
|
1271
|
+
Internal method to add a path to the document after validation.
|
|
1272
|
+
"""
|
|
1273
|
+
from .models import Path as PathModel
|
|
1274
|
+
|
|
1275
|
+
if path is None:
|
|
1276
|
+
raise ValidationException("Path cannot be null")
|
|
1277
|
+
if path.get_position() is None:
|
|
1278
|
+
raise ValidationException("Path position is null")
|
|
1279
|
+
if path.get_position().page_index is None:
|
|
1280
|
+
raise ValidationException("Path position page index is null")
|
|
1281
|
+
if path.get_position().page_index < 0:
|
|
1282
|
+
raise ValidationException("Path position page index is less than 0")
|
|
1283
|
+
if not path.get_path_segments() or len(path.get_path_segments()) == 0:
|
|
1284
|
+
raise ValidationException("Path must have at least one segment")
|
|
1285
|
+
|
|
1286
|
+
return self._add_object(path)
|
|
1287
|
+
|
|
899
1288
|
def _add_object(self, pdf_object) -> bool:
|
|
900
1289
|
"""
|
|
901
1290
|
Internal method to add any PDF object.
|
|
902
1291
|
"""
|
|
903
1292
|
request_data = AddRequest(pdf_object).to_dict()
|
|
904
1293
|
response = self._make_request('POST', '/pdf/add', data=request_data)
|
|
905
|
-
|
|
1294
|
+
result = response.json()
|
|
1295
|
+
|
|
1296
|
+
# Invalidate snapshot caches after mutation
|
|
1297
|
+
if result:
|
|
1298
|
+
self._invalidate_snapshots()
|
|
1299
|
+
|
|
1300
|
+
return result
|
|
906
1301
|
|
|
907
1302
|
def new_paragraph(self) -> ParagraphBuilder:
|
|
908
1303
|
return ParagraphBuilder(self)
|
|
909
1304
|
|
|
910
1305
|
def new_page(self):
|
|
911
1306
|
response = self._make_request('POST', '/pdf/page/add', data=None)
|
|
912
|
-
|
|
1307
|
+
result = self._parse_page_ref(response.json())
|
|
1308
|
+
|
|
1309
|
+
# Invalidate snapshot caches after adding page
|
|
1310
|
+
self._invalidate_snapshots()
|
|
1311
|
+
|
|
1312
|
+
return result
|
|
913
1313
|
|
|
914
1314
|
def new_image(self) -> ImageBuilder:
|
|
915
1315
|
return ImageBuilder(self)
|
|
916
1316
|
|
|
1317
|
+
def new_path(self) -> 'PathBuilder':
|
|
1318
|
+
from .path_builder import PathBuilder
|
|
1319
|
+
return PathBuilder(self)
|
|
1320
|
+
|
|
917
1321
|
# Modify Operations
|
|
918
1322
|
def _modify_paragraph(self, object_ref: ObjectRef, new_paragraph: Union[Paragraph, str]) -> CommandResult:
|
|
919
1323
|
"""
|
|
@@ -935,12 +1339,16 @@ class PDFDancer:
|
|
|
935
1339
|
# Text modification - returns CommandResult
|
|
936
1340
|
request_data = ModifyTextRequest(object_ref, new_paragraph).to_dict()
|
|
937
1341
|
response = self._make_request('PUT', '/pdf/text/paragraph', data=request_data)
|
|
938
|
-
|
|
1342
|
+
result = CommandResult.from_dict(response.json())
|
|
939
1343
|
else:
|
|
940
1344
|
# Object modification
|
|
941
1345
|
request_data = ModifyRequest(object_ref, new_paragraph).to_dict()
|
|
942
1346
|
response = self._make_request('PUT', '/pdf/modify', data=request_data)
|
|
943
|
-
|
|
1347
|
+
result = CommandResult.from_dict(response.json())
|
|
1348
|
+
|
|
1349
|
+
# Invalidate snapshot caches after mutation
|
|
1350
|
+
self._invalidate_snapshots()
|
|
1351
|
+
return result
|
|
944
1352
|
|
|
945
1353
|
def _modify_text_line(self, object_ref: ObjectRef, new_text: str) -> CommandResult:
|
|
946
1354
|
"""
|
|
@@ -960,7 +1368,11 @@ class PDFDancer:
|
|
|
960
1368
|
|
|
961
1369
|
request_data = ModifyTextRequest(object_ref, new_text).to_dict()
|
|
962
1370
|
response = self._make_request('PUT', '/pdf/text/line', data=request_data)
|
|
963
|
-
|
|
1371
|
+
result = CommandResult.from_dict(response.json())
|
|
1372
|
+
|
|
1373
|
+
# Invalidate snapshot caches after mutation
|
|
1374
|
+
self._invalidate_snapshots()
|
|
1375
|
+
return result
|
|
964
1376
|
|
|
965
1377
|
# Font Operations
|
|
966
1378
|
|
|
@@ -1040,26 +1452,224 @@ class PDFDancer:
|
|
|
1040
1452
|
'ttfFile': (filename, font_data, 'font/ttf')
|
|
1041
1453
|
}
|
|
1042
1454
|
|
|
1043
|
-
|
|
1044
|
-
|
|
1455
|
+
request_size = len(font_data)
|
|
1456
|
+
if DEBUG:
|
|
1457
|
+
print(f"{time.time()}|POST /font/register - request size: {request_size} bytes")
|
|
1458
|
+
|
|
1459
|
+
headers = {
|
|
1460
|
+
'X-Session-Id': self._session_id,
|
|
1461
|
+
'X-Generated-At': _generate_timestamp()
|
|
1462
|
+
}
|
|
1463
|
+
response = self._client.post(
|
|
1045
1464
|
self._cleanup_url_path(self._base_url, "/font/register"),
|
|
1046
1465
|
files=files,
|
|
1047
1466
|
headers=headers,
|
|
1048
1467
|
timeout=30
|
|
1049
1468
|
)
|
|
1050
1469
|
|
|
1470
|
+
response_size = len(response.content)
|
|
1471
|
+
if DEBUG:
|
|
1472
|
+
print(f"{time.time()}|POST /font/register - response size: {response_size} bytes")
|
|
1473
|
+
|
|
1474
|
+
_log_generated_at_header(response, "POST", "/font/register")
|
|
1051
1475
|
response.raise_for_status()
|
|
1052
1476
|
return response.text.strip()
|
|
1053
1477
|
|
|
1054
1478
|
except (IOError, OSError) as e:
|
|
1055
1479
|
raise PdfDancerException(f"Failed to read font file: {e}", cause=e)
|
|
1056
|
-
except
|
|
1057
|
-
error_message = self._extract_error_message(
|
|
1480
|
+
except httpx.HTTPStatusError as e:
|
|
1481
|
+
error_message = self._extract_error_message(e.response)
|
|
1058
1482
|
raise HttpClientException(f"Font registration failed: {error_message}",
|
|
1059
|
-
response=
|
|
1483
|
+
response=e.response, cause=e) from None
|
|
1484
|
+
except httpx.RequestError as e:
|
|
1485
|
+
raise HttpClientException(f"Font registration failed: {str(e)}",
|
|
1486
|
+
response=None, cause=e) from None
|
|
1060
1487
|
|
|
1061
1488
|
# Document Operations
|
|
1062
1489
|
|
|
1490
|
+
# Snapshot Operations
|
|
1491
|
+
|
|
1492
|
+
def get_document_snapshot(self, types: Optional[str] = None) -> DocumentSnapshot:
|
|
1493
|
+
"""
|
|
1494
|
+
Retrieve a snapshot of the entire document with all pages and elements.
|
|
1495
|
+
|
|
1496
|
+
Args:
|
|
1497
|
+
types: Optional comma-separated string of object types to filter (e.g., "PARAGRAPH,IMAGE")
|
|
1498
|
+
|
|
1499
|
+
Returns:
|
|
1500
|
+
DocumentSnapshot containing page count, fonts, and all page snapshots
|
|
1501
|
+
"""
|
|
1502
|
+
params = {}
|
|
1503
|
+
if types:
|
|
1504
|
+
params['types'] = types
|
|
1505
|
+
|
|
1506
|
+
response = self._make_request('GET', '/pdf/document/snapshot', params=params)
|
|
1507
|
+
data = response.json()
|
|
1508
|
+
|
|
1509
|
+
return self._parse_document_snapshot(data)
|
|
1510
|
+
|
|
1511
|
+
def get_page_snapshot(self, page_index: int, types: Optional[str] = None) -> PageSnapshot:
|
|
1512
|
+
"""
|
|
1513
|
+
Retrieve a snapshot of a specific page with all its elements.
|
|
1514
|
+
|
|
1515
|
+
Args:
|
|
1516
|
+
page_index: The index of the page to snapshot (0-based)
|
|
1517
|
+
types: Optional comma-separated string of object types to filter (e.g., "PARAGRAPH,IMAGE")
|
|
1518
|
+
|
|
1519
|
+
Returns:
|
|
1520
|
+
PageSnapshot containing page reference and all elements on that page
|
|
1521
|
+
"""
|
|
1522
|
+
if page_index < 0:
|
|
1523
|
+
raise ValidationException(f"Page index must be >= 0, got {page_index}")
|
|
1524
|
+
|
|
1525
|
+
params = {}
|
|
1526
|
+
if types:
|
|
1527
|
+
params['types'] = types
|
|
1528
|
+
|
|
1529
|
+
response = self._make_request('GET', f'/pdf/page/{page_index}/snapshot', params=params)
|
|
1530
|
+
data = response.json()
|
|
1531
|
+
|
|
1532
|
+
return self._parse_page_snapshot(data)
|
|
1533
|
+
|
|
1534
|
+
def _get_or_fetch_document_snapshot(self) -> DocumentSnapshot:
|
|
1535
|
+
"""
|
|
1536
|
+
Get document snapshot from cache or fetch if not cached.
|
|
1537
|
+
This is used internally by select_* methods for optimization.
|
|
1538
|
+
Also caches individual page snapshots from the document snapshot.
|
|
1539
|
+
"""
|
|
1540
|
+
if self._document_snapshot is None:
|
|
1541
|
+
self._document_snapshot = self.get_document_snapshot()
|
|
1542
|
+
# Cache individual page snapshots from document snapshot
|
|
1543
|
+
for i, page_snapshot in enumerate(self._document_snapshot.pages):
|
|
1544
|
+
if i not in self._page_snapshots:
|
|
1545
|
+
self._page_snapshots[i] = page_snapshot
|
|
1546
|
+
return self._document_snapshot
|
|
1547
|
+
|
|
1548
|
+
def _get_or_fetch_page_snapshot(self, page_index: int) -> PageSnapshot:
|
|
1549
|
+
"""
|
|
1550
|
+
Get page snapshot from cache or fetch if not cached.
|
|
1551
|
+
This is used internally by select_* methods for optimization.
|
|
1552
|
+
If document snapshot exists, uses page from it instead of making separate API call.
|
|
1553
|
+
"""
|
|
1554
|
+
# Check if already cached
|
|
1555
|
+
if page_index in self._page_snapshots:
|
|
1556
|
+
return self._page_snapshots[page_index]
|
|
1557
|
+
|
|
1558
|
+
# If document snapshot exists, get page from it (no API call needed)
|
|
1559
|
+
if self._document_snapshot is not None:
|
|
1560
|
+
if 0 <= page_index < len(self._document_snapshot.pages):
|
|
1561
|
+
page_snapshot = self._document_snapshot.pages[page_index]
|
|
1562
|
+
self._page_snapshots[page_index] = page_snapshot
|
|
1563
|
+
return page_snapshot
|
|
1564
|
+
|
|
1565
|
+
# Otherwise fetch page snapshot individually
|
|
1566
|
+
self._page_snapshots[page_index] = self.get_page_snapshot(page_index)
|
|
1567
|
+
return self._page_snapshots[page_index]
|
|
1568
|
+
|
|
1569
|
+
def _invalidate_snapshots(self) -> None:
|
|
1570
|
+
"""
|
|
1571
|
+
Clear all snapshot caches.
|
|
1572
|
+
Called after mutations (delete, move, modify) to ensure fresh data on next select.
|
|
1573
|
+
"""
|
|
1574
|
+
self._document_snapshot = None
|
|
1575
|
+
self._page_snapshots.clear()
|
|
1576
|
+
|
|
1577
|
+
def _filter_snapshot_elements(self, elements: List, object_type: ObjectType,
|
|
1578
|
+
position: Optional[Position] = None, tolerance: float = DEFAULT_TOLERANCE) -> List:
|
|
1579
|
+
"""
|
|
1580
|
+
Filter snapshot elements client-side based on object type and position criteria.
|
|
1581
|
+
|
|
1582
|
+
Args:
|
|
1583
|
+
elements: List of elements from snapshot (ObjectRef, TextObjectRef, etc.)
|
|
1584
|
+
object_type: Type to filter for
|
|
1585
|
+
position: Optional position filter with text matching, bounding rect, etc.
|
|
1586
|
+
tolerance: Tolerance in points for spatial matching (default: 10.0)
|
|
1587
|
+
|
|
1588
|
+
Returns:
|
|
1589
|
+
Filtered list of elements matching the criteria
|
|
1590
|
+
"""
|
|
1591
|
+
import re
|
|
1592
|
+
|
|
1593
|
+
# Filter by object type (handle form field subtypes)
|
|
1594
|
+
if object_type == ObjectType.FORM_FIELD:
|
|
1595
|
+
# Form fields include TEXT_FIELD, CHECK_BOX, RADIO_BUTTON, BUTTON, DROPDOWN
|
|
1596
|
+
form_field_types = {ObjectType.FORM_FIELD, ObjectType.TEXT_FIELD,
|
|
1597
|
+
ObjectType.CHECK_BOX, ObjectType.RADIO_BUTTON,
|
|
1598
|
+
ObjectType.BUTTON, ObjectType.DROPDOWN}
|
|
1599
|
+
filtered = [e for e in elements if e.type in form_field_types]
|
|
1600
|
+
else:
|
|
1601
|
+
filtered = [e for e in elements if e.type == object_type]
|
|
1602
|
+
|
|
1603
|
+
if position is None:
|
|
1604
|
+
return filtered
|
|
1605
|
+
|
|
1606
|
+
# Apply position filters
|
|
1607
|
+
result = filtered
|
|
1608
|
+
|
|
1609
|
+
# Text starts with filter (case-insensitive to match API behavior)
|
|
1610
|
+
if position.text_starts_with:
|
|
1611
|
+
search_text = position.text_starts_with.lower()
|
|
1612
|
+
result = [
|
|
1613
|
+
e for e in result
|
|
1614
|
+
if isinstance(e, TextObjectRef) and e.text and e.text.lower().startswith(search_text)
|
|
1615
|
+
]
|
|
1616
|
+
|
|
1617
|
+
# Regex pattern filter
|
|
1618
|
+
if position.text_pattern:
|
|
1619
|
+
pattern = re.compile(position.text_pattern)
|
|
1620
|
+
result = [
|
|
1621
|
+
e for e in result
|
|
1622
|
+
if isinstance(e, TextObjectRef) and e.text and pattern.search(e.text)
|
|
1623
|
+
]
|
|
1624
|
+
|
|
1625
|
+
# Bounding rect filter (spatial queries like at(x, y))
|
|
1626
|
+
if position.bounding_rect:
|
|
1627
|
+
rect = position.bounding_rect
|
|
1628
|
+
result = [
|
|
1629
|
+
e for e in result
|
|
1630
|
+
if e.position and e.position.bounding_rect and
|
|
1631
|
+
self._rects_intersect(e.position.bounding_rect, rect, tolerance)
|
|
1632
|
+
]
|
|
1633
|
+
|
|
1634
|
+
# Name filter (for form fields)
|
|
1635
|
+
if position.name:
|
|
1636
|
+
from .models import FormFieldRef
|
|
1637
|
+
result = [
|
|
1638
|
+
e for e in result
|
|
1639
|
+
if isinstance(e, FormFieldRef) and e.name == position.name
|
|
1640
|
+
]
|
|
1641
|
+
|
|
1642
|
+
return result
|
|
1643
|
+
|
|
1644
|
+
@staticmethod
|
|
1645
|
+
def _rects_intersect(rect1, rect2, tolerance: float = DEFAULT_TOLERANCE) -> bool:
|
|
1646
|
+
"""
|
|
1647
|
+
Check if two bounding rectangles intersect or are very close.
|
|
1648
|
+
Handles point queries (width/height = 0) with tolerance.
|
|
1649
|
+
|
|
1650
|
+
Args:
|
|
1651
|
+
rect1: First bounding rectangle
|
|
1652
|
+
rect2: Second bounding rectangle
|
|
1653
|
+
tolerance: Tolerance in points for position matching (default: 10.0)
|
|
1654
|
+
"""
|
|
1655
|
+
# Get effective bounds with tolerance
|
|
1656
|
+
r1_left = rect1.x - tolerance
|
|
1657
|
+
r1_right = rect1.x + rect1.width + tolerance
|
|
1658
|
+
r1_top = rect1.y - tolerance
|
|
1659
|
+
r1_bottom = rect1.y + rect1.height + tolerance
|
|
1660
|
+
|
|
1661
|
+
r2_left = rect2.x - tolerance
|
|
1662
|
+
r2_right = rect2.x + rect2.width + tolerance
|
|
1663
|
+
r2_top = rect2.y - tolerance
|
|
1664
|
+
r2_bottom = rect2.y + rect2.height + tolerance
|
|
1665
|
+
|
|
1666
|
+
# Check if rectangles overlap
|
|
1667
|
+
if r1_right < r2_left or r2_right < r1_left:
|
|
1668
|
+
return False
|
|
1669
|
+
if r1_bottom < r2_top or r2_bottom < r1_top:
|
|
1670
|
+
return False
|
|
1671
|
+
return True
|
|
1672
|
+
|
|
1063
1673
|
def get_bytes(self) -> bytes:
|
|
1064
1674
|
"""
|
|
1065
1675
|
Downloads the current state of the PDF document with all modifications applied.
|
|
@@ -1250,6 +1860,175 @@ class PDFDancer:
|
|
|
1250
1860
|
orientation=orientation
|
|
1251
1861
|
)
|
|
1252
1862
|
|
|
1863
|
+
def _parse_path_segment(self, segment_data: dict) -> 'PathSegment':
|
|
1864
|
+
"""Parse JSON data into PathSegment instance (Line or Bezier)."""
|
|
1865
|
+
from .models import Line, Bezier, PathSegment, Point, Color
|
|
1866
|
+
|
|
1867
|
+
segment_type = segment_data.get('segmentType', segment_data.get('type', '')).upper()
|
|
1868
|
+
|
|
1869
|
+
# Parse common properties
|
|
1870
|
+
stroke_color = None
|
|
1871
|
+
stroke_color_data = segment_data.get('strokeColor')
|
|
1872
|
+
if isinstance(stroke_color_data, dict):
|
|
1873
|
+
r = stroke_color_data.get('red', 0)
|
|
1874
|
+
g = stroke_color_data.get('green', 0)
|
|
1875
|
+
b = stroke_color_data.get('blue', 0)
|
|
1876
|
+
a = stroke_color_data.get('alpha', 255)
|
|
1877
|
+
if all(isinstance(v, int) for v in [r, g, b]):
|
|
1878
|
+
stroke_color = Color(r, g, b, a)
|
|
1879
|
+
|
|
1880
|
+
fill_color = None
|
|
1881
|
+
fill_color_data = segment_data.get('fillColor')
|
|
1882
|
+
if isinstance(fill_color_data, dict):
|
|
1883
|
+
r = fill_color_data.get('red', 0)
|
|
1884
|
+
g = fill_color_data.get('green', 0)
|
|
1885
|
+
b = fill_color_data.get('blue', 0)
|
|
1886
|
+
a = fill_color_data.get('alpha', 255)
|
|
1887
|
+
if all(isinstance(v, int) for v in [r, g, b]):
|
|
1888
|
+
fill_color = Color(r, g, b, a)
|
|
1889
|
+
|
|
1890
|
+
stroke_width = segment_data.get('strokeWidth')
|
|
1891
|
+
dash_array = segment_data.get('dashArray')
|
|
1892
|
+
dash_phase = segment_data.get('dashPhase')
|
|
1893
|
+
|
|
1894
|
+
# Parse specific segment type
|
|
1895
|
+
if segment_type == 'LINE':
|
|
1896
|
+
p0_data = segment_data.get('p0', {})
|
|
1897
|
+
p1_data = segment_data.get('p1', {})
|
|
1898
|
+
|
|
1899
|
+
p0 = Point(p0_data.get('x', 0.0), p0_data.get('y', 0.0)) if p0_data else None
|
|
1900
|
+
p1 = Point(p1_data.get('x', 0.0), p1_data.get('y', 0.0)) if p1_data else None
|
|
1901
|
+
|
|
1902
|
+
return Line(
|
|
1903
|
+
stroke_color=stroke_color,
|
|
1904
|
+
fill_color=fill_color,
|
|
1905
|
+
stroke_width=stroke_width,
|
|
1906
|
+
dash_array=dash_array,
|
|
1907
|
+
dash_phase=dash_phase,
|
|
1908
|
+
p0=p0,
|
|
1909
|
+
p1=p1
|
|
1910
|
+
)
|
|
1911
|
+
elif segment_type == 'BEZIER':
|
|
1912
|
+
p0_data = segment_data.get('p0', {})
|
|
1913
|
+
p1_data = segment_data.get('p1', {})
|
|
1914
|
+
p2_data = segment_data.get('p2', {})
|
|
1915
|
+
p3_data = segment_data.get('p3', {})
|
|
1916
|
+
|
|
1917
|
+
p0 = Point(p0_data.get('x', 0.0), p0_data.get('y', 0.0)) if p0_data else None
|
|
1918
|
+
p1 = Point(p1_data.get('x', 0.0), p1_data.get('y', 0.0)) if p1_data else None
|
|
1919
|
+
p2 = Point(p2_data.get('x', 0.0), p2_data.get('y', 0.0)) if p2_data else None
|
|
1920
|
+
p3 = Point(p3_data.get('x', 0.0), p3_data.get('y', 0.0)) if p3_data else None
|
|
1921
|
+
|
|
1922
|
+
return Bezier(
|
|
1923
|
+
stroke_color=stroke_color,
|
|
1924
|
+
fill_color=fill_color,
|
|
1925
|
+
stroke_width=stroke_width,
|
|
1926
|
+
dash_array=dash_array,
|
|
1927
|
+
dash_phase=dash_phase,
|
|
1928
|
+
p0=p0,
|
|
1929
|
+
p1=p1,
|
|
1930
|
+
p2=p2,
|
|
1931
|
+
p3=p3
|
|
1932
|
+
)
|
|
1933
|
+
else:
|
|
1934
|
+
# Fallback to base PathSegment for unknown types
|
|
1935
|
+
return PathSegment(
|
|
1936
|
+
stroke_color=stroke_color,
|
|
1937
|
+
fill_color=fill_color,
|
|
1938
|
+
stroke_width=stroke_width,
|
|
1939
|
+
dash_array=dash_array,
|
|
1940
|
+
dash_phase=dash_phase
|
|
1941
|
+
)
|
|
1942
|
+
|
|
1943
|
+
def _parse_path(self, obj_data: dict) -> 'Path':
|
|
1944
|
+
"""Parse JSON data into Path instance with path segments."""
|
|
1945
|
+
from .models import Path
|
|
1946
|
+
|
|
1947
|
+
position_data = obj_data.get('position', {})
|
|
1948
|
+
position = self._parse_position(position_data) if position_data else None
|
|
1949
|
+
|
|
1950
|
+
# Parse path segments
|
|
1951
|
+
path_segments = []
|
|
1952
|
+
segments_data = obj_data.get('pathSegments', [])
|
|
1953
|
+
if isinstance(segments_data, list):
|
|
1954
|
+
for segment_data in segments_data:
|
|
1955
|
+
if isinstance(segment_data, dict):
|
|
1956
|
+
path_segments.append(self._parse_path_segment(segment_data))
|
|
1957
|
+
|
|
1958
|
+
even_odd_fill = obj_data.get('evenOddFill')
|
|
1959
|
+
|
|
1960
|
+
return Path(
|
|
1961
|
+
position=position,
|
|
1962
|
+
path_segments=path_segments if path_segments else None,
|
|
1963
|
+
even_odd_fill=even_odd_fill
|
|
1964
|
+
)
|
|
1965
|
+
|
|
1966
|
+
def _parse_font_recommendation(self, data: dict) -> FontRecommendation:
|
|
1967
|
+
"""Parse JSON data into FontRecommendation instance."""
|
|
1968
|
+
font_type_str = data.get('fontType', 'SYSTEM')
|
|
1969
|
+
font_type = FontType(font_type_str)
|
|
1970
|
+
|
|
1971
|
+
return FontRecommendation(
|
|
1972
|
+
font_name=data.get('fontName', ''),
|
|
1973
|
+
font_type=font_type,
|
|
1974
|
+
similarity_score=data.get('similarityScore', 0.0)
|
|
1975
|
+
)
|
|
1976
|
+
|
|
1977
|
+
def _parse_page_snapshot(self, data: dict) -> PageSnapshot:
    """Parse JSON data into a PageSnapshot instance with proper type handling.

    Each entry of ``data['elements']`` is dispatched to the parser that
    matches its ``type`` string: text containers become TextObjectRef,
    form widgets become FormFieldRef, everything else a plain ObjectRef.
    Elements with a missing or unrecognized type are silently skipped.

    Fixes over the previous version: the ``copy`` import is no longer
    executed inside the loop, and the unconditional ``copy.deepcopy`` of
    every element is replaced by a shallow copy taken only when the type
    string actually needs normalization — only the top-level 'type' key
    is ever rewritten, so a deep copy was unnecessary. The caller's
    dicts are still never mutated.
    """
    page_ref = self._parse_page_ref(data.get('pageRef', {}))

    elements = []
    for elem_data in data.get('elements', []):
        elem_type_str = elem_data.get('type')
        if not elem_type_str:
            continue

        try:
            # Normalize type string (API returns "CHECKBOX" but the enum
            # member is "CHECK_BOX"); rewrite on a shallow copy so the
            # original payload is left untouched.
            if elem_type_str == "CHECKBOX":
                elem_type_str = "CHECK_BOX"
                elem_data = {**elem_data, 'type': elem_type_str}

            elem_type = ObjectType(elem_type_str)

            # Use the appropriate parser based on element type.
            if elem_type in (ObjectType.PARAGRAPH, ObjectType.TEXT_LINE):
                # Parse as TextObjectRef to capture text, font, color, children.
                elements.append(self._parse_text_object_ref(elem_data))
            elif elem_type in (ObjectType.FORM_FIELD, ObjectType.TEXT_FIELD,
                               ObjectType.CHECK_BOX, ObjectType.RADIO_BUTTON,
                               ObjectType.BUTTON, ObjectType.DROPDOWN):
                # Parse as FormFieldRef to capture name and value.
                elements.append(self._parse_form_field_ref(elem_data))
            else:
                # Parse as basic ObjectRef.
                elements.append(self._parse_object_ref(elem_data))
        except (ValueError, KeyError):
            # Skip elements whose type string is not a known ObjectType.
            continue

    return PageSnapshot(
        page_ref=page_ref,
        elements=elements
    )
|
|
2019
|
+
|
|
2020
|
+
def _parse_document_snapshot(self, data: dict) -> DocumentSnapshot:
    """Build a DocumentSnapshot from its JSON representation.

    Delegates per-entry parsing of 'fonts' and 'pages' to the dedicated
    helpers; missing keys default to zero pages and empty lists.
    """
    return DocumentSnapshot(
        page_count=data.get('pageCount', 0),
        fonts=[self._parse_font_recommendation(f) for f in data.get('fonts', [])],
        pages=[self._parse_page_snapshot(p) for p in data.get('pages', [])],
    )
|
|
2031
|
+
|
|
1253
2032
|
# Builder Pattern Support
|
|
1254
2033
|
|
|
1255
2034
|
def _paragraph_builder(self) -> 'ParagraphBuilder':
|
|
@@ -1268,9 +2047,17 @@ class PDFDancer:
|
|
|
1268
2047
|
|
|
1269
2048
|
def __exit__(self, exc_type, exc_val, exc_tb):
    """Context manager exit - cleanup if needed."""
    # Delegate HTTP-client teardown to close() so both cleanup paths
    # stay in sync.
    self.close()
    # TODO Could add session cleanup here if API supports it. Cleanup on the server
|
|
1273
2055
|
|
|
2056
|
+
def close(self):
    """Close the HTTP client and free resources."""
    # EAFP: an instance whose __init__ failed may never have created
    # the client attribute; in that case there is nothing to release.
    try:
        client = self._client
    except AttributeError:
        return
    client.close()
|
|
2060
|
+
|
|
1274
2061
|
def _to_path_objects(self, refs: List[ObjectRef]) -> List[PathObject]:
    """Bind each raw ObjectRef to this session as a PathObject handle."""
    wrapped: List[PathObject] = []
    for ref in refs:
        wrapped.append(PathObject(self, ref.internal_id, ref.type, ref.position))
    return wrapped
|
|
1276
2063
|
|