pdfdancer-client-python 0.2.17__py3-none-any.whl → 0.2.19__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pdfdancer/__init__.py +9 -1
- pdfdancer/exceptions.py +3 -3
- pdfdancer/fingerprint.py +121 -0
- pdfdancer/models.py +243 -27
- pdfdancer/path_builder.py +557 -0
- pdfdancer/pdfdancer_v1.py +868 -129
- pdfdancer/types.py +7 -0
- {pdfdancer_client_python-0.2.17.dist-info → pdfdancer_client_python-0.2.19.dist-info}/METADATA +18 -18
- pdfdancer_client_python-0.2.19.dist-info/RECORD +15 -0
- pdfdancer_client_python-0.2.17.dist-info/RECORD +0 -13
- {pdfdancer_client_python-0.2.17.dist-info → pdfdancer_client_python-0.2.19.dist-info}/WHEEL +0 -0
- {pdfdancer_client_python-0.2.17.dist-info → pdfdancer_client_python-0.2.19.dist-info}/licenses/LICENSE +0 -0
- {pdfdancer_client_python-0.2.17.dist-info → pdfdancer_client_python-0.2.19.dist-info}/licenses/NOTICE +0 -0
- {pdfdancer_client_python-0.2.17.dist-info → pdfdancer_client_python-0.2.19.dist-info}/top_level.txt +0 -0
pdfdancer/pdfdancer_v1.py
CHANGED
|
@@ -5,23 +5,116 @@ A Python client that closely mirrors the Java Client class structure and functio
|
|
|
5
5
|
Provides session-based PDF manipulation operations with strict validation.
|
|
6
6
|
"""
|
|
7
7
|
|
|
8
|
+
import gzip
|
|
8
9
|
import json
|
|
9
10
|
import os
|
|
10
11
|
import time
|
|
12
|
+
from datetime import datetime, timezone
|
|
11
13
|
from pathlib import Path
|
|
12
14
|
from typing import List, Optional, Union, BinaryIO, Mapping, Any
|
|
13
15
|
|
|
14
|
-
import
|
|
16
|
+
import httpx
|
|
15
17
|
from dotenv import load_dotenv
|
|
16
18
|
|
|
19
|
+
from .fingerprint import Fingerprint
|
|
20
|
+
|
|
17
21
|
load_dotenv()
|
|
18
22
|
|
|
19
23
|
# Global variable to disable SSL certificate verification
|
|
20
24
|
# Set to True to skip SSL verification (useful for testing with self-signed certificates)
|
|
21
25
|
# WARNING: Only use in development/testing environments
|
|
22
|
-
DISABLE_SSL_VERIFY = False
|
|
26
|
+
DISABLE_SSL_VERIFY = os.environ.get("PDFDANCER_CLIENT_DISABLE_SSL_VERIFY", False)
|
|
27
|
+
|
|
28
|
+
DEBUG = os.environ.get("PDFDANCER_CLIENT_DEBUG", False)
|
|
29
|
+
DEFAULT_TOLERANCE = 0.01
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def _generate_timestamp() -> str:
|
|
33
|
+
"""
|
|
34
|
+
Generate a timestamp string in the format expected by the API.
|
|
35
|
+
Format: YYYY-MM-DDTHH:MM:SS.ffffffZ (with microseconds)
|
|
36
|
+
|
|
37
|
+
Returns:
|
|
38
|
+
Timestamp string with UTC timezone
|
|
39
|
+
"""
|
|
40
|
+
return datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%S.%fZ')
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def _parse_timestamp(timestamp_str: str) -> datetime:
|
|
44
|
+
"""
|
|
45
|
+
Parse timestamp string, handling both microseconds and nanoseconds precision.
|
|
46
|
+
|
|
47
|
+
Args:
|
|
48
|
+
timestamp_str: Timestamp string in format YYYY-MM-DDTHH:MM:SS.fffffffZ
|
|
49
|
+
(with 6 or 9 fractional digits)
|
|
50
|
+
|
|
51
|
+
Returns:
|
|
52
|
+
datetime object with UTC timezone
|
|
53
|
+
"""
|
|
54
|
+
# Remove the 'Z' suffix
|
|
55
|
+
ts = timestamp_str.rstrip('Z')
|
|
56
|
+
|
|
57
|
+
# Handle nanoseconds (9 digits) by truncating to microseconds (6 digits)
|
|
58
|
+
# Python's datetime only supports microseconds precision
|
|
59
|
+
if '.' in ts:
|
|
60
|
+
date_part, frac_part = ts.rsplit('.', 1)
|
|
61
|
+
if len(frac_part) > 6:
|
|
62
|
+
# Truncate to 6 digits (microseconds)
|
|
63
|
+
frac_part = frac_part[:6]
|
|
64
|
+
ts = f"{date_part}.{frac_part}"
|
|
65
|
+
|
|
66
|
+
return datetime.fromisoformat(ts).replace(tzinfo=timezone.utc)
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def _log_generated_at_header(response: httpx.Response, method: str, path: str) -> None:
|
|
70
|
+
"""
|
|
71
|
+
Check for X-Generated-At and X-Received-At headers and log timing information if DEBUG=True.
|
|
72
|
+
|
|
73
|
+
Expected timestamp formats:
|
|
74
|
+
- 2025-10-24T08:49:39.161945Z (microseconds - 6 digits)
|
|
75
|
+
- 2025-10-24T08:58:45.468131265Z (nanoseconds - 9 digits)
|
|
76
|
+
|
|
77
|
+
Args:
|
|
78
|
+
response: The HTTP response object
|
|
79
|
+
method: HTTP method used
|
|
80
|
+
path: API path
|
|
81
|
+
"""
|
|
82
|
+
if not DEBUG:
|
|
83
|
+
return
|
|
84
|
+
|
|
85
|
+
generated_at = response.headers.get('X-Generated-At')
|
|
86
|
+
received_at = response.headers.get('X-Received-At')
|
|
87
|
+
|
|
88
|
+
if generated_at or received_at:
|
|
89
|
+
try:
|
|
90
|
+
log_parts = []
|
|
91
|
+
current_time = datetime.now(timezone.utc)
|
|
92
|
+
|
|
93
|
+
# Parse and log X-Received-At
|
|
94
|
+
received_time = None
|
|
95
|
+
if received_at:
|
|
96
|
+
received_time = _parse_timestamp(received_at)
|
|
97
|
+
time_since_received = (current_time - received_time).total_seconds()
|
|
98
|
+
log_parts.append(f"X-Received-At: {received_at}, time since received: {time_since_received:.3f}s")
|
|
99
|
+
|
|
100
|
+
# Parse and log X-Generated-At
|
|
101
|
+
generated_time = None
|
|
102
|
+
if generated_at:
|
|
103
|
+
generated_time = _parse_timestamp(generated_at)
|
|
104
|
+
time_since_generated = (current_time - generated_time).total_seconds()
|
|
105
|
+
log_parts.append(f"X-Generated-At: {generated_at}, time since generated: {time_since_generated:.3f}s")
|
|
106
|
+
|
|
107
|
+
# Calculate processing time (X-Generated-At - X-Received-At)
|
|
108
|
+
if received_time and generated_time:
|
|
109
|
+
processing_time = (generated_time - received_time).total_seconds()
|
|
110
|
+
log_parts.append(f"processing time: {processing_time:.3f}s")
|
|
111
|
+
|
|
112
|
+
if log_parts:
|
|
113
|
+
print(f"{time.time()}|{method} {path} - {', '.join(log_parts)}")
|
|
114
|
+
|
|
115
|
+
except (ValueError, AttributeError) as e:
|
|
116
|
+
print(f"{time.time()}|{method} {path} - Header parse error: {e}")
|
|
23
117
|
|
|
24
|
-
DEBUG = False
|
|
25
118
|
|
|
26
119
|
from . import ParagraphBuilder
|
|
27
120
|
from .exceptions import (
|
|
@@ -36,7 +129,8 @@ from .models import (
|
|
|
36
129
|
ObjectRef, Position, ObjectType, Font, Image, Paragraph, FormFieldRef, TextObjectRef, PageRef,
|
|
37
130
|
FindRequest, DeleteRequest, MoveRequest, PageMoveRequest, AddRequest, ModifyRequest, ModifyTextRequest,
|
|
38
131
|
ChangeFormFieldRequest, CommandResult,
|
|
39
|
-
ShapeType, PositionMode, PageSize, Orientation
|
|
132
|
+
ShapeType, PositionMode, PageSize, Orientation,
|
|
133
|
+
PageSnapshot, DocumentSnapshot, FontRecommendation, FontType
|
|
40
134
|
)
|
|
41
135
|
from .paragraph_builder import ParagraphPageBuilder
|
|
42
136
|
from .types import PathObject, ParagraphObject, TextLineObject, ImageObject, FormObject, FormFieldObject
|
|
@@ -60,9 +154,10 @@ class PageClient:
|
|
|
60
154
|
else:
|
|
61
155
|
self.orientation = orientation
|
|
62
156
|
|
|
63
|
-
def select_paths_at(self, x: float, y: float) -> List[PathObject]:
|
|
157
|
+
def select_paths_at(self, x: float, y: float, tolerance: float = DEFAULT_TOLERANCE) -> List[PathObject]:
|
|
158
|
+
position = Position.at_page_coordinates(self.page_index, x, y)
|
|
64
159
|
# noinspection PyProtectedMember
|
|
65
|
-
return self.root._to_path_objects(self.root._find_paths(
|
|
160
|
+
return self.root._to_path_objects(self.root._find_paths(position, tolerance))
|
|
66
161
|
|
|
67
162
|
def select_paragraphs(self) -> List[ParagraphObject]:
|
|
68
163
|
# noinspection PyProtectedMember
|
|
@@ -86,10 +181,10 @@ class PageClient:
|
|
|
86
181
|
# noinspection PyProtectedMember
|
|
87
182
|
return self.root._to_textline_objects(self.root._find_text_lines(position))
|
|
88
183
|
|
|
89
|
-
def select_paragraphs_at(self, x: float, y: float) -> List[ParagraphObject]:
|
|
184
|
+
def select_paragraphs_at(self, x: float, y: float, tolerance: float = DEFAULT_TOLERANCE) -> List[ParagraphObject]:
|
|
90
185
|
position = Position.at_page_coordinates(self.page_index, x, y)
|
|
91
186
|
# noinspection PyProtectedMember
|
|
92
|
-
return self.root._to_paragraph_objects(self.root._find_paragraphs(position))
|
|
187
|
+
return self.root._to_paragraph_objects(self.root._find_paragraphs(position, tolerance))
|
|
93
188
|
|
|
94
189
|
def select_text_lines(self) -> List[TextLineObject]:
|
|
95
190
|
position = Position.at_page(self.page_index)
|
|
@@ -102,29 +197,29 @@ class PageClient:
|
|
|
102
197
|
# noinspection PyProtectedMember
|
|
103
198
|
return self.root._to_textline_objects(self.root._find_text_lines(position))
|
|
104
199
|
|
|
105
|
-
def select_text_lines_at(self, x, y) -> List[TextLineObject]:
|
|
200
|
+
def select_text_lines_at(self, x, y, tolerance: float = DEFAULT_TOLERANCE) -> List[TextLineObject]:
|
|
106
201
|
position = Position.at_page_coordinates(self.page_index, x, y)
|
|
107
202
|
# noinspection PyProtectedMember
|
|
108
|
-
return self.root._to_textline_objects(self.root._find_text_lines(position))
|
|
203
|
+
return self.root._to_textline_objects(self.root._find_text_lines(position, tolerance))
|
|
109
204
|
|
|
110
205
|
def select_images(self) -> List[ImageObject]:
|
|
111
206
|
# noinspection PyProtectedMember
|
|
112
207
|
return self.root._to_image_objects(self.root._find_images(Position.at_page(self.page_index)))
|
|
113
208
|
|
|
114
|
-
def select_images_at(self, x: float, y: float) -> List[ImageObject]:
|
|
209
|
+
def select_images_at(self, x: float, y: float, tolerance: float = DEFAULT_TOLERANCE) -> List[ImageObject]:
|
|
115
210
|
position = Position.at_page_coordinates(self.page_index, x, y)
|
|
116
211
|
# noinspection PyProtectedMember
|
|
117
|
-
return self.root._to_image_objects(self.root._find_images(position))
|
|
212
|
+
return self.root._to_image_objects(self.root._find_images(position, tolerance))
|
|
118
213
|
|
|
119
214
|
def select_forms(self) -> List[FormObject]:
|
|
120
215
|
position = Position.at_page(self.page_index)
|
|
121
216
|
# noinspection PyProtectedMember
|
|
122
217
|
return self.root._to_form_objects(self.root._find_form_x_objects(position))
|
|
123
218
|
|
|
124
|
-
def select_forms_at(self, x: float, y: float) -> List[FormObject]:
|
|
219
|
+
def select_forms_at(self, x: float, y: float, tolerance: float = DEFAULT_TOLERANCE) -> List[FormObject]:
|
|
125
220
|
position = Position.at_page_coordinates(self.page_index, x, y)
|
|
126
221
|
# noinspection PyProtectedMember
|
|
127
|
-
return self.root._to_form_objects(self.root._find_form_x_objects(position))
|
|
222
|
+
return self.root._to_form_objects(self.root._find_form_x_objects(position, tolerance))
|
|
128
223
|
|
|
129
224
|
def select_form_fields(self) -> List[FormFieldObject]:
|
|
130
225
|
position = Position.at_page(self.page_index)
|
|
@@ -137,10 +232,10 @@ class PageClient:
|
|
|
137
232
|
# noinspection PyProtectedMember
|
|
138
233
|
return self.root._to_form_field_objects(self.root._find_form_fields(pos))
|
|
139
234
|
|
|
140
|
-
def select_form_fields_at(self, x: float, y: float) -> List[FormFieldObject]:
|
|
235
|
+
def select_form_fields_at(self, x: float, y: float, tolerance: float = DEFAULT_TOLERANCE) -> List[FormFieldObject]:
|
|
141
236
|
position = Position.at_page_coordinates(self.page_index, x, y)
|
|
142
237
|
# noinspection PyProtectedMember
|
|
143
|
-
return self.root._to_form_field_objects(self.root._find_form_fields(position))
|
|
238
|
+
return self.root._to_form_field_objects(self.root._find_form_fields(position, tolerance))
|
|
144
239
|
|
|
145
240
|
@classmethod
|
|
146
241
|
def from_ref(cls, root: 'PDFDancer', page_ref: PageRef) -> 'PageClient':
|
|
@@ -178,6 +273,18 @@ class PageClient:
|
|
|
178
273
|
def new_paragraph(self):
|
|
179
274
|
return ParagraphPageBuilder(self.root, self.page_index)
|
|
180
275
|
|
|
276
|
+
def new_path(self):
|
|
277
|
+
from .path_builder import PathBuilder
|
|
278
|
+
return PathBuilder(self.root, self.page_index)
|
|
279
|
+
|
|
280
|
+
def new_line(self):
|
|
281
|
+
from .path_builder import LineBuilder
|
|
282
|
+
return LineBuilder(self.root, self.page_index)
|
|
283
|
+
|
|
284
|
+
def new_bezier(self):
|
|
285
|
+
from .path_builder import BezierBuilder
|
|
286
|
+
return BezierBuilder(self.root, self.page_index)
|
|
287
|
+
|
|
181
288
|
def select_paths(self):
|
|
182
289
|
# noinspection PyProtectedMember
|
|
183
290
|
return self.root._to_path_objects(self.root._find_paths(Position.at_page(self.page_index)))
|
|
@@ -229,9 +336,15 @@ class PDFDancer:
|
|
|
229
336
|
"""
|
|
230
337
|
Create a client session, falling back to environment variables when needed.
|
|
231
338
|
|
|
339
|
+
Authentication:
|
|
340
|
+
- If token is provided, uses it
|
|
341
|
+
- Otherwise, checks PDFDANCER_TOKEN environment variable
|
|
342
|
+
- If no token is found, automatically obtains an anonymous token
|
|
343
|
+
|
|
232
344
|
Args:
|
|
233
345
|
pdf_data: PDF payload supplied directly or via filesystem handles.
|
|
234
|
-
token: Override for the API token; falls back to `PDFDANCER_TOKEN`
|
|
346
|
+
token: Override for the API token; falls back to `PDFDANCER_TOKEN` environment variable,
|
|
347
|
+
then to anonymous token if not set.
|
|
235
348
|
base_url: Override for the API base URL; falls back to `PDFDANCER_BASE_URL`
|
|
236
349
|
or defaults to `https://api.pdfdancer.com`.
|
|
237
350
|
timeout: HTTP read timeout in seconds.
|
|
@@ -242,6 +355,10 @@ class PDFDancer:
|
|
|
242
355
|
resolved_token = cls._resolve_token(token)
|
|
243
356
|
resolved_base_url = cls._resolve_base_url(base_url)
|
|
244
357
|
|
|
358
|
+
# If no token found, obtain anonymous token
|
|
359
|
+
if resolved_token is None:
|
|
360
|
+
resolved_token = cls._obtain_anonymous_token(resolved_base_url, timeout)
|
|
361
|
+
|
|
245
362
|
return PDFDancer(resolved_token, pdf_data, resolved_base_url, timeout)
|
|
246
363
|
|
|
247
364
|
@classmethod
|
|
@@ -252,18 +369,66 @@ class PDFDancer:
|
|
|
252
369
|
resolved_base_url = "https://api.pdfdancer.com"
|
|
253
370
|
return resolved_base_url
|
|
254
371
|
|
|
372
|
+
@classmethod
|
|
373
|
+
def _obtain_anonymous_token(cls, base_url: str, timeout: float = 30.0) -> str:
|
|
374
|
+
"""
|
|
375
|
+
Obtain an anonymous token from the /keys/anon endpoint.
|
|
376
|
+
|
|
377
|
+
Args:
|
|
378
|
+
base_url: Base URL of the PDFDancer API server
|
|
379
|
+
timeout: HTTP read timeout in seconds
|
|
380
|
+
|
|
381
|
+
Returns:
|
|
382
|
+
Anonymous token string
|
|
383
|
+
|
|
384
|
+
Raises:
|
|
385
|
+
HttpClientException: If token request fails
|
|
386
|
+
"""
|
|
387
|
+
try:
|
|
388
|
+
# Create temporary client without authentication
|
|
389
|
+
temp_client = httpx.Client(
|
|
390
|
+
http2=True,
|
|
391
|
+
verify=not DISABLE_SSL_VERIFY
|
|
392
|
+
)
|
|
393
|
+
|
|
394
|
+
headers = {
|
|
395
|
+
'X-Fingerprint': Fingerprint.generate()
|
|
396
|
+
}
|
|
397
|
+
|
|
398
|
+
response = temp_client.post(
|
|
399
|
+
cls._cleanup_url_path(base_url, "/keys/anon"),
|
|
400
|
+
headers=headers,
|
|
401
|
+
timeout=timeout if timeout > 0 else None
|
|
402
|
+
)
|
|
403
|
+
|
|
404
|
+
response.raise_for_status()
|
|
405
|
+
token_data = response.json()
|
|
406
|
+
|
|
407
|
+
# Extract token from response (matches Java AnonTokenResponse structure)
|
|
408
|
+
if isinstance(token_data, dict) and 'token' in token_data:
|
|
409
|
+
return token_data['token']
|
|
410
|
+
else:
|
|
411
|
+
raise HttpClientException("Invalid anonymous token response format")
|
|
412
|
+
|
|
413
|
+
except httpx.HTTPStatusError as e:
|
|
414
|
+
raise HttpClientException(f"Failed to obtain anonymous token: HTTP {e.response.status_code}",
|
|
415
|
+
response=e.response, cause=e) from None
|
|
416
|
+
except httpx.RequestError as e:
|
|
417
|
+
raise HttpClientException(f"Failed to obtain anonymous token: {str(e)}",
|
|
418
|
+
response=None, cause=e) from None
|
|
419
|
+
finally:
|
|
420
|
+
temp_client.close()
|
|
421
|
+
|
|
255
422
|
@classmethod
|
|
256
423
|
def _resolve_token(cls, token: Optional[str]) -> Optional[str]:
|
|
424
|
+
"""
|
|
425
|
+
Resolve token from argument or environment variable.
|
|
426
|
+
Returns None if no token is found (allowing fallback to anonymous token).
|
|
427
|
+
"""
|
|
257
428
|
resolved_token = token.strip() if token and token.strip() else None
|
|
258
429
|
if resolved_token is None:
|
|
259
430
|
env_token = os.getenv("PDFDANCER_TOKEN")
|
|
260
431
|
resolved_token = env_token.strip() if env_token and env_token.strip() else None
|
|
261
|
-
|
|
262
|
-
if resolved_token is None:
|
|
263
|
-
raise ValidationException(
|
|
264
|
-
"Missing PDFDancer API token. Pass a token via the `token` argument "
|
|
265
|
-
"or set the PDFDANCER_TOKEN environment variable."
|
|
266
|
-
)
|
|
267
432
|
return resolved_token
|
|
268
433
|
|
|
269
434
|
@classmethod
|
|
@@ -277,8 +442,14 @@ class PDFDancer:
|
|
|
277
442
|
"""
|
|
278
443
|
Create a new blank PDF document with optional configuration.
|
|
279
444
|
|
|
445
|
+
Authentication:
|
|
446
|
+
- If token is provided, uses it
|
|
447
|
+
- Otherwise, checks PDFDANCER_TOKEN environment variable
|
|
448
|
+
- If no token is found, automatically obtains an anonymous token
|
|
449
|
+
|
|
280
450
|
Args:
|
|
281
|
-
token: Override for the API token; falls back to `PDFDANCER_TOKEN` environment variable
|
|
451
|
+
token: Override for the API token; falls back to `PDFDANCER_TOKEN` environment variable,
|
|
452
|
+
then to anonymous token if not set.
|
|
282
453
|
base_url: Override for the API base URL; falls back to `PDFDANCER_BASE_URL`
|
|
283
454
|
or defaults to `https://api.pdfdancer.com`.
|
|
284
455
|
timeout: HTTP read timeout in seconds.
|
|
@@ -293,6 +464,10 @@ class PDFDancer:
|
|
|
293
464
|
resolved_token = cls._resolve_token(token)
|
|
294
465
|
resolved_base_url = cls._resolve_base_url(base_url)
|
|
295
466
|
|
|
467
|
+
# If no token found, obtain anonymous token
|
|
468
|
+
if resolved_token is None:
|
|
469
|
+
resolved_token = cls._obtain_anonymous_token(resolved_base_url, timeout)
|
|
470
|
+
|
|
296
471
|
# Create a new instance that will call _create_blank_pdf_session
|
|
297
472
|
instance = object.__new__(cls)
|
|
298
473
|
|
|
@@ -304,11 +479,12 @@ class PDFDancer:
|
|
|
304
479
|
instance._base_url = resolved_base_url.rstrip('/')
|
|
305
480
|
instance._read_timeout = timeout
|
|
306
481
|
|
|
307
|
-
# Create HTTP
|
|
308
|
-
instance.
|
|
309
|
-
|
|
310
|
-
'Authorization': f'Bearer {instance._token}'
|
|
311
|
-
|
|
482
|
+
# Create HTTP client for connection reuse with HTTP/2 support
|
|
483
|
+
instance._client = httpx.Client(
|
|
484
|
+
http2=True,
|
|
485
|
+
headers={'Authorization': f'Bearer {instance._token}'},
|
|
486
|
+
verify=not DISABLE_SSL_VERIFY
|
|
487
|
+
)
|
|
312
488
|
|
|
313
489
|
# Create blank PDF session
|
|
314
490
|
instance._session_id = instance._create_blank_pdf_session(
|
|
@@ -320,6 +496,10 @@ class PDFDancer:
|
|
|
320
496
|
# Set pdf_bytes to None since we don't have the PDF bytes yet
|
|
321
497
|
instance._pdf_bytes = None
|
|
322
498
|
|
|
499
|
+
# Initialize snapshot caches (lazy-loaded)
|
|
500
|
+
instance._document_snapshot = None
|
|
501
|
+
instance._page_snapshots = {}
|
|
502
|
+
|
|
323
503
|
return instance
|
|
324
504
|
|
|
325
505
|
def __init__(self, token: str, pdf_data: Union[bytes, Path, str, BinaryIO],
|
|
@@ -351,15 +531,20 @@ class PDFDancer:
|
|
|
351
531
|
# Process PDF data with validation
|
|
352
532
|
self._pdf_bytes = self._process_pdf_data(pdf_data)
|
|
353
533
|
|
|
354
|
-
# Create HTTP
|
|
355
|
-
self.
|
|
356
|
-
|
|
357
|
-
'Authorization': f'Bearer {self._token}'
|
|
358
|
-
|
|
534
|
+
# Create HTTP client for connection reuse with HTTP/2 support
|
|
535
|
+
self._client = httpx.Client(
|
|
536
|
+
http2=True,
|
|
537
|
+
headers={'Authorization': f'Bearer {self._token}'},
|
|
538
|
+
verify=not DISABLE_SSL_VERIFY
|
|
539
|
+
)
|
|
359
540
|
|
|
360
541
|
# Create session - equivalent to Java constructor behavior
|
|
361
542
|
self._session_id = self._create_session()
|
|
362
543
|
|
|
544
|
+
# Initialize snapshot caches (lazy-loaded)
|
|
545
|
+
self._document_snapshot: Optional[DocumentSnapshot] = None
|
|
546
|
+
self._page_snapshots: dict[int, PageSnapshot] = {}
|
|
547
|
+
|
|
363
548
|
@staticmethod
|
|
364
549
|
def _process_pdf_data(pdf_data: Union[bytes, Path, str, BinaryIO]) -> bytes:
|
|
365
550
|
"""
|
|
@@ -401,7 +586,7 @@ class PDFDancer:
|
|
|
401
586
|
except (IOError, OSError) as e:
|
|
402
587
|
raise PdfDancerException(f"Failed to read PDF data: {e}", cause=e)
|
|
403
588
|
|
|
404
|
-
def _extract_error_message(self, response: Optional[
|
|
589
|
+
def _extract_error_message(self, response: Optional[httpx.Response]) -> str:
|
|
405
590
|
"""
|
|
406
591
|
Extract meaningful error messages from API response.
|
|
407
592
|
Parses JSON error responses with _embedded.errors structure.
|
|
@@ -437,7 +622,7 @@ class PDFDancer:
|
|
|
437
622
|
# If JSON parsing fails, return response content or status
|
|
438
623
|
return response.text or f"HTTP {response.status_code}"
|
|
439
624
|
|
|
440
|
-
def _handle_authentication_error(self, response: Optional[
|
|
625
|
+
def _handle_authentication_error(self, response: Optional[httpx.Response]) -> None:
|
|
441
626
|
"""
|
|
442
627
|
Translate authentication failures into a clear, actionable validation error.
|
|
443
628
|
"""
|
|
@@ -474,25 +659,54 @@ class PDFDancer:
|
|
|
474
659
|
Creates a new PDF processing session by uploading the PDF data.
|
|
475
660
|
"""
|
|
476
661
|
try:
|
|
477
|
-
|
|
478
|
-
|
|
479
|
-
|
|
662
|
+
# Build multipart body manually to avoid base64 encoding and enable compression
|
|
663
|
+
# httpx by default may add Content-Transfer-Encoding: base64 which the server rejects
|
|
664
|
+
import uuid
|
|
665
|
+
|
|
666
|
+
boundary = uuid.uuid4().hex
|
|
667
|
+
|
|
668
|
+
# Build multipart body with binary (not base64) encoding
|
|
669
|
+
body_parts = []
|
|
670
|
+
body_parts.append(f'--{boundary}\r\n'.encode('utf-8'))
|
|
671
|
+
body_parts.append(b'Content-Disposition: form-data; name="pdf"; filename="document.pdf"\r\n')
|
|
672
|
+
body_parts.append(b'Content-Type: application/pdf\r\n')
|
|
673
|
+
body_parts.append(b'\r\n') # End of headers, no Content-Transfer-Encoding
|
|
674
|
+
body_parts.append(self._pdf_bytes)
|
|
675
|
+
body_parts.append(b'\r\n')
|
|
676
|
+
body_parts.append(f'--{boundary}--\r\n'.encode('utf-8'))
|
|
677
|
+
|
|
678
|
+
uncompressed_body = b''.join(body_parts)
|
|
679
|
+
|
|
680
|
+
# Compress entire request body using gzip
|
|
681
|
+
compressed_body = gzip.compress(uncompressed_body)
|
|
682
|
+
|
|
683
|
+
original_size = len(uncompressed_body)
|
|
684
|
+
compressed_size = len(compressed_body)
|
|
685
|
+
compression_ratio = (1 - compressed_size / original_size) * 100 if original_size > 0 else 0
|
|
480
686
|
|
|
481
|
-
request_size = len(self._pdf_bytes)
|
|
482
687
|
if DEBUG:
|
|
483
|
-
print(f"{time.time()}|POST /session/create -
|
|
688
|
+
print(f"{time.time()}|POST /session/create - original size: {original_size} bytes, "
|
|
689
|
+
f"compressed size: {compressed_size} bytes, "
|
|
690
|
+
f"compression: {compression_ratio:.1f}%")
|
|
691
|
+
|
|
692
|
+
headers = {
|
|
693
|
+
'X-Generated-At': _generate_timestamp(),
|
|
694
|
+
'Content-Type': f'multipart/form-data; boundary={boundary}',
|
|
695
|
+
'Content-Encoding': 'gzip'
|
|
696
|
+
}
|
|
484
697
|
|
|
485
|
-
response = self.
|
|
698
|
+
response = self._client.post(
|
|
486
699
|
self._cleanup_url_path(self._base_url, "/session/create"),
|
|
487
|
-
|
|
488
|
-
|
|
489
|
-
|
|
700
|
+
content=compressed_body,
|
|
701
|
+
headers=headers,
|
|
702
|
+
timeout=self._read_timeout if self._read_timeout > 0 else None
|
|
490
703
|
)
|
|
491
704
|
|
|
492
705
|
response_size = len(response.content)
|
|
493
706
|
if DEBUG:
|
|
494
707
|
print(f"{time.time()}|POST /session/create - response size: {response_size} bytes")
|
|
495
708
|
|
|
709
|
+
_log_generated_at_header(response, "POST", "/session/create")
|
|
496
710
|
self._handle_authentication_error(response)
|
|
497
711
|
response.raise_for_status()
|
|
498
712
|
session_id = response.text.strip()
|
|
@@ -502,11 +716,14 @@ class PDFDancer:
|
|
|
502
716
|
|
|
503
717
|
return session_id
|
|
504
718
|
|
|
505
|
-
except
|
|
506
|
-
self._handle_authentication_error(
|
|
507
|
-
error_message = self._extract_error_message(
|
|
719
|
+
except httpx.HTTPStatusError as e:
|
|
720
|
+
self._handle_authentication_error(e.response)
|
|
721
|
+
error_message = self._extract_error_message(e.response)
|
|
508
722
|
raise HttpClientException(f"Failed to create session: {error_message}",
|
|
509
|
-
response=
|
|
723
|
+
response=e.response, cause=e) from None
|
|
724
|
+
except httpx.RequestError as e:
|
|
725
|
+
raise HttpClientException(f"Failed to create session: {str(e)}",
|
|
726
|
+
response=None, cause=e) from None
|
|
510
727
|
|
|
511
728
|
def _create_blank_pdf_session(self,
|
|
512
729
|
page_size: Optional[Union[PageSize, str, Mapping[str, Any]]] = None,
|
|
@@ -560,19 +777,22 @@ class PDFDancer:
|
|
|
560
777
|
if DEBUG:
|
|
561
778
|
print(f"{time.time()}|POST /session/new - request size: {request_size} bytes")
|
|
562
779
|
|
|
563
|
-
headers = {
|
|
564
|
-
|
|
780
|
+
headers = {
|
|
781
|
+
'Content-Type': 'application/json',
|
|
782
|
+
'X-Generated-At': _generate_timestamp()
|
|
783
|
+
}
|
|
784
|
+
response = self._client.post(
|
|
565
785
|
self._cleanup_url_path(self._base_url, "/session/new"),
|
|
566
786
|
json=request_data,
|
|
567
787
|
headers=headers,
|
|
568
|
-
timeout=self._read_timeout if self._read_timeout > 0 else None
|
|
569
|
-
verify=not DISABLE_SSL_VERIFY
|
|
788
|
+
timeout=self._read_timeout if self._read_timeout > 0 else None
|
|
570
789
|
)
|
|
571
790
|
|
|
572
791
|
response_size = len(response.content)
|
|
573
792
|
if DEBUG:
|
|
574
793
|
print(f"{time.time()}|POST /session/new - response size: {response_size} bytes")
|
|
575
794
|
|
|
795
|
+
_log_generated_at_header(response, "POST", "/session/new")
|
|
576
796
|
self._handle_authentication_error(response)
|
|
577
797
|
response.raise_for_status()
|
|
578
798
|
session_id = response.text.strip()
|
|
@@ -582,20 +802,25 @@ class PDFDancer:
|
|
|
582
802
|
|
|
583
803
|
return session_id
|
|
584
804
|
|
|
585
|
-
except
|
|
586
|
-
self._handle_authentication_error(
|
|
587
|
-
error_message = self._extract_error_message(
|
|
805
|
+
except httpx.HTTPStatusError as e:
|
|
806
|
+
self._handle_authentication_error(e.response)
|
|
807
|
+
error_message = self._extract_error_message(e.response)
|
|
588
808
|
raise HttpClientException(f"Failed to create blank PDF session: {error_message}",
|
|
589
|
-
response=
|
|
809
|
+
response=e.response, cause=e) from None
|
|
810
|
+
except httpx.RequestError as e:
|
|
811
|
+
raise HttpClientException(f"Failed to create blank PDF session: {str(e)}",
|
|
812
|
+
response=None, cause=e) from None
|
|
590
813
|
|
|
591
814
|
def _make_request(self, method: str, path: str, data: Optional[dict] = None,
|
|
592
|
-
params: Optional[dict] = None) ->
|
|
815
|
+
params: Optional[dict] = None) -> httpx.Response:
|
|
593
816
|
"""
|
|
594
817
|
Make HTTP request with session headers and error handling.
|
|
595
818
|
"""
|
|
596
819
|
headers = {
|
|
597
820
|
'X-Session-Id': self._session_id,
|
|
598
|
-
'Content-Type': 'application/json'
|
|
821
|
+
'Content-Type': 'application/json',
|
|
822
|
+
'X-Generated-At': _generate_timestamp(),
|
|
823
|
+
'X-Fingerprint': Fingerprint.generate()
|
|
599
824
|
}
|
|
600
825
|
|
|
601
826
|
try:
|
|
@@ -606,20 +831,21 @@ class PDFDancer:
|
|
|
606
831
|
if DEBUG:
|
|
607
832
|
print(f"{time.time()}|{method} {path} - request size: {request_size} bytes")
|
|
608
833
|
|
|
609
|
-
response = self.
|
|
834
|
+
response = self._client.request(
|
|
610
835
|
method=method,
|
|
611
836
|
url=self._cleanup_url_path(self._base_url, path),
|
|
612
837
|
json=data,
|
|
613
838
|
params=params,
|
|
614
839
|
headers=headers,
|
|
615
|
-
timeout=self._read_timeout if self._read_timeout > 0 else None
|
|
616
|
-
verify=not DISABLE_SSL_VERIFY
|
|
840
|
+
timeout=self._read_timeout if self._read_timeout > 0 else None
|
|
617
841
|
)
|
|
618
842
|
|
|
619
843
|
response_size = len(response.content)
|
|
620
844
|
if DEBUG:
|
|
621
845
|
print(f"{time.time()}|{method} {path} - response size: {response_size} bytes")
|
|
622
846
|
|
|
847
|
+
_log_generated_at_header(response, method, path)
|
|
848
|
+
|
|
623
849
|
# Handle FontNotFoundException
|
|
624
850
|
if response.status_code == 404:
|
|
625
851
|
try:
|
|
@@ -633,53 +859,86 @@ class PDFDancer:
|
|
|
633
859
|
response.raise_for_status()
|
|
634
860
|
return response
|
|
635
861
|
|
|
636
|
-
except
|
|
637
|
-
self._handle_authentication_error(
|
|
638
|
-
error_message = self._extract_error_message(
|
|
639
|
-
raise HttpClientException(f"API request failed: {error_message}", response=
|
|
862
|
+
except httpx.HTTPStatusError as e:
|
|
863
|
+
self._handle_authentication_error(e.response)
|
|
864
|
+
error_message = self._extract_error_message(e.response)
|
|
865
|
+
raise HttpClientException(f"API request failed: {error_message}", response=e.response,
|
|
866
|
+
cause=e) from None
|
|
867
|
+
except httpx.RequestError as e:
|
|
868
|
+
raise HttpClientException(f"API request failed: {str(e)}", response=None,
|
|
640
869
|
cause=e) from None
|
|
641
870
|
|
|
642
|
-
def _find(self, object_type: Optional[ObjectType] = None, position: Optional[Position] = None
|
|
871
|
+
def _find(self, object_type: Optional[ObjectType] = None, position: Optional[Position] = None,
|
|
872
|
+
tolerance: float = DEFAULT_TOLERANCE) -> List[ObjectRef]:
|
|
643
873
|
"""
|
|
644
874
|
Searches for PDF objects matching the specified criteria.
|
|
645
|
-
|
|
646
|
-
allowing filtering by object type and position constraints.
|
|
875
|
+
Uses snapshot cache for all queries except paths at specific coordinates.
|
|
647
876
|
|
|
648
877
|
Args:
|
|
649
878
|
object_type: The type of objects to find (None for all types)
|
|
650
879
|
position: Positional constraints for the search (None for all positions)
|
|
880
|
+
tolerance: Tolerance in points for spatial matching (default: DEFAULT_TOLERANCE)
|
|
651
881
|
|
|
652
882
|
Returns:
|
|
653
883
|
List of object references matching the search criteria
|
|
654
884
|
"""
|
|
655
|
-
|
|
656
|
-
|
|
657
|
-
|
|
658
|
-
|
|
659
|
-
|
|
660
|
-
|
|
885
|
+
# Special case: PATH queries with bounding_rect need API (full vector data)
|
|
886
|
+
if object_type == ObjectType.PATH and position and position.bounding_rect:
|
|
887
|
+
request_data = FindRequest(object_type, position).to_dict()
|
|
888
|
+
response = self._make_request('POST', '/pdf/find', data=request_data)
|
|
889
|
+
objects_data = response.json()
|
|
890
|
+
return [self._parse_object_ref(obj_data) for obj_data in objects_data]
|
|
891
|
+
|
|
892
|
+
# Use snapshot for all other queries
|
|
893
|
+
if position and position.page_index is not None:
|
|
894
|
+
snapshot = self._get_or_fetch_page_snapshot(position.page_index)
|
|
895
|
+
return self._filter_snapshot_elements(snapshot.elements, object_type, position, tolerance)
|
|
896
|
+
else:
|
|
897
|
+
snapshot = self._get_or_fetch_document_snapshot()
|
|
898
|
+
all_elements = []
|
|
899
|
+
for page_snap in snapshot.pages:
|
|
900
|
+
all_elements.extend(page_snap.elements)
|
|
901
|
+
return self._filter_snapshot_elements(all_elements, object_type, position, tolerance)
|
|
661
902
|
|
|
662
|
-
def select_paragraphs(self) -> List[
|
|
903
|
+
def select_paragraphs(self) -> List[ParagraphObject]:
|
|
663
904
|
"""
|
|
664
|
-
Searches for paragraph objects returning
|
|
905
|
+
Searches for paragraph objects returning ParagraphObject instances.
|
|
665
906
|
"""
|
|
666
|
-
return self._find_paragraphs(None)
|
|
907
|
+
return self._to_paragraph_objects(self._find_paragraphs(None))
|
|
667
908
|
|
|
668
|
-
def _find_paragraphs(self, position: Optional[Position] = None) -> List[
|
|
909
|
+
def _find_paragraphs(self, position: Optional[Position] = None, tolerance: float = DEFAULT_TOLERANCE) -> List[
|
|
910
|
+
TextObjectRef]:
|
|
669
911
|
"""
|
|
670
912
|
Searches for paragraph objects returning TextObjectRef with hierarchical structure.
|
|
913
|
+
Uses snapshot cache for all queries.
|
|
671
914
|
"""
|
|
672
|
-
|
|
673
|
-
|
|
674
|
-
|
|
675
|
-
|
|
676
|
-
|
|
915
|
+
# Use snapshot for all queries (including spatial)
|
|
916
|
+
if position and position.page_index is not None:
|
|
917
|
+
snapshot = self._get_or_fetch_page_snapshot(position.page_index)
|
|
918
|
+
return self._filter_snapshot_elements(snapshot.elements, ObjectType.PARAGRAPH, position, tolerance)
|
|
919
|
+
else:
|
|
920
|
+
snapshot = self._get_or_fetch_document_snapshot()
|
|
921
|
+
all_elements = []
|
|
922
|
+
for page_snap in snapshot.pages:
|
|
923
|
+
all_elements.extend(page_snap.elements)
|
|
924
|
+
return self._filter_snapshot_elements(all_elements, ObjectType.PARAGRAPH, position, tolerance)
|
|
677
925
|
|
|
678
|
-
def _find_images(self, position: Optional[Position] = None) -> List[
|
|
926
|
+
def _find_images(self, position: Optional[Position] = None, tolerance: float = DEFAULT_TOLERANCE) -> List[
|
|
927
|
+
ObjectRef]:
|
|
679
928
|
"""
|
|
680
929
|
Searches for image objects at the specified position.
|
|
930
|
+
Uses snapshot cache for all queries.
|
|
681
931
|
"""
|
|
682
|
-
|
|
932
|
+
# Use snapshot for all queries (including spatial)
|
|
933
|
+
if position and position.page_index is not None:
|
|
934
|
+
snapshot = self._get_or_fetch_page_snapshot(position.page_index)
|
|
935
|
+
return self._filter_snapshot_elements(snapshot.elements, ObjectType.IMAGE, position, tolerance)
|
|
936
|
+
else:
|
|
937
|
+
snapshot = self._get_or_fetch_document_snapshot()
|
|
938
|
+
all_elements = []
|
|
939
|
+
for page_snap in snapshot.pages:
|
|
940
|
+
all_elements.extend(page_snap.elements)
|
|
941
|
+
return self._filter_snapshot_elements(all_elements, ObjectType.IMAGE, position, tolerance)
|
|
683
942
|
|
|
684
943
|
def select_images(self) -> List[ImageObject]:
|
|
685
944
|
"""
|
|
@@ -693,11 +952,22 @@ class PDFDancer:
|
|
|
693
952
|
"""
|
|
694
953
|
return self._to_form_objects(self._find(ObjectType.FORM_X_OBJECT, None))
|
|
695
954
|
|
|
696
|
-
def _find_form_x_objects(self, position: Optional[Position] = None) -> List[
|
|
955
|
+
def _find_form_x_objects(self, position: Optional[Position] = None, tolerance: float = DEFAULT_TOLERANCE) -> List[
|
|
956
|
+
ObjectRef]:
|
|
697
957
|
"""
|
|
698
|
-
Searches for form
|
|
958
|
+
Searches for form X objects at the specified position.
|
|
959
|
+
Uses snapshot cache for all queries.
|
|
699
960
|
"""
|
|
700
|
-
|
|
961
|
+
# Use snapshot for all queries (including spatial)
|
|
962
|
+
if position and position.page_index is not None:
|
|
963
|
+
snapshot = self._get_or_fetch_page_snapshot(position.page_index)
|
|
964
|
+
return self._filter_snapshot_elements(snapshot.elements, ObjectType.FORM_X_OBJECT, position, tolerance)
|
|
965
|
+
else:
|
|
966
|
+
snapshot = self._get_or_fetch_document_snapshot()
|
|
967
|
+
all_elements = []
|
|
968
|
+
for page_snap in snapshot.pages:
|
|
969
|
+
all_elements.extend(page_snap.elements)
|
|
970
|
+
return self._filter_snapshot_elements(all_elements, ObjectType.FORM_X_OBJECT, position, tolerance)
|
|
701
971
|
|
|
702
972
|
def select_form_fields(self) -> List[FormFieldObject]:
|
|
703
973
|
"""
|
|
@@ -711,17 +981,23 @@ class PDFDancer:
|
|
|
711
981
|
"""
|
|
712
982
|
return self._to_form_field_objects(self._find_form_fields(Position.by_name(field_name)))
|
|
713
983
|
|
|
714
|
-
def _find_form_fields(self, position: Optional[Position] = None) -> List[
|
|
984
|
+
def _find_form_fields(self, position: Optional[Position] = None, tolerance: float = DEFAULT_TOLERANCE) -> List[
|
|
985
|
+
FormFieldRef]:
|
|
715
986
|
"""
|
|
716
987
|
Searches for form fields at the specified position.
|
|
717
988
|
Returns FormFieldRef objects with name and value properties.
|
|
989
|
+
Uses snapshot cache for all queries (including name and spatial filtering).
|
|
718
990
|
"""
|
|
719
|
-
|
|
720
|
-
|
|
721
|
-
|
|
722
|
-
|
|
723
|
-
|
|
724
|
-
|
|
991
|
+
# Use snapshot for all queries (including name and spatial)
|
|
992
|
+
if position and position.page_index is not None:
|
|
993
|
+
snapshot = self._get_or_fetch_page_snapshot(position.page_index)
|
|
994
|
+
return self._filter_snapshot_elements(snapshot.elements, ObjectType.FORM_FIELD, position, tolerance)
|
|
995
|
+
else:
|
|
996
|
+
snapshot = self._get_or_fetch_document_snapshot()
|
|
997
|
+
all_elements = []
|
|
998
|
+
for page_snap in snapshot.pages:
|
|
999
|
+
all_elements.extend(page_snap.elements)
|
|
1000
|
+
return self._filter_snapshot_elements(all_elements, ObjectType.FORM_FIELD, position, tolerance)
|
|
725
1001
|
|
|
726
1002
|
def _change_form_field(self, form_field_ref: FormFieldRef, new_value: str) -> bool:
|
|
727
1003
|
"""
|
|
@@ -730,9 +1006,12 @@ class PDFDancer:
|
|
|
730
1006
|
if form_field_ref is None:
|
|
731
1007
|
raise ValidationException("Form field reference cannot be null")
|
|
732
1008
|
|
|
733
|
-
|
|
734
|
-
|
|
735
|
-
|
|
1009
|
+
try:
|
|
1010
|
+
request_data = ChangeFormFieldRequest(form_field_ref, new_value).to_dict()
|
|
1011
|
+
response = self._make_request('PUT', '/pdf/modify/formField', data=request_data)
|
|
1012
|
+
return response.json()
|
|
1013
|
+
finally:
|
|
1014
|
+
self._invalidate_snapshots()
|
|
736
1015
|
|
|
737
1016
|
def select_paths(self) -> List[ObjectRef]:
|
|
738
1017
|
"""
|
|
@@ -740,21 +1019,45 @@ class PDFDancer:
|
|
|
740
1019
|
"""
|
|
741
1020
|
return self._find(ObjectType.PATH, None)
|
|
742
1021
|
|
|
743
|
-
def _find_paths(self, position: Optional[Position] = None) -> List[ObjectRef]:
|
|
1022
|
+
def _find_paths(self, position: Optional[Position] = None, tolerance: float = DEFAULT_TOLERANCE) -> List[ObjectRef]:
|
|
744
1023
|
"""
|
|
745
1024
|
Searches for vector path objects at the specified position.
|
|
746
|
-
|
|
747
|
-
|
|
1025
|
+
Note: Spatial queries (with bounding_rect) fall back to API since snapshots
|
|
1026
|
+
don't include full vector path data needed for precise intersection tests.
|
|
1027
|
+
"""
|
|
1028
|
+
# Special case: paths at specific coordinates need full vector data
|
|
1029
|
+
# which is not available in snapshots, so pass through to API
|
|
1030
|
+
if position and position.bounding_rect:
|
|
1031
|
+
return self._find(ObjectType.PATH, position, tolerance)
|
|
1032
|
+
|
|
1033
|
+
# For simple page-level "all paths" queries, use snapshot
|
|
1034
|
+
if position and position.page_index is not None:
|
|
1035
|
+
snapshot = self._get_or_fetch_page_snapshot(position.page_index)
|
|
1036
|
+
return self._filter_snapshot_elements(snapshot.elements, ObjectType.PATH, position, tolerance)
|
|
1037
|
+
else:
|
|
1038
|
+
# Document-level query - use document snapshot
|
|
1039
|
+
snapshot = self._get_or_fetch_document_snapshot()
|
|
1040
|
+
all_elements = []
|
|
1041
|
+
for page_snap in snapshot.pages:
|
|
1042
|
+
all_elements.extend(page_snap.elements)
|
|
1043
|
+
return self._filter_snapshot_elements(all_elements, ObjectType.PATH, position, tolerance)
|
|
748
1044
|
|
|
749
|
-
def _find_text_lines(self, position: Optional[Position] = None) -> List[
|
|
1045
|
+
def _find_text_lines(self, position: Optional[Position] = None, tolerance: float = DEFAULT_TOLERANCE) -> List[
|
|
1046
|
+
TextObjectRef]:
|
|
750
1047
|
"""
|
|
751
1048
|
Searches for text line objects returning TextObjectRef with hierarchical structure.
|
|
1049
|
+
Uses snapshot cache for all queries.
|
|
752
1050
|
"""
|
|
753
|
-
|
|
754
|
-
|
|
755
|
-
|
|
756
|
-
|
|
757
|
-
|
|
1051
|
+
# Use snapshot for all queries (including spatial)
|
|
1052
|
+
if position and position.page_index is not None:
|
|
1053
|
+
snapshot = self._get_or_fetch_page_snapshot(position.page_index)
|
|
1054
|
+
return self._filter_snapshot_elements(snapshot.elements, ObjectType.TEXT_LINE, position, tolerance)
|
|
1055
|
+
else:
|
|
1056
|
+
snapshot = self._get_or_fetch_document_snapshot()
|
|
1057
|
+
all_elements = []
|
|
1058
|
+
for page_snap in snapshot.pages:
|
|
1059
|
+
all_elements.extend(page_snap.elements)
|
|
1060
|
+
return self._filter_snapshot_elements(all_elements, ObjectType.TEXT_LINE, position, tolerance)
|
|
758
1061
|
|
|
759
1062
|
def select_text_lines(self) -> List[TextLineObject]:
|
|
760
1063
|
"""
|
|
@@ -764,7 +1067,7 @@ class PDFDancer:
|
|
|
764
1067
|
|
|
765
1068
|
def page(self, page_index: int) -> PageClient:
|
|
766
1069
|
"""
|
|
767
|
-
Get a specific page by index,
|
|
1070
|
+
Get a specific page by index, using snapshot cache when available.
|
|
768
1071
|
|
|
769
1072
|
Args:
|
|
770
1073
|
page_index: The 0-based page index
|
|
@@ -772,11 +1075,16 @@ class PDFDancer:
|
|
|
772
1075
|
Returns:
|
|
773
1076
|
PageClient with page properties populated
|
|
774
1077
|
"""
|
|
1078
|
+
# Try to get page ref from snapshot first (avoids API call)
|
|
1079
|
+
page_snapshot = self._get_or_fetch_page_snapshot(page_index)
|
|
1080
|
+
if page_snapshot and page_snapshot.page_ref:
|
|
1081
|
+
return PageClient.from_ref(self, page_snapshot.page_ref)
|
|
1082
|
+
|
|
1083
|
+
# Fallback to API if snapshot doesn't have page ref
|
|
775
1084
|
page_ref = self._get_page(page_index)
|
|
776
1085
|
if page_ref:
|
|
777
1086
|
return PageClient.from_ref(self, page_ref)
|
|
778
1087
|
else:
|
|
779
|
-
# Fallback to basic PageClient if page not found
|
|
780
1088
|
return PageClient(page_index, self)
|
|
781
1089
|
|
|
782
1090
|
# Page Operations
|
|
@@ -786,11 +1094,11 @@ class PDFDancer:
|
|
|
786
1094
|
|
|
787
1095
|
def _get_pages(self) -> List[PageRef]:
|
|
788
1096
|
"""
|
|
789
|
-
Retrieves references to all pages in the PDF document.
|
|
1097
|
+
Retrieves references to all pages in the PDF document using snapshot cache.
|
|
790
1098
|
"""
|
|
791
|
-
|
|
792
|
-
|
|
793
|
-
return [
|
|
1099
|
+
# Use document snapshot which includes all pages (avoids API call)
|
|
1100
|
+
doc_snapshot = self._get_or_fetch_document_snapshot()
|
|
1101
|
+
return [page_snap.page_ref for page_snap in doc_snapshot.pages]
|
|
794
1102
|
|
|
795
1103
|
def _get_page(self, page_index: int) -> Optional[PageRef]:
|
|
796
1104
|
"""
|
|
@@ -830,7 +1138,13 @@ class PDFDancer:
|
|
|
830
1138
|
request_data = page_ref.to_dict()
|
|
831
1139
|
|
|
832
1140
|
response = self._make_request('DELETE', '/pdf/page/delete', data=request_data)
|
|
833
|
-
|
|
1141
|
+
result = response.json()
|
|
1142
|
+
|
|
1143
|
+
# Invalidate snapshot caches after mutation
|
|
1144
|
+
if result:
|
|
1145
|
+
self._invalidate_snapshots()
|
|
1146
|
+
|
|
1147
|
+
return result
|
|
834
1148
|
|
|
835
1149
|
def move_page(self, from_page_index: int, to_page_index: int) -> bool:
|
|
836
1150
|
"""Move a page to a different index within the document."""
|
|
@@ -849,6 +1163,11 @@ class PDFDancer:
|
|
|
849
1163
|
request_data = PageMoveRequest(from_page_index, to_page_index).to_dict()
|
|
850
1164
|
response = self._make_request('PUT', '/pdf/page/move', data=request_data)
|
|
851
1165
|
result = response.json()
|
|
1166
|
+
|
|
1167
|
+
# Invalidate snapshot caches after mutation
|
|
1168
|
+
if result:
|
|
1169
|
+
self._invalidate_snapshots()
|
|
1170
|
+
|
|
852
1171
|
return bool(result)
|
|
853
1172
|
|
|
854
1173
|
# Manipulation Operations
|
|
@@ -868,7 +1187,13 @@ class PDFDancer:
|
|
|
868
1187
|
|
|
869
1188
|
request_data = DeleteRequest(object_ref).to_dict()
|
|
870
1189
|
response = self._make_request('DELETE', '/pdf/delete', data=request_data)
|
|
871
|
-
|
|
1190
|
+
result = response.json()
|
|
1191
|
+
|
|
1192
|
+
# Invalidate snapshot caches after mutation
|
|
1193
|
+
if result:
|
|
1194
|
+
self._invalidate_snapshots()
|
|
1195
|
+
|
|
1196
|
+
return result
|
|
872
1197
|
|
|
873
1198
|
def _move(self, object_ref: ObjectRef, position: Position) -> bool:
|
|
874
1199
|
"""
|
|
@@ -888,7 +1213,13 @@ class PDFDancer:
|
|
|
888
1213
|
|
|
889
1214
|
request_data = MoveRequest(object_ref, position).to_dict()
|
|
890
1215
|
response = self._make_request('PUT', '/pdf/move', data=request_data)
|
|
891
|
-
|
|
1216
|
+
result = response.json()
|
|
1217
|
+
|
|
1218
|
+
# Invalidate snapshot caches after mutation
|
|
1219
|
+
if result:
|
|
1220
|
+
self._invalidate_snapshots()
|
|
1221
|
+
|
|
1222
|
+
return result
|
|
892
1223
|
|
|
893
1224
|
# Add Operations
|
|
894
1225
|
|
|
@@ -935,24 +1266,58 @@ class PDFDancer:
|
|
|
935
1266
|
|
|
936
1267
|
return self._add_object(paragraph)
|
|
937
1268
|
|
|
1269
|
+
def _add_path(self, path: 'Path') -> bool:
|
|
1270
|
+
"""
|
|
1271
|
+
Internal method to add a path to the document after validation.
|
|
1272
|
+
"""
|
|
1273
|
+
from .models import Path as PathModel
|
|
1274
|
+
|
|
1275
|
+
if path is None:
|
|
1276
|
+
raise ValidationException("Path cannot be null")
|
|
1277
|
+
if path.get_position() is None:
|
|
1278
|
+
raise ValidationException("Path position is null")
|
|
1279
|
+
if path.get_position().page_index is None:
|
|
1280
|
+
raise ValidationException("Path position page index is null")
|
|
1281
|
+
if path.get_position().page_index < 0:
|
|
1282
|
+
raise ValidationException("Path position page index is less than 0")
|
|
1283
|
+
if not path.get_path_segments() or len(path.get_path_segments()) == 0:
|
|
1284
|
+
raise ValidationException("Path must have at least one segment")
|
|
1285
|
+
|
|
1286
|
+
return self._add_object(path)
|
|
1287
|
+
|
|
938
1288
|
def _add_object(self, pdf_object) -> bool:
|
|
939
1289
|
"""
|
|
940
1290
|
Internal method to add any PDF object.
|
|
941
1291
|
"""
|
|
942
1292
|
request_data = AddRequest(pdf_object).to_dict()
|
|
943
1293
|
response = self._make_request('POST', '/pdf/add', data=request_data)
|
|
944
|
-
|
|
1294
|
+
result = response.json()
|
|
1295
|
+
|
|
1296
|
+
# Invalidate snapshot caches after mutation
|
|
1297
|
+
if result:
|
|
1298
|
+
self._invalidate_snapshots()
|
|
1299
|
+
|
|
1300
|
+
return result
|
|
945
1301
|
|
|
946
1302
|
def new_paragraph(self) -> ParagraphBuilder:
|
|
947
1303
|
return ParagraphBuilder(self)
|
|
948
1304
|
|
|
949
1305
|
def new_page(self):
|
|
950
1306
|
response = self._make_request('POST', '/pdf/page/add', data=None)
|
|
951
|
-
|
|
1307
|
+
result = self._parse_page_ref(response.json())
|
|
1308
|
+
|
|
1309
|
+
# Invalidate snapshot caches after adding page
|
|
1310
|
+
self._invalidate_snapshots()
|
|
1311
|
+
|
|
1312
|
+
return result
|
|
952
1313
|
|
|
953
1314
|
def new_image(self) -> ImageBuilder:
|
|
954
1315
|
return ImageBuilder(self)
|
|
955
1316
|
|
|
1317
|
+
def new_path(self) -> 'PathBuilder':
|
|
1318
|
+
from .path_builder import PathBuilder
|
|
1319
|
+
return PathBuilder(self)
|
|
1320
|
+
|
|
956
1321
|
# Modify Operations
|
|
957
1322
|
def _modify_paragraph(self, object_ref: ObjectRef, new_paragraph: Union[Paragraph, str]) -> CommandResult:
|
|
958
1323
|
"""
|
|
@@ -974,12 +1339,16 @@ class PDFDancer:
|
|
|
974
1339
|
# Text modification - returns CommandResult
|
|
975
1340
|
request_data = ModifyTextRequest(object_ref, new_paragraph).to_dict()
|
|
976
1341
|
response = self._make_request('PUT', '/pdf/text/paragraph', data=request_data)
|
|
977
|
-
|
|
1342
|
+
result = CommandResult.from_dict(response.json())
|
|
978
1343
|
else:
|
|
979
1344
|
# Object modification
|
|
980
1345
|
request_data = ModifyRequest(object_ref, new_paragraph).to_dict()
|
|
981
1346
|
response = self._make_request('PUT', '/pdf/modify', data=request_data)
|
|
982
|
-
|
|
1347
|
+
result = CommandResult.from_dict(response.json())
|
|
1348
|
+
|
|
1349
|
+
# Invalidate snapshot caches after mutation
|
|
1350
|
+
self._invalidate_snapshots()
|
|
1351
|
+
return result
|
|
983
1352
|
|
|
984
1353
|
def _modify_text_line(self, object_ref: ObjectRef, new_text: str) -> CommandResult:
|
|
985
1354
|
"""
|
|
@@ -999,7 +1368,11 @@ class PDFDancer:
|
|
|
999
1368
|
|
|
1000
1369
|
request_data = ModifyTextRequest(object_ref, new_text).to_dict()
|
|
1001
1370
|
response = self._make_request('PUT', '/pdf/text/line', data=request_data)
|
|
1002
|
-
|
|
1371
|
+
result = CommandResult.from_dict(response.json())
|
|
1372
|
+
|
|
1373
|
+
# Invalidate snapshot caches after mutation
|
|
1374
|
+
self._invalidate_snapshots()
|
|
1375
|
+
return result
|
|
1003
1376
|
|
|
1004
1377
|
# Font Operations
|
|
1005
1378
|
|
|
@@ -1083,31 +1456,220 @@ class PDFDancer:
|
|
|
1083
1456
|
if DEBUG:
|
|
1084
1457
|
print(f"{time.time()}|POST /font/register - request size: {request_size} bytes")
|
|
1085
1458
|
|
|
1086
|
-
headers = {
|
|
1087
|
-
|
|
1459
|
+
headers = {
|
|
1460
|
+
'X-Session-Id': self._session_id,
|
|
1461
|
+
'X-Generated-At': _generate_timestamp()
|
|
1462
|
+
}
|
|
1463
|
+
response = self._client.post(
|
|
1088
1464
|
self._cleanup_url_path(self._base_url, "/font/register"),
|
|
1089
1465
|
files=files,
|
|
1090
1466
|
headers=headers,
|
|
1091
|
-
timeout=30
|
|
1092
|
-
verify=not DISABLE_SSL_VERIFY
|
|
1467
|
+
timeout=30
|
|
1093
1468
|
)
|
|
1094
1469
|
|
|
1095
1470
|
response_size = len(response.content)
|
|
1096
1471
|
if DEBUG:
|
|
1097
1472
|
print(f"{time.time()}|POST /font/register - response size: {response_size} bytes")
|
|
1098
1473
|
|
|
1474
|
+
_log_generated_at_header(response, "POST", "/font/register")
|
|
1099
1475
|
response.raise_for_status()
|
|
1100
1476
|
return response.text.strip()
|
|
1101
1477
|
|
|
1102
1478
|
except (IOError, OSError) as e:
|
|
1103
1479
|
raise PdfDancerException(f"Failed to read font file: {e}", cause=e)
|
|
1104
|
-
except
|
|
1105
|
-
error_message = self._extract_error_message(
|
|
1480
|
+
except httpx.HTTPStatusError as e:
|
|
1481
|
+
error_message = self._extract_error_message(e.response)
|
|
1106
1482
|
raise HttpClientException(f"Font registration failed: {error_message}",
|
|
1107
|
-
response=
|
|
1483
|
+
response=e.response, cause=e) from None
|
|
1484
|
+
except httpx.RequestError as e:
|
|
1485
|
+
raise HttpClientException(f"Font registration failed: {str(e)}",
|
|
1486
|
+
response=None, cause=e) from None
|
|
1108
1487
|
|
|
1109
1488
|
# Document Operations
|
|
1110
1489
|
|
|
1490
|
+
# Snapshot Operations
|
|
1491
|
+
|
|
1492
|
+
def get_document_snapshot(self, types: Optional[str] = None) -> DocumentSnapshot:
|
|
1493
|
+
"""
|
|
1494
|
+
Retrieve a snapshot of the entire document with all pages and elements.
|
|
1495
|
+
|
|
1496
|
+
Args:
|
|
1497
|
+
types: Optional comma-separated string of object types to filter (e.g., "PARAGRAPH,IMAGE")
|
|
1498
|
+
|
|
1499
|
+
Returns:
|
|
1500
|
+
DocumentSnapshot containing page count, fonts, and all page snapshots
|
|
1501
|
+
"""
|
|
1502
|
+
params = {}
|
|
1503
|
+
if types:
|
|
1504
|
+
params['types'] = types
|
|
1505
|
+
|
|
1506
|
+
response = self._make_request('GET', '/pdf/document/snapshot', params=params)
|
|
1507
|
+
data = response.json()
|
|
1508
|
+
|
|
1509
|
+
return self._parse_document_snapshot(data)
|
|
1510
|
+
|
|
1511
|
+
def get_page_snapshot(self, page_index: int, types: Optional[str] = None) -> PageSnapshot:
|
|
1512
|
+
"""
|
|
1513
|
+
Retrieve a snapshot of a specific page with all its elements.
|
|
1514
|
+
|
|
1515
|
+
Args:
|
|
1516
|
+
page_index: The index of the page to snapshot (0-based)
|
|
1517
|
+
types: Optional comma-separated string of object types to filter (e.g., "PARAGRAPH,IMAGE")
|
|
1518
|
+
|
|
1519
|
+
Returns:
|
|
1520
|
+
PageSnapshot containing page reference and all elements on that page
|
|
1521
|
+
"""
|
|
1522
|
+
if page_index < 0:
|
|
1523
|
+
raise ValidationException(f"Page index must be >= 0, got {page_index}")
|
|
1524
|
+
|
|
1525
|
+
params = {}
|
|
1526
|
+
if types:
|
|
1527
|
+
params['types'] = types
|
|
1528
|
+
|
|
1529
|
+
response = self._make_request('GET', f'/pdf/page/{page_index}/snapshot', params=params)
|
|
1530
|
+
data = response.json()
|
|
1531
|
+
|
|
1532
|
+
return self._parse_page_snapshot(data)
|
|
1533
|
+
|
|
1534
|
+
def _get_or_fetch_document_snapshot(self) -> DocumentSnapshot:
|
|
1535
|
+
"""
|
|
1536
|
+
Get document snapshot from cache or fetch if not cached.
|
|
1537
|
+
This is used internally by select_* methods for optimization.
|
|
1538
|
+
Also caches individual page snapshots from the document snapshot.
|
|
1539
|
+
"""
|
|
1540
|
+
if self._document_snapshot is None:
|
|
1541
|
+
self._document_snapshot = self.get_document_snapshot()
|
|
1542
|
+
# Cache individual page snapshots from document snapshot
|
|
1543
|
+
for i, page_snapshot in enumerate(self._document_snapshot.pages):
|
|
1544
|
+
if i not in self._page_snapshots:
|
|
1545
|
+
self._page_snapshots[i] = page_snapshot
|
|
1546
|
+
return self._document_snapshot
|
|
1547
|
+
|
|
1548
|
+
def _get_or_fetch_page_snapshot(self, page_index: int) -> PageSnapshot:
|
|
1549
|
+
"""
|
|
1550
|
+
Get page snapshot from cache or fetch if not cached.
|
|
1551
|
+
This is used internally by select_* methods for optimization.
|
|
1552
|
+
If document snapshot exists, uses page from it instead of making separate API call.
|
|
1553
|
+
"""
|
|
1554
|
+
# Check if already cached
|
|
1555
|
+
if page_index in self._page_snapshots:
|
|
1556
|
+
return self._page_snapshots[page_index]
|
|
1557
|
+
|
|
1558
|
+
# If document snapshot exists, get page from it (no API call needed)
|
|
1559
|
+
if self._document_snapshot is not None:
|
|
1560
|
+
if 0 <= page_index < len(self._document_snapshot.pages):
|
|
1561
|
+
page_snapshot = self._document_snapshot.pages[page_index]
|
|
1562
|
+
self._page_snapshots[page_index] = page_snapshot
|
|
1563
|
+
return page_snapshot
|
|
1564
|
+
|
|
1565
|
+
# Otherwise fetch page snapshot individually
|
|
1566
|
+
self._page_snapshots[page_index] = self.get_page_snapshot(page_index)
|
|
1567
|
+
return self._page_snapshots[page_index]
|
|
1568
|
+
|
|
1569
|
+
def _invalidate_snapshots(self) -> None:
|
|
1570
|
+
"""
|
|
1571
|
+
Clear all snapshot caches.
|
|
1572
|
+
Called after mutations (delete, move, modify) to ensure fresh data on next select.
|
|
1573
|
+
"""
|
|
1574
|
+
self._document_snapshot = None
|
|
1575
|
+
self._page_snapshots.clear()
|
|
1576
|
+
|
|
1577
|
+
def _filter_snapshot_elements(self, elements: List, object_type: ObjectType,
|
|
1578
|
+
position: Optional[Position] = None, tolerance: float = DEFAULT_TOLERANCE) -> List:
|
|
1579
|
+
"""
|
|
1580
|
+
Filter snapshot elements client-side based on object type and position criteria.
|
|
1581
|
+
|
|
1582
|
+
Args:
|
|
1583
|
+
elements: List of elements from snapshot (ObjectRef, TextObjectRef, etc.)
|
|
1584
|
+
object_type: Type to filter for
|
|
1585
|
+
position: Optional position filter with text matching, bounding rect, etc.
|
|
1586
|
+
tolerance: Tolerance in points for spatial matching (default: 10.0)
|
|
1587
|
+
|
|
1588
|
+
Returns:
|
|
1589
|
+
Filtered list of elements matching the criteria
|
|
1590
|
+
"""
|
|
1591
|
+
import re
|
|
1592
|
+
|
|
1593
|
+
# Filter by object type (handle form field subtypes)
|
|
1594
|
+
if object_type == ObjectType.FORM_FIELD:
|
|
1595
|
+
# Form fields include TEXT_FIELD, CHECK_BOX, RADIO_BUTTON, BUTTON, DROPDOWN
|
|
1596
|
+
form_field_types = {ObjectType.FORM_FIELD, ObjectType.TEXT_FIELD,
|
|
1597
|
+
ObjectType.CHECK_BOX, ObjectType.RADIO_BUTTON,
|
|
1598
|
+
ObjectType.BUTTON, ObjectType.DROPDOWN}
|
|
1599
|
+
filtered = [e for e in elements if e.type in form_field_types]
|
|
1600
|
+
else:
|
|
1601
|
+
filtered = [e for e in elements if e.type == object_type]
|
|
1602
|
+
|
|
1603
|
+
if position is None:
|
|
1604
|
+
return filtered
|
|
1605
|
+
|
|
1606
|
+
# Apply position filters
|
|
1607
|
+
result = filtered
|
|
1608
|
+
|
|
1609
|
+
# Text starts with filter (case-insensitive to match API behavior)
|
|
1610
|
+
if position.text_starts_with:
|
|
1611
|
+
search_text = position.text_starts_with.lower()
|
|
1612
|
+
result = [
|
|
1613
|
+
e for e in result
|
|
1614
|
+
if isinstance(e, TextObjectRef) and e.text and e.text.lower().startswith(search_text)
|
|
1615
|
+
]
|
|
1616
|
+
|
|
1617
|
+
# Regex pattern filter
|
|
1618
|
+
if position.text_pattern:
|
|
1619
|
+
pattern = re.compile(position.text_pattern)
|
|
1620
|
+
result = [
|
|
1621
|
+
e for e in result
|
|
1622
|
+
if isinstance(e, TextObjectRef) and e.text and pattern.search(e.text)
|
|
1623
|
+
]
|
|
1624
|
+
|
|
1625
|
+
# Bounding rect filter (spatial queries like at(x, y))
|
|
1626
|
+
if position.bounding_rect:
|
|
1627
|
+
rect = position.bounding_rect
|
|
1628
|
+
result = [
|
|
1629
|
+
e for e in result
|
|
1630
|
+
if e.position and e.position.bounding_rect and
|
|
1631
|
+
self._rects_intersect(e.position.bounding_rect, rect, tolerance)
|
|
1632
|
+
]
|
|
1633
|
+
|
|
1634
|
+
# Name filter (for form fields)
|
|
1635
|
+
if position.name:
|
|
1636
|
+
from .models import FormFieldRef
|
|
1637
|
+
result = [
|
|
1638
|
+
e for e in result
|
|
1639
|
+
if isinstance(e, FormFieldRef) and e.name == position.name
|
|
1640
|
+
]
|
|
1641
|
+
|
|
1642
|
+
return result
|
|
1643
|
+
|
|
1644
|
+
@staticmethod
|
|
1645
|
+
def _rects_intersect(rect1, rect2, tolerance: float = DEFAULT_TOLERANCE) -> bool:
|
|
1646
|
+
"""
|
|
1647
|
+
Check if two bounding rectangles intersect or are very close.
|
|
1648
|
+
Handles point queries (width/height = 0) with tolerance.
|
|
1649
|
+
|
|
1650
|
+
Args:
|
|
1651
|
+
rect1: First bounding rectangle
|
|
1652
|
+
rect2: Second bounding rectangle
|
|
1653
|
+
tolerance: Tolerance in points for position matching (default: 10.0)
|
|
1654
|
+
"""
|
|
1655
|
+
# Get effective bounds with tolerance
|
|
1656
|
+
r1_left = rect1.x - tolerance
|
|
1657
|
+
r1_right = rect1.x + rect1.width + tolerance
|
|
1658
|
+
r1_top = rect1.y - tolerance
|
|
1659
|
+
r1_bottom = rect1.y + rect1.height + tolerance
|
|
1660
|
+
|
|
1661
|
+
r2_left = rect2.x - tolerance
|
|
1662
|
+
r2_right = rect2.x + rect2.width + tolerance
|
|
1663
|
+
r2_top = rect2.y - tolerance
|
|
1664
|
+
r2_bottom = rect2.y + rect2.height + tolerance
|
|
1665
|
+
|
|
1666
|
+
# Check if rectangles overlap
|
|
1667
|
+
if r1_right < r2_left or r2_right < r1_left:
|
|
1668
|
+
return False
|
|
1669
|
+
if r1_bottom < r2_top or r2_bottom < r1_top:
|
|
1670
|
+
return False
|
|
1671
|
+
return True
|
|
1672
|
+
|
|
1111
1673
|
def get_bytes(self) -> bytes:
|
|
1112
1674
|
"""
|
|
1113
1675
|
Downloads the current state of the PDF document with all modifications applied.
|
|
@@ -1298,6 +1860,175 @@ class PDFDancer:
|
|
|
1298
1860
|
orientation=orientation
|
|
1299
1861
|
)
|
|
1300
1862
|
|
|
1863
|
+
def _parse_path_segment(self, segment_data: dict) -> 'PathSegment':
|
|
1864
|
+
"""Parse JSON data into PathSegment instance (Line or Bezier)."""
|
|
1865
|
+
from .models import Line, Bezier, PathSegment, Point, Color
|
|
1866
|
+
|
|
1867
|
+
segment_type = segment_data.get('segmentType', segment_data.get('type', '')).upper()
|
|
1868
|
+
|
|
1869
|
+
# Parse common properties
|
|
1870
|
+
stroke_color = None
|
|
1871
|
+
stroke_color_data = segment_data.get('strokeColor')
|
|
1872
|
+
if isinstance(stroke_color_data, dict):
|
|
1873
|
+
r = stroke_color_data.get('red', 0)
|
|
1874
|
+
g = stroke_color_data.get('green', 0)
|
|
1875
|
+
b = stroke_color_data.get('blue', 0)
|
|
1876
|
+
a = stroke_color_data.get('alpha', 255)
|
|
1877
|
+
if all(isinstance(v, int) for v in [r, g, b]):
|
|
1878
|
+
stroke_color = Color(r, g, b, a)
|
|
1879
|
+
|
|
1880
|
+
fill_color = None
|
|
1881
|
+
fill_color_data = segment_data.get('fillColor')
|
|
1882
|
+
if isinstance(fill_color_data, dict):
|
|
1883
|
+
r = fill_color_data.get('red', 0)
|
|
1884
|
+
g = fill_color_data.get('green', 0)
|
|
1885
|
+
b = fill_color_data.get('blue', 0)
|
|
1886
|
+
a = fill_color_data.get('alpha', 255)
|
|
1887
|
+
if all(isinstance(v, int) for v in [r, g, b]):
|
|
1888
|
+
fill_color = Color(r, g, b, a)
|
|
1889
|
+
|
|
1890
|
+
stroke_width = segment_data.get('strokeWidth')
|
|
1891
|
+
dash_array = segment_data.get('dashArray')
|
|
1892
|
+
dash_phase = segment_data.get('dashPhase')
|
|
1893
|
+
|
|
1894
|
+
# Parse specific segment type
|
|
1895
|
+
if segment_type == 'LINE':
|
|
1896
|
+
p0_data = segment_data.get('p0', {})
|
|
1897
|
+
p1_data = segment_data.get('p1', {})
|
|
1898
|
+
|
|
1899
|
+
p0 = Point(p0_data.get('x', 0.0), p0_data.get('y', 0.0)) if p0_data else None
|
|
1900
|
+
p1 = Point(p1_data.get('x', 0.0), p1_data.get('y', 0.0)) if p1_data else None
|
|
1901
|
+
|
|
1902
|
+
return Line(
|
|
1903
|
+
stroke_color=stroke_color,
|
|
1904
|
+
fill_color=fill_color,
|
|
1905
|
+
stroke_width=stroke_width,
|
|
1906
|
+
dash_array=dash_array,
|
|
1907
|
+
dash_phase=dash_phase,
|
|
1908
|
+
p0=p0,
|
|
1909
|
+
p1=p1
|
|
1910
|
+
)
|
|
1911
|
+
elif segment_type == 'BEZIER':
|
|
1912
|
+
p0_data = segment_data.get('p0', {})
|
|
1913
|
+
p1_data = segment_data.get('p1', {})
|
|
1914
|
+
p2_data = segment_data.get('p2', {})
|
|
1915
|
+
p3_data = segment_data.get('p3', {})
|
|
1916
|
+
|
|
1917
|
+
p0 = Point(p0_data.get('x', 0.0), p0_data.get('y', 0.0)) if p0_data else None
|
|
1918
|
+
p1 = Point(p1_data.get('x', 0.0), p1_data.get('y', 0.0)) if p1_data else None
|
|
1919
|
+
p2 = Point(p2_data.get('x', 0.0), p2_data.get('y', 0.0)) if p2_data else None
|
|
1920
|
+
p3 = Point(p3_data.get('x', 0.0), p3_data.get('y', 0.0)) if p3_data else None
|
|
1921
|
+
|
|
1922
|
+
return Bezier(
|
|
1923
|
+
stroke_color=stroke_color,
|
|
1924
|
+
fill_color=fill_color,
|
|
1925
|
+
stroke_width=stroke_width,
|
|
1926
|
+
dash_array=dash_array,
|
|
1927
|
+
dash_phase=dash_phase,
|
|
1928
|
+
p0=p0,
|
|
1929
|
+
p1=p1,
|
|
1930
|
+
p2=p2,
|
|
1931
|
+
p3=p3
|
|
1932
|
+
)
|
|
1933
|
+
else:
|
|
1934
|
+
# Fallback to base PathSegment for unknown types
|
|
1935
|
+
return PathSegment(
|
|
1936
|
+
stroke_color=stroke_color,
|
|
1937
|
+
fill_color=fill_color,
|
|
1938
|
+
stroke_width=stroke_width,
|
|
1939
|
+
dash_array=dash_array,
|
|
1940
|
+
dash_phase=dash_phase
|
|
1941
|
+
)
|
|
1942
|
+
|
|
1943
|
+
def _parse_path(self, obj_data: dict) -> 'Path':
|
|
1944
|
+
"""Parse JSON data into Path instance with path segments."""
|
|
1945
|
+
from .models import Path
|
|
1946
|
+
|
|
1947
|
+
position_data = obj_data.get('position', {})
|
|
1948
|
+
position = self._parse_position(position_data) if position_data else None
|
|
1949
|
+
|
|
1950
|
+
# Parse path segments
|
|
1951
|
+
path_segments = []
|
|
1952
|
+
segments_data = obj_data.get('pathSegments', [])
|
|
1953
|
+
if isinstance(segments_data, list):
|
|
1954
|
+
for segment_data in segments_data:
|
|
1955
|
+
if isinstance(segment_data, dict):
|
|
1956
|
+
path_segments.append(self._parse_path_segment(segment_data))
|
|
1957
|
+
|
|
1958
|
+
even_odd_fill = obj_data.get('evenOddFill')
|
|
1959
|
+
|
|
1960
|
+
return Path(
|
|
1961
|
+
position=position,
|
|
1962
|
+
path_segments=path_segments if path_segments else None,
|
|
1963
|
+
even_odd_fill=even_odd_fill
|
|
1964
|
+
)
|
|
1965
|
+
|
|
1966
|
+
def _parse_font_recommendation(self, data: dict) -> FontRecommendation:
|
|
1967
|
+
"""Parse JSON data into FontRecommendation instance."""
|
|
1968
|
+
font_type_str = data.get('fontType', 'SYSTEM')
|
|
1969
|
+
font_type = FontType(font_type_str)
|
|
1970
|
+
|
|
1971
|
+
return FontRecommendation(
|
|
1972
|
+
font_name=data.get('fontName', ''),
|
|
1973
|
+
font_type=font_type,
|
|
1974
|
+
similarity_score=data.get('similarityScore', 0.0)
|
|
1975
|
+
)
|
|
1976
|
+
|
|
1977
|
+
def _parse_page_snapshot(self, data: dict) -> PageSnapshot:
    """Parse JSON data into PageSnapshot instance with proper type handling.

    Elements are dispatched to the parser matching their 'type' string:
    text-like types -> TextObjectRef, form-field types -> FormFieldRef,
    everything else -> plain ObjectRef. Elements with a missing or
    unrecognized type are silently skipped.

    :param data: JSON dict with 'pageRef' and an 'elements' list.
    :return: PageSnapshot with the parsed page reference and elements.
    """
    # Hoisted out of the loop: re-running the import statement per
    # element was wasteful; deepcopy is now only paid when needed.
    import copy

    page_ref = self._parse_page_ref(data.get('pageRef', {}))

    elements = []
    for elem_data in data.get('elements', []):
        elem_type_str = elem_data.get('type')
        if not elem_type_str:
            continue

        try:
            # Normalize type string (API returns "CHECKBOX" but enum is
            # "CHECK_BOX"). Copy only when we must rewrite the type, so
            # the caller's data is never mutated and the common path
            # avoids an unnecessary deep copy.
            if elem_type_str == "CHECKBOX":
                elem_type_str = "CHECK_BOX"
                elem_data = copy.deepcopy(elem_data)
                elem_data['type'] = elem_type_str

            elem_type = ObjectType(elem_type_str)

            if elem_type in (ObjectType.PARAGRAPH, ObjectType.TEXT_LINE):
                # TextObjectRef captures text, font, color, children.
                elements.append(self._parse_text_object_ref(elem_data))
            elif elem_type in (ObjectType.FORM_FIELD, ObjectType.TEXT_FIELD,
                               ObjectType.CHECK_BOX, ObjectType.RADIO_BUTTON,
                               ObjectType.BUTTON, ObjectType.DROPDOWN):
                # FormFieldRef captures the field name and value.
                elements.append(self._parse_form_field_ref(elem_data))
            else:
                # Parse as basic ObjectRef.
                elements.append(self._parse_object_ref(elem_data))
        except (ValueError, KeyError):
            # Skip elements whose type is not a known ObjectType.
            continue

    return PageSnapshot(
        page_ref=page_ref,
        elements=elements
    )
|
|
2019
|
+
|
|
2020
|
+
def _parse_document_snapshot(self, data: dict) -> DocumentSnapshot:
    """Assemble a DocumentSnapshot from its JSON representation."""
    # Delegate per-item parsing to the dedicated helpers.
    return DocumentSnapshot(
        page_count=data.get('pageCount', 0),
        fonts=[self._parse_font_recommendation(f) for f in data.get('fonts', [])],
        pages=[self._parse_page_snapshot(p) for p in data.get('pages', [])],
    )
|
|
2031
|
+
|
|
1301
2032
|
# Builder Pattern Support
|
|
1302
2033
|
|
|
1303
2034
|
def _paragraph_builder(self) -> 'ParagraphBuilder':
|
|
@@ -1316,9 +2047,17 @@ class PDFDancer:
|
|
|
1316
2047
|
|
|
1317
2048
|
def __exit__(self, exc_type, exc_val, exc_tb):
    """Context manager exit - cleanup if needed.

    Delegates to close() so the HTTP-client teardown logic lives in a
    single place instead of being duplicated here. Returns None
    implicitly, so exceptions raised inside the `with` body are never
    suppressed.
    """
    self.close()
    # TODO Could add session cleanup here if API supports it. Cleanup on the server
|
|
1321
2055
|
|
|
2056
|
+
def close(self):
    """Close the HTTP client and free resources.

    Safe to call even when construction failed before the client
    attribute existed; in that case this is a no-op.
    """
    # Guard clause: nothing to release if '_client' was never set.
    if not hasattr(self, '_client'):
        return
    self._client.close()
|
|
2060
|
+
|
|
1322
2061
|
def _to_path_objects(self, refs: List[ObjectRef]) -> List[PathObject]:
    """Wrap each raw reference in a client-bound PathObject facade."""
    wrapped = []
    for ref in refs:
        wrapped.append(PathObject(self, ref.internal_id, ref.type, ref.position))
    return wrapped
|
|
1324
2063
|
|