python-pdffiller 2.0.3__py3-none-any.whl → 2.0.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pdffiller/_version.py +1 -1
- pdffiller/cli/commands/sanitize.py +100 -0
- pdffiller/pdf.py +184 -3
- {python_pdffiller-2.0.3.dist-info → python_pdffiller-2.0.4.dist-info}/METADATA +1 -1
- {python_pdffiller-2.0.3.dist-info → python_pdffiller-2.0.4.dist-info}/RECORD +10 -9
- {python_pdffiller-2.0.3.dist-info → python_pdffiller-2.0.4.dist-info}/WHEEL +0 -0
- {python_pdffiller-2.0.3.dist-info → python_pdffiller-2.0.4.dist-info}/entry_points.txt +0 -0
- {python_pdffiller-2.0.3.dist-info → python_pdffiller-2.0.4.dist-info}/licenses/AUTHORS.rst +0 -0
- {python_pdffiller-2.0.3.dist-info → python_pdffiller-2.0.4.dist-info}/licenses/COPYING +0 -0
- {python_pdffiller-2.0.3.dist-info → python_pdffiller-2.0.4.dist-info}/top_level.txt +0 -0
pdffiller/_version.py
CHANGED
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import os
|
|
3
|
+
|
|
4
|
+
from pdffiller.cli.args import add_global_arguments
|
|
5
|
+
from pdffiller.cli.command import pdffiller_command, PdfFillerArgumentParser
|
|
6
|
+
from pdffiller.cli.once_argument import OnceArgument
|
|
7
|
+
from pdffiller.exceptions import (
|
|
8
|
+
AbortExecution,
|
|
9
|
+
CommandLineError,
|
|
10
|
+
FileNotExistsError,
|
|
11
|
+
PdfFillerException,
|
|
12
|
+
)
|
|
13
|
+
from pdffiller.io.output import cli_out_write, PdfFillerOutput
|
|
14
|
+
from pdffiller.pdf import Pdf
|
|
15
|
+
from pdffiller.typing import Any, Dict, List
|
|
16
|
+
|
|
17
|
+
from ..exit_codes import ERROR_ENCOUNTERED
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def sanitize_text_formatter(sanitized: List[Dict[str, Any]]) -> None:
    """Write the sanitized fields to the CLI output in a human-readable layout.

    Every field is introduced by a dashed separator line and followed by one
    ``key: value`` line per entry of its dictionary. When there is nothing to
    report (the argument is falsy), a single placeholder sentence is written
    instead.

    Args:
        sanitized: The field descriptions to print.
    """
    if sanitized:
        for entry in sanitized:
            # Separator marking the start of a new field.
            cli_out_write("----------")
            for label, content in entry.items():
                cli_out_write(f"{label}: {content}")
    else:
        cli_out_write("No fields were sanitized.")
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def sanitize_json_formatter(sanitized: List[Dict[str, Any]]) -> None:
    """Write the sanitized fields to the CLI output as a JSON document.

    The list is serialized with a four-space indent and non-ASCII characters
    are emitted as-is rather than escaped. Nothing at all is written when the
    argument is falsy.

    Args:
        sanitized: The field descriptions to serialize.
    """
    if sanitized:
        payload = json.dumps(sanitized, indent=4, ensure_ascii=False)
        cli_out_write(payload)
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
@pdffiller_command(
    group=None,
    formatters={"text": sanitize_text_formatter, "json": sanitize_json_formatter},
)
def sanitize(parser: PdfFillerArgumentParser, *args: Any) -> Any:
    """
    Sanitize a PDF by removing MaxLen constraints from text fields.
    This fixes PDFs where MaxLen is incorrectly set, causing text truncation.
    """
    # NOTE(review): the docstring above is kept word-for-word; presumably the
    # pdffiller_command decorator surfaces it as the command's help text —
    # confirm before rewording it.

    # Arguments are registered in the same order as before so the generated
    # usage/help listing does not change.
    options = parser.add_argument_group("options")

    options.add_argument(
        "-o",
        "--output",
        metavar="OUTPUT_PATH",
        type=str,
        help="""Path to the output PDF file.""",
        action=OnceArgument,
    )

    options.add_argument(
        "--deep",
        action="store_true",
        default=False,
        help="Perform a deep sanitization: also clear the Comb flag from text fields.",
    )

    # The input path is declared optional (nargs="?") and validated by hand
    # below, so that a missing value is reported as a CommandLineError.
    parser.add_argument(
        "file",
        metavar="INPUT_PATH",
        type=str,
        nargs="?",
        help="""Path to the input PDF file.""",
        action=OnceArgument,
    )

    add_global_arguments(options, True, parser)

    cli = parser.parse_args(*args)

    log = PdfFillerOutput()

    # Mandatory values, checked in this order: input path first, then output.
    required = (
        (cli.file, "no input file given"),
        (cli.output, "no output file path given"),
    )
    for provided, complaint in required:
        if not provided:
            raise CommandLineError(complaint)

    if not os.path.isfile(cli.file):
        raise FileNotExistsError(cli.file)

    try:
        report = Pdf().sanitize(cli.file, cli.output, cli.deep)
        log.info(f"sanitized pdf saved to {cli.output}")
        # The returned list is what the registered formatters receive.
        return report
    except PdfFillerException as known:
        # Library-level failures are reported; the command then yields None.
        log.error(str(known))
    except Exception as unexpected:  # pylint: disable=broad-except  # pragma: no cover
        log.error(f"unexpected error when sanitizing {cli.file} with the following error:")
        log.error(unexpected)
        raise AbortExecution(ERROR_ENCOUNTERED) from unexpected

    return None
|
pdffiller/pdf.py
CHANGED
|
@@ -88,7 +88,7 @@ class Pdf:
|
|
|
88
88
|
) from ex
|
|
89
89
|
|
|
90
90
|
for i, page in enumerate(doc.pages()):
|
|
91
|
-
output.verbose(f"loading page {i+1}/{doc.page_count}")
|
|
91
|
+
output.verbose(f"loading page {i + 1}/{doc.page_count}")
|
|
92
92
|
for widget in page.widgets():
|
|
93
93
|
button_states = widget.button_states()
|
|
94
94
|
choices = button_states["normal"] if button_states else None
|
|
@@ -105,7 +105,7 @@ class Pdf:
|
|
|
105
105
|
choices.insert(0, "Off")
|
|
106
106
|
new_widget.choices = [choice.replace("#20", " ") for choice in choices]
|
|
107
107
|
elif isinstance(new_widget, TextWidget):
|
|
108
|
-
new_widget.max_length = widget
|
|
108
|
+
new_widget.max_length = self._resolve_text_maxlen(doc, widget)
|
|
109
109
|
loaded_widgets[widget.field_name] = new_widget
|
|
110
110
|
else:
|
|
111
111
|
new_widget = loaded_widgets[widget.field_name]
|
|
@@ -128,6 +128,186 @@ class Pdf:
|
|
|
128
128
|
|
|
129
129
|
self.widgets = loaded_widgets
|
|
130
130
|
|
|
131
|
+
@staticmethod
def _resolve_text_maxlen(doc: pymupdf.Document, widget: Any) -> Optional[int]:
    """
    Resolve the MaxLen value for a text widget by walking up the PDF object hierarchy.

    PyMuPDF's widget.text_maxlen may not reflect the actual MaxLen when it is
    defined on a parent field dictionary rather than on the widget annotation itself.
    This method starts from widget.text_maxlen, then inspects the widget object and
    every ancestor reachable through /Parent for /MaxLen, keeping the largest
    integer value encountered.

    Args:
        doc: The pymupdf Document.
        widget: The pymupdf Widget.

    Returns:
        The largest max length found on the widget or its ancestors. When no
        larger /MaxLen entry exists, this is widget.text_maxlen unchanged.
    """
    max_length: Optional[int] = widget.text_maxlen
    xref = widget.xref
    # Track visited objects: a malformed PDF whose /Parent links form a cycle
    # (A -> B -> A) must not trap the walk in an endless loop.
    visited = set()
    while xref > 0 and xref not in visited:
        visited.add(xref)
        key_type, value = doc.xref_get_key(xref, "MaxLen")
        if key_type != "null" and value:
            try:
                int_value = int(value)
                if max_length is None or max_length < int_value:
                    max_length = int_value
            except (ValueError, TypeError):
                # The entry is not a plain integer; ignore it and keep walking.
                pass
        # Walk up to parent
        key_type, value = doc.xref_get_key(xref, "Parent")
        if key_type != "xref":
            break
        # An "xref" value looks like "12 0 R"; the first token is the object number.
        xref = int(value.split()[0])

    return max_length
|
|
170
|
+
|
|
171
|
+
COMB_FLAG = 1 << 24  # Bit position 25 (1-based) of /Ff in the PDF spec: the Comb option

@staticmethod
def _has_comb_flag(doc: pymupdf.Document, widget: Any) -> bool:
    """
    Check if the Comb flag is set on the widget or any of its parents.

    The widget object and every ancestor reachable through /Parent are
    inspected; the search stops at the first node whose /Ff value has the
    Comb bit set.

    Args:
        doc: The pymupdf Document.
        widget: The pymupdf Widget.

    Returns:
        True if the Comb flag is set on any node in the hierarchy.
    """
    xref = widget.xref
    # Track visited objects: a malformed PDF whose /Parent links form a cycle
    # (A -> B -> A) must not trap the walk in an endless loop.
    visited = set()
    while xref > 0 and xref not in visited:
        visited.add(xref)
        key_type, ff_value = doc.xref_get_key(xref, "Ff")
        if key_type != "null" and ff_value:
            try:
                if int(ff_value) & Pdf.COMB_FLAG:
                    return True
            except (ValueError, TypeError):
                # /Ff is not a plain integer on this node; treat it as unset.
                pass
        # Walk up to parent
        key_type, value = doc.xref_get_key(xref, "Parent")
        if key_type != "xref":
            break
        # An "xref" value looks like "12 0 R"; the first token is the object number.
        xref = int(value.split()[0])
    return False
|
|
203
|
+
|
|
204
|
+
@staticmethod
def _remove_maxlen(doc: pymupdf.Document, widget: Any, deep: bool = False) -> None:
    """
    Remove /MaxLen from the widget and all its parent field dictionaries.

    In deep mode, also clears the Comb flag (bit 25 of /Ff).
    In normal mode, /MaxLen is only removed on nodes where Comb is not set.
    Once the hierarchy has been processed, widget.text_maxlen is reset to 0.

    Args:
        doc: The pymupdf Document.
        widget: The pymupdf Widget.
        deep: If True, also clear the Comb flag from /Ff.
    """
    xref = widget.xref
    # Track visited objects: a malformed PDF whose /Parent links form a cycle
    # (A -> B -> A) must not trap the walk in an endless loop.
    visited = set()
    while xref > 0 and xref not in visited:
        visited.add(xref)

        # Check Comb flag on this node. `flags` is always bound so the
        # bit-clearing below never depends on a previous iteration's value.
        flags = 0
        has_comb = False
        key_type, ff_value = doc.xref_get_key(xref, "Ff")
        if key_type != "null" and ff_value:
            try:
                flags = int(ff_value)
                has_comb = bool(flags & Pdf.COMB_FLAG)
            except (ValueError, TypeError):
                # /Ff is not a plain integer on this node; treat Comb as unset.
                pass

        # Deep mode always drops /MaxLen; normal mode leaves it in place on
        # nodes that carry the Comb flag.
        if (deep or not has_comb) and doc.xref_get_key(xref, "MaxLen")[0] != "null":
            doc.xref_set_key(xref, "MaxLen", "null")

        # Only deep mode rewrites /Ff, with the Comb bit cleared.
        if deep and has_comb:
            doc.xref_set_key(xref, "Ff", str(flags & ~Pdf.COMB_FLAG))

        # Walk up to parent
        key_type, value = doc.xref_get_key(xref, "Parent")
        if key_type != "xref":
            break
        # An "xref" value looks like "12 0 R"; the first token is the object number.
        xref = int(value.split()[0])

    widget.text_maxlen = 0
|
|
250
|
+
|
|
251
|
+
def sanitize(
    self,
    input_file: PathLike,
    output_file: PathLike,
    deep: bool = False,
) -> List[Dict[str, Any]]:
    """
    Sanitize a PDF by removing /MaxLen constraints from all text fields.

    This fixes PDFs where /MaxLen is set incorrectly (e.g. 4 instead of 27),
    which causes text truncation when filling the form.

    Only text widgets reporting a non-zero text_maxlen are processed. In
    normal mode, widgets with the Comb flag set anywhere in their hierarchy
    are skipped; in deep mode they are processed and the flag is cleared.

    Args:
        input_file (PathLike): The input file path.
        output_file (PathLike): The output file path.
        deep: If True, also clear the Comb flag from /Ff.

    Returns:
        A list of dicts describing each sanitized field:
        [{"FieldName": ..., "FieldType": "text", "MaxLen": <old_value>}, ...]

    Raises:
        PdfFillerException: If the input file cannot be opened or the
            sanitized document cannot be written to output_file.
    """
    try:
        document = pymupdf.open(filename=input_file)
    except Exception as ex:
        PdfFillerOutput().error(str(ex))
        raise PdfFillerException(f"failed to open {input_file}") from ex

    output = PdfFillerOutput()
    output.info("sanitizing pdf text fields")

    sanitized: List[Dict[str, Any]] = []
    try:
        for page in document:
            for field in page.widgets():
                if (
                    field.field_type != pymupdf.PDF_WIDGET_TYPE_TEXT  # pylint: disable=no-member
                    or not field.text_maxlen
                ):
                    continue
                # Skip fields with Comb flag in normal mode
                if not deep and self._has_comb_flag(document, field):
                    output.verbose(f"skipping {field.field_name} (Comb flag active)")
                    continue
                # Capture the value before it is reset by _remove_maxlen.
                old_maxlen = field.text_maxlen
                output.verbose(f"removing MaxLen={old_maxlen} from {field.field_name}")
                self._remove_maxlen(document, field, deep)
                field.update()
                sanitized.append(
                    {
                        "FieldName": field.field_name,
                        "FieldType": field.field_type_string,
                        "MaxLen": old_maxlen,
                    }
                )

        try:
            document.save(output_file)
        except Exception as ex:
            # A failed save must not look like success to the caller, which
            # would otherwise report the output file as written.
            output.error(str(ex))
            raise PdfFillerException(f"failed to save {output_file}") from ex
    finally:
        # Release the underlying file handle whatever the outcome.
        document.close()

    return sanitized
|
|
310
|
+
|
|
131
311
|
@property
|
|
132
312
|
def schema(self) -> List[Dict[str, Any]]:
|
|
133
313
|
"""
|
|
@@ -217,11 +397,12 @@ class Pdf:
|
|
|
217
397
|
|
|
218
398
|
# Handling other fields types
|
|
219
399
|
else:
|
|
400
|
+
# Remove MaxLen constraint from the PDF object to avoid truncation
|
|
401
|
+
self._remove_maxlen(document, field)
|
|
220
402
|
output.verbose(
|
|
221
403
|
f"updating {field.field_name} with {value} from {field.field_value}"
|
|
222
404
|
)
|
|
223
405
|
field.field_value = value
|
|
224
|
-
|
|
225
406
|
# Update the widget!
|
|
226
407
|
field.update()
|
|
227
408
|
try:
|
|
@@ -1,9 +1,9 @@
|
|
|
1
1
|
pdffiller/__init__.py,sha256=0HtgXhEV1fKTKAcOXGcq4UsCFflDIPCDQvckshZf-1k,195
|
|
2
2
|
pdffiller/__main__.py,sha256=7NPQgZVx6VSZS7OrmyJQ_O1vL4wiSqhiILi-outwUqM,107
|
|
3
|
-
pdffiller/_version.py,sha256=
|
|
3
|
+
pdffiller/_version.py,sha256=4kVX4SuWyAER9NxjJsz0eyvUdNcPcxAXfDBhA6Min2Q,172
|
|
4
4
|
pdffiller/const.py,sha256=if_j5I8ftczpjrzZjA7idv-XpvIj1-XBF4oe6VtQvF0,434
|
|
5
5
|
pdffiller/exceptions.py,sha256=CdN93bZ0mBBS5vLxg14FYZUy4xkYqoD3_SzqtSkZr4g,1624
|
|
6
|
-
pdffiller/pdf.py,sha256=
|
|
6
|
+
pdffiller/pdf.py,sha256=Y-GWZC7jomgqdPS5mIhfEx12xtPKlEsaWptDaAVq2mQ,16217
|
|
7
7
|
pdffiller/py.typed.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
8
8
|
pdffiller/typing.py,sha256=4GJGevlU-YOR9fmuPT8jmyg5MBhn-2TB1K88b6C8VRw,937
|
|
9
9
|
pdffiller/utils.py,sha256=pmGf3QwkhKwosk_eFCauzHM-XHp_WGVQAtZlxa7taYY,827
|
|
@@ -19,6 +19,7 @@ pdffiller/cli/smart_formatter.py,sha256=59hwF07nKbp-P9IfbqKgMFsfbvjIw5SACCZpUF4n
|
|
|
19
19
|
pdffiller/cli/commands/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
20
20
|
pdffiller/cli/commands/dump_data_fields.py,sha256=o7LmFBdl9GfJOyZM1tOgeesx004QWIIXVp2_pX6FB1Q,2292
|
|
21
21
|
pdffiller/cli/commands/fill_form.py,sha256=5wjbjwYLsytY6ea-n_KDbU9UDmkN00NYj_gm25MQqJ8,4758
|
|
22
|
+
pdffiller/cli/commands/sanitize.py,sha256=wfcsgaTVVQKfnlHzJoRrYYYKczYjamk2uDbCUb8j-4U,2986
|
|
22
23
|
pdffiller/io/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
23
24
|
pdffiller/io/colors.py,sha256=QCBEWksTVNurOJQYO0zh1X_xxIOXxXmYSJhbCqnNjI8,1710
|
|
24
25
|
pdffiller/io/output.py,sha256=QMASWRWmfZGG9DdtlfpWXM3VJAMgWGQwPUzoYp_9FFY,11298
|
|
@@ -27,10 +28,10 @@ pdffiller/widgets/base.py,sha256=omGVQsQgMa-ALESnUd3_94oVIYScAMl0SPhHC_DG8Lg,361
|
|
|
27
28
|
pdffiller/widgets/checkbox.py,sha256=iijStLAsY1G4cljW3a9NxVS_8qxJewFEw-B8jU2aKXk,1711
|
|
28
29
|
pdffiller/widgets/radio.py,sha256=Db9Oc3Q8ge8qqTVPLoz3I1_SJBGyJ8KfA33ixZMr78c,1070
|
|
29
30
|
pdffiller/widgets/text.py,sha256=SiuyBvZPZ6idCmtZ_05zE26iN6Rz67OfOj1fUm98YQI,2397
|
|
30
|
-
python_pdffiller-2.0.
|
|
31
|
-
python_pdffiller-2.0.
|
|
32
|
-
python_pdffiller-2.0.
|
|
33
|
-
python_pdffiller-2.0.
|
|
34
|
-
python_pdffiller-2.0.
|
|
35
|
-
python_pdffiller-2.0.
|
|
36
|
-
python_pdffiller-2.0.
|
|
31
|
+
python_pdffiller-2.0.4.dist-info/licenses/AUTHORS.rst,sha256=1_hVzMKgmoXvGgrcZC7sIbU_6PvvkB6vwqevAqzrIkQ,205
|
|
32
|
+
python_pdffiller-2.0.4.dist-info/licenses/COPYING,sha256=ADPe-bH2wYq8nFf6EPJyovzTJyl3jSPnm09mGI8FSTo,1074
|
|
33
|
+
python_pdffiller-2.0.4.dist-info/METADATA,sha256=4on4QrrWJGfDsaakE1InU3ptSd2kQNECI8KTNeLyizM,4175
|
|
34
|
+
python_pdffiller-2.0.4.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
35
|
+
python_pdffiller-2.0.4.dist-info/entry_points.txt,sha256=RESKKpPPdWl0wDET96ntuFoUydALx9j0mxtbt-MEBjU,49
|
|
36
|
+
python_pdffiller-2.0.4.dist-info/top_level.txt,sha256=5MGWCBFYlu_Ax-I5PgQkV9Xw7O48maPe9z8Qj_yVPL4,10
|
|
37
|
+
python_pdffiller-2.0.4.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|