python-pdffiller 2.0.3__py3-none-any.whl → 2.0.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pdffiller/_version.py CHANGED
@@ -3,4 +3,4 @@ __copyright__ = "Copyright 2025 SISMIC"
3
3
  __email__ = "jraphanel@sismic.fr"
4
4
  __license__ = "MIT"
5
5
  __title__ = "pdffiller"
6
- __version__ = "2.0.3"
6
+ __version__ = "2.0.4"
@@ -0,0 +1,100 @@
1
+ import json
2
+ import os
3
+
4
+ from pdffiller.cli.args import add_global_arguments
5
+ from pdffiller.cli.command import pdffiller_command, PdfFillerArgumentParser
6
+ from pdffiller.cli.once_argument import OnceArgument
7
+ from pdffiller.exceptions import (
8
+ AbortExecution,
9
+ CommandLineError,
10
+ FileNotExistsError,
11
+ PdfFillerException,
12
+ )
13
+ from pdffiller.io.output import cli_out_write, PdfFillerOutput
14
+ from pdffiller.pdf import Pdf
15
+ from pdffiller.typing import Any, Dict, List
16
+
17
+ from ..exit_codes import ERROR_ENCOUNTERED
18
+
19
+
20
def sanitize_text_formatter(sanitized: List[Dict[str, Any]]) -> None:
    """
    Write the sanitized fields to the CLI output as plain text.

    Each field is introduced by a dashed separator line and followed by one
    ``key: value`` line per entry of its dictionary. A fixed notice is
    written instead when there is nothing to report.

    Args:
        sanitized: The field descriptions returned by the sanitize command.
    """
    if sanitized:
        for entry in sanitized:
            cli_out_write("----------")
            for label, content in entry.items():
                cli_out_write(f"{label}: {content}")
        return

    # Empty (or missing) result: tell the user explicitly.
    cli_out_write("No fields were sanitized.")
29
+
30
+
31
def sanitize_json_formatter(sanitized: List[Dict[str, Any]]) -> None:
    """
    Write the sanitized fields to the CLI output as a JSON document.

    Nothing at all is written when the list is empty (or missing).

    Args:
        sanitized: The field descriptions returned by the sanitize command.
    """
    if sanitized:
        # Keep non-ASCII field names readable instead of \u-escaping them.
        payload = json.dumps(sanitized, ensure_ascii=False, indent=4)
        cli_out_write(payload)
36
+
37
+
38
@pdffiller_command(
    group=None,
    formatters={"text": sanitize_text_formatter, "json": sanitize_json_formatter},
)
def sanitize(parser: PdfFillerArgumentParser, *args: Any) -> Any:
    """
    Sanitize a PDF by removing MaxLen constraints from text fields.
    This fixes PDFs where MaxLen is incorrectly set, causing text truncation.
    """
    # NOTE(review): the docstring above is kept verbatim; it is presumably
    # surfaced as the command's help text by the decorator - confirm before
    # rewording it.

    # Command-line interface: "-o/--output" and "--deep" live in the
    # "options" group, the input path is an optional positional.
    group = parser.add_argument_group("options")
    group.add_argument(
        "-o",
        "--output",
        action=OnceArgument,
        type=str,
        metavar="OUTPUT_PATH",
        help="Path to the output PDF file.",
    )
    group.add_argument(
        "--deep",
        action="store_true",
        default=False,
        help="Perform a deep sanitization: also clear the Comb flag from text fields.",
    )
    parser.add_argument(
        "file",
        action=OnceArgument,
        type=str,
        nargs="?",
        metavar="INPUT_PATH",
        help="Path to the input PDF file.",
    )
    add_global_arguments(group, True, parser)

    namespace = parser.parse_args(*args)
    output = PdfFillerOutput()

    # Both paths are declared optional to the parser, so they are validated
    # here in order to report a dedicated error message for each.
    input_path = namespace.file
    output_path = namespace.output
    if not input_path:
        raise CommandLineError("no input file given")
    if not output_path:
        raise CommandLineError("no output file path given")
    if not os.path.isfile(input_path):
        raise FileNotExistsError(input_path)

    try:
        report = Pdf().sanitize(input_path, output_path, namespace.deep)
        output.info(f"sanitized pdf saved to {output_path}")
        return report
    except PdfFillerException as known_error:
        # Library errors are reported, then the command falls through and
        # yields None.
        output.error(str(known_error))
    except Exception as unexpected:  # pylint: disable=broad-except  # pragma: no cover
        output.error(f"unexpected error when sanitizing {input_path} with the following error:")
        output.error(unexpected)
        raise AbortExecution(ERROR_ENCOUNTERED) from unexpected

    return None
pdffiller/pdf.py CHANGED
@@ -88,7 +88,7 @@ class Pdf:
88
88
  ) from ex
89
89
 
90
90
  for i, page in enumerate(doc.pages()):
91
- output.verbose(f"loading page {i+1}/{doc.page_count}")
91
+ output.verbose(f"loading page {i + 1}/{doc.page_count}")
92
92
  for widget in page.widgets():
93
93
  button_states = widget.button_states()
94
94
  choices = button_states["normal"] if button_states else None
@@ -105,7 +105,7 @@ class Pdf:
105
105
  choices.insert(0, "Off")
106
106
  new_widget.choices = [choice.replace("#20", " ") for choice in choices]
107
107
  elif isinstance(new_widget, TextWidget):
108
- new_widget.max_length = widget.text_maxlen
108
+ new_widget.max_length = self._resolve_text_maxlen(doc, widget)
109
109
  loaded_widgets[widget.field_name] = new_widget
110
110
  else:
111
111
  new_widget = loaded_widgets[widget.field_name]
@@ -128,6 +128,186 @@ class Pdf:
128
128
 
129
129
  self.widgets = loaded_widgets
130
130
 
131
@staticmethod
def _field_chain(doc: pymupdf.Document, start_xref: int) -> List[int]:
    """
    List the xrefs from a widget up through its /Parent field dictionaries.

    The walk stops at the first node without a usable /Parent reference and
    at the first node that was already visited, so a malformed PDF whose
    /Parent entries form a cycle (of any length) cannot loop forever.

    Args:
        doc: The pymupdf Document.
        start_xref: The xref of the widget annotation to start from.

    Returns:
        The visited xrefs, widget first. Empty when start_xref is not positive.
    """
    chain: List[int] = []
    visited = set()
    xref = start_xref
    while xref > 0 and xref not in visited:
        visited.add(xref)
        chain.append(xref)
        key_type, value = doc.xref_get_key(xref, "Parent")
        if key_type != "xref":
            break
        try:
            # An indirect reference is rendered like "12 0 R".
            xref = int(value.split()[0])
        except (AttributeError, IndexError, ValueError):
            # Unparseable reference: treat the current node as the root.
            break
    return chain

@staticmethod
def _field_flags(doc: pymupdf.Document, xref: int) -> int:
    """
    Read the /Ff entry stored directly on one object.

    Args:
        doc: The pymupdf Document.
        xref: The xref of the object to inspect.

    Returns:
        The integer flags, or 0 when /Ff is absent, empty or not an integer.
    """
    key_type, raw_flags = doc.xref_get_key(xref, "Ff")
    if key_type == "null" or not raw_flags:
        return 0
    try:
        return int(raw_flags)
    except (ValueError, TypeError):
        return 0

@staticmethod
def _resolve_text_maxlen(doc: pymupdf.Document, widget: Any) -> Optional[int]:
    """
    Resolve the MaxLen value for a text widget by walking up the PDF object hierarchy.

    Starts from widget.text_maxlen and inspects /MaxLen on the widget xref
    and on each of its parents. The largest value seen wins, not the
    nearest one.

    Args:
        doc: The pymupdf Document.
        widget: The pymupdf Widget.

    Returns:
        The largest max length found; widget.text_maxlen unchanged when no
        node carries a larger integer /MaxLen.
    """
    max_length: Optional[int] = widget.text_maxlen
    for xref in Pdf._field_chain(doc, widget.xref):
        key_type, value = doc.xref_get_key(xref, "MaxLen")
        if key_type == "null" or not value:
            continue
        try:
            candidate = int(value)
        except (ValueError, TypeError):
            # Non-integer /MaxLen: ignore this node.
            continue
        if max_length is None or max_length < candidate:
            max_length = candidate
    return max_length

COMB_FLAG = 1 << 24  # Bit 25 in PDF spec (Comb option)

@staticmethod
def _has_comb_flag(doc: pymupdf.Document, widget: Any) -> bool:
    """
    Check if the Comb flag is set on the widget or any of its parents.

    Args:
        doc: The pymupdf Document.
        widget: The pymupdf Widget.

    Returns:
        True if the Comb flag is set on any node in the hierarchy.
    """
    return any(
        Pdf._field_flags(doc, xref) & Pdf.COMB_FLAG
        for xref in Pdf._field_chain(doc, widget.xref)
    )

@staticmethod
def _remove_maxlen(doc: pymupdf.Document, widget: Any, deep: bool = False) -> None:
    """
    Remove /MaxLen from the widget and all its parent field dictionaries.

    In deep mode, also clears the Comb flag (bit 25 of /Ff).
    In normal mode, /MaxLen is only removed on nodes where Comb is not set.
    In both modes widget.text_maxlen is reset to 0 afterwards.

    Args:
        doc: The pymupdf Document.
        widget: The pymupdf Widget.
        deep: If True, also clear the Comb flag from /Ff.
    """
    # Only /MaxLen and /Ff are rewritten below, never /Parent, so the chain
    # can safely be computed once up front.
    for xref in Pdf._field_chain(doc, widget.xref):
        flags = Pdf._field_flags(doc, xref)
        has_comb = bool(flags & Pdf.COMB_FLAG)

        # Normal mode leaves /MaxLen alone on nodes where Comb is set.
        if (deep or not has_comb) and doc.xref_get_key(xref, "MaxLen")[0] != "null":
            # Writing the value "null" is how the key gets removed.
            doc.xref_set_key(xref, "MaxLen", "null")

        if deep and has_comb:
            doc.xref_set_key(xref, "Ff", str(flags & ~Pdf.COMB_FLAG))

    # NOTE(review): presumably reset so a later widget.update() does not
    # write the old limit back - confirm against PyMuPDF behavior.
    widget.text_maxlen = 0
250
+
251
def sanitize(
    self,
    input_file: PathLike,
    output_file: PathLike,
    deep: bool = False,
) -> List[Dict[str, Any]]:
    """
    Sanitize a PDF by removing /MaxLen constraints from all text fields.

    This fixes PDFs where /MaxLen is set incorrectly (e.g. 4 instead of 27),
    which causes text truncation when filling the form.

    Args:
        input_file (PathLike): The input file path.
        output_file (PathLike): The output file path.
        deep: If True, also clear the Comb flag from /Ff. If False, fields
            whose hierarchy carries the Comb flag are skipped entirely.

    Returns:
        A list of dicts describing each sanitized field:
        [{"FieldName": ..., "FieldType": "text", "MaxLen": <old_value>}, ...]

    Raises:
        PdfFillerException: If the input file cannot be opened or the
            output file cannot be saved.
    """
    output = PdfFillerOutput()
    try:
        document = pymupdf.open(filename=input_file)
    except Exception as ex:
        output.error(str(ex))
        raise PdfFillerException(f"failed to open {input_file}") from ex

    try:
        output.info("sanitizing pdf text fields")

        sanitized: List[Dict[str, Any]] = []
        for page in document:
            for field in page.widgets():
                # Only text widgets that currently report a MaxLen matter.
                if (
                    field.field_type != pymupdf.PDF_WIDGET_TYPE_TEXT  # pylint: disable=no-member
                    or not field.text_maxlen
                ):
                    continue
                # Skip fields with Comb flag in normal mode
                if not deep and self._has_comb_flag(document, field):
                    output.verbose(f"skipping {field.field_name} (Comb flag active)")
                    continue
                # Capture the old limit before _remove_maxlen resets it.
                old_maxlen = field.text_maxlen
                output.verbose(f"removing MaxLen={old_maxlen} from {field.field_name}")
                self._remove_maxlen(document, field, deep)
                field.update()
                sanitized.append(
                    {
                        "FieldName": field.field_name,
                        "FieldType": field.field_type_string,
                        "MaxLen": old_maxlen,
                    }
                )

        try:
            document.save(output_file)
        except Exception as ex:  # pylint: disable=broad-exception-caught
            # A failed save must not be reported as success: the caller
            # would otherwise announce an output file that was not written.
            output.error(str(ex))
            raise PdfFillerException(f"failed to save {output_file}") from ex

        return sanitized
    finally:
        # Always release the underlying file handle, even on failure.
        document.close()
310
+
131
311
  @property
132
312
  def schema(self) -> List[Dict[str, Any]]:
133
313
  """
@@ -217,11 +397,12 @@ class Pdf:
217
397
 
218
398
  # Handling other fields types
219
399
  else:
400
+ # Remove MaxLen constraint from the PDF object to avoid truncation
401
+ self._remove_maxlen(document, field)
220
402
  output.verbose(
221
403
  f"updating {field.field_name} with {value} from {field.field_value}"
222
404
  )
223
405
  field.field_value = value
224
-
225
406
  # Update the widget!
226
407
  field.update()
227
408
  try:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: python-pdffiller
3
- Version: 2.0.3
3
+ Version: 2.0.4
4
4
  Summary: Interact with PDF by inspecting or filling it
5
5
  Author-email: Jacques Raphanel <jraphanel@sismic.fr>
6
6
  License-Expression: MIT
@@ -1,9 +1,9 @@
1
1
  pdffiller/__init__.py,sha256=0HtgXhEV1fKTKAcOXGcq4UsCFflDIPCDQvckshZf-1k,195
2
2
  pdffiller/__main__.py,sha256=7NPQgZVx6VSZS7OrmyJQ_O1vL4wiSqhiILi-outwUqM,107
3
- pdffiller/_version.py,sha256=pFoKdvINjWe5nnOYr0ui5BcN-wdF-rn7wCYxgGvVkBQ,172
3
+ pdffiller/_version.py,sha256=4kVX4SuWyAER9NxjJsz0eyvUdNcPcxAXfDBhA6Min2Q,172
4
4
  pdffiller/const.py,sha256=if_j5I8ftczpjrzZjA7idv-XpvIj1-XBF4oe6VtQvF0,434
5
5
  pdffiller/exceptions.py,sha256=CdN93bZ0mBBS5vLxg14FYZUy4xkYqoD3_SzqtSkZr4g,1624
6
- pdffiller/pdf.py,sha256=A9-GSnQHUNR4i0men27zKeM2VzMpwds9QHt4Kxnmw8A,9254
6
+ pdffiller/pdf.py,sha256=Y-GWZC7jomgqdPS5mIhfEx12xtPKlEsaWptDaAVq2mQ,16217
7
7
  pdffiller/py.typed.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
8
8
  pdffiller/typing.py,sha256=4GJGevlU-YOR9fmuPT8jmyg5MBhn-2TB1K88b6C8VRw,937
9
9
  pdffiller/utils.py,sha256=pmGf3QwkhKwosk_eFCauzHM-XHp_WGVQAtZlxa7taYY,827
@@ -19,6 +19,7 @@ pdffiller/cli/smart_formatter.py,sha256=59hwF07nKbp-P9IfbqKgMFsfbvjIw5SACCZpUF4n
19
19
  pdffiller/cli/commands/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
20
20
  pdffiller/cli/commands/dump_data_fields.py,sha256=o7LmFBdl9GfJOyZM1tOgeesx004QWIIXVp2_pX6FB1Q,2292
21
21
  pdffiller/cli/commands/fill_form.py,sha256=5wjbjwYLsytY6ea-n_KDbU9UDmkN00NYj_gm25MQqJ8,4758
22
+ pdffiller/cli/commands/sanitize.py,sha256=wfcsgaTVVQKfnlHzJoRrYYYKczYjamk2uDbCUb8j-4U,2986
22
23
  pdffiller/io/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
23
24
  pdffiller/io/colors.py,sha256=QCBEWksTVNurOJQYO0zh1X_xxIOXxXmYSJhbCqnNjI8,1710
24
25
  pdffiller/io/output.py,sha256=QMASWRWmfZGG9DdtlfpWXM3VJAMgWGQwPUzoYp_9FFY,11298
@@ -27,10 +28,10 @@ pdffiller/widgets/base.py,sha256=omGVQsQgMa-ALESnUd3_94oVIYScAMl0SPhHC_DG8Lg,361
27
28
  pdffiller/widgets/checkbox.py,sha256=iijStLAsY1G4cljW3a9NxVS_8qxJewFEw-B8jU2aKXk,1711
28
29
  pdffiller/widgets/radio.py,sha256=Db9Oc3Q8ge8qqTVPLoz3I1_SJBGyJ8KfA33ixZMr78c,1070
29
30
  pdffiller/widgets/text.py,sha256=SiuyBvZPZ6idCmtZ_05zE26iN6Rz67OfOj1fUm98YQI,2397
30
- python_pdffiller-2.0.3.dist-info/licenses/AUTHORS.rst,sha256=1_hVzMKgmoXvGgrcZC7sIbU_6PvvkB6vwqevAqzrIkQ,205
31
- python_pdffiller-2.0.3.dist-info/licenses/COPYING,sha256=ADPe-bH2wYq8nFf6EPJyovzTJyl3jSPnm09mGI8FSTo,1074
32
- python_pdffiller-2.0.3.dist-info/METADATA,sha256=DDZg0ASQb7K4D-5W2POxLQJ-3eaCtheEIIt1kxTN4Tw,4175
33
- python_pdffiller-2.0.3.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
34
- python_pdffiller-2.0.3.dist-info/entry_points.txt,sha256=RESKKpPPdWl0wDET96ntuFoUydALx9j0mxtbt-MEBjU,49
35
- python_pdffiller-2.0.3.dist-info/top_level.txt,sha256=5MGWCBFYlu_Ax-I5PgQkV9Xw7O48maPe9z8Qj_yVPL4,10
36
- python_pdffiller-2.0.3.dist-info/RECORD,,
31
+ python_pdffiller-2.0.4.dist-info/licenses/AUTHORS.rst,sha256=1_hVzMKgmoXvGgrcZC7sIbU_6PvvkB6vwqevAqzrIkQ,205
32
+ python_pdffiller-2.0.4.dist-info/licenses/COPYING,sha256=ADPe-bH2wYq8nFf6EPJyovzTJyl3jSPnm09mGI8FSTo,1074
33
+ python_pdffiller-2.0.4.dist-info/METADATA,sha256=4on4QrrWJGfDsaakE1InU3ptSd2kQNECI8KTNeLyizM,4175
34
+ python_pdffiller-2.0.4.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
35
+ python_pdffiller-2.0.4.dist-info/entry_points.txt,sha256=RESKKpPPdWl0wDET96ntuFoUydALx9j0mxtbt-MEBjU,49
36
+ python_pdffiller-2.0.4.dist-info/top_level.txt,sha256=5MGWCBFYlu_Ax-I5PgQkV9Xw7O48maPe9z8Qj_yVPL4,10
37
+ python_pdffiller-2.0.4.dist-info/RECORD,,