python-pdffiller 2.0.2__tar.gz → 2.0.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (79) hide show
  1. {python_pdffiller-2.0.2 → python_pdffiller-2.0.4}/PKG-INFO +1 -1
  2. {python_pdffiller-2.0.2 → python_pdffiller-2.0.4}/docs/source/cli-commands.rst +2 -0
  3. python_pdffiller-2.0.4/docs/source/commands/sanitize.rst +89 -0
  4. {python_pdffiller-2.0.2 → python_pdffiller-2.0.4}/pdffiller/_version.py +1 -1
  5. python_pdffiller-2.0.4/pdffiller/cli/commands/sanitize.py +100 -0
  6. {python_pdffiller-2.0.2 → python_pdffiller-2.0.4}/pdffiller/pdf.py +184 -3
  7. {python_pdffiller-2.0.2 → python_pdffiller-2.0.4}/pyproject.toml +1 -1
  8. {python_pdffiller-2.0.2 → python_pdffiller-2.0.4}/python_pdffiller.egg-info/PKG-INFO +1 -1
  9. {python_pdffiller-2.0.2 → python_pdffiller-2.0.4}/python_pdffiller.egg-info/SOURCES.txt +4 -0
  10. python_pdffiller-2.0.4/tests/cli/test_sanitize.py +124 -0
  11. python_pdffiller-2.0.4/tests/unit/test_sanitize.py +98 -0
  12. {python_pdffiller-2.0.2 → python_pdffiller-2.0.4}/AUTHORS.rst +0 -0
  13. {python_pdffiller-2.0.2 → python_pdffiller-2.0.4}/CHANGELOG.md +0 -0
  14. {python_pdffiller-2.0.2 → python_pdffiller-2.0.4}/COPYING +0 -0
  15. {python_pdffiller-2.0.2 → python_pdffiller-2.0.4}/MANIFEST.in +0 -0
  16. {python_pdffiller-2.0.2 → python_pdffiller-2.0.4}/README.rst +0 -0
  17. {python_pdffiller-2.0.2 → python_pdffiller-2.0.4}/docs/Makefile +0 -0
  18. {python_pdffiller-2.0.2 → python_pdffiller-2.0.4}/docs/make.bat +0 -0
  19. {python_pdffiller-2.0.2 → python_pdffiller-2.0.4}/docs/source/__init__.py +0 -0
  20. {python_pdffiller-2.0.2 → python_pdffiller-2.0.4}/docs/source/_static/rtd_literal_block.css +0 -0
  21. {python_pdffiller-2.0.2 → python_pdffiller-2.0.4}/docs/source/_static/rtd_theme_overrides.css +0 -0
  22. {python_pdffiller-2.0.2 → python_pdffiller-2.0.4}/docs/source/_static/terminal_output.css +0 -0
  23. {python_pdffiller-2.0.2 → python_pdffiller-2.0.4}/docs/source/changelog.md +0 -0
  24. {python_pdffiller-2.0.2 → python_pdffiller-2.0.4}/docs/source/cli-usage.rst +0 -0
  25. {python_pdffiller-2.0.2 → python_pdffiller-2.0.4}/docs/source/commands/dump_data_fields.rst +0 -0
  26. {python_pdffiller-2.0.2 → python_pdffiller-2.0.4}/docs/source/commands/fill_form.rst +0 -0
  27. {python_pdffiller-2.0.2 → python_pdffiller-2.0.4}/docs/source/conf.py +0 -0
  28. {python_pdffiller-2.0.2 → python_pdffiller-2.0.4}/docs/source/contributing.rst +0 -0
  29. {python_pdffiller-2.0.2 → python_pdffiller-2.0.4}/docs/source/index.rst +0 -0
  30. {python_pdffiller-2.0.2 → python_pdffiller-2.0.4}/pdffiller/__init__.py +0 -0
  31. {python_pdffiller-2.0.2 → python_pdffiller-2.0.4}/pdffiller/__main__.py +0 -0
  32. {python_pdffiller-2.0.2 → python_pdffiller-2.0.4}/pdffiller/cli/__init__.py +0 -0
  33. {python_pdffiller-2.0.2 → python_pdffiller-2.0.4}/pdffiller/cli/args.py +0 -0
  34. {python_pdffiller-2.0.2 → python_pdffiller-2.0.4}/pdffiller/cli/boolean_action.py +0 -0
  35. {python_pdffiller-2.0.2 → python_pdffiller-2.0.4}/pdffiller/cli/cli.py +0 -0
  36. {python_pdffiller-2.0.2 → python_pdffiller-2.0.4}/pdffiller/cli/command.py +0 -0
  37. {python_pdffiller-2.0.2 → python_pdffiller-2.0.4}/pdffiller/cli/commands/__init__.py +0 -0
  38. {python_pdffiller-2.0.2 → python_pdffiller-2.0.4}/pdffiller/cli/commands/dump_data_fields.py +0 -0
  39. {python_pdffiller-2.0.2 → python_pdffiller-2.0.4}/pdffiller/cli/commands/fill_form.py +0 -0
  40. {python_pdffiller-2.0.2 → python_pdffiller-2.0.4}/pdffiller/cli/exit_codes.py +0 -0
  41. {python_pdffiller-2.0.2 → python_pdffiller-2.0.4}/pdffiller/cli/formatters.py +0 -0
  42. {python_pdffiller-2.0.2 → python_pdffiller-2.0.4}/pdffiller/cli/once_argument.py +0 -0
  43. {python_pdffiller-2.0.2 → python_pdffiller-2.0.4}/pdffiller/cli/smart_formatter.py +0 -0
  44. {python_pdffiller-2.0.2 → python_pdffiller-2.0.4}/pdffiller/const.py +0 -0
  45. {python_pdffiller-2.0.2 → python_pdffiller-2.0.4}/pdffiller/exceptions.py +0 -0
  46. {python_pdffiller-2.0.2 → python_pdffiller-2.0.4}/pdffiller/io/__init__.py +0 -0
  47. {python_pdffiller-2.0.2 → python_pdffiller-2.0.4}/pdffiller/io/colors.py +0 -0
  48. {python_pdffiller-2.0.2 → python_pdffiller-2.0.4}/pdffiller/io/output.py +0 -0
  49. {python_pdffiller-2.0.2 → python_pdffiller-2.0.4}/pdffiller/py.typed.py +0 -0
  50. {python_pdffiller-2.0.2 → python_pdffiller-2.0.4}/pdffiller/typing.py +0 -0
  51. {python_pdffiller-2.0.2 → python_pdffiller-2.0.4}/pdffiller/utils.py +0 -0
  52. {python_pdffiller-2.0.2 → python_pdffiller-2.0.4}/pdffiller/widgets/__init__.py +0 -0
  53. {python_pdffiller-2.0.2 → python_pdffiller-2.0.4}/pdffiller/widgets/base.py +0 -0
  54. {python_pdffiller-2.0.2 → python_pdffiller-2.0.4}/pdffiller/widgets/checkbox.py +0 -0
  55. {python_pdffiller-2.0.2 → python_pdffiller-2.0.4}/pdffiller/widgets/radio.py +0 -0
  56. {python_pdffiller-2.0.2 → python_pdffiller-2.0.4}/pdffiller/widgets/text.py +0 -0
  57. {python_pdffiller-2.0.2 → python_pdffiller-2.0.4}/python_pdffiller.egg-info/dependency_links.txt +0 -0
  58. {python_pdffiller-2.0.2 → python_pdffiller-2.0.4}/python_pdffiller.egg-info/entry_points.txt +0 -0
  59. {python_pdffiller-2.0.2 → python_pdffiller-2.0.4}/python_pdffiller.egg-info/requires.txt +0 -0
  60. {python_pdffiller-2.0.2 → python_pdffiller-2.0.4}/python_pdffiller.egg-info/top_level.txt +0 -0
  61. {python_pdffiller-2.0.2 → python_pdffiller-2.0.4}/requirements-dev.txt +0 -0
  62. {python_pdffiller-2.0.2 → python_pdffiller-2.0.4}/requirements-doc.txt +0 -0
  63. {python_pdffiller-2.0.2 → python_pdffiller-2.0.4}/requirements-lint.txt +0 -0
  64. {python_pdffiller-2.0.2 → python_pdffiller-2.0.4}/requirements-test.txt +0 -0
  65. {python_pdffiller-2.0.2 → python_pdffiller-2.0.4}/requirements.txt +0 -0
  66. {python_pdffiller-2.0.2 → python_pdffiller-2.0.4}/setup.cfg +0 -0
  67. {python_pdffiller-2.0.2 → python_pdffiller-2.0.4}/setup.py +0 -0
  68. {python_pdffiller-2.0.2 → python_pdffiller-2.0.4}/tests/__init__.py +0 -0
  69. {python_pdffiller-2.0.2 → python_pdffiller-2.0.4}/tests/cli/__init__.py +0 -0
  70. {python_pdffiller-2.0.2 → python_pdffiller-2.0.4}/tests/cli/test_dump_data_field.py +0 -0
  71. {python_pdffiller-2.0.2 → python_pdffiller-2.0.4}/tests/cli/test_fill_form.py +0 -0
  72. {python_pdffiller-2.0.2 → python_pdffiller-2.0.4}/tests/conftest.py +0 -0
  73. {python_pdffiller-2.0.2 → python_pdffiller-2.0.4}/tests/data/empty.pdf +0 -0
  74. {python_pdffiller-2.0.2 → python_pdffiller-2.0.4}/tests/data/input.pdf +0 -0
  75. {python_pdffiller-2.0.2 → python_pdffiller-2.0.4}/tests/unit/__init__.py +0 -0
  76. {python_pdffiller-2.0.2 → python_pdffiller-2.0.4}/tests/unit/test_form_field.py +0 -0
  77. {python_pdffiller-2.0.2 → python_pdffiller-2.0.4}/tests/unit/test_formatters.py +0 -0
  78. {python_pdffiller-2.0.2 → python_pdffiller-2.0.4}/tests/unit/test_setup.py +0 -0
  79. {python_pdffiller-2.0.2 → python_pdffiller-2.0.4}/tox.ini +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: python-pdffiller
3
- Version: 2.0.2
3
+ Version: 2.0.4
4
4
  Summary: Interact with PDF by inspecting or filling it
5
5
  Author-email: Jacques Raphanel <jraphanel@sismic.fr>
6
6
  License-Expression: MIT
@@ -12,6 +12,8 @@ This section describe the **pdffiller** built-in commands, like ``pdffiller dump
12
12
 
13
13
  commands/dump_data_fields
14
14
  commands/fill_form
15
+ commands/sanitize
15
16
 
16
17
  - :doc:`pdffiller dump_data_fields <commands/dump_data_fields>`: Dump form fields present in a pdf given its file path
17
18
  - :doc:`pdffiller fill_form <commands/fill_form>`: Fill an input PDF's form fields with the data from
19
+ - :doc:`pdffiller sanitize <commands/sanitize>`: Sanitize a PDF by removing MaxLen constraints from text fields
@@ -0,0 +1,89 @@
1
+ .. _commands_sanitize:
2
+
3
+ pdffiller sanitize
4
+ ==================
5
+
6
+ .. code-block:: text
7
+
8
+ $ pdffiller sanitize -h
9
+ usage: pdffiller sanitize [-o OUTPUT_PATH] [--deep] [-L PATH] [-V [LEVEL]] [-f NAME] [-h] [INPUT_PATH]
10
+
11
+ Sanitize a PDF by removing MaxLen constraints from text fields.
12
+ This fixes PDFs where MaxLen is incorrectly set, causing text truncation.
13
+
14
+ positional arguments:
15
+ INPUT_PATH Path to the input PDF file.
16
+
17
+ options:
18
+ -o OUTPUT_PATH, --output OUTPUT_PATH
19
+ Path to the output PDF file.
20
+ --deep Perform a deep sanitization: also clear the Comb flag
21
+ from text fields.
22
+ -f NAME, --format NAME
23
+ Output format: text or json. Defaults to text.
24
+ -L PATH, --log-file PATH
25
+ Send output to PATH instead of stderr.
26
+ -V [LEVEL], --verbosity [LEVEL]
27
+ Level of detail of the output. Valid options from less verbose to more
28
+ verbose: -Vquiet, -Verror, -Vwarning, -Vnotice, -Vstatus, -V or -Vverbose, -VV or
29
+ -Vdebug, -VVV or -vtrace
30
+ -h, --help show this help message and exit
31
+
32
+ Sanitizes a single input PDF by removing ``/MaxLen`` constraints from text fields.
33
+ This fixes PDFs where ``/MaxLen`` is incorrectly set on widget annotations or
34
+ inherited from parent field dictionaries, causing text truncation when filling forms.
35
+
36
+ By default (light mode), fields with the Comb flag (``Comb of X characters`` in Adobe Acrobat, ``Bande de X caracteres`` in its French UI) are
37
+ **skipped**, as their ``/MaxLen`` is intentional. Only text fields without the Comb flag are sanitized.
38
+
39
+ With the ``--deep`` option, **all** text fields are sanitized: ``/MaxLen`` is removed and
40
+ the Comb flag (bit 25 of ``/Ff``) is also cleared.
41
+
42
+ .. code-block:: text
43
+
44
+ pdffiller sanitize -o output.pdf input.pdf
45
+ pdffiller sanitize --deep -o output.pdf input.pdf
46
+ pdffiller sanitize -f json -o output.pdf input.pdf
47
+
48
+ The ``pdffiller sanitize`` command will:
49
+
50
+ * Load the input PDF file
51
+ * Walk through all text fields and their parent field dictionaries
52
+ * Skip fields with the Comb flag (unless ``--deep`` is used)
53
+ * Remove ``/MaxLen`` entries from each node in the hierarchy
54
+ * With ``--deep``: also clear the Comb flag (``Comb of X characters`` / ``Bande de X caracteres`` in Adobe Acrobat) from ``/Ff``
55
+ * Save the sanitized PDF to the output path
56
+ * Report all sanitized fields (name, type, and previous MaxLen value)
57
+
58
+ Output formats
59
+ --------------
60
+
61
+ **Text** (default):
62
+
63
+ .. code-block:: text
64
+
65
+ ----------
66
+ FieldName: nom_souscripteur
67
+ FieldType: Text
68
+ MaxLen: 4
69
+ ----------
70
+ FieldName: prenom_souscripteur
71
+ FieldType: Text
72
+ MaxLen: 4
73
+
74
+ **JSON** (``-f json``):
75
+
76
+ .. code-block:: json
77
+
78
+ [
79
+ {
80
+ "FieldName": "nom_souscripteur",
81
+ "FieldType": "Text",
82
+ "MaxLen": 4
83
+ },
84
+ {
85
+ "FieldName": "prenom_souscripteur",
86
+ "FieldType": "Text",
87
+ "MaxLen": 4
88
+ }
89
+ ]
@@ -3,4 +3,4 @@ __copyright__ = "Copyright 2025 SISMIC"
3
3
  __email__ = "jraphanel@sismic.fr"
4
4
  __license__ = "MIT"
5
5
  __title__ = "pdffiller"
6
- __version__ = "2.0.2"
6
+ __version__ = "2.0.4"
@@ -0,0 +1,100 @@
1
+ import json
2
+ import os
3
+
4
+ from pdffiller.cli.args import add_global_arguments
5
+ from pdffiller.cli.command import pdffiller_command, PdfFillerArgumentParser
6
+ from pdffiller.cli.once_argument import OnceArgument
7
+ from pdffiller.exceptions import (
8
+ AbortExecution,
9
+ CommandLineError,
10
+ FileNotExistsError,
11
+ PdfFillerException,
12
+ )
13
+ from pdffiller.io.output import cli_out_write, PdfFillerOutput
14
+ from pdffiller.pdf import Pdf
15
+ from pdffiller.typing import Any, Dict, List
16
+
17
+ from ..exit_codes import ERROR_ENCOUNTERED
18
+
19
+
20
def sanitize_text_formatter(sanitized: List[Dict[str, Any]]) -> None:
    """
    Write the sanitized fields to the CLI output as plain text.

    Each field is introduced by a ``----------`` separator line followed by
    one ``key: value`` line per entry of its dictionary. When ``sanitized``
    is empty (or otherwise falsy) a single notice line is written instead.

    Args:
        sanitized: The field descriptions returned by the sanitize command.
    """
    if sanitized:
        for entry in sanitized:
            cli_out_write("----------")
            for label in entry:
                cli_out_write(f"{label}: {entry[label]}")
    else:
        cli_out_write("No fields were sanitized.")
29
+
30
+
31
def sanitize_json_formatter(sanitized: List[Dict[str, Any]]) -> None:
    """
    Write the sanitized fields to the CLI output as a JSON array.

    An empty result is written as ``[]`` so that consumers parsing the
    output always receive a valid JSON document. Nothing is written when
    ``sanitized`` is ``None``, i.e. when the command produced no result.

    Args:
        sanitized: The field descriptions returned by the sanitize command.
    """
    if sanitized is None:
        return
    cli_out_write(json.dumps(sanitized, indent=4, ensure_ascii=False))
36
+
37
+
38
@pdffiller_command(
    group=None,
    formatters={"text": sanitize_text_formatter, "json": sanitize_json_formatter},
)
def sanitize(parser: PdfFillerArgumentParser, *args: Any) -> Any:
    """
    Sanitize a PDF by removing MaxLen constraints from text fields.
    This fixes PDFs where MaxLen is incorrectly set, causing text truncation.
    """
    # NOTE(review): the docstring above is kept verbatim because the documented
    # ``pdffiller sanitize -h`` output reproduces it - presumably the command
    # framework uses it as the help description; confirm before rewording.
    group = parser.add_argument_group("options")

    group.add_argument(
        "-o",
        "--output",
        action=OnceArgument,
        metavar="OUTPUT_PATH",
        type=str,
        help="""Path to the output PDF file.""",
    )
    group.add_argument(
        "--deep",
        default=False,
        action="store_true",
        help="Perform a deep sanitization: also clear the Comb flag from text fields.",
    )
    parser.add_argument(
        "file",
        action=OnceArgument,
        metavar="INPUT_PATH",
        nargs="?",
        type=str,
        help="""Path to the input PDF file.""",
    )
    add_global_arguments(group, True, parser)

    opts = parser.parse_args(*args)
    output = PdfFillerOutput()

    # Both paths are declared optional to argparse, so they are validated here.
    if not opts.file:
        raise CommandLineError("no input file given")
    if not opts.output:
        raise CommandLineError("no output file path given")
    if not os.path.isfile(opts.file):
        raise FileNotExistsError(opts.file)

    try:
        sanitized = Pdf().sanitize(opts.file, opts.output, opts.deep)
        output.info(f"sanitized pdf saved to {opts.output}")
        return sanitized
    except PdfFillerException as exp:
        # Library errors are logged only; the command then yields no result.
        output.error(str(exp))
    except Exception as exg:  # pylint: disable=broad-except  # pragma: no cover
        output.error(f"unexpected error when sanitizing {opts.file} with the following error:")
        output.error(exg)
        raise AbortExecution(ERROR_ENCOUNTERED) from exg

    return None
@@ -88,7 +88,7 @@ class Pdf:
88
88
  ) from ex
89
89
 
90
90
  for i, page in enumerate(doc.pages()):
91
- output.verbose(f"loading page {i+1}/{doc.page_count}")
91
+ output.verbose(f"loading page {i + 1}/{doc.page_count}")
92
92
  for widget in page.widgets():
93
93
  button_states = widget.button_states()
94
94
  choices = button_states["normal"] if button_states else None
@@ -105,7 +105,7 @@ class Pdf:
105
105
  choices.insert(0, "Off")
106
106
  new_widget.choices = [choice.replace("#20", " ") for choice in choices]
107
107
  elif isinstance(new_widget, TextWidget):
108
- new_widget.max_length = widget.text_maxlen
108
+ new_widget.max_length = self._resolve_text_maxlen(doc, widget)
109
109
  loaded_widgets[widget.field_name] = new_widget
110
110
  else:
111
111
  new_widget = loaded_widgets[widget.field_name]
@@ -128,6 +128,186 @@ class Pdf:
128
128
 
129
129
  self.widgets = loaded_widgets
130
130
 
131
@staticmethod
def _resolve_text_maxlen(doc: pymupdf.Document, widget: Any) -> Optional[int]:
    """
    Resolve the MaxLen value for a text widget by walking up the PDF object hierarchy.

    PyMuPDF's widget.text_maxlen may not reflect the actual MaxLen when it is
    defined on a parent field dictionary rather than on the widget annotation itself.
    This method starts from ``widget.text_maxlen`` and then inspects /MaxLen on the
    widget xref and on every /Parent above it, keeping the largest value seen.

    Args:
        doc: The pymupdf Document.
        widget: The pymupdf Widget.

    Returns:
        The largest MaxLen found, or the initial ``widget.text_maxlen`` value
        (possibly ``None`` or ``0``) when no larger /MaxLen exists in the hierarchy.
    """
    max_length: Optional[int] = widget.text_maxlen
    xref = widget.xref
    # Remember visited objects so that a malformed /Parent chain containing a
    # cycle (including an object that is its own parent) cannot loop forever.
    visited = set()
    while xref > 0 and xref not in visited:
        visited.add(xref)

        key_type, value = doc.xref_get_key(xref, "MaxLen")
        if key_type != "null" and value:
            try:
                int_value = int(value)
            except (ValueError, TypeError):
                # A non-numeric /MaxLen is ignored rather than treated as an error.
                int_value = None
            if int_value is not None and (max_length is None or max_length < int_value):
                max_length = int_value

        # Walk up to the parent; the value has the form "<xref> <gen> R".
        key_type, value = doc.xref_get_key(xref, "Parent")
        if key_type != "xref":
            break
        xref = int(value.split()[0])

    return max_length
170
+
171
+ COMB_FLAG = 1 << 24 # Bit 25 in PDF spec (Comb option)
172
+
173
@staticmethod
def _has_comb_flag(doc: pymupdf.Document, widget: Any) -> bool:
    """
    Check if the Comb flag is set on the widget or any of its parents.

    The /Ff entry of the widget xref and of every /Parent above it is tested
    against ``Pdf.COMB_FLAG``; the walk stops at the first node carrying it.

    Args:
        doc: The pymupdf Document.
        widget: The pymupdf Widget.

    Returns:
        True if the Comb flag is set on any node in the hierarchy.
    """
    xref = widget.xref
    # Remember visited objects so that a malformed /Parent chain containing a
    # cycle (including an object that is its own parent) cannot loop forever.
    visited = set()
    while xref > 0 and xref not in visited:
        visited.add(xref)

        key_type, ff_value = doc.xref_get_key(xref, "Ff")
        if key_type != "null" and ff_value:
            try:
                if int(ff_value) & Pdf.COMB_FLAG:
                    return True
            except (ValueError, TypeError):
                # A non-numeric /Ff is treated as "no flags set" on this node.
                pass

        # Walk up to the parent; the value has the form "<xref> <gen> R".
        key_type, value = doc.xref_get_key(xref, "Parent")
        if key_type != "xref":
            break
        xref = int(value.split()[0])

    return False
203
+
204
@staticmethod
def _remove_maxlen(doc: pymupdf.Document, widget: Any, deep: bool = False) -> None:
    """
    Remove /MaxLen from the widget and all its parent field dictionaries.

    In deep mode, also clears the Comb flag (bit 25 of /Ff).
    In normal mode, /MaxLen is only removed on nodes where Comb is not set.
    In both modes ``widget.text_maxlen`` is reset to 0 once the walk is done.

    Args:
        doc: The pymupdf Document.
        widget: The pymupdf Widget.
        deep: If True, also clear the Comb flag from /Ff.
    """
    xref = widget.xref
    # Remember visited objects so that a malformed /Parent chain containing a
    # cycle (including an object that is its own parent) cannot loop forever.
    visited = set()
    while xref > 0 and xref not in visited:
        visited.add(xref)

        # Read the field flags of this node; unreadable flags count as 0.
        flags = 0
        key_type, ff_value = doc.xref_get_key(xref, "Ff")
        if key_type != "null" and ff_value:
            try:
                flags = int(ff_value)
            except (ValueError, TypeError):
                flags = 0
        has_comb = bool(flags & Pdf.COMB_FLAG)

        # /MaxLen is dropped everywhere in deep mode, otherwise only on nodes
        # that do not carry the Comb flag themselves.
        has_maxlen = doc.xref_get_key(xref, "MaxLen")[0] != "null"
        if has_maxlen and (deep or not has_comb):
            doc.xref_set_key(xref, "MaxLen", "null")
        if deep and has_comb:
            doc.xref_set_key(xref, "Ff", str(flags & ~Pdf.COMB_FLAG))

        # Walk up to the parent; the value has the form "<xref> <gen> R".
        key_type, value = doc.xref_get_key(xref, "Parent")
        if key_type != "xref":
            break
        xref = int(value.split()[0])

    widget.text_maxlen = 0
250
+
251
def sanitize(
    self,
    input_file: PathLike,
    output_file: PathLike,
    deep: bool = False,
) -> List[Dict[str, Any]]:
    """
    Sanitize a PDF by removing /MaxLen constraints from its text fields.

    This fixes PDFs where /MaxLen is set incorrectly (e.g. 4 instead of 27),
    which causes text truncation when filling the form.

    Only text widgets whose ``text_maxlen`` is set are processed. Unless
    ``deep`` is True, widgets with the Comb flag anywhere in their hierarchy
    are skipped. A failure while saving is reported as a warning only; the
    list of sanitized fields is still returned in that case.

    Args:
        input_file (PathLike): The input file path.
        output_file (PathLike): The output file path.
        deep: If True, also clear the Comb flag from /Ff.

    Returns:
        A list of dicts describing each sanitized field:
        [{"FieldName": ..., "FieldType": "Text", "MaxLen": <old_value>}, ...]

    Raises:
        PdfFillerException: If the input file cannot be opened.
    """
    try:
        document = pymupdf.open(filename=input_file)
    except Exception as ex:
        PdfFillerOutput().error(str(ex))
        raise PdfFillerException(f"failed to open {input_file}") from ex

    output = PdfFillerOutput()
    output.info("sanitizing pdf text fields")

    sanitized: List[Dict[str, Any]] = []
    try:
        for page in document:
            for field in page.widgets():
                is_text = (
                    field.field_type == pymupdf.PDF_WIDGET_TYPE_TEXT  # pylint: disable=no-member
                )
                if not is_text or not field.text_maxlen:
                    continue

                # Skip fields with Comb flag in normal mode
                if not deep and self._has_comb_flag(document, field):
                    output.verbose(f"skipping {field.field_name} (Comb flag active)")
                    continue

                # Capture the value first: _remove_maxlen resets text_maxlen.
                old_maxlen = field.text_maxlen
                output.verbose(f"removing MaxLen={old_maxlen} from {field.field_name}")
                self._remove_maxlen(document, field, deep)
                field.update()
                sanitized.append(
                    {
                        "FieldName": field.field_name,
                        "FieldType": field.field_type_string,
                        "MaxLen": old_maxlen,
                    }
                )

        try:
            document.save(output_file)
        except Exception as ex:  # pylint: disable=broad-exception-caught
            output.warning("an error occurs when saving file")
            # Surface the cause so the failure can actually be diagnosed.
            output.warning(str(ex))
    finally:
        # Always release the underlying file handle, even on errors.
        document.close()

    return sanitized
310
+
131
311
  @property
132
312
  def schema(self) -> List[Dict[str, Any]]:
133
313
  """
@@ -217,11 +397,12 @@ class Pdf:
217
397
 
218
398
  # Handling other fields types
219
399
  else:
400
+ # Remove MaxLen constraint from the PDF object to avoid truncation
401
+ self._remove_maxlen(document, field)
220
402
  output.verbose(
221
403
  f"updating {field.field_name} with {value} from {field.field_value}"
222
404
  )
223
405
  field.field_value = value
224
-
225
406
  # Update the widget!
226
407
  field.update()
227
408
  try:
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "python-pdffiller"
7
- version = "2.0.2"
7
+ version = "2.0.4"
8
8
  description="Interact with PDF by inspecting or filling it"
9
9
  requires-python = ">=3.9"
10
10
  license ="MIT"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: python-pdffiller
3
- Version: 2.0.2
3
+ Version: 2.0.4
4
4
  Summary: Interact with PDF by inspecting or filling it
5
5
  Author-email: Jacques Raphanel <jraphanel@sismic.fr>
6
6
  License-Expression: MIT
@@ -25,6 +25,7 @@ docs/source/_static/rtd_theme_overrides.css
25
25
  docs/source/_static/terminal_output.css
26
26
  docs/source/commands/dump_data_fields.rst
27
27
  docs/source/commands/fill_form.rst
28
+ docs/source/commands/sanitize.rst
28
29
  pdffiller/__init__.py
29
30
  pdffiller/__main__.py
30
31
  pdffiller/_version.py
@@ -46,6 +47,7 @@ pdffiller/cli/smart_formatter.py
46
47
  pdffiller/cli/commands/__init__.py
47
48
  pdffiller/cli/commands/dump_data_fields.py
48
49
  pdffiller/cli/commands/fill_form.py
50
+ pdffiller/cli/commands/sanitize.py
49
51
  pdffiller/io/__init__.py
50
52
  pdffiller/io/colors.py
51
53
  pdffiller/io/output.py
@@ -65,9 +67,11 @@ tests/conftest.py
65
67
  tests/cli/__init__.py
66
68
  tests/cli/test_dump_data_field.py
67
69
  tests/cli/test_fill_form.py
70
+ tests/cli/test_sanitize.py
68
71
  tests/data/empty.pdf
69
72
  tests/data/input.pdf
70
73
  tests/unit/__init__.py
71
74
  tests/unit/test_form_field.py
72
75
  tests/unit/test_formatters.py
76
+ tests/unit/test_sanitize.py
73
77
  tests/unit/test_setup.py
@@ -0,0 +1,124 @@
1
+ import json
2
+ from unittest import mock
3
+
4
+ import pytest
5
+
6
+ from pdffiller.cli import cli
7
+ from pdffiller.cli.exit_codes import (
8
+ ERROR_COMMAND_NAME,
9
+ ERROR_UNEXPECTED,
10
+ SUCCESS,
11
+ )
12
+
13
+
14
def test_incomplete_no_action():
    """An empty ``sys.argv`` must be reported with ERROR_COMMAND_NAME."""
    empty_argv = []
    with mock.patch("sys.argv", empty_argv):
        exit_code = cli.main()
    assert exit_code == ERROR_COMMAND_NAME
19
+
20
+
21
@pytest.mark.parametrize("argv", [])
def test_incomplete(argv):
    """Run ``sanitize`` without its required arguments and expect ERROR_UNEXPECTED."""
    # NOTE(review): the parameter list above is empty, so pytest generates no
    # case for this test and its body never executes. Add the incomplete
    # argument lists that should be covered (and confirm the expected exit
    # code for each) to make this test effective.
    command_line = ["sanitize"] + argv

    # Through the process command line
    with mock.patch("sys.argv", ["pdffiller"] + command_line):
        assert cli.main() == ERROR_UNEXPECTED

    # Through an explicit argument list
    assert cli.main(command_line) == ERROR_UNEXPECTED
31
+
32
+
33
def test_complete(test_data_dir, output_pdf_path, capsys):
    """Sanitize ``input.pdf`` with the default text output and expect an output file."""
    source_pdf = test_data_dir / "input.pdf"
    argv = ["-o", str(output_pdf_path), str(source_pdf)]

    # Through the process command line; remove the result so the second run
    # below has to produce the file again.
    with mock.patch("sys.argv", ["pdffiller", "sanitize", *argv]):
        assert cli.main() == SUCCESS
        assert output_pdf_path.exists()
        output_pdf_path.unlink()

    # Through an explicit argument list
    assert cli.main(["sanitize", *argv]) == SUCCESS
    assert output_pdf_path.exists()
54
+
55
+
56
def test_complete_json_output(test_data_dir, output_pdf_path, capsys):
    """Sanitize ``input.pdf`` with ``-fjson`` and validate the printed report."""
    source_pdf = test_data_dir / "input.pdf"
    argv = ["-fjson", "-o", str(output_pdf_path), str(source_pdf)]

    with mock.patch("sys.argv", ["pdffiller", "sanitize", *argv]):
        assert cli.main() == SUCCESS

    captured_out, _ = capsys.readouterr()
    # Output should be valid JSON (or empty if no fields to sanitize)
    if not captured_out.strip():
        return

    report = json.loads(captured_out)
    assert isinstance(report, list)
    for entry in report:
        for expected_key in ("FieldName", "FieldType", "MaxLen"):
            assert expected_key in entry
80
+
81
+
82
def test_complete_with_deep(test_data_dir, output_pdf_path):
    """Sanitize ``input.pdf`` with ``--deep`` and expect an output file."""
    source_pdf = test_data_dir / "input.pdf"
    argv = ["--deep", "-o", str(output_pdf_path), str(source_pdf)]

    # Through the process command line; remove the result so the second run
    # below has to produce the file again.
    with mock.patch("sys.argv", ["pdffiller", "sanitize", *argv]):
        assert cli.main() == SUCCESS
        assert output_pdf_path.exists()
        output_pdf_path.unlink()

    # Through an explicit argument list
    assert cli.main(["sanitize", *argv]) == SUCCESS
    assert output_pdf_path.exists()
104
+
105
+
106
def test_complete_with_invalid_file(test_data_dir, output_pdf_path):
    """Run ``sanitize`` on ``empty.pdf``, which is not a valid PDF."""
    broken_pdf = test_data_dir / "empty.pdf"
    argv = ["-o", str(output_pdf_path), str(broken_pdf)]

    # PdfFillerException is caught and logged, command returns SUCCESS

    # Through the process command line
    with mock.patch("sys.argv", ["pdffiller", "sanitize", *argv]):
        assert cli.main() == SUCCESS

    # Through an explicit argument list
    assert cli.main(["sanitize", *argv]) == SUCCESS
@@ -0,0 +1,98 @@
1
+ import pytest
2
+
3
+ from pdffiller.pdf import Pdf
4
+
5
+
6
def test_sanitize_removes_maxlen(test_data_dir, output_pdf_path):
    """Check the output file and the shape of every entry reported by sanitize."""
    source_pdf = str(test_data_dir / "input.pdf")
    report = Pdf().sanitize(source_pdf, str(output_pdf_path))

    # The sanitized document must have been written
    assert output_pdf_path.exists()

    # Every reported entry describes a text field with a positive integer MaxLen
    for entry in report:
        for expected_key in ("FieldName", "FieldType", "MaxLen"):
            assert expected_key in entry
        assert entry["FieldType"] == "Text"
        previous_maxlen = entry["MaxLen"]
        assert isinstance(previous_maxlen, int)
        assert previous_maxlen > 0
25
+
26
+
27
def test_sanitize_output_has_no_maxlen(test_data_dir, output_pdf_path):
    """Reload the sanitized PDF and check non-Comb text fields carry no MaxLen."""
    target_path = str(output_pdf_path)
    Pdf().sanitize(str(test_data_dir / "input.pdf"), target_path)

    # Re-read the sanitized PDF and inspect every loaded widget
    reloaded = Pdf(target_path)
    for widget in reloaded.widgets.values():
        # Only widgets exposing ``max_length`` are relevant
        if not hasattr(widget, "max_length"):
            continue
        # Comb fields are left untouched in normal mode
        if _has_comb_in_pdf(target_path, widget.name):
            continue
        assert widget.max_length is None or widget.max_length == 0
42
+
43
+
44
def test_sanitize_deep_mode(test_data_dir, output_pdf_path):
    """Deep mode must report at least as many fields as normal mode."""
    source_pdf = str(test_data_dir / "input.pdf")

    normal_report = Pdf().sanitize(source_pdf, str(output_pdf_path))

    # The deep run writes to a sibling file of the regular output path
    deep_target = output_pdf_path.parent / "deep_output.pdf"
    deep_report = Pdf().sanitize(source_pdf, str(deep_target), deep=True)

    assert len(deep_report) >= len(normal_report)

    # Remove the extra file created by this test
    if deep_target.exists():
        deep_target.unlink()
65
+
66
+
67
def test_sanitize_returns_list(test_data_dir, output_pdf_path):
    """The value returned by ``Pdf.sanitize`` must be a list."""
    source_pdf = str(test_data_dir / "input.pdf")
    outcome = Pdf().sanitize(source_pdf, str(output_pdf_path))
    assert isinstance(outcome, list)
75
+
76
+
77
def test_sanitize_invalid_pdf(test_data_dir, output_pdf_path):
    """``Pdf.sanitize`` must raise PdfFillerException for ``empty.pdf``."""
    from pdffiller.exceptions import PdfFillerException

    broken_pdf = str(test_data_dir / "empty.pdf")
    sanitizer = Pdf()
    with pytest.raises(PdfFillerException):
        sanitizer.sanitize(broken_pdf, str(output_pdf_path))
87
+
88
+
89
def _has_comb_in_pdf(pdf_path, field_name):
    """
    Tell whether the first widget named ``field_name`` has the Comb flag set.

    Args:
        pdf_path: Path of the PDF file to inspect.
        field_name: Name of the form field to look for.

    Returns:
        True when a widget with that name exists and its ``field_flags``
        contain ``Pdf.COMB_FLAG``; False otherwise, including when no widget
        has that name.
    """
    import pymupdf

    doc = pymupdf.open(filename=pdf_path)
    try:
        for page in doc:
            for widget in page.widgets():
                if widget.field_name == field_name:
                    return bool(widget.field_flags & Pdf.COMB_FLAG)
        return False
    finally:
        # Release the file handle: this helper is called once per widget.
        doc.close()