PyPDFForm: 2.5.0 (py3-none-any.whl) → 3.0.1 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of PyPDFForm might be problematic. More details are available on the original release page.

PyPDFForm/utils.py CHANGED
@@ -1,13 +1,16 @@
1
1
  # -*- coding: utf-8 -*-
2
- """Provides core utility functions for PDF form processing.
3
-
4
- This module contains general-purpose utilities used throughout PyPDFForm:
5
- - Stream/file handling conversions
6
- - Color space transformations
7
- - Widget preview generation
8
- - PDF merging and splitting
9
- - Pattern matching for PDF structures
10
- - Unique ID generation
2
+ """
3
+ This module provides a collection of utility functions used throughout the PyPDFForm library.
4
+
5
+ It includes functions for:
6
+ - Converting byte streams to BinaryIO objects.
7
+ - Removing all widgets (form fields) from a PDF.
8
+ - Extracting the content stream of each page in a PDF.
9
+ - Merging two PDFs into one.
10
+ - Finding and traversing patterns within PDF widgets.
11
+ - Extracting widget properties based on defined patterns.
12
+ - Generating unique suffixes for internal use.
13
+ - Enabling Adobe-specific settings in the PDF to ensure proper rendering of form fields.
11
14
  """
12
15
 
13
16
  from collections.abc import Callable
@@ -18,29 +21,28 @@ from string import ascii_letters, digits, punctuation
18
21
  from typing import Any, BinaryIO, List, Union
19
22
 
20
23
  from pypdf import PdfReader, PdfWriter
21
- from pypdf.generic import ArrayObject, DictionaryObject
22
- from reportlab.lib.colors import CMYKColor, Color
24
+ from pypdf.generic import ArrayObject, DictionaryObject, NameObject
23
25
 
24
- from .constants import (BUTTON_STYLES, DEFAULT_CHECKBOX_STYLE, DEFAULT_FONT,
25
- DEFAULT_FONT_COLOR, DEFAULT_FONT_SIZE,
26
- DEFAULT_RADIO_STYLE, PREVIEW_FONT_COLOR,
27
- UNIQUE_SUFFIX_LENGTH, WIDGET_TYPES)
28
- from .middleware.checkbox import Checkbox
29
- from .middleware.radio import Radio
30
- from .middleware.text import Text
26
+ from .constants import UNIQUE_SUFFIX_LENGTH, XFA, AcroForm, Annots, Root
31
27
 
32
28
 
33
29
  @lru_cache
34
30
  def stream_to_io(stream: bytes) -> BinaryIO:
35
- """Converts a byte stream to a seekable binary IO object.
31
+ """
32
+ Converts a bytes stream to a BinaryIO object, which can be used by PyPDFForm.
33
+
34
+ This function takes a bytes stream as input and returns a BinaryIO object
35
+ that represents the same data. This is useful because PyPDFForm often
36
+ works with BinaryIO objects, so this function allows you to easily convert
37
+ a bytes stream to the correct format. The result is cached using lru_cache
38
+ for performance.
36
39
 
37
40
  Args:
38
- stream: Input byte stream to convert
41
+ stream (bytes): The bytes stream to convert.
39
42
 
40
43
  Returns:
41
- BinaryIO: Seekable file-like object containing the stream data
44
+ BinaryIO: A BinaryIO object representing the stream.
42
45
  """
43
-
44
46
  result = BytesIO()
45
47
  result.write(stream)
46
48
  result.seek(0)
@@ -48,101 +50,52 @@ def stream_to_io(stream: bytes) -> BinaryIO:
48
50
  return result
49
51
 
50
52
 
51
- def handle_color(color: Union[list, ArrayObject]) -> Union[Color, CMYKColor, None]:
52
- """Converts PDF color specifications to reportlab color objects.
53
-
54
- Supports:
55
- - Grayscale (1 component)
56
- - RGB (3 components)
57
- - CMYK (4 components)
58
-
59
- Args:
60
- color: Color array from PDF specification
61
-
62
- Returns:
63
- Union[Color, CMYKColor, None]: Color object or None if invalid format
64
- """
65
-
66
- result = None
67
-
68
- if len(color) == 1:
69
- result = CMYKColor(black=1 - color[0])
70
- elif len(color) == 3:
71
- result = Color(red=color[0], green=color[1], blue=color[2])
72
- elif len(color) == 4:
73
- result = CMYKColor(
74
- cyan=color[0], magenta=color[1], yellow=color[2], black=color[3]
75
- )
76
-
77
- return result
78
-
53
+ @lru_cache
54
+ def enable_adobe_mode(pdf: bytes) -> bytes:
55
+ """Enables Adobe-specific settings in the PDF to ensure proper rendering of form fields.
79
56
 
80
- def checkbox_radio_to_draw(
81
- widget: Union[Checkbox, Radio], font_size: Union[float, int]
82
- ) -> Text:
83
- """Converts checkbox/radio widgets to text symbols for drawing.
57
+ This function modifies the PDF's AcroForm dictionary to include the `NeedAppearances` flag,
58
+ which forces Adobe Reader to generate appearance streams for form fields. It also handles
59
+ XFA (XML Forms Architecture) forms by removing the XFA entry from the AcroForm dictionary
60
+ if it exists, ensuring compatibility and proper rendering. This ensures that the form fields
61
+ are rendered correctly in Adobe Reader, especially when the form is filled programmatically.
84
62
 
85
63
  Args:
86
- widget: Checkbox or Radio widget to convert
87
- font_size: Size for the drawn symbol
64
+ pdf (bytes): The PDF content as bytes.
88
65
 
89
66
  Returns:
90
- Text: Text widget configured to draw the appropriate symbol
67
+ bytes: The modified PDF content with Adobe mode enabled.
91
68
  """
69
+ reader = PdfReader(stream_to_io(pdf))
70
+ writer = PdfWriter()
92
71
 
93
- new_widget = Text(
94
- name=widget.name,
95
- value="",
96
- )
97
- new_widget.font = DEFAULT_FONT
98
- new_widget.font_size = font_size
99
- new_widget.font_color = DEFAULT_FONT_COLOR
100
- new_widget.value = BUTTON_STYLES.get(widget.button_style) or (
101
- DEFAULT_CHECKBOX_STYLE if type(widget) is Checkbox else DEFAULT_RADIO_STYLE
102
- )
103
-
104
- return new_widget
72
+ if AcroForm in reader.trailer[Root] and XFA in reader.trailer[Root][AcroForm]:
73
+ del reader.trailer[Root][AcroForm][XFA]
105
74
 
75
+ writer.append(reader)
76
+ writer.set_need_appearances_writer()
106
77
 
107
- def preview_widget_to_draw(
108
- widget_name: str, widget: WIDGET_TYPES, with_preview_text: bool
109
- ) -> Text:
110
- """Creates preview version of a widget showing field name/location.
78
+ with BytesIO() as f:
79
+ writer.write(f)
80
+ f.seek(0)
81
+ return f.read()
111
82
 
112
- Args:
113
- widget_name: Name of the widget to generate preview for
114
- widget: Widget to generate preview for
115
- with_preview_text: Whether to include field name in preview
116
83
 
117
- Returns:
118
- Text: Text widget configured for preview display
84
+ @lru_cache
85
+ def remove_all_widgets(pdf: bytes) -> bytes:
119
86
  """
87
+ Removes all widgets (form fields) from a PDF, effectively flattening the form.
120
88
 
121
- new_widget = Text(
122
- name=widget.name,
123
- value="{" + f" {widget_name} " + "}" if with_preview_text else None,
124
- )
125
- new_widget.font = DEFAULT_FONT
126
- new_widget.font_size = DEFAULT_FONT_SIZE
127
- new_widget.font_color = PREVIEW_FONT_COLOR
128
- new_widget.preview = with_preview_text
129
- new_widget.border_color = handle_color([0, 0, 0])
130
- new_widget.border_width = 1
131
- new_widget.render_widget = True
132
-
133
- return new_widget
134
-
135
-
136
- def remove_all_widgets(pdf: bytes) -> bytes:
137
- """Removes all interactive form fields from a PDF document.
89
+ This function takes a PDF as a bytes stream, removes all of its interactive
90
+ form fields (widgets), and returns the modified PDF as a bytes stream. This
91
+ is useful for creating a non-interactive version of a PDF form.
138
92
 
139
93
  Args:
140
- pdf: Input PDF as bytes
94
+ pdf (bytes): The PDF as a bytes stream.
141
95
 
142
96
  Returns:
143
- bytes: Flattened PDF with form fields removed
97
+ bytes: The PDF with all widgets removed, as a bytes stream.
144
98
  """
145
-
146
99
  pdf_file = PdfReader(stream_to_io(pdf))
147
100
  result_stream = BytesIO()
148
101
  writer = PdfWriter()
@@ -157,15 +110,18 @@ def remove_all_widgets(pdf: bytes) -> bytes:
157
110
 
158
111
 
159
112
  def get_page_streams(pdf: bytes) -> List[bytes]:
160
- """Splits a PDF into individual page streams.
113
+ """
114
+ Extracts the content stream of each page in a PDF as a list of byte streams.
115
+
116
+ This function takes a PDF as a bytes stream and returns a list of bytes streams,
117
+ where each element in the list represents the content stream of a page in the PDF.
161
118
 
162
119
  Args:
163
- pdf: Input PDF as bytes
120
+ pdf (bytes): The PDF as a bytes stream.
164
121
 
165
122
  Returns:
166
- List[bytes]: List where each element contains a single PDF page
123
+ List[bytes]: A list of bytes streams, one for each page.
167
124
  """
168
-
169
125
  pdf_file = PdfReader(stream_to_io(pdf))
170
126
  result = []
171
127
 
@@ -181,16 +137,20 @@ def get_page_streams(pdf: bytes) -> List[bytes]:
181
137
 
182
138
 
183
139
  def merge_two_pdfs(pdf: bytes, other: bytes) -> bytes:
184
- """Combines two PDF documents into a single multipage PDF.
140
+ """
141
+ Merges two PDF files into a single PDF file.
142
+
143
+ This function takes two PDF files as byte streams, merges them, and returns the result as a single PDF byte stream.
144
+ It handles the merging of pages from both PDFs and also attempts to preserve form field widgets from both input PDFs
145
+ in the final merged PDF. The form fields are cloned and added to the output pages.
185
146
 
186
147
  Args:
187
- pdf: First PDF as bytes
188
- other: Second PDF as bytes
148
+ pdf (bytes): The first PDF file as a byte stream.
149
+ other (bytes): The second PDF file as a byte stream.
189
150
 
190
151
  Returns:
191
- bytes: Combined PDF containing all pages from both inputs
152
+ bytes: The merged PDF file as a byte stream.
192
153
  """
193
-
194
154
  output = PdfWriter()
195
155
  pdf_file = PdfReader(stream_to_io(pdf))
196
156
  other_file = PdfReader(stream_to_io(other))
@@ -203,20 +163,52 @@ def merge_two_pdfs(pdf: bytes, other: bytes) -> bytes:
203
163
 
204
164
  output.write(result)
205
165
  result.seek(0)
166
+
167
+ merged_no_widgets = PdfReader(stream_to_io(remove_all_widgets(result.read())))
168
+ output = PdfWriter()
169
+ output.append(merged_no_widgets)
170
+
171
+ # TODO: refactor duplicate logic with copy_watermark_widgets
172
+ widgets_to_copy = {}
173
+ for i, page in enumerate(pdf_file.pages):
174
+ widgets_to_copy[i] = []
175
+ for annot in page.get(Annots, []):
176
+ widgets_to_copy[i].append(annot.clone(output))
177
+
178
+ for i, page in enumerate(other_file.pages):
179
+ widgets_to_copy[i + len(pdf_file.pages)] = []
180
+ for annot in page.get(Annots, []):
181
+ widgets_to_copy[i + len(pdf_file.pages)].append(annot.clone(output))
182
+
183
+ for i, page in enumerate(output.pages):
184
+ page[NameObject(Annots)] = (
185
+ (page[NameObject(Annots)] + ArrayObject(widgets_to_copy[i]))
186
+ if Annots in page
187
+ else ArrayObject(widgets_to_copy[i])
188
+ )
189
+
190
+ result = BytesIO()
191
+ output.write(result)
192
+ result.seek(0)
206
193
  return result.read()
207
194
 
208
195
 
209
196
  def find_pattern_match(pattern: dict, widget: Union[dict, DictionaryObject]) -> bool:
210
- """Tests whether a widget matches the specified PDF attribute pattern.
197
+ """
198
+ Recursively finds a pattern match within a PDF widget (annotation dictionary).
199
+
200
+ This function searches for a specific pattern within a PDF widget's properties.
201
+ It recursively traverses the widget's dictionary, comparing keys and values
202
+ to the provided pattern.
211
203
 
212
204
  Args:
213
- pattern: Dictionary of PDF attributes and expected values
214
- widget: PDF widget to test against the pattern
205
+ pattern (dict): The pattern to search for, represented as a dictionary.
206
+ widget (Union[dict, DictionaryObject]): The widget to search within, which
207
+ can be a dictionary or a DictionaryObject.
215
208
 
216
209
  Returns:
217
- bool: True if widget matches all pattern criteria
210
+ bool: True if a match is found, False otherwise.
218
211
  """
219
-
220
212
  for key, value in widget.items():
221
213
  result = False
222
214
  if key in pattern:
@@ -238,16 +230,21 @@ def find_pattern_match(pattern: dict, widget: Union[dict, DictionaryObject]) ->
238
230
  def traverse_pattern(
239
231
  pattern: dict, widget: Union[dict, DictionaryObject]
240
232
  ) -> Union[str, list, None]:
241
- """Recursively searches a widget for a matching pattern and returns its value.
233
+ """
234
+ Recursively traverses a pattern within a PDF widget (annotation dictionary) and returns the value.
235
+
236
+ This function searches for a specific pattern within a PDF widget's properties.
237
+ It recursively traverses the widget's dictionary, comparing keys and values
238
+ to the provided pattern and returns the value if the pattern is True.
242
239
 
243
240
  Args:
244
- pattern: Dictionary of PDF attributes specifying the search path
245
- widget: PDF widget to search through
241
+ pattern (dict): The pattern to traverse, represented as a dictionary.
242
+ widget (Union[dict, DictionaryObject]): The widget to traverse within, which
243
+ can be a dictionary or a DictionaryObject.
246
244
 
247
245
  Returns:
248
- Union[str, list, None]: Found value or None if not matched
246
+ Union[str, list, None]: The value found, or None if not found.
249
247
  """
250
-
251
248
  for key, value in widget.items():
252
249
  result = None
253
250
  if key in pattern:
@@ -270,18 +267,25 @@ def extract_widget_property(
270
267
  default_value: Any,
271
268
  func_before_return: Union[Callable, None],
272
269
  ) -> Any:
273
- """Extracts a widget property using pattern matching with fallback.
270
+ """
271
+ Extracts a specific property from a PDF widget based on a list of patterns.
272
+
273
+ This function iterates through a list of patterns, attempting to find a match
274
+ within the provided widget. If a match is found, the corresponding value is
275
+ extracted and returned. If no match is found, a default value is returned.
274
276
 
275
277
  Args:
276
- widget: PDF widget dictionary to examine
277
- patterns: List of patterns to try in order
278
- default_value: Value to return if no patterns match
279
- func_before_return: Optional function to transform the extracted value
278
+ widget (Union[dict, DictionaryObject]): The widget to extract the property from.
279
+ patterns (list): A list of patterns to search for. Each pattern should be a
280
+ dictionary representing the structure of the property to extract.
281
+ default_value (Any): The default value to return if no pattern is found.
282
+ func_before_return (Union[Callable, None]): An optional function to call before
283
+ returning the extracted value. This can be used to perform additional
284
+ processing or formatting on the value.
280
285
 
281
286
  Returns:
282
- Any: Extracted property value or default_value
287
+ Any: The extracted property value, or the default value if no pattern is found.
283
288
  """
284
-
285
289
  result = default_value
286
290
 
287
291
  for pattern in patterns:
@@ -294,12 +298,16 @@ def extract_widget_property(
294
298
 
295
299
 
296
300
  def generate_unique_suffix() -> str:
297
- """Generates a random string for disambiguating field names during merging.
301
+ """
302
+ Generates a unique suffix string for internal use, such as to avoid naming conflicts.
303
+
304
+ This function creates a random string of characters with a predefined length
305
+ (UNIQUE_SUFFIX_LENGTH) using a combination of ASCII letters, digits, and
306
+ punctuation characters (excluding hyphens).
298
307
 
299
308
  Returns:
300
- str: Random string containing letters, digits and symbols
309
+ str: A unique suffix string.
301
310
  """
302
-
303
311
  return "".join(
304
312
  [
305
313
  choice(ascii_letters + digits + punctuation.replace("-", ""))