PyPDFForm: 2.5.0 (py3-none-any wheel) → 3.0.0 (py3-none-any wheel)

This diff compares the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.

Potentially problematic release.


This version of PyPDFForm might be problematic. Click here for more details.

PyPDFForm/utils.py CHANGED
@@ -1,13 +1,16 @@
1
1
  # -*- coding: utf-8 -*-
2
- """Provides core utility functions for PDF form processing.
3
-
4
- This module contains general-purpose utilities used throughout PyPDFForm:
5
- - Stream/file handling conversions
6
- - Color space transformations
7
- - Widget preview generation
8
- - PDF merging and splitting
9
- - Pattern matching for PDF structures
10
- - Unique ID generation
2
+ """
3
+ This module provides a collection of utility functions used throughout the PyPDFForm library.
4
+
5
+ It includes functions for:
6
+ - Converting byte streams to BinaryIO objects.
7
+ - Removing all widgets (form fields) from a PDF.
8
+ - Extracting the content stream of each page in a PDF.
9
+ - Merging two PDFs into one.
10
+ - Finding and traversing patterns within PDF widgets.
11
+ - Extracting widget properties based on defined patterns.
12
+ - Generating unique suffixes for internal use.
13
+ - Enabling Adobe-specific settings in the PDF to ensure proper rendering of form fields.
11
14
  """
12
15
 
13
16
  from collections.abc import Callable
@@ -18,29 +21,30 @@ from string import ascii_letters, digits, punctuation
18
21
  from typing import Any, BinaryIO, List, Union
19
22
 
20
23
  from pypdf import PdfReader, PdfWriter
21
- from pypdf.generic import ArrayObject, DictionaryObject
22
- from reportlab.lib.colors import CMYKColor, Color
24
+ from pypdf.generic import (ArrayObject, BooleanObject, DictionaryObject,
25
+ NameObject)
23
26
 
24
- from .constants import (BUTTON_STYLES, DEFAULT_CHECKBOX_STYLE, DEFAULT_FONT,
25
- DEFAULT_FONT_COLOR, DEFAULT_FONT_SIZE,
26
- DEFAULT_RADIO_STYLE, PREVIEW_FONT_COLOR,
27
- UNIQUE_SUFFIX_LENGTH, WIDGET_TYPES)
28
- from .middleware.checkbox import Checkbox
29
- from .middleware.radio import Radio
30
- from .middleware.text import Text
27
+ from .constants import (UNIQUE_SUFFIX_LENGTH, AcroForm, Annots,
28
+ NeedAppearances, Root)
31
29
 
32
30
 
33
31
  @lru_cache
34
32
  def stream_to_io(stream: bytes) -> BinaryIO:
35
- """Converts a byte stream to a seekable binary IO object.
33
+ """
34
+ Converts a bytes stream to a BinaryIO object, which can be used by PyPDFForm.
35
+
36
+ This function takes a bytes stream as input and returns a BinaryIO object
37
+ that represents the same data. This is useful because PyPDFForm often
38
+ works with BinaryIO objects, so this function allows you to easily convert
39
+ a bytes stream to the correct format. The result is cached using lru_cache
40
+ for performance.
36
41
 
37
42
  Args:
38
- stream: Input byte stream to convert
43
+ stream (bytes): The bytes stream to convert.
39
44
 
40
45
  Returns:
41
- BinaryIO: Seekable file-like object containing the stream data
46
+ BinaryIO: A BinaryIO object representing the stream.
42
47
  """
43
-
44
48
  result = BytesIO()
45
49
  result.write(stream)
46
50
  result.seek(0)
@@ -48,101 +52,56 @@ def stream_to_io(stream: bytes) -> BinaryIO:
48
52
  return result
49
53
 
50
54
 
51
- def handle_color(color: Union[list, ArrayObject]) -> Union[Color, CMYKColor, None]:
52
- """Converts PDF color specifications to reportlab color objects.
53
-
54
- Supports:
55
- - Grayscale (1 component)
56
- - RGB (3 components)
57
- - CMYK (4 components)
58
-
59
- Args:
60
- color: Color array from PDF specification
61
-
62
- Returns:
63
- Union[Color, CMYKColor, None]: Color object or None if invalid format
64
- """
65
-
66
- result = None
67
-
68
- if len(color) == 1:
69
- result = CMYKColor(black=1 - color[0])
70
- elif len(color) == 3:
71
- result = Color(red=color[0], green=color[1], blue=color[2])
72
- elif len(color) == 4:
73
- result = CMYKColor(
74
- cyan=color[0], magenta=color[1], yellow=color[2], black=color[3]
75
- )
76
-
77
- return result
78
-
55
+ @lru_cache
56
+ def enable_adobe_mode(pdf: bytes) -> bytes:
57
+ """Enables Adobe-specific settings in the PDF to ensure proper rendering of form fields.
79
58
 
80
- def checkbox_radio_to_draw(
81
- widget: Union[Checkbox, Radio], font_size: Union[float, int]
82
- ) -> Text:
83
- """Converts checkbox/radio widgets to text symbols for drawing.
59
+ This function modifies the PDF's AcroForm dictionary to include the `NeedAppearances` flag,
60
+ which forces Adobe Reader to generate appearance streams for form fields. This ensures that
61
+ the form fields are rendered correctly in Adobe Reader, especially when the form is filled
62
+ programmatically.
84
63
 
85
64
  Args:
86
- widget: Checkbox or Radio widget to convert
87
- font_size: Size for the drawn symbol
65
+ pdf (bytes): The PDF content as bytes.
88
66
 
89
67
  Returns:
90
- Text: Text widget configured to draw the appropriate symbol
68
+ bytes: The modified PDF content with Adobe mode enabled.
91
69
  """
70
+ reader = PdfReader(stream_to_io(pdf))
71
+ writer = PdfWriter()
92
72
 
93
- new_widget = Text(
94
- name=widget.name,
95
- value="",
73
+ # https://stackoverflow.com/questions/47288578/pdf-form-filled-with-pypdf2-does-not-show-in-print
74
+ if AcroForm in reader.trailer[Root]:
75
+ if NeedAppearances in reader.trailer[Root][AcroForm]:
76
+ return pdf
77
+ else:
78
+ reader.trailer[Root].update({NameObject(AcroForm): DictionaryObject()})
79
+ reader.trailer[Root][AcroForm].update(
80
+ {NameObject(NeedAppearances): BooleanObject(True)}
96
81
  )
97
- new_widget.font = DEFAULT_FONT
98
- new_widget.font_size = font_size
99
- new_widget.font_color = DEFAULT_FONT_COLOR
100
- new_widget.value = BUTTON_STYLES.get(widget.button_style) or (
101
- DEFAULT_CHECKBOX_STYLE if type(widget) is Checkbox else DEFAULT_RADIO_STYLE
102
- )
103
-
104
- return new_widget
105
-
82
+ writer.append(reader)
106
83
 
107
- def preview_widget_to_draw(
108
- widget_name: str, widget: WIDGET_TYPES, with_preview_text: bool
109
- ) -> Text:
110
- """Creates preview version of a widget showing field name/location.
84
+ with BytesIO() as f:
85
+ writer.write(f)
86
+ f.seek(0)
87
+ return f.read()
111
88
 
112
- Args:
113
- widget_name: Name of the widget to generate preview for
114
- widget: Widget to generate preview for
115
- with_preview_text: Whether to include field name in preview
116
89
 
117
- Returns:
118
- Text: Text widget configured for preview display
90
+ @lru_cache
91
+ def remove_all_widgets(pdf: bytes) -> bytes:
119
92
  """
93
+ Removes all widgets (form fields) from a PDF, effectively flattening the form.
120
94
 
121
- new_widget = Text(
122
- name=widget.name,
123
- value="{" + f" {widget_name} " + "}" if with_preview_text else None,
124
- )
125
- new_widget.font = DEFAULT_FONT
126
- new_widget.font_size = DEFAULT_FONT_SIZE
127
- new_widget.font_color = PREVIEW_FONT_COLOR
128
- new_widget.preview = with_preview_text
129
- new_widget.border_color = handle_color([0, 0, 0])
130
- new_widget.border_width = 1
131
- new_widget.render_widget = True
132
-
133
- return new_widget
134
-
135
-
136
- def remove_all_widgets(pdf: bytes) -> bytes:
137
- """Removes all interactive form fields from a PDF document.
95
+ This function takes a PDF as a bytes stream, removes all of its interactive
96
+ form fields (widgets), and returns the modified PDF as a bytes stream. This
97
+ is useful for creating a non-interactive version of a PDF form.
138
98
 
139
99
  Args:
140
- pdf: Input PDF as bytes
100
+ pdf (bytes): The PDF as a bytes stream.
141
101
 
142
102
  Returns:
143
- bytes: Flattened PDF with form fields removed
103
+ bytes: The PDF with all widgets removed, as a bytes stream.
144
104
  """
145
-
146
105
  pdf_file = PdfReader(stream_to_io(pdf))
147
106
  result_stream = BytesIO()
148
107
  writer = PdfWriter()
@@ -157,15 +116,18 @@ def remove_all_widgets(pdf: bytes) -> bytes:
157
116
 
158
117
 
159
118
  def get_page_streams(pdf: bytes) -> List[bytes]:
160
- """Splits a PDF into individual page streams.
119
+ """
120
+ Extracts the content stream of each page in a PDF as a list of byte streams.
121
+
122
+ This function takes a PDF as a bytes stream and returns a list of bytes streams,
123
+ where each element in the list represents the content stream of a page in the PDF.
161
124
 
162
125
  Args:
163
- pdf: Input PDF as bytes
126
+ pdf (bytes): The PDF as a bytes stream.
164
127
 
165
128
  Returns:
166
- List[bytes]: List where each element contains a single PDF page
129
+ List[bytes]: A list of bytes streams, one for each page.
167
130
  """
168
-
169
131
  pdf_file = PdfReader(stream_to_io(pdf))
170
132
  result = []
171
133
 
@@ -181,16 +143,20 @@ def get_page_streams(pdf: bytes) -> List[bytes]:
181
143
 
182
144
 
183
145
  def merge_two_pdfs(pdf: bytes, other: bytes) -> bytes:
184
- """Combines two PDF documents into a single multipage PDF.
146
+ """
147
+ Merges two PDF files into a single PDF file.
148
+
149
+ This function takes two PDF files as byte streams, merges them, and returns the result as a single PDF byte stream.
150
+ It handles the merging of pages from both PDFs and also attempts to preserve form field widgets from both input PDFs
151
+ in the final merged PDF. The form fields are cloned and added to the output pages.
185
152
 
186
153
  Args:
187
- pdf: First PDF as bytes
188
- other: Second PDF as bytes
154
+ pdf (bytes): The first PDF file as a byte stream.
155
+ other (bytes): The second PDF file as a byte stream.
189
156
 
190
157
  Returns:
191
- bytes: Combined PDF containing all pages from both inputs
158
+ bytes: The merged PDF file as a byte stream.
192
159
  """
193
-
194
160
  output = PdfWriter()
195
161
  pdf_file = PdfReader(stream_to_io(pdf))
196
162
  other_file = PdfReader(stream_to_io(other))
@@ -203,20 +169,52 @@ def merge_two_pdfs(pdf: bytes, other: bytes) -> bytes:
203
169
 
204
170
  output.write(result)
205
171
  result.seek(0)
172
+
173
+ merged_no_widgets = PdfReader(stream_to_io(remove_all_widgets(result.read())))
174
+ output = PdfWriter()
175
+ output.append(merged_no_widgets)
176
+
177
+ # TODO: refactor duplicate logic with copy_watermark_widgets
178
+ widgets_to_copy = {}
179
+ for i, page in enumerate(pdf_file.pages):
180
+ widgets_to_copy[i] = []
181
+ for annot in page.get(Annots, []):
182
+ widgets_to_copy[i].append(annot.clone(output))
183
+
184
+ for i, page in enumerate(other_file.pages):
185
+ widgets_to_copy[i + len(pdf_file.pages)] = []
186
+ for annot in page.get(Annots, []):
187
+ widgets_to_copy[i + len(pdf_file.pages)].append(annot.clone(output))
188
+
189
+ for i, page in enumerate(output.pages):
190
+ page[NameObject(Annots)] = (
191
+ (page[NameObject(Annots)] + ArrayObject(widgets_to_copy[i]))
192
+ if Annots in page
193
+ else ArrayObject(widgets_to_copy[i])
194
+ )
195
+
196
+ result = BytesIO()
197
+ output.write(result)
198
+ result.seek(0)
206
199
  return result.read()
207
200
 
208
201
 
209
202
  def find_pattern_match(pattern: dict, widget: Union[dict, DictionaryObject]) -> bool:
210
- """Tests whether a widget matches the specified PDF attribute pattern.
203
+ """
204
+ Recursively finds a pattern match within a PDF widget (annotation dictionary).
205
+
206
+ This function searches for a specific pattern within a PDF widget's properties.
207
+ It recursively traverses the widget's dictionary, comparing keys and values
208
+ to the provided pattern.
211
209
 
212
210
  Args:
213
- pattern: Dictionary of PDF attributes and expected values
214
- widget: PDF widget to test against the pattern
211
+ pattern (dict): The pattern to search for, represented as a dictionary.
212
+ widget (Union[dict, DictionaryObject]): The widget to search within, which
213
+ can be a dictionary or a DictionaryObject.
215
214
 
216
215
  Returns:
217
- bool: True if widget matches all pattern criteria
216
+ bool: True if a match is found, False otherwise.
218
217
  """
219
-
220
218
  for key, value in widget.items():
221
219
  result = False
222
220
  if key in pattern:
@@ -238,16 +236,21 @@ def find_pattern_match(pattern: dict, widget: Union[dict, DictionaryObject]) ->
238
236
  def traverse_pattern(
239
237
  pattern: dict, widget: Union[dict, DictionaryObject]
240
238
  ) -> Union[str, list, None]:
241
- """Recursively searches a widget for a matching pattern and returns its value.
239
+ """
240
+ Recursively traverses a pattern within a PDF widget (annotation dictionary) and returns the value.
241
+
242
+ This function searches for a specific pattern within a PDF widget's properties.
243
+ It recursively traverses the widget's dictionary, comparing keys and values
244
+ to the provided pattern and returns the value if the pattern is True.
242
245
 
243
246
  Args:
244
- pattern: Dictionary of PDF attributes specifying the search path
245
- widget: PDF widget to search through
247
+ pattern (dict): The pattern to traverse, represented as a dictionary.
248
+ widget (Union[dict, DictionaryObject]): The widget to traverse within, which
249
+ can be a dictionary or a DictionaryObject.
246
250
 
247
251
  Returns:
248
- Union[str, list, None]: Found value or None if not matched
252
+ Union[str, list, None]: The value found, or None if not found.
249
253
  """
250
-
251
254
  for key, value in widget.items():
252
255
  result = None
253
256
  if key in pattern:
@@ -270,18 +273,25 @@ def extract_widget_property(
270
273
  default_value: Any,
271
274
  func_before_return: Union[Callable, None],
272
275
  ) -> Any:
273
- """Extracts a widget property using pattern matching with fallback.
276
+ """
277
+ Extracts a specific property from a PDF widget based on a list of patterns.
278
+
279
+ This function iterates through a list of patterns, attempting to find a match
280
+ within the provided widget. If a match is found, the corresponding value is
281
+ extracted and returned. If no match is found, a default value is returned.
274
282
 
275
283
  Args:
276
- widget: PDF widget dictionary to examine
277
- patterns: List of patterns to try in order
278
- default_value: Value to return if no patterns match
279
- func_before_return: Optional function to transform the extracted value
284
+ widget (Union[dict, DictionaryObject]): The widget to extract the property from.
285
+ patterns (list): A list of patterns to search for. Each pattern should be a
286
+ dictionary representing the structure of the property to extract.
287
+ default_value (Any): The default value to return if no pattern is found.
288
+ func_before_return (Union[Callable, None]): An optional function to call before
289
+ returning the extracted value. This can be used to perform additional
290
+ processing or formatting on the value.
280
291
 
281
292
  Returns:
282
- Any: Extracted property value or default_value
293
+ Any: The extracted property value, or the default value if no pattern is found.
283
294
  """
284
-
285
295
  result = default_value
286
296
 
287
297
  for pattern in patterns:
@@ -294,12 +304,16 @@ def extract_widget_property(
294
304
 
295
305
 
296
306
  def generate_unique_suffix() -> str:
297
- """Generates a random string for disambiguating field names during merging.
307
+ """
308
+ Generates a unique suffix string for internal use, such as to avoid naming conflicts.
309
+
310
+ This function creates a random string of characters with a predefined length
311
+ (UNIQUE_SUFFIX_LENGTH) using a combination of ASCII letters, digits, and
312
+ punctuation characters (excluding hyphens).
298
313
 
299
314
  Returns:
300
- str: Random string containing letters, digits and symbols
315
+ str: A unique suffix string.
301
316
  """
302
-
303
317
  return "".join(
304
318
  [
305
319
  choice(ascii_letters + digits + punctuation.replace("-", ""))