PyPDFForm 3.5.3__py3-none-any.whl → 4.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. PyPDFForm/__init__.py +5 -3
  2. PyPDFForm/adapter.py +33 -1
  3. PyPDFForm/ap.py +99 -0
  4. PyPDFForm/assets/__init__.py +0 -0
  5. PyPDFForm/assets/blank.py +100 -0
  6. PyPDFForm/constants.py +20 -2
  7. PyPDFForm/coordinate.py +7 -11
  8. PyPDFForm/deprecation.py +30 -0
  9. PyPDFForm/filler.py +17 -36
  10. PyPDFForm/font.py +16 -16
  11. PyPDFForm/hooks.py +153 -30
  12. PyPDFForm/image.py +0 -3
  13. PyPDFForm/middleware/__init__.py +35 -0
  14. PyPDFForm/middleware/base.py +24 -5
  15. PyPDFForm/middleware/checkbox.py +18 -1
  16. PyPDFForm/middleware/signature.py +0 -1
  17. PyPDFForm/patterns.py +44 -13
  18. PyPDFForm/raw/__init__.py +37 -0
  19. PyPDFForm/raw/circle.py +65 -0
  20. PyPDFForm/raw/ellipse.py +69 -0
  21. PyPDFForm/raw/image.py +79 -0
  22. PyPDFForm/raw/line.py +65 -0
  23. PyPDFForm/raw/rect.py +70 -0
  24. PyPDFForm/raw/text.py +73 -0
  25. PyPDFForm/template.py +114 -12
  26. PyPDFForm/types.py +49 -0
  27. PyPDFForm/utils.py +31 -41
  28. PyPDFForm/watermark.py +153 -44
  29. PyPDFForm/widgets/__init__.py +1 -0
  30. PyPDFForm/widgets/base.py +79 -59
  31. PyPDFForm/widgets/checkbox.py +30 -30
  32. PyPDFForm/widgets/dropdown.py +42 -40
  33. PyPDFForm/widgets/image.py +17 -16
  34. PyPDFForm/widgets/radio.py +27 -28
  35. PyPDFForm/widgets/signature.py +96 -60
  36. PyPDFForm/widgets/text.py +40 -40
  37. PyPDFForm/wrapper.py +256 -240
  38. {pypdfform-3.5.3.dist-info → pypdfform-4.2.0.dist-info}/METADATA +33 -26
  39. pypdfform-4.2.0.dist-info/RECORD +47 -0
  40. {pypdfform-3.5.3.dist-info → pypdfform-4.2.0.dist-info}/licenses/LICENSE +1 -1
  41. pypdfform-3.5.3.dist-info/RECORD +0 -35
  42. /PyPDFForm/{widgets → assets}/bedrock.py +0 -0
  43. {pypdfform-3.5.3.dist-info → pypdfform-4.2.0.dist-info}/WHEEL +0 -0
  44. {pypdfform-3.5.3.dist-info → pypdfform-4.2.0.dist-info}/top_level.txt +0 -0
PyPDFForm/raw/image.py ADDED
@@ -0,0 +1,79 @@
1
+ # -*- coding: utf-8 -*-
2
+ # pylint: disable=R0801
3
+ """
4
+ Contains the RawImage class, which represents an image that can be drawn
5
+ directly onto a PDF page at a specified position and size.
6
+ """
7
+
8
+ from typing import BinaryIO, Union
9
+
10
+ from ..adapter import fp_or_f_obj_or_stream_to_stream
11
+ from ..image import rotate_image
12
+
13
+
14
class RawImage:
    """
    An image to be painted straight onto a page of a PDF document.

    The instance records the image source together with the target page,
    position, drawn size, and rotation. The source may be a file path, raw
    bytes, or a binary stream; it is only converted into a stream (and
    rotated, when a rotation is set) at the moment `to_draw` is read.
    """

    def __init__(
        self,
        image: Union[bytes, str, BinaryIO],
        page_number: int,
        x: float,
        y: float,
        width: float,
        height: float,
        rotation: float = 0,
    ) -> None:
        """
        Stores the parameters describing the image to draw.

        Args:
            image: Image source given as a path (str), raw bytes (bytes),
                or a file stream (BinaryIO).
            page_number: 1-based index of the page to draw the image on.
            x: Horizontal position of the image's bottom-left corner.
            y: Vertical position of the image's bottom-left corner.
            width: Width the image should have once drawn on the PDF.
            height: Height the image should have once drawn on the PDF.
            rotation: Rotation angle in degrees; 0 (the default) means the
                image is left as is.
        """
        super().__init__()

        self.image, self.page_number = image, page_number
        self.x, self.y = x, y
        self.width, self.height = width, height
        self.rotation = rotation

    @property
    def to_draw(self) -> dict:
        """
        Builds the dictionary describing this image for the drawing step.

        The stored source is first normalized into a stream. When
        `rotation` is truthy, that stream is passed through `rotate_image`
        before it is placed in the result.

        Returns:
            A dictionary with the keys "page_number", "type" (always
            "image"), "stream", "x", "y", "width", and "height".
        """
        stream = fp_or_f_obj_or_stream_to_stream(self.image)
        # A rotation of 0 is falsy, so unrotated images skip this step.
        stream = rotate_image(stream, self.rotation) if self.rotation else stream

        return dict(
            page_number=self.page_number,
            type="image",
            stream=stream,
            x=self.x,
            y=self.y,
            width=self.width,
            height=self.height,
        )
PyPDFForm/raw/line.py ADDED
@@ -0,0 +1,65 @@
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ Contains the RawLine class, which represents a line that can be drawn
4
+ directly onto a PDF page at specified coordinates.
5
+ """
6
+
7
+ from ..constants import DEFAULT_FONT_COLOR
8
+
9
+
10
class RawLine:
    """
    A straight line to be painted directly onto a page of a PDF document.

    The line is described by its start point, its end point, the page it
    belongs to, and its color.
    """

    def __init__(
        self,
        page_number: int,
        src_x: float,
        src_y: float,
        dest_x: float,
        dest_y: float,
        color: tuple = DEFAULT_FONT_COLOR,
    ) -> None:
        """
        Stores the parameters describing the line to draw.

        Args:
            page_number: 1-based index of the page to draw the line on.
            src_x: Horizontal position of the starting point.
            src_y: Vertical position of the starting point.
            dest_x: Horizontal position of the ending point.
            dest_y: Vertical position of the ending point.
            color: Line color as an RGB tuple (0-1 for each channel);
                defaults to DEFAULT_FONT_COLOR.
        """
        super().__init__()

        self.page_number = page_number
        self.src_x, self.src_y = src_x, src_y
        self.dest_x, self.dest_y = dest_x, dest_y
        self.color = color

    @property
    def to_draw(self) -> dict:
        """
        Builds the dictionary describing this line for the drawing step.

        Returns:
            A dictionary with the keys "page_number", "type" (always
            "line"), "src_x", "src_y", "dest_x", "dest_y", and "color".
        """
        payload = {"page_number": self.page_number, "type": "line"}
        # The remaining keys mirror the attribute names one-to-one.
        for name in ("src_x", "src_y", "dest_x", "dest_y", "color"):
            payload[name] = getattr(self, name)
        return payload
PyPDFForm/raw/rect.py ADDED
@@ -0,0 +1,70 @@
1
+ # -*- coding: utf-8 -*-
2
+ # pylint: disable=R0801
3
+ """
4
+ Contains the RawRectangle class, which represents a rectangle that can be drawn
5
+ directly onto a PDF page at specified coordinates and dimensions.
6
+ """
7
+
8
+ from ..constants import DEFAULT_FONT_COLOR
9
+
10
+
11
class RawRectangle:
    """
    A rectangle to be painted directly onto a page of a PDF document.

    The rectangle is described by the position of its bottom-left corner,
    its size, its outline color, and an optional fill color.
    """

    def __init__(
        self,
        page_number: int,
        x: float,
        y: float,
        width: float,
        height: float,
        color: tuple = DEFAULT_FONT_COLOR,
        fill_color: tuple = None,
    ) -> None:
        """
        Stores the parameters describing the rectangle to draw.

        Args:
            page_number: 1-based index of the page to draw the rectangle on.
            x: Horizontal position of the rectangle's bottom-left corner.
            y: Vertical position of the rectangle's bottom-left corner.
            width: Width of the rectangle.
            height: Height of the rectangle.
            color: Outline color as an RGB tuple (0-1 for each channel);
                defaults to DEFAULT_FONT_COLOR.
            fill_color: Fill color as an RGB tuple (0-1 for each channel),
                or None (the default) when no fill color is given.
        """
        super().__init__()

        self.page_number = page_number
        self.x, self.y = x, y
        self.width, self.height = width, height
        self.color, self.fill_color = color, fill_color

    @property
    def to_draw(self) -> dict:
        """
        Builds the dictionary describing this rectangle for the drawing step.

        Returns:
            A dictionary with the keys "page_number", "type" (always
            "rect"), "x", "y", "width", "height", "color", and "fill_color".
        """
        return dict(
            page_number=self.page_number,
            type="rect",
            x=self.x,
            y=self.y,
            width=self.width,
            height=self.height,
            color=self.color,
            fill_color=self.fill_color,
        )
PyPDFForm/raw/text.py ADDED
@@ -0,0 +1,73 @@
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ Contains the RawText class, which represents a text annotation
4
+ that can be drawn directly onto a PDF page without relying on existing form fields.
5
+ """
6
+
7
+ from ..constants import DEFAULT_FONT, DEFAULT_FONT_COLOR, DEFAULT_FONT_SIZE
8
+ from ..middleware.text import Text
9
+
10
+
11
class RawText:
    """
    A piece of text to be painted directly onto a page of a PDF document.

    The text is drawn at explicit coordinates and does not depend on any
    existing form field. The instance keeps the content, the target page,
    the position, and the font settings (name, size, and color).
    """

    def __init__(
        self,
        text: str,
        page_number: int,
        x: float,
        y: float,
        font: str = DEFAULT_FONT,
        font_size: float = DEFAULT_FONT_SIZE,
        font_color: tuple = DEFAULT_FONT_COLOR,
    ) -> None:
        """
        Stores the parameters describing the text to draw.

        Args:
            text: The string to be drawn.
            page_number: 1-based index of the page to draw the text on.
            x: Horizontal position of the text.
            y: Vertical position of the text.
            font: Name of the font to use; defaults to DEFAULT_FONT.
            font_size: Size of the font; defaults to DEFAULT_FONT_SIZE.
            font_color: Text color as an RGB tuple; defaults to
                DEFAULT_FONT_COLOR.
                NOTE(review): this was previously documented as 0-255 per
                channel, while the sibling raw line/rectangle classes
                document 0-1 for the same default constant -- confirm the
                expected channel range.
        """
        super().__init__()

        self.text, self.page_number = text, page_number
        self.x, self.y = x, y
        self.font, self.font_size, self.font_color = font, font_size, font_color

    @property
    def to_draw(self) -> dict:
        """
        Builds the dictionary describing this text for the drawing step.

        A fresh `Text` widget named "new" is created with the stored text
        as its value, and the font name, size, and color are copied onto it.

        Returns:
            A dictionary with the keys "page_number", "type" (always
            "text"), "widget" (the configured Text widget), "x", and "y".
        """
        text_widget = Text("new", self.text)
        # Copy the font settings in the same order: font, size, then color.
        for attr in ("font", "font_size", "font_color"):
            setattr(text_widget, attr, getattr(self, attr))

        return dict(
            page_number=self.page_number,
            type="text",
            widget=text_widget,
            x=self.x,
            y=self.y,
        )
PyPDFForm/template.py CHANGED
@@ -7,29 +7,25 @@ in PDF form templates. It leverages the pypdf library for PDF manipulation
7
7
  and defines specific patterns for identifying and constructing different
8
8
  types of widgets.
9
9
  """
10
- # TODO: In `build_widgets`, the `get_widgets_by_page` function is called, which then iterates through pages and annotations. For very large PDFs, this initial parsing and iteration can be a bottleneck. Consider optimizing the widget extraction process if possible, perhaps by using a more direct method to access annotations if `pypdf` allows.
11
- # TODO: The `construct_widget` function iterates through `WIDGET_TYPE_PATTERNS` for each widget. If there are many patterns or many widgets, this repeated iteration could be optimized by pre-compiling patterns or using a more efficient lookup mechanism.
12
- # TODO: In `get_widget_key`, the recursive call for `Parent` can lead to deep recursion for deeply nested widgets, potentially impacting performance or hitting recursion limits for extremely complex forms. Consider an iterative approach if deep nesting is common.
13
- # TODO: In `update_widget_keys`, the nested loops iterating through `old_keys`, `out.pages`, and `page.get(Annots, [])` can be very inefficient for large numbers of keys, pages, or annotations. Consider creating a lookup structure for annotations by key to avoid repeated linear scans.
14
- # TODO: In `update_widget_keys`, `PdfReader(stream_to_io(template))` and `out.append(pdf)` involve re-parsing and appending the PDF. For large PDFs, passing `PdfReader` and `PdfWriter` objects directly could reduce overhead.
15
10
 
16
11
  from functools import lru_cache
17
12
  from io import BytesIO
18
13
  from typing import Dict, List, Tuple, Union, cast
19
14
 
20
15
  from pypdf import PdfReader, PdfWriter
21
- from pypdf.generic import DictionaryObject
16
+ from pypdf.generic import DictionaryObject, NameObject, TextStringObject
22
17
 
23
- from .constants import WIDGET_TYPES, Annots, MaxLen, Parent, T
18
+ from .constants import (JS, WIDGET_TYPES, Annots, JavaScript, MaxLen,
19
+ OpenAction, Parent, S, T, Title)
24
20
  from .middleware.checkbox import Checkbox
25
21
  from .middleware.dropdown import Dropdown
26
22
  from .middleware.radio import Radio
27
23
  from .middleware.text import Text
28
24
  from .patterns import (DROPDOWN_CHOICE_PATTERNS, WIDGET_DESCRIPTION_PATTERNS,
29
25
  WIDGET_KEY_PATTERNS, WIDGET_TYPE_PATTERNS,
30
- get_checkbox_value, get_dropdown_value, get_radio_value,
31
- get_text_field_multiline, get_text_value,
32
- update_annotation_name)
26
+ get_checkbox_value, get_dropdown_value, get_field_rect,
27
+ get_radio_value, get_text_field_multiline,
28
+ get_text_value, update_annotation_name)
33
29
  from .utils import extract_widget_property, find_pattern_match, stream_to_io
34
30
 
35
31
 
@@ -62,10 +58,13 @@ def build_widgets(
62
58
  key = get_widget_key(widget, use_full_widget_name)
63
59
  _widget = construct_widget(widget, key)
64
60
  if _widget is not None:
65
- _widget.desc = extract_widget_property(
61
+ _widget.__dict__["tooltip"] = extract_widget_property(
66
62
  widget, WIDGET_DESCRIPTION_PATTERNS, None, str
67
63
  )
68
64
 
65
+ field_rect = get_field_rect(widget)
66
+ _widget.x, _widget.y, _widget.width, _widget.height = field_rect
67
+
69
68
  if isinstance(_widget, Text):
70
69
  # mostly for schema for now
71
70
  # doesn't trigger hook
@@ -84,11 +83,20 @@ def build_widgets(
84
83
 
85
84
  if isinstance(_widget, Radio):
86
85
  if key not in results:
86
+ _widget.x = []
87
+ _widget.y = []
88
+ _widget.width = []
89
+ _widget.height = []
87
90
  results[key] = _widget
88
91
 
89
92
  # for schema
90
93
  results[key].number_of_options += 1
91
94
 
95
+ results[key].x.append(field_rect[0])
96
+ results[key].y.append(field_rect[1])
97
+ results[key].width.append(field_rect[2])
98
+ results[key].height.append(field_rect[3])
99
+
92
100
  if get_radio_value(widget):
93
101
  results[key].value = results[key].number_of_options - 1
94
102
  continue
@@ -225,13 +233,107 @@ def get_dropdown_choices(widget: dict) -> Union[Tuple[str, ...], None]:
225
233
  Union[Tuple[str, ...], None]: A tuple of strings representing the choices in the dropdown, or None if the choices are not specified.
226
234
  """
227
235
  return tuple(
228
- (each if isinstance(each, str) else str(each[1]))
236
+ (
237
+ each.get_object()
238
+ if isinstance(each.get_object(), str)
239
+ else str(each.get_object()[1])
240
+ )
229
241
  for each in extract_widget_property(
230
242
  widget, DROPDOWN_CHOICE_PATTERNS, None, None
231
243
  )
232
244
  )
233
245
 
234
246
 
247
def get_on_open_javascript(pdf: bytes) -> Union[str, None]:
    """
    Retrieves the JavaScript that runs when the PDF is opened.

    The script is looked up as the `JS` entry of the `OpenAction` entry in
    the document's root object.

    Args:
        pdf (bytes): The PDF file content as a bytes stream.

    Returns:
        Union[str, None]: The JavaScript that runs when the PDF is opened, or
            None if the document has no open action, the open action carries
            no script, or the open action is not a dictionary at all.
    """
    reader = PdfReader(stream_to_io(pdf))
    try:
        return reader.root_object[OpenAction][JS]
    except (KeyError, TypeError):
        # KeyError: the root object has no open action, or the action
        # dictionary has no script entry.
        # TypeError: the open action is not a dictionary (the PDF format also
        # allows a destination array there), so it cannot be indexed by name.
        return None
262
+
263
+
264
def set_on_open_javascript(pdf: bytes, script: str) -> bytes:
    """
    Sets the JavaScript that runs when the PDF is opened.

    The document is copied into a new writer, and the writer's root object
    receives an `OpenAction` dictionary holding the action type and the
    script. A falsy `script` leaves the input untouched.

    Args:
        pdf (bytes): The PDF file content as a bytes stream.
        script (str): The JavaScript to run when the PDF is opened.

    Returns:
        bytes: The modified PDF content as a bytes stream, or the original
            `pdf` when `script` is empty.
    """
    if script:
        source = PdfReader(stream_to_io(pdf))
        target = PdfWriter()
        target.append(source)

        action = DictionaryObject()
        action[NameObject(S)] = NameObject(JavaScript)
        action[NameObject(JS)] = TextStringObject(script)

        target._root_object.update({NameObject(OpenAction): action})  # type: ignore # noqa: SLF001 # # pylint: disable=W0212

        with BytesIO() as buffer:
            target.write(buffer)
            # getvalue() yields the whole buffer regardless of the position.
            return buffer.getvalue()

    return pdf
292
+
293
+
294
def get_pdf_title(pdf: bytes) -> Union[str, None]:
    """
    Retrieves the title of a PDF from its metadata.

    Args:
        pdf (bytes): The PDF file content as a bytes stream.

    Returns:
        Union[str, None]: The title stored in the document metadata, or None
            when the document has no (or empty) metadata or no title entry.
    """
    metadata = PdfReader(stream_to_io(pdf)).metadata
    if not metadata:
        # Missing or empty metadata cannot contain a title.
        return None
    return metadata.get(Title)
306
+
307
+
308
def set_pdf_title(pdf: bytes, title: str) -> bytes:
    """
    Sets the title of a PDF in its metadata.

    The document is copied into a new writer. The metadata read from the
    input (or an empty mapping when there is none) gets its title entry
    set and is then added to the writer. A falsy `title` leaves the input
    untouched.

    Args:
        pdf (bytes): The PDF file content as a bytes stream.
        title (str): The new title for the PDF.

    Returns:
        bytes: The modified PDF content as a bytes stream, or the original
            `pdf` when `title` is empty.
    """
    if title:
        source = PdfReader(stream_to_io(pdf))
        target = PdfWriter()
        target.append(source)

        info = source.metadata or {}
        info[NameObject(Title)] = TextStringObject(title)
        target.add_metadata(info)

        with BytesIO() as buffer:
            target.write(buffer)
            # getvalue() yields the whole buffer regardless of the position.
            return buffer.getvalue()

    return pdf
335
+
336
+
235
337
  def update_widget_keys(
236
338
  template: bytes,
237
339
  widgets: Dict[str, WIDGET_TYPES],
PyPDFForm/types.py ADDED
@@ -0,0 +1,49 @@
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ A module for custom type definitions used throughout the PyPDFForm library.
4
+
5
+ This includes specialized container types like PdfWrapperList, which extends
6
+ the standard list to provide custom behavior for slicing operations, particularly
7
+ for merging PdfWrapper objects.
8
+ """
9
+
10
+ from typing import Any, Union
11
+
12
+
13
class PdfWrapperList(list):
    """
    A specialized list subclass designed to hold PdfWrapper objects.

    When sliced, this list merges the selected items with the `+` operator
    (PdfWrapper.__add__ for PdfWrapper objects) and returns the single merged
    object. If the slice is empty, it returns an empty list.
    For non-slice indexing, it behaves like a standard list.
    """

    def __getitem__(self, key: Any) -> Union[list, Any]:
        """
        Retrieves an item or the merged result of a slice of items.

        Args:
            key (Union[int, slice]): The index or slice to retrieve.

        Returns:
            Union[PdfWrapper, list, Any]: For a slice, the items it selects
                merged left to right with `+` (a one-item slice returns that
                item itself), or an empty list when the slice selects
                nothing. For an index, the item stored at that index.
        """
        if not isinstance(key, slice):
            return super().__getitem__(key)

        # list.__getitem__ with a slice yields a plain list, so indexing and
        # slicing `selected` below does not re-enter this method.
        selected = super().__getitem__(key)
        if not selected:
            # Honor the documented contract instead of returning None.
            return []

        merged = selected[0]
        for each in selected[1:]:
            # Use binary `+` rather than `+=` so that an element stored in
            # this list is never modified in place, and never test the
            # accumulator's truthiness: a falsy first item must still take
            # part in the merge.
            merged = merged + each
        return merged
PyPDFForm/utils.py CHANGED
@@ -10,16 +10,8 @@ It includes functions for:
10
10
  - Finding and traversing patterns within PDF widgets.
11
11
  - Extracting widget properties based on defined patterns.
12
12
  - Generating unique suffixes for internal use.
13
- - Enabling Adobe-specific settings in the PDF to ensure proper rendering of form fields.
13
+ - Setting the `NeedAppearances` flag in the PDF to ensure proper rendering of form fields.
14
14
  """
15
- # TODO: In `enable_adobe_mode`, `PdfReader(stream_to_io(pdf))` and `writer.append(reader)` involve re-parsing and appending the PDF. For large PDFs, passing `PdfReader` and `PdfWriter` objects directly could reduce overhead.
16
- # TODO: In `remove_all_widgets`, `PdfReader(stream_to_io(pdf))` and iterating through pages to add them to a new writer can be inefficient for large PDFs. Consider if `pypdf` offers a more direct way to remove annotations without reconstructing the entire PDF.
17
- # TODO: In `get_page_streams`, `PdfReader(stream_to_io(pdf))` and then creating a new `PdfWriter` for each page can be very inefficient. It would be more performant to iterate through the pages of a single `PdfReader` and extract their content streams directly if possible, or to use a single `PdfWriter` to extract multiple pages.
18
- # TODO: In `merge_two_pdfs`, the function reads and writes PDFs multiple times (`PdfReader`, `PdfWriter`, `remove_all_widgets`, then another `PdfReader` and `PdfWriter`). This is highly inefficient. The PDF objects should be passed around and modified in-place as much as possible, with a single final write operation.
19
- # TODO: The `merge_two_pdfs` function has a `TODO: refactor duplicate logic with copy_watermark_widgets` comment. This indicates a potential for code duplication and inefficiency. Refactoring this to a shared helper function would improve maintainability and potentially performance.
20
- # TODO: In `find_pattern_match` and `traverse_pattern`, the recursive nature and repeated dictionary lookups (`widget.items()`, `value.get_object()`) can be slow for deeply nested or complex widget structures. Consider optimizing these traversals, perhaps by pre-flattening the widget dictionary or using a more direct access method if `pypdf` allows.
21
- # TODO: In `extract_widget_property`, the loop iterates through `patterns` and calls `traverse_pattern` for each. If `patterns` is long or `traverse_pattern` is expensive, this could be a bottleneck. Consider optimizing the pattern matching or lookup.
22
- # TODO: `generate_unique_suffix` uses `choice` in a loop. While generally fast, for extremely high call volumes, pre-generating a pool of characters or using a faster random string generation method might offer minor improvements.
23
15
 
24
16
  from collections.abc import Callable
25
17
  from functools import lru_cache
@@ -31,7 +23,7 @@ from typing import Any, BinaryIO, List, Union
31
23
  from pypdf import PdfReader, PdfWriter
32
24
  from pypdf.generic import ArrayObject, DictionaryObject, NameObject
33
25
 
34
- from .constants import SLASH, UNIQUE_SUFFIX_LENGTH, XFA, AcroForm, Annots, Root
26
+ from .constants import SLASH, UNIQUE_SUFFIX_LENGTH, Annots
35
27
 
36
28
 
37
29
  @lru_cache
@@ -58,37 +50,6 @@ def stream_to_io(stream: bytes) -> BinaryIO:
58
50
  return result
59
51
 
60
52
 
61
- @lru_cache
62
- def enable_adobe_mode(pdf: bytes) -> bytes:
63
- """Enables Adobe-specific settings in the PDF to ensure proper rendering of form fields.
64
-
65
- This function modifies the PDF's AcroForm dictionary to include the `NeedAppearances` flag,
66
- which forces Adobe Reader to generate appearance streams for form fields. It also handles
67
- XFA (XML Forms Architecture) forms by removing the XFA entry from the AcroForm dictionary
68
- if it exists, ensuring compatibility and proper rendering. This ensures that the form fields
69
- are rendered correctly in Adobe Reader, especially when the form is filled programmatically.
70
-
71
- Args:
72
- pdf (bytes): The PDF content as bytes.
73
-
74
- Returns:
75
- bytes: The modified PDF content with Adobe mode enabled.
76
- """
77
- reader = PdfReader(stream_to_io(pdf))
78
- writer = PdfWriter()
79
-
80
- if AcroForm in reader.trailer[Root] and XFA in reader.trailer[Root][AcroForm]:
81
- del reader.trailer[Root][AcroForm][XFA]
82
-
83
- writer.append(reader)
84
- writer.set_need_appearances_writer()
85
-
86
- with BytesIO() as f:
87
- writer.write(f)
88
- f.seek(0)
89
- return f.read()
90
-
91
-
92
53
  @lru_cache
93
54
  def remove_all_widgets(pdf: bytes) -> bytes:
94
55
  """
@@ -144,6 +105,35 @@ def get_page_streams(pdf: bytes) -> List[bytes]:
144
105
  return result
145
106
 
146
107
 
108
def merge_pdfs(pdf_list: list[bytes]) -> bytes:
    """
    Merges a list of PDF byte streams into a single PDF byte stream.

    This function uses a pairwise merging strategy (similar to a merge sort's
    merge phase): in every pass, neighbouring PDFs are merged two at a time
    with `merge_two_pdfs`, and an unpaired trailing PDF is carried over to the
    next pass unchanged. Passes repeat until a single PDF remains. The order
    of the inputs is preserved, and the caller's list is not modified.

    Args:
        pdf_list (list[bytes]): A list of PDF files as byte streams to be merged.

    Returns:
        bytes: The merged PDF file as a single byte stream. A list holding one
            PDF returns that PDF as is, and an empty list returns empty bytes.
    """
    if not pdf_list:
        # Nothing to merge; avoid indexing into an empty list.
        return b""

    remaining = list(pdf_list)
    # Looping down to one element (instead of stopping at two and indexing
    # [0] and [1]) also covers the single-PDF case without an IndexError.
    while len(remaining) > 1:
        next_round = []
        for start in range(0, len(remaining), 2):
            pair = remaining[start : start + 2]
            if len(pair) == 2:
                next_round.append(merge_two_pdfs(pair[0], pair[1]))
            else:
                # Odd one out: carried over to the next pass.
                next_round.append(pair[0])
        remaining = next_round

    return remaining[0]
135
+
136
+
147
137
  def merge_two_pdfs(pdf: bytes, other: bytes) -> bytes:
148
138
  """
149
139
  Merges two PDF files into a single PDF file.