PyPDFForm 3.5.1__py3-none-any.whl → 4.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- PyPDFForm/__init__.py +5 -3
- PyPDFForm/adapter.py +33 -1
- PyPDFForm/ap.py +99 -0
- PyPDFForm/assets/__init__.py +0 -0
- PyPDFForm/assets/blank.py +100 -0
- PyPDFForm/constants.py +20 -2
- PyPDFForm/coordinate.py +7 -11
- PyPDFForm/deprecation.py +30 -0
- PyPDFForm/filler.py +17 -36
- PyPDFForm/font.py +16 -16
- PyPDFForm/hooks.py +169 -31
- PyPDFForm/image.py +0 -3
- PyPDFForm/middleware/__init__.py +35 -0
- PyPDFForm/middleware/base.py +24 -5
- PyPDFForm/middleware/checkbox.py +18 -1
- PyPDFForm/middleware/signature.py +0 -1
- PyPDFForm/patterns.py +71 -13
- PyPDFForm/raw/__init__.py +37 -0
- PyPDFForm/raw/circle.py +65 -0
- PyPDFForm/raw/ellipse.py +69 -0
- PyPDFForm/raw/image.py +79 -0
- PyPDFForm/raw/line.py +65 -0
- PyPDFForm/raw/rect.py +70 -0
- PyPDFForm/raw/text.py +73 -0
- PyPDFForm/template.py +114 -10
- PyPDFForm/types.py +49 -0
- PyPDFForm/utils.py +31 -41
- PyPDFForm/watermark.py +153 -44
- PyPDFForm/widgets/__init__.py +1 -0
- PyPDFForm/widgets/base.py +79 -59
- PyPDFForm/widgets/checkbox.py +30 -30
- PyPDFForm/widgets/dropdown.py +42 -40
- PyPDFForm/widgets/image.py +17 -16
- PyPDFForm/widgets/radio.py +27 -28
- PyPDFForm/widgets/signature.py +96 -60
- PyPDFForm/widgets/text.py +40 -40
- PyPDFForm/wrapper.py +256 -240
- {pypdfform-3.5.1.dist-info → pypdfform-4.2.0.dist-info}/METADATA +33 -26
- pypdfform-4.2.0.dist-info/RECORD +47 -0
- {pypdfform-3.5.1.dist-info → pypdfform-4.2.0.dist-info}/licenses/LICENSE +1 -1
- pypdfform-3.5.1.dist-info/RECORD +0 -35
- /PyPDFForm/{widgets → assets}/bedrock.py +0 -0
- {pypdfform-3.5.1.dist-info → pypdfform-4.2.0.dist-info}/WHEEL +0 -0
- {pypdfform-3.5.1.dist-info → pypdfform-4.2.0.dist-info}/top_level.txt +0 -0
PyPDFForm/raw/ellipse.py
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
"""
|
|
3
|
+
Contains the RawEllipse class, which represents an ellipse that can be drawn
|
|
4
|
+
directly onto a PDF page defined by its bounding box.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from ..constants import DEFAULT_FONT_COLOR
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class RawEllipse:
    """
    An ellipse that is painted straight onto one page of a PDF.

    The shape is described by two corners of its bounding box. Besides
    the geometry, the object carries the page number, the outline color
    and the fill color that are handed over to the drawing step.
    """

    def __init__(
        self,
        page_number: int,
        x1: float,
        y1: float,
        x2: float,
        y2: float,
        color: tuple = DEFAULT_FONT_COLOR,
        fill_color: tuple = None,
    ) -> None:
        """
        Stores the drawing parameters of the ellipse.

        Args:
            page_number: The 1-based index of the page to draw on.
            x1: The x-coordinate of the first bounding box corner.
            y1: The y-coordinate of the first bounding box corner.
            x2: The x-coordinate of the second bounding box corner.
            y2: The y-coordinate of the second bounding box corner.
            color: The outline color as an RGB tuple (0-1 for each channel).
            fill_color: The fill color as an RGB tuple (0-1 for each channel).
                Defaults to None.
        """
        super().__init__()

        self.page_number = page_number
        self.x1, self.y1 = x1, y1
        self.x2, self.y2 = x2, y2
        self.color, self.fill_color = color, fill_color

    @property
    def to_draw(self) -> dict:
        """
        Builds the dictionary describing this ellipse for the drawing step.

        Returns:
            A dictionary with the page number, the object type ("ellipse"),
            the bounding box coordinates, the outline color and the fill color.
        """
        payload = {"page_number": self.page_number, "type": "ellipse"}
        # Remaining entries mirror the instance attributes one to one.
        for field in ("x1", "y1", "x2", "y2", "color", "fill_color"):
            payload[field] = getattr(self, field)
        return payload
|
PyPDFForm/raw/image.py
ADDED
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
# pylint: disable=R0801
|
|
3
|
+
"""
|
|
4
|
+
Contains the RawImage class, which represents an image that can be drawn
|
|
5
|
+
directly onto a PDF page at a specified position and size.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from typing import BinaryIO, Union
|
|
9
|
+
|
|
10
|
+
from ..adapter import fp_or_f_obj_or_stream_to_stream
|
|
11
|
+
from ..image import rotate_image
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class RawImage:
    """
    An image that is painted straight onto one page of a PDF.

    The image source is kept exactly as supplied (path, bytes or stream).
    It is only converted to a stream, and rotated when a rotation is set,
    at the moment the drawing parameters are requested.
    """

    def __init__(
        self,
        image: Union[bytes, str, BinaryIO],
        page_number: int,
        x: float,
        y: float,
        width: float,
        height: float,
        rotation: float = 0,
    ) -> None:
        """
        Stores the drawing parameters of the image.

        Args:
            image: The image source: a path (str), raw bytes (bytes),
                or a file stream (BinaryIO).
            page_number: The 1-based index of the page to draw on.
            x: The x-coordinate of the bottom-left corner of the image.
            y: The y-coordinate of the bottom-left corner of the image.
            width: The width of the image when drawn on the PDF.
            height: The height of the image when drawn on the PDF.
            rotation: The rotation angle in degrees (defaults to 0, no rotation).
        """
        super().__init__()

        self.image = image
        self.page_number = page_number
        self.x, self.y = x, y
        self.width, self.height = width, height
        self.rotation = rotation

    @property
    def to_draw(self) -> dict:
        """
        Builds the dictionary describing this image for the drawing step.

        The source is converted to a stream on every access, and passed
        through rotate_image only when the rotation is non-zero.

        Returns:
            A dictionary with the page number, the object type ("image"),
            the image stream, the coordinates (x, y) and the dimensions
            (width, height).
        """
        stream = fp_or_f_obj_or_stream_to_stream(self.image)
        stream = rotate_image(stream, self.rotation) if self.rotation else stream

        payload = {
            "page_number": self.page_number,
            "type": "image",
            "stream": stream,
        }
        for field in ("x", "y", "width", "height"):
            payload[field] = getattr(self, field)
        return payload
|
PyPDFForm/raw/line.py
ADDED
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
"""
|
|
3
|
+
Contains the RawLine class, which represents a line that can be drawn
|
|
4
|
+
directly onto a PDF page at specified coordinates.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from ..constants import DEFAULT_FONT_COLOR
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class RawLine:
    """
    A straight line that is painted directly onto one page of a PDF.

    The line runs from a source point to a destination point and carries
    the page number and the color that are handed over to the drawing step.
    """

    def __init__(
        self,
        page_number: int,
        src_x: float,
        src_y: float,
        dest_x: float,
        dest_y: float,
        color: tuple = DEFAULT_FONT_COLOR,
    ) -> None:
        """
        Stores the drawing parameters of the line.

        Args:
            page_number: The 1-based index of the page to draw on.
            src_x: The x-coordinate of the starting point.
            src_y: The y-coordinate of the starting point.
            dest_x: The x-coordinate of the ending point.
            dest_y: The y-coordinate of the ending point.
            color: The line color as an RGB tuple (0-1 for each channel).
        """
        super().__init__()

        self.page_number = page_number
        self.src_x, self.src_y = src_x, src_y
        self.dest_x, self.dest_y = dest_x, dest_y
        self.color = color

    @property
    def to_draw(self) -> dict:
        """
        Builds the dictionary describing this line for the drawing step.

        Returns:
            A dictionary with the page number, the object type ("line"),
            the start and end coordinates, and the color.
        """
        payload = {"page_number": self.page_number, "type": "line"}
        # Remaining entries mirror the instance attributes one to one.
        for field in ("src_x", "src_y", "dest_x", "dest_y", "color"):
            payload[field] = getattr(self, field)
        return payload
|
PyPDFForm/raw/rect.py
ADDED
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
# pylint: disable=R0801
|
|
3
|
+
"""
|
|
4
|
+
Contains the RawRectangle class, which represents a rectangle that can be drawn
|
|
5
|
+
directly onto a PDF page at specified coordinates and dimensions.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from ..constants import DEFAULT_FONT_COLOR
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class RawRectangle:
    """
    A rectangle that is painted directly onto one page of a PDF.

    The rectangle is described by its bottom-left corner and its size,
    plus the outline color and fill color handed over to the drawing step.
    """

    def __init__(
        self,
        page_number: int,
        x: float,
        y: float,
        width: float,
        height: float,
        color: tuple = DEFAULT_FONT_COLOR,
        fill_color: tuple = None,
    ) -> None:
        """
        Stores the drawing parameters of the rectangle.

        Args:
            page_number: The 1-based index of the page to draw on.
            x: The x-coordinate of the bottom-left corner of the rectangle.
            y: The y-coordinate of the bottom-left corner of the rectangle.
            width: The width of the rectangle.
            height: The height of the rectangle.
            color: The outline color as an RGB tuple (0-1 for each channel).
            fill_color: The fill color as an RGB tuple (0-1 for each channel).
                Defaults to None.
        """
        super().__init__()

        self.page_number = page_number
        self.x, self.y = x, y
        self.width, self.height = width, height
        self.color, self.fill_color = color, fill_color

    @property
    def to_draw(self) -> dict:
        """
        Builds the dictionary describing this rectangle for the drawing step.

        Returns:
            A dictionary with the page number, the object type ("rect"),
            the coordinates, the dimensions, the outline color and the
            fill color.
        """
        payload = {"page_number": self.page_number, "type": "rect"}
        # Remaining entries mirror the instance attributes one to one.
        for field in ("x", "y", "width", "height", "color", "fill_color"):
            payload[field] = getattr(self, field)
        return payload
|
PyPDFForm/raw/text.py
ADDED
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
"""
|
|
3
|
+
Contains the RawText class, which represents a text annotation
|
|
4
|
+
that can be drawn directly onto a PDF page without relying on existing form fields.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from ..constants import DEFAULT_FONT, DEFAULT_FONT_COLOR, DEFAULT_FONT_SIZE
|
|
8
|
+
from ..middleware.text import Text
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class RawText:
    """
    Represents a text object intended for direct drawing onto a specific page
    of a PDF document at specified coordinates.

    This class encapsulates all necessary information (text content, position,
    font, size, and color) to render text on a PDF page outside of traditional
    form fields.
    """

    def __init__(
        self,
        text: str,
        page_number: int,
        x: float,
        y: float,
        font: str = DEFAULT_FONT,
        font_size: float = DEFAULT_FONT_SIZE,
        font_color: tuple = DEFAULT_FONT_COLOR,
    ) -> None:
        """
        Initializes a raw text object for drawing.

        Args:
            text: The string content of the text to be drawn.
            page_number: The 1-based index of the page where the text should be drawn.
            x: The x-coordinate (horizontal position) of the text.
            y: The y-coordinate (vertical position) of the text.
            font: The name of the font to use for the text (defaults to DEFAULT_FONT).
            font_size: The size of the font (defaults to DEFAULT_FONT_SIZE).
            font_color: The color of the text as an RGB tuple (0-1 for each channel;
                defaults to DEFAULT_FONT_COLOR).
        """
        super().__init__()

        self.text = text
        self.page_number = page_number
        self.x = x
        self.y = y
        self.font = font
        self.font_size = font_size
        self.font_color = font_color

    @property
    def to_draw(self) -> dict:
        """
        Converts the raw text object to a dict ready for drawing.

        A fresh Text middleware object is created on every access and
        populated with the stored text, font, font size and font color.

        Returns:
            A dictionary containing the page number, the object type ("text"),
            the initialized Text widget, and the coordinates for drawing.
        """
        # "new" is only a placeholder name: the widget is not tied to any
        # existing form field, it merely carries the text and its styling.
        widget = Text("new", self.text)
        widget.font = self.font
        widget.font_size = self.font_size
        widget.font_color = self.font_color

        return {
            "page_number": self.page_number,
            "type": "text",
            "widget": widget,
            "x": self.x,
            "y": self.y,
        }
|
PyPDFForm/template.py
CHANGED
|
@@ -7,27 +7,24 @@ in PDF form templates. It leverages the pypdf library for PDF manipulation
|
|
|
7
7
|
and defines specific patterns for identifying and constructing different
|
|
8
8
|
types of widgets.
|
|
9
9
|
"""
|
|
10
|
-
# TODO: In `build_widgets`, the `get_widgets_by_page` function is called, which then iterates through pages and annotations. For very large PDFs, this initial parsing and iteration can be a bottleneck. Consider optimizing the widget extraction process if possible, perhaps by using a more direct method to access annotations if `pypdf` allows.
|
|
11
|
-
# TODO: The `construct_widget` function iterates through `WIDGET_TYPE_PATTERNS` for each widget. If there are many patterns or many widgets, this repeated iteration could be optimized by pre-compiling patterns or using a more efficient lookup mechanism.
|
|
12
|
-
# TODO: In `get_widget_key`, the recursive call for `Parent` can lead to deep recursion for deeply nested widgets, potentially impacting performance or hitting recursion limits for extremely complex forms. Consider an iterative approach if deep nesting is common.
|
|
13
|
-
# TODO: In `update_widget_keys`, the nested loops iterating through `old_keys`, `out.pages`, and `page.get(Annots, [])` can be very inefficient for large numbers of keys, pages, or annotations. Consider creating a lookup structure for annotations by key to avoid repeated linear scans.
|
|
14
|
-
# TODO: In `update_widget_keys`, `PdfReader(stream_to_io(template))` and `out.append(pdf)` involve re-parsing and appending the PDF. For large PDFs, passing `PdfReader` and `PdfWriter` objects directly could reduce overhead.
|
|
15
10
|
|
|
16
11
|
from functools import lru_cache
|
|
17
12
|
from io import BytesIO
|
|
18
13
|
from typing import Dict, List, Tuple, Union, cast
|
|
19
14
|
|
|
20
15
|
from pypdf import PdfReader, PdfWriter
|
|
21
|
-
from pypdf.generic import DictionaryObject
|
|
16
|
+
from pypdf.generic import DictionaryObject, NameObject, TextStringObject
|
|
22
17
|
|
|
23
|
-
from .constants import WIDGET_TYPES, Annots,
|
|
18
|
+
from .constants import (JS, WIDGET_TYPES, Annots, JavaScript, MaxLen,
|
|
19
|
+
OpenAction, Parent, S, T, Title)
|
|
24
20
|
from .middleware.checkbox import Checkbox
|
|
25
21
|
from .middleware.dropdown import Dropdown
|
|
26
22
|
from .middleware.radio import Radio
|
|
27
23
|
from .middleware.text import Text
|
|
28
24
|
from .patterns import (DROPDOWN_CHOICE_PATTERNS, WIDGET_DESCRIPTION_PATTERNS,
|
|
29
25
|
WIDGET_KEY_PATTERNS, WIDGET_TYPE_PATTERNS,
|
|
30
|
-
get_checkbox_value, get_dropdown_value,
|
|
26
|
+
get_checkbox_value, get_dropdown_value, get_field_rect,
|
|
27
|
+
get_radio_value, get_text_field_multiline,
|
|
31
28
|
get_text_value, update_annotation_name)
|
|
32
29
|
from .utils import extract_widget_property, find_pattern_match, stream_to_io
|
|
33
30
|
|
|
@@ -61,14 +58,18 @@ def build_widgets(
|
|
|
61
58
|
key = get_widget_key(widget, use_full_widget_name)
|
|
62
59
|
_widget = construct_widget(widget, key)
|
|
63
60
|
if _widget is not None:
|
|
64
|
-
_widget.
|
|
61
|
+
_widget.__dict__["tooltip"] = extract_widget_property(
|
|
65
62
|
widget, WIDGET_DESCRIPTION_PATTERNS, None, str
|
|
66
63
|
)
|
|
67
64
|
|
|
65
|
+
field_rect = get_field_rect(widget)
|
|
66
|
+
_widget.x, _widget.y, _widget.width, _widget.height = field_rect
|
|
67
|
+
|
|
68
68
|
if isinstance(_widget, Text):
|
|
69
69
|
# mostly for schema for now
|
|
70
70
|
# doesn't trigger hook
|
|
71
71
|
_widget.__dict__["max_length"] = get_text_field_max_length(widget)
|
|
72
|
+
_widget.__dict__["multiline"] = get_text_field_multiline(widget)
|
|
72
73
|
get_text_value(widget, _widget)
|
|
73
74
|
|
|
74
75
|
if type(_widget) is Checkbox:
|
|
@@ -82,11 +83,20 @@ def build_widgets(
|
|
|
82
83
|
|
|
83
84
|
if isinstance(_widget, Radio):
|
|
84
85
|
if key not in results:
|
|
86
|
+
_widget.x = []
|
|
87
|
+
_widget.y = []
|
|
88
|
+
_widget.width = []
|
|
89
|
+
_widget.height = []
|
|
85
90
|
results[key] = _widget
|
|
86
91
|
|
|
87
92
|
# for schema
|
|
88
93
|
results[key].number_of_options += 1
|
|
89
94
|
|
|
95
|
+
results[key].x.append(field_rect[0])
|
|
96
|
+
results[key].y.append(field_rect[1])
|
|
97
|
+
results[key].width.append(field_rect[2])
|
|
98
|
+
results[key].height.append(field_rect[3])
|
|
99
|
+
|
|
90
100
|
if get_radio_value(widget):
|
|
91
101
|
results[key].value = results[key].number_of_options - 1
|
|
92
102
|
continue
|
|
@@ -223,13 +233,107 @@ def get_dropdown_choices(widget: dict) -> Union[Tuple[str, ...], None]:
|
|
|
223
233
|
Union[Tuple[str, ...], None]: A tuple of strings representing the choices in the dropdown, or None if the choices are not specified.
|
|
224
234
|
"""
|
|
225
235
|
return tuple(
|
|
226
|
-
(
|
|
236
|
+
(
|
|
237
|
+
each.get_object()
|
|
238
|
+
if isinstance(each.get_object(), str)
|
|
239
|
+
else str(each.get_object()[1])
|
|
240
|
+
)
|
|
227
241
|
for each in extract_widget_property(
|
|
228
242
|
widget, DROPDOWN_CHOICE_PATTERNS, None, None
|
|
229
243
|
)
|
|
230
244
|
)
|
|
231
245
|
|
|
232
246
|
|
|
247
|
+
def get_on_open_javascript(pdf: bytes) -> Union[str, None]:
    """
    Retrieves the JavaScript that runs when the PDF is opened.

    Args:
        pdf (bytes): The PDF file content as a bytes stream.

    Returns:
        Union[str, None]: The JavaScript that runs when the PDF is opened, or
        None if the document has no open action or its open action carries
        no JavaScript entry.
    """
    reader = PdfReader(stream_to_io(pdf))
    try:
        return reader.root_object[OpenAction][JS]
    except (KeyError, TypeError):
        # KeyError: the catalog has no open action, or the action dictionary
        # has no JavaScript entry.
        # TypeError: the open action is a destination array rather than an
        # action dictionary, so it cannot be indexed by name.
        return None
|
|
262
|
+
|
|
263
|
+
|
|
264
|
+
def set_on_open_javascript(pdf: bytes, script: str) -> bytes:
    """
    Installs a JavaScript open action on the PDF.

    Args:
        pdf (bytes): The PDF file content as a bytes stream.
        script (str): The JavaScript to run when the PDF is opened.

    Returns:
        bytes: The rewritten PDF content. When `script` is empty the input
        is returned untouched.
    """
    # Nothing to install: hand the input back without parsing it.
    if not script:
        return pdf

    source = PdfReader(stream_to_io(pdf))
    output = PdfWriter()
    output.append(source)

    # Action dictionary: action type plus the script to execute.
    js_action = DictionaryObject()
    js_action[NameObject(S)] = NameObject(JavaScript)
    js_action[NameObject(JS)] = TextStringObject(script)

    catalog = output._root_object  # type: ignore # noqa: SLF001 # # pylint: disable=W0212
    catalog.update({NameObject(OpenAction): js_action})

    with BytesIO() as buffer:
        output.write(buffer)
        return buffer.getvalue()
|
|
292
|
+
|
|
293
|
+
|
|
294
|
+
def get_pdf_title(pdf: bytes) -> Union[str, None]:
    """
    Reads the title stored in the metadata of a PDF.

    Args:
        pdf (bytes): The PDF file content as a bytes stream.

    Returns:
        Union[str, None]: The title of the PDF, or None when the document
        has no metadata or the metadata has no title entry.
    """
    info = PdfReader(stream_to_io(pdf)).metadata
    # Missing or empty metadata means there is no title to report.
    if not info:
        return None
    return info.get(Title)
|
|
306
|
+
|
|
307
|
+
|
|
308
|
+
def set_pdf_title(pdf: bytes, title: str) -> bytes:
    """
    Writes a title into the metadata of a PDF.

    Args:
        pdf (bytes): The PDF file content as a bytes stream.
        title (str): The new title for the PDF.

    Returns:
        bytes: The rewritten PDF content. When `title` is empty the input
        is returned untouched.
    """
    # Nothing to set: hand the input back without parsing it.
    if not title:
        return pdf

    source = PdfReader(stream_to_io(pdf))
    output = PdfWriter()
    output.append(source)

    # Start from the metadata already present (if any) and add the title.
    info = source.metadata or {}
    info[NameObject(Title)] = TextStringObject(title)
    output.add_metadata(info)

    with BytesIO() as buffer:
        output.write(buffer)
        return buffer.getvalue()
|
|
335
|
+
|
|
336
|
+
|
|
233
337
|
def update_widget_keys(
|
|
234
338
|
template: bytes,
|
|
235
339
|
widgets: Dict[str, WIDGET_TYPES],
|
PyPDFForm/types.py
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
"""
|
|
3
|
+
A module for custom type definitions used throughout the PyPDFForm library.
|
|
4
|
+
|
|
5
|
+
This includes specialized container types like PdfWrapperList, which extends
|
|
6
|
+
the standard list to provide custom behavior for slicing operations, particularly
|
|
7
|
+
for merging PdfWrapper objects.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from typing import Any, Union
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class PdfWrapperList(list):
    """
    A specialized list subclass designed to hold PdfWrapper objects.

    When sliced, this list merges the selected elements left to right with
    the `+=` operator and returns the single merged object. If the slice is
    empty, it returns an empty list. For non-slice indexing, it behaves like
    a standard list.
    """

    def __getitem__(self, key: Any) -> Union[list, Any]:
        """
        Retrieves an item or a merged slice of items from the list.

        Args:
            key (Union[int, slice]): The index or slice to retrieve.

        Returns:
            Union[PdfWrapper, list, Any]: The item at the index if indexed;
            for a slice, the selected items merged into one object, or an
            empty list when the slice selects nothing.
        """
        if not isinstance(key, slice):
            return super().__getitem__(key)

        wrappers = super().__getitem__(key)
        if not wrappers:
            # Empty selection: return the (empty) list as documented.
            return wrappers

        # Seed with the first element explicitly rather than using the
        # accumulator's truthiness as a "not set yet" marker; an element
        # that evaluates as false must still take part in the merge.
        merged = wrappers[0]
        for each in wrappers[1:]:
            merged += each

        return merged
|
PyPDFForm/utils.py
CHANGED
|
@@ -10,16 +10,8 @@ It includes functions for:
|
|
|
10
10
|
- Finding and traversing patterns within PDF widgets.
|
|
11
11
|
- Extracting widget properties based on defined patterns.
|
|
12
12
|
- Generating unique suffixes for internal use.
|
|
13
|
-
-
|
|
13
|
+
- Setting the `NeedAppearances` flag in the PDF to ensure proper rendering of form fields.
|
|
14
14
|
"""
|
|
15
|
-
# TODO: In `enable_adobe_mode`, `PdfReader(stream_to_io(pdf))` and `writer.append(reader)` involve re-parsing and appending the PDF. For large PDFs, passing `PdfReader` and `PdfWriter` objects directly could reduce overhead.
|
|
16
|
-
# TODO: In `remove_all_widgets`, `PdfReader(stream_to_io(pdf))` and iterating through pages to add them to a new writer can be inefficient for large PDFs. Consider if `pypdf` offers a more direct way to remove annotations without reconstructing the entire PDF.
|
|
17
|
-
# TODO: In `get_page_streams`, `PdfReader(stream_to_io(pdf))` and then creating a new `PdfWriter` for each page can be very inefficient. It would be more performant to iterate through the pages of a single `PdfReader` and extract their content streams directly if possible, or to use a single `PdfWriter` to extract multiple pages.
|
|
18
|
-
# TODO: In `merge_two_pdfs`, the function reads and writes PDFs multiple times (`PdfReader`, `PdfWriter`, `remove_all_widgets`, then another `PdfReader` and `PdfWriter`). This is highly inefficient. The PDF objects should be passed around and modified in-place as much as possible, with a single final write operation.
|
|
19
|
-
# TODO: The `merge_two_pdfs` function has a `TODO: refactor duplicate logic with copy_watermark_widgets` comment. This indicates a potential for code duplication and inefficiency. Refactoring this to a shared helper function would improve maintainability and potentially performance.
|
|
20
|
-
# TODO: In `find_pattern_match` and `traverse_pattern`, the recursive nature and repeated dictionary lookups (`widget.items()`, `value.get_object()`) can be slow for deeply nested or complex widget structures. Consider optimizing these traversals, perhaps by pre-flattening the widget dictionary or using a more direct access method if `pypdf` allows.
|
|
21
|
-
# TODO: In `extract_widget_property`, the loop iterates through `patterns` and calls `traverse_pattern` for each. If `patterns` is long or `traverse_pattern` is expensive, this could be a bottleneck. Consider optimizing the pattern matching or lookup.
|
|
22
|
-
# TODO: `generate_unique_suffix` uses `choice` in a loop. While generally fast, for extremely high call volumes, pre-generating a pool of characters or using a faster random string generation method might offer minor improvements.
|
|
23
15
|
|
|
24
16
|
from collections.abc import Callable
|
|
25
17
|
from functools import lru_cache
|
|
@@ -31,7 +23,7 @@ from typing import Any, BinaryIO, List, Union
|
|
|
31
23
|
from pypdf import PdfReader, PdfWriter
|
|
32
24
|
from pypdf.generic import ArrayObject, DictionaryObject, NameObject
|
|
33
25
|
|
|
34
|
-
from .constants import SLASH, UNIQUE_SUFFIX_LENGTH,
|
|
26
|
+
from .constants import SLASH, UNIQUE_SUFFIX_LENGTH, Annots
|
|
35
27
|
|
|
36
28
|
|
|
37
29
|
@lru_cache
|
|
@@ -58,37 +50,6 @@ def stream_to_io(stream: bytes) -> BinaryIO:
|
|
|
58
50
|
return result
|
|
59
51
|
|
|
60
52
|
|
|
61
|
-
@lru_cache
|
|
62
|
-
def enable_adobe_mode(pdf: bytes) -> bytes:
|
|
63
|
-
"""Enables Adobe-specific settings in the PDF to ensure proper rendering of form fields.
|
|
64
|
-
|
|
65
|
-
This function modifies the PDF's AcroForm dictionary to include the `NeedAppearances` flag,
|
|
66
|
-
which forces Adobe Reader to generate appearance streams for form fields. It also handles
|
|
67
|
-
XFA (XML Forms Architecture) forms by removing the XFA entry from the AcroForm dictionary
|
|
68
|
-
if it exists, ensuring compatibility and proper rendering. This ensures that the form fields
|
|
69
|
-
are rendered correctly in Adobe Reader, especially when the form is filled programmatically.
|
|
70
|
-
|
|
71
|
-
Args:
|
|
72
|
-
pdf (bytes): The PDF content as bytes.
|
|
73
|
-
|
|
74
|
-
Returns:
|
|
75
|
-
bytes: The modified PDF content with Adobe mode enabled.
|
|
76
|
-
"""
|
|
77
|
-
reader = PdfReader(stream_to_io(pdf))
|
|
78
|
-
writer = PdfWriter()
|
|
79
|
-
|
|
80
|
-
if AcroForm in reader.trailer[Root] and XFA in reader.trailer[Root][AcroForm]:
|
|
81
|
-
del reader.trailer[Root][AcroForm][XFA]
|
|
82
|
-
|
|
83
|
-
writer.append(reader)
|
|
84
|
-
writer.set_need_appearances_writer()
|
|
85
|
-
|
|
86
|
-
with BytesIO() as f:
|
|
87
|
-
writer.write(f)
|
|
88
|
-
f.seek(0)
|
|
89
|
-
return f.read()
|
|
90
|
-
|
|
91
|
-
|
|
92
53
|
@lru_cache
|
|
93
54
|
def remove_all_widgets(pdf: bytes) -> bytes:
|
|
94
55
|
"""
|
|
@@ -144,6 +105,35 @@ def get_page_streams(pdf: bytes) -> List[bytes]:
|
|
|
144
105
|
return result
|
|
145
106
|
|
|
146
107
|
|
|
108
|
+
def merge_pdfs(pdf_list: list[bytes]) -> bytes:
    """
    Merges a list of PDF byte streams into a single PDF byte stream.

    Neighbouring PDFs are merged in pairs, round after round, until one
    stream is left. With an odd count the last stream is carried over to
    the next round unchanged. The original order of the inputs is preserved
    and the caller's list is not modified.

    Args:
        pdf_list (list[bytes]): A list of PDF files as byte streams to be merged.

    Returns:
        bytes: The merged PDF file as a single byte stream. A list with a
        single PDF yields that PDF unchanged.

    Raises:
        IndexError: If `pdf_list` is empty.
    """
    if not pdf_list:
        raise IndexError("merge_pdfs() requires at least one PDF stream")

    pending = list(pdf_list)
    while len(pending) > 1:
        merged = [
            merge_two_pdfs(pending[i], pending[i + 1])
            for i in range(0, len(pending) - 1, 2)
        ]
        if len(pending) % 2:
            # Odd one out: no partner this round, carry it over as is.
            merged.append(pending[-1])
        pending = merged

    return pending[0]
|
|
135
|
+
|
|
136
|
+
|
|
147
137
|
def merge_two_pdfs(pdf: bytes, other: bytes) -> bytes:
|
|
148
138
|
"""
|
|
149
139
|
Merges two PDF files into a single PDF file.
|