pdf-form-tools 2.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pdf_form_tools-2.0.0/LICENSE +21 -0
- pdf_form_tools-2.0.0/PKG-INFO +79 -0
- pdf_form_tools-2.0.0/README.md +44 -0
- pdf_form_tools-2.0.0/pyproject.toml +55 -0
- pdf_form_tools-2.0.0/setup.cfg +4 -0
- pdf_form_tools-2.0.0/src/pdf_form_tools/__init__.py +27 -0
- pdf_form_tools-2.0.0/src/pdf_form_tools/pdf_form_overlay.py +357 -0
- pdf_form_tools-2.0.0/src/pdf_form_tools.egg-info/PKG-INFO +79 -0
- pdf_form_tools-2.0.0/src/pdf_form_tools.egg-info/SOURCES.txt +11 -0
- pdf_form_tools-2.0.0/src/pdf_form_tools.egg-info/dependency_links.txt +1 -0
- pdf_form_tools-2.0.0/src/pdf_form_tools.egg-info/requires.txt +13 -0
- pdf_form_tools-2.0.0/src/pdf_form_tools.egg-info/top_level.txt +1 -0
- pdf_form_tools-2.0.0/tests/test_pdf_form_overlay.py +24 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 רומן אוסטרובסקי
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: pdf-form-tools
|
|
3
|
+
Version: 2.0.0
|
|
4
|
+
Summary: Template-aware tools for filling scanned PDF forms with visual verification
|
|
5
|
+
License-Expression: MIT
|
|
6
|
+
Project-URL: Homepage, https://github.com/ceratops-code/pdf-form-tools
|
|
7
|
+
Project-URL: Repository, https://github.com/ceratops-code/pdf-form-tools
|
|
8
|
+
Project-URL: Issues, https://github.com/ceratops-code/pdf-form-tools/issues
|
|
9
|
+
Keywords: pdf,forms,opencv,pymupdf,document-processing
|
|
10
|
+
Classifier: Development Status :: 4 - Beta
|
|
11
|
+
Classifier: Intended Audience :: Developers
|
|
12
|
+
Classifier: Operating System :: OS Independent
|
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
17
|
+
Classifier: Topic :: Multimedia :: Graphics
|
|
18
|
+
Classifier: Topic :: Office/Business :: Office Suites
|
|
19
|
+
Requires-Python: >=3.11
|
|
20
|
+
Description-Content-Type: text/markdown
|
|
21
|
+
License-File: LICENSE
|
|
22
|
+
Requires-Dist: numpy
|
|
23
|
+
Requires-Dist: opencv-python-headless
|
|
24
|
+
Requires-Dist: pymupdf
|
|
25
|
+
Requires-Dist: pillow
|
|
26
|
+
Requires-Dist: pypdf
|
|
27
|
+
Requires-Dist: reportlab
|
|
28
|
+
Requires-Dist: python-bidi
|
|
29
|
+
Provides-Extra: dev
|
|
30
|
+
Requires-Dist: build>=1.2; extra == "dev"
|
|
31
|
+
Requires-Dist: pytest>=8.0; extra == "dev"
|
|
32
|
+
Requires-Dist: ruff>=0.6; extra == "dev"
|
|
33
|
+
Requires-Dist: twine>=5.1; extra == "dev"
|
|
34
|
+
Dynamic: license-file
|
|
35
|
+
|
|
36
|
+
# pdf-form-tools
|
|
37
|
+
|
|
38
|
+
`pdf-form-tools` is an import-only Python package for filling layout-sensitive scanned PDF forms with deterministic placement helpers and visual verification primitives.
|
|
39
|
+
|
|
40
|
+
It is intentionally small:
|
|
41
|
+
|
|
42
|
+
- render PDF pages to raster images
|
|
43
|
+
- detect writable regions, checkbox boxes, signature lines, and ID slots
|
|
44
|
+
- draw text, checks, and signatures onto an overlay
|
|
45
|
+
- merge the overlay back into the original PDF
|
|
46
|
+
|
|
47
|
+
## Install
|
|
48
|
+
|
|
49
|
+
```bash
|
|
50
|
+
python -m pip install pdf-form-tools
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
## Example
|
|
54
|
+
|
|
55
|
+
```python
|
|
56
|
+
from pathlib import Path
|
|
57
|
+
|
|
58
|
+
from pdf_form_tools import Rect, merge_overlay_pdf, render_pdf_page
|
|
59
|
+
|
|
60
|
+
source_pdf = Path("form.pdf")
|
|
61
|
+
preview_png = Path("preview-page1.png")
|
|
62
|
+
render_pdf_page(source_pdf, 0, 2, preview_png)
|
|
63
|
+
|
|
64
|
+
# draw your overlay separately, then merge it back
|
|
65
|
+
merge_overlay_pdf(source_pdf, Path("overlay-page1.png"), Path("form-filled.pdf"))
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
## Development
|
|
69
|
+
|
|
70
|
+
```bash
|
|
71
|
+
python -m pip install -e ".[dev]"
|
|
72
|
+
python -m ruff check .
|
|
73
|
+
python -m pytest
|
|
74
|
+
python -m build
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
## Scope
|
|
78
|
+
|
|
79
|
+
This package contains reusable low-level helpers only. Form-specific filling flows belong in project-local scripts or thin runners, not in the shared library.
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
# pdf-form-tools
|
|
2
|
+
|
|
3
|
+
`pdf-form-tools` is an import-only Python package for filling layout-sensitive scanned PDF forms with deterministic placement helpers and visual verification primitives.
|
|
4
|
+
|
|
5
|
+
It is intentionally small:
|
|
6
|
+
|
|
7
|
+
- render PDF pages to raster images
|
|
8
|
+
- detect writable regions, checkbox boxes, signature lines, and ID slots
|
|
9
|
+
- draw text, checks, and signatures onto an overlay
|
|
10
|
+
- merge the overlay back into the original PDF
|
|
11
|
+
|
|
12
|
+
## Install
|
|
13
|
+
|
|
14
|
+
```bash
|
|
15
|
+
python -m pip install pdf-form-tools
|
|
16
|
+
```
|
|
17
|
+
|
|
18
|
+
## Example
|
|
19
|
+
|
|
20
|
+
```python
|
|
21
|
+
from pathlib import Path
|
|
22
|
+
|
|
23
|
+
from pdf_form_tools import Rect, merge_overlay_pdf, render_pdf_page
|
|
24
|
+
|
|
25
|
+
source_pdf = Path("form.pdf")
|
|
26
|
+
preview_png = Path("preview-page1.png")
|
|
27
|
+
render_pdf_page(source_pdf, 0, 2, preview_png)
|
|
28
|
+
|
|
29
|
+
# draw your overlay separately, then merge it back
|
|
30
|
+
merge_overlay_pdf(source_pdf, Path("overlay-page1.png"), Path("form-filled.pdf"))
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
## Development
|
|
34
|
+
|
|
35
|
+
```bash
|
|
36
|
+
python -m pip install -e ".[dev]"
|
|
37
|
+
python -m ruff check .
|
|
38
|
+
python -m pytest
|
|
39
|
+
python -m build
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
## Scope
|
|
43
|
+
|
|
44
|
+
This package contains reusable low-level helpers only. Form-specific filling flows belong in project-local scripts or thin runners, not in the shared library.
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=68", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "pdf-form-tools"
|
|
7
|
+
version = "2.0.0"
|
|
8
|
+
description = "Template-aware tools for filling scanned PDF forms with visual verification"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.11"
|
|
11
|
+
license = "MIT"
|
|
12
|
+
license-files = ["LICENSE"]
|
|
13
|
+
keywords = ["pdf", "forms", "opencv", "pymupdf", "document-processing"]
|
|
14
|
+
classifiers = [
|
|
15
|
+
"Development Status :: 4 - Beta",
|
|
16
|
+
"Intended Audience :: Developers",
|
|
17
|
+
"Operating System :: OS Independent",
|
|
18
|
+
"Programming Language :: Python :: 3",
|
|
19
|
+
"Programming Language :: Python :: 3.11",
|
|
20
|
+
"Programming Language :: Python :: 3.12",
|
|
21
|
+
"Programming Language :: Python :: 3.13",
|
|
22
|
+
"Topic :: Multimedia :: Graphics",
|
|
23
|
+
"Topic :: Office/Business :: Office Suites",
|
|
24
|
+
]
|
|
25
|
+
urls = { Homepage = "https://github.com/ceratops-code/pdf-form-tools", Repository = "https://github.com/ceratops-code/pdf-form-tools", Issues = "https://github.com/ceratops-code/pdf-form-tools/issues" }
|
|
26
|
+
dependencies = [
|
|
27
|
+
"numpy",
|
|
28
|
+
"opencv-python-headless",
|
|
29
|
+
"pymupdf",
|
|
30
|
+
"pillow",
|
|
31
|
+
"pypdf",
|
|
32
|
+
"reportlab",
|
|
33
|
+
"python-bidi",
|
|
34
|
+
]
|
|
35
|
+
|
|
36
|
+
[project.optional-dependencies]
|
|
37
|
+
dev = [
|
|
38
|
+
"build>=1.2",
|
|
39
|
+
"pytest>=8.0",
|
|
40
|
+
"ruff>=0.6",
|
|
41
|
+
"twine>=5.1",
|
|
42
|
+
]
|
|
43
|
+
|
|
44
|
+
[tool.setuptools.package-dir]
|
|
45
|
+
"" = "src"
|
|
46
|
+
|
|
47
|
+
[tool.setuptools.packages.find]
|
|
48
|
+
where = ["src"]
|
|
49
|
+
include = ["pdf_form_tools*"]
|
|
50
|
+
|
|
51
|
+
[tool.pytest.ini_options]
|
|
52
|
+
testpaths = ["tests"]
|
|
53
|
+
|
|
54
|
+
[tool.ruff]
|
|
55
|
+
target-version = "py311"
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
from .pdf_form_overlay import (
|
|
2
|
+
Rect,
|
|
3
|
+
centered_address_box,
|
|
4
|
+
detect_lines,
|
|
5
|
+
detect_square_boxes,
|
|
6
|
+
draw_check,
|
|
7
|
+
draw_id_number,
|
|
8
|
+
draw_text,
|
|
9
|
+
merge_overlay_pdf,
|
|
10
|
+
paste_signature,
|
|
11
|
+
render_pdf_page,
|
|
12
|
+
writable_box,
|
|
13
|
+
)
|
|
14
|
+
|
|
15
|
+
__all__ = [
|
|
16
|
+
"Rect",
|
|
17
|
+
"centered_address_box",
|
|
18
|
+
"detect_lines",
|
|
19
|
+
"detect_square_boxes",
|
|
20
|
+
"draw_check",
|
|
21
|
+
"draw_id_number",
|
|
22
|
+
"draw_text",
|
|
23
|
+
"merge_overlay_pdf",
|
|
24
|
+
"paste_signature",
|
|
25
|
+
"render_pdf_page",
|
|
26
|
+
"writable_box",
|
|
27
|
+
]
|
|
@@ -0,0 +1,357 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
from functools import lru_cache
|
|
5
|
+
from io import BytesIO
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
import cv2
|
|
9
|
+
import fitz
|
|
10
|
+
import numpy as np
|
|
11
|
+
from bidi.algorithm import get_display
|
|
12
|
+
from PIL import Image, ImageDraw, ImageFont
|
|
13
|
+
from pypdf import PdfReader, PdfWriter
|
|
14
|
+
from reportlab.lib.utils import ImageReader
|
|
15
|
+
from reportlab.pdfgen import canvas
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
# Default RGBA ink colour used by the drawing helpers (near-black, fully opaque).
TEXT_COLOR = (20, 20, 20, 255)

# Font files probed in order by resolve_font_path(); the key is the "bold" flag.
# Each list covers Windows, two common Linux locations, and two macOS locations.
FONT_CANDIDATES = {
    # Regular weight.
    False: [
        Path(r"C:\Windows\Fonts\arial.ttf"),
        Path("/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf"),
        Path("/usr/share/fonts/truetype/liberation2/LiberationSans-Regular.ttf"),
        Path("/Library/Fonts/Arial.ttf"),
        Path("/System/Library/Fonts/Supplemental/Arial.ttf"),
    ],
    # Bold weight.
    True: [
        Path(r"C:\Windows\Fonts\arialbd.ttf"),
        Path("/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf"),
        Path("/usr/share/fonts/truetype/liberation2/LiberationSans-Bold.ttf"),
        Path("/Library/Fonts/Arial Bold.ttf"),
        Path("/System/Library/Fonts/Supplemental/Arial Bold.ttf"),
    ],
}
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
@dataclass(frozen=True)
|
|
38
|
+
class Rect:
|
|
39
|
+
x: int
|
|
40
|
+
y: int
|
|
41
|
+
w: int
|
|
42
|
+
h: int
|
|
43
|
+
|
|
44
|
+
@property
|
|
45
|
+
def x2(self) -> int:
|
|
46
|
+
return self.x + self.w
|
|
47
|
+
|
|
48
|
+
@property
|
|
49
|
+
def y2(self) -> int:
|
|
50
|
+
return self.y + self.h
|
|
51
|
+
|
|
52
|
+
def inset(self, dx: int, dy: int | None = None) -> "Rect":
|
|
53
|
+
if dy is None:
|
|
54
|
+
dy = dx
|
|
55
|
+
return Rect(self.x + dx, self.y + dy, self.w - dx * 2, self.h - dy * 2)
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def contains_hebrew(text: str) -> bool:
    """Return True when *text* contains at least one Hebrew code point.

    Two Unicode ranges are checked:

    * U+0590-U+05FF, the main Hebrew block;
    * U+FB1D-U+FB4F, the Hebrew part of Alphabetic Presentation Forms
      (precomposed letters and ligatures).

    The second range is included so that text using precomposed Hebrew forms
    is also recognised as right-to-left by callers.
    """
    return any(
        "\u0590" <= ch <= "\u05FF" or "\uFB1D" <= ch <= "\uFB4F"
        for ch in text
    )
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def visual_text(text: str) -> str:
    """Return *text* in visual (display) order.

    Strings without Hebrew characters are returned untouched; anything else is
    passed through ``bidi.algorithm.get_display``.
    """
    if not contains_hebrew(text):
        return text
    return get_display(text)
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
@lru_cache(maxsize=2)
def resolve_font_path(bold: bool = False) -> Path:
    """Return the first entry of ``FONT_CANDIDATES[bold]`` that exists on disk.

    Successful lookups are memoised by ``lru_cache``; a failed lookup raises and
    is therefore retried on the next call.

    Raises:
        FileNotFoundError: if none of the candidate font files exists.
    """
    existing = next((path for path in FONT_CANDIDATES[bold] if path.exists()), None)
    if existing is None:
        raise FileNotFoundError(f"Could not find a usable {'bold' if bold else 'regular'} TrueType font.")
    return existing
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def load_font(size: int, bold: bool = False) -> ImageFont.FreeTypeFont:
    """Load the resolved system TrueType font at the requested size.

    Args:
        size: Font size passed to ``ImageFont.truetype``.
        bold: Select the bold candidate list instead of the regular one.

    Returns:
        The ``FreeTypeFont`` produced by ``ImageFont.truetype`` (the annotation
        matches the type ``fit_font`` declares for the same object).

    Raises:
        FileNotFoundError: propagated from ``resolve_font_path`` when no
            candidate font file exists.
    """
    font_path = resolve_font_path(bold=bold)
    return ImageFont.truetype(str(font_path), size)
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def close_small_gaps(mask: np.ndarray, max_gap: int = 4) -> np.ndarray:
    """Return a copy of a 1-D boolean mask with short False runs set to True.

    A run of falsy entries is filled when its length is at most ``max_gap``.
    This applies to runs between two truthy entries and also to a run touching
    the start or the end of the mask. The input mask is not modified.
    """
    filled = mask.copy()
    gap_begin = None
    for position, is_set in enumerate(mask):
        if is_set:
            if gap_begin is not None:
                # A gap just ended; fill it only when it is short enough.
                if position - gap_begin <= max_gap:
                    filled[gap_begin:position] = True
                gap_begin = None
        elif gap_begin is None:
            gap_begin = position

    # A gap still open at the end of the mask is measured up to the mask length.
    if gap_begin is not None and len(mask) - gap_begin <= max_gap:
        filled[gap_begin:] = True
    return filled
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def longest_true_segment(mask: np.ndarray, min_len: int) -> tuple[int, int] | None:
|
|
95
|
+
best = None
|
|
96
|
+
start = None
|
|
97
|
+
for idx, value in enumerate(mask):
|
|
98
|
+
if value and start is None:
|
|
99
|
+
start = idx
|
|
100
|
+
elif not value and start is not None:
|
|
101
|
+
if idx - start >= min_len and (best is None or idx - start > best[1] - best[0]):
|
|
102
|
+
best = (start, idx)
|
|
103
|
+
start = None
|
|
104
|
+
if start is not None and len(mask) - start >= min_len:
|
|
105
|
+
candidate = (start, len(mask))
|
|
106
|
+
if best is None or candidate[1] - candidate[0] > best[1] - best[0]:
|
|
107
|
+
best = candidate
|
|
108
|
+
return best
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
def writable_box(page_gray: np.ndarray, rect: Rect, row_threshold: float = 0.015, col_threshold: float = 0.03) -> Rect:
    """Locate the largest mostly-blank area inside ``rect``.

    The search area is ``rect`` inset by 8 px. Pixels darker than 185 count as
    ink. Rows whose ink fraction is below ``row_threshold`` are candidates; the
    longest such run becomes the vertical extent. Within that band the same is
    done for columns using ``col_threshold``. The result is inset by a further
    4 px before being returned.

    Fallbacks when no sufficiently long blank run is found:
    rows ``(height // 3, height - 12)`` and columns ``(10, width - 10)``,
    both relative to the search area.
    """
    search = rect.inset(8)
    patch = page_gray[search.y:search.y2, search.x:search.x2]
    patch_h = patch.shape[0]
    patch_w = patch.shape[1]

    # --- vertical extent -------------------------------------------------
    blank_rows = (patch < 185).mean(axis=1) < row_threshold
    blank_rows[:4] = False
    blank_rows[-4:] = False
    # NOTE(review): close_small_gaps also fills short runs that touch the mask
    # edges, so the 4-row margins forced off above can be switched back on when
    # the neighbouring row is blank. Confirm whether that is intended.
    blank_rows = close_small_gaps(blank_rows, max_gap=5)
    rows = longest_true_segment(blank_rows, min_len=max(18, patch_h // 6))
    if rows is None:
        rows = (patch_h // 3, patch_h - 12)
    top, bottom = rows

    # --- horizontal extent, measured only inside the chosen row band ------
    strip = patch[top:bottom, :]
    blank_cols = (strip < 185).mean(axis=0) < col_threshold
    blank_cols[:6] = False
    blank_cols[-6:] = False
    blank_cols = close_small_gaps(blank_cols, max_gap=8)
    cols = longest_true_segment(blank_cols, min_len=max(40, patch_w // 6))
    if cols is None:
        cols = (10, patch_w - 10)
    left, right = cols

    found = Rect(search.x + left, search.y + top, right - left, bottom - top)
    return found.inset(4)
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
def fit_font(
    draw: ImageDraw.ImageDraw,
    text: str,
    rect: Rect,
    max_size: int,
    min_size: int,
    bold: bool,
) -> tuple[ImageFont.FreeTypeFont, tuple[int, int, int, int]]:
    """Pick the largest font size whose rendered text fits inside ``rect``.

    Sizes are tried from ``max_size`` downwards in steps of 2, never below
    ``min_size``. The text is measured in visual order with ``draw.textbbox``
    anchored at ``(0, 0)``.

    Returns:
        ``(font, bbox)`` for the first size that fits. If no size fits, the
        font at ``min_size`` and its bbox are returned even though the text
        overflows the rectangle.
    """
    shaped = visual_text(text)

    def measure(point_size: int):
        candidate = load_font(point_size, bold=bold)
        return candidate, draw.textbbox((0, 0), shaped, font=candidate)

    for point_size in range(max_size, min_size - 1, -2):
        font, extent = measure(point_size)
        left, top, right, bottom = extent
        if right - left <= rect.w and bottom - top <= rect.h:
            return font, extent

    # Nothing fitted: fall back to the smallest allowed size unconditionally.
    return measure(min_size)
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
def draw_text(
    draw: ImageDraw.ImageDraw,
    text: str,
    rect: Rect,
    *,
    align: str,
    max_size: int,
    min_size: int,
    bold: bool = False,
    fill: tuple[int, int, int, int] = TEXT_COLOR,
) -> None:
    """Draw ``text`` inside ``rect`` with an auto-fitted font.

    Args:
        draw: Target drawing context.
        text: Logical-order text; it is converted to visual order before drawing.
        rect: Area the text is fitted into and vertically centred in.
        align: ``"left"`` or ``"right"``; any other value centres horizontally.
        max_size: Largest font size to try.
        min_size: Smallest font size to try (also the fallback size).
        bold: Use the bold font variant.
        fill: RGBA colour passed to ``draw.text``.
    """
    font, extent = fit_font(draw, text, rect, max_size=max_size, min_size=min_size, bold=bold)
    left, top, right, bottom = extent
    text_w = right - left
    text_h = bottom - top

    # The bbox offsets (left/top) are subtracted so the visible glyph box, not
    # the drawing origin, lands on the computed position.
    if align == "left":
        origin_x = rect.x - left
    elif align == "right":
        origin_x = rect.x2 - text_w - left
    else:
        origin_x = rect.x + (rect.w - text_w) / 2 - left
    origin_y = rect.y + (rect.h - text_h) / 2 - top

    draw.text((origin_x, origin_y), visual_text(text), font=font, fill=fill)
|
|
191
|
+
|
|
192
|
+
|
|
193
|
+
def centered_address_box(rect: Rect, *, top_pad: int, side_pad: int, height: int, right_pad: int | None = None) -> Rect:
    """Build a fixed-height box inside ``rect`` with the given paddings.

    The box starts ``side_pad`` from the left and ``top_pad`` from the top of
    ``rect``. Its width is what remains after removing ``side_pad`` on the left
    and ``right_pad`` on the right; ``right_pad`` defaults to ``side_pad``.
    """
    trailing_pad = side_pad if right_pad is None else right_pad
    left = rect.x + side_pad
    top = rect.y + top_pad
    width = rect.w - side_pad - trailing_pad
    return Rect(left, top, width, height)
|
|
197
|
+
|
|
198
|
+
|
|
199
|
+
def detect_square_boxes(
    page_gray: np.ndarray,
    region: Rect,
    *,
    min_side: int = 40,
    max_side: int = 60,
) -> list[Rect]:
    """Find roughly square contours (e.g. checkboxes) inside ``region``.

    The region is thresholded at 210 (inverted, so dark pixels become
    foreground) and every contour's bounding rectangle is tested.

    Args:
        page_gray: Grayscale page image indexed as ``[row, column]``.
        region: Area of the page to search, in page pixel coordinates.
        min_side: Smallest accepted bounding-box width and height, in pixels.
        max_side: Largest accepted bounding-box width and height, in pixels.
            The defaults reproduce the previously hard-coded 40-60 px window;
            pass other values for pages rendered at a different scale.

    Returns:
        Boxes in page coordinates, sorted by ``(y, x)``. A candidate whose
        top-left corner is within 5 px (both axes) of an already accepted box
        is treated as a duplicate and dropped.
    """
    crop = page_gray[region.y:region.y2, region.x:region.x2]
    _, thresh = cv2.threshold(crop, 210, 255, cv2.THRESH_BINARY_INV)
    contours, _ = cv2.findContours(thresh, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
    boxes: list[Rect] = []
    for contour in contours:
        x, y, w, h = cv2.boundingRect(contour)
        if not (min_side <= w <= max_side and min_side <= h <= max_side):
            continue
        # Width/height ratio must stay close to 1 for the contour to count as square.
        if not 0.8 <= (w / h) <= 1.25:
            continue
        candidate = Rect(region.x + x, region.y + y, w, h)
        if any(abs(candidate.x - existing.x) < 5 and abs(candidate.y - existing.y) < 5 for existing in boxes):
            continue
        boxes.append(candidate)
    return sorted(boxes, key=lambda item: (item.y, item.x))
|
|
212
|
+
|
|
213
|
+
|
|
214
|
+
def detect_lines(
    page_gray: np.ndarray,
    region: Rect,
    *,
    min_width: int = 500,
    max_width: int = 900,
    max_height: int = 12,
) -> list[Rect]:
    """Find long, thin horizontal contours (e.g. signature lines) inside ``region``.

    The region is thresholded at 200 (inverted, so dark pixels become
    foreground) and every contour's bounding rectangle is tested.

    Args:
        page_gray: Grayscale page image indexed as ``[row, column]``.
        region: Area of the page to search, in page pixel coordinates.
        min_width: Smallest accepted bounding-box width, in pixels.
        max_width: Largest accepted bounding-box width, in pixels.
        max_height: Largest accepted bounding-box height, in pixels.
            The defaults reproduce the previously hard-coded limits
            (500-900 px wide, at most 12 px tall).

    Returns:
        Lines in page coordinates, sorted by ``x``. A candidate whose top-left
        corner is within 10 px (both axes) of an already accepted line is
        treated as a duplicate and dropped.
    """
    crop = page_gray[region.y:region.y2, region.x:region.x2]
    _, thresh = cv2.threshold(crop, 200, 255, cv2.THRESH_BINARY_INV)
    contours, _ = cv2.findContours(thresh, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
    lines: list[Rect] = []
    for contour in contours:
        x, y, w, h = cv2.boundingRect(contour)
        if not (min_width <= w <= max_width and h <= max_height):
            continue
        candidate = Rect(region.x + x, region.y + y, w, h)
        if any(abs(candidate.x - existing.x) < 10 and abs(candidate.y - existing.y) < 10 for existing in lines):
            continue
        lines.append(candidate)
    return sorted(lines, key=lambda item: item.x)
|
|
227
|
+
|
|
228
|
+
|
|
229
|
+
def detect_id_slots(page_gray: np.ndarray, rect: Rect) -> list[Rect]:
    """Split an ID-number field into nine per-digit slots.

    Only the lower 30 % of ``rect`` is analysed; pixels darker than 180 count
    as ink. Runs of adjacent columns with at least 8 ink pixels are collapsed
    to their centre column and used as slot boundaries, together with the left
    and right edges of the field. Exactly 10 boundaries are required.
    (Presumably the ink runs are the vertical guide ticks between digits -
    verify against the target form.)

    Returns:
        One ``Rect`` per pair of neighbouring boundaries, inset 3 px on each
        side, in page coordinates.

    Raises:
        RuntimeError: if the number of boundaries found is not 10.
    """
    crop = page_gray[rect.y:rect.y2, rect.x:rect.x2]
    crop_h = crop.shape[0]
    crop_w = crop.shape[1]
    band_top = int(crop_h * 0.7)
    dark = crop[band_top:, :] < 180

    # First row of the lower band holding at least 6 ink pixels, expressed
    # relative to the top of the crop; 82 % of the height if there is none.
    first_guide_row = int(crop_h * 0.82)
    for offset, dark_count in enumerate(dark.sum(axis=1)):
        if dark_count >= 6:
            first_guide_row = offset + band_top
            break

    # Group consecutive ink-heavy columns into [first, last] runs.
    runs: list[list[int]] = []
    for column, dark_count in enumerate(dark.sum(axis=0)):
        if dark_count < 8:
            continue
        if runs and column == runs[-1][1] + 1:
            runs[-1][1] = column
        else:
            runs.append([column, column])

    # Run centres too close to either edge are ignored; the edges themselves
    # are always boundaries.
    edges = {0, crop_w - 1}
    for first, last in runs:
        middle = int(round((first + last) / 2))
        if 4 < middle < crop_w - 5:
            edges.add(middle)
    boundaries = sorted(edges)

    if len(boundaries) != 10:
        raise RuntimeError(f"Expected 10 ID slot boundaries, found {len(boundaries)} for {rect}.")

    digit_top = rect.y + first_guide_row - int(rect.h * 0.34)
    digit_height = int(rect.h * 0.48)
    return [
        Rect(rect.x + left + 3, digit_top, right - left - 6, digit_height)
        for left, right in zip(boundaries, boundaries[1:])
    ]
|
|
274
|
+
|
|
275
|
+
|
|
276
|
+
def draw_id_number(draw: ImageDraw.ImageDraw, page_gray: np.ndarray, rect: Rect, number: str) -> None:
    """Draw each character of ``number`` centred in its detected ID slot.

    Raises:
        RuntimeError: if ``number`` has a different length than the number of
            slots detected in ``rect`` (or if slot detection itself fails).
    """
    slots = detect_id_slots(page_gray, rect)
    if len(number) != len(slots):
        raise RuntimeError(f"ID length {len(number)} does not match detected slot count {len(slots)}.")
    for position, slot in enumerate(slots):
        draw_text(draw, number[position], slot, align="center", max_size=74, min_size=54)
|
|
282
|
+
|
|
283
|
+
|
|
284
|
+
def draw_check(
    draw: ImageDraw.ImageDraw,
    rect: Rect,
    *,
    raise_px: int = 10,
    fill: tuple[int, int, int, int] = TEXT_COLOR,
) -> None:
    """Draw a two-stroke check mark scaled to ``rect``.

    The three vertices are placed at fixed fractions of the rectangle's width
    and height and then shifted upwards by ``raise_px``. The stroke width is a
    quarter of the rectangle width, but never less than 10 px.
    """
    stroke = max(10, rect.w // 4)
    # (x fraction, y fraction) of the start, elbow and tip of the check mark.
    fractions = ((0.18, 0.54), (0.43, 0.80), (0.83, 0.20))
    start, elbow, tip = [
        (rect.x + rect.w * fx, rect.y + rect.h * fy - raise_px)
        for fx, fy in fractions
    ]
    for segment in ([start, elbow], [elbow, tip]):
        draw.line(segment, fill=fill, width=stroke)
|
|
298
|
+
|
|
299
|
+
|
|
300
|
+
def paste_signature(
    overlay: Image.Image,
    signature: Image.Image,
    line_rect: Rect,
    *,
    min_cm_width: float = 2.0,
    target_height: int | None = None,
    y_offset: int = 45,
) -> None:
    """Composite a signature image onto ``overlay`` above a signature line.

    Args:
        overlay: Destination image, modified in place. ``alpha_composite``
            requires it to be in RGBA mode.
        signature: Signature image. Non-RGBA images are converted to RGBA
            first, because both the alpha-channel crop and the composite need
            an alpha channel.
        line_rect: Detected signature line; the signature is centred on it
            horizontally.
        min_cm_width: Lower bound for the signature width.
            NOTE(review): the conversion divides the overlay width by 21.0,
            which presumably assumes a 21 cm wide (A4) page - confirm.
        target_height: Output height in pixels; 260 when omitted.
        y_offset: Added to the computed top position of the signature.

    The signature is resized to exactly ``target_width x target_height``; its
    aspect ratio is not preserved.
    """
    # getchannel("A") raises for images without an alpha band, so normalise first.
    if signature.mode != "RGBA":
        signature = signature.convert("RGBA")

    # Trim fully transparent borders; getbbox() is None for an all-transparent image.
    alpha_bbox = signature.getchannel("A").getbbox()
    if alpha_bbox:
        signature = signature.crop(alpha_bbox)

    min_signature_width = int(round((overlay.width / 21.0) * min_cm_width))
    target_width = max(min_signature_width, int(line_rect.w * 0.55))
    if target_height is None:
        target_height = 260
    resized = signature.resize((target_width, target_height), Image.Resampling.LANCZOS)
    x = int(line_rect.x + (line_rect.w - target_width) / 2)
    y = int(line_rect.y - target_height + y_offset)
    overlay.alpha_composite(resized, (x, y))
|
|
321
|
+
|
|
322
|
+
|
|
323
|
+
def render_pdf_page(pdf_path: Path, page_index: int, scale: int, out_path: Path) -> Image.Image:
    """Rasterise one PDF page, save it to ``out_path`` and return it.

    Args:
        pdf_path: PDF file to open.
        page_index: Index of the page within the document.
        scale: Zoom factor applied to both axes of the render matrix.
        out_path: Destination image file.

    Returns:
        The rendered page as an RGB ``PIL.Image.Image``.
    """
    # The document is closed when the block exits, including on error.
    with fitz.open(pdf_path) as document:
        source_page = document[page_index]
        zoom = fitz.Matrix(scale, scale)
        pixmap = source_page.get_pixmap(matrix=zoom, alpha=False)
        dimensions = (pixmap.width, pixmap.height)
        rendered = Image.frombytes("RGB", dimensions, pixmap.samples)
        rendered.save(out_path)
        return rendered
|
|
333
|
+
|
|
334
|
+
|
|
335
|
+
def merge_overlay_pdf(src_pdf: Path, overlay_png: Path, out_pdf: Path, page_index: int = 0) -> None:
    """Stamp an overlay image onto one page of a PDF and write the result.

    Args:
        src_pdf: Source PDF.
        overlay_png: Overlay image; it is stretched to the full media box of
            the target page. Transparency is handled with ``mask="auto"``.
        out_pdf: Output PDF path. All source pages are written, in order.
        page_index: Page that receives the overlay. Defaults to 0, which is
            the previous fixed behaviour; negative indices count from the end.

    Raises:
        IndexError: if ``page_index`` does not exist in the source PDF.
    """
    reader = PdfReader(str(src_pdf))

    # Indexing first makes an out-of-range page fail before any work is done.
    target_page = reader.pages[page_index]
    target_position = page_index % len(reader.pages)
    width = float(target_page.mediabox.width)
    height = float(target_page.mediabox.height)

    # Build a single-page in-memory PDF holding only the overlay image.
    # NOTE(review): the image is drawn at (0, 0); a media box with a non-zero
    # origin would presumably shift the overlay - confirm if such PDFs occur.
    overlay_buffer = BytesIO()
    c = canvas.Canvas(overlay_buffer, pagesize=(width, height))
    c.drawImage(ImageReader(str(overlay_png)), 0, 0, width=width, height=height, mask="auto")
    c.save()
    overlay_buffer.seek(0)
    overlay_page = PdfReader(overlay_buffer).pages[0]

    writer = PdfWriter()
    for position, page in enumerate(reader.pages):
        if position == target_position:
            page.merge_page(overlay_page)
        writer.add_page(page)

    with out_pdf.open("wb") as handle:
        writer.write(handle)
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: pdf-form-tools
|
|
3
|
+
Version: 2.0.0
|
|
4
|
+
Summary: Template-aware tools for filling scanned PDF forms with visual verification
|
|
5
|
+
License-Expression: MIT
|
|
6
|
+
Project-URL: Homepage, https://github.com/ceratops-code/pdf-form-tools
|
|
7
|
+
Project-URL: Repository, https://github.com/ceratops-code/pdf-form-tools
|
|
8
|
+
Project-URL: Issues, https://github.com/ceratops-code/pdf-form-tools/issues
|
|
9
|
+
Keywords: pdf,forms,opencv,pymupdf,document-processing
|
|
10
|
+
Classifier: Development Status :: 4 - Beta
|
|
11
|
+
Classifier: Intended Audience :: Developers
|
|
12
|
+
Classifier: Operating System :: OS Independent
|
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
17
|
+
Classifier: Topic :: Multimedia :: Graphics
|
|
18
|
+
Classifier: Topic :: Office/Business :: Office Suites
|
|
19
|
+
Requires-Python: >=3.11
|
|
20
|
+
Description-Content-Type: text/markdown
|
|
21
|
+
License-File: LICENSE
|
|
22
|
+
Requires-Dist: numpy
|
|
23
|
+
Requires-Dist: opencv-python-headless
|
|
24
|
+
Requires-Dist: pymupdf
|
|
25
|
+
Requires-Dist: pillow
|
|
26
|
+
Requires-Dist: pypdf
|
|
27
|
+
Requires-Dist: reportlab
|
|
28
|
+
Requires-Dist: python-bidi
|
|
29
|
+
Provides-Extra: dev
|
|
30
|
+
Requires-Dist: build>=1.2; extra == "dev"
|
|
31
|
+
Requires-Dist: pytest>=8.0; extra == "dev"
|
|
32
|
+
Requires-Dist: ruff>=0.6; extra == "dev"
|
|
33
|
+
Requires-Dist: twine>=5.1; extra == "dev"
|
|
34
|
+
Dynamic: license-file
|
|
35
|
+
|
|
36
|
+
# pdf-form-tools
|
|
37
|
+
|
|
38
|
+
`pdf-form-tools` is an import-only Python package for filling layout-sensitive scanned PDF forms with deterministic placement helpers and visual verification primitives.
|
|
39
|
+
|
|
40
|
+
It is intentionally small:
|
|
41
|
+
|
|
42
|
+
- render PDF pages to raster images
|
|
43
|
+
- detect writable regions, checkbox boxes, signature lines, and ID slots
|
|
44
|
+
- draw text, checks, and signatures onto an overlay
|
|
45
|
+
- merge the overlay back into the original PDF
|
|
46
|
+
|
|
47
|
+
## Install
|
|
48
|
+
|
|
49
|
+
```bash
|
|
50
|
+
python -m pip install pdf-form-tools
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
## Example
|
|
54
|
+
|
|
55
|
+
```python
|
|
56
|
+
from pathlib import Path
|
|
57
|
+
|
|
58
|
+
from pdf_form_tools import Rect, merge_overlay_pdf, render_pdf_page
|
|
59
|
+
|
|
60
|
+
source_pdf = Path("form.pdf")
|
|
61
|
+
preview_png = Path("preview-page1.png")
|
|
62
|
+
render_pdf_page(source_pdf, 0, 2, preview_png)
|
|
63
|
+
|
|
64
|
+
# draw your overlay separately, then merge it back
|
|
65
|
+
merge_overlay_pdf(source_pdf, Path("overlay-page1.png"), Path("form-filled.pdf"))
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
## Development
|
|
69
|
+
|
|
70
|
+
```bash
|
|
71
|
+
python -m pip install -e ".[dev]"
|
|
72
|
+
python -m ruff check .
|
|
73
|
+
python -m pytest
|
|
74
|
+
python -m build
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
## Scope
|
|
78
|
+
|
|
79
|
+
This package contains reusable low-level helpers only. Form-specific filling flows belong in project-local scripts or thin runners, not in the shared library.
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
LICENSE
|
|
2
|
+
README.md
|
|
3
|
+
pyproject.toml
|
|
4
|
+
src/pdf_form_tools/__init__.py
|
|
5
|
+
src/pdf_form_tools/pdf_form_overlay.py
|
|
6
|
+
src/pdf_form_tools.egg-info/PKG-INFO
|
|
7
|
+
src/pdf_form_tools.egg-info/SOURCES.txt
|
|
8
|
+
src/pdf_form_tools.egg-info/dependency_links.txt
|
|
9
|
+
src/pdf_form_tools.egg-info/requires.txt
|
|
10
|
+
src/pdf_form_tools.egg-info/top_level.txt
|
|
11
|
+
tests/test_pdf_form_overlay.py
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
pdf_form_tools
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
|
|
3
|
+
import pdf_form_tools.pdf_form_overlay as overlay
|
|
4
|
+
from pdf_form_tools import Rect, centered_address_box
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def test_rect_inset() -> None:
    """A single-argument inset shrinks the rectangle equally on every side."""
    shrunk = Rect(10, 20, 30, 40).inset(5)
    assert shrunk == Rect(15, 25, 20, 30)
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def test_centered_address_box() -> None:
    """Without ``right_pad`` the side padding is applied on both sides."""
    outer = Rect(100, 200, 400, 120)
    inner = centered_address_box(outer, top_pad=10, side_pad=20, height=50)
    assert inner == Rect(120, 210, 360, 50)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def test_load_font_uses_existing_system_font() -> None:
    """A regular font can be loaded and the resolved path points at a real file."""
    loaded = overlay.load_font(18, bold=False)
    assert loaded is not None
    resolved = Path(overlay.resolve_font_path())
    assert resolved.exists()
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def test_contains_hebrew_detects_hebrew_characters() -> None:
    """Hebrew text is detected; plain Latin text is not."""
    hebrew_found = overlay.contains_hebrew("אמילי")
    latin_found = overlay.contains_hebrew("Emily")
    assert hebrew_found
    assert not latin_found
|