flyfield 2025.8.29__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- flyfield-2025.8.29/LICENSE +3 -0
- flyfield-2025.8.29/MANIFEST.in +3 -0
- flyfield-2025.8.29/PKG-INFO +164 -0
- flyfield-2025.8.29/README.md +142 -0
- flyfield-2025.8.29/flyfield/__init__.py +23 -0
- flyfield-2025.8.29/flyfield/cli.py +6 -0
- flyfield-2025.8.29/flyfield/config.py +34 -0
- flyfield-2025.8.29/flyfield/extract.py +176 -0
- flyfield-2025.8.29/flyfield/io_utils.py +150 -0
- flyfield-2025.8.29/flyfield/layout.py +232 -0
- flyfield-2025.8.29/flyfield/main.py +205 -0
- flyfield-2025.8.29/flyfield/markup_and_fields.py +257 -0
- flyfield-2025.8.29/flyfield/utils.py +161 -0
- flyfield-2025.8.29/flyfield.egg-info/PKG-INFO +164 -0
- flyfield-2025.8.29/flyfield.egg-info/SOURCES.txt +26 -0
- flyfield-2025.8.29/flyfield.egg-info/dependency_links.txt +1 -0
- flyfield-2025.8.29/flyfield.egg-info/entry_points.txt +2 -0
- flyfield-2025.8.29/flyfield.egg-info/requires.txt +2 -0
- flyfield-2025.8.29/flyfield.egg-info/top_level.txt +5 -0
- flyfield-2025.8.29/pyproject.toml +36 -0
- flyfield-2025.8.29/setup.cfg +4 -0
- flyfield-2025.8.29/tests/__init__.py +2 -0
- flyfield-2025.8.29/tests/test_extract.py +21 -0
- flyfield-2025.8.29/tests/test_io_utils.py +32 -0
- flyfield-2025.8.29/tests/test_layout.py +41 -0
- flyfield-2025.8.29/tests/test_main.py +27 -0
- flyfield-2025.8.29/tests/test_markup_and_fields.py +15 -0
- flyfield-2025.8.29/tests/test_utils.py +27 -0
|
@@ -0,0 +1,164 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: flyfield
|
|
3
|
+
Version: 2025.8.29
|
|
4
|
+
Summary: Tools for extracting, processing, and generating interactive fields for PDFs containing white box fields.
|
|
5
|
+
Author-email: flywire <flywire0@gmail.com>
|
|
6
|
+
License: GPL-3.0-or-later
|
|
7
|
+
Keywords: pdf,form,fields,extraction,pymupdf,pypdfforms
|
|
8
|
+
Classifier: Development Status :: 4 - Beta
|
|
9
|
+
Classifier: Intended Audience :: Developers
|
|
10
|
+
Classifier: License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)
|
|
11
|
+
Classifier: Programming Language :: Python :: 3
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
15
|
+
Classifier: Topic :: Software Development :: Libraries
|
|
16
|
+
Requires-Python: >=3.8
|
|
17
|
+
Description-Content-Type: text/markdown
|
|
18
|
+
License-File: LICENSE
|
|
19
|
+
Requires-Dist: PyMuPDF>=1.18.0
|
|
20
|
+
Requires-Dist: PyPDFForm>=0.3.3
|
|
21
|
+
Dynamic: license-file
|
|
22
|
+
|
|
23
|
+
# flyfield
|
|
24
|
+
|
|
25
|
+
A tool to automatically detect white boxes in PDFs and convert them into interactive, automated form fields—aimed at users and developers looking to streamline PDF form workflows.
|
|
26
|
+
|
|
27
|
+
---
|
|
28
|
+
|
|
29
|
+
## Overview
|
|
30
|
+
|
|
31
|
+
**flyfield** is a Python library and command-line tool designed to automate the conversion of white box placeholders within PDFs into fully interactive form fields. This enables users and developers to analyze form layouts, generate fillable fields, fill forms programmatically using CSV data, and capture filled data for further use.
|
|
32
|
+
|
|
33
|
+
Powered by [PyMuPDF](https://pymupdf.readthedocs.io) and [PyPDFForm](https://pypdfform.readthedocs.io), flyfield provides a modular, extensible solution for reliable PDF form automation.
|
|
34
|
+
|
|
35
|
+
---
|
|
36
|
+
|
|
37
|
+
## Key Features
|
|
38
|
+
|
|
39
|
+
- **White box detection:** Precisely extract white box regions as potential form fields from vector PDFs.
|
|
40
|
+
- **Layout analysis:** Group extracted fields by page, line, and block with flexible gap detection.
|
|
41
|
+
- **Form field generation:** Automatically produce Python scripts to create interactive PDF form fields aligned with detected boxes.
|
|
42
|
+
- **Markup visualization:** Generate annotated PDFs marking detected fields for verification.
|
|
43
|
+
- **Form filling and capture:** Fill PDF forms programmatically from CSV data and extract filled data into CSV format.
|
|
44
|
+
- **CLI integration:** User-friendly command-line interface to orchestrate workflows from extraction to data capture.
|
|
45
|
+
- **Open Source and Extensible:** Easily customize and extend for specific PDF processing needs.
|
|
46
|
+
|
|
47
|
+
---
|
|
48
|
+
|
|
49
|
+
## Installation
|
|
50
|
+
|
|
51
|
+
For isolated installation, use [pipx](https://pipxproject.github.io/pipx/):
|
|
52
|
+
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
pipx install flyfield
|
|
56
|
+
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
Verify the installed version:
|
|
60
|
+
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
flyfield --version
|
|
64
|
+
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
Alternatively, install with `pip`:
|
|
68
|
+
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
pip install flyfield
|
|
72
|
+
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
---
|
|
76
|
+
|
|
77
|
+
## Usage
|
|
78
|
+
|
|
79
|
+
Execute commands against PDF files as needed:
|
|
80
|
+
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
flyfield --input-pdf myfile.pdf --markup
|
|
84
|
+
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
Common options:
|
|
88
|
+
|
|
89
|
+
- `--markup` → Generate a marked-up PDF showing detected white boxes.
|
|
90
|
+
- `--fields` → Generate and run scripts to add interactive form fields.
|
|
91
|
+
- `--fill` → Fill form fields using CSV data.
|
|
92
|
+
- `--capture` → Extract filled form data back to CSV.
|
|
93
|
+
- `--input-csv` → Provide CSV input for field data, bypassing extraction.
|
|
94
|
+
- `--debug` → Enable verbose debug output.
|
|
95
|
+
|
|
96
|
+
Example workflow:
|
|
97
|
+
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
flyfield --input-pdf form_template.pdf --markup --fields
|
|
101
|
+
flyfield --input-pdf form_template-fields.pdf --input-csv form_template.csv --fill
|
|
102
|
+
flyfield --input-pdf form_template-filled.pdf --capture
|
|
103
|
+
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
---
|
|
107
|
+
|
|
108
|
+
## For Developers
|
|
109
|
+
|
|
110
|
+
Clone the repository and install development dependencies:
|
|
111
|
+
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
git clone https://github.com/flywire/flyfield.git
|
|
115
|
+
cd flyfield
|
|
116
|
+
pip install -e .[dev]
|
|
117
|
+
|
|
118
|
+
```
|
|
119
|
+
|
|
120
|
+
Run the test suite with coverage enabled:
|
|
121
|
+
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
tox
|
|
125
|
+
|
|
126
|
+
```
|
|
127
|
+
|
|
128
|
+
The project is modular with separate components, including:
|
|
129
|
+
|
|
130
|
+
- `extract` for box detection
|
|
131
|
+
- `layout` for field grouping and filtering
|
|
132
|
+
- `markup_and_fields` for field generation and markup
|
|
133
|
+
- `io_utils` for data input/output
|
|
134
|
+
- `utils` for utility functions
|
|
135
|
+
|
|
136
|
+
Get CLI help with:
|
|
137
|
+
|
|
138
|
+
```
|
|
139
|
+
|
|
140
|
+
python -m flyfield.cli --help
|
|
141
|
+
|
|
142
|
+
```
|
|
143
|
+
|
|
144
|
+
---
|
|
145
|
+
|
|
146
|
+
## License
|
|
147
|
+
|
|
148
|
+
Licensed under the **GNU General Public License v3.0 or later** (GPL-3.0-or-later). See [LICENSE](LICENSE) for details.
|
|
149
|
+
|
|
150
|
+
---
|
|
151
|
+
|
|
152
|
+
## Contributing
|
|
153
|
+
|
|
154
|
+
Contributions are welcome! Please open issues to report bugs or request features, and submit pull requests with tests and documentation improvements.
|
|
155
|
+
|
|
156
|
+
---
|
|
157
|
+
|
|
158
|
+
## Acknowledgements
|
|
159
|
+
|
|
160
|
+
- Built using [PyMuPDF](https://pymupdf.readthedocs.io) for PDF handling.
|
|
161
|
+
- Uses [PyPDFForm](https://pypdfform.readthedocs.io) for interactive form creation.
|
|
162
|
+
- Inspired by the need for robust automation of PDF workflows involving white-box form fields.
|
|
163
|
+
|
|
164
|
+
---
|
|
@@ -0,0 +1,142 @@
|
|
|
1
|
+
# flyfield
|
|
2
|
+
|
|
3
|
+
A tool to automatically detect white boxes in PDFs and convert them into interactive, automated form fields—aimed at users and developers looking to streamline PDF form workflows.
|
|
4
|
+
|
|
5
|
+
---
|
|
6
|
+
|
|
7
|
+
## Overview
|
|
8
|
+
|
|
9
|
+
**flyfield** is a Python library and command-line tool designed to automate the conversion of white box placeholders within PDFs into fully interactive form fields. This enables users and developers to analyze form layouts, generate fillable fields, fill forms programmatically using CSV data, and capture filled data for further use.
|
|
10
|
+
|
|
11
|
+
Powered by [PyMuPDF](https://pymupdf.readthedocs.io) and [PyPDFForm](https://pypdfform.readthedocs.io), flyfield provides a modular, extensible solution for reliable PDF form automation.
|
|
12
|
+
|
|
13
|
+
---
|
|
14
|
+
|
|
15
|
+
## Key Features
|
|
16
|
+
|
|
17
|
+
- **White box detection:** Precisely extract white box regions as potential form fields from vector PDFs.
|
|
18
|
+
- **Layout analysis:** Group extracted fields by page, line, and block with flexible gap detection.
|
|
19
|
+
- **Form field generation:** Automatically produce Python scripts to create interactive PDF form fields aligned with detected boxes.
|
|
20
|
+
- **Markup visualization:** Generate annotated PDFs marking detected fields for verification.
|
|
21
|
+
- **Form filling and capture:** Fill PDF forms programmatically from CSV data and extract filled data into CSV format.
|
|
22
|
+
- **CLI integration:** User-friendly command-line interface to orchestrate workflows from extraction to data capture.
|
|
23
|
+
- **Open Source and Extensible:** Easily customize and extend for specific PDF processing needs.
|
|
24
|
+
|
|
25
|
+
---
|
|
26
|
+
|
|
27
|
+
## Installation
|
|
28
|
+
|
|
29
|
+
For isolated installation, use [pipx](https://pipxproject.github.io/pipx/):
|
|
30
|
+
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
pipx install flyfield
|
|
34
|
+
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
Verify the installed version:
|
|
38
|
+
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
flyfield --version
|
|
42
|
+
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
Alternatively, install with `pip`:
|
|
46
|
+
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
pip install flyfield
|
|
50
|
+
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
---
|
|
54
|
+
|
|
55
|
+
## Usage
|
|
56
|
+
|
|
57
|
+
Execute commands against PDF files as needed:
|
|
58
|
+
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
flyfield --input-pdf myfile.pdf --markup
|
|
62
|
+
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
Common options:
|
|
66
|
+
|
|
67
|
+
- `--markup` → Generate a marked-up PDF showing detected white boxes.
|
|
68
|
+
- `--fields` → Generate and run scripts to add interactive form fields.
|
|
69
|
+
- `--fill` → Fill form fields using CSV data.
|
|
70
|
+
- `--capture` → Extract filled form data back to CSV.
|
|
71
|
+
- `--input-csv` → Provide CSV input for field data, bypassing extraction.
|
|
72
|
+
- `--debug` → Enable verbose debug output.
|
|
73
|
+
|
|
74
|
+
Example workflow:
|
|
75
|
+
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
flyfield --input-pdf form_template.pdf --markup --fields
|
|
79
|
+
flyfield --input-pdf form_template-fields.pdf --input-csv form_template.csv --fill
|
|
80
|
+
flyfield --input-pdf form_template-filled.pdf --capture
|
|
81
|
+
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
---
|
|
85
|
+
|
|
86
|
+
## For Developers
|
|
87
|
+
|
|
88
|
+
Clone the repository and install development dependencies:
|
|
89
|
+
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
git clone https://github.com/flywire/flyfield.git
|
|
93
|
+
cd flyfield
|
|
94
|
+
pip install -e .[dev]
|
|
95
|
+
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
Run the test suite with coverage enabled:
|
|
99
|
+
|
|
100
|
+
```
|
|
101
|
+
|
|
102
|
+
tox
|
|
103
|
+
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
The project is modular with separate components, including:
|
|
107
|
+
|
|
108
|
+
- `extract` for box detection
|
|
109
|
+
- `layout` for field grouping and filtering
|
|
110
|
+
- `markup_and_fields` for field generation and markup
|
|
111
|
+
- `io_utils` for data input/output
|
|
112
|
+
- `utils` for utility functions
|
|
113
|
+
|
|
114
|
+
Get CLI help with:
|
|
115
|
+
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
python -m flyfield.cli --help
|
|
119
|
+
|
|
120
|
+
```
|
|
121
|
+
|
|
122
|
+
---
|
|
123
|
+
|
|
124
|
+
## License
|
|
125
|
+
|
|
126
|
+
Licensed under the **GNU General Public License v3.0 or later** (GPL-3.0-or-later). See [LICENSE](LICENSE) for details.
|
|
127
|
+
|
|
128
|
+
---
|
|
129
|
+
|
|
130
|
+
## Contributing
|
|
131
|
+
|
|
132
|
+
Contributions are welcome! Please open issues to report bugs or request features, and submit pull requests with tests and documentation improvements.
|
|
133
|
+
|
|
134
|
+
---
|
|
135
|
+
|
|
136
|
+
## Acknowledgements
|
|
137
|
+
|
|
138
|
+
- Built using [PyMuPDF](https://pymupdf.readthedocs.io) for PDF handling.
|
|
139
|
+
- Uses [PyPDFForm](https://pypdfform.readthedocs.io) for interactive form creation.
|
|
140
|
+
- Inspired by the need for robust automation of PDF workflows involving white-box form fields.
|
|
141
|
+
|
|
142
|
+
---
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
"""
|
|
2
|
+
pdf_form_generator package
|
|
3
|
+
|
|
4
|
+
This package provides modular components for:
|
|
5
|
+
- PDF box extraction and layout processing
|
|
6
|
+
- CSV input/output
|
|
7
|
+
- PDF markup and form field generation/filling
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from .config import *
|
|
11
|
+
from .utils import *
|
|
12
|
+
from .extract import *
|
|
13
|
+
from .io_utils import *
|
|
14
|
+
from .markup_and_fields import *
|
|
15
|
+
|
|
16
|
+
# Optionally define __all__ to specify what is exported on import *
|
|
17
|
+
__all__ = [
|
|
18
|
+
"config",
|
|
19
|
+
"utils",
|
|
20
|
+
"extract",
|
|
21
|
+
"io_utils",
|
|
22
|
+
"markup_and_fields",
|
|
23
|
+
]
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Configuration constants used throughout the project.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
# Default file paths
|
|
6
|
+
DEFAULT_INPUT_PDF = "input.pdf"
|
|
7
|
+
DEFAULT_CAPTURE_SUFFIX = "-capture.csv"
|
|
8
|
+
DEFAULT_FIELD_GENERATOR_SUFFIX = "-field-generator.py"
|
|
9
|
+
DEFAULT_FILLER_SUFFIX = "-filler.py"
|
|
10
|
+
DEFAULT_MARKUP_SUFFIX = "-markup"
|
|
11
|
+
DEFAULT_FIELDS_SUFFIX = "-fields"
|
|
12
|
+
DEFAULT_FILL_SUFFIX = "-filled"
|
|
13
|
+
|
|
14
|
+
# Color constants (normalized RGB)
|
|
15
|
+
COLOR_WHITE = (1, 1, 1)
|
|
16
|
+
COLOR_BLACK = (0, 0, 0)
|
|
17
|
+
|
|
18
|
+
# Target color for extraction (normalized RGB)
|
|
19
|
+
TARGET_COLOUR = (1, 1, 1) # RGB normalized white for fill colour detection
|
|
20
|
+
|
|
21
|
+
# Thresholds for layout grouping and gaps (arbitrary units, related to PDF coords)
|
|
22
|
+
GAP_THRESHOLD = 3.0
|
|
23
|
+
GAP = 1.9
|
|
24
|
+
GAP_GROUP = 7.6
|
|
25
|
+
|
|
26
|
+
# Constants for height filtering (units are PDF coordinate points)
|
|
27
|
+
MIN_BOX_HEIGHT = 15.0
|
|
28
|
+
MAX_BOX_HEIGHT = 30.0 # Filters out signature boxes
|
|
29
|
+
|
|
30
|
+
# Field types considered numeric for special processing
|
|
31
|
+
NUMERIC_FIELD_TYPES = ["Currency", "CurrencyDecimal", "DollarCents", "Dollars"]
|
|
32
|
+
|
|
33
|
+
# Pages to test on, if set (list of page numbers)
|
|
34
|
+
PDF_PAGES = None
|
|
@@ -0,0 +1,176 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from collections import defaultdict
|
|
3
|
+
import fitz # PyMuPDF
|
|
4
|
+
from .config import (
|
|
5
|
+
TARGET_COLOUR,
|
|
6
|
+
MIN_BOX_HEIGHT,
|
|
7
|
+
MAX_BOX_HEIGHT,
|
|
8
|
+
)
|
|
9
|
+
from .utils import (
|
|
10
|
+
colour_match,
|
|
11
|
+
int_to_rgb,
|
|
12
|
+
allowed_text,
|
|
13
|
+
)
|
|
14
|
+
|
|
15
|
+
logger = logging.getLogger(__name__)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def _box_from_rect(page_num, rect, page_height):
    """Build the box dict for one rectangle, flipping y into bottom-left-origin PDF coordinates."""
    # PyMuPDF reports y from the top of the page; PDF space measures from the bottom,
    # so the rectangle's lower edge (y1) becomes the smaller PDF y value.
    flipped_y0 = page_height - rect.y1
    flipped_y1 = page_height - rect.y0
    return {
        "page_num": page_num,
        "x0": rect.x0,
        "y0": flipped_y0,
        "x1": rect.x1,
        "y1": flipped_y1,
        "left": round(rect.x0, 2),
        "bottom": round(flipped_y0, 2),
        "right": round(rect.x1, 2),
        "top": round(flipped_y1, 2),
        "chars": "",
        "field_type": None,
    }


def extract_boxes(pdf_path):
    """
    Collect the drawn rectangles whose fill colour matches TARGET_COLOUR, page by page.

    Args:
        pdf_path (str): Path to input PDF file.

    Returns:
        list of dict: One dict per matching rectangle with the 1-based page number,
            the raw ("x0", "y0", "x1", "y1") and 2-decimal rounded
            ("left", "bottom", "right", "top") coordinates in the PDF coordinate
            system (origin bottom-left), plus the placeholders "chars" (empty
            string) and "field_type" (None). If an exception is raised while
            opening or reading the document it is logged and the boxes gathered
            so far are returned.
    """
    found = []
    try:
        with fitz.open(pdf_path) as doc:
            total_pages = len(doc)
            for index in range(total_pages):
                page_num = index + 1  # page numbers in the result are 1-based
                try:
                    page = doc[index]
                except IndexError:
                    logger.warning(f"Page {page_num} not found in document.")
                    continue
                page_height = page.rect.height
                for drawing in page.get_drawings():
                    rect = drawing.get("rect")
                    fill_color = drawing.get("fill")
                    if not rect:
                        continue
                    if not colour_match(fill_color, target_color=TARGET_COLOUR):
                        continue
                    found.append(_box_from_rect(page_num, rect, page_height))
    except Exception as e:
        logger.error(f"Could not open PDF file {pdf_path}: {e}")
    return found
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def _span_rgb(span_color):
    """Turn a span's colour value (int or colour-name string) into an RGB tuple, or None."""
    if span_color is None:
        return None
    if isinstance(span_color, int):
        return int_to_rgb(span_color)
    if isinstance(span_color, str):
        try:
            return fitz.utils.getColor(span_color)
        except Exception:
            # Unknown colour names are treated the same as "no colour".
            return None
    return None


def filter_boxes(page, boxes):
    """
    Keep only the boxes on a page that pass the height and allowed-text checks.

    For each box of acceptable height, the text inside the box is read from the
    page and split by colour: black spans are stored as the box's "fill" value,
    all other spans as its "chars" value. A box is dropped when its non-black
    text is non-empty and rejected by ``allowed_text``.

    Args:
        page (fitz.Page): PyMuPDF page object.
        boxes (list): List of box dicts extracted from the page.

    Returns:
        list: The surviving box dicts (the same objects, updated in place with
            "field_type", "chars" and "fill").
    """
    kept = []
    page_height = page.rect.height
    black = (0, 0, 0)  # RGB for black text matching

    for box in boxes:
        height = box.get("y1", 0) - box.get("y0", 0)
        if height < MIN_BOX_HEIGHT or height > MAX_BOX_HEIGHT:
            continue

        # Flip the box back into PyMuPDF's top-left-origin coordinates for clipping.
        clip_top = page_height - box["y1"]
        clip_bottom = page_height - box["y0"]
        clip_rect = fitz.Rect(box["x0"], clip_top, box["x1"], clip_bottom)

        text_dict = page.get_text("dict", clip=clip_rect)

        black_parts = []
        other_parts = []
        spans = (
            span
            for block in text_dict.get("blocks", [])
            for line in block.get("lines", [])
            for span in line.get("spans", [])
        )
        for span in spans:
            text = span.get("text", "").strip()
            if not text:
                continue
            rgb = _span_rgb(span.get("color"))
            is_black = rgb and colour_match(rgb, target_color=black)
            (black_parts if is_black else other_parts).append(text)

        fill_text = "".join(black_parts)
        box_text = "".join(other_parts)

        allowed, detected_field_type = allowed_text(
            box_text, field_type=box.get("field_type")
        )
        if box_text and not allowed:
            continue

        box["field_type"] = detected_field_type
        box["chars"] = box_text
        box["fill"] = fill_text
        kept.append(box)
    return kept
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
def remove_duplicates(boxes):
    """
    Drop boxes that repeat an earlier box on the same page.

    Two boxes on one page count as duplicates when their "x0", "y0", "x1" and
    "y1" values agree after rounding to 3 decimal places; the first occurrence
    is kept.

    Args:
        boxes (list): List of box dicts.

    Returns:
        list: The unique boxes, grouped by page in order of each page's first
            appearance, preserving the input order within a page.
    """
    by_page = {}
    for box in boxes:
        by_page.setdefault(box["page_num"], []).append(box)

    unique = []
    for group in by_page.values():
        seen_keys = set()
        for candidate in group:
            signature = tuple(
                round(candidate[name], 3) for name in ("x0", "y0", "x1", "y1")
            )
            if signature in seen_keys:
                continue
            seen_keys.add(signature)
            unique.append(candidate)
    return unique
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
def sort_boxes(boxes, decimal_places=0):
    """
    Order boxes by page, then from the top of the page down, then left to right.

    Args:
        boxes (list): List of box dicts.
        decimal_places (int): Rounding precision applied to each box's "bottom"
            value before comparing, so boxes at nearly the same height sort as
            one row.

    Returns:
        list: A new list holding the boxes in sorted order.
    """

    def reading_order(box):
        page = box["page_num"]
        row = round(box["bottom"], decimal_places)
        # Negate the row so larger "bottom" values (higher on the page) come first.
        return (page, -row, box["left"])

    ordered = list(boxes)
    ordered.sort(key=reading_order)
    return ordered
|