flyfield 2025.8.29__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,3 @@
1
+ flyfield is licensed under the GNU General Public License version 3 or later (GPL-3.0-or-later).
2
+
3
+ Full license text available at: https://www.gnu.org/licenses/gpl-3.0.html
@@ -0,0 +1,3 @@
1
+ include README.md
2
+ include LICENSE
3
+ recursive-include flyfield *.py
@@ -0,0 +1,164 @@
1
+ Metadata-Version: 2.4
2
+ Name: flyfield
3
+ Version: 2025.8.29
4
+ Summary: Tools for extracting, processing, and generating interactive fields for PDFs containing white box fields.
5
+ Author-email: flywire <flywire0@gmail.com>
6
+ License: GPL-3.0-or-later
7
+ Keywords: pdf,form,fields,extraction,pymupdf,pypdfforms
8
+ Classifier: Development Status :: 4 - Beta
9
+ Classifier: Intended Audience :: Developers
10
+ Classifier: License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)
11
+ Classifier: Programming Language :: Python :: 3
12
+ Classifier: Programming Language :: Python :: 3.8
13
+ Classifier: Programming Language :: Python :: 3.9
14
+ Classifier: Programming Language :: Python :: 3.10
15
+ Classifier: Topic :: Software Development :: Libraries
16
+ Requires-Python: >=3.8
17
+ Description-Content-Type: text/markdown
18
+ License-File: LICENSE
19
+ Requires-Dist: PyMuPDF>=1.18.0
20
+ Requires-Dist: PyPDFForm>=0.3.3
21
+ Dynamic: license-file
22
+
23
+ # flyfield
24
+
25
+ A tool to automatically detect white boxes in PDFs and convert them into interactive, automated form fields—aimed at users and developers looking to streamline PDF form workflows.
26
+
27
+ ---
28
+
29
+ ## Overview
30
+
31
+ **flyfield** is a Python library and command-line tool designed to automate the conversion of white box placeholders within PDFs into fully interactive form fields. This enables users and developers to analyze form layouts, generate fillable fields, fill forms programmatically using CSV data, and capture filled data for further use.
32
+
33
+ Powered by [PyMuPDF](https://pymupdf.readthedocs.io) and [PyPDFForm](https://pypdfform.readthedocs.io), flyfield provides a modular, extensible solution for reliable PDF form automation.
34
+
35
+ ---
36
+
37
+ ## Key Features
38
+
39
+ - **White box detection:** Precisely extract white box regions as potential form fields from vector PDFs.
40
+ - **Layout analysis:** Group extracted fields by page, line, and block with flexible gap detection.
41
+ - **Form field generation:** Automatically produce Python scripts to create interactive PDF form fields aligned with detected boxes.
42
+ - **Markup visualization:** Generate annotated PDFs marking detected fields for verification.
43
+ - **Form filling and capture:** Fill PDF forms programmatically from CSV data and extract filled data into CSV format.
44
+ - **CLI integration:** User-friendly command-line interface to orchestrate workflows from extraction to data capture.
45
+ - **Open Source and Extensible:** Easily customize and extend for specific PDF processing needs.
46
+
47
+ ---
48
+
49
+ ## Installation
50
+
51
+ For isolated installation, use [pipx](https://pipxproject.github.io/pipx/):
52
+
53
+ ```
54
+
55
+ pipx install flyfield
56
+
57
+ ```
58
+
59
+ Verify the installed version:
60
+
61
+ ```
62
+
63
+ flyfield --version
64
+
65
+ ```
66
+
67
+ Alternatively, install with `pip`:
68
+
69
+ ```
70
+
71
+ pip install flyfield
72
+
73
+ ```
74
+
75
+ ---
76
+
77
+ ## Usage
78
+
79
+ Execute commands against PDF files as needed:
80
+
81
+ ```
82
+
83
+ flyfield --input-pdf myfile.pdf --markup
84
+
85
+ ```
86
+
87
+ Common options:
88
+
89
+ - `--markup` → Generate a marked-up PDF showing detected white boxes.
90
+ - `--fields` → Generate and run scripts to add interactive form fields.
91
+ - `--fill` → Fill form fields using CSV data.
92
+ - `--capture` → Extract filled form data back to CSV.
93
+ - `--input-csv` → Provide CSV input for field data, bypassing extraction.
94
+ - `--debug` → Enable verbose debug output.
95
+
96
+ Example workflow:
97
+
98
+ ```
99
+
100
+ flyfield --input-pdf form_template.pdf --markup --fields
101
+ flyfield --input-pdf form_template-fields.pdf --input-csv form_template.csv --fill
102
+ flyfield --input-pdf form_template-filled.pdf --capture
103
+
104
+ ```
105
+
106
+ ---
107
+
108
+ ## For Developers
109
+
110
+ Clone the repository and install development dependencies:
111
+
112
+ ```
113
+
114
+ git clone https://github.com/flywire/flyfield.git
115
+ cd flyfield
116
+ pip install -e ".[dev]"
117
+
118
+ ```
119
+
120
+ Run the test suite with coverage enabled:
121
+
122
+ ```
123
+
124
+ tox
125
+
126
+ ```
127
+
128
+ The project is modular with separate components, including:
129
+
130
+ - `extract` for box detection
131
+ - `layout` for field grouping and filtering
132
+ - `markup_and_fields` for field generation and markup
133
+ - `io_utils` for data input/output
134
+ - `utils` for utility functions
135
+
136
+ Get CLI help with:
137
+
138
+ ```
139
+
140
+ python -m flyfield.cli --help
141
+
142
+ ```
143
+
144
+ ---
145
+
146
+ ## License
147
+
148
+ Licensed under the **GNU General Public License v3.0 or later** (GPL-3.0-or-later). See [LICENSE](LICENSE) for details.
149
+
150
+ ---
151
+
152
+ ## Contributing
153
+
154
+ Contributions are welcome! Please open issues to report bugs or request features, and submit pull requests with tests and documentation improvements.
155
+
156
+ ---
157
+
158
+ ## Acknowledgements
159
+
160
+ - Built using [PyMuPDF](https://pymupdf.readthedocs.io) for PDF handling.
161
+ - Uses [PyPDFForm](https://pypdfform.readthedocs.io) for interactive form creation.
162
+ - Inspired by the need for robust automation of PDF workflows involving white boxed form fields.
163
+
164
+ ---
@@ -0,0 +1,142 @@
1
+ # flyfield
2
+
3
+ A tool to automatically detect white boxes in PDFs and convert them into interactive, automated form fields—aimed at users and developers looking to streamline PDF form workflows.
4
+
5
+ ---
6
+
7
+ ## Overview
8
+
9
+ **flyfield** is a Python library and command-line tool designed to automate the conversion of white box placeholders within PDFs into fully interactive form fields. This enables users and developers to analyze form layouts, generate fillable fields, fill forms programmatically using CSV data, and capture filled data for further use.
10
+
11
+ Powered by [PyMuPDF](https://pymupdf.readthedocs.io) and [PyPDFForm](https://pypdfform.readthedocs.io), flyfield provides a modular, extensible solution for reliable PDF form automation.
12
+
13
+ ---
14
+
15
+ ## Key Features
16
+
17
+ - **White box detection:** Precisely extract white box regions as potential form fields from vector PDFs.
18
+ - **Layout analysis:** Group extracted fields by page, line, and block with flexible gap detection.
19
+ - **Form field generation:** Automatically produce Python scripts to create interactive PDF form fields aligned with detected boxes.
20
+ - **Markup visualization:** Generate annotated PDFs marking detected fields for verification.
21
+ - **Form filling and capture:** Fill PDF forms programmatically from CSV data and extract filled data into CSV format.
22
+ - **CLI integration:** User-friendly command-line interface to orchestrate workflows from extraction to data capture.
23
+ - **Open Source and Extensible:** Easily customize and extend for specific PDF processing needs.
24
+
25
+ ---
26
+
27
+ ## Installation
28
+
29
+ For isolated installation, use [pipx](https://pipxproject.github.io/pipx/):
30
+
31
+ ```
32
+
33
+ pipx install flyfield
34
+
35
+ ```
36
+
37
+ Verify the installed version:
38
+
39
+ ```
40
+
41
+ flyfield --version
42
+
43
+ ```
44
+
45
+ Alternatively, install with `pip`:
46
+
47
+ ```
48
+
49
+ pip install flyfield
50
+
51
+ ```
52
+
53
+ ---
54
+
55
+ ## Usage
56
+
57
+ Execute commands against PDF files as needed:
58
+
59
+ ```
60
+
61
+ flyfield --input-pdf myfile.pdf --markup
62
+
63
+ ```
64
+
65
+ Common options:
66
+
67
+ - `--markup` → Generate a marked-up PDF showing detected white boxes.
68
+ - `--fields` → Generate and run scripts to add interactive form fields.
69
+ - `--fill` → Fill form fields using CSV data.
70
+ - `--capture` → Extract filled form data back to CSV.
71
+ - `--input-csv` → Provide CSV input for field data, bypassing extraction.
72
+ - `--debug` → Enable verbose debug output.
73
+
74
+ Example workflow:
75
+
76
+ ```
77
+
78
+ flyfield --input-pdf form_template.pdf --markup --fields
79
+ flyfield --input-pdf form_template-fields.pdf --input-csv form_template.csv --fill
80
+ flyfield --input-pdf form_template-filled.pdf --capture
81
+
82
+ ```
83
+
84
+ ---
85
+
86
+ ## For Developers
87
+
88
+ Clone the repository and install development dependencies:
89
+
90
+ ```
91
+
92
+ git clone https://github.com/flywire/flyfield.git
93
+ cd flyfield
94
+ pip install -e ".[dev]"
95
+
96
+ ```
97
+
98
+ Run the test suite with coverage enabled:
99
+
100
+ ```
101
+
102
+ tox
103
+
104
+ ```
105
+
106
+ The project is modular with separate components, including:
107
+
108
+ - `extract` for box detection
109
+ - `layout` for field grouping and filtering
110
+ - `markup_and_fields` for field generation and markup
111
+ - `io_utils` for data input/output
112
+ - `utils` for utility functions
113
+
114
+ Get CLI help with:
115
+
116
+ ```
117
+
118
+ python -m flyfield.cli --help
119
+
120
+ ```
121
+
122
+ ---
123
+
124
+ ## License
125
+
126
+ Licensed under the **GNU General Public License v3.0 or later** (GPL-3.0-or-later). See [LICENSE](LICENSE) for details.
127
+
128
+ ---
129
+
130
+ ## Contributing
131
+
132
+ Contributions are welcome! Please open issues to report bugs or request features, and submit pull requests with tests and documentation improvements.
133
+
134
+ ---
135
+
136
+ ## Acknowledgements
137
+
138
+ - Built using [PyMuPDF](https://pymupdf.readthedocs.io) for PDF handling.
139
+ - Uses [PyPDFForm](https://pypdfform.readthedocs.io) for interactive form creation.
140
+ - Inspired by the need for robust automation of PDF workflows involving white boxed form fields.
141
+
142
+ ---
@@ -0,0 +1,23 @@
1
+ """
2
+ flyfield package
3
+
4
+ This package provides modular components for:
5
+ - PDF box extraction and layout processing
6
+ - CSV input/output
7
+ - PDF markup and form field generation/filling
8
+ """
9
+
10
+ from .config import *
11
+ from .utils import *
12
+ from .extract import *
13
+ from .io_utils import *
14
+ from .markup_and_fields import *
15
+
16
+ # Optionally define __all__ to specify what is exported on import *
17
+ __all__ = [
18
+ "config",
19
+ "utils",
20
+ "extract",
21
+ "io_utils",
22
+ "markup_and_fields",
23
+ ]
@@ -0,0 +1,6 @@
1
# flyfield/cli.py
#
# Thin command-line entry module: it re-exports ``main`` from ``.main`` and
# invokes it when this module is executed as a script (for example via
# ``python -m flyfield.cli``). Importing the module has no other effect.

from .main import main

# Only start the CLI when run as a script, never on a plain import.
if __name__ == "__main__":
    main()
@@ -0,0 +1,34 @@
1
+ """
2
+ Configuration constants used throughout the project.
3
+ """
4
+
5
# Default input path, plus the suffixes used when naming generated files.
# NOTE(review): presumably each suffix is appended to the input PDF's base
# name (e.g. "form-fields.pdf", "form-capture.csv") — confirm in the callers.
DEFAULT_INPUT_PDF = "input.pdf"
DEFAULT_CAPTURE_SUFFIX = "-capture.csv"
DEFAULT_FIELD_GENERATOR_SUFFIX = "-field-generator.py"
DEFAULT_FILLER_SUFFIX = "-filler.py"
DEFAULT_MARKUP_SUFFIX = "-markup"
DEFAULT_FIELDS_SUFFIX = "-fields"
DEFAULT_FILL_SUFFIX = "-filled"

# Colour constants as normalized RGB tuples (each channel in the range 0..1)
COLOR_WHITE = (1, 1, 1)
COLOR_BLACK = (0, 0, 0)

# Fill colour that box extraction looks for (normalized RGB).
# Currently the same value as COLOR_WHITE.
TARGET_COLOUR = (1, 1, 1)  # RGB normalized white for fill colour detection

# Thresholds for layout grouping and gaps (arbitrary units, related to PDF coords)
# NOTE(review): the distinct roles of GAP_THRESHOLD, GAP and GAP_GROUP are not
# visible from this module — confirm against the layout code before tuning.
GAP_THRESHOLD = 3.0
GAP = 1.9
GAP_GROUP = 7.6

# Height limits for candidate boxes (units are PDF coordinate points).
# Boxes shorter than MIN_BOX_HEIGHT or taller than MAX_BOX_HEIGHT are dropped
# by extract.filter_boxes.
MIN_BOX_HEIGHT = 15.0
MAX_BOX_HEIGHT = 30.0  # Filters out signature boxes

# Field types considered numeric for special processing
NUMERIC_FIELD_TYPES = ["Currency", "CurrencyDecimal", "DollarCents", "Dollars"]

# Pages to test on, if set (list of page numbers).
# NOTE(review): None presumably means "no page restriction" — confirm in the caller.
PDF_PAGES = None
@@ -0,0 +1,176 @@
1
+ import logging
2
+ from collections import defaultdict
3
+ import fitz # PyMuPDF
4
+ from .config import (
5
+ TARGET_COLOUR,
6
+ MIN_BOX_HEIGHT,
7
+ MAX_BOX_HEIGHT,
8
+ )
9
+ from .utils import (
10
+ colour_match,
11
+ int_to_rgb,
12
+ allowed_text,
13
+ )
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+
18
def extract_boxes(pdf_path):
    """
    Collect the filled rectangles of a PDF whose fill colour matches TARGET_COLOUR.

    Every drawing on every page is inspected; a drawing is kept when it has a
    truthy ``rect`` and ``colour_match`` accepts its ``fill`` colour.

    Args:
        pdf_path (str): Path to input PDF file.

    Returns:
        list of dict: One dict per matching rectangle, holding the 1-based
        ``page_num``, the raw coordinates (``x0``, ``y0``, ``x1``, ``y1``),
        the same coordinates rounded to two decimals (``left``, ``bottom``,
        ``right``, ``top``) and the placeholders ``chars`` ("") and
        ``field_type`` (None). Vertical values are flipped from PyMuPDF's
        top-left origin to the PDF bottom-left origin.

    Note:
        Any exception raised while opening or scanning the document is logged
        and swallowed; whatever was collected up to that point is returned.
    """
    found = []
    try:
        with fitz.open(pdf_path) as doc:
            for index in range(len(doc)):
                page_num = index + 1  # page numbers are reported 1-based
                try:
                    page = doc[index]
                except IndexError:
                    logger.warning(f"Page {page_num} not found in document.")
                    continue

                page_height = page.rect.height
                for drawing in page.get_drawings():
                    rect = drawing.get("rect")
                    fill_color = drawing.get("fill")
                    if not (rect and colour_match(fill_color, target_color=TARGET_COLOUR)):
                        continue

                    # Flip the vertical axis: PyMuPDF measures from the top of
                    # the page, the PDF coordinate system from the bottom.
                    lower_edge = page_height - rect.y1
                    upper_edge = page_height - rect.y0
                    entry = {
                        "page_num": page_num,
                        "x0": rect.x0,
                        "y0": lower_edge,
                        "x1": rect.x1,
                        "y1": upper_edge,
                        "left": round(rect.x0, 2),
                        "bottom": round(lower_edge, 2),
                        "right": round(rect.x1, 2),
                        "top": round(upper_edge, 2),
                        "chars": "",
                        "field_type": None,
                    }
                    found.append(entry)
    except Exception as exc:
        logger.error(f"Could not open PDF file {pdf_path}: {exc}")
    return found
66
+
67
+
68
def filter_boxes(page, boxes):
    """
    Keep only the boxes of a page that pass the height and text checks.

    A box is discarded when its height (``y1 - y0``) lies outside
    ``MIN_BOX_HEIGHT``..``MAX_BOX_HEIGHT``, or when it contains non-black text
    that ``allowed_text`` rejects. Text found inside a box is split by span
    colour: black spans are concatenated into ``fill``, all other spans into
    ``chars``.

    Args:
        page (fitz.Page): PyMuPDF page object the boxes belong to.
        boxes (list): Box dicts using PDF (bottom-left origin) coordinates.

    Returns:
        list: The surviving box dicts. They are the same objects as passed in,
        updated in place with ``field_type``, ``chars`` and ``fill``.
    """
    kept = []
    page_height = page.rect.height
    black = (0, 0, 0)  # reference colour handed to colour_match for black text

    for box in boxes:
        box_height = box.get("y1", 0) - box.get("y0", 0)
        if box_height < MIN_BOX_HEIGHT or box_height > MAX_BOX_HEIGHT:
            continue

        # Flip back to PyMuPDF's top-left origin so the box can be used as a
        # clip rectangle for text extraction.
        clip_top = page_height - box["y1"]
        clip_bottom = page_height - box["y0"]
        clip = fitz.Rect(box["x0"], clip_top, box["x1"], clip_bottom)

        extracted = page.get_text("dict", clip=clip)

        black_parts = []
        other_parts = []
        for block in extracted.get("blocks", []):
            for line in block.get("lines", []):
                for span in line.get("spans", []):
                    content = span.get("text", "").strip()
                    if not content:
                        continue

                    # Decode the span colour; anything that is neither an int
                    # nor a resolvable colour name counts as "not black".
                    colour = span.get("color")
                    if isinstance(colour, int):
                        rgb = int_to_rgb(colour)
                    elif isinstance(colour, str):
                        try:
                            rgb = fitz.utils.getColor(colour)
                        except Exception:
                            rgb = None
                    else:
                        rgb = None

                    if rgb and colour_match(rgb, target_color=black):
                        black_parts.append(content)
                    else:
                        other_parts.append(content)

        fill_text = "".join(black_parts)
        box_text = "".join(other_parts)

        allowed, detected_field_type = allowed_text(
            box_text, field_type=box.get("field_type")
        )
        # An empty box is always kept; a box with text must be allowed.
        if box_text and not allowed:
            continue

        box.update(field_type=detected_field_type, chars=box_text, fill=fill_text)
        kept.append(box)
    return kept
131
+
132
+
133
def remove_duplicates(boxes):
    """
    Drop boxes that repeat an earlier box on the same page.

    Two boxes on one page count as duplicates when their ``x0``, ``y0``,
    ``x1`` and ``y1`` agree after rounding to three decimals; the first
    occurrence is kept.

    Args:
        boxes (list): List of box dicts, each with ``page_num`` and the four
            coordinate keys.

    Returns:
        list: The unique boxes, grouped by page in the order each page first
        appears in ``boxes`` and in input order within a page.
    """
    per_page = defaultdict(list)
    for item in boxes:
        per_page[item["page_num"]].append(item)

    unique = []
    for members in per_page.values():
        fingerprints = set()
        for item in members:
            fingerprint = tuple(
                round(item[name], 3) for name in ("x0", "y0", "x1", "y1")
            )
            if fingerprint in fingerprints:
                continue
            fingerprints.add(fingerprint)
            unique.append(item)
    return unique
160
+
161
+
162
def sort_boxes(boxes, decimal_places=0):
    """
    Order boxes in reading order: by page, then top-to-bottom, then left-to-right.

    Args:
        boxes (list): List of box dicts with ``page_num``, ``bottom`` and ``left``.
        decimal_places (int): Rounding precision applied to ``bottom`` before
            comparing, so boxes at nearly the same height sort as one row.

    Returns:
        list: A new sorted list; the input list is left untouched.
    """

    def reading_order(box):
        page = box["page_num"]
        # Negate the rounded bottom so larger values (higher on the page,
        # given a bottom-left origin) come first.
        row = -round(box["bottom"], decimal_places)
        return page, row, box["left"]

    return sorted(boxes, key=reading_order)