xfa-extract 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xfa_extract-0.1.0/LICENSE +21 -0
- xfa_extract-0.1.0/PKG-INFO +133 -0
- xfa_extract-0.1.0/README.md +103 -0
- xfa_extract-0.1.0/pyproject.toml +48 -0
- xfa_extract-0.1.0/setup.cfg +4 -0
- xfa_extract-0.1.0/src/xfa_extract/__init__.py +26 -0
- xfa_extract-0.1.0/src/xfa_extract/cli.py +438 -0
- xfa_extract-0.1.0/src/xfa_extract.egg-info/PKG-INFO +133 -0
- xfa_extract-0.1.0/src/xfa_extract.egg-info/SOURCES.txt +12 -0
- xfa_extract-0.1.0/src/xfa_extract.egg-info/dependency_links.txt +1 -0
- xfa_extract-0.1.0/src/xfa_extract.egg-info/entry_points.txt +2 -0
- xfa_extract-0.1.0/src/xfa_extract.egg-info/requires.txt +9 -0
- xfa_extract-0.1.0/src/xfa_extract.egg-info/top_level.txt +1 -0
- xfa_extract-0.1.0/tests/test_acceptance.py +109 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Ryan Kashtan
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,133 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: xfa-extract
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Read entered values out of XFA / LiveCycle 'dynamic' PDF forms (IRCC and other government forms) that standard PDF field extraction misses.
|
|
5
|
+
Author: Ryan Kashtan
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/ryanjkashtan/xfa-extract
|
|
8
|
+
Project-URL: Issues, https://github.com/ryanjkashtan/xfa-extract/issues
|
|
9
|
+
Keywords: xfa,pdf,livecycle,acroform,forms,form-data,ircc,immigration,pypdf,datasets,extraction,dynamic-pdf
|
|
10
|
+
Classifier: Development Status :: 4 - Beta
|
|
11
|
+
Classifier: Environment :: Console
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
14
|
+
Classifier: Operating System :: OS Independent
|
|
15
|
+
Classifier: Programming Language :: Python :: 3
|
|
16
|
+
Classifier: Topic :: Office/Business
|
|
17
|
+
Classifier: Topic :: Text Processing :: Markup :: XML
|
|
18
|
+
Classifier: Topic :: Utilities
|
|
19
|
+
Requires-Python: >=3.9
|
|
20
|
+
Description-Content-Type: text/markdown
|
|
21
|
+
License-File: LICENSE
|
|
22
|
+
Requires-Dist: pypdf>=4.0
|
|
23
|
+
Requires-Dist: lxml>=5.0
|
|
24
|
+
Provides-Extra: robust
|
|
25
|
+
Requires-Dist: pikepdf>=8.0; extra == "robust"
|
|
26
|
+
Provides-Extra: test
|
|
27
|
+
Requires-Dist: pytest>=7; extra == "test"
|
|
28
|
+
Requires-Dist: pikepdf>=8.0; extra == "test"
|
|
29
|
+
Dynamic: license-file
|
|
30
|
+
|
|
31
|
+
# xfa-extract
|
|
32
|
+
|
|
33
|
+
[](https://pypi.org/project/xfa-extract/)
|
|
34
|
+
[](https://pypi.org/project/xfa-extract/)
|
|
35
|
+
[](LICENSE)
|
|
36
|
+
|
|
37
|
+
**Read the entered values out of XFA / LiveCycle "dynamic" PDF forms — the ones where
|
|
38
|
+
`pypdf.get_fields()` comes back empty even though the form is clearly filled in.**
|
|
39
|
+
|
|
40
|
+
If you've ever hit this:
|
|
41
|
+
|
|
42
|
+
> I filled out a government PDF (an IRCC immigration form, a tax form, …), but when I run
|
|
43
|
+
> `PdfReader(...).get_fields()` or `pdftk dump_data_fields`, the values are **blank or
|
|
44
|
+
> missing**. The form *template* text extracts fine, but **none of the answers show up.**
|
|
45
|
+
> Or the PDF just shows *"Please wait… If this message is not eventually replaced…"*.
|
|
46
|
+
|
|
47
|
+
…then your PDF is an **XFA form**, and this tool reads it.
|
|
48
|
+
|
|
49
|
+
## Why standard extraction misses the data
|
|
50
|
+
|
|
51
|
+
A normal interactive PDF (an **AcroForm**) stores each field's value in its `/V` entry —
|
|
52
|
+
`pypdf`, `pdftk`, `pdfminer` all read those fine.
|
|
53
|
+
|
|
54
|
+
An **XFA form** (Adobe LiveCycle / "dynamic" PDF — what most government and immigration forms
|
|
55
|
+
are) does **not** keep the entered data in `/V`. It keeps it in an XML packet inside the
|
|
56
|
+
AcroForm dictionary under the `/XFA` key, in a sub-packet called **`datasets`**. So
|
|
57
|
+
`get_fields()` and text extraction look blank even on a fully completed form. `xfa-extract`
|
|
58
|
+
detects XFA, pulls the `datasets` packet, parses it, and gives you the field → value map.
|
|
59
|
+
|
|
60
|
+
**Read-only.** It never writes to or mutates your PDF.
|
|
61
|
+
|
|
62
|
+
## Install
|
|
63
|
+
|
|
64
|
+
```bash
|
|
65
|
+
pip install xfa-extract # core (pypdf + lxml)
|
|
66
|
+
pip install "xfa-extract[robust]" # + pikepdf fallback for unusual PDFs
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
## Use it — command line
|
|
70
|
+
|
|
71
|
+
```bash
|
|
72
|
+
xfa-extract FORM.pdf # human-readable tree + flat "path: value" table
|
|
73
|
+
xfa-extract FORM.pdf --json # machine-readable JSON (for scripts / LLMs)
|
|
74
|
+
xfa-extract FORM.pdf --flatten # just the path: value table
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
Every run also writes the raw `datasets` XML to `--raw-out` (default `./xfa_datasets.xml`)
|
|
78
|
+
for auditing.
|
|
79
|
+
|
|
80
|
+
```text
|
|
81
|
+
$ xfa-extract application.pdf --flatten
|
|
82
|
+
form1.PersonalInfo.Surname Smith
|
|
83
|
+
form1.PersonalInfo.GivenName Jane
|
|
84
|
+
form1.Dependents.Dependent[0].Name Alex
|
|
85
|
+
form1.Dependents.Dependent[1].Name Sam
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
Repeating sections (multiple dependents, applicants, addresses) are **indexed**
|
|
89
|
+
(`Dependent[0]`, `Dependent[1]`, …), never collapsed.
|
|
90
|
+
|
|
91
|
+
## Use it — as a library
|
|
92
|
+
|
|
93
|
+
```python
|
|
94
|
+
from xfa_extract import locate_datasets, parse_datasets
|
|
95
|
+
|
|
96
|
+
kind, datasets, _packets, _engine = locate_datasets("FORM.pdf")
|
|
97
|
+
if kind == "xfa" and datasets:
|
|
98
|
+
tree, flat = parse_datasets(datasets)
|
|
99
|
+
print(flat["form1.PersonalInfo.Surname"]) # -> "Smith"
|
|
100
|
+
```
|
|
101
|
+
|
|
102
|
+
## Exit codes (the CLI tells you which case you're in)
|
|
103
|
+
|
|
104
|
+
| code | meaning | what to do |
|
|
105
|
+
|------|---------|------------|
|
|
106
|
+
| `0` | XFA data extracted (≥1 non-empty value) | use the values |
|
|
107
|
+
| `2` | **not XFA** — AcroForm-only or no form | use `get_fields()`; the tool prints those values for you as a convenience |
|
|
108
|
+
| `3` | XFA but no `datasets`, or the form is empty/unfilled | report "unfilled / no entered data" |
|
|
109
|
+
| `4` | parse failure | the raw XML is still written to `--raw-out` for inspection |
|
|
110
|
+
|
|
111
|
+
## Tested against real forms
|
|
112
|
+
|
|
113
|
+
Validated on synthetic fixtures (in CI) **and** 14 real-world XFA forms — IRCC IMM5257 /
|
|
114
|
+
1295 / 1344 / 5710 / 5669, a DHL waybill, an Indian MCA MGT-7, an Ontario lease, a US DOL
|
|
115
|
+
form, French CERFA — plus a real filled Canadian Proof-of-Citizenship application. Repeating
|
|
116
|
+
sections, namespaces, Adobe's quirky tag serialization, and base64-image-bearing datasets all
|
|
117
|
+
handled. See [`docs/xfa-internals.md`](docs/xfa-internals.md) for the deep dive.
|
|
118
|
+
|
|
119
|
+
## What it does **not** do
|
|
120
|
+
|
|
121
|
+
- **Fill / write** values into XFA forms — fragile, and the failure mode on a legal document
|
|
122
|
+
is bad. (Use a dedicated form-filling tool.)
|
|
123
|
+
- **Flatten / render** XFA to static pages — different operation.
|
|
124
|
+
- **OCR** — these are digital forms, not scans.
|
|
125
|
+
|
|
126
|
+
## Use it with Claude / Claude Code
|
|
127
|
+
|
|
128
|
+
This repo also ships an [Agent Skill](skill/SKILL.md) so Claude Code automatically reaches
|
|
129
|
+
for it when a fillable PDF's values come back blank. Point Claude at `skill/`.
|
|
130
|
+
|
|
131
|
+
## License
|
|
132
|
+
|
|
133
|
+
MIT © Ryan Kashtan. See [LICENSE](LICENSE).
|
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
# xfa-extract
|
|
2
|
+
|
|
3
|
+
[PyPI version](https://pypi.org/project/xfa-extract/)
|
|
4
|
+
[Supported Python versions](https://pypi.org/project/xfa-extract/)
|
|
5
|
+
[License: MIT](LICENSE)
|
|
6
|
+
|
|
7
|
+
**Read the entered values out of XFA / LiveCycle "dynamic" PDF forms — the ones where
|
|
8
|
+
`pypdf.get_fields()` comes back empty even though the form is clearly filled in.**
|
|
9
|
+
|
|
10
|
+
If you've ever hit this:
|
|
11
|
+
|
|
12
|
+
> I filled out a government PDF (an IRCC immigration form, a tax form, …), but when I run
|
|
13
|
+
> `PdfReader(...).get_fields()` or `pdftk dump_data_fields`, the values are **blank or
|
|
14
|
+
> missing**. The form *template* text extracts fine, but **none of the answers show up.**
|
|
15
|
+
> Or the PDF just shows *"Please wait… If this message is not eventually replaced…"*.
|
|
16
|
+
|
|
17
|
+
…then your PDF is an **XFA form**, and this tool reads it.
|
|
18
|
+
|
|
19
|
+
## Why standard extraction misses the data
|
|
20
|
+
|
|
21
|
+
A normal interactive PDF (an **AcroForm**) stores each field's value in its `/V` entry —
|
|
22
|
+
`pypdf`, `pdftk`, `pdfminer` all read those fine.
|
|
23
|
+
|
|
24
|
+
An **XFA form** (Adobe LiveCycle / "dynamic" PDF — what most government and immigration forms
|
|
25
|
+
are) does **not** keep the entered data in `/V`. It keeps it in an XML packet inside the
|
|
26
|
+
AcroForm dictionary under the `/XFA` key, in a sub-packet called **`datasets`**. So
|
|
27
|
+
`get_fields()` and text extraction look blank even on a fully completed form. `xfa-extract`
|
|
28
|
+
detects XFA, pulls the `datasets` packet, parses it, and gives you the field → value map.
|
|
29
|
+
|
|
30
|
+
**Read-only.** It never writes to or mutates your PDF.
|
|
31
|
+
|
|
32
|
+
## Install
|
|
33
|
+
|
|
34
|
+
```bash
|
|
35
|
+
pip install xfa-extract # core (pypdf + lxml)
|
|
36
|
+
pip install "xfa-extract[robust]" # + pikepdf fallback for unusual PDFs
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
## Use it — command line
|
|
40
|
+
|
|
41
|
+
```bash
|
|
42
|
+
xfa-extract FORM.pdf # human-readable tree + flat "path: value" table
|
|
43
|
+
xfa-extract FORM.pdf --json # machine-readable JSON (for scripts / LLMs)
|
|
44
|
+
xfa-extract FORM.pdf --flatten # just the path: value table
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
Every run also writes the raw `datasets` XML to `--raw-out` (default `./xfa_datasets.xml`)
|
|
48
|
+
for auditing.
|
|
49
|
+
|
|
50
|
+
```text
|
|
51
|
+
$ xfa-extract application.pdf --flatten
|
|
52
|
+
form1.PersonalInfo.Surname Smith
|
|
53
|
+
form1.PersonalInfo.GivenName Jane
|
|
54
|
+
form1.Dependents.Dependent[0].Name Alex
|
|
55
|
+
form1.Dependents.Dependent[1].Name Sam
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
Repeating sections (multiple dependents, applicants, addresses) are **indexed**
|
|
59
|
+
(`Dependent[0]`, `Dependent[1]`, …), never collapsed.
|
|
60
|
+
|
|
61
|
+
## Use it — as a library
|
|
62
|
+
|
|
63
|
+
```python
|
|
64
|
+
from xfa_extract import locate_datasets, parse_datasets
|
|
65
|
+
|
|
66
|
+
kind, datasets, _packets, _engine = locate_datasets("FORM.pdf")
|
|
67
|
+
if kind == "xfa" and datasets:
|
|
68
|
+
tree, flat = parse_datasets(datasets)
|
|
69
|
+
print(flat["form1.PersonalInfo.Surname"]) # -> "Smith"
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
## Exit codes (the CLI tells you which case you're in)
|
|
73
|
+
|
|
74
|
+
| code | meaning | what to do |
|
|
75
|
+
|------|---------|------------|
|
|
76
|
+
| `0` | XFA data extracted (≥1 non-empty value) | use the values |
|
|
77
|
+
| `2` | **not XFA** — AcroForm-only or no form | use `get_fields()`; the tool prints those values for you as a convenience |
|
|
78
|
+
| `3` | XFA but no `datasets`, or the form is empty/unfilled | report "unfilled / no entered data" |
|
|
79
|
+
| `4` | parse failure | the raw XML is still written to `--raw-out` for inspection |
|
|
80
|
+
|
|
81
|
+
## Tested against real forms
|
|
82
|
+
|
|
83
|
+
Validated on synthetic fixtures (in CI) **and** 14 real-world XFA forms — IRCC IMM5257 /
|
|
84
|
+
1295 / 1344 / 5710 / 5669, a DHL waybill, an Indian MCA MGT-7, an Ontario lease, a US DOL
|
|
85
|
+
form, French CERFA — plus a real filled Canadian Proof-of-Citizenship application. Repeating
|
|
86
|
+
sections, namespaces, Adobe's quirky tag serialization, and base64-image-bearing datasets are all
|
|
87
|
+
handled. See [`docs/xfa-internals.md`](docs/xfa-internals.md) for the deep dive.
|
|
88
|
+
|
|
89
|
+
## What it does **not** do
|
|
90
|
+
|
|
91
|
+
- **Fill / write** values into XFA forms — fragile, and the failure mode on a legal document
|
|
92
|
+
is bad. (Use a dedicated form-filling tool.)
|
|
93
|
+
- **Flatten / render** XFA to static pages — different operation.
|
|
94
|
+
- **OCR** — these are digital forms, not scans.
|
|
95
|
+
|
|
96
|
+
## Use it with Claude / Claude Code
|
|
97
|
+
|
|
98
|
+
This repo also ships an [Agent Skill](skill/SKILL.md) so Claude Code automatically reaches
|
|
99
|
+
for it when a fillable PDF's values come back blank. Point Claude at `skill/`.
|
|
100
|
+
|
|
101
|
+
## License
|
|
102
|
+
|
|
103
|
+
MIT © Ryan Kashtan. See [LICENSE](LICENSE).
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=64"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "xfa-extract"
|
|
7
|
+
dynamic = ["version"]
|
|
8
|
+
description = "Read entered values out of XFA / LiveCycle 'dynamic' PDF forms (IRCC and other government forms) that standard PDF field extraction misses."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.9"
|
|
11
|
+
license = { text = "MIT" }
|
|
12
|
+
authors = [{ name = "Ryan Kashtan" }]
|
|
13
|
+
keywords = [
|
|
14
|
+
"xfa", "pdf", "livecycle", "acroform", "forms", "form-data",
|
|
15
|
+
"ircc", "immigration", "pypdf", "datasets", "extraction", "dynamic-pdf",
|
|
16
|
+
]
|
|
17
|
+
classifiers = [
|
|
18
|
+
"Development Status :: 4 - Beta",
|
|
19
|
+
"Environment :: Console",
|
|
20
|
+
"Intended Audience :: Developers",
|
|
21
|
+
"License :: OSI Approved :: MIT License",
|
|
22
|
+
"Operating System :: OS Independent",
|
|
23
|
+
"Programming Language :: Python :: 3",
|
|
24
|
+
"Topic :: Office/Business",
|
|
25
|
+
"Topic :: Text Processing :: Markup :: XML",
|
|
26
|
+
"Topic :: Utilities",
|
|
27
|
+
]
|
|
28
|
+
dependencies = [
|
|
29
|
+
"pypdf>=4.0",
|
|
30
|
+
"lxml>=5.0",
|
|
31
|
+
]
|
|
32
|
+
|
|
33
|
+
[project.optional-dependencies]
|
|
34
|
+
robust = ["pikepdf>=8.0"] # robust object/stream fallback for unusual PDFs
|
|
35
|
+
test = ["pytest>=7", "pikepdf>=8.0"] # pikepdf is needed to build the test fixtures
|
|
36
|
+
|
|
37
|
+
[project.urls]
|
|
38
|
+
Homepage = "https://github.com/ryanjkashtan/xfa-extract"
|
|
39
|
+
Issues = "https://github.com/ryanjkashtan/xfa-extract/issues"
|
|
40
|
+
|
|
41
|
+
[project.scripts]
|
|
42
|
+
xfa-extract = "xfa_extract.cli:main"
|
|
43
|
+
|
|
44
|
+
[tool.setuptools.dynamic]
|
|
45
|
+
version = { attr = "xfa_extract.__version__" }
|
|
46
|
+
|
|
47
|
+
[tool.setuptools.packages.find]
|
|
48
|
+
where = ["src"]
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
"""xfa-extract — read the entered values out of XFA / LiveCycle "dynamic" PDF forms.
|
|
2
|
+
|
|
3
|
+
Most government and immigration forms (e.g. IRCC) are XFA forms: they keep entered data in
|
|
4
|
+
an XML `datasets` packet under /AcroForm /XFA, not in the /V layer that pypdf.get_fields()
|
|
5
|
+
and pdftk read. This package detects XFA, pulls that packet, and emits field -> value maps.
|
|
6
|
+
Read-only; never mutates the source PDF.
|
|
7
|
+
"""
|
|
8
|
+
from .cli import ( # noqa: F401
|
|
9
|
+
locate_datasets,
|
|
10
|
+
parse_datasets,
|
|
11
|
+
get_datasets_via_pypdf,
|
|
12
|
+
get_datasets_via_pikepdf,
|
|
13
|
+
read_acroform_fields,
|
|
14
|
+
main,
|
|
15
|
+
)
|
|
16
|
+
|
|
17
|
+
__version__ = "0.1.0"
|
|
18
|
+
__all__ = [
|
|
19
|
+
"locate_datasets",
|
|
20
|
+
"parse_datasets",
|
|
21
|
+
"get_datasets_via_pypdf",
|
|
22
|
+
"get_datasets_via_pikepdf",
|
|
23
|
+
"read_acroform_fields",
|
|
24
|
+
"main",
|
|
25
|
+
"__version__",
|
|
26
|
+
]
|
|
@@ -0,0 +1,438 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""xfa_extract.cli — read the entered values out of XFA / LiveCycle "dynamic" PDF forms.
|
|
3
|
+
|
|
4
|
+
A normal interactive PDF (an AcroForm) stores each field's value as a /V entry, which
|
|
5
|
+
pypdf.get_fields() and pdftk read fine. An XFA form (Adobe LiveCycle / "dynamic" PDF — most
|
|
6
|
+
government / immigration forms, e.g. IRCC) keeps the entered data in an XML packet under
|
|
7
|
+
/AcroForm /XFA, in a sub-packet named `datasets`. So get_fields() looks blank even though
|
|
8
|
+
the form is filled. This tool detects XFA, pulls the `datasets` packet, parses it, and emits
|
|
9
|
+
field -> value mappings plus the raw XML for auditing.
|
|
10
|
+
|
|
11
|
+
READING ONLY. Never mutates the source PDF. See SKILL.md / REFERENCE.md for context.
|
|
12
|
+
|
|
13
|
+
Exit codes:
|
|
14
|
+
0 success — XFA data extracted (at least one non-empty value)
|
|
15
|
+
2 not XFA — AcroForm-only or no form at all; use standard get_fields()
|
|
16
|
+
3 XFA but no `datasets` packet, or the form is empty / unfilled
|
|
17
|
+
4 parse failure — raw XML written to --raw-out, but no parseable root; also returned when the PDF is missing or cannot be opened
|
|
18
|
+
"""
|
|
19
|
+
from __future__ import annotations
|
|
20
|
+
|
|
21
|
+
import argparse
|
|
22
|
+
import json
|
|
23
|
+
import sys
|
|
24
|
+
|
|
25
|
+
# The xfa-data namespace URI is fixed by the XFA spec; match on it exactly.
|
|
26
|
+
XFA_DATA_NS = "http://www.xfa.org/schema/xfa-data/1.0/"
|
|
27
|
+
|
|
28
|
+
# Guard against pathological nesting (§4.10) — real forms are nowhere near this deep.
|
|
29
|
+
MAX_DEPTH = 400
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
# --------------------------------------------------------------------------------------
|
|
33
|
+
# Locating the /XFA datasets packet
|
|
34
|
+
# --------------------------------------------------------------------------------------
|
|
35
|
+
|
|
36
|
+
def _packets_from_pypdf_array(xfa) -> dict:
|
|
37
|
+
"""Walk an /XFA array of alternating (name, stream) pairs into {name: bytes}."""
|
|
38
|
+
packets = {}
|
|
39
|
+
it = iter(xfa)
|
|
40
|
+
for name in it:
|
|
41
|
+
try:
|
|
42
|
+
stream = next(it)
|
|
43
|
+
except StopIteration:
|
|
44
|
+
break # odd-length array — tolerate it
|
|
45
|
+
try:
|
|
46
|
+
packets[str(name)] = stream.get_object().get_data()
|
|
47
|
+
except Exception:
|
|
48
|
+
continue # a single unreadable packet shouldn't sink the rest
|
|
49
|
+
return packets
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def get_datasets_via_pypdf(pdf_path: str):
    """Locate the XFA ``datasets`` packet using pypdf.

    Returns a ``(form_kind, datasets_bytes, packets)`` tuple where
    ``form_kind`` is one of ``"xfa"``, ``"acroform"`` or ``"none"``:

    * ``"none"``     — the catalog has no /AcroForm entry.
    * ``"acroform"`` — there is an /AcroForm but it has no /XFA entry.
    * ``"xfa"``      — /XFA is present. For the array form, ``packets`` maps
      each packet name to its bytes and ``datasets_bytes`` is the ``datasets``
      entry (``None`` if absent). For the single-stream form the whole stream
      is returned as the datasets bytes and stored under ``"__xdp__"``.
    """
    from pypdf import PdfReader
    from pypdf.generic import ArrayObject

    reader = PdfReader(pdf_path)
    catalog = reader.trailer["/Root"]

    form_ref = catalog.get("/AcroForm")
    if form_ref is None:
        return ("none", None, {})

    xfa_ref = form_ref.get_object().get("/XFA")
    if xfa_ref is None:
        return ("acroform", None, {})

    xfa_obj = xfa_ref.get_object()
    if not isinstance(xfa_obj, ArrayObject):
        # Single XDP stream — the datasets element lives somewhere inside it.
        xdp = xfa_obj.get_data()
        return ("xfa", xdp, {"__xdp__": xdp})

    found = _packets_from_pypdf_array(xfa_obj)
    return ("xfa", found.get("datasets"), found)
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def get_datasets_via_pikepdf(pdf_path: str):
    """Locate the XFA ``datasets`` packet using pikepdf (the fallback engine).

    Returns the same ``(form_kind, datasets_bytes, packets)`` shape as the
    pypdf variant: ``"none"`` when there is no AcroForm, ``"acroform"`` when
    the AcroForm has no XFA entry, otherwise ``"xfa"``.
    """
    import pikepdf

    with pikepdf.open(pdf_path) as doc:
        try:
            form = doc.Root.AcroForm
        except (AttributeError, KeyError):
            return ("none", None, {})
        try:
            xfa_obj = form.XFA
        except (AttributeError, KeyError):
            return ("acroform", None, {})

        if not isinstance(xfa_obj, pikepdf.Array):
            # Single XDP stream: hand the whole document back as the datasets bytes.
            xdp = bytes(xfa_obj.read_bytes())
            return ("xfa", xdp, {"__xdp__": xdp})

        entries = list(xfa_obj)
        found = {}
        # Names sit at even indices, their streams at the following odd index;
        # a dangling last name is dropped by zip().
        for label, stream in zip(entries[0::2], entries[1::2]):
            try:
                found[str(label)] = bytes(stream.read_bytes())
            except Exception:
                # Unreadable packet — skip it, keep the others.
                continue
        return ("xfa", found.get("datasets"), found)
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def locate_datasets(pdf_path: str):
    """Resolve the XFA ``datasets`` packet, trying pypdf first and pikepdf second.

    Returns ``(form_kind, datasets_bytes, packets, engine)`` where ``engine``
    is ``"pypdf"`` or ``"pikepdf"`` depending on which reader produced the
    result.

    Raises:
        FileNotFoundError: ``pdf_path`` does not exist. This is propagated
            directly instead of being treated as "pypdf could not parse the
            file", so callers get the real cause whether or not pikepdf is
            installed.
        RuntimeError: pypdf failed to open the file and pikepdf is not
            installed to act as the fallback.
        Exception: whatever pikepdf raised, when pypdf had already failed and
            the pikepdf fallback failed as well.
    """
    try:
        kind, datasets, packets = get_datasets_via_pypdf(pdf_path)
        engine = "pypdf"
    except FileNotFoundError:
        # A missing file is not a corrupt PDF; no fallback engine can help and
        # an "install pikepdf" hint would be misleading.
        raise
    except Exception as exc:  # corrupt enough that pypdf can't even open it
        kind, datasets, packets, engine = None, None, {}, None
        pypdf_error = exc
    else:
        pypdf_error = None

    # Fall back to pikepdf when pypdf failed outright, or found XFA but couldn't
    # resolve the datasets stream to bytes (§7).
    needs_fallback = kind is None or (kind == "xfa" and datasets is None)
    if needs_fallback:
        try:
            import pikepdf  # noqa: F401

            k2, d2, p2 = get_datasets_via_pikepdf(pdf_path)
            # Prefer pikepdf's answer only if pypdf had nothing at all, or
            # pikepdf actually recovered the datasets bytes.
            if kind is None or d2 is not None:
                return (k2, d2, p2, "pikepdf")
        except ImportError:
            if kind is None:
                raise RuntimeError(
                    f"pypdf could not open {pdf_path!r} ({pypdf_error}); install pikepdf "
                    "for the robust fallback."
                ) from pypdf_error
        except Exception:
            # pikepdf failed too: only fatal when pypdf gave us nothing to return.
            if kind is None:
                raise
    return (kind, datasets, packets, engine)
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
# --------------------------------------------------------------------------------------
|
|
141
|
+
# Parsing datasets XML -> field map
|
|
142
|
+
# --------------------------------------------------------------------------------------
|
|
143
|
+
|
|
144
|
+
def _localname(el) -> "str | None":
|
|
145
|
+
tag = el.tag
|
|
146
|
+
if not isinstance(tag, str):
|
|
147
|
+
return None # comment / processing-instruction
|
|
148
|
+
return tag.split("}", 1)[1] if tag.startswith("{") else tag
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
def _namespace(el) -> "str | None":
|
|
152
|
+
tag = el.tag
|
|
153
|
+
if isinstance(tag, str) and tag.startswith("{"):
|
|
154
|
+
return tag[1:].split("}", 1)[0]
|
|
155
|
+
return None
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
def _find_data_element(root):
    """Pick the element whose children are the form's data.

    Works for a bare datasets packet as well as a whole XDP document.
    Preference order:

    1. the first ``data`` element in the xfa-data namespace;
    2. otherwise the first element named ``data`` in any namespace;
    3. otherwise ``root`` itself, unless ``root`` is an ``xdp``, ``template``
       or ``config`` element, in which case ``None`` is returned.
    """
    first_any_ns = None
    for el in root.iter():
        if _localname(el) != "data":
            continue
        if _namespace(el) == XFA_DATA_NS:
            return el
        if first_any_ns is None:
            first_any_ns = el  # remembered in case no namespaced match exists
    if first_any_ns is not None:
        return first_any_ns

    # No <data> anywhere: a datasets root (or a directly supplied data subtree)
    # is itself treated as the container.
    if _localname(root) in ("xdp", "template", "config"):
        return None
    return root
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
def element_to_node(el, depth: int = 0):
    """Convert an XML element into plain Python data.

    A childless element becomes its text (``""`` when the text is missing or
    only whitespace). An element with children becomes a dict keyed by child
    local name, in first-seen order; a name that occurs more than once under
    the same parent maps to a list of converted children. Recursion deeper
    than ``MAX_DEPTH`` is cut off with a placeholder string.
    """
    if depth > MAX_DEPTH:
        return "...<max depth exceeded>..."

    # Only real elements count as children; comments / PIs have non-str tags.
    kids = [child for child in el if isinstance(child.tag, str)]
    if not kids:
        content = el.text or ""
        return content if content.strip() else ""

    # dicts keep insertion order, so this also records first-seen order.
    by_name: dict = {}
    for kid in kids:
        by_name.setdefault(_localname(kid), []).append(kid)

    result: dict = {}
    for tag_name, members in by_name.items():
        converted = [element_to_node(member, depth + 1) for member in members]
        result[tag_name] = converted[0] if len(converted) == 1 else converted
    return result
|
|
200
|
+
|
|
201
|
+
|
|
202
|
+
def flatten(node, out: dict, prefix: str = "") -> None:
    """Write every leaf of ``node`` into ``out`` under a dotted/indexed path.

    Dict keys are joined with ``.``, list positions are appended as ``[i]``,
    and anything that is neither a dict nor a list is stored as a leaf value
    at the current ``prefix``.
    """
    if isinstance(node, dict):
        for key, value in node.items():
            child_path = f"{prefix}.{key}" if prefix else key
            flatten(value, out, child_path)
        return
    if isinstance(node, list):
        for position, item in enumerate(node):
            flatten(item, out, f"{prefix}[{position}]")
        return
    out[prefix] = node
|
|
211
|
+
|
|
212
|
+
|
|
213
|
+
def parse_datasets(datasets_bytes: bytes):
    """Parse datasets XML into ``(tree, flat)``.

    ``tree`` is the nested dict/list/str structure built from the data
    container; ``flat`` maps dotted paths to leaf values. Both are empty dicts
    when no data container can be identified.

    Raises:
        ValueError: the recovering parser could not produce a root element.
    """
    from lxml import etree

    # recover=True tolerates malformed markup; entities are not resolved.
    tolerant = etree.XMLParser(recover=True, ns_clean=True, resolve_entities=False)
    doc_root = etree.fromstring(datasets_bytes, parser=tolerant)
    if doc_root is None:
        raise ValueError("no parseable XML root (datasets packet is unrecoverable)")

    container = _find_data_element(doc_root)
    if container is None:
        return ({}, {})

    tree = element_to_node(container)
    # A container holding only bare text comes back as a str — wrap it so the
    # result is always a dict.
    tree = tree if isinstance(tree, dict) else {"value": tree}

    flat: dict = {}
    flatten(tree, flat)
    return (tree, flat)
|
|
231
|
+
|
|
232
|
+
|
|
233
|
+
# --------------------------------------------------------------------------------------
|
|
234
|
+
# Rendering
|
|
235
|
+
# --------------------------------------------------------------------------------------
|
|
236
|
+
|
|
237
|
+
def _display(value, limit: int = 80) -> str:
|
|
238
|
+
"""Human-view rendering of a leaf value: collapse internal whitespace/newlines to keep
|
|
239
|
+
one field per line, and truncate very long values (e.g. base64 images some XFA forms
|
|
240
|
+
embed in the datasets, which arrive with embedded line breaks). The --json output and
|
|
241
|
+
the raw XML keep the full, faithful value."""
|
|
242
|
+
if value == "":
|
|
243
|
+
return "(empty)"
|
|
244
|
+
raw = str(value)
|
|
245
|
+
s = " ".join(raw.split()) # one physical line, no matter how the value was wrapped
|
|
246
|
+
return f"{s[:limit]}… ({len(raw)} chars)" if len(s) > limit else s
|
|
247
|
+
|
|
248
|
+
|
|
249
|
+
def render_human_tree(node, lines: list, key=None, indent: int = 0) -> None:
    """Append an indented, human-readable rendering of ``node`` to ``lines``.

    Dicts emit a ``key:`` header (when ``key`` is given) and render their
    entries one level deeper; lists render each item at the same level under
    ``key[i]``; everything else is emitted as ``key: value`` via ``_display``.
    """
    if isinstance(node, list):
        for position, item in enumerate(node):
            render_human_tree(item, lines, f"{key}[{position}]", indent)
        return

    margin = " " * indent
    if not isinstance(node, dict):
        lines.append(f"{margin}{key}: {_display(node)}")
        return

    if key is None:
        # Unnamed (top-level) dict: no header, children stay at this level.
        nested_indent = indent
    else:
        lines.append(f"{margin}{key}:")
        nested_indent = indent + 1
    for child_key, child in node.items():
        render_human_tree(child, lines, child_key, nested_indent)
|
|
263
|
+
|
|
264
|
+
|
|
265
|
+
def render_flat_table(flat: dict) -> str:
    """Render ``flat`` as one ``path value`` row per entry, paths padded to equal width.

    Returns the literal ``(no fields)`` when ``flat`` is empty.
    """
    if not flat:
        return "(no fields)"
    column = max(map(len, flat))
    rows = [f"{path.ljust(column)} {_display(value)}" for path, value in flat.items()]
    return "\n".join(rows)
|
|
270
|
+
|
|
271
|
+
|
|
272
|
+
# --------------------------------------------------------------------------------------
|
|
273
|
+
# AcroForm fallback values (so a non-XFA caller isn't left empty-handed)
|
|
274
|
+
# --------------------------------------------------------------------------------------
|
|
275
|
+
|
|
276
|
+
def read_acroform_fields(pdf_path: str) -> dict:
    """Best-effort read of AcroForm field values through pypdf's ``get_fields()``.

    Returns ``{field_name: value_as_str}``, with ``""`` for fields that have no
    ``/V`` entry. Any failure at all (pypdf missing, unreadable file, no
    fields) results in an empty dict rather than an exception.
    """

    def _value_text(field) -> str:
        raw = field.get("/V") if hasattr(field, "get") else None
        return "" if raw is None else str(raw)

    try:
        from pypdf import PdfReader

        fields = PdfReader(pdf_path).get_fields() or {}
        return {str(name): _value_text(field) for name, field in fields.items()}
    except Exception:
        # Convenience output only — never let it break the caller.
        return {}
|
|
290
|
+
|
|
291
|
+
|
|
292
|
+
# --------------------------------------------------------------------------------------
|
|
293
|
+
# CLI
|
|
294
|
+
# --------------------------------------------------------------------------------------
|
|
295
|
+
|
|
296
|
+
def _emit(msg: str, quiet: bool) -> None:
|
|
297
|
+
if not quiet:
|
|
298
|
+
print(msg, file=sys.stderr)
|
|
299
|
+
|
|
300
|
+
|
|
301
|
+
def main(argv=None) -> int:
    """Command-line entry point: extract entered values from an XFA PDF form.

    Args:
        argv: argument list to parse; ``None`` lets argparse read ``sys.argv``.

    Returns:
        The process exit code:
        ``0`` - XFA data extracted with at least one non-empty value;
        ``2`` - not XFA (AcroForm-only, or no form at all);
        ``3`` - XFA without a ``datasets`` packet, or every field is empty;
        ``4`` - the PDF could not be opened, or the datasets XML did not parse.
    """
    ap = argparse.ArgumentParser(
        description="Extract entered values from XFA / LiveCycle dynamic PDF forms (read-only)."
    )
    ap.add_argument("pdf", help="path to the PDF form")
    ap.add_argument("--json", action="store_true", help="emit only the JSON object to stdout")
    ap.add_argument("--flatten", action="store_true", help="print only the path: value table")
    ap.add_argument("--raw-out", default="./xfa_datasets.xml",
                    help="where to write the raw datasets XML (default ./xfa_datasets.xml)")
    ap.add_argument("--quiet", action="store_true", help="suppress informational notes on stderr")
    args = ap.parse_args(argv)

    try:
        kind, datasets, _packets, engine = locate_datasets(args.pdf)
    except FileNotFoundError:
        print(f"error: file not found: {args.pdf}", file=sys.stderr)
        return 4
    except Exception as exc:
        print(f"error: could not open PDF: {exc}", file=sys.stderr)
        return 4

    # ---- not XFA: AcroForm-only or no form at all (exit 2) -------------------------
    if kind in ("acroform", "none"):
        af_fields = read_acroform_fields(args.pdf) if kind == "acroform" else {}
        # XFA SOM-style names (e.g. form1[0].page1[0].field[0]) on an AcroForm usually mean
        # a dynamic XFA form whose data was exported into the /V layer (XFA packet stripped).
        looks_exfa = sum(1 for k in af_fields if "[" in k and "]" in k) >= max(3, len(af_fields) // 4)
        if kind == "acroform":
            note = ("AcroForm form — values are in the /V layer; standard get_fields() reads "
                    "them (below). XFA datasets extraction does not apply.")
            if looks_exfa:
                note += (" NOTE: field names look like XFA SOM expressions — this is likely a "
                         "dynamic XFA form whose data was exported into the AcroForm /V layer. "
                         "A plain text extraction would MISS these values; use the field values "
                         "here (get_fields), not extract_text().")
        else:
            note = "No /AcroForm — this PDF has no interactive form data."
        filled = sum(1 for v in af_fields.values() if str(v).strip())
        if args.json:
            print(json.dumps(
                {"form_kind": kind, "source": args.pdf, "note": note,
                 "field_count": len(af_fields), "filled_count": filled,
                 "acroform_fields": af_fields},
                indent=2, ensure_ascii=False))
        else:
            _emit(f"[{kind}] {note}", args.quiet)
            if af_fields and not args.flatten:
                print("Standard AcroForm field values (via get_fields):\n")
            if af_fields:
                width = max(len(k) for k in af_fields)
                for k, v in af_fields.items():
                    print(f"{k.ljust(width)} {_display(v)}")
        return 2

    # ---- XFA but no datasets packet (exit 3) --------------------------------------
    if datasets is None:
        note = "XFA form but no `datasets` packet — nothing was entered, or data is absent."
        if args.json:
            print(json.dumps(
                {"form_kind": "xfa", "source": args.pdf, "tree": {}, "flat": {},
                 "field_count": 0, "filled_count": 0, "raw_datasets_path": None,
                 "note": note}, indent=2, ensure_ascii=False))
        else:
            _emit(f"[xfa] {note}", args.quiet)
        return 3

    # We have datasets bytes — always write them for auditing/fallback (§2.4).
    # A failed write is only a warning; raw_path is reset to None so that later
    # messages and the JSON result never claim a file that does not exist.
    raw_path = args.raw_out
    try:
        with open(raw_path, "wb") as fh:
            fh.write(datasets)
    except OSError as exc:
        _emit(f"warning: could not write raw datasets to {raw_path}: {exc}", args.quiet)
        raw_path = None

    # ---- parse (exit 4 on unrecoverable failure) ----------------------------------
    try:
        tree, flat = parse_datasets(datasets)
    except Exception as exc:
        print(f"error: datasets XML did not parse: {exc}", file=sys.stderr)
        if raw_path:
            _emit(f"raw datasets XML written to {raw_path} for inspection.", args.quiet)
        return 4

    field_count = len(flat)
    filled_count = sum(1 for v in flat.values() if str(v).strip())

    result = {
        "form_kind": "xfa",
        "source": args.pdf,
        "tree": tree,
        "flat": flat,
        "field_count": field_count,
        "filled_count": filled_count,
        "raw_datasets_path": raw_path,
    }

    # ---- empty / unfilled XFA (exit 3) --------------------------------------------
    if filled_count == 0:
        note = ("XFA datasets parsed but every field is empty — the form looks unfilled."
                if field_count else
                "XFA datasets parsed but contained no form fields under xfa:data.")
        if args.json:
            result["note"] = note
            print(json.dumps(result, indent=2, ensure_ascii=False))
        else:
            _emit(f"[xfa] {note}", args.quiet)
            # Only announce the raw dump when it was actually written.
            if raw_path:
                _emit(f"raw datasets XML written to {raw_path}.", args.quiet)
        return 3

    # ---- success (exit 0) ----------------------------------------------------------
    if args.json:
        print(json.dumps(result, indent=2, ensure_ascii=False))
        return 0

    if args.flatten:
        print(render_flat_table(flat))
        return 0

    # default view: human tree + flat table, with notes on stderr
    _emit(f"[xfa] extracted via {engine}: {filled_count} filled of {field_count} fields.",
          args.quiet)
    lines: list = []
    render_human_tree(tree, lines)
    print("XFA data tree")
    print("=============")
    print("\n".join(lines))
    print("\nFlattened path: value")
    print("=====================")
    print(render_flat_table(flat))
    # Skip the "written to" note when the write failed (a warning was already emitted).
    if raw_path:
        _emit(f"\nraw datasets XML written to {raw_path}.", args.quiet)
    _emit("note: checkbox/radio values are shown raw (export value or 1/0); human meaning "
          "may need the template packet — see REFERENCE.md.", args.quiet)
    return 0
|
|
435
|
+
|
|
436
|
+
|
|
437
|
+
# Script entry point: run the CLI and hand its return value to the interpreter as the exit status.
if __name__ == "__main__":
    sys.exit(main())
|
|
@@ -0,0 +1,133 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: xfa-extract
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Read entered values out of XFA / LiveCycle 'dynamic' PDF forms (IRCC and other government forms) that standard PDF field extraction misses.
|
|
5
|
+
Author: Ryan Kashtan
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/ryanjkashtan/xfa-extract
|
|
8
|
+
Project-URL: Issues, https://github.com/ryanjkashtan/xfa-extract/issues
|
|
9
|
+
Keywords: xfa,pdf,livecycle,acroform,forms,form-data,ircc,immigration,pypdf,datasets,extraction,dynamic-pdf
|
|
10
|
+
Classifier: Development Status :: 4 - Beta
|
|
11
|
+
Classifier: Environment :: Console
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
14
|
+
Classifier: Operating System :: OS Independent
|
|
15
|
+
Classifier: Programming Language :: Python :: 3
|
|
16
|
+
Classifier: Topic :: Office/Business
|
|
17
|
+
Classifier: Topic :: Text Processing :: Markup :: XML
|
|
18
|
+
Classifier: Topic :: Utilities
|
|
19
|
+
Requires-Python: >=3.9
|
|
20
|
+
Description-Content-Type: text/markdown
|
|
21
|
+
License-File: LICENSE
|
|
22
|
+
Requires-Dist: pypdf>=4.0
|
|
23
|
+
Requires-Dist: lxml>=5.0
|
|
24
|
+
Provides-Extra: robust
|
|
25
|
+
Requires-Dist: pikepdf>=8.0; extra == "robust"
|
|
26
|
+
Provides-Extra: test
|
|
27
|
+
Requires-Dist: pytest>=7; extra == "test"
|
|
28
|
+
Requires-Dist: pikepdf>=8.0; extra == "test"
|
|
29
|
+
Dynamic: license-file
|
|
30
|
+
|
|
31
|
+
# xfa-extract
|
|
32
|
+
|
|
33
|
+
[PyPI package](https://pypi.org/project/xfa-extract/)
|
|
34
|
+
[Supported Python versions](https://pypi.org/project/xfa-extract/)
|
|
35
|
+
[License: MIT](LICENSE)
|
|
36
|
+
|
|
37
|
+
**Read the entered values out of XFA / LiveCycle "dynamic" PDF forms — the ones where
|
|
38
|
+
`pypdf.get_fields()` comes back empty even though the form is clearly filled in.**
|
|
39
|
+
|
|
40
|
+
If you've ever hit this:
|
|
41
|
+
|
|
42
|
+
> I filled out a government PDF (an IRCC immigration form, a tax form, …), but when I run
|
|
43
|
+
> `PdfReader(...).get_fields()` or `pdftk dump_data_fields`, the values are **blank or
|
|
44
|
+
> missing**. The form *template* text extracts fine, but **none of the answers show up.**
|
|
45
|
+
> Or the PDF just shows *"Please wait… If this message is not eventually replaced…"*.
|
|
46
|
+
|
|
47
|
+
…then your PDF is an **XFA form**, and this tool reads it.
|
|
48
|
+
|
|
49
|
+
## Why standard extraction misses the data
|
|
50
|
+
|
|
51
|
+
A normal interactive PDF (an **AcroForm**) stores each field's value in its `/V` entry —
|
|
52
|
+
`pypdf`, `pdftk`, `pdfminer` all read those fine.
|
|
53
|
+
|
|
54
|
+
An **XFA form** (Adobe LiveCycle / "dynamic" PDF — what most government and immigration forms
|
|
55
|
+
are) does **not** keep the entered data in `/V`. It keeps it in an XML packet inside the
|
|
56
|
+
AcroForm dictionary under the `/XFA` key, in a sub-packet called **`datasets`**. So
|
|
57
|
+
`get_fields()` and text extraction look blank even on a fully completed form. `xfa-extract`
|
|
58
|
+
detects XFA, pulls the `datasets` packet, parses it, and gives you the field → value map.
|
|
59
|
+
|
|
60
|
+
**Read-only.** It never writes to or mutates your PDF.
|
|
61
|
+
|
|
62
|
+
## Install
|
|
63
|
+
|
|
64
|
+
```bash
|
|
65
|
+
pip install xfa-extract # core (pypdf + lxml)
|
|
66
|
+
pip install "xfa-extract[robust]" # + pikepdf fallback for unusual PDFs
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
## Use it — command line
|
|
70
|
+
|
|
71
|
+
```bash
|
|
72
|
+
xfa-extract FORM.pdf # human-readable tree + flat "path: value" table
|
|
73
|
+
xfa-extract FORM.pdf --json # machine-readable JSON (for scripts / LLMs)
|
|
74
|
+
xfa-extract FORM.pdf --flatten # just the path: value table
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
Whenever a `datasets` packet is found, the run also writes the raw XML to `--raw-out` (default `./xfa_datasets.xml`)
|
|
78
|
+
for auditing.
|
|
79
|
+
|
|
80
|
+
```text
|
|
81
|
+
$ xfa-extract application.pdf --flatten
|
|
82
|
+
form1.PersonalInfo.Surname Smith
|
|
83
|
+
form1.PersonalInfo.GivenName Jane
|
|
84
|
+
form1.Dependents.Dependent[0].Name Alex
|
|
85
|
+
form1.Dependents.Dependent[1].Name Sam
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
Repeating sections (multiple dependents, applicants, addresses) are **indexed**
|
|
89
|
+
(`Dependent[0]`, `Dependent[1]`, …), never collapsed.
|
|
90
|
+
|
|
91
|
+
## Use it — as a library
|
|
92
|
+
|
|
93
|
+
```python
|
|
94
|
+
from xfa_extract import locate_datasets, parse_datasets
|
|
95
|
+
|
|
96
|
+
kind, datasets, _packets, _engine = locate_datasets("FORM.pdf")
|
|
97
|
+
if kind == "xfa" and datasets:
|
|
98
|
+
tree, flat = parse_datasets(datasets)
|
|
99
|
+
print(flat["form1.PersonalInfo.Surname"]) # -> "Smith"
|
|
100
|
+
```
|
|
101
|
+
|
|
102
|
+
## Exit codes (the CLI tells you which case you're in)
|
|
103
|
+
|
|
104
|
+
| code | meaning | what to do |
|
|
105
|
+
|------|---------|------------|
|
|
106
|
+
| `0` | XFA data extracted (≥1 non-empty value) | use the values |
|
|
107
|
+
| `2` | **not XFA** — AcroForm-only or no form | use `get_fields()`; the tool prints those values for you as a convenience |
|
|
108
|
+
| `3` | XFA but no `datasets`, or the form is empty/unfilled | report "unfilled / no entered data" |
|
|
109
|
+
| `4` | PDF missing/unreadable, or `datasets` parse failure | on a parse failure the raw XML is still written to `--raw-out` for inspection |
|
|
110
|
+
|
|
111
|
+
## Tested against real forms
|
|
112
|
+
|
|
113
|
+
Validated on synthetic fixtures (in CI) **and** 14 real-world XFA forms — IRCC IMM5257 /
|
|
114
|
+
1295 / 1344 / 5710 / 5669, a DHL waybill, an Indian MCA MGT-7, an Ontario lease, a US DOL
|
|
115
|
+
form, French CERFA — plus a real filled Canadian Proof-of-Citizenship application. Repeating
|
|
116
|
+
sections, namespaces, Adobe's quirky tag serialization, and base64-image-bearing datasets are all
|
|
117
|
+
handled. See [`docs/xfa-internals.md`](docs/xfa-internals.md) for the deep dive.
|
|
118
|
+
|
|
119
|
+
## What it does **not** do
|
|
120
|
+
|
|
121
|
+
- **Fill / write** values into XFA forms — fragile, and the failure mode on a legal document
|
|
122
|
+
is bad. (Use a dedicated form-filling tool.)
|
|
123
|
+
- **Flatten / render** XFA to static pages — different operation.
|
|
124
|
+
- **OCR** — these are digital forms, not scans.
|
|
125
|
+
|
|
126
|
+
## Use it with Claude / Claude Code
|
|
127
|
+
|
|
128
|
+
This repo also ships an [Agent Skill](skill/SKILL.md) so Claude Code automatically reaches
|
|
129
|
+
for it when a fillable PDF's values come back blank. Point Claude at `skill/`.
|
|
130
|
+
|
|
131
|
+
## License
|
|
132
|
+
|
|
133
|
+
MIT © Ryan Kashtan. See [LICENSE](LICENSE).
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
LICENSE
|
|
2
|
+
README.md
|
|
3
|
+
pyproject.toml
|
|
4
|
+
src/xfa_extract/__init__.py
|
|
5
|
+
src/xfa_extract/cli.py
|
|
6
|
+
src/xfa_extract.egg-info/PKG-INFO
|
|
7
|
+
src/xfa_extract.egg-info/SOURCES.txt
|
|
8
|
+
src/xfa_extract.egg-info/dependency_links.txt
|
|
9
|
+
src/xfa_extract.egg-info/entry_points.txt
|
|
10
|
+
src/xfa_extract.egg-info/requires.txt
|
|
11
|
+
src/xfa_extract.egg-info/top_level.txt
|
|
12
|
+
tests/test_acceptance.py
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
xfa_extract
|
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
"""Acceptance tests for xfa-extract, run against synthetic fixtures.
|
|
2
|
+
|
|
3
|
+
Covers: filled XFA (array + single-stream XDP), AcroForm routing, empty XFA, corrupt XFA,
|
|
4
|
+
and that every source PDF is byte-for-byte unchanged. Invokes the CLI as a subprocess via
|
|
5
|
+
`python -m xfa_extract.cli`, so the package must be importable (`pip install -e .`).
|
|
6
|
+
"""
|
|
7
|
+
import hashlib
|
|
8
|
+
import json
|
|
9
|
+
import subprocess
|
|
10
|
+
import sys
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
|
|
13
|
+
# Directory containing this test module; fixture PDFs and raw-XML outputs are resolved against it.
HERE = Path(__file__).resolve().parent
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def cli(pdf_name, *flags):
    """Run the xfa-extract CLI on a fixture PDF in a subprocess.

    Args:
        pdf_name: file name of the PDF, resolved relative to ``HERE``.
        *flags: extra command-line flags appended after ``--raw-out``.

    Returns:
        ``(returncode, stdout, stderr, raw)`` where ``raw`` is the path passed
        to ``--raw-out`` (``HERE/_raw_<pdf stem>.xml``).
    """
    raw = HERE / f"_raw_{Path(pdf_name).stem}.xml"
    # Remove any dump left by an earlier run so that ``raw.exists()`` checks
    # reflect this invocation rather than a stale artifact.
    raw.unlink(missing_ok=True)
    proc = subprocess.run(
        [sys.executable, "-m", "xfa_extract.cli", str(HERE / pdf_name),
         "--raw-out", str(raw), *flags],
        capture_output=True, text=True,
    )
    return proc.returncode, proc.stdout, proc.stderr, raw
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def cli_json(pdf_name, *flags):
    """Run the CLI with ``--json`` and return ``(returncode, parsed stdout, raw path)``.

    Blank stdout is reported as an empty dict instead of being parsed.
    """
    code, stdout, _stderr, raw = cli(pdf_name, "--json", *flags)
    if stdout.strip():
        payload = json.loads(stdout)
    else:
        payload = {}
    return code, payload, raw
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def sha(name):
    """Return the SHA-256 hex digest of the file *name* located under ``HERE``."""
    digest = hashlib.sha256()
    digest.update((HERE / name).read_bytes())
    return digest.hexdigest()
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
# --- (a) filled XFA array form -------------------------------------------------------
|
|
37
|
+
def test_filled_xfa_values_and_exit_code():
    """The filled XFA fixture exits 0 and exposes its values in ``flat``."""
    exit_code, payload, _ = cli_json("filled_xfa.pdf")
    assert exit_code == 0
    assert payload["form_kind"] == "xfa"
    values = payload["flat"]
    assert values["form1.PersonalInfo.Surname"] == "Smith"
    assert values["form1.PersonalInfo.GivenName"] == "Jane"
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def test_repeating_sections_indexed_not_collapsed():
    """Repeated ``Dependent`` nodes come back as a list and as indexed flat paths."""
    _code, payload, _ = cli_json("filled_xfa.pdf")
    dependents = payload["tree"]["form1"]["Dependents"]["Dependent"]
    assert isinstance(dependents, list) and len(dependents) == 3
    flat = payload["flat"]
    assert flat["form1.Dependents.Dependent[0].Name"] == "Alex"
    assert flat["form1.Dependents.Dependent[2].Age"] == "3"
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def test_empty_leaf_preserved_and_counts():
    """An empty leaf stays in ``flat`` as ``""`` and is excluded from ``filled_count``."""
    _code, payload, _ = cli_json("filled_xfa.pdf")
    assert payload["flat"]["form1.Notes"] == ""
    assert payload["field_count"] == 11
    assert payload["filled_count"] == 10
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
# --- (a') single-stream XDP ----------------------------------------------------------
|
|
62
|
+
def test_single_stream_xdp_locates_data_inside():
    """The single-stream XDP fixture is extracted successfully (exit 0)."""
    exit_code, payload, _ = cli_json("filled_xdp.pdf")
    assert exit_code == 0
    surname = payload["flat"]["form1.Applicant.Surname"]
    assert surname == "Okonkwo"
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
# --- (b) AcroForm routing ------------------------------------------------------------
|
|
69
|
+
def test_acroform_routes_to_exit_2_with_values():
    """An AcroForm-only PDF exits 2 and still prints its field values on stdout."""
    exit_code, stdout, stderr, _ = cli("filled_acroform.pdf")
    assert exit_code == 2
    combined = (stdout + stderr).lower()
    assert "standard" in combined
    assert "John Doe" in stdout
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def test_acroform_json_still_emits_object():
    """With ``--json`` an AcroForm PDF still yields a JSON object, plus exit code 2."""
    exit_code, payload, _ = cli_json("filled_acroform.pdf")
    assert exit_code == 2
    assert payload["form_kind"] == "acroform"
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def test_no_form_exits_2():
    """A PDF without any form exits with code 2."""
    exit_code = cli("no_form.pdf")[0]
    assert exit_code == 2
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
# --- (c) empty / unfilled XFA --------------------------------------------------------
|
|
88
|
+
def test_empty_xfa_exits_3_cleanly():
    """An unfilled XFA form exits 3 without a Python traceback on stderr."""
    exit_code, _stdout, stderr, _ = cli("empty_xfa.pdf")
    assert exit_code == 3
    assert "Traceback" not in stderr
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
# --- (d) corrupt datasets ------------------------------------------------------------
|
|
95
|
+
def test_corrupt_datasets_exits_4_and_writes_raw():
    """Corrupt datasets XML exits 4, and a non-empty file exists at the raw-out path."""
    exit_code, _stdout, _stderr, raw_file = cli("corrupt_xfa.pdf")
    assert exit_code == 4
    assert raw_file.exists() and raw_file.stat().st_size > 0
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
# --- (e) source PDFs unchanged -------------------------------------------------------
|
|
102
|
+
def test_source_pdfs_byte_for_byte_unchanged():
    """Running the CLI over every fixture leaves each PDF's SHA-256 unchanged."""
    fixtures = ["filled_xfa.pdf", "filled_xdp.pdf", "filled_acroform.pdf",
                "no_form.pdf", "empty_xfa.pdf", "corrupt_xfa.pdf"]
    # Hash everything first, then run the CLI on all of them, then re-hash.
    digests = {fixture: sha(fixture) for fixture in fixtures}
    for fixture in fixtures:
        cli(fixture)
    for fixture, expected in digests.items():
        assert sha(fixture) == expected, f"{fixture} was modified"
|