bankstract 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bankstract-0.2.0/.gitignore +26 -0
- bankstract-0.2.0/LICENSE +21 -0
- bankstract-0.2.0/PKG-INFO +151 -0
- bankstract-0.2.0/PRD.md +184 -0
- bankstract-0.2.0/README.md +100 -0
- bankstract-0.2.0/pyproject.toml +68 -0
- bankstract-0.2.0/src/bankstract/__init__.py +1 -0
- bankstract-0.2.0/src/bankstract/_layout.py +91 -0
- bankstract-0.2.0/src/bankstract/_pdfplumber.py +23 -0
- bankstract-0.2.0/src/bankstract/_pymupdf.py +30 -0
- bankstract-0.2.0/src/bankstract/cli.py +115 -0
- bankstract-0.2.0/src/bankstract/parsers/__init__.py +26 -0
- bankstract-0.2.0/src/bankstract/parsers/base.py +14 -0
- bankstract-0.2.0/src/bankstract/parsers/fbn.py +251 -0
- bankstract-0.2.0/src/bankstract/parsers/palmpay.py +199 -0
- bankstract-0.2.0/src/bankstract/reconcile.py +57 -0
- bankstract-0.2.0/src/bankstract/redactors/__init__.py +27 -0
- bankstract-0.2.0/src/bankstract/redactors/_shared.py +77 -0
- bankstract-0.2.0/src/bankstract/redactors/base.py +68 -0
- bankstract-0.2.0/src/bankstract/redactors/fbn.py +126 -0
- bankstract-0.2.0/src/bankstract/redactors/palmpay.py +202 -0
- bankstract-0.2.0/src/bankstract/schema.py +40 -0
- bankstract-0.2.0/src/bankstract/writers/__init__.py +0 -0
- bankstract-0.2.0/src/bankstract/writers/csv.py +57 -0
- bankstract-0.2.0/uv.lock +1175 -0
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
__pycache__/
|
|
2
|
+
*.py[cod]
|
|
3
|
+
*.egg-info/
|
|
4
|
+
.eggs/
|
|
5
|
+
.pytest_cache/
|
|
6
|
+
.ruff_cache/
|
|
7
|
+
.coverage
|
|
8
|
+
htmlcov/
|
|
9
|
+
dist/
|
|
10
|
+
build/
|
|
11
|
+
.venv/
|
|
12
|
+
venv/
|
|
13
|
+
env/
|
|
14
|
+
.python-version
|
|
15
|
+
.env
|
|
16
|
+
.DS_Store
|
|
17
|
+
.idea/
|
|
18
|
+
.vscode/
|
|
19
|
+
*.swp
|
|
20
|
+
|
|
21
|
+
# Fixture privacy — never commit unredacted statements.
|
|
22
|
+
# Drop real PDFs into tests/<bank>/fixtures/_local/ for dev only.
|
|
23
|
+
tests/**/fixtures/_local/
|
|
24
|
+
tests/**/fixtures/*real*.pdf
|
|
25
|
+
tests/**/fixtures/*unredacted*.pdf
|
|
26
|
+
tests/**/fixtures/*raw*.pdf
|
bankstract-0.2.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Jeffery Orazulike
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,151 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: bankstract
|
|
3
|
+
Version: 0.2.0
|
|
4
|
+
Summary: Convert Nigerian bank PDF statements into structured CSV.
|
|
5
|
+
Project-URL: Homepage, https://github.com/logickoder/bankstract
|
|
6
|
+
Project-URL: Issues, https://github.com/logickoder/bankstract/issues
|
|
7
|
+
Author: Jeffery Orazulike
|
|
8
|
+
License: MIT License
|
|
9
|
+
|
|
10
|
+
Copyright (c) 2026 Jeffery Orazulike
|
|
11
|
+
|
|
12
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
13
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
14
|
+
in the Software without restriction, including without limitation the rights
|
|
15
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
16
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
17
|
+
furnished to do so, subject to the following conditions:
|
|
18
|
+
|
|
19
|
+
The above copyright notice and this permission notice shall be included in all
|
|
20
|
+
copies or substantial portions of the Software.
|
|
21
|
+
|
|
22
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
23
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
24
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
25
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
26
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
27
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
28
|
+
SOFTWARE.
|
|
29
|
+
License-File: LICENSE
|
|
30
|
+
Keywords: bank,csv,nigeria,pdf,statement
|
|
31
|
+
Classifier: Development Status :: 3 - Alpha
|
|
32
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
33
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
34
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
35
|
+
Classifier: Topic :: Office/Business :: Financial
|
|
36
|
+
Requires-Python: >=3.11
|
|
37
|
+
Requires-Dist: click>=8.1
|
|
38
|
+
Requires-Dist: pdfplumber>=0.11
|
|
39
|
+
Requires-Dist: pydantic>=2.6
|
|
40
|
+
Requires-Dist: pymupdf>=1.27.2
|
|
41
|
+
Provides-Extra: camelot
|
|
42
|
+
Requires-Dist: camelot-py[cv]>=0.11; extra == 'camelot'
|
|
43
|
+
Provides-Extra: dev
|
|
44
|
+
Requires-Dist: build>=1.0; extra == 'dev'
|
|
45
|
+
Requires-Dist: pytest>=8.0; extra == 'dev'
|
|
46
|
+
Requires-Dist: ruff>=0.5; extra == 'dev'
|
|
47
|
+
Provides-Extra: ocr
|
|
48
|
+
Requires-Dist: pillow>=10.0; extra == 'ocr'
|
|
49
|
+
Requires-Dist: pytesseract>=0.3.10; extra == 'ocr'
|
|
50
|
+
Description-Content-Type: text/markdown
|
|
51
|
+
|
|
52
|
+
# bankstract
|
|
53
|
+
|
|
54
|
+
Convert Nigerian bank PDF statements into structured CSV. Plugin architecture — one parser per bank.
|
|
55
|
+
|
|
56
|
+
```bash
|
|
57
|
+
pip install bankstract
|
|
58
|
+
|
|
59
|
+
bankstract palmpay statement.pdf -o out.csv
|
|
60
|
+
bankstract auto unknown.pdf -o out.csv
|
|
61
|
+
bankstract list
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
## Status
|
|
65
|
+
|
|
66
|
+
| Bank | Status |
|
|
67
|
+
| ---------- | ------------- |
|
|
68
|
+
| PalmPay | v0.1 — alpha |
|
|
69
|
+
| First Bank | v0.1 — alpha |
|
|
70
|
+
|
|
71
|
+
## Install
|
|
72
|
+
|
|
73
|
+
```bash
|
|
74
|
+
pip install bankstract
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
Optional extras:
|
|
78
|
+
|
|
79
|
+
```bash
|
|
80
|
+
pip install "bankstract[ocr]" # pytesseract for scanned PDFs
|
|
81
|
+
pip install "bankstract[camelot]" # camelot lattice fallback
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
## Develop
|
|
85
|
+
|
|
86
|
+
Project uses [uv](https://docs.astral.sh/uv/) for dependency + venv management.
|
|
87
|
+
|
|
88
|
+
```bash
|
|
89
|
+
uv sync --all-extras # create .venv, install deps + extras from uv.lock
|
|
90
|
+
uv run pre-commit install # one-time: enable the pre-commit hook
|
|
91
|
+
uv run pytest # run tests
|
|
92
|
+
uv run ruff check src tests
|
|
93
|
+
uv run pyright src tests # strict type check (see CLAUDE.md directive 8)
|
|
94
|
+
uv run bankstract list # invoke CLI
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
Add a dependency with `uv add <pkg>` (dev: `uv add --dev <pkg>`). Commit `uv.lock`.
|
|
98
|
+
|
|
99
|
+
The pre-commit hook runs `ruff check`, `ruff format --check`, `pyright` (strict), and `pytest` before every commit. Bypass only in genuine emergencies with `git commit --no-verify`; the same checks run again in CI.
|
|
100
|
+
|
|
101
|
+
### Releasing
|
|
102
|
+
|
|
103
|
+
CI publishes to PyPI automatically on push to `main` via `.github/workflows/publish.yml`. The workflow runs the full gate (ruff + pyright + pytest), and if the current `pyproject.toml` version already exists on PyPI it auto-bumps the minor component and commits the bump before publishing. PyPI auth uses OIDC trusted publishing — no token in repo or CI secrets.
|
|
104
|
+
|
|
105
|
+
To prepare a release locally:
|
|
106
|
+
|
|
107
|
+
```bash
|
|
108
|
+
scripts/bump-version.sh # patch bump
|
|
109
|
+
scripts/bump-version.sh minor # 0.2.x -> 0.3.0
|
|
110
|
+
scripts/bump-version.sh major # 0.x.x -> 1.0.0
|
|
111
|
+
scripts/bump-version.sh 0.3.0 # exact set
|
|
112
|
+
uv build # dist/*.whl + dist/*.tar.gz
|
|
113
|
+
uv publish dist/* # only if not using the GH workflow; needs --token or UV_PUBLISH_TOKEN
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
Trusted-publisher setup (one-time, owner only): create a publisher at <https://pypi.org/manage/account/publishing/> with workflow `publish.yml`, repo `logickoder/bankstract`.
|
|
117
|
+
|
|
118
|
+
## Usage
|
|
119
|
+
|
|
120
|
+
```bash
|
|
121
|
+
bankstract <bank> <pdf> -o <csv> # explicit parser
|
|
122
|
+
bankstract auto <pdf> -o <csv> # auto-detect via Parser.detect()
|
|
123
|
+
bankstract list # show registered parsers
|
|
124
|
+
```
|
|
125
|
+
|
|
126
|
+
Unparseable blocks are written to a `.log` sidecar next to the output CSV.
|
|
127
|
+
|
|
128
|
+
## Reconciliation invariant
|
|
129
|
+
|
|
130
|
+
Two complementary checks; the CLI picks whichever applies per bank.
|
|
131
|
+
|
|
132
|
+
- **Row-wise** (banks that print a running balance): `prev.balance ± debit/credit == curr.balance`. Mismatch raises `ReconciliationError` with the row index.
|
|
133
|
+
- **Totals-based** (banks like PalmPay that omit a balance column): the parser reads `Total Money In` / `Total Money Out` from the statement header and the CLI asserts that the sum of parsed credits/debits equals those totals.
|
|
134
|
+
|
|
135
|
+
Both modes exist to catch silently-dropped rows — the failure mode of naive PDF parsers.
|
|
136
|
+
|
|
137
|
+
## Contributing a bank parser
|
|
138
|
+
|
|
139
|
+
1. Copy `src/bankstract/parsers/palmpay.py` to `src/bankstract/parsers/<bank>.py`.
|
|
140
|
+
2. Implement `detect()` and `parse() -> ParseResult` from `parsers/base.py`. Populate `total_credit` / `total_debit` if the statement only ships header totals.
|
|
141
|
+
3. Add a `Redactor` subclass under `src/bankstract/redactors/<bank>.py` for the fixture pipeline.
|
|
142
|
+
4. Drop the raw statement at `tests/<bank>/fixtures/_local/` (gitignored), then `uv run bankstract redact <bank> <raw> tests/<bank>/fixtures/sample.pdf` to produce the committable fixture.
|
|
143
|
+
5. Add tests under `tests/<bank>/test_parser.py` and `tests/<bank>/test_redactor.py`.
|
|
144
|
+
|
|
145
|
+
CI runs `ruff` + `pyright` (strict) + `pytest`. All three must pass clean. Reconciliation invariant must hold on every fixture.
|
|
146
|
+
|
|
147
|
+
Fixture PDFs must be redacted: account numbers, names, addresses, transaction IDs scrubbed. Never commit unredacted statements.
|
|
148
|
+
|
|
149
|
+
## License
|
|
150
|
+
|
|
151
|
+
MIT. Author: [logickoder](https://github.com/logickoder).
|
bankstract-0.2.0/PRD.md
ADDED
|
@@ -0,0 +1,184 @@
|
|
|
1
|
+
# bankstract — PRD
|
|
2
|
+
|
|
3
|
+
**Status:** concept · v0.1 target
|
|
4
|
+
**License:** MIT
|
|
5
|
+
**Stack:** Python 3.11+
|
|
6
|
+
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
## What
|
|
10
|
+
|
|
11
|
+
Public Python CLI + library that converts Nigerian bank PDF statements into structured CSV. Plugin architecture — one parser module per bank.
|
|
12
|
+
|
|
13
|
+
```bash
|
|
14
|
+
bankstract palmpay statement.pdf -o out.csv
|
|
15
|
+
bankstract auto unknown.pdf -o out.csv # auto-detect bank
|
|
16
|
+
bankstract fbn scanned.pdf -o out.csv --ocr # force OCR
|
|
17
|
+
bankstract list # show registered parsers
|
|
18
|
+
```
|
|
19
|
+
|
|
20
|
+
## Why
|
|
21
|
+
|
|
22
|
+
Every Nigerian dev solves this once, badly, in private. Banks export PDFs; tools like BudgetBakers, YNAB, Notion, Google Sheets want CSV. Manual entry costs more than the visibility is worth, so trackers go stale.
|
|
23
|
+
|
|
24
|
+
bankstract closes that gap with one clean tool, one plugin contract, and community-driven bank coverage.
|
|
25
|
+
|
|
26
|
+
## Scope
|
|
27
|
+
|
|
28
|
+
| Version | Coverage |
|
|
29
|
+
| ------- | -------------------------------------------------------------------------------- |
|
|
30
|
+
| v0.1 | PalmPay only. CLI + plugin contract + reconciliation + tests + CI. PyPI release. |
|
|
31
|
+
| v0.2 | First Bank parser + OCR fallback for scanned statements. |
|
|
32
|
+
| v0.3+ | GTB, Kuda, Opay, Stanbic, Wise, Bamboo, Risevest — community PRs. |
|
|
33
|
+
|
|
34
|
+
**Out of scope:** category inference, ML-based parsing, GUI, pushing data into third-party trackers (those belong in downstream tools).
|
|
35
|
+
|
|
36
|
+
## Architecture
|
|
37
|
+
|
|
38
|
+
### Stack rationale
|
|
39
|
+
|
|
40
|
+
- **Python** over Node — `pdfplumber` and `camelot-py` are best-in-class table extractors; `pytesseract` is the cleanest OCR binding. Node alternatives are weaker at table extraction and slower at OCR.
|
|
41
|
+
- **pdfplumber primary, camelot fallback** — pdfplumber for table extraction from text PDFs; camelot lattice mode for messy ruled tables; pytesseract only when the text layer is absent (scanned PDFs).
|
|
42
|
+
- **pymupdf for true redaction** — `apply_redactions()` rewrites the PDF content stream rather than visually overlaying, so fixture PDFs contain no recoverable PII.
|
|
43
|
+
- **pydantic schema** — runtime validation + clean JSON Schema export for downstream tools.
|
|
44
|
+
- **click CLI** — standard, autocomplete-friendly, low boilerplate.
|
|
45
|
+
|
|
46
|
+
### Plugin contract
|
|
47
|
+
|
|
48
|
+
Every bank is a parser module implementing the `Parser` ABC.
|
|
49
|
+
|
|
50
|
+
```python
|
|
51
|
+
class Parser(ABC):
|
|
52
|
+
bank: str # module-level identifier (e.g. "palmpay", "fbn")
|
|
53
|
+
|
|
54
|
+
@abstractmethod
|
|
55
|
+
def detect(self, pdf_path: Path) -> bool:
|
|
56
|
+
"""Return True if this parser handles the given PDF (header/logo/text-marker match)."""
|
|
57
|
+
|
|
58
|
+
@abstractmethod
|
|
59
|
+
def parse(self, pdf_path: Path) -> ParseResult:
|
|
60
|
+
"""Extract transactions and header totals. Raise ParseError on format mismatch."""
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
Parsers live in `src/bankstract/parsers/<bank>.py` and self-register via import side-effect in `parsers/__init__.py`. A parallel `Redactor` plugin tree under `src/bankstract/redactors/<bank>.py` produces committable fixtures from raw statements.
|
|
64
|
+
|
|
65
|
+
### Transaction + ParseResult schema
|
|
66
|
+
|
|
67
|
+
```python
|
|
68
|
+
class Transaction(BaseModel):
|
|
69
|
+
date: date
|
|
70
|
+
narration: str
|
|
71
|
+
debit: Decimal = Decimal("0")
|
|
72
|
+
credit: Decimal = Decimal("0")
|
|
73
|
+
balance: Decimal | None = None # None when the statement omits a running balance
|
|
74
|
+
reference: str | None = None # bank transaction ID
|
|
75
|
+
currency: str = "NGN"
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
@dataclass
|
|
79
|
+
class ParseResult:
|
|
80
|
+
transactions: list[Transaction]
|
|
81
|
+
total_credit: Decimal | None = None # from statement header
|
|
82
|
+
total_debit: Decimal | None = None
|
|
83
|
+
format_version: str | None = None
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
Amounts are stored as `Decimal` (not float — financial precision). The Naira sign is stripped before parsing.
|
|
87
|
+
|
|
88
|
+
### Reconciliation invariant
|
|
89
|
+
|
|
90
|
+
Two complementary checks:
|
|
91
|
+
|
|
92
|
+
- **Row-wise** (`reconcile()`): `prev.balance ± debit/credit == curr.balance`. Used when the statement carries a per-row running balance. Mismatch raises `ReconciliationError` with the row index.
|
|
93
|
+
- **Totals-based** (`verify_totals()`): sum of parsed credits/debits equals header `Total Money In` / `Total Money Out`. Used when the statement omits a running balance (e.g. PalmPay). Parsers MUST populate `ParseResult.total_credit/total_debit` in that case, otherwise reconciliation is skipped silently — a violation of directive 2 (see `CLAUDE.md`).
|
|
94
|
+
|
|
95
|
+
Both modes catch silently-dropped rows, which is the failure mode of every naive PDF parser.
|
|
96
|
+
|
|
97
|
+
### Failure handling
|
|
98
|
+
|
|
99
|
+
- **Unparseable blocks** go to a `.log` sidecar file. Never silently dropped.
|
|
100
|
+
- **Format-version drift** — each parser logs a detected `format_version` at run start. Parse errors include the detected version, so issue reports are actionable.
|
|
101
|
+
|
|
102
|
+
### Repo layout
|
|
103
|
+
|
|
104
|
+
```
|
|
105
|
+
bankstract/
|
|
106
|
+
├── pyproject.toml hatchling backend, ruff + pytest + pyright config
|
|
107
|
+
├── pyrightconfig.json IDE-side mirror of [tool.pyright]
|
|
108
|
+
├── uv.lock uv-managed lockfile
|
|
109
|
+
├── README.md
|
|
110
|
+
├── PRD.md
|
|
111
|
+
├── LICENSE MIT
|
|
112
|
+
├── src/
|
|
113
|
+
│ └── bankstract/ standard src-layout package
|
|
114
|
+
│ ├── cli.py
|
|
115
|
+
│ ├── schema.py Transaction + ParseResult + errors
|
|
116
|
+
│ ├── reconcile.py reconcile() + verify_totals()
|
|
117
|
+
│ ├── _layout.py Word dataclass + classify + Y-grouping (shared)
|
|
118
|
+
│ ├── _pymupdf.py typed facade over pymupdf
|
|
119
|
+
│ ├── _pdfplumber.py typed facade over pdfplumber
|
|
120
|
+
│ ├── writers/csv.py
|
|
121
|
+
│ ├── parsers/
|
|
122
|
+
│ │ ├── __init__.py registry (import side-effect)
|
|
123
|
+
│ │ ├── base.py Parser ABC
|
|
124
|
+
│ │ ├── palmpay.py
|
|
125
|
+
│ │ └── fbn.py
|
|
126
|
+
│ └── redactors/
|
|
127
|
+
│ ├── __init__.py registry (import side-effect)
|
|
128
|
+
│ ├── base.py Redactor ABC + RedactReport (template-method)
|
|
129
|
+
│ ├── _shared.py shared redact primitives
|
|
130
|
+
│ ├── palmpay.py
|
|
131
|
+
│ └── fbn.py
|
|
132
|
+
├── tests/
|
|
133
|
+
│ ├── test_reconcile.py bank-agnostic
|
|
134
|
+
│ └── <bank>/ one folder per bank
|
|
135
|
+
│ ├── test_parser.py
|
|
136
|
+
│ ├── test_redactor.py
|
|
137
|
+
│ └── fixtures/
|
|
138
|
+
│ ├── sample.pdf redacted sample (committed)
|
|
139
|
+
│ └── _local/ gitignored: raw statements for dev
|
|
140
|
+
└── .github/workflows/ci.yml uv + ruff + pyright + pytest
|
|
141
|
+
```
|
|
142
|
+
|
|
143
|
+
## CLI surface
|
|
144
|
+
|
|
145
|
+
```bash
|
|
146
|
+
bankstract <bank> <pdf> -o <csv> # explicit parser
|
|
147
|
+
bankstract auto <pdf> -o <csv> # detect via Parser.detect()
|
|
148
|
+
bankstract <bank> <pdf> -o <csv> --ocr # force OCR path
|
|
149
|
+
bankstract list # show registered parsers + status
|
|
150
|
+
```
|
|
151
|
+
|
|
152
|
+
## Risks
|
|
153
|
+
|
|
154
|
+
| Risk | Mitigation |
|
|
155
|
+
| ---------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------- |
|
|
156
|
+
| Statement format drift — banks rev PDFs annually. | Per-parser `format_version` detection + log on parse error. Per-bank fixture suite in tests. |
|
|
157
|
+
| OCR accuracy on scanned statements — ₦ / N confusion, comma-separator drift. | Post-OCR regex normalization. Reconciliation invariant catches arithmetic errors before they ship. |
|
|
158
|
+
| Charset edge cases — `₦` decodes differently across PDF producers. | Strip currency symbols and store as `Decimal`. |
|
|
159
|
+
| Fixture privacy — sample PDFs contain PII. | All fixtures must be anonymized: account numbers, names, addresses scrubbed. Never commit unredacted PDFs. |
|
|
160
|
+
|
|
161
|
+
## Roadmap
|
|
162
|
+
|
|
163
|
+
- [ ] `pyproject.toml` + Parser ABC + Transaction schema + csv writer + reconciliation
|
|
164
|
+
- [ ] PalmPay parser + 1 anonymized fixture + test
|
|
165
|
+
- [ ] CLI wrapper + auto-detect
|
|
166
|
+
- [ ] README + LICENSE + CI
|
|
167
|
+
- [ ] **v0.1.0** — PyPI release, PalmPay only, FBN marked in progress
|
|
168
|
+
- [ ] First Bank parser + OCR fallback → **v0.2.0**
|
|
169
|
+
- [ ] Open issues for next 5 banks; invite contributors
|
|
170
|
+
|
|
171
|
+
## Contributing
|
|
172
|
+
|
|
173
|
+
Add a bank in four steps:
|
|
174
|
+
|
|
175
|
+
1. Copy `src/bankstract/parsers/palmpay.py` to `src/bankstract/parsers/<your_bank>.py` and implement `detect()` + `parse() -> ParseResult`.
|
|
176
|
+
2. Copy `src/bankstract/redactors/palmpay.py` to `src/bankstract/redactors/<your_bank>.py` for the fixture pipeline.
|
|
177
|
+
3. Drop the raw statement in `tests/<your_bank>/fixtures/_local/` (gitignored), run `uv run bankstract redact <your_bank> <raw> tests/<your_bank>/fixtures/sample.pdf`, eyeball the output, commit the redacted sample.
|
|
178
|
+
4. Add tests in `tests/<your_bank>/test_parser.py` and `tests/<your_bank>/test_redactor.py`.
|
|
179
|
+
|
|
180
|
+
CI runs `ruff` + `pyright` (strict) + `pytest`. All three must pass. Reconciliation invariant must hold on every fixture.
|
|
181
|
+
|
|
182
|
+
## License
|
|
183
|
+
|
|
184
|
+
MIT. See `LICENSE`.
|
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
# bankstract
|
|
2
|
+
|
|
3
|
+
Convert Nigerian bank PDF statements into structured CSV. Plugin architecture — one parser per bank.
|
|
4
|
+
|
|
5
|
+
```bash
|
|
6
|
+
pip install bankstract
|
|
7
|
+
|
|
8
|
+
bankstract palmpay statement.pdf -o out.csv
|
|
9
|
+
bankstract auto unknown.pdf -o out.csv
|
|
10
|
+
bankstract list
|
|
11
|
+
```
|
|
12
|
+
|
|
13
|
+
## Status
|
|
14
|
+
|
|
15
|
+
| Bank | Status |
|
|
16
|
+
| ---------- | ------------- |
|
|
17
|
+
| PalmPay | v0.1 — alpha |
|
|
18
|
+
| First Bank | v0.1 — alpha |
|
|
19
|
+
|
|
20
|
+
## Install
|
|
21
|
+
|
|
22
|
+
```bash
|
|
23
|
+
pip install bankstract
|
|
24
|
+
```
|
|
25
|
+
|
|
26
|
+
Optional extras:
|
|
27
|
+
|
|
28
|
+
```bash
|
|
29
|
+
pip install "bankstract[ocr]" # pytesseract for scanned PDFs
|
|
30
|
+
pip install "bankstract[camelot]" # camelot lattice fallback
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
## Develop
|
|
34
|
+
|
|
35
|
+
Project uses [uv](https://docs.astral.sh/uv/) for dependency + venv management.
|
|
36
|
+
|
|
37
|
+
```bash
|
|
38
|
+
uv sync --all-extras # create .venv, install deps + extras from uv.lock
|
|
39
|
+
uv run pre-commit install # one-time: enable the pre-commit hook
|
|
40
|
+
uv run pytest # run tests
|
|
41
|
+
uv run ruff check src tests
|
|
42
|
+
uv run pyright src tests # strict type check (see CLAUDE.md directive 8)
|
|
43
|
+
uv run bankstract list # invoke CLI
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
Add a dependency with `uv add <pkg>` (dev: `uv add --dev <pkg>`). Commit `uv.lock`.
|
|
47
|
+
|
|
48
|
+
The pre-commit hook runs `ruff check`, `ruff format --check`, `pyright` (strict), and `pytest` before every commit. Bypass only in genuine emergencies with `git commit --no-verify`; the same checks run again in CI.
|
|
49
|
+
|
|
50
|
+
### Releasing
|
|
51
|
+
|
|
52
|
+
CI publishes to PyPI automatically on push to `main` via `.github/workflows/publish.yml`. The workflow runs the full gate (ruff + pyright + pytest), and if the current `pyproject.toml` version already exists on PyPI it auto-bumps the minor component and commits the bump before publishing. PyPI auth uses OIDC trusted publishing — no token in repo or CI secrets.
|
|
53
|
+
|
|
54
|
+
To prepare a release locally:
|
|
55
|
+
|
|
56
|
+
```bash
|
|
57
|
+
scripts/bump-version.sh # patch bump
|
|
58
|
+
scripts/bump-version.sh minor # 0.2.x -> 0.3.0
|
|
59
|
+
scripts/bump-version.sh major # 0.x.x -> 1.0.0
|
|
60
|
+
scripts/bump-version.sh 0.3.0 # exact set
|
|
61
|
+
uv build # dist/*.whl + dist/*.tar.gz
|
|
62
|
+
uv publish dist/* # only if not using the GH workflow; needs --token or UV_PUBLISH_TOKEN
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
Trusted-publisher setup (one-time, owner only): create a publisher at <https://pypi.org/manage/account/publishing/> with workflow `publish.yml`, repo `logickoder/bankstract`.
|
|
66
|
+
|
|
67
|
+
## Usage
|
|
68
|
+
|
|
69
|
+
```bash
|
|
70
|
+
bankstract <bank> <pdf> -o <csv> # explicit parser
|
|
71
|
+
bankstract auto <pdf> -o <csv> # auto-detect via Parser.detect()
|
|
72
|
+
bankstract list # show registered parsers
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
Unparseable blocks are written to a `.log` sidecar next to the output CSV.
|
|
76
|
+
|
|
77
|
+
## Reconciliation invariant
|
|
78
|
+
|
|
79
|
+
Two complementary checks; the CLI picks whichever applies per bank.
|
|
80
|
+
|
|
81
|
+
- **Row-wise** (banks that print a running balance): `prev.balance ± debit/credit == curr.balance`. Mismatch raises `ReconciliationError` with the row index.
|
|
82
|
+
- **Totals-based** (banks like PalmPay that omit a balance column): the parser reads `Total Money In` / `Total Money Out` from the statement header and the CLI asserts that the sum of parsed credits/debits equals those totals.
|
|
83
|
+
|
|
84
|
+
Both modes exist to catch silently-dropped rows — the failure mode of naive PDF parsers.
|
|
85
|
+
|
|
86
|
+
## Contributing a bank parser
|
|
87
|
+
|
|
88
|
+
1. Copy `src/bankstract/parsers/palmpay.py` to `src/bankstract/parsers/<bank>.py`.
|
|
89
|
+
2. Implement `detect()` and `parse() -> ParseResult` from `parsers/base.py`. Populate `total_credit` / `total_debit` if the statement only ships header totals.
|
|
90
|
+
3. Add a `Redactor` subclass under `src/bankstract/redactors/<bank>.py` for the fixture pipeline.
|
|
91
|
+
4. Drop the raw statement at `tests/<bank>/fixtures/_local/` (gitignored), then `uv run bankstract redact <bank> <raw> tests/<bank>/fixtures/sample.pdf` to produce the committable fixture.
|
|
92
|
+
5. Add tests under `tests/<bank>/test_parser.py` and `tests/<bank>/test_redactor.py`.
|
|
93
|
+
|
|
94
|
+
CI runs `ruff` + `pyright` (strict) + `pytest`. All three must pass clean. Reconciliation invariant must hold on every fixture.
|
|
95
|
+
|
|
96
|
+
Fixture PDFs must be redacted: account numbers, names, addresses, transaction IDs scrubbed. Never commit unredacted statements.
|
|
97
|
+
|
|
98
|
+
## License
|
|
99
|
+
|
|
100
|
+
MIT. Author: [logickoder](https://github.com/logickoder).
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["hatchling"]
|
|
3
|
+
build-backend = "hatchling.build"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "bankstract"
|
|
7
|
+
version = "0.2.0"
|
|
8
|
+
description = "Convert Nigerian bank PDF statements into structured CSV."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
license = { file = "LICENSE" }
|
|
11
|
+
requires-python = ">=3.11"
|
|
12
|
+
authors = [{ name = "Jeffery Orazulike" }]
|
|
13
|
+
keywords = ["pdf", "csv", "bank", "statement", "nigeria"]
|
|
14
|
+
classifiers = [
|
|
15
|
+
"Development Status :: 3 - Alpha",
|
|
16
|
+
"License :: OSI Approved :: MIT License",
|
|
17
|
+
"Programming Language :: Python :: 3.11",
|
|
18
|
+
"Programming Language :: Python :: 3.12",
|
|
19
|
+
"Topic :: Office/Business :: Financial",
|
|
20
|
+
]
|
|
21
|
+
dependencies = [
|
|
22
|
+
"click>=8.1",
|
|
23
|
+
"pdfplumber>=0.11",
|
|
24
|
+
"pydantic>=2.6",
|
|
25
|
+
"pymupdf>=1.27.2",
|
|
26
|
+
]
|
|
27
|
+
|
|
28
|
+
[project.optional-dependencies]
|
|
29
|
+
ocr = ["pytesseract>=0.3.10", "Pillow>=10.0"]
|
|
30
|
+
camelot = ["camelot-py[cv]>=0.11"]
|
|
31
|
+
dev = [
|
|
32
|
+
"pytest>=8.0",
|
|
33
|
+
"ruff>=0.5",
|
|
34
|
+
"build>=1.0",
|
|
35
|
+
]
|
|
36
|
+
|
|
37
|
+
[project.scripts]
|
|
38
|
+
bankstract = "bankstract.cli:main"
|
|
39
|
+
|
|
40
|
+
[project.urls]
|
|
41
|
+
Homepage = "https://github.com/logickoder/bankstract"
|
|
42
|
+
Issues = "https://github.com/logickoder/bankstract/issues"
|
|
43
|
+
|
|
44
|
+
[tool.hatch.build.targets.wheel]
|
|
45
|
+
packages = ["src/bankstract"]
|
|
46
|
+
|
|
47
|
+
[tool.hatch.build.targets.sdist]
|
|
48
|
+
# Fixture PDFs in tests/ are dev-only redacted samples; exclude from the
|
|
49
|
+
# published sdist to keep the upload lean.
|
|
50
|
+
include = ["src", "README.md", "LICENSE", "PRD.md", "pyproject.toml", "uv.lock"]
|
|
51
|
+
|
|
52
|
+
[tool.ruff]
|
|
53
|
+
line-length = 100
|
|
54
|
+
target-version = "py311"
|
|
55
|
+
|
|
56
|
+
[tool.ruff.lint]
|
|
57
|
+
select = ["E", "F", "I", "B", "UP", "W", "N"]
|
|
58
|
+
ignore = ["E501"]
|
|
59
|
+
|
|
60
|
+
[tool.pytest.ini_options]
|
|
61
|
+
testpaths = ["tests"]
|
|
62
|
+
addopts = "-ra"
|
|
63
|
+
|
|
64
|
+
[dependency-groups]
|
|
65
|
+
dev = [
|
|
66
|
+
"pre-commit>=4.6.0",
|
|
67
|
+
"pyright>=1.1.410",
|
|
68
|
+
]
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "0.2.0"
|
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Shared PDF-layout primitives used by both the parser and redactor stacks.
|
|
3
|
+
|
|
4
|
+
A `Word` is the canonical typed token. pymupdf returns word tuples; pdfplumber
|
|
5
|
+
returns dicts. Both are converted to `Word` at the boundary so downstream code
|
|
6
|
+
stays strictly typed.
|
|
7
|
+
|
|
8
|
+
`classify` and `group_by_baseline` are intentionally bank-agnostic — they
|
|
9
|
+
operate on shapes, not vocabulary. Bank-specific dictionaries
|
|
10
|
+
(NARRATION_PHRASES, HEADER_LABELS, etc.) live with their consuming module.
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
import re
|
|
16
|
+
from dataclasses import dataclass
|
|
17
|
+
from typing import Any, Literal
|
|
18
|
+
|
|
19
|
+
# Closed set of labels that `classify` can return for a single token.
TokenKind = Literal["blank", "date", "time", "ampm", "amount", "alnum", "text"]

# Date shapes: two digits / two digits / four digits, or two digits, three
# letters and four digits joined by hyphens (e.g. "01/02/2024", "01-Jan-2024").
# Only the shape is checked; values are not range-validated.
DATE_TOK = re.compile(r"(?:\d{2}/\d{2}/\d{4}|\d{2}-[A-Za-z]{3}-\d{4})")
# Time shape: three colon-separated two-digit groups (e.g. "09:15:30").
TIME_TOK = re.compile(r"\d{2}:\d{2}:\d{2}")
# Amount: optional sign, a digit, any mix of digits and commas, then a dot and
# exactly two decimals (e.g. "-1,250.00").
AMOUNT_TOK = re.compile(r"[+-]?\d[\d,]*\.\d{2}")
# Same numeric shape as AMOUNT_TOK but prefixed with the Naira sign and with no
# sign character allowed.
NAIRA_TOK = re.compile(r"₦\d[\d,]*\.\d{2}")
# Identifier-like token: six or more letters, digits or underscores. `classify`
# additionally requires at least one digit before labelling it "alnum".
TXID_TOK = re.compile(r"[A-Za-z0-9_]{6,}")
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
@dataclass(frozen=True, slots=True)
class Word:
    """A single extracted PDF word: its text plus a bounding box.

    Instances are immutable (``frozen=True``) and use ``__slots__``. Both
    the pymupdf and pdfplumber adapters in this module build `Word` objects
    so that downstream code works with one typed shape.
    """

    text: str  # the word's text content
    x0: float  # horizontal start of the box (pymupdf index 0 / pdfplumber "x0")
    top: float  # vertical start of the box (pymupdf y0 / pdfplumber "top")
    x1: float  # horizontal end of the box (pymupdf index 2 / pdfplumber "x1")
    bottom: float  # vertical end of the box (pymupdf y1 / pdfplumber "bottom")
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def classify(text: str) -> TokenKind:
    """Label a single token by its shape.

    Checks run in a fixed order and the first hit wins: empty token, date,
    time, the literal ``AM`` / ``PM`` markers, amount (plain or
    Naira-prefixed), then an identifier-like token containing at least one
    digit. Anything that matches none of these is ``"text"``. Every regex
    must match the whole token, not just a prefix.
    """
    # Handle the empty token before touching any pattern.
    if not text:
        return "blank"

    kind: TokenKind = "text"
    if DATE_TOK.fullmatch(text):
        kind = "date"
    elif TIME_TOK.fullmatch(text):
        kind = "time"
    elif text == "AM" or text == "PM":
        kind = "ampm"
    elif AMOUNT_TOK.fullmatch(text) or NAIRA_TOK.fullmatch(text):
        kind = "amount"
    elif TXID_TOK.fullmatch(text) and any(ch.isdigit() for ch in text):
        # Pure-letter identifiers stay "text"; a digit is what marks an id.
        kind = "alnum"
    return kind
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def group_by_baseline(words: list[Word], tol: float) -> list[list[Word]]:
    """Cluster words into visual rows by their ``top`` coordinate.

    Words are first ordered by a coarse vertical bucket (``top`` rounded to
    the nearest multiple of ``tol``) and then by ``x0``. Walking that order,
    a word joins the current row when its ``top`` is within ``tol`` of the
    word appended immediately before it; otherwise it opens a new row.

    The comparison is deliberately against the most recently appended word
    rather than the row's first word: layouts such as PalmPay place the
    columns of one row at slightly different heights (the txid often sits
    ~4 pt above the date baseline), so ``top`` drifts within a row and
    anchoring on the first word would split the row.

    Returns the rows in walk order, each row sorted left to right by ``x0``.
    """

    def bucket_then_x(word: Word) -> tuple[float, float]:
        return (round(word.top / tol) * tol, word.x0)

    grouped: list[list[Word]] = []
    previous: Word | None = None
    for word in sorted(words, key=bucket_then_x):
        if previous is not None and abs(previous.top - word.top) <= tol:
            grouped[-1].append(word)
        else:
            grouped.append([word])
        # `previous` always mirrors the last element of the last row.
        previous = word

    return [sorted(line, key=lambda item: item.x0) for line in grouped]
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def from_pymupdf_words(raw: Any) -> list[Word]:
    """Convert pymupdf word tuples into `Word` objects.

    Each item of ``raw`` is read positionally as
    ``(x0, y0, x1, y1, text, block, line, word_no)``; only the first five
    entries are used. Coordinates are coerced with ``float`` and the text
    with ``str``, so the result is strictly typed regardless of what the
    library handed back.
    """
    converted: list[Word] = []
    for entry in raw:
        converted.append(
            Word(
                text=str(entry[4]),
                x0=float(entry[0]),
                top=float(entry[1]),
                x1=float(entry[2]),
                bottom=float(entry[3]),
            )
        )
    return converted
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def from_pdfplumber_words(raw: Any) -> list[Word]:
    """Convert pdfplumber word dicts into `Word` objects.

    Each item of ``raw`` is read by the keys ``"text"``, ``"x0"``,
    ``"top"``, ``"x1"`` and ``"bottom"``. Coordinates are coerced with
    ``float`` and the text with ``str``.
    """
    converted: list[Word] = []
    for entry in raw:
        converted.append(
            Word(
                text=str(entry["text"]),
                x0=float(entry["x0"]),
                top=float(entry["top"]),
                x1=float(entry["x1"]),
                bottom=float(entry["bottom"]),
            )
        )
    return converted
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Typed facade over pdfplumber.
|
|
3
|
+
|
|
4
|
+
Same rationale as _pymupdf: restrict the untyped third-party surface to one
|
|
5
|
+
file so the rest of the codebase stays clean under pyright/Pylance strict.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from contextlib import contextmanager
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
from typing import Any
|
|
13
|
+
|
|
14
|
+
import pdfplumber as _pdfplumber # type: ignore[import-untyped]
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
@contextmanager
def open_doc(path: Path) -> Any:
    """Open ``path`` with pdfplumber and yield the document handle.

    The document is closed when the ``with`` block using this helper exits,
    whether it finishes normally or raises. The handle is typed ``Any``
    because pdfplumber ships no type information; keeping that here stops
    the untyped surface from leaking into the rest of the package.
    """
    # pdfplumber's document object is itself a context manager whose exit
    # closes the document, so delegate the cleanup to it.
    with _pdfplumber.open(str(path)) as pdf:
        yield pdf
|