bankstract 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,26 @@
1
+ __pycache__/
2
+ *.py[cod]
3
+ *.egg-info/
4
+ .eggs/
5
+ .pytest_cache/
6
+ .ruff_cache/
7
+ .coverage
8
+ htmlcov/
9
+ dist/
10
+ build/
11
+ .venv/
12
+ venv/
13
+ env/
14
+ .python-version
15
+ .env
16
+ .DS_Store
17
+ .idea/
18
+ .vscode/
19
+ *.swp
20
+
21
+ # Fixture privacy — never commit unredacted statements.
22
+ # Drop real PDFs into tests/<bank>/fixtures/_local/ for dev only.
23
+ tests/**/fixtures/_local/
24
+ tests/**/fixtures/*real*.pdf
25
+ tests/**/fixtures/*unredacted*.pdf
26
+ tests/**/fixtures/*raw*.pdf
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Jeffery Orazulike
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,151 @@
1
+ Metadata-Version: 2.4
2
+ Name: bankstract
3
+ Version: 0.2.0
4
+ Summary: Convert Nigerian bank PDF statements into structured CSV.
5
+ Project-URL: Homepage, https://github.com/logickoder/bankstract
6
+ Project-URL: Issues, https://github.com/logickoder/bankstract/issues
7
+ Author: Jeffery Orazulike
8
+ License: MIT License
9
+
10
+ Copyright (c) 2026 Jeffery Orazulike
11
+
12
+ Permission is hereby granted, free of charge, to any person obtaining a copy
13
+ of this software and associated documentation files (the "Software"), to deal
14
+ in the Software without restriction, including without limitation the rights
15
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
16
+ copies of the Software, and to permit persons to whom the Software is
17
+ furnished to do so, subject to the following conditions:
18
+
19
+ The above copyright notice and this permission notice shall be included in all
20
+ copies or substantial portions of the Software.
21
+
22
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
23
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
24
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
25
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
26
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
27
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
28
+ SOFTWARE.
29
+ License-File: LICENSE
30
+ Keywords: bank,csv,nigeria,pdf,statement
31
+ Classifier: Development Status :: 3 - Alpha
32
+ Classifier: License :: OSI Approved :: MIT License
33
+ Classifier: Programming Language :: Python :: 3.11
34
+ Classifier: Programming Language :: Python :: 3.12
35
+ Classifier: Topic :: Office/Business :: Financial
36
+ Requires-Python: >=3.11
37
+ Requires-Dist: click>=8.1
38
+ Requires-Dist: pdfplumber>=0.11
39
+ Requires-Dist: pydantic>=2.6
40
+ Requires-Dist: pymupdf>=1.27.2
41
+ Provides-Extra: camelot
42
+ Requires-Dist: camelot-py[cv]>=0.11; extra == 'camelot'
43
+ Provides-Extra: dev
44
+ Requires-Dist: build>=1.0; extra == 'dev'
45
+ Requires-Dist: pytest>=8.0; extra == 'dev'
46
+ Requires-Dist: ruff>=0.5; extra == 'dev'
47
+ Provides-Extra: ocr
48
+ Requires-Dist: pillow>=10.0; extra == 'ocr'
49
+ Requires-Dist: pytesseract>=0.3.10; extra == 'ocr'
50
+ Description-Content-Type: text/markdown
51
+
52
+ # bankstract
53
+
54
+ Convert Nigerian bank PDF statements into structured CSV. Plugin architecture — one parser per bank.
55
+
56
+ ```bash
57
+ pip install bankstract
58
+
59
+ bankstract palmpay statement.pdf -o out.csv
60
+ bankstract auto unknown.pdf -o out.csv
61
+ bankstract list
62
+ ```
63
+
64
+ ## Status
65
+
66
+ | Bank | Status |
67
+ | ---------- | ------------- |
68
+ | PalmPay | v0.1 — alpha |
69
+ | First Bank | v0.1 — alpha |
70
+
71
+ ## Install
72
+
73
+ ```bash
74
+ pip install bankstract
75
+ ```
76
+
77
+ Optional extras:
78
+
79
+ ```bash
80
+ pip install "bankstract[ocr]" # pytesseract for scanned PDFs
81
+ pip install "bankstract[camelot]" # camelot lattice fallback
82
+ ```
83
+
84
+ ## Develop
85
+
86
+ Project uses [uv](https://docs.astral.sh/uv/) for dependency + venv management.
87
+
88
+ ```bash
89
+ uv sync --all-extras # create .venv, install deps + extras from uv.lock
90
+ uv run pre-commit install # one-time: enable the pre-commit hook
91
+ uv run pytest # run tests
92
+ uv run ruff check src tests
93
+ uv run pyright src tests # strict type check (see CLAUDE.md directive 8)
94
+ uv run bankstract list # invoke CLI
95
+ ```
96
+
97
+ Add a dependency with `uv add <pkg>` (dev: `uv add --dev <pkg>`). Commit `uv.lock`.
98
+
99
+ The pre-commit hook runs `ruff check`, `ruff format --check`, `pyright` (strict), and `pytest` before every commit. Bypass only in genuine emergencies with `git commit --no-verify`; the same checks run again in CI.
100
+
101
+ ### Releasing
102
+
103
+ CI publishes to PyPI automatically on push to `main` via `.github/workflows/publish.yml`. The workflow runs the full gate (ruff + pyright + pytest), and if the current `pyproject.toml` version already exists on PyPI it auto-bumps the minor component and commits the bump before publishing. PyPI auth uses OIDC trusted publishing — no token in repo or CI secrets.
104
+
105
+ To prepare a release locally:
106
+
107
+ ```bash
108
+ scripts/bump-version.sh # patch bump
109
+ scripts/bump-version.sh minor # 0.2.x -> 0.3.0
110
+ scripts/bump-version.sh major # 0.x.x -> 1.0.0
111
+ scripts/bump-version.sh 0.3.0 # exact set
112
+ uv build # dist/*.whl + dist/*.tar.gz
113
+ uv publish dist/* # only if not using the GH workflow; needs --token or UV_PUBLISH_TOKEN
114
+ ```
115
+
116
+ Trusted-publisher setup (one-time, owner only): create a publisher at <https://pypi.org/manage/account/publishing/> with workflow `publish.yml`, repo `logickoder/bankstract`.
117
+
118
+ ## Usage
119
+
120
+ ```bash
121
+ bankstract <bank> <pdf> -o <csv> # explicit parser
122
+ bankstract auto <pdf> -o <csv> # auto-detect via Parser.detect()
123
+ bankstract list # show registered parsers
124
+ ```
125
+
126
+ Unparseable blocks are written to a `.log` sidecar next to the output CSV.
127
+
128
+ ## Reconciliation invariant
129
+
130
+ Two complementary checks; the CLI picks whichever applies per bank.
131
+
132
+ - **Row-wise** (banks that print a running balance): `prev.balance ± debit/credit == curr.balance`. Mismatch raises `ReconciliationError` with the row index.
133
+ - **Totals-based** (banks like PalmPay that omit a balance column): the parser reads `Total Money In` / `Total Money Out` from the statement header and the CLI asserts that the sum of parsed credits/debits equals those totals.
134
+
135
+ Both modes exist to catch silently-dropped rows — the failure mode of naive PDF parsers.
136
+
137
+ ## Contributing a bank parser
138
+
139
+ 1. Copy `src/bankstract/parsers/palmpay.py` to `src/bankstract/parsers/<bank>.py`.
140
+ 2. Implement `detect()` and `parse() -> ParseResult` from `parsers/base.py`. Populate `total_credit` / `total_debit` if the statement only ships header totals.
141
+ 3. Add a `Redactor` subclass under `src/bankstract/redactors/<bank>.py` for the fixture pipeline.
142
+ 4. Drop the raw statement at `tests/<bank>/fixtures/_local/` (gitignored), then `uv run bankstract redact <bank> <raw> tests/<bank>/fixtures/sample.pdf` to produce the committable fixture.
143
+ 5. Add tests under `tests/<bank>/test_parser.py` and `tests/<bank>/test_redactor.py`.
144
+
145
+ CI runs `ruff` + `pyright` (strict) + `pytest`. All three must pass clean. Reconciliation invariant must hold on every fixture.
146
+
147
+ Fixture PDFs must be redacted: account numbers, names, addresses, transaction IDs scrubbed. Never commit unredacted statements.
148
+
149
+ ## License
150
+
151
+ MIT. Author: [logickoder](https://github.com/logickoder).
@@ -0,0 +1,184 @@
1
+ # bankstract — PRD
2
+
3
+ **Status:** concept · v0.1 target
4
+ **License:** MIT
5
+ **Stack:** Python 3.11+
6
+
7
+ ---
8
+
9
+ ## What
10
+
11
+ Public Python CLI + library that converts Nigerian bank PDF statements into structured CSV. Plugin architecture — one parser module per bank.
12
+
13
+ ```bash
14
+ bankstract palmpay statement.pdf -o out.csv
15
+ bankstract auto unknown.pdf -o out.csv # auto-detect bank
16
+ bankstract fbn scanned.pdf -o out.csv --ocr # force OCR
17
+ bankstract list # show registered parsers
18
+ ```
19
+
20
+ ## Why
21
+
22
+ Every Nigerian dev solves this once, badly, in private. Banks export PDFs; tools like BudgetBakers, YNAB, Notion, Google Sheets want CSV. Manual entry costs more than the visibility is worth, so trackers go stale.
23
+
24
+ bankstract closes that gap with one clean tool, one plugin contract, and community-driven bank coverage.
25
+
26
+ ## Scope
27
+
28
+ | Version | Coverage |
29
+ | ------- | -------------------------------------------------------------------------------- |
30
+ | v0.1 | PalmPay only. CLI + plugin contract + reconciliation + tests + CI. PyPI release. |
31
+ | v0.2 | First Bank parser + OCR fallback for scanned statements. |
32
+ | v0.3+ | GTB, Kuda, Opay, Stanbic, Wise, Bamboo, Risevest — community PRs. |
33
+
34
+ **Out of scope:** category inference, ML-based parsing, GUI, pushing data into third-party trackers (those belong in downstream tools).
35
+
36
+ ## Architecture
37
+
38
+ ### Stack rationale
39
+
40
+ - **Python** over Node — `pdfplumber` and `camelot-py` are best-in-class table extractors; `pytesseract` is the cleanest OCR binding. Node alternatives are weaker at table extraction and slower at OCR.
41
+ - **pdfplumber primary, camelot fallback** — pdfplumber for table extraction from text-layer PDFs; camelot lattice mode for messy ruled tables; pytesseract only when the text layer is absent (scanned PDFs).
42
+ - **pymupdf for true redaction** — `apply_redactions()` rewrites the PDF content stream rather than visually overlaying, so fixture PDFs contain no recoverable PII.
43
+ - **pydantic schema** — runtime validation + clean JSON Schema export for downstream tools.
44
+ - **click CLI** — standard, autocomplete-friendly, low boilerplate.
45
+
46
+ ### Plugin contract
47
+
48
+ Every bank is a parser module implementing the `Parser` ABC.
49
+
50
+ ```python
51
+ class Parser(ABC):
52
+ bank: str # module-level identifier (e.g. "palmpay", "fbn")
53
+
54
+ @abstractmethod
55
+ def detect(self, pdf_path: Path) -> bool:
56
+ """Return True if this parser handles the given PDF (header/logo/text-marker match)."""
57
+
58
+ @abstractmethod
59
+ def parse(self, pdf_path: Path) -> ParseResult:
60
+ """Extract transactions and header totals. Raise ParseError on format mismatch."""
61
+ ```
62
+
63
+ Parsers live in `src/bankstract/parsers/<bank>.py` and self-register via import side-effect in `parsers/__init__.py`. A parallel `Redactor` plugin tree under `src/bankstract/redactors/<bank>.py` produces committable fixtures from raw statements.
64
+
65
+ ### Transaction + ParseResult schema
66
+
67
+ ```python
68
+ class Transaction(BaseModel):
69
+ date: date
70
+ narration: str
71
+ debit: Decimal = Decimal("0")
72
+ credit: Decimal = Decimal("0")
73
+ balance: Decimal | None = None # None when the statement omits a running balance
74
+ reference: str | None = None # bank transaction ID
75
+ currency: str = "NGN"
76
+
77
+
78
+ @dataclass
79
+ class ParseResult:
80
+ transactions: list[Transaction]
81
+ total_credit: Decimal | None = None # from statement header
82
+ total_debit: Decimal | None = None
83
+ format_version: str | None = None
84
+ ```
85
+
86
+ Amounts are stored as `Decimal` (not float — financial precision). The Naira sign is stripped before parsing.
87
+
88
+ ### Reconciliation invariant
89
+
90
+ Two complementary checks:
91
+
92
+ - **Row-wise** (`reconcile()`): `prev.balance ± debit/credit == curr.balance`. Used when the statement carries a per-row running balance. Mismatch raises `ReconciliationError` with the row index.
93
+ - **Totals-based** (`verify_totals()`): sum of parsed credits/debits equals header `Total Money In` / `Total Money Out`. Used when the statement omits a running balance (e.g. PalmPay). Parsers MUST populate `ParseResult.total_credit` / `ParseResult.total_debit` in that case; otherwise reconciliation is silently skipped, which violates project directive 2 (see CLAUDE.md).
94
+
95
+ Both modes catch silently-dropped rows, which is the failure mode of every naive PDF parser.
96
+
97
+ ### Failure handling
98
+
99
+ - **Unparseable blocks** go to a `.log` sidecar file. Never silently dropped.
100
+ - **Format-version drift** — each parser logs a detected `format_version` at run start. Parse errors include the detected version, so issue reports are actionable.
101
+
102
+ ### Repo layout
103
+
104
+ ```
105
+ bankstract/
106
+ ├── pyproject.toml hatchling backend, ruff + pytest + pyright config
107
+ ├── pyrightconfig.json IDE-side mirror of [tool.pyright]
108
+ ├── uv.lock uv-managed lockfile
109
+ ├── README.md
110
+ ├── PRD.md
111
+ ├── LICENSE MIT
112
+ ├── src/
113
+ │ └── bankstract/ standard src-layout package
114
+ │ ├── cli.py
115
+ │ ├── schema.py Transaction + ParseResult + errors
116
+ │ ├── reconcile.py reconcile() + verify_totals()
117
+ │ ├── _layout.py Word dataclass + classify + Y-grouping (shared)
118
+ │ ├── _pymupdf.py typed facade over pymupdf
119
+ │ ├── _pdfplumber.py typed facade over pdfplumber
120
+ │ ├── writers/csv.py
121
+ │ ├── parsers/
122
+ │ │ ├── __init__.py registry (import side-effect)
123
+ │ │ ├── base.py Parser ABC
124
+ │ │ ├── palmpay.py
125
+ │ │ └── fbn.py
126
+ │ └── redactors/
127
+ │ ├── __init__.py registry (import side-effect)
128
+ │ ├── base.py Redactor ABC + RedactReport (template-method)
129
+ │ ├── _shared.py shared redact primitives
130
+ │ ├── palmpay.py
131
+ │ └── fbn.py
132
+ ├── tests/
133
+ │ ├── test_reconcile.py bank-agnostic
134
+ │ └── <bank>/ one folder per bank
135
+ │ ├── test_parser.py
136
+ │ ├── test_redactor.py
137
+ │ └── fixtures/
138
+ │ ├── sample.pdf redacted sample (committed)
139
+ │ └── _local/ gitignored: raw statements for dev
140
+ └── .github/workflows/ci.yml uv + ruff + pyright + pytest
141
+ ```
142
+
143
+ ## CLI surface
144
+
145
+ ```bash
146
+ bankstract <bank> <pdf> -o <csv> # explicit parser
147
+ bankstract auto <pdf> -o <csv> # detect via Parser.detect()
148
+ bankstract <bank> <pdf> -o <csv> --ocr # force OCR path
149
+ bankstract list # show registered parsers + status
150
+ ```
151
+
152
+ ## Risks
153
+
154
+ | Risk | Mitigation |
155
+ | ---------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------- |
156
+ | Statement format drift — banks rev PDFs annually. | Per-parser `format_version` detection + log on parse error. Per-bank fixture suite in tests. |
157
+ | OCR accuracy on scanned statements — ₦ / N confusion, comma-separator drift. | Post-OCR regex normalization. Reconciliation invariant catches arithmetic errors before they ship. |
158
+ | Charset edge cases — `₦` decodes differently across PDF producers. | Strip currency symbols and store as `Decimal`. |
159
+ | Fixture privacy — sample PDFs contain PII. | All fixtures must be anonymized: account numbers, names, addresses scrubbed. Never commit unredacted PDFs. |
160
+
161
+ ## Roadmap
162
+
163
+ - [ ] `pyproject.toml` + Parser ABC + Transaction schema + csv writer + reconciliation
164
+ - [ ] PalmPay parser + 1 anonymized fixture + test
165
+ - [ ] CLI wrapper + auto-detect
166
+ - [ ] README + LICENSE + CI
167
+ - [ ] **v0.1.0** — PyPI release, PalmPay only, FBN marked in progress
168
+ - [ ] First Bank parser + OCR fallback → **v0.2.0**
169
+ - [ ] Open issues for next 5 banks; invite contributors
170
+
171
+ ## Contributing
172
+
173
+ Add a bank in four steps:
174
+
175
+ 1. Copy `src/bankstract/parsers/palmpay.py` to `src/bankstract/parsers/<your_bank>.py` and implement `detect()` + `parse() -> ParseResult`.
176
+ 2. Copy `src/bankstract/redactors/palmpay.py` to `src/bankstract/redactors/<your_bank>.py` for the fixture pipeline.
177
+ 3. Drop the raw statement in `tests/<your_bank>/fixtures/_local/` (gitignored), run `uv run bankstract redact <your_bank> <raw> tests/<your_bank>/fixtures/sample.pdf`, eyeball the output, commit the redacted sample.
178
+ 4. Add tests in `tests/<your_bank>/test_parser.py` and `tests/<your_bank>/test_redactor.py`.
179
+
180
+ CI runs `ruff` + `pyright` (strict) + `pytest`. All three must pass. Reconciliation invariant must hold on every fixture.
181
+
182
+ ## License
183
+
184
+ MIT. See `LICENSE`.
@@ -0,0 +1,100 @@
1
+ # bankstract
2
+
3
+ Convert Nigerian bank PDF statements into structured CSV. Plugin architecture — one parser per bank.
4
+
5
+ ```bash
6
+ pip install bankstract
7
+
8
+ bankstract palmpay statement.pdf -o out.csv
9
+ bankstract auto unknown.pdf -o out.csv
10
+ bankstract list
11
+ ```
12
+
13
+ ## Status
14
+
15
+ | Bank | Status |
16
+ | ---------- | ------------- |
17
+ | PalmPay | v0.1 — alpha |
18
+ | First Bank | v0.1 — alpha |
19
+
20
+ ## Install
21
+
22
+ ```bash
23
+ pip install bankstract
24
+ ```
25
+
26
+ Optional extras:
27
+
28
+ ```bash
29
+ pip install "bankstract[ocr]" # pytesseract for scanned PDFs
30
+ pip install "bankstract[camelot]" # camelot lattice fallback
31
+ ```
32
+
33
+ ## Develop
34
+
35
+ Project uses [uv](https://docs.astral.sh/uv/) for dependency + venv management.
36
+
37
+ ```bash
38
+ uv sync --all-extras # create .venv, install deps + extras from uv.lock
39
+ uv run pre-commit install # one-time: enable the pre-commit hook
40
+ uv run pytest # run tests
41
+ uv run ruff check src tests
42
+ uv run pyright src tests # strict type check (see CLAUDE.md directive 8)
43
+ uv run bankstract list # invoke CLI
44
+ ```
45
+
46
+ Add a dependency with `uv add <pkg>` (dev: `uv add --dev <pkg>`). Commit `uv.lock`.
47
+
48
+ The pre-commit hook runs `ruff check`, `ruff format --check`, `pyright` (strict), and `pytest` before every commit. Bypass only in genuine emergencies with `git commit --no-verify`; the same checks run again in CI.
49
+
50
+ ### Releasing
51
+
52
+ CI publishes to PyPI automatically on push to `main` via `.github/workflows/publish.yml`. The workflow runs the full gate (ruff + pyright + pytest), and if the current `pyproject.toml` version already exists on PyPI it auto-bumps the minor component and commits the bump before publishing. PyPI auth uses OIDC trusted publishing — no token in repo or CI secrets.
53
+
54
+ To prepare a release locally:
55
+
56
+ ```bash
57
+ scripts/bump-version.sh # patch bump
58
+ scripts/bump-version.sh minor # 0.2.x -> 0.3.0
59
+ scripts/bump-version.sh major # 0.x.x -> 1.0.0
60
+ scripts/bump-version.sh 0.3.0 # exact set
61
+ uv build # dist/*.whl + dist/*.tar.gz
62
+ uv publish dist/* # only if not using the GH workflow; needs --token or UV_PUBLISH_TOKEN
63
+ ```
64
+
65
+ Trusted-publisher setup (one-time, owner only): create a publisher at <https://pypi.org/manage/account/publishing/> with workflow `publish.yml`, repo `logickoder/bankstract`.
66
+
67
+ ## Usage
68
+
69
+ ```bash
70
+ bankstract <bank> <pdf> -o <csv> # explicit parser
71
+ bankstract auto <pdf> -o <csv> # auto-detect via Parser.detect()
72
+ bankstract list # show registered parsers
73
+ ```
74
+
75
+ Unparseable blocks are written to a `.log` sidecar next to the output CSV.
76
+
77
+ ## Reconciliation invariant
78
+
79
+ Two complementary checks; the CLI picks whichever applies per bank.
80
+
81
+ - **Row-wise** (banks that print a running balance): `prev.balance ± debit/credit == curr.balance`. Mismatch raises `ReconciliationError` with the row index.
82
+ - **Totals-based** (banks like PalmPay that omit a balance column): the parser reads `Total Money In` / `Total Money Out` from the statement header and the CLI asserts that the sum of parsed credits/debits equals those totals.
83
+
84
+ Both modes exist to catch silently-dropped rows — the failure mode of naive PDF parsers.
85
+
86
+ ## Contributing a bank parser
87
+
88
+ 1. Copy `src/bankstract/parsers/palmpay.py` to `src/bankstract/parsers/<bank>.py`.
89
+ 2. Implement `detect()` and `parse() -> ParseResult` from `parsers/base.py`. Populate `total_credit` / `total_debit` if the statement only ships header totals.
90
+ 3. Add a `Redactor` subclass under `src/bankstract/redactors/<bank>.py` for the fixture pipeline.
91
+ 4. Drop the raw statement at `tests/<bank>/fixtures/_local/` (gitignored), then `uv run bankstract redact <bank> <raw> tests/<bank>/fixtures/sample.pdf` to produce the committable fixture.
92
+ 5. Add tests under `tests/<bank>/test_parser.py` and `tests/<bank>/test_redactor.py`.
93
+
94
+ CI runs `ruff` + `pyright` (strict) + `pytest`. All three must pass clean. Reconciliation invariant must hold on every fixture.
95
+
96
+ Fixture PDFs must be redacted: account numbers, names, addresses, transaction IDs scrubbed. Never commit unredacted statements.
97
+
98
+ ## License
99
+
100
+ MIT. Author: [logickoder](https://github.com/logickoder).
@@ -0,0 +1,68 @@
1
+ [build-system]
2
+ requires = ["hatchling"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "bankstract"
7
+ version = "0.2.0"
8
+ description = "Convert Nigerian bank PDF statements into structured CSV."
9
+ readme = "README.md"
10
+ license = { file = "LICENSE" }
11
+ requires-python = ">=3.11"
12
+ authors = [{ name = "Jeffery Orazulike" }]
13
+ keywords = ["pdf", "csv", "bank", "statement", "nigeria"]
14
+ classifiers = [
15
+ "Development Status :: 3 - Alpha",
16
+ "License :: OSI Approved :: MIT License",
17
+ "Programming Language :: Python :: 3.11",
18
+ "Programming Language :: Python :: 3.12",
19
+ "Topic :: Office/Business :: Financial",
20
+ ]
21
+ dependencies = [
22
+ "click>=8.1",
23
+ "pdfplumber>=0.11",
24
+ "pydantic>=2.6",
25
+ "pymupdf>=1.27.2",
26
+ ]
27
+
28
+ [project.optional-dependencies]
29
+ ocr = ["pytesseract>=0.3.10", "Pillow>=10.0"]
30
+ camelot = ["camelot-py[cv]>=0.11"]
31
+ dev = [
32
+ "pytest>=8.0",
33
+ "ruff>=0.5",
34
+ "build>=1.0",
35
+ ]
36
+
37
+ [project.scripts]
38
+ bankstract = "bankstract.cli:main"
39
+
40
+ [project.urls]
41
+ Homepage = "https://github.com/logickoder/bankstract"
42
+ Issues = "https://github.com/logickoder/bankstract/issues"
43
+
44
+ [tool.hatch.build.targets.wheel]
45
+ packages = ["src/bankstract"]
46
+
47
+ [tool.hatch.build.targets.sdist]
48
+ # Fixture PDFs in tests/ are dev-only redacted samples. The allow-list below
49
+ # omits tests/ so they stay out of the published sdist and the upload stays lean.
50
+ include = ["src", "README.md", "LICENSE", "PRD.md", "pyproject.toml", "uv.lock"]
51
+
52
+ [tool.ruff]
53
+ line-length = 100
54
+ target-version = "py311"
55
+
56
+ [tool.ruff.lint]
57
+ select = ["E", "F", "I", "B", "UP", "W", "N"]
58
+ ignore = ["E501"]
59
+
60
+ [tool.pytest.ini_options]
61
+ testpaths = ["tests"]
62
+ addopts = "-ra"
63
+
64
+ [dependency-groups]
65
+ dev = [
66
+ "pre-commit>=4.6.0",
67
+ "pyright>=1.1.410",
68
+ ]
@@ -0,0 +1 @@
1
# Package version string; matches the `version` field in pyproject.toml.
__version__ = "0.2.0"
@@ -0,0 +1,91 @@
1
+ """
2
+ Shared PDF-layout primitives used by both the parser and redactor stacks.
3
+
4
+ A `Word` is the canonical typed token. pymupdf returns word tuples; pdfplumber
5
+ returns dicts. Both are converted to `Word` at the boundary so downstream code
6
+ stays strictly typed.
7
+
8
+ `classify` and `group_by_baseline` are intentionally bank-agnostic — they
9
+ operate on shapes, not vocabulary. Bank-specific dictionaries
10
+ (NARRATION_PHRASES, HEADER_LABELS, etc.) live with their consuming module.
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ import re
16
+ from dataclasses import dataclass
17
+ from typing import Any, Literal
18
+
19
# Closed set of labels that `classify` can return for a token.
TokenKind = Literal["blank", "date", "time", "ampm", "amount", "alnum", "text"]

# Date shapes: two digits / two digits / four digits, or two digits - three letters - four digits.
DATE_TOK = re.compile(r"(?:\d{2}/\d{2}/\d{4}|\d{2}-[A-Za-z]{3}-\d{4})")
# Clock time: three colon-separated two-digit groups.
TIME_TOK = re.compile(r"\d{2}:\d{2}:\d{2}")
# Optionally signed amount: digits with optional commas and exactly two decimals.
AMOUNT_TOK = re.compile(r"[+-]?\d[\d,]*\.\d{2}")
# Same amount shape prefixed by the Naira sign; no leading +/- is accepted here.
NAIRA_TOK = re.compile(r"₦\d[\d,]*\.\d{2}")
# Run of six or more letters, digits or underscores (`classify` additionally requires a digit).
TXID_TOK = re.compile(r"[A-Za-z0-9_]{6,}")
26
+
27
+
28
@dataclass(frozen=True, slots=True)
class Word:
    """Immutable text token together with its bounding-box edges.

    Instances are produced by `from_pymupdf_words` and
    `from_pdfplumber_words`, which adapt each backend's raw word records
    into this single shape.
    """

    # Token text as reported by the backend.
    text: str
    # Left edge of the bounding box.
    x0: float
    # Upper edge of the bounding box (pymupdf's y0 / pdfplumber's "top").
    top: float
    # Right edge of the bounding box.
    x1: float
    # Lower edge of the bounding box (pymupdf's y1 / pdfplumber's "bottom").
    bottom: float
35
+
36
+
37
def classify(text: str) -> TokenKind:
    """Label a single token by its shape.

    The checks run in a fixed order and the first hit wins: empty token,
    date, time, AM/PM marker, amount (plain or Naira-prefixed),
    identifier-like token that contains at least one digit, and finally
    free text as the fallback.
    """
    kind: TokenKind
    if not text:
        kind = "blank"
    elif DATE_TOK.fullmatch(text) is not None:
        kind = "date"
    elif TIME_TOK.fullmatch(text) is not None:
        kind = "time"
    elif text in ("AM", "PM"):
        kind = "ampm"
    elif AMOUNT_TOK.fullmatch(text) is not None or NAIRA_TOK.fullmatch(text) is not None:
        kind = "amount"
    elif TXID_TOK.fullmatch(text) is not None and any(ch.isdigit() for ch in text):
        # Pure-letter words of six or more characters stay "text"; a digit is required.
        kind = "alnum"
    else:
        kind = "text"
    return kind
51
+
52
+
53
def group_by_baseline(words: list[Word], tol: float) -> list[list[Word]]:
    """Group words sharing a visual baseline into rows.

    PalmPay and similar layouts place a row's date / narration / txid
    columns at slightly offset y-coordinates (txid often sits ~4 pt above
    the date baseline), so the `top` value drifts WITHIN a single visual
    row. Each candidate word is therefore compared against the LAST
    appended word's top, not the first — otherwise a row whose first word
    is at the high edge of the drift would split off the tokens at the low
    edge.

    Args:
        words: Tokens to group, in any order.
        tol: Largest allowed difference in `top` between a word and the
            previously appended word for both to share a row. With
            ``tol == 0`` only words with an identical `top` are grouped.

    Returns:
        The rows in sorted order; within each row the words are ordered
        left-to-right by `x0`. An empty input yields an empty list.
    """

    def sort_key(word: Word) -> tuple[float, float]:
        # Snap `top` to a multiple of `tol` so near-equal baselines sort next
        # to each other. With tol == 0 there is nothing to snap to, and the
        # division would raise ZeroDivisionError, so use the exact `top`.
        snapped = round(word.top / tol) * tol if tol else word.top
        return (snapped, word.x0)

    rows: list[list[Word]] = []
    for w in sorted(words, key=sort_key):
        if rows and abs(rows[-1][-1].top - w.top) <= tol:
            rows[-1].append(w)
        else:
            rows.append([w])
    # The global sort orders by snapped top first, so restore strict
    # left-to-right order inside each row.
    for row in rows:
        row.sort(key=lambda x: x.x0)
    return rows
70
+
71
+
72
def from_pymupdf_words(raw: Any) -> list[Word]:
    """Convert pymupdf word tuples into `Word` objects.

    Each entry is expected in pymupdf's
    (x0, y0, x1, y1, text, block, line, word_no) layout; only the first
    five positions are read, with y0 mapped to `top` and y1 to `bottom`.
    """
    converted: list[Word] = []
    for entry in raw:
        converted.append(
            Word(
                text=str(entry[4]),
                x0=float(entry[0]),
                top=float(entry[1]),
                x1=float(entry[2]),
                bottom=float(entry[3]),
            )
        )
    return converted
78
+
79
+
80
def from_pdfplumber_words(raw: Any) -> list[Word]:
    """Convert pdfplumber word dicts into `Word` objects.

    Reads the "text", "x0", "top", "x1" and "bottom" keys of every entry
    and coerces them to `str` / `float`.
    """
    converted: list[Word] = []
    for entry in raw:
        token = Word(
            text=str(entry["text"]),
            x0=float(entry["x0"]),
            top=float(entry["top"]),
            x1=float(entry["x1"]),
            bottom=float(entry["bottom"]),
        )
        converted.append(token)
    return converted
@@ -0,0 +1,23 @@
1
+ """
2
+ Typed facade over pdfplumber.
3
+
4
+ Same rationale as _pymupdf: restrict the untyped third-party surface to one
5
+ file so the rest of the codebase stays clean under pyright/Pylance strict.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from contextlib import contextmanager
11
+ from pathlib import Path
12
+ from typing import Any
13
+
14
+ import pdfplumber as _pdfplumber # type: ignore[import-untyped]
15
+
16
+
17
@contextmanager
def open_doc(path: Path) -> Any:
    """Open the PDF at `path` with pdfplumber and yield the opened document.

    Intended for use in a `with` statement: the document is closed when the
    block exits, whether it finishes normally or raises.
    """
    # pdfplumber is handed a plain string path rather than the Path object.
    pdf = _pdfplumber.open(str(path))
    try:
        yield pdf
    finally:
        # Runs on normal exit and on exceptions, so the document is always closed.
        pdf.close()