chkparse-1.0.0.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- chkparse-1.0.0/.gitignore +9 -0
- chkparse-1.0.0/ADDING_BANKS.md +322 -0
- chkparse-1.0.0/PKG-INFO +114 -0
- chkparse-1.0.0/README.md +93 -0
- chkparse-1.0.0/pyproject.toml +44 -0
- chkparse-1.0.0/src/chkparse/__init__.py +55 -0
- chkparse-1.0.0/src/chkparse/domain/__init__.py +0 -0
- chkparse-1.0.0/src/chkparse/domain/exceptions.py +11 -0
- chkparse-1.0.0/src/chkparse/domain/models.py +47 -0
- chkparse-1.0.0/src/chkparse/domain/validators.py +27 -0
- chkparse-1.0.0/src/chkparse/export/__init__.py +0 -0
- chkparse-1.0.0/src/chkparse/export/export_service.py +20 -0
- chkparse-1.0.0/src/chkparse/infrastructure/__init__.py +0 -0
- chkparse-1.0.0/src/chkparse/infrastructure/pdf_extractor.py +18 -0
- chkparse-1.0.0/src/chkparse/infrastructure/text_normalizer.py +25 -0
- chkparse-1.0.0/src/chkparse/parsers/__init__.py +0 -0
- chkparse-1.0.0/src/chkparse/parsers/base.py +267 -0
- chkparse-1.0.0/src/chkparse/parsers/factory.py +41 -0
- chkparse-1.0.0/src/chkparse/parsers/td_business.py +75 -0
- chkparse-1.0.0/tests/conftest.py +4 -0
- chkparse-1.0.0/tests/test_parser.py +211 -0
- chkparse-1.0.0/uv.lock +643 -0
|
chkparse-1.0.0/ADDING_BANKS.md
ADDED
|
@@ -0,0 +1,322 @@
|
|
|
1
|
+
# Adding a New Bank Parser
|
|
2
|
+
|
|
3
|
+
This guide shows how to add support for a new bank in v2.0.
|
|
4
|
+
|
|
5
|
+
---
|
|
6
|
+
|
|
7
|
+
## Example: Adding Chase Business Checking
|
|
8
|
+
|
|
9
|
+
### Step 1: Create Parser File
|
|
10
|
+
|
|
11
|
+
Create `src/chkparse/parsers/chase_business.py`:
|
|
12
|
+
|
|
13
|
+
```python
|
|
14
|
+
import re
|
|
15
|
+
from datetime import datetime, date
|
|
16
|
+
|
|
17
|
+
from ..domain.exceptions import DataIntegrityError
|
|
18
|
+
from ..domain.models import TransactionType
|
|
19
|
+
from .base import BankConfig, StatementParser
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class ChaseBusinessCheckingParser(StatementParser):
|
|
23
|
+
"""Parser for Chase Business Checking statements."""
|
|
24
|
+
|
|
25
|
+
def get_config(self) -> BankConfig:
|
|
26
|
+
return BankConfig(
|
|
27
|
+
# Chase uses different column coordinates
|
|
28
|
+
col_date=(40, 95),
|
|
29
|
+
col_desc=(100, 450),
|
|
30
|
+
col_amount=(480, 550),
|
|
31
|
+
col_serial=None, # Chase doesn't show check serial numbers
|
|
32
|
+
|
|
33
|
+
# Chase uses different patterns
|
|
34
|
+
date_pattern=r"^\d{2}/\d{2}$",
|
|
35
|
+
amount_pattern=r"^[\d,]+\.\d{2}$",
|
|
36
|
+
|
|
37
|
+
# Chase uses different labels (with spaces)
|
|
38
|
+
summary_labels={
|
|
39
|
+
"Beginning Balance": "beginning_balance",
|
|
40
|
+
"Total Deposits": "deposits",
|
|
41
|
+
"Total Electronic Deposits": "electronic_deposits",
|
|
42
|
+
"Total Credits": "other_credits",
|
|
43
|
+
"Total Checks": "checks_paid",
|
|
44
|
+
"Total Electronic Payments": "electronic_payments",
|
|
45
|
+
"Total Withdrawals": "other_withdrawals",
|
|
46
|
+
"Service Charges": "service_charges",
|
|
47
|
+
"Ending Balance": "ending_balance",
|
|
48
|
+
},
|
|
49
|
+
|
|
50
|
+
# Chase uses different section headers
|
|
51
|
+
section_headers={
|
|
52
|
+
"Deposits and Additions": TransactionType.DEPOSIT,
|
|
53
|
+
"Electronic Deposits": TransactionType.ELECTRONIC_DEPOSIT,
|
|
54
|
+
"Other Credits": TransactionType.OTHER_CREDIT,
|
|
55
|
+
"Checks": TransactionType.CHECK,
|
|
56
|
+
"Electronic Payments": TransactionType.ELECTRONIC_PAYMENT,
|
|
57
|
+
"Withdrawals": TransactionType.OTHER_WITHDRAWAL,
|
|
58
|
+
},
|
|
59
|
+
|
|
60
|
+
# Chase-specific fingerprints
|
|
61
|
+
fingerprints={"CHASE", "BUSINESS CHECKING", "Account Summary"},
|
|
62
|
+
)
|
|
63
|
+
|
|
64
|
+
def extract_header(self, rows: list[list[dict]]) -> dict:
|
|
65
|
+
"""Extract Chase-specific header metadata."""
|
|
66
|
+
result = {}
|
|
67
|
+
for row in rows:
|
|
68
|
+
joined = " ".join(w["text"] for w in row)
|
|
69
|
+
|
|
70
|
+
# Chase format: "Business Name: ACME CORP"
|
|
71
|
+
if not result.get("entity") and "Business Name:" in joined:
|
|
72
|
+
m = re.search(r"Business Name:\s*(.+?)\s*$", joined)
|
|
73
|
+
if m:
|
|
74
|
+
result["entity"] = m.group(1).strip()
|
|
75
|
+
|
|
76
|
+
# Chase format: "Account Number: 123456789"
|
|
77
|
+
if not result.get("account") and "Account Number:" in joined:
|
|
78
|
+
m = re.search(r"Account Number:\s*(\d+)", joined)
|
|
79
|
+
if m:
|
|
80
|
+
result["account"] = m.group(1)
|
|
81
|
+
result["suffix"] = m.group(1)[-4:]
|
|
82
|
+
|
|
83
|
+
# Chase format: "Statement Period: 01/26/2024 - 02/25/2024"
|
|
84
|
+
if not result.get("period") and "Statement Period:" in joined:
|
|
85
|
+
m = re.search(r"(\d{2}/\d{2}/\d{4})\s*-\s*(\d{2}/\d{2}/\d{4})", joined)
|
|
86
|
+
if m:
|
|
87
|
+
result["period"] = f"{m.group(1)}-{m.group(2)}"
|
|
88
|
+
|
|
89
|
+
if all(k in result for k in ("entity", "account", "period")):
|
|
90
|
+
break
|
|
91
|
+
|
|
92
|
+
return result
|
|
93
|
+
|
|
94
|
+
def parse_period(self, raw: str) -> tuple[date, date]:
|
|
95
|
+
"""Parse the normalized period string built by extract_header(): '01/26/2024-02/25/2024' (no spaces around the hyphen)."""
|
|
96
|
+
m = re.match(r"(\d{2}/\d{2}/\d{4})-(\d{2}/\d{2}/\d{4})", raw)
|
|
97
|
+
if not m:
|
|
98
|
+
raise DataIntegrityError(f"Cannot parse statement period: {raw!r}")
|
|
99
|
+
|
|
100
|
+
start = datetime.strptime(m.group(1), "%m/%d/%Y").date()
|
|
101
|
+
end = datetime.strptime(m.group(2), "%m/%d/%Y").date()
|
|
102
|
+
return start, end
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
### Step 2: Update Factory
|
|
106
|
+
|
|
107
|
+
Edit `src/chkparse/parsers/factory.py`:
|
|
108
|
+
|
|
109
|
+
```python
|
|
110
|
+
import pdfplumber
|
|
111
|
+
|
|
112
|
+
from ..domain.exceptions import UnsupportedFormatError
|
|
113
|
+
from .base import StatementParser
|
|
114
|
+
from .td_business import TDBusinessCheckingParser
|
|
115
|
+
from .chase_business import ChaseBusinessCheckingParser # Add import
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
class ParserFactory:
|
|
119
|
+
@staticmethod
|
|
120
|
+
def create_parser(pdf_path: str) -> StatementParser:
|
|
121
|
+
with pdfplumber.open(pdf_path) as pdf:
|
|
122
|
+
if not pdf.pages:
|
|
123
|
+
raise UnsupportedFormatError(f"{pdf_path!r} has no pages")
|
|
124
|
+
|
|
125
|
+
text = pdf.pages[0].extract_text()
|
|
126
|
+
if not text:
|
|
127
|
+
raise UnsupportedFormatError(f"{pdf_path!r} has no extractable text")
|
|
128
|
+
|
|
129
|
+
normalized = text.replace(" ", "").upper()
|
|
130
|
+
|
|
131
|
+
# TD Business Checking
|
|
132
|
+
if "TD" in normalized and "ACCOUNTSUMMARY" in normalized:
|
|
133
|
+
return TDBusinessCheckingParser()
|
|
134
|
+
|
|
135
|
+
# Chase Business Checking (NEW)
|
|
136
|
+
elif "CHASE" in normalized and "BUSINESSCHECKING" in normalized:
|
|
137
|
+
return ChaseBusinessCheckingParser()
|
|
138
|
+
|
|
139
|
+
raise UnsupportedFormatError(
|
|
140
|
+
f"{pdf_path!r} is not a recognized bank statement format. "
|
|
141
|
+
f"Currently supported: TD Business Checking, Chase Business Checking"
|
|
142
|
+
)
|
|
143
|
+
```
|
|
144
|
+
|
|
145
|
+
### Step 3: Update Public API (Optional)
|
|
146
|
+
|
|
147
|
+
Edit `src/chkparse/__init__.py`:
|
|
148
|
+
|
|
149
|
+
```python
|
|
150
|
+
from .parsers.chase_business import ChaseBusinessCheckingParser
|
|
151
|
+
|
|
152
|
+
__all__ = [
|
|
153
|
+
# ... existing exports
|
|
154
|
+
"ChaseBusinessCheckingParser", # Add to exports
|
|
155
|
+
]
|
|
156
|
+
```
|
|
157
|
+
|
|
158
|
+
### Step 4: Add Tests
|
|
159
|
+
|
|
160
|
+
Create `tests/test_chase_parser.py`:
|
|
161
|
+
|
|
162
|
+
```python
|
|
163
|
+
from chkparse import parse, ChaseBusinessCheckingParser
|
|
164
|
+
from chkparse.parsers.factory import ParserFactory
|
|
165
|
+
|
|
166
|
+
def test_factory_detects_chase():
|
|
167
|
+
parser = ParserFactory.create_parser("path/to/chase_statement.pdf")
|
|
168
|
+
assert isinstance(parser, ChaseBusinessCheckingParser)
|
|
169
|
+
|
|
170
|
+
def test_parse_chase_statement():
|
|
171
|
+
statement = parse("path/to/chase_statement.pdf")
|
|
172
|
+
assert statement.account_suffix is not None
|
|
173
|
+
assert statement.account_summary.ending_balance is not None
|
|
174
|
+
```
|
|
175
|
+
|
|
176
|
+
---
|
|
177
|
+
|
|
178
|
+
## That's It!
|
|
179
|
+
|
|
180
|
+
You've added Chase support by:
|
|
181
|
+
1. Creating one file (about 90 lines in this example; allow ~150 for a typical bank)
|
|
182
|
+
2. Implementing 3 methods
|
|
183
|
+
3. Adding an import and a two-line detection branch to the factory (plus updating its error message)
|
|
184
|
+
4. Writing tests
|
|
185
|
+
|
|
186
|
+
**No changes to:**
|
|
187
|
+
- Domain models
|
|
188
|
+
- Validators
|
|
189
|
+
- Export layer
|
|
190
|
+
- Infrastructure utilities
|
|
191
|
+
- Existing TD parser
|
|
192
|
+
|
|
193
|
+
**Time estimate:** 4-6 hours
|
|
194
|
+
|
|
195
|
+
---
|
|
196
|
+
|
|
197
|
+
## What You Need to Implement
|
|
198
|
+
|
|
199
|
+
### Required Methods (3)
|
|
200
|
+
|
|
201
|
+
1. **`get_config()`** - Return bank-specific configuration
|
|
202
|
+
- Column coordinates (analyze PDF with pdfplumber)
|
|
203
|
+
- Label mappings (from PDF text to field names)
|
|
204
|
+
- Section headers (transaction categories)
|
|
205
|
+
- Fingerprints (unique strings for detection)
|
|
206
|
+
|
|
207
|
+
2. **`extract_header()`** - Parse statement metadata
|
|
208
|
+
- Entity name
|
|
209
|
+
- Account number
|
|
210
|
+
- Statement period
|
|
211
|
+
|
|
212
|
+
3. **`parse_period()`** - Parse date range format
|
|
213
|
+
- Bank-specific date format
|
|
214
|
+
- Return (start_date, end_date)
|
|
215
|
+
|
|
216
|
+
### Inherited (Automatic)
|
|
217
|
+
|
|
218
|
+
- `parse()` - Template method (same for all banks)
|
|
219
|
+
- `parse_unvalidated()` - Diagnostic parsing
|
|
220
|
+
- `extract_account_summary()` - Uses your config
|
|
221
|
+
- `extract_transactions()` - Uses your config
|
|
222
|
+
- `assert_format()` - Uses your fingerprints
|
|
223
|
+
|
|
224
|
+
---
|
|
225
|
+
|
|
226
|
+
## Tips for Adding New Banks
|
|
227
|
+
|
|
228
|
+
### 1. Analyze PDF Structure
|
|
229
|
+
|
|
230
|
+
```python
|
|
231
|
+
import pdfplumber
|
|
232
|
+
|
|
233
|
+
with pdfplumber.open("statement.pdf") as pdf:
|
|
234
|
+
page = pdf.pages[0]
|
|
235
|
+
|
|
236
|
+
# Extract words with coordinates
|
|
237
|
+
words = page.extract_words()
|
|
238
|
+
for w in words[:20]:
|
|
239
|
+
print(f"{w['text']:20} x0={w['x0']:.1f} top={w['top']:.1f}")
|
|
240
|
+
```
|
|
241
|
+
|
|
242
|
+
### 2. Find Column Ranges
|
|
243
|
+
|
|
244
|
+
Look for consistent x0 coordinates:
|
|
245
|
+
- Date column: x0 ≈ 40-95
|
|
246
|
+
- Description: x0 ≈ 100-450
|
|
247
|
+
- Amount: x0 ≈ 480-550
|
|
248
|
+
|
|
249
|
+
### 3. Identify Fingerprints
|
|
250
|
+
|
|
251
|
+
Unique strings that appear in every statement:
|
|
252
|
+
- Bank name
|
|
253
|
+
- "Account Summary" or similar
|
|
254
|
+
- Specific formatting patterns
|
|
255
|
+
|
|
256
|
+
### 4. Map Labels
|
|
257
|
+
|
|
258
|
+
Find the exact text used for:
|
|
259
|
+
- Beginning Balance
|
|
260
|
+
- Deposits
|
|
261
|
+
- Withdrawals
|
|
262
|
+
- Ending Balance
|
|
263
|
+
|
|
264
|
+
### 5. Test Thoroughly
|
|
265
|
+
|
|
266
|
+
- Multiple statement periods
|
|
267
|
+
- Edge cases (year boundaries)
|
|
268
|
+
- Different transaction types
|
|
269
|
+
- Balance validation
|
|
270
|
+
|
|
271
|
+
---
|
|
272
|
+
|
|
273
|
+
## Common Patterns
|
|
274
|
+
|
|
275
|
+
### Pattern 1: Different Date Format
|
|
276
|
+
|
|
277
|
+
```python
|
|
278
|
+
def parse_period(self, raw: str) -> tuple[date, date]:
|
|
279
|
+
# Wells Fargo: "January 26, 2024 to February 25, 2024"
|
|
280
|
+
m = re.match(r"(\w+ \d+, \d{4}) to (\w+ \d+, \d{4})", raw)
|
|
281
|
+
start = datetime.strptime(m.group(1), "%B %d, %Y").date()
|
|
282
|
+
end = datetime.strptime(m.group(2), "%B %d, %Y").date()
|
|
283
|
+
return start, end
|
|
284
|
+
```
|
|
285
|
+
|
|
286
|
+
### Pattern 2: No Check Serial Numbers
|
|
287
|
+
|
|
288
|
+
```python
|
|
289
|
+
def get_config(self) -> BankConfig:
|
|
290
|
+
return BankConfig(
|
|
291
|
+
col_serial=None, # Bank doesn't show check numbers
|
|
292
|
+
# ...
|
|
293
|
+
)
|
|
294
|
+
```
|
|
295
|
+
|
|
296
|
+
### Pattern 3: Different Section Names
|
|
297
|
+
|
|
298
|
+
```python
|
|
299
|
+
section_headers={
|
|
300
|
+
"Deposits": TransactionType.DEPOSIT,
|
|
301
|
+
"ACH Credits": TransactionType.ELECTRONIC_DEPOSIT,
|
|
302
|
+
"Wire Transfers": TransactionType.OTHER_CREDIT,
|
|
303
|
+
# ...
|
|
304
|
+
}
|
|
305
|
+
```
|
|
306
|
+
|
|
307
|
+
---
|
|
308
|
+
|
|
309
|
+
## Summary
|
|
310
|
+
|
|
311
|
+
Adding a new bank requires:
|
|
312
|
+
- ✅ 1 new file (~150 lines)
|
|
313
|
+
- ✅ 3 methods to implement
|
|
314
|
+
- ✅ An import and a detection branch in the factory
|
|
315
|
+
- ✅ Tests
|
|
316
|
+
|
|
317
|
+
**Reuses:**
|
|
318
|
+
- ✅ All infrastructure (PDF extraction, coordinate detection)
|
|
319
|
+
- ✅ All domain logic (validation, models)
|
|
320
|
+
- ✅ All export logic (DataFrame, future OFX/CSV)
|
|
321
|
+
|
|
322
|
+
**Time:** 4-6 hours per bank
|
chkparse-1.0.0/PKG-INFO
ADDED
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: chkparse
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: Parse checking account PDF statements into structured financial data. Extensible multi-bank support with Strategy Pattern.
|
|
5
|
+
Project-URL: Repository, https://github.com/rmuktader/chkparse
|
|
6
|
+
Author-email: Rayhan Muktader <rmuktader@gmail.com>
|
|
7
|
+
License: MIT
|
|
8
|
+
Keywords: accounting,bank,checking,finance,parser,pdf,statement,td bank,toronto dominion bank
|
|
9
|
+
Classifier: Development Status :: 3 - Alpha
|
|
10
|
+
Classifier: Intended Audience :: Developers
|
|
11
|
+
Classifier: Intended Audience :: Financial and Insurance Industry
|
|
12
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
15
|
+
Classifier: Topic :: Office/Business :: Financial :: Accounting
|
|
16
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
17
|
+
Requires-Python: >=3.11
|
|
18
|
+
Requires-Dist: pandas>=2.0
|
|
19
|
+
Requires-Dist: pdfplumber>=0.11
|
|
20
|
+
Description-Content-Type: text/markdown
|
|
21
|
+
|
|
22
|
+
# chkparse
|
|
23
|
+
|
|
24
|
+
A Python library for extracting financial data from TD Business Convenience Plus checking account PDF statements.
|
|
25
|
+
|
|
26
|
+
Parses transactions and account summaries, and validates the Golden Equation (`Beginning Balance + Credits - Debits = Ending Balance`) on every parse.
|
|
27
|
+
|
|
28
|
+
---
|
|
29
|
+
|
|
30
|
+
## Features
|
|
31
|
+
|
|
32
|
+
- Extracts all transactions with posting date, description, amount, and type
|
|
33
|
+
- Categorises transactions: `DEPOSIT`, `ELECTRONIC_DEPOSIT`, `OTHER_CREDIT`, `CHECK`, `ELECTRONIC_PAYMENT`, `OTHER_WITHDRAWAL`
|
|
34
|
+
- Checks include `serial_number`
|
|
35
|
+
- Validates `Beginning Balance + Credits - Debits = Ending Balance` on every parse
|
|
36
|
+
- All monetary values in the parsed models use `decimal.Decimal` (the Pandas export shown below reports `amount` as `float64`)
|
|
37
|
+
- Exports to Pandas DataFrame with `datetime64` date column
|
|
38
|
+
|
|
39
|
+
---
|
|
40
|
+
|
|
41
|
+
## Installation
|
|
42
|
+
|
|
43
|
+
```bash
|
|
44
|
+
pip install chkparse
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
Requires Python 3.11+.
|
|
48
|
+
|
|
49
|
+
---
|
|
50
|
+
|
|
51
|
+
## Quick Start
|
|
52
|
+
|
|
53
|
+
```python
|
|
54
|
+
from chkparse import parse
|
|
55
|
+
|
|
56
|
+
statement = parse("path/to/statement.pdf")
|
|
57
|
+
|
|
58
|
+
print(statement.account_suffix) # "7410"
|
|
59
|
+
print(statement.statement_period_start) # datetime.date(2024, 12, 26)
|
|
60
|
+
print(statement.account_summary.ending_balance) # Decimal('4482.23')
|
|
61
|
+
|
|
62
|
+
for t in statement.transactions:
|
|
63
|
+
print(t.posting_date, t.transaction_type, t.description, t.amount)
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
### Pandas Export
|
|
67
|
+
|
|
68
|
+
```python
|
|
69
|
+
from chkparse import parse
|
|
70
|
+
from chkparse.export.export_service import to_df
|
|
71
|
+
|
|
72
|
+
statement = parse("path/to/statement.pdf")
|
|
73
|
+
df = to_df(statement)
|
|
74
|
+
|
|
75
|
+
print(df.dtypes)
|
|
76
|
+
# posting_date datetime64[ns]
|
|
77
|
+
# description object
|
|
78
|
+
# amount float64
|
|
79
|
+
# transaction_type object
|
|
80
|
+
# serial_number object
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
---
|
|
84
|
+
|
|
85
|
+
## Error Handling
|
|
86
|
+
|
|
87
|
+
```python
|
|
88
|
+
from chkparse import parse, BalanceMismatchError, DataIntegrityError
|
|
89
|
+
|
|
90
|
+
try:
|
|
91
|
+
statement = parse("path/to/statement.pdf")
|
|
92
|
+
except BalanceMismatchError as e:
|
|
93
|
+
print(e)
|
|
94
|
+
except DataIntegrityError as e:
|
|
95
|
+
print(e)
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
| Exception | Raised when |
|
|
99
|
+
|---|---|
|
|
100
|
+
| `BalanceMismatchError` | Golden Equation validation fails |
|
|
101
|
+
| `DataIntegrityError` | A required field is missing or unparseable |
|
|
102
|
+
| `UnsupportedFormatError` | PDF is not a TD Business checking statement |
|
|
103
|
+
| `ChkParserError` | Base class for all library errors |
|
|
104
|
+
|
|
105
|
+
---
|
|
106
|
+
|
|
107
|
+
## Development
|
|
108
|
+
|
|
109
|
+
```bash
|
|
110
|
+
git clone https://github.com/rmuktader/chkparse
|
|
111
|
+
cd chkparse
|
|
112
|
+
uv sync
|
|
113
|
+
uv run pytest tests/ -v
|
|
114
|
+
```
|
chkparse-1.0.0/README.md
ADDED
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
# chkparse
|
|
2
|
+
|
|
3
|
+
A Python library for extracting financial data from TD Business Convenience Plus checking account PDF statements.
|
|
4
|
+
|
|
5
|
+
Parses transactions and account summaries, and validates the Golden Equation (`Beginning Balance + Credits - Debits = Ending Balance`) on every parse.
|
|
6
|
+
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
## Features
|
|
10
|
+
|
|
11
|
+
- Extracts all transactions with posting date, description, amount, and type
|
|
12
|
+
- Categorises transactions: `DEPOSIT`, `ELECTRONIC_DEPOSIT`, `OTHER_CREDIT`, `CHECK`, `ELECTRONIC_PAYMENT`, `OTHER_WITHDRAWAL`
|
|
13
|
+
- Checks include `serial_number`
|
|
14
|
+
- Validates `Beginning Balance + Credits - Debits = Ending Balance` on every parse
|
|
15
|
+
- All monetary values in the parsed models use `decimal.Decimal` (the Pandas export shown below reports `amount` as `float64`)
|
|
16
|
+
- Exports to Pandas DataFrame with `datetime64` date column
|
|
17
|
+
|
|
18
|
+
---
|
|
19
|
+
|
|
20
|
+
## Installation
|
|
21
|
+
|
|
22
|
+
```bash
|
|
23
|
+
pip install chkparse
|
|
24
|
+
```
|
|
25
|
+
|
|
26
|
+
Requires Python 3.11+.
|
|
27
|
+
|
|
28
|
+
---
|
|
29
|
+
|
|
30
|
+
## Quick Start
|
|
31
|
+
|
|
32
|
+
```python
|
|
33
|
+
from chkparse import parse
|
|
34
|
+
|
|
35
|
+
statement = parse("path/to/statement.pdf")
|
|
36
|
+
|
|
37
|
+
print(statement.account_suffix) # "7410"
|
|
38
|
+
print(statement.statement_period_start) # datetime.date(2024, 12, 26)
|
|
39
|
+
print(statement.account_summary.ending_balance) # Decimal('4482.23')
|
|
40
|
+
|
|
41
|
+
for t in statement.transactions:
|
|
42
|
+
print(t.posting_date, t.transaction_type, t.description, t.amount)
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
### Pandas Export
|
|
46
|
+
|
|
47
|
+
```python
|
|
48
|
+
from chkparse import parse
|
|
49
|
+
from chkparse.export.export_service import to_df
|
|
50
|
+
|
|
51
|
+
statement = parse("path/to/statement.pdf")
|
|
52
|
+
df = to_df(statement)
|
|
53
|
+
|
|
54
|
+
print(df.dtypes)
|
|
55
|
+
# posting_date datetime64[ns]
|
|
56
|
+
# description object
|
|
57
|
+
# amount float64
|
|
58
|
+
# transaction_type object
|
|
59
|
+
# serial_number object
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
---
|
|
63
|
+
|
|
64
|
+
## Error Handling
|
|
65
|
+
|
|
66
|
+
```python
|
|
67
|
+
from chkparse import parse, BalanceMismatchError, DataIntegrityError
|
|
68
|
+
|
|
69
|
+
try:
|
|
70
|
+
statement = parse("path/to/statement.pdf")
|
|
71
|
+
except BalanceMismatchError as e:
|
|
72
|
+
print(e)
|
|
73
|
+
except DataIntegrityError as e:
|
|
74
|
+
print(e)
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
| Exception | Raised when |
|
|
78
|
+
|---|---|
|
|
79
|
+
| `BalanceMismatchError` | Golden Equation validation fails |
|
|
80
|
+
| `DataIntegrityError` | A required field is missing or unparseable |
|
|
81
|
+
| `UnsupportedFormatError` | PDF is not a TD Business checking statement |
|
|
82
|
+
| `ChkParserError` | Base class for all library errors |
|
|
83
|
+
|
|
84
|
+
---
|
|
85
|
+
|
|
86
|
+
## Development
|
|
87
|
+
|
|
88
|
+
```bash
|
|
89
|
+
git clone https://github.com/rmuktader/chkparse
|
|
90
|
+
cd chkparse
|
|
91
|
+
uv sync
|
|
92
|
+
uv run pytest tests/ -v
|
|
93
|
+
```
|
chkparse-1.0.0/pyproject.toml
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["hatchling"]
|
|
3
|
+
build-backend = "hatchling.build"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "chkparse"
|
|
7
|
+
version = "1.0.0"
|
|
8
|
+
description = "Parse checking account PDF statements into structured financial data. Extensible multi-bank support with Strategy Pattern."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
license = { text = "MIT" }
|
|
11
|
+
requires-python = ">=3.11"
|
|
12
|
+
authors = [
|
|
13
|
+
{ name = "Rayhan Muktader", email = "rmuktader@gmail.com" }
|
|
14
|
+
]
|
|
15
|
+
keywords = ["checking", "bank", "pdf", "finance", "statement", "parser", "accounting", "td bank", "toronto dominion bank"]
|
|
16
|
+
classifiers = [
|
|
17
|
+
"Development Status :: 3 - Alpha",
|
|
18
|
+
"Intended Audience :: Developers",
|
|
19
|
+
"Intended Audience :: Financial and Insurance Industry",
|
|
20
|
+
"License :: OSI Approved :: MIT License",
|
|
21
|
+
"Programming Language :: Python :: 3.11",
|
|
22
|
+
"Programming Language :: Python :: 3.12",
|
|
23
|
+
"Topic :: Office/Business :: Financial :: Accounting",
|
|
24
|
+
"Topic :: Software Development :: Libraries :: Python Modules",
|
|
25
|
+
]
|
|
26
|
+
dependencies = [
|
|
27
|
+
"pdfplumber>=0.11",
|
|
28
|
+
"pandas>=2.0",
|
|
29
|
+
]
|
|
30
|
+
|
|
31
|
+
[project.urls]
|
|
32
|
+
Repository = "https://github.com/rmuktader/chkparse"
|
|
33
|
+
|
|
34
|
+
[tool.hatch.build.targets.wheel]
|
|
35
|
+
packages = ["src/chkparse"]
|
|
36
|
+
|
|
37
|
+
[tool.pytest.ini_options]
|
|
38
|
+
testpaths = ["tests"]
|
|
39
|
+
|
|
40
|
+
[dependency-groups]
|
|
41
|
+
dev = [
|
|
42
|
+
"pytest>=9.0.2",
|
|
43
|
+
"openpyxl>=3.1",
|
|
44
|
+
]
|
chkparse-1.0.0/src/chkparse/__init__.py
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
# Domain models and exceptions
|
|
2
|
+
from .domain.exceptions import (
|
|
3
|
+
BalanceMismatchError,
|
|
4
|
+
ChkParserError,
|
|
5
|
+
DataIntegrityError,
|
|
6
|
+
UnsupportedFormatError,
|
|
7
|
+
)
|
|
8
|
+
from .domain.models import AccountSummary, Statement, Transaction, TransactionType
|
|
9
|
+
|
|
10
|
+
# Parsers
|
|
11
|
+
from .parsers.factory import ParserFactory
|
|
12
|
+
from .parsers.td_business import TDBusinessCheckingParser
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def parse(pdf_path: str) -> Statement:
|
|
16
|
+
"""
|
|
17
|
+
Parse a checking account statement PDF (auto-detects bank).
|
|
18
|
+
|
|
19
|
+
Args:
|
|
20
|
+
pdf_path: Path to the PDF statement file.
|
|
21
|
+
|
|
22
|
+
Returns:
|
|
23
|
+
Statement: Parsed and validated statement.
|
|
24
|
+
|
|
25
|
+
Raises:
|
|
26
|
+
UnsupportedFormatError: If bank format is not recognized.
|
|
27
|
+
BalanceMismatchError: If Golden Equation validation fails.
|
|
28
|
+
DataIntegrityError: If required fields are missing or unparseable.
|
|
29
|
+
|
|
30
|
+
Example:
|
|
31
|
+
>>> from chkparse import parse
|
|
32
|
+
>>> statement = parse("statement.pdf")
|
|
33
|
+
>>> print(statement.account_summary.ending_balance)
|
|
34
|
+
"""
|
|
35
|
+
parser = ParserFactory.create_parser(pdf_path)
|
|
36
|
+
return parser.parse(pdf_path)
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
__all__ = [
|
|
40
|
+
# Main API
|
|
41
|
+
"parse",
|
|
42
|
+
"ParserFactory",
|
|
43
|
+
# Domain models
|
|
44
|
+
"Statement",
|
|
45
|
+
"Transaction",
|
|
46
|
+
"TransactionType",
|
|
47
|
+
"AccountSummary",
|
|
48
|
+
# Exceptions
|
|
49
|
+
"ChkParserError",
|
|
50
|
+
"BalanceMismatchError",
|
|
51
|
+
"DataIntegrityError",
|
|
52
|
+
"UnsupportedFormatError",
|
|
53
|
+
# Parsers (for advanced use)
|
|
54
|
+
"TDBusinessCheckingParser",
|
|
55
|
+
]
|
|
File without changes
|
chkparse-1.0.0/src/chkparse/domain/models.py
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
from dataclasses import dataclass, field
|
|
2
|
+
from datetime import date
|
|
3
|
+
from decimal import Decimal
|
|
4
|
+
from enum import Enum
|
|
5
|
+
from typing import List
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class TransactionType(str, Enum):
|
|
9
|
+
DEPOSIT = "DEPOSIT"
|
|
10
|
+
ELECTRONIC_DEPOSIT = "ELECTRONIC_DEPOSIT"
|
|
11
|
+
OTHER_CREDIT = "OTHER_CREDIT"
|
|
12
|
+
CHECK = "CHECK"
|
|
13
|
+
ELECTRONIC_PAYMENT = "ELECTRONIC_PAYMENT"
|
|
14
|
+
OTHER_WITHDRAWAL = "OTHER_WITHDRAWAL"
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
@dataclass(frozen=True)
|
|
18
|
+
class Transaction:
|
|
19
|
+
posting_date: date
|
|
20
|
+
description: str
|
|
21
|
+
amount: Decimal # always positive; sign implied by transaction_type
|
|
22
|
+
transaction_type: TransactionType
|
|
23
|
+
serial_number: str | None = None # checks only
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
@dataclass(frozen=True)
|
|
27
|
+
class AccountSummary:
|
|
28
|
+
beginning_balance: Decimal
|
|
29
|
+
deposits: Decimal = Decimal("0")
|
|
30
|
+
electronic_deposits: Decimal = Decimal("0")
|
|
31
|
+
other_credits: Decimal = Decimal("0")
|
|
32
|
+
checks_paid: Decimal = Decimal("0")
|
|
33
|
+
electronic_payments: Decimal = Decimal("0")
|
|
34
|
+
other_withdrawals: Decimal = Decimal("0")
|
|
35
|
+
service_charges: Decimal = Decimal("0")
|
|
36
|
+
ending_balance: Decimal = Decimal("0")
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
@dataclass
|
|
40
|
+
class Statement:
|
|
41
|
+
entity_name: str
|
|
42
|
+
account_number: str
|
|
43
|
+
account_suffix: str
|
|
44
|
+
statement_period_start: date
|
|
45
|
+
statement_period_end: date
|
|
46
|
+
account_summary: AccountSummary
|
|
47
|
+
transactions: List[Transaction] = field(default_factory=list)
|