bankstatementparser 0.0.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bankstatementparser/__init__.py +82 -0
- bankstatementparser/additional_parsers.py +376 -0
- bankstatementparser/bank_statement_parsers.py +370 -0
- bankstatementparser/base_parser.py +205 -0
- bankstatementparser/camt_parser.py +971 -0
- bankstatementparser/cli.py +575 -0
- bankstatementparser/exceptions.py +36 -0
- bankstatementparser/input_validator.py +628 -0
- bankstatementparser/pain001_parser.py +742 -0
- bankstatementparser/parallel.py +127 -0
- bankstatementparser/record_types.py +94 -0
- bankstatementparser/transaction_deduplicator.py +402 -0
- bankstatementparser/transaction_models.py +196 -0
- bankstatementparser/zip_security.py +141 -0
- bankstatementparser-0.0.4.dist-info/METADATA +363 -0
- bankstatementparser-0.0.4.dist-info/RECORD +18 -0
- bankstatementparser-0.0.4.dist-info/WHEEL +4 -0
- bankstatementparser-0.0.4.dist-info/licenses/LICENSE +203 -0
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
# Copyright (C) 2023 Sebastien Rousseau.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
"""
|
|
16
|
+
`bankstatementparser` package provides useful tools for finance and
|
|
17
|
+
treasury specialists.
|
|
18
|
+
|
|
19
|
+
This package includes modules for parsing bank statements in various
|
|
20
|
+
formats, as well as other utilities commonly used in finance and treasury
|
|
21
|
+
operations.
|
|
22
|
+
"""
|
|
23
|
+
|
|
24
|
+
from .additional_parsers import (
|
|
25
|
+
CsvStatementParser,
|
|
26
|
+
Mt940Parser,
|
|
27
|
+
OfxParser,
|
|
28
|
+
QfxParser,
|
|
29
|
+
create_parser,
|
|
30
|
+
detect_statement_format,
|
|
31
|
+
)
|
|
32
|
+
from .base_parser import BankStatementParser
|
|
33
|
+
from .camt_parser import CamtParser
|
|
34
|
+
from .exceptions import (
|
|
35
|
+
BankStatementParserError,
|
|
36
|
+
ExportError,
|
|
37
|
+
Pain001ParseError,
|
|
38
|
+
ParserError,
|
|
39
|
+
)
|
|
40
|
+
from .input_validator import InputValidator, ValidationError
|
|
41
|
+
from .pain001_parser import Pain001Parser
|
|
42
|
+
from .parallel import FileResult, parse_files_parallel
|
|
43
|
+
from .transaction_deduplicator import (
|
|
44
|
+
DeduplicationResult,
|
|
45
|
+
Deduplicator,
|
|
46
|
+
ExactDuplicateGroup,
|
|
47
|
+
MatchGroup,
|
|
48
|
+
)
|
|
49
|
+
from .transaction_models import Transaction
|
|
50
|
+
from .zip_security import (
|
|
51
|
+
ZipSecurityError,
|
|
52
|
+
ZipXMLSource,
|
|
53
|
+
iter_secure_xml_entries,
|
|
54
|
+
)
|
|
55
|
+
|
|
56
|
+
# Public API of the package. Every name listed here is imported at the top
# of this module, so ``from bankstatementparser import *`` exposes exactly
# these parsers, exceptions, helpers and data types.
__all__ = [
    "BankStatementParser",
    "BankStatementParserError",
    "CamtParser",
    "CsvStatementParser",
    "ExportError",
    "OfxParser",
    "QfxParser",
    "Mt940Parser",
    "detect_statement_format",
    "create_parser",
    "Pain001Parser",
    "Pain001ParseError",
    "ParserError",
    "Transaction",
    "Deduplicator",
    "DeduplicationResult",
    "ExactDuplicateGroup",
    "MatchGroup",
    "FileResult",
    "parse_files_parallel",
    "InputValidator",
    "ValidationError",
    "ZipSecurityError",
    "ZipXMLSource",
    "iter_secure_xml_entries",
]
|
|
@@ -0,0 +1,376 @@
|
|
|
1
|
+
"""Additional bank statement parsers and format detection helpers."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import re
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
import pandas as pd
|
|
9
|
+
|
|
10
|
+
from .base_parser import BankStatementParser
|
|
11
|
+
from .camt_parser import CamtParser
|
|
12
|
+
from .input_validator import InputValidator, ValidationError
|
|
13
|
+
from .pain001_parser import Pain001Parser
|
|
14
|
+
from .record_types import SummaryRecord, TransactionRecord
|
|
15
|
+
|
|
16
|
+
# Maps each logical statement field to the set of CSV header aliases that
# are accepted for it. Aliases are written as lowercase alphanumerics only,
# because headers are passed through ``_normalized_name`` before the
# membership test in ``CsvStatementParser._find_column``.
CSV_COLUMN_GROUPS = {
    "date": {"date", "bookingdate", "transactiondate", "valuedate"},
    "description": {
        "description",
        "details",
        "memo",
        "narrative",
        "payee",
        "name",
    },
    "amount": {"amount", "transactionamount"},
    "debit": {"debit", "withdrawal", "outflow"},
    "credit": {"credit", "deposit", "inflow"},
    "balance": {"balance", "runningbalance"},
    "currency": {"currency", "ccy"},
    "account_id": {"account", "accountnumber", "iban"},
    "transaction_id": {"id", "transactionid", "reference", "ref"},
}
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def _normalized_name(name: str) -> str:
|
|
37
|
+
return re.sub(r"[^a-z0-9]+", "", name.lower())
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def _read_validated_text(file_name: str | Path) -> tuple[Path, str]:
    """Validate *file_name* and return ``(validated_path, file_text)``.

    The path is checked with ``InputValidator.validate_input_file_path``
    (given the path as a string). The file is then decoded as UTF-8, and
    bytes that cannot be decoded are dropped rather than raising.
    """
    checked_path = InputValidator().validate_input_file_path(str(file_name))
    contents = checked_path.read_text(encoding="utf-8", errors="ignore")
    return checked_path, contents
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def _parse_amount(value: object) -> float | None:
|
|
47
|
+
if value is None or (isinstance(value, float) and pd.isna(value)):
|
|
48
|
+
return None
|
|
49
|
+
text = str(value).strip()
|
|
50
|
+
if not text:
|
|
51
|
+
return None
|
|
52
|
+
if "," in text and "." in text:
|
|
53
|
+
if text.rfind(",") > text.rfind("."):
|
|
54
|
+
normalized = text.replace(".", "").replace(",", ".")
|
|
55
|
+
else:
|
|
56
|
+
normalized = text.replace(",", "")
|
|
57
|
+
elif "," in text:
|
|
58
|
+
normalized = text.replace(",", ".")
|
|
59
|
+
else:
|
|
60
|
+
normalized = text
|
|
61
|
+
try:
|
|
62
|
+
return float(normalized)
|
|
63
|
+
except ValueError:
|
|
64
|
+
return None
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
class CsvStatementParser(BankStatementParser):
    """Parse bank statement CSV files with basic column normalization.

    Source headers are matched against ``CSV_COLUMN_GROUPS`` after being
    normalized, so ``"Booking Date"`` and ``"booking_date"`` both map to the
    logical ``date`` column. The parsed frame is cached after the first
    :meth:`parse` call, and callers always receive a copy of it.
    """

    def __init__(self, file_name: str | Path) -> None:
        """Validate *file_name* and set up an empty parse cache."""
        super().__init__(file_name)
        # Only the validated path is kept; pandas reads the file in parse().
        self._path, _ = _read_validated_text(file_name)
        self._parsed_df: pd.DataFrame | None = None

    def _find_column(
        self, df: pd.DataFrame, logical_name: str
    ) -> str | None:
        """Return the first header of *df* that is an alias of *logical_name*.

        Returns ``None`` when no header matches. Raises ``KeyError`` when
        *logical_name* is not a key of ``CSV_COLUMN_GROUPS``.
        """
        candidates = CSV_COLUMN_GROUPS[logical_name]
        for column in df.columns:
            column_name = str(column)
            if _normalized_name(column_name) in candidates:
                return column_name
        return None

    @staticmethod
    def _edge_text(
        df: pd.DataFrame, column: str, position: int
    ) -> str | None:
        """Return a non-null value of *column* as text, or ``None``.

        *position* indexes the non-null values (``0`` first, ``-1`` last).
        ``None`` is returned when the column is absent or holds no non-null
        value, instead of letting ``.iloc`` raise ``IndexError``.
        """
        if column not in df.columns:
            return None
        values = df[column].dropna()
        if values.empty:
            return None
        return values.astype(str).iloc[position]

    def parse(self) -> pd.DataFrame:
        """Read the CSV file and return it with normalized column names.

        The result always has an ``amount`` column (missing values become
        ``0.0``). ``date``, ``description``, ``currency``, ``balance``,
        ``account_id`` and ``transaction_id`` are present only when a
        matching source header exists. Without an amount column, the amount
        is computed as credit minus debit.
        """
        if self._parsed_df is not None:
            return self._parsed_df.copy()

        # sep=None makes the python engine detect the delimiter itself.
        raw_df = pd.read_csv(self._path, sep=None, engine="python")
        parsed = pd.DataFrame(index=raw_df.index)

        for logical_name in ("date", "description"):
            source_col = self._find_column(raw_df, logical_name)
            if source_col:
                parsed[logical_name] = raw_df[source_col]

        amount_col = self._find_column(raw_df, "amount")
        if amount_col:
            parsed["amount"] = raw_df[amount_col].map(_parse_amount)
        else:
            credit_col = self._find_column(raw_df, "credit")
            debit_col = self._find_column(raw_df, "debit")
            credit = (
                raw_df[credit_col].map(_parse_amount)
                if credit_col
                else pd.Series([0.0] * len(raw_df), index=raw_df.index)
            )
            debit = (
                raw_df[debit_col].map(_parse_amount)
                if debit_col
                else pd.Series([0.0] * len(raw_df), index=raw_df.index)
            )
            # Inflows are positive and outflows negative.
            parsed["amount"] = credit.fillna(0.0) - debit.fillna(0.0)

        for logical_name in (
            "currency",
            "balance",
            "account_id",
            "transaction_id",
        ):
            source_col = self._find_column(raw_df, logical_name)
            if source_col:
                parsed[logical_name] = raw_df[source_col]

        self._parsed_df = parsed.fillna(value={"amount": 0.0})
        return self._parsed_df.copy()

    def get_summary(self) -> SummaryRecord:
        """Summarize the parsed statement.

        ``account_id`` and ``currency`` use the first non-null value of
        their column and ``statement_date`` the last one; each is ``None``
        when the column is missing or empty. Opening and closing balances
        are the first and last rows of the ``balance`` column.
        """
        df = self.parse()
        balance = (
            df["balance"]
            if "balance" in df.columns
            # Explicit dtype avoids the pandas 1.x empty-Series warning.
            else pd.Series(dtype=object)
        )
        has_balance = not balance.empty
        return {
            "account_id": self._edge_text(df, "account_id", 0),
            "statement_date": self._edge_text(df, "date", -1),
            "transaction_count": int(len(df)),
            "total_amount": float(df["amount"].fillna(0.0).sum()),
            "opening_balance": (
                _parse_amount(balance.iloc[0]) if has_balance else None
            ),
            "closing_balance": (
                _parse_amount(balance.iloc[-1]) if has_balance else None
            ),
            "currency": self._edge_text(df, "currency", 0),
        }
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
class OfxParser(BankStatementParser):
    """Parse OFX and QFX bank statement files.

    Tags are read with regular expressions rather than an XML parser, so
    both closed (``<TAG>value</TAG>``) and unclosed (``<TAG>value``) forms
    are accepted. The parsed frame is cached after the first :meth:`parse`
    call, and callers always receive a copy of it.
    """

    def __init__(self, file_name: str | Path) -> None:
        """Validate *file_name*, load its text and set up the parse cache."""
        super().__init__(file_name)
        self._path, self._text = _read_validated_text(file_name)
        self._parsed_df: pd.DataFrame | None = None

    def _tag_value(self, source: str, tag: str) -> str | None:
        """Return the stripped text after the first ``<tag>`` in *source*.

        The value ends at the next ``<`` or line break. The tag name is
        matched case-insensitively. Returns ``None`` when the tag is absent
        or has no value on the same line.
        """
        match = re.search(
            rf"<{tag}>([^<\r\n]+)", source, flags=re.IGNORECASE
        )
        if match is None:
            return None
        return match.group(1).strip()

    @staticmethod
    def _edge_text(
        df: pd.DataFrame, column: str, position: int
    ) -> str | None:
        """Return a non-null value of *column* as text, or ``None``.

        *position* indexes the non-null values (``0`` first, ``-1`` last).
        ``None`` is returned when the column is absent or holds no non-null
        value, instead of letting ``.iloc`` raise ``IndexError``.
        """
        if column not in df.columns:
            return None
        values = df[column].dropna()
        if values.empty:
            return None
        return values.astype(str).iloc[position]

    def parse(self) -> pd.DataFrame:
        """Return one row per ``<STMTTRN>`` block found in the file.

        ``currency`` (``CURDEF``) and ``account_id`` (``ACCTID``) are read
        once from the whole document and repeated on every row. A missing
        or unparseable ``TRNAMT`` yields an amount of ``0.0``. When the file
        has no transactions the returned frame has no columns.
        """
        if self._parsed_df is not None:
            return self._parsed_df.copy()

        currency = self._tag_value(self._text, "CURDEF")
        account_id = self._tag_value(self._text, "ACCTID")
        rows: list[TransactionRecord] = []
        # A block ends at its closing tag or, for unclosed SGML, right
        # before the next transaction or the end of the list.
        blocks = re.findall(
            r"<STMTTRN>(.*?)(?:</STMTTRN>|(?=<STMTTRN>|</BANKTRANLIST>))",
            self._text,
            flags=re.IGNORECASE | re.DOTALL,
        )
        for block in blocks:
            posted = self._tag_value(block, "DTPOSTED") or ""
            rows.append(
                {
                    # Keep only the first eight characters of DTPOSTED.
                    "date": posted[:8],
                    "description": (
                        self._tag_value(block, "MEMO")
                        or self._tag_value(block, "NAME")
                    ),
                    "amount": _parse_amount(
                        self._tag_value(block, "TRNAMT")
                    )
                    or 0.0,
                    "currency": currency,
                    "account_id": account_id,
                    "transaction_id": self._tag_value(block, "FITID"),
                    "transaction_type": self._tag_value(
                        block, "TRNTYPE"
                    ),
                }
            )

        self._parsed_df = pd.DataFrame(rows)
        return self._parsed_df.copy()

    def get_summary(self) -> SummaryRecord:
        """Summarize the parsed statement.

        ``account_id`` and ``currency`` are ``None`` when the file does not
        declare them, even if transactions are present. Opening and closing
        balances are always ``None`` for this format.
        """
        df = self.parse()
        return {
            "account_id": self._edge_text(df, "account_id", 0),
            "statement_date": self._edge_text(df, "date", -1),
            "transaction_count": int(len(df)),
            "total_amount": float(df["amount"].fillna(0.0).sum())
            if "amount" in df.columns
            else 0.0,
            "opening_balance": None,
            "closing_balance": None,
            "currency": self._edge_text(df, "currency", 0),
        }
|
|
245
|
+
|
|
246
|
+
|
|
247
|
+
class Mt940Parser(BankStatementParser):
    """Parse MT940 bank statement files.

    The file is scanned line by line for the ``:25:`` (account), ``:60F:``
    and ``:62F:`` (opening/closing balance), ``:61:`` (statement line) and
    ``:86:`` (information for the preceding statement line) tags. Only the
    first line of a ``:86:`` field is kept as the description.
    """

    # Balance: D/C mark, 6-digit date, currency code, comma-decimal amount.
    _BALANCE_PATTERN = r"^:(60F|62F):([CD])(\d{6})([A-Z]{3})([0-9,]+)$"
    # Statement line: value date, optional 4-digit entry date, mark
    # (C, D, RC or RD), optional one-letter funds code, amount, remainder.
    _ENTRY_PATTERN = r"^:61:(\d{6})(?:\d{4})?(R?[CD])[A-Z]?([0-9,]+)(.*)$"

    def __init__(self, file_name: str | Path) -> None:
        """Validate *file_name*, load its text and reset parsed state."""
        super().__init__(file_name)
        self._path, self._text = _read_validated_text(file_name)
        self._parsed_df: pd.DataFrame | None = None
        self._opening_balance: float | None = None
        self._closing_balance: float | None = None
        self._account_id: str | None = None
        self._currency: str | None = None

    def parse(self) -> pd.DataFrame:
        """Return one row per ``:61:`` statement line.

        Debits (``D``) and reversals of credits (``RC``) are negative;
        credits (``C``) and reversals of debits (``RD``) are positive.
        Balances carrying a ``D`` mark are stored as negative values. The
        account, currency and balances found along the way are kept on the
        instance for :meth:`get_summary`.
        """
        if self._parsed_df is not None:
            return self._parsed_df.copy()

        rows: list[TransactionRecord] = []
        current: TransactionRecord | None = None

        for raw_line in self._text.splitlines():
            line = raw_line.strip()
            if line.startswith(":25:"):
                self._account_id = line[4:].strip() or None
            elif line.startswith((":60F:", ":62F:")):
                match = re.match(self._BALANCE_PATTERN, line)
                if match is not None:
                    amount = _parse_amount(match.group(5))
                    # A debit balance is a negative position; the
                    # truthiness test also avoids producing -0.0.
                    if amount and match.group(2) == "D":
                        amount = -amount
                    self._currency = match.group(4)
                    if match.group(1) == "60F":
                        self._opening_balance = amount
                    else:
                        self._closing_balance = amount
            elif line.startswith(":61:"):
                match = re.match(self._ENTRY_PATTERN, line)
                if match is not None:
                    # RC reverses a credit, so it reduces the balance.
                    sign = -1.0 if match.group(2) in ("D", "RC") else 1.0
                    current_record: TransactionRecord = {
                        "date": match.group(1),
                        "amount": sign
                        * (_parse_amount(match.group(3)) or 0.0),
                        "transaction_id": match.group(4).strip()
                        or None,
                        "account_id": self._account_id,
                        "currency": self._currency,
                        "description": None,
                    }
                    current = current_record
                    rows.append(current_record)
            elif line.startswith(":86:") and current is not None:
                # `current` is the same dict object stored in `rows`.
                current["description"] = line[4:].strip() or None

        self._parsed_df = pd.DataFrame(rows)
        return self._parsed_df.copy()

    def get_summary(self) -> SummaryRecord:
        """Summarize the statement using values captured by :meth:`parse`.

        ``statement_date`` is the date of the last statement line, or
        ``None`` when the file contains no statement lines.
        """
        df = self.parse()
        statement_date = None
        if "date" in df.columns:
            dates = df["date"].dropna()
            if not dates.empty:
                statement_date = dates.astype(str).iloc[-1]
        return {
            "account_id": self._account_id,
            "statement_date": statement_date,
            "transaction_count": int(len(df)),
            "total_amount": float(df["amount"].fillna(0.0).sum())
            if "amount" in df.columns
            else 0.0,
            "opening_balance": self._opening_balance,
            "closing_balance": self._closing_balance,
            "currency": self._currency,
        }
|
|
323
|
+
|
|
324
|
+
|
|
325
|
+
# QFX files are handled by the same parser class as OFX files.
QfxParser = OfxParser
|
|
326
|
+
|
|
327
|
+
|
|
328
|
+
def detect_statement_format(file_name: str | Path) -> str:
    """Detect the parser format for a bank statement file.

    The file extension decides first. ``.xml`` files are classified by
    content markers, and anything still undecided falls back to OFX and
    MT940 content markers.

    Args:
        file_name: Path of the statement file; it is validated and read.

    Returns:
        One of ``"csv"``, ``"ofx"``, ``"mt940"``, ``"pain001"`` or
        ``"camt"``.

    Raises:
        ValidationError: If no extension or content rule matches.
    """
    path, text = _read_validated_text(file_name)
    suffix = path.suffix.lower()
    lowered = text.lower()

    if suffix == ".csv":
        return "csv"
    if suffix in {".ofx", ".qfx"}:
        return "ofx"
    if suffix in {".mt940", ".sta"}:
        return "mt940"
    if suffix == ".xml":
        if "cstmrcdttrfinitn" in lowered or "pain.001" in lowered:
            return "pain001"
        # The CAMT statement element is written <BkToCstmrStmt>, i.e.
        # without spaces; the spaced spelling is kept for compatibility.
        if (
            "bktocstmrstmt" in lowered
            or "bk to cstmr stmt" in lowered
            or "camt." in lowered
        ):
            return "camt"
    if "<ofx>" in lowered or "<banktranlist>" in lowered:
        return "ofx"
    # MT940 tags are matched on the original text, not the lowered copy.
    if ":20:" in text and ":61:" in text:
        return "mt940"
    raise ValidationError(f"Unable to detect statement format: {path}")
|
|
353
|
+
|
|
354
|
+
|
|
355
|
+
def create_parser(
    file_name: str | Path,
    format_name: str | None = None,
) -> BankStatementParser:
    """Create a parser instance from an explicit or detected format.

    When *format_name* is empty or ``None`` the format is detected from the
    file itself. The name is compared case-insensitively.

    Raises:
        ValidationError: If the resulting format has no registered parser.
    """
    if format_name:
        chosen = format_name
    else:
        chosen = detect_statement_format(file_name)
    chosen = chosen.lower()

    registry: dict[str, type[BankStatementParser]] = {
        "camt": CamtParser,
        "pain001": Pain001Parser,
        "csv": CsvStatementParser,
        "ofx": OfxParser,
        "qfx": QfxParser,
        "mt940": Mt940Parser,
    }
    factory = registry.get(chosen)
    if factory is None:
        raise ValidationError(
            f"Unsupported statement format: {chosen}"
        )
    # Parsers are always handed the path as a plain string.
    return factory(str(file_name))
|