bankstatementparser 0.0.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,82 @@
1
+ # Copyright (C) 2023 Sebastien Rousseau.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """
16
+ `bankstatementparser` package provides useful tools for finance and
17
+ treasury specialists.
18
+
19
+ This package includes modules for parsing bank statements in various
20
+ formats, as well as other utilities commonly used in finance and treasury
21
+ operations.
22
+ """
23
+
24
+ from .additional_parsers import (
25
+ CsvStatementParser,
26
+ Mt940Parser,
27
+ OfxParser,
28
+ QfxParser,
29
+ create_parser,
30
+ detect_statement_format,
31
+ )
32
+ from .base_parser import BankStatementParser
33
+ from .camt_parser import CamtParser
34
+ from .exceptions import (
35
+ BankStatementParserError,
36
+ ExportError,
37
+ Pain001ParseError,
38
+ ParserError,
39
+ )
40
+ from .input_validator import InputValidator, ValidationError
41
+ from .pain001_parser import Pain001Parser
42
+ from .parallel import FileResult, parse_files_parallel
43
+ from .transaction_deduplicator import (
44
+ DeduplicationResult,
45
+ Deduplicator,
46
+ ExactDuplicateGroup,
47
+ MatchGroup,
48
+ )
49
+ from .transaction_models import Transaction
50
+ from .zip_security import (
51
+ ZipSecurityError,
52
+ ZipXMLSource,
53
+ iter_secure_xml_entries,
54
+ )
55
+
56
# Names exported by `from bankstatementparser import *`; each entry is one of
# the names imported at the top of this module.
__all__ = [
    "BankStatementParser",
    "BankStatementParserError",
    "CamtParser",
    "CsvStatementParser",
    "ExportError",
    "OfxParser",
    "QfxParser",
    "Mt940Parser",
    "detect_statement_format",
    "create_parser",
    "Pain001Parser",
    "Pain001ParseError",
    "ParserError",
    "Transaction",
    "Deduplicator",
    "DeduplicationResult",
    "ExactDuplicateGroup",
    "MatchGroup",
    "FileResult",
    "parse_files_parallel",
    "InputValidator",
    "ValidationError",
    "ZipSecurityError",
    "ZipXMLSource",
    "iter_secure_xml_entries",
]
@@ -0,0 +1,376 @@
1
+ """Additional bank statement parsers and format detection helpers."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import re
6
+ from pathlib import Path
7
+
8
+ import pandas as pd
9
+
10
+ from .base_parser import BankStatementParser
11
+ from .camt_parser import CamtParser
12
+ from .input_validator import InputValidator, ValidationError
13
+ from .pain001_parser import Pain001Parser
14
+ from .record_types import SummaryRecord, TransactionRecord
15
+
16
# Maps each logical statement field to the set of accepted CSV header
# spellings. Entries are lowercase alphanumerics only because headers are
# passed through `_normalized_name` before the membership test in
# `CsvStatementParser._find_column`.
CSV_COLUMN_GROUPS = {
    "date": {"date", "bookingdate", "transactiondate", "valuedate"},
    "description": {
        "description",
        "details",
        "memo",
        "narrative",
        "payee",
        "name",
    },
    "amount": {"amount", "transactionamount"},
    "debit": {"debit", "withdrawal", "outflow"},
    "credit": {"credit", "deposit", "inflow"},
    "balance": {"balance", "runningbalance"},
    "currency": {"currency", "ccy"},
    "account_id": {"account", "accountnumber", "iban"},
    "transaction_id": {"id", "transactionid", "reference", "ref"},
}
34
+
35
+
36
def _normalized_name(name: str) -> str:
    """Return *name* lowercased with every non ``a-z``/``0-9`` run removed.

    Used to compare CSV headers such as ``"Booking Date"`` or
    ``"transaction_id"`` against the canonical spellings in
    ``CSV_COLUMN_GROUPS``.
    """
    lowered = name.lower()
    non_alphanumeric = re.compile(r"[^a-z0-9]+")
    return non_alphanumeric.sub("", lowered)
38
+
39
+
40
def _read_validated_text(file_name: str | Path) -> tuple[Path, str]:
    """Validate *file_name* and return ``(validated_path, file_text)``.

    The path is checked with ``InputValidator.validate_input_file_path``
    (whatever that method raises propagates to the caller). The file is then
    decoded as UTF-8; bytes that cannot be decoded are dropped rather than
    raising.
    """
    checked_path = InputValidator().validate_input_file_path(str(file_name))
    contents = checked_path.read_text(encoding="utf-8", errors="ignore")
    return checked_path, contents
44
+
45
+
46
def _parse_amount(value: object) -> float | None:
    """Convert a textual or numeric amount to ``float``.

    Separator handling:

    * both ``,`` and ``.`` present: whichever appears last is the decimal
      separator, the other one is a grouping separator;
    * the same separator repeated (``"1,234,567"`` / ``"1.234.567"``): it can
      only be a grouping separator, so it is removed;
    * a single ``,``: treated as the decimal separator (``"12,5"`` -> 12.5);
    * whitespace (including non-breaking spaces) used for grouping is
      dropped (``"1 234,56"`` -> 1234.56).

    Args:
        value: Raw cell value; anything with a meaningful ``str()``.

    Returns:
        The parsed amount, or ``None`` when *value* is ``None``, a float NaN,
        blank, or not parseable as a number.
    """
    if value is None or (isinstance(value, float) and pd.isna(value)):
        return None

    # str.split() with no argument splits on any Unicode whitespace, so this
    # strips the ends and removes space-style thousands separators at once.
    text = "".join(str(value).split())
    if not text:
        return None

    comma_count = text.count(",")
    dot_count = text.count(".")
    if comma_count and dot_count:
        if text.rfind(",") > text.rfind("."):
            normalized = text.replace(".", "").replace(",", ".")
        else:
            normalized = text.replace(",", "")
    elif comma_count > 1:
        # A decimal separator cannot occur twice: these are grouping commas.
        normalized = text.replace(",", "")
    elif dot_count > 1:
        # Same reasoning for dot-grouped values such as "1.234.567".
        normalized = text.replace(".", "")
    elif comma_count:
        normalized = text.replace(",", ".")
    else:
        normalized = text

    try:
        return float(normalized)
    except ValueError:
        return None
65
+
66
+
67
class CsvStatementParser(BankStatementParser):
    """Parse bank statement CSV files with basic column normalization.

    Headers are matched against ``CSV_COLUMN_GROUPS`` after normalization, so
    ``"Booking Date"``, ``"booking_date"`` and ``"BOOKINGDATE"`` all map to
    the logical ``date`` column. The parsed frame is cached after the first
    ``parse()`` call.
    """

    def __init__(self, file_name: str | Path) -> None:
        """Validate *file_name* and prepare an empty parse cache."""
        super().__init__(file_name)
        # The helper also returns the file text; only the validated path is
        # needed here because pandas reads the file itself in parse().
        self._path, _ = _read_validated_text(file_name)
        self._parsed_df: pd.DataFrame | None = None

    def _find_column(
        self, df: pd.DataFrame, logical_name: str
    ) -> str | None:
        """Return the first column of *df* that maps to *logical_name*.

        Returns ``None`` when no header normalizes to one of the accepted
        spellings for that logical field.
        """
        candidates = CSV_COLUMN_GROUPS[logical_name]
        for column in df.columns:
            column_name = str(column)
            if _normalized_name(column_name) in candidates:
                return column_name
        return None

    @staticmethod
    def _edge_text(
        df: pd.DataFrame, column: str, *, last: bool = False
    ) -> str | None:
        """Return the first (or last) non-null value of *column* as text.

        Returns ``None`` when the column is absent or holds only nulls, so
        callers never index into an empty series.
        """
        if column not in df.columns:
            return None
        values = df[column].dropna().astype(str)
        if values.empty:
            return None
        return values.iloc[-1 if last else 0]

    def parse(self) -> pd.DataFrame:
        """Read the CSV and return a frame with normalized column names.

        The delimiter is sniffed by pandas (``sep=None`` with the python
        engine). ``amount`` is always present: it comes from an amount column
        when one exists, otherwise it is computed as credit minus debit with
        missing values counted as zero. Other logical columns are copied only
        when a matching header exists.

        Returns:
            A copy of the cached frame, so callers may mutate it freely.
        """
        if self._parsed_df is not None:
            return self._parsed_df.copy()

        raw_df = pd.read_csv(self._path, sep=None, engine="python")
        parsed = pd.DataFrame(index=raw_df.index)

        date_col = self._find_column(raw_df, "date")
        if date_col:
            parsed["date"] = raw_df[date_col]

        desc_col = self._find_column(raw_df, "description")
        if desc_col:
            parsed["description"] = raw_df[desc_col]

        amount_col = self._find_column(raw_df, "amount")
        if amount_col:
            parsed["amount"] = raw_df[amount_col].map(_parse_amount)
        else:
            credit_col = self._find_column(raw_df, "credit")
            debit_col = self._find_column(raw_df, "debit")
            credit = (
                raw_df[credit_col].map(_parse_amount)
                if credit_col
                else pd.Series([0.0] * len(raw_df), index=raw_df.index)
            )
            debit = (
                raw_df[debit_col].map(_parse_amount)
                if debit_col
                else pd.Series([0.0] * len(raw_df), index=raw_df.index)
            )
            parsed["amount"] = credit.fillna(0.0) - debit.fillna(0.0)

        for logical_name in (
            "currency",
            "balance",
            "account_id",
            "transaction_id",
        ):
            source_col = self._find_column(raw_df, logical_name)
            if source_col:
                parsed[logical_name] = raw_df[source_col]

        # Unparseable amounts come back as None/NaN; count them as zero.
        self._parsed_df = parsed.fillna(value={"amount": 0.0})
        return self._parsed_df.copy()

    def get_summary(self) -> SummaryRecord:
        """Summarize the parsed statement.

        ``account_id`` and ``currency`` are the first non-null values of
        their columns and ``statement_date`` is the last non-null date; each
        is ``None`` when the column is missing or entirely null. The opening
        and closing balances are the first and last rows of the balance
        column.
        """
        df = self.parse()
        # An explicit dtype avoids pandas' warning about the default dtype
        # of an empty Series on older versions.
        balance = (
            df["balance"]
            if "balance" in df.columns
            else pd.Series(dtype=object)
        )
        return {
            "account_id": self._edge_text(df, "account_id"),
            "statement_date": self._edge_text(df, "date", last=True),
            "transaction_count": int(len(df)),
            "total_amount": float(df["amount"].fillna(0.0).sum()),
            "opening_balance": (
                _parse_amount(balance.iloc[0])
                if not balance.empty
                else None
            ),
            "closing_balance": (
                _parse_amount(balance.iloc[-1])
                if not balance.empty
                else None
            ),
            "currency": self._edge_text(df, "currency"),
        }
165
+
166
+
167
class OfxParser(BankStatementParser):
    """Parse OFX and QFX bank statement files.

    Tags are located with regular expressions rather than an XML parser, so
    both the SGML flavour (no closing tags) and the XML flavour are read.
    The parsed frame is cached after the first ``parse()`` call.
    """

    def __init__(self, file_name: str | Path) -> None:
        """Validate *file_name*, load its text and prepare the parse cache."""
        super().__init__(file_name)
        self._path, self._text = _read_validated_text(file_name)
        self._parsed_df: pd.DataFrame | None = None

    def _tag_value(self, source: str, tag: str) -> str | None:
        """Return the stripped text that follows the first ``<tag>``.

        The value runs up to the next ``<`` or line break. Matching is
        case-insensitive. Returns ``None`` when the tag is absent or has no
        text after it.
        """
        match = re.search(
            rf"<{tag}>([^<\r\n]+)", source, flags=re.IGNORECASE
        )
        if match is None:
            return None
        return match.group(1).strip()

    @staticmethod
    def _edge_text(
        df: pd.DataFrame, column: str, *, last: bool = False
    ) -> str | None:
        """Return the first (or last) non-null value of *column* as text.

        Returns ``None`` when the column is absent or holds only nulls, so
        callers never index into an empty series.
        """
        if column not in df.columns:
            return None
        values = df[column].dropna().astype(str)
        if values.empty:
            return None
        return values.iloc[-1 if last else 0]

    def parse(self) -> pd.DataFrame:
        """Extract one row per ``<STMTTRN>`` block.

        ``CURDEF`` and ``ACCTID`` are read once from the whole document and
        copied onto every row. The description prefers ``MEMO`` and falls
        back to ``NAME``; a missing or unparseable ``TRNAMT`` becomes 0.0.

        Returns:
            A copy of the cached frame, so callers may mutate it freely.
        """
        if self._parsed_df is not None:
            return self._parsed_df.copy()

        currency = self._tag_value(self._text, "CURDEF")
        account_id = self._tag_value(self._text, "ACCTID")
        rows: list[TransactionRecord] = []
        # A block ends at its closing tag, or (when closing tags are
        # omitted) just before the next transaction or the end of the list.
        blocks = re.findall(
            r"<STMTTRN>(.*?)(?:</STMTTRN>|(?=<STMTTRN>|</BANKTRANLIST>))",
            self._text,
            flags=re.IGNORECASE | re.DOTALL,
        )
        for block in blocks:
            posted = self._tag_value(block, "DTPOSTED") or ""
            rows.append(
                {
                    # Keep only the leading 8 characters of DTPOSTED.
                    "date": posted[:8],
                    "description": (
                        self._tag_value(block, "MEMO")
                        or self._tag_value(block, "NAME")
                    ),
                    "amount": _parse_amount(
                        self._tag_value(block, "TRNAMT")
                    )
                    or 0.0,
                    "currency": currency,
                    "account_id": account_id,
                    "transaction_id": self._tag_value(block, "FITID"),
                    "transaction_type": self._tag_value(
                        block, "TRNTYPE"
                    ),
                }
            )

        self._parsed_df = pd.DataFrame(rows)
        return self._parsed_df.copy()

    def get_summary(self) -> SummaryRecord:
        """Summarize the parsed statement.

        ``account_id`` and ``currency`` are ``None`` when the file carries no
        ``ACCTID`` / ``CURDEF`` tag: in that case every row holds ``None``
        for the column, and the lookup must not index an empty series.
        Balances are not extracted and are always ``None``.
        """
        df = self.parse()
        return {
            "account_id": self._edge_text(df, "account_id"),
            "statement_date": self._edge_text(df, "date", last=True),
            "transaction_count": int(len(df)),
            "total_amount": float(df["amount"].fillna(0.0).sum())
            if "amount" in df.columns
            else 0.0,
            "opening_balance": None,
            "closing_balance": None,
            "currency": self._edge_text(df, "currency"),
        }
245
+
246
+
247
class Mt940Parser(BankStatementParser):
    """Parse MT940 bank statement files.

    The file is scanned line by line for the tags ``:25:`` (account),
    ``:60F:`` / ``:62F:`` (opening / closing balance), ``:61:`` (statement
    line) and ``:86:`` (information for the preceding statement line).
    Amounts are signed: credits positive, debits negative. The parsed frame
    is cached after the first ``parse()`` call.
    """

    def __init__(self, file_name: str | Path) -> None:
        """Validate *file_name*, load its text and reset parsed state."""
        super().__init__(file_name)
        self._path, self._text = _read_validated_text(file_name)
        self._parsed_df: pd.DataFrame | None = None
        self._opening_balance: float | None = None
        self._closing_balance: float | None = None
        self._account_id: str | None = None
        self._currency: str | None = None

    def parse(self) -> pd.DataFrame:
        """Extract one row per ``:61:`` statement line.

        Side effects: fills in the account id, currency and opening/closing
        balances that ``get_summary`` reports.

        Returns:
            A copy of the cached frame, so callers may mutate it freely.
        """
        if self._parsed_df is not None:
            return self._parsed_df.copy()

        rows: list[TransactionRecord] = []
        current: TransactionRecord | None = None

        for raw_line in self._text.splitlines():
            line = raw_line.strip()
            if line.startswith(":25:"):
                self._account_id = line[4:].strip() or None
            elif line.startswith((":60F:", ":62F:")):
                match = re.match(
                    r"^:(60F|62F):([CD])(\d{6})([A-Z]{3})([0-9,]+)$", line
                )
                if match is not None:
                    amount = _parse_amount(match.group(5))
                    # A "D" mark is a debit balance; sign it the same way
                    # as debit transactions. `amount and` skips None and
                    # avoids producing -0.0.
                    if amount and match.group(2) == "D":
                        amount = -amount
                    self._currency = match.group(4)
                    if match.group(1) == "60F":
                        self._opening_balance = amount
                    else:
                        self._closing_balance = amount
            elif line.startswith(":61:"):
                # value date, optional entry date, debit/credit mark
                # (C, D, or reversal RC / RD), optional one-letter funds
                # code, amount, then the remainder of the line.
                match = re.match(
                    r"^:61:(\d{6})(?:\d{4})?(R?[CD])[A-Z]?([0-9,]+)(.*)$",
                    line,
                )
                if match is not None:
                    # A reversed credit takes money out, like a debit;
                    # a reversed debit puts it back, like a credit.
                    is_debit = match.group(2) in ("D", "RC")
                    sign = -1.0 if is_debit else 1.0
                    current_record: TransactionRecord = {
                        "date": match.group(1),
                        "amount": sign
                        * (_parse_amount(match.group(3)) or 0.0),
                        "transaction_id": match.group(4).strip()
                        or None,
                        "account_id": self._account_id,
                        "currency": self._currency,
                        "description": None,
                    }
                    current = current_record
                    rows.append(current_record)
            elif line.startswith(":86:") and current is not None:
                # `current` is the same dict object that was appended to
                # `rows`, so this updates the stored transaction.
                current["description"] = line[4:].strip() or None

        self._parsed_df = pd.DataFrame(rows)
        return self._parsed_df.copy()

    def get_summary(self) -> SummaryRecord:
        """Summarize the parsed statement.

        ``statement_date`` is the date of the last statement line, or
        ``None`` when there are no transactions. Balances are signed:
        negative for a debit balance.
        """
        df = self.parse()
        dates = (
            df["date"].dropna().astype(str)
            if "date" in df.columns
            else pd.Series(dtype=object)
        )
        return {
            "account_id": self._account_id,
            "statement_date": dates.iloc[-1] if not dates.empty else None,
            "transaction_count": int(len(df)),
            "total_amount": float(df["amount"].fillna(0.0).sum())
            if "amount" in df.columns
            else 0.0,
            "opening_balance": self._opening_balance,
            "closing_balance": self._closing_balance,
            "currency": self._currency,
        }
323
+
324
+
325
# QfxParser is an alias: it refers to the same class object as OfxParser.
QfxParser = OfxParser
326
+
327
+
328
def detect_statement_format(file_name: str | Path) -> str:
    """Detect the parser format for a bank statement file.

    The file extension is checked first; ``.xml`` files are then told apart
    by content, and files with any other extension fall back to content
    markers for OFX and MT940.

    Args:
        file_name: Path of the statement file.

    Returns:
        One of ``"csv"``, ``"ofx"``, ``"mt940"``, ``"pain001"`` or
        ``"camt"``.

    Raises:
        ValidationError: If no rule matches the file.
    """
    path, text = _read_validated_text(file_name)
    suffix = path.suffix.lower()
    lowered = text.lower()

    if suffix == ".csv":
        return "csv"
    if suffix in {".ofx", ".qfx"}:
        return "ofx"
    if suffix in {".mt940", ".sta"}:
        return "mt940"
    if suffix == ".xml":
        if "cstmrcdttrfinitn" in lowered or "pain.001" in lowered:
            return "pain001"
        # "bktocstmrstmt" is the lowercased <BkToCstmrStmt> element. The
        # spaced spelling can never match an element name; it is kept only
        # so inputs that matched before still match.
        if (
            "bktocstmrstmt" in lowered
            or "bk to cstmr stmt" in lowered
            or "camt." in lowered
        ):
            return "camt"
    if "<ofx>" in lowered or "<banktranlist>" in lowered:
        return "ofx"
    # MT940 tags are matched against the original text, not the lowercased
    # copy.
    if ":20:" in text and ":61:" in text:
        return "mt940"
    raise ValidationError(f"Unable to detect statement format: {path}")
353
+
354
+
355
def create_parser(
    file_name: str | Path,
    format_name: str | None = None,
) -> BankStatementParser:
    """Build the parser that matches a statement file.

    Args:
        file_name: Path of the statement file.
        format_name: Format key (case-insensitive). When omitted or empty,
            the format is detected with ``detect_statement_format``.

    Returns:
        A parser instance constructed with ``str(file_name)``.

    Raises:
        ValidationError: If the format key is not one of the supported ones.
    """
    if format_name:
        selected = format_name.lower()
    else:
        selected = detect_statement_format(file_name).lower()

    registry: dict[str, type[BankStatementParser]] = {
        "camt": CamtParser,
        "pain001": Pain001Parser,
        "csv": CsvStatementParser,
        "ofx": OfxParser,
        "qfx": QfxParser,
        "mt940": Mt940Parser,
    }
    parser_cls = registry.get(selected)
    if parser_cls is None:
        raise ValidationError(
            f"Unsupported statement format: {selected}"
        )
    return parser_cls(str(file_name))