bankstatementparser-0.0.4-py3-none-any.whl

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,127 @@
+ # Copyright (C) 2023 Sebastien Rousseau.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ # implied.
+ #
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ """Parallel multi-file parsing for batch treasury workloads."""
+
+ from __future__ import annotations
+
+ import logging
+ from concurrent.futures import (
+     ProcessPoolExecutor,
+     as_completed,
+ )
+ from dataclasses import dataclass, field
+ from pathlib import Path
+
+ import pandas as pd
+
+ logger = logging.getLogger(__name__)
+
+
+ @dataclass(frozen=True)
+ class FileResult:
+     """Result of parsing a single file."""
+
+     path: str
+     status: str
+     transactions: pd.DataFrame = field(
+         default_factory=pd.DataFrame
+     )
+     error: str = ""
+
+
+ def _parse_single_file(
+     file_path: str,
+     format_name: str | None,
+ ) -> FileResult:
+     """Parse one file in a worker process."""
+     from .additional_parsers import (
+         create_parser,
+         detect_statement_format,
+     )
+
+     try:
+         fmt = format_name or detect_statement_format(file_path)
+         parser = create_parser(file_path, fmt)
+         df = parser.parse()
+         return FileResult(
+             path=file_path,
+             status="SUCCESS",
+             transactions=df,
+         )
+     except Exception as exc:
+         return FileResult(
+             path=file_path,
+             status="FAILED",
+             error=str(exc),
+         )
+
+
+ def parse_files_parallel(
+     file_paths: list[str | Path],
+     *,
+     format_name: str | None = None,
+     max_workers: int | None = None,
+ ) -> list[FileResult]:
+     """Parse multiple statement files in parallel.
+
+     Uses process-based parallelism to bypass the GIL and
+     maximise throughput on multi-core systems. Each file is
+     parsed in its own worker process.
+
+     Args:
+         file_paths: Paths to statement files.
+         format_name: Force a specific format for all files.
+             When *None*, each file is auto-detected.
+         max_workers: Maximum worker processes. Defaults to
+             the number of CPU cores.
+
+     Returns:
+         List of ``FileResult`` in the same order as *file_paths*.
+     """
+     if not file_paths:
+         return []
+
+     str_paths = [str(p) for p in file_paths]
+
+     # Single file — skip process overhead
+     if len(str_paths) == 1:
+         return [_parse_single_file(str_paths[0], format_name)]
+
+     results: dict[str, FileResult] = {}
+
+     with ProcessPoolExecutor(
+         max_workers=max_workers
+     ) as executor:
+         future_to_path = {
+             executor.submit(
+                 _parse_single_file, p, format_name
+             ): p
+             for p in str_paths
+         }
+         for future in as_completed(future_to_path):
+             path = future_to_path[future]
+             try:
+                 results[path] = future.result()
+             except Exception as exc:  # pragma: no cover
+                 results[path] = FileResult(
+                     path=path,
+                     status="FAILED",
+                     error=str(exc),
+                 )
+
+     # Preserve original order
+     return [results[p] for p in str_paths]
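
For reference, a minimal usage sketch of the module above. The import path is an assumption: this diff does not show where the file lives inside the package.

    # Hypothetical import path; the diff omits file names.
    from bankstatementparser.parallel_parser import parse_files_parallel

    if __name__ == "__main__":
        results = parse_files_parallel(
            ["statements/january.csv", "statements/february.csv"],
            max_workers=4,  # omit to default to the CPU count
        )
        for result in results:
            if result.status == "SUCCESS":
                print(result.path, len(result.transactions), "rows")
            else:
                print(result.path, "failed:", result.error)

The if __name__ == "__main__" guard matters here: ProcessPoolExecutor spawns fresh interpreter processes on Windows and, by default, macOS, and those processes re-import the calling module.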
@@ -0,0 +1,94 @@
+ # Copyright (C) 2023 Sebastien Rousseau.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ # implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ """Typed records shared by parser implementations."""
+
+ from __future__ import annotations
+
+ from typing import TypedDict
+
+
+ class BalanceRecord(TypedDict, total=False):
+     Amount: float
+     Currency: str | None
+     Code: str | None
+     Description: str | None
+     DrCr: str | None
+     Date: str | None
+     AccountId: str | None
+
+
+ class TransactionRecord(TypedDict, total=False):
+     Amount: float
+     Currency: str | None
+     DrCr: str | None
+     Debtor: str | None
+     Creditor: str | None
+     Reference: str | None
+     ValDt: str | None
+     BookgDt: str | None
+     AccountId: str | None
+     DebtorAddress: str | None
+     CreditorAddress: str | None
+     date: str | None
+     description: str | None
+     amount: float | None
+     currency: str | None
+     balance: object
+     account_id: str | None
+     transaction_id: str | None
+     transaction_type: str | None
+
+
+ class PaymentRecord(TypedDict, total=False):
+     MsgId: str | None
+     CreDtTm: str | None
+     NbOfTxs: str | None
+     InitgPty: str | None
+     PmtInfId: str | None
+     PmtMtd: str | None
+     CtrlSum: str | None
+     ReqdExctnDt: str | None
+     ChrgBr: str | None
+     DbtrNm: str | None
+     DbtrIBAN: str | None
+     DbtrBIC: str | None
+     EndToEndId: str | None
+     InstdAmt: str | None
+     Currency: str | None
+     CdtrBIC: str | None
+     CdtrNm: str | None
+     RmtInf: str | None
+
+
+ class StatementStatsRecord(TypedDict, total=False):
+     StatementId: str | None
+     AccountId: str | None
+     StatementCreated: str | None
+     NumTransactions: int
+     NetAmount: float
+
+
+ class SummaryRecord(TypedDict, total=False):
+     account_id: str | None
+     statement_date: str | None
+     transaction_count: int
+     total_amount: float
+     opening_balance: float | None
+     closing_balance: float | None
+     currency: str | None
+     message_id: str | None
+     initiating_party: str | None
+     error: str
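
Every class above uses total=False, so any subset of keys is a valid record; TypedDict adds no runtime behaviour and only gives type checkers a schema for the dict rows the parsers emit. A small sketch, with a hypothetical import path since the diff omits file names:

    # Hypothetical import path; the diff omits file names.
    from bankstatementparser.records import TransactionRecord

    txn: TransactionRecord = {
        "Amount": 125.40,
        "Currency": "EUR",
        "DrCr": "CRDT",
        "Reference": "INV-2023-0001",
    }
    # No validation happens at runtime; a checker such as mypy
    # flags misspelled keys or wrongly typed values.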
@@ -0,0 +1,402 @@
+ # Copyright (C) 2023 Sebastien Rousseau.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ # implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ """Deterministic transaction deduplication utilities."""
+
+ from __future__ import annotations
+
+ import hashlib
+ from collections import defaultdict
+ from collections.abc import Iterable
+ from dataclasses import dataclass
+ from datetime import date
+ from difflib import SequenceMatcher
+
+ import pandas as pd
+ from pydantic import BaseModel, ConfigDict, Field
+
+ from .transaction_models import Transaction
+
+
+ def _days_between(left: date | None, right: date | None) -> int | None:
+     if left is None or right is None:
+         return None
+     return abs((left - right).days)
+
+
+ def _description_similarity(
+     left: Transaction, right: Transaction
+ ) -> float:
+     if (
+         not left.normalized_description
+         or not right.normalized_description
+     ):
+         return 0.0
+     return SequenceMatcher(
+         None, left.normalized_description, right.normalized_description
+     ).ratio()
+
+
+ class ExactDuplicateGroup(BaseModel):
+     """Transactions that collide on the deterministic primary hash."""
+
+     model_config = ConfigDict(frozen=True)
+
+     primary_hash: str
+     transactions: list[Transaction]
+
+
+ class MatchGroup(BaseModel):
+     """A set of transactions requiring operator review."""
+
+     model_config = ConfigDict(frozen=True)
+
+     transactions: list[Transaction]
+     reason: str
+     confidence: float = Field(ge=0.0, le=1.0)
+     tier: str
+
+
+ class DeduplicationResult(BaseModel):
+     """Explainable deduplication output."""
+
+     model_config = ConfigDict(frozen=True)
+
+     unique_transactions: list[Transaction]
+     exact_duplicates: list[ExactDuplicateGroup]
+     suspected_matches: list[MatchGroup]
+
+
+ @dataclass(frozen=True)
+ class _Candidate:
+     index: int
+     transaction: Transaction
+     primary_hash: str
+
+
+ class Deduplicator:
+     """Deduplicate bank transactions with deterministic match tiers."""
+
+     def __init__(
+         self,
+         *,
+         value_date_window_days: int = 3,
+         description_similarity_threshold: float = 0.9,
+     ) -> None:
+         self.value_date_window_days = value_date_window_days
+         self.description_similarity_threshold = (
+             description_similarity_threshold
+         )
+
+     def primary_hash(self, transaction: Transaction) -> str:
+         """Return the stable primary hash for hard identity matching."""
+         material = "|".join(
+             [
+                 transaction.account_id or "",
+                 transaction.currency or "",
+                 transaction.amount_key(),
+                 transaction.booking_date.isoformat()
+                 if transaction.booking_date is not None
+                 else "",
+             ]
+         )
+         return hashlib.sha256(material.encode("utf-8")).hexdigest()
+
+     def normalize_transactions(
+         self,
+         transactions: Iterable[Transaction | dict[str, object]],
+         *,
+         source: str | None = None,
+     ) -> list[Transaction]:
+         """Normalize transaction inputs into deterministic models."""
+         normalized: list[Transaction] = []
+         for index, transaction in enumerate(transactions):
+             if isinstance(transaction, Transaction):
+                 normalized.append(
+                     transaction.model_copy(
+                         update={
+                             "source": transaction.source or source,
+                             "source_index": (
+                                 transaction.source_index
+                                 if transaction.source_index is not None
+                                 else index
+                             ),
+                         }
+                     )
+                 )
+             else:
+                 normalized.append(
+                     Transaction.from_record(
+                         dict(transaction),
+                         source=source,
+                         source_index=index,
+                     )
+                 )
+         return normalized
+
+     def from_dataframe(
+         self, df: pd.DataFrame, *, source: str | None = None
+     ) -> list[Transaction]:
+         """Normalize parser DataFrame output into transaction models."""
+         return self.normalize_transactions(
+             df.to_dict("records"), source=source
+         )
+
+     def deduplicate(
+         self, transactions: Iterable[Transaction | dict[str, object]]
+     ) -> DeduplicationResult:
+         """Deduplicate transactions into unique, exact, and suspected sets."""
+         normalized = self.normalize_transactions(transactions)
+         candidates = [
+             _Candidate(
+                 index=index,
+                 transaction=transaction,
+                 primary_hash=self.primary_hash(transaction),
+             )
+             for index, transaction in enumerate(normalized)
+         ]
+
+         exact_groups = self._find_exact_duplicates(candidates)
+         exact_indices = {
+             candidate.index
+             for bucket in self._candidate_groups_by_primary(
+                 candidates
+             ).values()
+             if len(bucket) > 1
+             for candidate in bucket
+         }
+         suspected_groups = self._find_suspected_matches(
+             candidates,
+             excluded_indices=exact_indices,
+         )
+
+         suspected_indices = {
+             transaction.source_index
+             for group in suspected_groups
+             for transaction in group.transactions
+             if transaction.source_index is not None
+         }
+         unique_transactions = [
+             candidate.transaction
+             for candidate in candidates
+             if candidate.index not in exact_indices | suspected_indices
+         ]
+
+         return DeduplicationResult(
+             unique_transactions=unique_transactions,
+             exact_duplicates=exact_groups,
+             suspected_matches=suspected_groups,
+         )
+
+     def _candidate_groups_by_primary(
+         self, candidates: list[_Candidate]
+     ) -> dict[str, list[_Candidate]]:
+         groups: dict[str, list[_Candidate]] = defaultdict(list)
+         for candidate in candidates:
+             groups[candidate.primary_hash].append(candidate)
+         return groups
+
+     def _find_exact_duplicates(
+         self, candidates: list[_Candidate]
+     ) -> list[ExactDuplicateGroup]:
+         groups = self._candidate_groups_by_primary(candidates)
+         exact_groups = []
+         for primary_hash, bucket in sorted(groups.items()):
+             if len(bucket) < 2:
+                 continue
+             transactions = sorted(
+                 (candidate.transaction for candidate in bucket),
+                 key=lambda item: (
+                     item.source_index or -1,
+                     item.reference or "",
+                 ),
+             )
+             exact_groups.append(
+                 ExactDuplicateGroup(
+                     primary_hash=primary_hash,
+                     transactions=transactions,
+                 )
+             )
+         return exact_groups
+
+     def _find_suspected_matches(
+         self,
+         candidates: list[_Candidate],
+         *,
+         excluded_indices: set[int],
+     ) -> list[MatchGroup]:
+         probable_groups = self._find_probable_matches(candidates)
+         probable_indices = {
+             transaction.source_index
+             for group in probable_groups
+             for transaction in group.transactions
+             if transaction.source_index is not None
+         }
+         temporal_groups = self._find_temporal_matches(
+             [
+                 candidate
+                 for candidate in candidates
+                 if candidate.index
+                 not in excluded_indices | probable_indices
+             ]
+         )
+         return probable_groups + temporal_groups
+
+     def _find_probable_matches(
+         self, candidates: list[_Candidate]
+     ) -> list[MatchGroup]:
+         groups = []
+         for bucket in self._candidate_groups_by_primary(
+             candidates
+         ).values():
+             if len(bucket) < 2:
+                 continue
+             similarities = []
+             for left_index, left in enumerate(bucket):
+                 for right in bucket[left_index + 1 :]:
+                     similarity = _description_similarity(
+                         left.transaction, right.transaction
+                     )
+                     if (
+                         similarity
+                         >= self.description_similarity_threshold
+                         and left.transaction.normalized_description
+                         != right.transaction.normalized_description
+                     ):
+                         similarities.append(similarity)
+
+             if not similarities:
+                 continue
+
+             groups.append(
+                 MatchGroup(
+                     transactions=sorted(
+                         (candidate.transaction for candidate in bucket),
+                         key=lambda item: (
+                             item.source_index or -1,
+                             item.reference or "",
+                         ),
+                     ),
+                     reason=(
+                         "Primary hash collision with description similarity "
+                         f"{max(similarities):.2f}"
+                     ),
+                     confidence=min(0.99, max(similarities) + 0.05),
+                     tier="probable",
+                 )
+             )
+         return groups
+
+     def _find_temporal_matches(
+         self, candidates: list[_Candidate]
+     ) -> list[MatchGroup]:
+         buckets: dict[tuple[str, str, str], list[_Candidate]] = (
+             defaultdict(list)
+         )
+         for candidate in candidates:
+             transaction = candidate.transaction
+             buckets[
+                 (
+                     transaction.account_id or "",
+                     transaction.currency or "",
+                     transaction.amount_key(),
+                 )
+             ].append(candidate)
+
+         groups = []
+         for bucket in buckets.values():
+             if len(bucket) < 2:
+                 continue
+
+             bucket = sorted(
+                 bucket,
+                 key=lambda item: (
+                     item.transaction.value_date or date.min,
+                     item.index,
+                 ),
+             )
+             component: list[_Candidate] = [bucket[0]]
+             component_similarities: list[float] = []
+             for candidate in bucket[1:]:
+                 prev = component[-1]
+                 day_delta = _days_between(
+                     prev.transaction.value_date,
+                     candidate.transaction.value_date,
+                 )
+                 if (
+                     day_delta is not None
+                     and day_delta <= self.value_date_window_days
+                     and prev.primary_hash != candidate.primary_hash
+                 ):
+                     component.append(candidate)
+                     component_similarities.append(
+                         _description_similarity(
+                             prev.transaction, candidate.transaction
+                         )
+                     )
+                     continue
+
+                 if len(component) > 1:
+                     groups.append(
+                         self._temporal_group(
+                             component, component_similarities
+                         )
+                     )
+                 component = [candidate]
+                 component_similarities = []
+
+             if len(component) > 1:
+                 groups.append(
+                     self._temporal_group(
+                         component, component_similarities
+                     )
+                 )
+
+         return groups
+
+     def _temporal_group(
+         self,
+         component: list[_Candidate],
+         similarities: list[float],
+     ) -> MatchGroup:
+         max_delta = max(
+             _days_between(
+                 component[0].transaction.value_date,
+                 component[-1].transaction.value_date,
+             )
+             or 0,
+             0,
+         )
+         max_similarity = max(similarities) if similarities else 0.0
+         confidence = min(0.95, 0.75 + (max_similarity * 0.2))
+         reason = (
+             f"Value date shift within {max_delta} day window"
+             if max_delta == 1
+             else f"Value date shift within {max_delta} days window"
+         )
+         if max_similarity > 0:
+             reason += f"; description similarity {max_similarity:.2f}"
+
+         return MatchGroup(
+             transactions=[
+                 candidate.transaction
+                 for candidate in sorted(
+                     component, key=lambda item: item.index
+                 )
+             ],
+             reason=reason,
+             confidence=confidence,
+             tier="suspected",
+         )
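
A usage sketch for the deduplicator. Transaction, from_record, and amount_key are defined in .transaction_models, which this diff does not include, so the record keys below are assumptions modelled on the TransactionRecord fields above:

    # Hypothetical input records; the real Transaction.from_record
    # field mapping is defined outside this diff.
    dedup = Deduplicator(
        value_date_window_days=3,
        description_similarity_threshold=0.9,
    )
    result = dedup.deduplicate(
        [
            {"AccountId": "ACC-1", "Currency": "EUR", "Amount": 100.0},
            {"AccountId": "ACC-1", "Currency": "EUR", "Amount": 100.0},
        ]
    )
    print(len(result.unique_transactions))  # rows with no collision
    print(len(result.exact_duplicates))     # primary-hash collisions
    print(len(result.suspected_matches))    # probable and temporal tiers

The three tiers are deterministic: exact duplicates collide on the SHA-256 of account, currency, amount key, and booking date; probable matches share that hash but carry near-identical (not equal) normalized descriptions; temporal matches share account, currency, and amount key under different hashes, with value dates inside the configured window.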