bankstatementparser 0.0.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bankstatementparser/__init__.py +82 -0
- bankstatementparser/additional_parsers.py +376 -0
- bankstatementparser/bank_statement_parsers.py +370 -0
- bankstatementparser/base_parser.py +205 -0
- bankstatementparser/camt_parser.py +971 -0
- bankstatementparser/cli.py +575 -0
- bankstatementparser/exceptions.py +36 -0
- bankstatementparser/input_validator.py +628 -0
- bankstatementparser/pain001_parser.py +742 -0
- bankstatementparser/parallel.py +127 -0
- bankstatementparser/record_types.py +94 -0
- bankstatementparser/transaction_deduplicator.py +402 -0
- bankstatementparser/transaction_models.py +196 -0
- bankstatementparser/zip_security.py +141 -0
- bankstatementparser-0.0.4.dist-info/METADATA +363 -0
- bankstatementparser-0.0.4.dist-info/RECORD +18 -0
- bankstatementparser-0.0.4.dist-info/WHEEL +4 -0
- bankstatementparser-0.0.4.dist-info/licenses/LICENSE +203 -0
|
@@ -0,0 +1,127 @@
|
|
|
1
|
+
# Copyright (C) 2023 Sebastien Rousseau.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
|
12
|
+
# implied.
|
|
13
|
+
#
|
|
14
|
+
# See the License for the specific language governing permissions and
|
|
15
|
+
# limitations under the License.
|
|
16
|
+
|
|
17
|
+
"""Parallel multi-file parsing for batch treasury workloads."""
|
|
18
|
+
|
|
19
|
+
from __future__ import annotations
|
|
20
|
+
|
|
21
|
+
import logging
|
|
22
|
+
from concurrent.futures import (
|
|
23
|
+
ProcessPoolExecutor,
|
|
24
|
+
as_completed,
|
|
25
|
+
)
|
|
26
|
+
from dataclasses import dataclass, field
|
|
27
|
+
from pathlib import Path
|
|
28
|
+
|
|
29
|
+
import pandas as pd
|
|
30
|
+
|
|
31
|
+
logger = logging.getLogger(__name__)
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
@dataclass(frozen=True)
class FileResult:
    """Immutable outcome of parsing one statement file.

    ``status`` is ``"SUCCESS"`` or ``"FAILED"``; on failure the
    ``error`` message is populated and ``transactions`` is left as
    an empty DataFrame.
    """

    # Path of the parsed file, exactly as submitted by the caller.
    path: str
    # Either "SUCCESS" or "FAILED".
    status: str
    # Parsed transactions; defaults to an empty DataFrame on failure.
    transactions: pd.DataFrame = field(default_factory=pd.DataFrame)
    # Human-readable failure reason; empty string on success.
    error: str = ""
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def _parse_single_file(
    file_path: str,
    format_name: str | None,
) -> FileResult:
    """Parse one file in a worker process."""
    # Local import: resolved inside the worker process on each call.
    from .additional_parsers import (
        create_parser,
        detect_statement_format,
    )

    try:
        chosen_format = (
            format_name
            if format_name
            else detect_statement_format(file_path)
        )
        frame = create_parser(file_path, chosen_format).parse()
    except Exception as exc:
        return FileResult(
            path=file_path,
            status="FAILED",
            error=str(exc),
        )
    return FileResult(
        path=file_path,
        status="SUCCESS",
        transactions=frame,
    )
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def parse_files_parallel(
    file_paths: list[str | Path],
    *,
    format_name: str | None = None,
    max_workers: int | None = None,
) -> list[FileResult]:
    """Parse multiple statement files in parallel.

    Uses process-based parallelism to bypass the GIL and
    maximise throughput on multi-core systems. Each file is
    parsed in its own worker process.

    Args:
        file_paths: Paths to statement files.
        format_name: Force a specific format for all files.
            When *None*, each file is auto-detected.
        max_workers: Maximum worker processes. Defaults to
            the number of CPU cores.

    Returns:
        List of ``FileResult`` in the same order as *file_paths*.
    """
    if not file_paths:
        return []

    str_paths = [str(p) for p in file_paths]

    # Single file — skip process-pool overhead entirely.
    if len(str_paths) == 1:
        return [_parse_single_file(str_paths[0], format_name)]

    # Keyed by input position rather than path: duplicate paths in
    # the input would otherwise collide on one dict key and silently
    # overwrite each other's result.
    results: dict[int, FileResult] = {}

    with ProcessPoolExecutor(
        max_workers=max_workers
    ) as executor:
        future_to_pos = {
            executor.submit(
                _parse_single_file, path, format_name
            ): pos
            for pos, path in enumerate(str_paths)
        }
        for future in as_completed(future_to_pos):
            pos = future_to_pos[future]
            try:
                results[pos] = future.result()
            except Exception as exc:  # pragma: no cover
                # Pool-level failure (e.g. worker crash) — the
                # worker's own try/except never got a chance to run.
                results[pos] = FileResult(
                    path=str_paths[pos],
                    status="FAILED",
                    error=str(exc),
                )

    # Preserve the caller's original ordering.
    return [results[pos] for pos in range(len(str_paths))]
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
# Copyright (C) 2023 Sebastien Rousseau.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
|
12
|
+
# implied.
|
|
13
|
+
# See the License for the specific language governing permissions and
|
|
14
|
+
# limitations under the License.
|
|
15
|
+
|
|
16
|
+
"""Typed records shared by parser implementations."""
|
|
17
|
+
|
|
18
|
+
from __future__ import annotations
|
|
19
|
+
|
|
20
|
+
from typing import TypedDict
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class BalanceRecord(TypedDict, total=False):
    """One balance entry emitted by a statement parser.

    ``total=False``: every key is optional; parsers populate only
    the fields present in the source document.
    """

    # Balance amount.
    Amount: float
    # Currency code, when stated in the source.
    Currency: str | None
    # Balance type code as reported by the statement — NOTE(review):
    # exact code set depends on the emitting parser; confirm there.
    Code: str | None
    # Free-text description of the balance entry.
    Description: str | None
    # Debit/credit indicator; value vocabulary defined by the parser.
    DrCr: str | None
    # Balance date, kept in its parsed string form.
    Date: str | None
    # Identifier of the account the balance belongs to.
    AccountId: str | None
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class TransactionRecord(TypedDict, total=False):
    """One transaction row produced by a statement parser.

    ``total=False``: all keys optional. Two naming families coexist —
    CamelCase keys and lowercase keys; NOTE(review): which parser
    emits which family is not visible here; confirm against the
    parser modules before relying on a specific key set.
    """

    # --- CamelCase keys ---
    Amount: float
    Currency: str | None
    # Debit/credit indicator; value vocabulary defined by the parser.
    DrCr: str | None
    Debtor: str | None
    Creditor: str | None
    Reference: str | None
    # Value date, kept in its parsed string form.
    ValDt: str | None
    # Booking date, kept in its parsed string form.
    BookgDt: str | None
    AccountId: str | None
    DebtorAddress: str | None
    CreditorAddress: str | None
    # --- lowercase keys ---
    date: str | None
    description: str | None
    amount: float | None
    currency: str | None
    # Balance value; declared ``object`` because its concrete type
    # varies by parser.
    balance: object
    account_id: str | None
    transaction_id: str | None
    transaction_type: str | None
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
class PaymentRecord(TypedDict, total=False):
    """One payment instruction row (pain.001-style field names).

    ``total=False``: all keys optional. Amounts and counts are kept
    as strings, mirroring the raw XML text content.
    """

    # Message-level identifiers.
    MsgId: str | None
    CreDtTm: str | None
    NbOfTxs: str | None
    InitgPty: str | None
    # Payment-information block identifiers.
    PmtInfId: str | None
    PmtMtd: str | None
    CtrlSum: str | None
    ReqdExctnDt: str | None
    ChrgBr: str | None
    # Debtor side.
    DbtrNm: str | None
    DbtrIBAN: str | None
    DbtrBIC: str | None
    # Per-transaction fields.
    EndToEndId: str | None
    InstdAmt: str | None
    Currency: str | None
    # Creditor side.
    CdtrBIC: str | None
    CdtrNm: str | None
    # Remittance information (free text).
    RmtInf: str | None
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
class StatementStatsRecord(TypedDict, total=False):
    """Aggregate statistics for one parsed statement.

    ``total=False``: all keys optional.
    """

    # Statement-level identifiers.
    StatementId: str | None
    AccountId: str | None
    # Statement creation timestamp, kept in its parsed string form.
    StatementCreated: str | None
    # Number of transactions in the statement.
    NumTransactions: int
    # Net amount across the statement's transactions.
    NetAmount: float
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
class SummaryRecord(TypedDict, total=False):
    """Lowercase summary row for one statement or message.

    ``total=False``: all keys optional; ``error`` is populated when
    summarisation failed.
    """

    account_id: str | None
    # Statement date, kept in its parsed string form.
    statement_date: str | None
    # Number of transactions covered by this summary.
    transaction_count: int
    # Sum of transaction amounts.
    total_amount: float
    opening_balance: float | None
    closing_balance: float | None
    currency: str | None
    # Message-level fields (payment-initiation inputs).
    message_id: str | None
    initiating_party: str | None
    # Failure description; present only when something went wrong.
    error: str
|
|
@@ -0,0 +1,402 @@
|
|
|
1
|
+
# Copyright (C) 2023 Sebastien Rousseau.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
|
12
|
+
# implied.
|
|
13
|
+
# See the License for the specific language governing permissions and
|
|
14
|
+
# limitations under the License.
|
|
15
|
+
|
|
16
|
+
"""Deterministic transaction deduplication utilities."""
|
|
17
|
+
|
|
18
|
+
from __future__ import annotations
|
|
19
|
+
|
|
20
|
+
import hashlib
|
|
21
|
+
from collections import defaultdict
|
|
22
|
+
from collections.abc import Iterable
|
|
23
|
+
from dataclasses import dataclass
|
|
24
|
+
from datetime import date
|
|
25
|
+
from difflib import SequenceMatcher
|
|
26
|
+
|
|
27
|
+
import pandas as pd
|
|
28
|
+
from pydantic import BaseModel, ConfigDict, Field
|
|
29
|
+
|
|
30
|
+
from .transaction_models import Transaction
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def _days_between(left: date | None, right: date | None) -> int | None:
|
|
34
|
+
if left is None or right is None:
|
|
35
|
+
return None
|
|
36
|
+
return abs((left - right).days)
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def _description_similarity(
|
|
40
|
+
left: Transaction, right: Transaction
|
|
41
|
+
) -> float:
|
|
42
|
+
if (
|
|
43
|
+
not left.normalized_description
|
|
44
|
+
or not right.normalized_description
|
|
45
|
+
):
|
|
46
|
+
return 0.0
|
|
47
|
+
return SequenceMatcher(
|
|
48
|
+
None, left.normalized_description, right.normalized_description
|
|
49
|
+
).ratio()
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
class ExactDuplicateGroup(BaseModel):
    """Transactions that collide on the deterministic primary hash."""

    # Frozen: groups are facts about the input, never mutated.
    model_config = ConfigDict(frozen=True)

    # The shared SHA-256 primary hash (account|currency|amount|booking date,
    # as built by Deduplicator.primary_hash).
    primary_hash: str
    # All transactions sharing that hash; the producer sorts members
    # by (source index, reference) for determinism.
    transactions: list[Transaction]
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
class MatchGroup(BaseModel):
    """A set of transactions requiring operator review."""

    # Frozen: groups are facts about the input, never mutated.
    model_config = ConfigDict(frozen=True)

    # Transactions suspected to describe the same real-world movement.
    transactions: list[Transaction]
    # Human-readable explanation of why the group was formed.
    reason: str
    # Heuristic confidence score, validated into [0.0, 1.0].
    confidence: float = Field(ge=0.0, le=1.0)
    # Match tier: "probable" or "suspected" (assigned by Deduplicator).
    tier: str
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
class DeduplicationResult(BaseModel):
    """Explainable deduplication output."""

    # Frozen: the result is a snapshot, never mutated after creation.
    model_config = ConfigDict(frozen=True)

    # Transactions not implicated in any duplicate or suspect group.
    unique_transactions: list[Transaction]
    # Hard identity collisions on the primary hash.
    exact_duplicates: list[ExactDuplicateGroup]
    # Probable and temporal match groups needing operator review.
    suspected_matches: list[MatchGroup]
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
@dataclass(frozen=True)
class _Candidate:
    """Internal pairing of a transaction with its position and hash."""

    # Position of the transaction in the normalized input list.
    index: int
    transaction: Transaction
    # Precomputed Deduplicator.primary_hash for this transaction.
    primary_hash: str
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
class Deduplicator:
    """Deduplicate bank transactions with deterministic match tiers.

    :meth:`deduplicate` produces three tiers:

    * exact duplicates — candidates sharing the same primary hash
      (account, currency, amount, booking date);
    * "probable" matches — primary-hash collisions whose differing
      descriptions are still highly similar;
    * "suspected" matches — same account/currency/amount with value
      dates within ``value_date_window_days`` of each other.
    """

    def __init__(
        self,
        *,
        value_date_window_days: int = 3,
        description_similarity_threshold: float = 0.9,
    ) -> None:
        """Configure matching thresholds.

        Args:
            value_date_window_days: Maximum day gap allowed when
                chaining temporal ("suspected") matches.
            description_similarity_threshold: Minimum description
                similarity ratio for a "probable" match.
        """
        self.value_date_window_days = value_date_window_days
        self.description_similarity_threshold = (
            description_similarity_threshold
        )

    @staticmethod
    def _stable_sort_key(item: Transaction) -> tuple[int, str]:
        """Deterministic ordering key for transactions inside a group."""
        # Explicit None check: a legitimate source_index of 0 must not
        # be coerced to -1 the way a bare ``or`` fallback would.
        index = (
            item.source_index if item.source_index is not None else -1
        )
        return (index, item.reference or "")

    def primary_hash(self, transaction: Transaction) -> str:
        """Return the stable primary hash for hard identity matching.

        Hashes account id, currency, the canonical amount key, and
        booking date; missing components contribute empty strings.
        """
        material = "|".join(
            [
                transaction.account_id or "",
                transaction.currency or "",
                transaction.amount_key(),
                transaction.booking_date.isoformat()
                if transaction.booking_date is not None
                else "",
            ]
        )
        return hashlib.sha256(material.encode("utf-8")).hexdigest()

    def normalize_transactions(
        self,
        transactions: Iterable[Transaction | dict[str, object]],
        *,
        source: str | None = None,
    ) -> list[Transaction]:
        """Normalize transaction inputs into deterministic models.

        ``Transaction`` instances keep their own ``source`` and
        ``source_index`` when already set; otherwise the enumeration
        index and the given *source* are filled in. Mapping inputs go
        through ``Transaction.from_record``.
        """
        normalized: list[Transaction] = []
        for index, transaction in enumerate(transactions):
            if isinstance(transaction, Transaction):
                normalized.append(
                    transaction.model_copy(
                        update={
                            "source": transaction.source or source,
                            "source_index": (
                                transaction.source_index
                                if transaction.source_index is not None
                                else index
                            ),
                        }
                    )
                )
            else:
                normalized.append(
                    Transaction.from_record(
                        dict(transaction),
                        source=source,
                        source_index=index,
                    )
                )
        return normalized

    def from_dataframe(
        self, df: pd.DataFrame, *, source: str | None = None
    ) -> list[Transaction]:
        """Normalize parser DataFrame output into transaction models."""
        return self.normalize_transactions(
            df.to_dict("records"), source=source
        )

    def deduplicate(
        self, transactions: Iterable[Transaction | dict[str, object]]
    ) -> DeduplicationResult:
        """Deduplicate transactions into unique, exact, and suspected sets."""
        normalized = self.normalize_transactions(transactions)
        candidates = [
            _Candidate(
                index=index,
                transaction=transaction,
                primary_hash=self.primary_hash(transaction),
            )
            for index, transaction in enumerate(normalized)
        ]

        exact_groups = self._find_exact_duplicates(candidates)
        # Every member of a multi-entry primary-hash bucket is
        # excluded from the temporal pass below.
        exact_indices = {
            candidate.index
            for bucket in self._candidate_groups_by_primary(
                candidates
            ).values()
            if len(bucket) > 1
            for candidate in bucket
        }
        suspected_groups = self._find_suspected_matches(
            candidates,
            excluded_indices=exact_indices,
        )

        # NOTE(review): suspected membership is keyed on
        # ``source_index`` while exact membership uses the candidate
        # ``index``; these coincide only when inputs carried no
        # pre-set source_index — confirm that assumption upstream.
        suspected_indices = {
            transaction.source_index
            for group in suspected_groups
            for transaction in group.transactions
            if transaction.source_index is not None
        }
        unique_transactions = [
            candidate.transaction
            for candidate in candidates
            if candidate.index not in exact_indices | suspected_indices
        ]

        return DeduplicationResult(
            unique_transactions=unique_transactions,
            exact_duplicates=exact_groups,
            suspected_matches=suspected_groups,
        )

    def _candidate_groups_by_primary(
        self, candidates: list[_Candidate]
    ) -> dict[str, list[_Candidate]]:
        """Bucket candidates by their precomputed primary hash."""
        groups: dict[str, list[_Candidate]] = defaultdict(list)
        for candidate in candidates:
            groups[candidate.primary_hash].append(candidate)
        return groups

    def _find_exact_duplicates(
        self, candidates: list[_Candidate]
    ) -> list[ExactDuplicateGroup]:
        """Return one group per primary-hash bucket of size >= 2.

        Buckets are emitted in sorted hash order with members in
        stable (source index, reference) order for determinism.
        """
        groups = self._candidate_groups_by_primary(candidates)
        exact_groups = []
        for primary_hash, bucket in sorted(groups.items()):
            if len(bucket) < 2:
                continue
            transactions = sorted(
                (candidate.transaction for candidate in bucket),
                key=self._stable_sort_key,
            )
            exact_groups.append(
                ExactDuplicateGroup(
                    primary_hash=primary_hash,
                    transactions=transactions,
                )
            )
        return exact_groups

    def _find_suspected_matches(
        self,
        candidates: list[_Candidate],
        *,
        excluded_indices: set[int],
    ) -> list[MatchGroup]:
        """Combine probable (hash-collision) and temporal match groups."""
        probable_groups = self._find_probable_matches(candidates)
        probable_indices = {
            transaction.source_index
            for group in probable_groups
            for transaction in group.transactions
            if transaction.source_index is not None
        }
        # The temporal pass only sees candidates not already claimed
        # by the exact or probable tiers.
        temporal_groups = self._find_temporal_matches(
            [
                candidate
                for candidate in candidates
                if candidate.index
                not in excluded_indices | probable_indices
            ]
        )
        return probable_groups + temporal_groups

    def _find_probable_matches(
        self, candidates: list[_Candidate]
    ) -> list[MatchGroup]:
        """Flag hash-colliding buckets whose descriptions differ but
        still exceed the similarity threshold."""
        groups = []
        for bucket in self._candidate_groups_by_primary(
            candidates
        ).values():
            if len(bucket) < 2:
                continue
            similarities = []
            for left_index, left in enumerate(bucket):
                for right in bucket[left_index + 1 :]:
                    similarity = _description_similarity(
                        left.transaction, right.transaction
                    )
                    # Identical descriptions belong to the exact tier;
                    # only near-but-not-equal pairs count here.
                    if (
                        similarity
                        >= self.description_similarity_threshold
                        and left.transaction.normalized_description
                        != right.transaction.normalized_description
                    ):
                        similarities.append(similarity)

            if not similarities:
                continue

            # Hoisted: max(similarities) was previously evaluated
            # three times per group.
            best = max(similarities)
            groups.append(
                MatchGroup(
                    transactions=sorted(
                        (candidate.transaction for candidate in bucket),
                        key=self._stable_sort_key,
                    ),
                    reason=(
                        "Primary hash collision with description similarity "
                        f"{best:.2f}"
                    ),
                    confidence=min(0.99, best + 0.05),
                    tier="probable",
                )
            )
        return groups

    def _find_temporal_matches(
        self, candidates: list[_Candidate]
    ) -> list[MatchGroup]:
        """Chain same account/currency/amount candidates whose value
        dates fall within the configured window."""
        buckets: dict[tuple[str, str, str], list[_Candidate]] = (
            defaultdict(list)
        )
        for candidate in candidates:
            transaction = candidate.transaction
            buckets[
                (
                    transaction.account_id or "",
                    transaction.currency or "",
                    transaction.amount_key(),
                )
            ].append(candidate)

        groups = []
        for bucket in buckets.values():
            if len(bucket) < 2:
                continue

            # Walk in value-date order so adjacent members are the
            # closest in time; ties break on input position.
            bucket = sorted(
                bucket,
                key=lambda item: (
                    item.transaction.value_date or date.min,
                    item.index,
                ),
            )
            component: list[_Candidate] = [bucket[0]]
            component_similarities: list[float] = []
            for candidate in bucket[1:]:
                prev = component[-1]
                day_delta = _days_between(
                    prev.transaction.value_date,
                    candidate.transaction.value_date,
                )
                # Chain only across *different* primary hashes —
                # identical hashes are the exact tier's business.
                if (
                    day_delta is not None
                    and day_delta <= self.value_date_window_days
                    and prev.primary_hash != candidate.primary_hash
                ):
                    component.append(candidate)
                    component_similarities.append(
                        _description_similarity(
                            prev.transaction, candidate.transaction
                        )
                    )
                    continue

                # Chain broken: flush the current component and start
                # a new one from this candidate.
                if len(component) > 1:
                    groups.append(
                        self._temporal_group(
                            component, component_similarities
                        )
                    )
                component = [candidate]
                component_similarities = []

            if len(component) > 1:
                groups.append(
                    self._temporal_group(
                        component, component_similarities
                    )
                )

        return groups

    def _temporal_group(
        self,
        component: list[_Candidate],
        similarities: list[float],
    ) -> MatchGroup:
        """Build a "suspected" MatchGroup from a temporal chain."""
        # Span between the first and last value dates; clamped at
        # zero when either endpoint is missing.
        max_delta = max(
            _days_between(
                component[0].transaction.value_date,
                component[-1].transaction.value_date,
            )
            or 0,
            0,
        )
        max_similarity = max(similarities) if similarities else 0.0
        # Base confidence 0.75, boosted by up to +0.2 for description
        # similarity, capped at 0.95 (below the probable tier's cap).
        confidence = min(0.95, 0.75 + (max_similarity * 0.2))
        # Fix: the original conditional here had two byte-identical
        # branches (dead code); collapsed to the single expression.
        reason = f"Value date shift within {max_delta} day window"
        if max_similarity > 0:
            reason += f"; description similarity {max_similarity:.2f}"

        return MatchGroup(
            transactions=[
                candidate.transaction
                for candidate in sorted(
                    component, key=lambda item: item.index
                )
            ],
            reason=reason,
            confidence=confidence,
            tier="suspected",
        )
|