statement-parser 0.1.0__tar.gz → 0.1.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {statement_parser-0.1.0/statement_parser.egg-info → statement_parser-0.1.1}/PKG-INFO +1 -1
- {statement_parser-0.1.0 → statement_parser-0.1.1}/pyproject.toml +1 -1
- {statement_parser-0.1.0 → statement_parser-0.1.1}/statement_parser/GenericBank.py +80 -2
- {statement_parser-0.1.0 → statement_parser-0.1.1}/statement_parser/Transaction.py +12 -2
- {statement_parser-0.1.0 → statement_parser-0.1.1}/statement_parser/bank_configs.json +32 -1
- {statement_parser-0.1.0 → statement_parser-0.1.1/statement_parser.egg-info}/PKG-INFO +1 -1
- {statement_parser-0.1.0 → statement_parser-0.1.1}/LICENSE +0 -0
- {statement_parser-0.1.0 → statement_parser-0.1.1}/README.md +0 -0
- {statement_parser-0.1.0 → statement_parser-0.1.1}/setup.cfg +0 -0
- {statement_parser-0.1.0 → statement_parser-0.1.1}/statement_parser/Bank.py +0 -0
- {statement_parser-0.1.0 → statement_parser-0.1.1}/statement_parser/__init__.py +0 -0
- {statement_parser-0.1.0 → statement_parser-0.1.1}/statement_parser.egg-info/SOURCES.txt +0 -0
- {statement_parser-0.1.0 → statement_parser-0.1.1}/statement_parser.egg-info/dependency_links.txt +0 -0
- {statement_parser-0.1.0 → statement_parser-0.1.1}/statement_parser.egg-info/requires.txt +0 -0
- {statement_parser-0.1.0 → statement_parser-0.1.1}/statement_parser.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: statement_parser
|
|
3
|
-
Version: 0.1.0
|
|
3
|
+
Version: 0.1.1
|
|
4
4
|
Summary: Bank Statement Parser is a Python library designed to parse and normalize transaction data from various bank statement formats ( CSV, Excel, etc.) into a consistent and easy-to-use Pandas DataFrame. It supports multiple banks and file formats, making it a versatile tool for financial data analysis.
|
|
5
5
|
Author-email: Khuzema Challawala <khuzema.ac@gmail.com>
|
|
6
6
|
Classifier: Programming Language :: Python :: 3
|
|
@@ -84,7 +84,7 @@ class GenericBank(Bank):
|
|
|
84
84
|
# ------------------------------------------------------------------ #
|
|
85
85
|
def getTransactions(self, filename: str) -> list[Transaction]:
|
|
86
86
|
df = self.getData(filename)
|
|
87
|
-
return self._build_transactions(df)
|
|
87
|
+
return self._build_transactions(df, filename)
|
|
88
88
|
|
|
89
89
|
def getData(self, filename: str) -> pd.DataFrame:
|
|
90
90
|
df = self._load(filename)
|
|
@@ -227,7 +227,8 @@ class GenericBank(Bank):
|
|
|
227
227
|
# ------------------------------------------------------------------ #
|
|
228
228
|
# Transaction building
|
|
229
229
|
# ------------------------------------------------------------------ #
|
|
230
|
-
def _build_transactions(self, df: pd.DataFrame) -> list[Transaction]:
|
|
230
|
+
def _build_transactions(self, df: pd.DataFrame,
|
|
231
|
+
filename: str) -> list[Transaction]:
|
|
231
232
|
cols = self._resolve_columns(df)
|
|
232
233
|
work = pd.DataFrame(index=df.index)
|
|
233
234
|
|
|
@@ -243,6 +244,13 @@ class GenericBank(Bank):
|
|
|
243
244
|
# Remarks (without the duplicate marker yet)
|
|
244
245
|
work["_remarks"] = self._compute_remarks(df, cols).loc[work.index]
|
|
245
246
|
|
|
247
|
+
# Reference id (cheque / reference number, when present)
|
|
248
|
+
work["_reference"] = self._compute_reference(df, cols).loc[work.index]
|
|
249
|
+
|
|
250
|
+
# Account id (account or credit card number for the statement)
|
|
251
|
+
work["_account"] = self._compute_account_id(
|
|
252
|
+
filename, df, cols).loc[work.index]
|
|
253
|
+
|
|
246
254
|
# Duplicate sequence marker
|
|
247
255
|
seq = (
|
|
248
256
|
work.groupby(["_date", "_remarks", "_amount"]).cumcount().add(1)
|
|
@@ -259,6 +267,8 @@ class GenericBank(Bank):
|
|
|
259
267
|
created_date=row["_date"],
|
|
260
268
|
remarks=remarks,
|
|
261
269
|
amount=row["_amount"],
|
|
270
|
+
reference_id=row["_reference"],
|
|
271
|
+
account_id=row["_account"],
|
|
262
272
|
)
|
|
263
273
|
)
|
|
264
274
|
return transactions
|
|
@@ -310,3 +320,71 @@ class GenericBank(Bank):
|
|
|
310
320
|
result = result + column.map(render)
|
|
311
321
|
|
|
312
322
|
return result.str.strip()
|
|
323
|
+
|
|
324
|
+
def _compute_reference(self, df: pd.DataFrame, cols: dict) -> pd.Series:
|
|
325
|
+
"""Extract a reference / cheque number when the config defines one."""
|
|
326
|
+
parts_cfg = self.config.get("reference")
|
|
327
|
+
result = pd.Series([""] * len(df), index=df.index)
|
|
328
|
+
if not parts_cfg:
|
|
329
|
+
return result
|
|
330
|
+
|
|
331
|
+
for part in parts_cfg:
|
|
332
|
+
field = part["field"]
|
|
333
|
+
if field not in cols:
|
|
334
|
+
continue
|
|
335
|
+
prefix = part.get("prefix", "")
|
|
336
|
+
suffix = part.get("suffix", "")
|
|
337
|
+
skip = {str(s).lower() for s in part.get("skip", [])}
|
|
338
|
+
column = df[cols[field]]
|
|
339
|
+
|
|
340
|
+
def render(value):
|
|
341
|
+
text = "" if pd.isna(value) else str(value).strip()
|
|
342
|
+
if text.lower() in skip:
|
|
343
|
+
return ""
|
|
344
|
+
return prefix + text + suffix
|
|
345
|
+
|
|
346
|
+
result = result + column.map(render)
|
|
347
|
+
|
|
348
|
+
return result.str.strip()
|
|
349
|
+
|
|
350
|
+
def _raw_text(self, filename: str) -> str:
|
|
351
|
+
"""Return the whole file as text so metadata can be searched."""
|
|
352
|
+
if filename.endswith((".xls", ".xlsx")):
|
|
353
|
+
raw = pd.read_excel(filename, header=None)
|
|
354
|
+
return "\n".join(
|
|
355
|
+
" ".join(str(v) for v in row if pd.notna(v))
|
|
356
|
+
for row in raw.values.tolist()
|
|
357
|
+
)
|
|
358
|
+
with open(filename, "r", encoding="utf-8") as handle:
|
|
359
|
+
return handle.read()
|
|
360
|
+
|
|
361
|
+
def _compute_account_id(self, filename: str, df: pd.DataFrame,
|
|
362
|
+
cols: dict) -> pd.Series:
|
|
363
|
+
"""Resolve the account / credit-card number for the statement."""
|
|
364
|
+
cfg = self.config.get("account_id")
|
|
365
|
+
result = pd.Series([""] * len(df), index=df.index)
|
|
366
|
+
if not cfg:
|
|
367
|
+
return result
|
|
368
|
+
|
|
369
|
+
mode = cfg.get("mode", "regex")
|
|
370
|
+
|
|
371
|
+
if mode == "column":
|
|
372
|
+
field = cfg["field"]
|
|
373
|
+
if field not in cols:
|
|
374
|
+
return result
|
|
375
|
+
column = df[cols[field]]
|
|
376
|
+
return column.map(
|
|
377
|
+
lambda v: "" if pd.isna(v) else str(v).strip()
|
|
378
|
+
)
|
|
379
|
+
|
|
380
|
+
if mode == "regex":
|
|
381
|
+
text = self._raw_text(filename)
|
|
382
|
+
match = re.search(
|
|
383
|
+
cfg["pattern"], text, re.IGNORECASE | re.MULTILINE
|
|
384
|
+
)
|
|
385
|
+
value = match.group(1).strip() if match else ""
|
|
386
|
+
return pd.Series([value] * len(df), index=df.index)
|
|
387
|
+
|
|
388
|
+
raise ValueError(
|
|
389
|
+
f"[{self.bank_id}] Unknown account_id mode '{mode}'"
|
|
390
|
+
)
|
|
@@ -7,21 +7,29 @@ class Transaction:
|
|
|
7
7
|
bank: str
|
|
8
8
|
remarks: str
|
|
9
9
|
amount: float
|
|
10
|
+
reference_id: str
|
|
11
|
+
account_id: str
|
|
10
12
|
|
|
11
13
|
def __init__(self, bank: str,
|
|
12
14
|
created_date: datetime,
|
|
13
15
|
remarks: str,
|
|
14
|
-
amount: float):
|
|
16
|
+
amount: float,
|
|
17
|
+
reference_id: str = "",
|
|
18
|
+
account_id: str = ""):
|
|
15
19
|
self.bank = bank
|
|
16
20
|
self.created_date = created_date
|
|
17
21
|
self.remarks = remarks
|
|
18
22
|
self.amount = amount
|
|
23
|
+
self.reference_id = reference_id
|
|
24
|
+
self.account_id = account_id
|
|
19
25
|
|
|
20
26
|
def hash(self):
|
|
21
27
|
strObj = (str(self.bank) +
|
|
22
28
|
str(self.created_date) +
|
|
23
29
|
str(self.remarks) +
|
|
24
|
-
str(self.amount))
|
|
30
|
+
str(self.amount) +
|
|
31
|
+
str(self.reference_id) +
|
|
32
|
+
str(self.account_id))
|
|
25
33
|
hash_obj = hashlib.sha1(strObj.encode("utf-8"))
|
|
26
34
|
hex_hash = hash_obj.hexdigest()
|
|
27
35
|
return hex_hash
|
|
@@ -32,5 +40,7 @@ class Transaction:
|
|
|
32
40
|
"created_date": self.created_date,
|
|
33
41
|
"remarks": self.remarks,
|
|
34
42
|
"amount": self.amount,
|
|
43
|
+
"reference_id": self.reference_id,
|
|
44
|
+
"account_id": self.account_id,
|
|
35
45
|
"hash": self.hash(),
|
|
36
46
|
}
|
|
@@ -23,6 +23,10 @@
|
|
|
23
23
|
"field": "amount",
|
|
24
24
|
"sign_field": "sign",
|
|
25
25
|
"credit_values": ["CR"]
|
|
26
|
+
},
|
|
27
|
+
"account_id": {
|
|
28
|
+
"mode": "regex",
|
|
29
|
+
"pattern": "Card No:\\s*([0-9X][0-9X ]*[0-9X])"
|
|
26
30
|
}
|
|
27
31
|
},
|
|
28
32
|
|
|
@@ -71,6 +75,10 @@
|
|
|
71
75
|
"mode": "deposit_minus_withdrawal",
|
|
72
76
|
"deposit": "deposit",
|
|
73
77
|
"withdrawal": "withdrawal"
|
|
78
|
+
},
|
|
79
|
+
"account_id": {
|
|
80
|
+
"mode": "regex",
|
|
81
|
+
"pattern": "ACCOUNT-\\s*([0-9]+)"
|
|
74
82
|
}
|
|
75
83
|
},
|
|
76
84
|
|
|
@@ -98,6 +106,10 @@
|
|
|
98
106
|
"field": "amount",
|
|
99
107
|
"sign_field": "sign",
|
|
100
108
|
"credit_values": ["CR"]
|
|
109
|
+
},
|
|
110
|
+
"account_id": {
|
|
111
|
+
"mode": "regex",
|
|
112
|
+
"pattern": "Accountno:[^0-9]*([0-9]+)"
|
|
101
113
|
}
|
|
102
114
|
},
|
|
103
115
|
|
|
@@ -128,10 +140,17 @@
|
|
|
128
140
|
"skip": ["-", "", "nan"]},
|
|
129
141
|
{"field": "remarks"}
|
|
130
142
|
],
|
|
143
|
+
"reference": [
|
|
144
|
+
{"field": "cheque", "skip": ["-", "", "nan"]}
|
|
145
|
+
],
|
|
131
146
|
"amount": {
|
|
132
147
|
"mode": "deposit_minus_withdrawal",
|
|
133
148
|
"deposit": "deposit",
|
|
134
149
|
"withdrawal": "withdrawal"
|
|
150
|
+
},
|
|
151
|
+
"account_id": {
|
|
152
|
+
"mode": "regex",
|
|
153
|
+
"pattern": "Account Number\\s+([^\\n]+)"
|
|
135
154
|
}
|
|
136
155
|
},
|
|
137
156
|
|
|
@@ -163,11 +182,18 @@
|
|
|
163
182
|
"skip": ["nan", "", "-"]},
|
|
164
183
|
{"field": "description"}
|
|
165
184
|
],
|
|
185
|
+
"reference": [
|
|
186
|
+
{"field": "ref", "skip": ["nan", "", "-"]}
|
|
187
|
+
],
|
|
166
188
|
"amount": {
|
|
167
189
|
"mode": "signed",
|
|
168
190
|
"field": "amount",
|
|
169
191
|
"sign_field": "sign",
|
|
170
192
|
"credit_values": ["CR"]
|
|
193
|
+
},
|
|
194
|
+
"account_id": {
|
|
195
|
+
"mode": "regex",
|
|
196
|
+
"pattern": "Account No\\.[^A-Za-z0-9]*([A-Za-z0-9]+)"
|
|
171
197
|
}
|
|
172
198
|
},
|
|
173
199
|
|
|
@@ -183,7 +209,8 @@
|
|
|
183
209
|
"date": ["date"],
|
|
184
210
|
"note": ["note"],
|
|
185
211
|
"category": ["category"],
|
|
186
|
-
"amount": ["amount"]
|
|
212
|
+
"amount": ["amount"],
|
|
213
|
+
"account": ["account"]
|
|
187
214
|
},
|
|
188
215
|
"date_field": "date",
|
|
189
216
|
"remarks": [
|
|
@@ -193,6 +220,10 @@
|
|
|
193
220
|
"amount": {
|
|
194
221
|
"mode": "direct",
|
|
195
222
|
"field": "amount"
|
|
223
|
+
},
|
|
224
|
+
"account_id": {
|
|
225
|
+
"mode": "column",
|
|
226
|
+
"field": "account"
|
|
196
227
|
}
|
|
197
228
|
}
|
|
198
229
|
}
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: statement_parser
|
|
3
|
-
Version: 0.1.0
|
|
3
|
+
Version: 0.1.1
|
|
4
4
|
Summary: Bank Statement Parser is a Python library designed to parse and normalize transaction data from various bank statement formats ( CSV, Excel, etc.) into a consistent and easy-to-use Pandas DataFrame. It supports multiple banks and file formats, making it a versatile tool for financial data analysis.
|
|
5
5
|
Author-email: Khuzema Challawala <khuzema.ac@gmail.com>
|
|
6
6
|
Classifier: Programming Language :: Python :: 3
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{statement_parser-0.1.0 → statement_parser-0.1.1}/statement_parser.egg-info/dependency_links.txt
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|