statement-parser 0.1.0__tar.gz → 0.1.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: statement_parser
3
- Version: 0.1.0
3
+ Version: 0.1.1
4
4
  Summary: Bank Statement Parser is a Python library designed to parse and normalize transaction data from various bank statement formats ( CSV, Excel, etc.) into a consistent and easy-to-use Pandas DataFrame. It supports multiple banks and file formats, making it a versatile tool for financial data analysis.
5
5
  Author-email: Khuzema Challawala <khuzema.ac@gmail.com>
6
6
  Classifier: Programming Language :: Python :: 3
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "statement_parser"
7
- version = "0.1.0"
7
+ version = "0.1.1"
8
8
  authors = [
9
9
  { name="Khuzema Challawala", email="khuzema.ac@gmail.com" },
10
10
  ]
@@ -84,7 +84,7 @@ class GenericBank(Bank):
84
84
  # ------------------------------------------------------------------ #
85
85
  def getTransactions(self, filename: str) -> list[Transaction]:
86
86
  df = self.getData(filename)
87
- return self._build_transactions(df)
87
+ return self._build_transactions(df, filename)
88
88
 
89
89
  def getData(self, filename: str) -> pd.DataFrame:
90
90
  df = self._load(filename)
@@ -227,7 +227,8 @@ class GenericBank(Bank):
227
227
  # ------------------------------------------------------------------ #
228
228
  # Transaction building
229
229
  # ------------------------------------------------------------------ #
230
- def _build_transactions(self, df: pd.DataFrame) -> list[Transaction]:
230
+ def _build_transactions(self, df: pd.DataFrame,
231
+ filename: str) -> list[Transaction]:
231
232
  cols = self._resolve_columns(df)
232
233
  work = pd.DataFrame(index=df.index)
233
234
 
@@ -243,6 +244,13 @@ class GenericBank(Bank):
243
244
  # Remarks (without the duplicate marker yet)
244
245
  work["_remarks"] = self._compute_remarks(df, cols).loc[work.index]
245
246
 
247
+ # Reference id (cheque / reference number, when present)
248
+ work["_reference"] = self._compute_reference(df, cols).loc[work.index]
249
+
250
+ # Account id (account or credit card number for the statement)
251
+ work["_account"] = self._compute_account_id(
252
+ filename, df, cols).loc[work.index]
253
+
246
254
  # Duplicate sequence marker
247
255
  seq = (
248
256
  work.groupby(["_date", "_remarks", "_amount"]).cumcount().add(1)
@@ -259,6 +267,8 @@ class GenericBank(Bank):
259
267
  created_date=row["_date"],
260
268
  remarks=remarks,
261
269
  amount=row["_amount"],
270
+ reference_id=row["_reference"],
271
+ account_id=row["_account"],
262
272
  )
263
273
  )
264
274
  return transactions
@@ -310,3 +320,71 @@ class GenericBank(Bank):
310
320
  result = result + column.map(render)
311
321
 
312
322
  return result.str.strip()
323
+
324
def _compute_reference(self, df: pd.DataFrame, cols: dict) -> pd.Series:
    """Build the reference / cheque number for every row of *df*.

    The ``"reference"`` entry of ``self.config`` is a list of parts; each
    part names a logical ``field`` and may carry a ``prefix``, a
    ``suffix`` and a ``skip`` list of placeholder values (compared
    case-insensitively). The rendered parts are concatenated per row.

    Args:
        df: Statement data as loaded from the file.
        cols: Mapping of logical field name to the actual column label
            in *df*.

    Returns:
        A Series of strings aligned to ``df.index``; rows without a
        usable reference hold ``""``.

    Raises:
        KeyError: If a configured part has no ``"field"`` key.
    """
    result = pd.Series([""] * len(df), index=df.index)
    parts_cfg = self.config.get("reference")
    if not parts_cfg:
        return result

    def to_text(value) -> str:
        # Missing cells carry no reference at all.
        if pd.isna(value):
            return ""
        # pandas loads a numeric column that has gaps as float64, so a
        # cheque number such as 123456 arrives as 123456.0; render it
        # without the spurious ".0".
        if isinstance(value, float) and value.is_integer():
            return str(int(value))
        return str(value).strip()

    for part in parts_cfg:
        field = part["field"]
        if field not in cols:
            # Parts whose column is absent from this statement are ignored.
            continue
        prefix = part.get("prefix", "")
        suffix = part.get("suffix", "")
        skip = {str(s).lower() for s in part.get("skip", [])}

        # Bind the per-part settings as defaults so the function never
        # depends on the loop variables after this iteration.
        def render(value, prefix=prefix, suffix=suffix, skip=skip):
            text = to_text(value)
            # An empty value never produces a bare prefix/suffix.
            if not text or text.lower() in skip:
                return ""
            return prefix + text + suffix

        result = result + df[cols[field]].map(render)

    return result.str.strip()
349
+
350
def _raw_text(self, filename: str) -> str:
    """Return the whole statement file as text so metadata can be searched.

    Excel workbooks (``.xls`` / ``.xlsx``, matched case-insensitively)
    are read without a header row and flattened: the non-empty cells of
    each row are joined with spaces and the rows with newlines. Any
    other file is read as text.

    Args:
        filename: Path of the statement file.

    Returns:
        The file content as a single string.

    Raises:
        OSError: If the file cannot be opened.
    """
    # Compare on a lower-cased name so "STATEMENT.XLSX" is not handed to
    # the text reader by mistake.
    if str(filename).lower().endswith((".xls", ".xlsx")):
        raw = pd.read_excel(filename, header=None)
        return "\n".join(
            " ".join(str(v) for v in row if pd.notna(v))
            for row in raw.values.tolist()
        )
    # The text is only searched for metadata, so decoding is best-effort:
    # "utf-8-sig" drops a leading BOM and errors="replace" keeps a stray
    # legacy-encoded byte from aborting the whole parse.
    with open(filename, "r", encoding="utf-8-sig",
              errors="replace") as handle:
        return handle.read()
360
+
361
def _compute_account_id(self, filename: str, df: pd.DataFrame,
                        cols: dict) -> pd.Series:
    """Resolve the account / credit-card number for the statement.

    Driven by the ``"account_id"`` entry of ``self.config``:

    * ``mode == "column"``: the value is read per row from the column
      mapped to ``cfg["field"]``.
    * ``mode == "regex"`` (the default): ``cfg["pattern"]`` is searched,
      case-insensitively and in multiline mode, in the raw file text and
      the same value is applied to every row. The first capture group is
      used; a pattern without groups yields the whole match.

    Args:
        filename: Path of the statement file (read only in regex mode).
        df: Statement data as loaded from the file.
        cols: Mapping of logical field name to the actual column label
            in *df*.

    Returns:
        A Series of strings aligned to ``df.index``; ``""`` where no
        account id is configured or found.

    Raises:
        KeyError: If the config lacks the ``"field"`` / ``"pattern"`` key
            required by the selected mode.
        ValueError: If the configured mode is not recognised.
    """
    cfg = self.config.get("account_id")
    result = pd.Series([""] * len(df), index=df.index)
    if not cfg:
        return result

    mode = cfg.get("mode", "regex")

    if mode == "column":
        field = cfg["field"]
        if field not in cols:
            return result

        def to_text(value) -> str:
            if pd.isna(value):
                return ""
            # Numeric columns with gaps are loaded as float64; avoid
            # turning account 1234567890 into "1234567890.0".
            if isinstance(value, float) and value.is_integer():
                return str(int(value))
            return str(value).strip()

        return df[cols[field]].map(to_text)

    if mode == "regex":
        text = self._raw_text(filename)
        match = re.search(
            cfg["pattern"], text, re.IGNORECASE | re.MULTILINE
        )
        value = ""
        if match:
            # A pattern written without a capture group would make
            # group(1) raise IndexError; use the whole match instead.
            captured = match.group(1) if match.re.groups else match.group(0)
            # An optional group that did not participate is None.
            value = (captured or "").strip()
        return pd.Series([value] * len(df), index=df.index)

    raise ValueError(
        f"[{self.bank_id}] Unknown account_id mode '{mode}'"
    )
@@ -7,21 +7,29 @@ class Transaction:
7
7
  bank: str
8
8
  remarks: str
9
9
  amount: float
10
+ reference_id: str
11
+ account_id: str
10
12
 
11
13
  def __init__(self, bank: str,
12
14
  created_date: datetime,
13
15
  remarks: str,
14
- amount: float):
16
+ amount: float,
17
+ reference_id: str = "",
18
+ account_id: str = ""):
15
19
  self.bank = bank
16
20
  self.created_date = created_date
17
21
  self.remarks = remarks
18
22
  self.amount = amount
23
+ self.reference_id = reference_id
24
+ self.account_id = account_id
19
25
 
20
26
  def hash(self):
21
27
  strObj = (str(self.bank) +
22
28
  str(self.created_date) +
23
29
  str(self.remarks) +
24
- str(self.amount))
30
+ str(self.amount) +
31
+ str(self.reference_id) +
32
+ str(self.account_id))
25
33
  hash_obj = hashlib.sha1(strObj.encode("utf-8"))
26
34
  hex_hash = hash_obj.hexdigest()
27
35
  return hex_hash
@@ -32,5 +40,7 @@ class Transaction:
32
40
  "created_date": self.created_date,
33
41
  "remarks": self.remarks,
34
42
  "amount": self.amount,
43
+ "reference_id": self.reference_id,
44
+ "account_id": self.account_id,
35
45
  "hash": self.hash(),
36
46
  }
@@ -23,6 +23,10 @@
23
23
  "field": "amount",
24
24
  "sign_field": "sign",
25
25
  "credit_values": ["CR"]
26
+ },
27
+ "account_id": {
28
+ "mode": "regex",
29
+ "pattern": "Card No:\\s*([0-9X][0-9X ]*[0-9X])"
26
30
  }
27
31
  },
28
32
 
@@ -71,6 +75,10 @@
71
75
  "mode": "deposit_minus_withdrawal",
72
76
  "deposit": "deposit",
73
77
  "withdrawal": "withdrawal"
78
+ },
79
+ "account_id": {
80
+ "mode": "regex",
81
+ "pattern": "ACCOUNT-\\s*([0-9]+)"
74
82
  }
75
83
  },
76
84
 
@@ -98,6 +106,10 @@
98
106
  "field": "amount",
99
107
  "sign_field": "sign",
100
108
  "credit_values": ["CR"]
109
+ },
110
+ "account_id": {
111
+ "mode": "regex",
112
+ "pattern": "Accountno:[^0-9]*([0-9]+)"
101
113
  }
102
114
  },
103
115
 
@@ -128,10 +140,17 @@
128
140
  "skip": ["-", "", "nan"]},
129
141
  {"field": "remarks"}
130
142
  ],
143
+ "reference": [
144
+ {"field": "cheque", "skip": ["-", "", "nan"]}
145
+ ],
131
146
  "amount": {
132
147
  "mode": "deposit_minus_withdrawal",
133
148
  "deposit": "deposit",
134
149
  "withdrawal": "withdrawal"
150
+ },
151
+ "account_id": {
152
+ "mode": "regex",
153
+ "pattern": "Account Number\\s+([^\\n]+)"
135
154
  }
136
155
  },
137
156
 
@@ -163,11 +182,18 @@
163
182
  "skip": ["nan", "", "-"]},
164
183
  {"field": "description"}
165
184
  ],
185
+ "reference": [
186
+ {"field": "ref", "skip": ["nan", "", "-"]}
187
+ ],
166
188
  "amount": {
167
189
  "mode": "signed",
168
190
  "field": "amount",
169
191
  "sign_field": "sign",
170
192
  "credit_values": ["CR"]
193
+ },
194
+ "account_id": {
195
+ "mode": "regex",
196
+ "pattern": "Account No\\.[^A-Za-z0-9]*([A-Za-z0-9]+)"
171
197
  }
172
198
  },
173
199
 
@@ -183,7 +209,8 @@
183
209
  "date": ["date"],
184
210
  "note": ["note"],
185
211
  "category": ["category"],
186
- "amount": ["amount"]
212
+ "amount": ["amount"],
213
+ "account": ["account"]
187
214
  },
188
215
  "date_field": "date",
189
216
  "remarks": [
@@ -193,6 +220,10 @@
193
220
  "amount": {
194
221
  "mode": "direct",
195
222
  "field": "amount"
223
+ },
224
+ "account_id": {
225
+ "mode": "column",
226
+ "field": "account"
196
227
  }
197
228
  }
198
229
  }
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: statement_parser
3
- Version: 0.1.0
3
+ Version: 0.1.1
4
4
  Summary: Bank Statement Parser is a Python library designed to parse and normalize transaction data from various bank statement formats ( CSV, Excel, etc.) into a consistent and easy-to-use Pandas DataFrame. It supports multiple banks and file formats, making it a versatile tool for financial data analysis.
5
5
  Author-email: Khuzema Challawala <khuzema.ac@gmail.com>
6
6
  Classifier: Programming Language :: Python :: 3