PyPI - statement-parser - Versions diffs - 0.1.0__tar.gz → 0.1.1__tar.gz - Mend

statement-parser 0.1.0__tar.gz → 0.1.1__tar.gz

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (15) hide show

{statement_parser-0.1.0/statement_parser.egg-info → statement_parser-0.1.1}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: statement_parser
-Version: 0.1.0
+Version: 0.1.1
 Summary: Bank Statement Parser is a Python library designed to parse and normalize transaction data from various bank statement formats ( CSV, Excel, etc.) into a consistent and easy-to-use Pandas DataFrame. It supports multiple banks and file formats, making it a versatile tool for financial data analysis.
 Author-email: Khuzema Challawala <khuzema.ac@gmail.com>
 Classifier: Programming Language :: Python :: 3

{statement_parser-0.1.0 → statement_parser-0.1.1}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "statement_parser"
-version = "0.1.0"
+version = "0.1.1"
 authors = [
     { name="Khuzema Challawala", email="khuzema.ac@gmail.com" },
 ]

{statement_parser-0.1.0 → statement_parser-0.1.1}/statement_parser/GenericBank.py RENAMED Viewed

@@ -84,7 +84,7 @@ class GenericBank(Bank):
     # ------------------------------------------------------------------ #
     def getTransactions(self, filename: str) -> list[Transaction]:
         df = self.getData(filename)
-        return self._build_transactions(df)
+        return self._build_transactions(df, filename)
     def getData(self, filename: str) -> pd.DataFrame:
         df = self._load(filename)
@@ -227,7 +227,8 @@ class GenericBank(Bank):
     # ------------------------------------------------------------------ #
     # Transaction building
     # ------------------------------------------------------------------ #
-    def _build_transactions(self, df: pd.DataFrame) -> list[Transaction]:
+    def _build_transactions(self, df: pd.DataFrame,
+                            filename: str) -> list[Transaction]:
         cols = self._resolve_columns(df)
         work = pd.DataFrame(index=df.index)
@@ -243,6 +244,13 @@ class GenericBank(Bank):
         # Remarks (without the duplicate marker yet)
         work["_remarks"] = self._compute_remarks(df, cols).loc[work.index]
+        # Reference id (cheque / reference number, when present)
+        work["_reference"] = self._compute_reference(df, cols).loc[work.index]
+        # Account id (account or credit card number for the statement)
+        work["_account"] = self._compute_account_id(
+            filename, df, cols).loc[work.index]
         # Duplicate sequence marker
         seq = (
             work.groupby(["_date", "_remarks", "_amount"]).cumcount().add(1)
@@ -259,6 +267,8 @@ class GenericBank(Bank):
                     created_date=row["_date"],
                     remarks=remarks,
                     amount=row["_amount"],
+                    reference_id=row["_reference"],
+                    account_id=row["_account"],
                 )
             )
         return transactions
@@ -310,3 +320,71 @@ class GenericBank(Bank):
             result = result + column.map(render)
         return result.str.strip()
+    def _compute_reference(self, df: pd.DataFrame, cols: dict) -> pd.Series:
+        """Extract a reference / cheque number when the config defines one."""
+        parts_cfg = self.config.get("reference")
+        result = pd.Series([""] * len(df), index=df.index)
+        if not parts_cfg:
+            return result
+        for part in parts_cfg:
+            field = part["field"]
+            if field not in cols:
+                continue
+            prefix = part.get("prefix", "")
+            suffix = part.get("suffix", "")
+            skip = {str(s).lower() for s in part.get("skip", [])}
+            column = df[cols[field]]
+            def render(value):
+                text = "" if pd.isna(value) else str(value).strip()
+                if text.lower() in skip:
+                    return ""
+                return prefix + text + suffix
+            result = result + column.map(render)
+        return result.str.strip()
+    def _raw_text(self, filename: str) -> str:
+        """Return the whole file as text so metadata can be searched."""
+        if filename.endswith((".xls", ".xlsx")):
+            raw = pd.read_excel(filename, header=None)
+            return "\n".join(
+                " ".join(str(v) for v in row if pd.notna(v))
+                for row in raw.values.tolist()
+            )
+        with open(filename, "r", encoding="utf-8") as handle:
+            return handle.read()
+    def _compute_account_id(self, filename: str, df: pd.DataFrame,
+                            cols: dict) -> pd.Series:
+        """Resolve the account / credit-card number for the statement."""
+        cfg = self.config.get("account_id")
+        result = pd.Series([""] * len(df), index=df.index)
+        if not cfg:
+            return result
+        mode = cfg.get("mode", "regex")
+        if mode == "column":
+            field = cfg["field"]
+            if field not in cols:
+                return result
+            column = df[cols[field]]
+            return column.map(
+                lambda v: "" if pd.isna(v) else str(v).strip()
+            )
+        if mode == "regex":
+            text = self._raw_text(filename)
+            match = re.search(
+                cfg["pattern"], text, re.IGNORECASE | re.MULTILINE
+            )
+            value = match.group(1).strip() if match else ""
+            return pd.Series([value] * len(df), index=df.index)
+        raise ValueError(
+            f"[{self.bank_id}] Unknown account_id mode '{mode}'"
+        )

{statement_parser-0.1.0 → statement_parser-0.1.1}/statement_parser/Transaction.py RENAMED Viewed

@@ -7,21 +7,29 @@ class Transaction:
     bank: str
     remarks: str
     amount: float
+    reference_id: str
+    account_id: str
     def __init__(self, bank: str,
                  created_date: datetime,
                  remarks: str,
-                 amount: float):
+                 amount: float,
+                 reference_id: str = "",
+                 account_id: str = ""):
         self.bank = bank
         self.created_date = created_date
         self.remarks = remarks
         self.amount = amount
+        self.reference_id = reference_id
+        self.account_id = account_id
     def hash(self):
         strObj = (str(self.bank) +
                   str(self.created_date) +
                   str(self.remarks) +
-                  str(self.amount))
+                  str(self.amount) +
+                  str(self.reference_id) +
+                  str(self.account_id))
         hash_obj = hashlib.sha1(strObj.encode("utf-8"))
         hex_hash = hash_obj.hexdigest()
         return hex_hash
@@ -32,5 +40,7 @@ class Transaction:
             "created_date": self.created_date,
             "remarks": self.remarks,
             "amount": self.amount,
+            "reference_id": self.reference_id,
+            "account_id": self.account_id,
             "hash": self.hash(),
         }

{statement_parser-0.1.0 → statement_parser-0.1.1}/statement_parser/bank_configs.json RENAMED Viewed

@@ -23,6 +23,10 @@
       "field": "amount",
       "sign_field": "sign",
       "credit_values": ["CR"]
+    },
+    "account_id": {
+      "mode": "regex",
+      "pattern": "Card No:\\s*([0-9X][0-9X ]*[0-9X])"
     }
   },
@@ -71,6 +75,10 @@
       "mode": "deposit_minus_withdrawal",
       "deposit": "deposit",
       "withdrawal": "withdrawal"
+    },
+    "account_id": {
+      "mode": "regex",
+      "pattern": "ACCOUNT-\\s*([0-9]+)"
     }
   },
@@ -98,6 +106,10 @@
       "field": "amount",
       "sign_field": "sign",
       "credit_values": ["CR"]
+    },
+    "account_id": {
+      "mode": "regex",
+      "pattern": "Accountno:[^0-9]*([0-9]+)"
     }
   },
@@ -128,10 +140,17 @@
        "skip": ["-", "", "nan"]},
       {"field": "remarks"}
     ],
+    "reference": [
+      {"field": "cheque", "skip": ["-", "", "nan"]}
+    ],
     "amount": {
       "mode": "deposit_minus_withdrawal",
       "deposit": "deposit",
       "withdrawal": "withdrawal"
+    },
+    "account_id": {
+      "mode": "regex",
+      "pattern": "Account Number\\s+([^\\n]+)"
     }
   },
@@ -163,11 +182,18 @@
        "skip": ["nan", "", "-"]},
       {"field": "description"}
     ],
+    "reference": [
+      {"field": "ref", "skip": ["nan", "", "-"]}
+    ],
     "amount": {
       "mode": "signed",
       "field": "amount",
       "sign_field": "sign",
       "credit_values": ["CR"]
+    },
+    "account_id": {
+      "mode": "regex",
+      "pattern": "Account No\\.[^A-Za-z0-9]*([A-Za-z0-9]+)"
     }
   },
@@ -183,7 +209,8 @@
       "date": ["date"],
       "note": ["note"],
       "category": ["category"],
-      "amount": ["amount"]
+      "amount": ["amount"],
+      "account": ["account"]
     },
     "date_field": "date",
     "remarks": [
@@ -193,6 +220,10 @@
     "amount": {
       "mode": "direct",
       "field": "amount"
+    },
+    "account_id": {
+      "mode": "column",
+      "field": "account"
     }
   }
 }

{statement_parser-0.1.0 → statement_parser-0.1.1/statement_parser.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: statement_parser
-Version: 0.1.0
+Version: 0.1.1
 Summary: Bank Statement Parser is a Python library designed to parse and normalize transaction data from various bank statement formats ( CSV, Excel, etc.) into a consistent and easy-to-use Pandas DataFrame. It supports multiple banks and file formats, making it a versatile tool for financial data analysis.
 Author-email: Khuzema Challawala <khuzema.ac@gmail.com>
 Classifier: Programming Language :: Python :: 3