PyPI - bank-statement-mapper - Versions diffs - 0.1.0__py3-none-any.whl - Mend

bank-statement-mapper 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (16) hide show

bank_statement_mapper/__init__.py +66 -0
bank_statement_mapper/ai_matcher.py +253 -0
bank_statement_mapper/bank_mapper.py +877 -0
bank_statement_mapper/bank_mapper_api.py +169 -0
bank_statement_mapper/cli.py +227 -0
bank_statement_mapper/learn.py +201 -0
bank_statement_mapper/llm_fallback.py +118 -0
bank_statement_mapper/mapping_cache.py +69 -0
bank_statement_mapper/schema.py +263 -0
bank_statement_mapper/stores.py +238 -0
bank_statement_mapper-0.1.0.dist-info/METADATA +734 -0
bank_statement_mapper-0.1.0.dist-info/RECORD +16 -0
bank_statement_mapper-0.1.0.dist-info/WHEEL +5 -0
bank_statement_mapper-0.1.0.dist-info/entry_points.txt +2 -0
bank_statement_mapper-0.1.0.dist-info/licenses/LICENSE +21 -0
bank_statement_mapper-0.1.0.dist-info/top_level.txt +1 -0

bank_statement_mapper/__init__.py ADDED Viewed

@@ -0,0 +1,66 @@
+"""
+bank_statement_mapper — map any bank statement .xlsx to a standard schema.
+Two-stage, auditable pipeline: deterministic header detection + synonym/fuzzy
+column mapping, with an optional AI table matcher and a self-learning vocabulary.
+Quick start:
+    from bank_statement_mapper import process_file, MappingCache
+    res = process_file("statement.xlsx", cache=MappingCache())
+    print(res.records)          # list[dict], ready for JSON / DB
+Heavier pieces are kept as submodules so importing this package stays light:
+    from bank_statement_mapper.ai_matcher import OpenAICompatibleMatcher
+    from bank_statement_mapper.bank_mapper_api import router   # needs [api] extra
+"""
+from .bank_mapper import (
+    ALLOWED_FIELDS,
+    OUTPUT_SCHEMA,
+    ColumnMap,
+    OutputResult,
+    ProcessResult,
+    apply_learned,
+    configure,
+    detect_header_row,
+    map_columns,
+    normalize_amount,
+    normalize_date,
+    process_file,
+    process_stream,
+    records_to_csv_bytes,
+)
+from .learn import LearnStore, harvest_folder, learn_from_result
+from .mapping_cache import MappingCache
+from .schema import Config, config_from_dict, default_config, load_config
+from .stores import open_store
+# Package version string; it is also exported through __all__ below.
+__version__ = "0.1.0"
+# Names exported by `from bank_statement_mapper import *`. Every entry is either
+# imported from a submodule at the top of this file or defined in this file.
+__all__ = [
+    "process_file",
+    "process_stream",
+    "records_to_csv_bytes",
+    "configure",
+    "apply_learned",
+    "MappingCache",
+    "LearnStore",
+    "learn_from_result",
+    "harvest_folder",
+    "load_config",
+    "config_from_dict",
+    "default_config",
+    "Config",
+    "open_store",
+    "ProcessResult",
+    "ColumnMap",
+    "OutputResult",
+    "OUTPUT_SCHEMA",
+    "ALLOWED_FIELDS",
+    "detect_header_row",
+    "map_columns",
+    "normalize_amount",
+    "normalize_date",
+    "__version__",
+]

bank_statement_mapper/ai_matcher.py ADDED Viewed

@@ -0,0 +1,253 @@
+"""
+ai_matcher.py — LLM-based, table-level column matcher for NEW bank layouts.
+This is the high-accuracy path: when a statement's
+header is unknown to the synonym table, one LLM call maps the whole header row
+to the output fields and the result is written straight into mapping_cache.json,
+so that bank is "known" forever after (never hits the LLM again).
+PRIVACY — the model matches the TABLE, never the data
+-----------------------------------------------------
+The prompt contains ONLY:
+  * column header strings (e.g. "Withdrawals", "Value Dt")
+  * a structural profile per column computed locally (dtype, sign, fill-rate,
+    which columns are mutually exclusive) — this is metadata, NOT cell contents
+  * the list of allowed output fields + short descriptions
+It NEVER contains transaction amounts, dates, names, narrations or references.
+No real statement data leaves the machine. (You can opt into sending a couple of
+sanitized sample values with include_samples=True, but it is OFF by default.)
+Provider — OpenAI-compatible
+----------------------------
+Works with any endpoint that speaks the OpenAI /chat/completions API: OpenAI,
+Azure OpenAI, Together, Groq, or a local vLLM / Ollama / LM Studio server. Set
+base_url + api_key + model. Uses only the Python standard library (urllib), so
+there is no SDK dependency to install or pin.
+"""
+from __future__ import annotations
+import datetime as _dt
+import json
+import os
+import re
+import urllib.request
+from typing import Callable, Optional
+# Concise field definitions the LLM maps onto. Kept separate from the embedding
+# descriptions because an instruct model wants crisp semantics, not keyword soup.
+# Keys are output field names; values are the one-line descriptions rendered
+# into the "Allowed fields" section of the prompt. OpenAICompatibleMatcher uses
+# this dict as its default when no `field_defs` argument is supplied.
+FIELD_DEFS: dict[str, str] = {
+    "date": "the transaction date (post/value/booking date)",
+    "description": "free-text narration / particulars / details of the transaction",
+    "reference": "reference or cheque/UTR/instrument number identifying the entry",
+    "debit": "money leaving the account (withdrawal / paid out); a debit-only column",
+    "credit": "money entering the account (deposit / paid in); a credit-only column",
+    "balance": "running account balance after the transaction",
+    "amount": "a SINGLE signed amount column (one column, +credit / -debit)",
+}
+# --------------------------------------------------------------------------
+# Structural profiling — deterministic, no cell contents leave this function
+# --------------------------------------------------------------------------
+def _classify(v) -> str:
+    if v is None or (isinstance(v, str) and v.strip() == ""):
+        return "empty"
+    if isinstance(v, (_dt.datetime, _dt.date)):
+        return "date"
+    if isinstance(v, bool):
+        return "text"
+    if isinstance(v, (int, float)):
+        return "number"
+    s = str(v).strip()
+    if re.match(r"^[-(]?[\d,]+\.?\d*\)?\s*(dr|cr)?$", s, re.I):
+        return "number"
+    if re.search(r"\d{1,4}[-/.]\d{1,2}[-/.]\d{1,4}", s) or \
+       re.match(r"\d{1,2}\s*[A-Za-z]{3,9}\s*\d{2,4}", s):
+        return "date"
+    return "text"
+def _is_negative(v) -> bool:
+    if isinstance(v, (int, float)) and not isinstance(v, bool):
+        return v < 0
+    if isinstance(v, str):
+        s = v.strip().lower()
+        return s.startswith("-") or ("(" in s and ")" in s) or s.endswith("dr")
+    return False
+def profile_columns(header_row: list, data_rows: list[list],
+                    max_rows: int = 40) -> list[dict]:
+    """Return a per-column STRUCTURAL profile — no raw cell values.
+    Fields: index, name, dtype (majority), fill_rate, has_negative,
+    mutually_exclusive_with (column indices never co-filled -> debit/credit
+    pairs). This is exactly the signal a human uses to tell debit from credit
+    without reading the numbers.
+    """
+    ncols = len(header_row)
+    rows = data_rows[:max_rows]
+    filled = [[False] * ncols for _ in rows]
+    dtypes: list[list[str]] = [[] for _ in range(ncols)]
+    neg = [False] * ncols
+    for r_i, row in enumerate(rows):
+        for c in range(ncols):
+            v = row[c] if c < len(row) else None
+            t = _classify(v)
+            if t != "empty":
+                filled[r_i][c] = True
+                dtypes[c].append(t)
+                if _is_negative(v):
+                    neg[c] = True
+    profiles = []
+    for c in range(ncols):
+        types = dtypes[c]
+        majority = max(set(types), key=types.count) if types else "empty"
+        fill_rate = (sum(1 for r in filled if r[c]) / len(rows)) if rows else 0.0
+        # mutual exclusivity: never filled in the same row as column d
+        excl = []
+        for d in range(ncols):
+            if d == c:
+                continue
+            both = any(r[c] and r[d] for r in filled)
+            c_has = any(r[c] for r in filled)
+            d_has = any(r[d] for r in filled)
+            if c_has and d_has and not both:
+                excl.append(d)
+        profiles.append({
+            "index": c,
+            "name": ("" if header_row[c] is None else str(header_row[c]).strip()),
+            "dtype": majority,
+            "fill_rate": round(fill_rate, 2),
+            "has_negative": neg[c],
+            "mutually_exclusive_with": excl,
+        })
+    return profiles
+# --------------------------------------------------------------------------
+# OpenAI-compatible table matcher
+# --------------------------------------------------------------------------
+class OpenAICompatibleMatcher:
+    """Map an unknown header row to output fields with one LLM call.
+    Transport is any OpenAI-compatible /chat/completions endpoint. Inject a
+    custom `transport` (messages -> assistant_text) to unit-test without network.
+    """
+    def __init__(self,
+                 base_url: Optional[str] = None,
+                 api_key: Optional[str] = None,
+                 model: Optional[str] = None,
+                 field_defs: Optional[dict] = None,
+                 include_samples: bool = False,
+                 timeout: float = 30.0,
+                 temperature: float = 0.0,
+                 transport: Optional[Callable[[list], str]] = None):
+        self.base_url = (base_url or os.getenv("OPENAI_BASE_URL")
+                         or "https://api.openai.com/v1").rstrip("/")
+        self.api_key = api_key or os.getenv("OPENAI_API_KEY", "")
+        self.model = model or os.getenv("OPENAI_MODEL", "gpt-4o-mini")
+        self.field_defs = field_defs or FIELD_DEFS
+        self.include_samples = include_samples
+        self.timeout = timeout
+        self.temperature = temperature
+        self._transport = transport  # for tests / custom clients
+    # -- prompt construction (structure only) --
+    def _build_messages(self, profiles: list[dict], allowed_fields: list[str]) -> list:
+        field_lines = "\n".join(
+            f"  - {f}: {self.field_defs.get(f, f)}"
+            for f in allowed_fields
+        )
+        col_lines = []
+        for p in profiles:
+            excl = (f", mutually-exclusive with columns {p['mutually_exclusive_with']}"
+                    if p["mutually_exclusive_with"] else "")
+            neg = ", contains negative values" if p["has_negative"] else ""
+            col_lines.append(
+                f"  [{p['index']}] name={p['name']!r} "
+                f"type={p['dtype']} fill={p['fill_rate']}{neg}{excl}"
+            )
+        cols = "\n".join(col_lines)
+        system = (
+            "You map bank-statement spreadsheet COLUMNS to a fixed schema. "
+            "You are given only column headers and structural metadata (data "
+            "types, fill rates, sign, and which columns are mutually exclusive) "
+            "— never the actual transaction values. Use the header wording plus "
+            "these structural hints. Two money columns that are mutually "
+            "exclusive are almost always a debit/credit pair; decide direction "
+            "from the header wording. A single signed money column (has negative "
+            "values, not mutually exclusive with another money column) is "
+            "'amount'. Respond with ONLY a JSON object mapping the column index "
+            "(as a string) to one field name, or null if a column matches no "
+            "field. Do not invent fields."
+        )
+        user = (
+            f"Allowed fields:\n{field_lines}\n\n"
+            f"Columns:\n{cols}\n\n"
+            "Return JSON like {\"0\": \"date\", \"1\": \"description\", "
+            "\"4\": null}. Every column index must appear exactly once."
+        )
+        return [{"role": "system", "content": system},
+                {"role": "user", "content": user}]
+    # -- HTTP transport (stdlib) --
+    def _http(self, messages: list) -> str:
+        payload = {
+            "model": self.model,
+            "messages": messages,
+            "temperature": self.temperature,
+            "response_format": {"type": "json_object"},
+        }
+        req = urllib.request.Request(
+            f"{self.base_url}/chat/completions",
+            data=json.dumps(payload).encode("utf-8"),
+            headers={
+                "Content-Type": "application/json",
+                "Authorization": f"Bearer {self.api_key}",
+            },
+            method="POST",
+        )
+        with urllib.request.urlopen(req, timeout=self.timeout) as resp:
+            body = json.loads(resp.read().decode("utf-8"))
+        return body["choices"][0]["message"]["content"]
+    # -- parse + validate --
+    @staticmethod
+    def _parse(text: str, ncols: int, allowed_fields: list[str]) -> dict:
+        m = re.search(r"\{.*\}", text, re.S)
+        raw = json.loads(m.group(0) if m else text)
+        # single-slot fields: keep only the first (highest-priority) assignment
+        result: dict[int, str] = {}
+        seen: set[str] = set()
+        for k, v in raw.items():
+            try:
+                ci = int(k)
+            except (ValueError, TypeError):
+                continue
+            if not (0 <= ci < ncols):
+                continue
+            if v in allowed_fields and v not in seen:
+                result[ci] = v
+                seen.add(v)
+        return result
+    def __call__(self, header_row: list, data_rows: list[list],
+                 allowed_fields: list[str]) -> dict:
+        """Return {col_index: field} for the header. Empty dict on any failure
+        (caller then leaves those columns unmapped -> needs_review)."""
+        profiles = profile_columns(header_row, data_rows)
+        messages = self._build_messages(profiles, allowed_fields)
+        try:
+            text = self._transport(messages) if self._transport else self._http(messages)
+        except Exception:  # noqa: BLE001 — network/parse errors must not crash the pipeline
+            return {}
+        try:
+            return self._parse(text, len(header_row), allowed_fields)
+        except (json.JSONDecodeError, ValueError, TypeError):
+            return {}