PyPI - tabularmapper - Versions diffs - 1.0.0__py3-none-any.whl - Mend

tabularmapper 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (16) hide show

tabularmapper/__init__.py +75 -0
tabularmapper/ai_matcher.py +247 -0
tabularmapper/api.py +186 -0
tabularmapper/cli.py +233 -0
tabularmapper/engine.py +938 -0
tabularmapper/learn.py +203 -0
tabularmapper/llm_fallback.py +118 -0
tabularmapper/mapping_cache.py +73 -0
tabularmapper/schema.py +341 -0
tabularmapper/stores.py +238 -0
tabularmapper-1.0.0.dist-info/METADATA +455 -0
tabularmapper-1.0.0.dist-info/RECORD +16 -0
tabularmapper-1.0.0.dist-info/WHEEL +5 -0
tabularmapper-1.0.0.dist-info/entry_points.txt +2 -0
tabularmapper-1.0.0.dist-info/licenses/LICENSE +21 -0
tabularmapper-1.0.0.dist-info/top_level.txt +1 -0

tabularmapper/learn.py ADDED Viewed

@@ -0,0 +1,203 @@
+"""
+learn.py — self-learning synonym vocabulary.
+When the AI (or a human) confirms that a header maps to a field, that phrase is
+recorded here. Next time any bank uses that header it's a deterministic EXACT
+match — the AI never fires for it again, and nobody edits code. Over time the
+AI-call rate trends to zero.
+Two halves, cleanly split:
+  * CONFIG (schema.py / config.json) = seed vocabulary, read-only, from S3/URL.
+  * LEARNED (this store)             = mutable, grows from real traffic.
+Effective synonyms at match time = seed + learned (seed wins on conflict).
+Storage uses the same URL convention as the cache (stores.open_store):
+    LearnStore()                                  # env TABULARMAPPER_LEARN_STORE, else in-memory
+    LearnStore("redis://localhost:6379/0")
+    LearnStore("memory://")                        # tests
+Trust policy (financial-data safe by default):
+  * date / description / reference / balance / amount  -> auto-applied.
+  * debit / credit                                     -> held in `pending` for a
+    human to approve(), because a wrong debit/credit direction is the one costly
+    error. Set auto_apply_gated=True to skip review (fully unattended).
+"""
+from __future__ import annotations
+import os
+import re
+import time
+from typing import Optional
+from .stores import open_store
+# In-memory by default — creates NO files. Set TABULARMAPPER_LEARN_STORE (or pass a
+# URL) to a path / redis:// / valkey:// / postgresql:// for persistence.
+_DEFAULT_URL = "memory://"
+# Single store key under which LearnStore reads and writes its whole record.
+_KEY = "learned"
+# Fields whose "ai"/"harvest" proposals LearnStore.add() parks in `pending`
+# unless auto_apply_gated is set (see the trust policy in the module docstring).
+_DEFAULT_GATED = frozenset({"debit", "credit"})
+def _norm(s) -> str:
+    """Canonical form of a header phrase: stringified, trimmed, lower-cased,
+    with every run of whitespace collapsed to a single space. None becomes ""."""
+    if s is None:
+        return ""
+    lowered = str(s).strip().lower()
+    return re.sub(r"\s+", " ", lowered)
+def _now() -> str:
+    """Current UTC time as an ISO-8601 string with second precision and a
+    trailing 'Z' (e.g. 2024-01-31T09:15:00Z)."""
+    utc = time.gmtime()
+    return time.strftime("%Y-%m-%dT%H:%M:%SZ", utc)
+class LearnStore:
+    """Mutable vocabulary of learned header->field synonyms plus a review queue.
+    All state lives in one record under a single store key:
+    {"version", "fields", "pending", "conflicts"}. `fields` maps a field name to
+    its applied entries, `pending` holds gated proposals awaiting approve() /
+    reject(), and `conflicts` logs proposals that contradicted an applied phrase.
+    Invariant kept by add() and approve(): an applied phrase belongs to exactly
+    one field.
+    """
+    def __init__(self, source: Optional[str] = None, *,
+                 gated_fields=_DEFAULT_GATED, auto_apply_gated: bool = False):
+        """Open the backing store.
+        source: store URL; falls back to env TABULARMAPPER_LEARN_STORE, then to
+            the module default URL.
+        gated_fields: fields whose "ai"/"harvest" proposals wait for approval.
+        auto_apply_gated: when True, gated fields are applied without review.
+        """
+        url = source or os.getenv("TABULARMAPPER_LEARN_STORE") or _DEFAULT_URL
+        self.url = url
+        self._store = open_store(url)
+        self.gated_fields = set(gated_fields)
+        self.auto_apply_gated = auto_apply_gated
+    # -- persistence (single provenance-rich record) --
+    def _load(self) -> dict:
+        """Return the stored record, or a fresh empty one if nothing is stored."""
+        return self._store.get(_KEY) or {
+            "version": 1, "fields": {}, "pending": [], "conflicts": []}
+    def _save(self, blob: dict) -> None:
+        """Write the whole record back under the single store key."""
+        self._store.put(_KEY, blob)
+    @staticmethod
+    def _field_of(blob: dict, phrase: str) -> Optional[str]:
+        """Field an already-applied phrase belongs to, or None. Only the
+        applied entries are searched; the pending queue is not consulted."""
+        for fld, entries in blob["fields"].items():
+            if any(e["phrase"] == phrase for e in entries):
+                return fld
+        return None
+    # -- read views --
+    def synonyms(self) -> dict:
+        """Applied learned synonyms as {field: [phrases]} (for matching)."""
+        blob = self._load()
+        return {f: [e["phrase"] for e in es] for f, es in blob["fields"].items() if es}
+    def pending(self) -> list:
+        """Gated proposals still waiting for approve() / reject()."""
+        return self._load()["pending"]
+    def conflicts(self) -> list:
+        """Log of proposals that contradicted an already-applied phrase."""
+        return self._load()["conflicts"]
+    def stats(self) -> dict:
+        """Counts of applied entries, pending entries and logged conflicts."""
+        b = self._load()
+        return {"applied": sum(len(v) for v in b["fields"].values()),
+                "pending": len(b["pending"]), "conflicts": len(b["conflicts"])}
+    # -- write path --
+    def add(self, header: str, field: str, *, source: str = "ai",
+            bank: Optional[str] = None) -> str:
+        """Record a header->field mapping. Returns one of:
+        'learned' | 'pending' | 'exists' | 'conflict' | 'skip'.
+        'skip'     - empty header (after normalisation) or empty field.
+        'exists'   - the phrase is already applied to this field.
+        'conflict' - the phrase is applied to another field; logged, not applied.
+        'pending'  - gated field proposed by "ai"/"harvest"; queued for review.
+        'learned'  - applied immediately.
+        """
+        phrase = _norm(header)
+        if not phrase or not field:
+            return "skip"
+        blob = self._load()
+        existing = self._field_of(blob, phrase)
+        if existing == field:
+            return "exists"
+        if existing is not None:                 # phrase already means something else
+            blob["conflicts"].append({
+                "phrase": phrase, "existing": existing, "proposed": field,
+                "source": source, "ts": _now()})
+            self._save(blob)
+            return "conflict"
+        entry = {"phrase": phrase, "field": field, "source": source,
+                 "bank": bank, "ts": _now()}
+        gated = (field in self.gated_fields and source in ("ai", "harvest")
+                 and not self.auto_apply_gated)
+        if gated:
+            # One queue entry per phrase: a repeat proposal is not queued again.
+            if not any(p["phrase"] == phrase for p in blob["pending"]):
+                blob["pending"].append(entry)
+            self._save(blob)
+            return "pending"
+        blob["fields"].setdefault(field, []).append(entry)
+        self._save(blob)
+        return "learned"
+    # -- human review of gated entries --
+    def approve(self, phrase: str, field: Optional[str] = None) -> bool:
+        """Apply the pending entry for `phrase` (optionally only for `field`).
+        The applied set is re-checked first, because the phrase may have been
+        learned through add() while it sat in the queue:
+          * not applied anywhere     -> applied with source="human"; queue cleared.
+          * applied to the same field -> queue entry cleared, nothing duplicated.
+          * applied to another field  -> logged in `conflicts`, NOT applied, and
+            left pending so a reviewer can reject() it explicitly.
+        Returns True when the phrase ends up applied to the approved field.
+        """
+        phrase = _norm(phrase)
+        blob = self._load()
+        keep, moved = [], False
+        for p in blob["pending"]:
+            if not (p["phrase"] == phrase and (field is None or p["field"] == field)):
+                keep.append(p)
+                continue
+            applied = self._field_of(blob, phrase)
+            if applied is None:
+                blob["fields"].setdefault(p["field"], []).append(
+                    dict(p, source="human", ts=_now()))
+                moved = True
+            elif applied == p["field"]:
+                moved = True                     # already applied; just drop from queue
+            else:
+                # Same record shape add() uses for its own conflicts.
+                blob["conflicts"].append({
+                    "phrase": phrase, "existing": applied, "proposed": p["field"],
+                    "source": "human", "ts": _now()})
+                keep.append(p)
+        blob["pending"] = keep
+        self._save(blob)
+        return moved
+    def reject(self, phrase: str, field: Optional[str] = None) -> bool:
+        """Drop the pending entry for `phrase` (optionally only for `field`).
+        Returns True if anything was removed."""
+        phrase = _norm(phrase)
+        blob = self._load()
+        before = len(blob["pending"])
+        blob["pending"] = [p for p in blob["pending"]
+                           if not (p["phrase"] == phrase and
+                                   (field is None or p["field"] == field))]
+        self._save(blob)
+        return len(blob["pending"]) < before
+    def close(self) -> None:
+        """Close the backing store."""
+        self._store.close()
+def learn_from_result(result, store: LearnStore, *, min_confidence: int = 85,
+                      methods=("ai",), source: str = "ai",
+                      bank: Optional[str] = None) -> dict:
+    """Teach `store` each confident, non-exact column mapping in `result`.
+    A column map is offered to store.add() only when it has a field, its method
+    is not "exact" and is listed in `methods`, and its confidence is not below
+    `min_confidence`. Returns {outcome: [(raw_header, field), ...]} keyed by the
+    outcome string store.add() returned."""
+    outcome_names = ("learned", "pending", "exists", "conflict", "skip")
+    summary: dict[str, list] = {name: [] for name in outcome_names}
+    for col in result.column_maps:
+        teachable = (bool(col.field) and col.method != "exact"
+                     and col.method in methods
+                     and not (col.confidence < min_confidence))
+        if teachable:
+            verdict = store.add(col.raw_header, col.field, source=source, bank=bank)
+            summary[verdict].append((col.raw_header, col.field))
+    return summary
+def harvest_folder(folder: str, store: LearnStore, *,
+                   table_matcher=None, min_confidence: int = 85,
+                   methods=("ai", "fuzzy"), recursive: bool = False) -> dict:
+    """Seed the learned vocabulary from a folder of existing statements.
+    Every .xlsx in `folder` (and its subfolders when `recursive`) is run through
+    process_file(); each resulting mapping is passed to learn_from_result() with
+    source="harvest" and the file's base name (extension removed) as `bank`.
+    A file that raises is recorded under "errors" and the batch continues.
+    `table_matcher` is forwarded to process_file() unchanged.
+    Returns a report: "files" (count processed), one list per outcome holding
+    (filename, header, field) tuples, "errors" as (filename, message) tuples,
+    and "stats" from store.stats().
+    """
+    import glob
+    from .engine import process_file
+    if recursive:
+        pattern = os.path.join(folder, "**", "*.xlsx")
+    else:
+        pattern = os.path.join(folder, "*.xlsx")
+    report: dict = {"files": 0, "learned": [], "pending": [],
+                    "exists": [], "conflict": [], "skip": [], "errors": []}
+    for workbook in sorted(glob.glob(pattern, recursive=recursive)):
+        filename = os.path.basename(workbook)
+        bank = os.path.splitext(filename)[0]
+        try:
+            res = process_file(workbook, table_matcher=table_matcher)
+        except Exception as exc:  # noqa: BLE001 — a bad file shouldn't abort the batch
+            report["errors"].append((filename, str(exc)))
+            continue
+        report["files"] += 1
+        per_file = learn_from_result(res, store, min_confidence=min_confidence,
+                                     methods=methods, source="harvest", bank=bank)
+        for outcome, pairs in per_file.items():
+            bucket = report.setdefault(outcome, [])
+            for h, f in pairs:
+                bucket.append((filename, h, f))
+    report["stats"] = store.stats()
+    return report

tabularmapper/llm_fallback.py ADDED Viewed

@@ -0,0 +1,118 @@
+"""
+llm_fallback.py — pluggable fallback adapters for map_columns().
+Interface (matches engine.map_columns' `llm_fallback` param):
+    fallback(header: str, samples: list[str], allowed_fields: list[str]) -> str | None
+Contract: the callable receives ONLY a header string, up to 3 sample cell
+strings, and the list of allowed field keys. It never sees full transaction
+rows. It returns one of `allowed_fields` or None. This keeps bank data local
+and auditable.
+Adapters here are the PER-COLUMN, offline degraded path:
+  * HashingEmbeddingFallback — zero-dependency char-ngram cosine. Lexical only
+                            (weak), but runs fully air-gapped with no model file
+                            and no API. A last resort when the AI matcher is off
+                            or unreachable.
+  * make_llm_fallback     — wrap any 'text -> text' model into the per-column
+                            interface (OFF by default).
+For the primary, high-accuracy path — a real LLM that reads the whole header row
+plus structural profiles and returns a full mapping — see `ai_matcher.py`
+(OpenAICompatibleMatcher). That is the recommended way to auto-map new banks.
+None of these fire unless the deterministic exact+fuzzy matcher fails a header.
+"""
+from __future__ import annotations
+import math
+import re
+from collections import Counter
+from typing import Callable, Optional
+# Human-readable descriptions per field. HashingEmbeddingFallback uses these as
+# its default `field_descriptions`: each description is vectorised once and the
+# header (plus samples) is scored against it by cosine similarity, so richer
+# phrasing -> better separation between fields.
+FIELD_DESCRIPTIONS: dict[str, str] = {
+    "date": "date of the transaction when it was posted or valued",
+    "description": "narration particulars description details remarks of the transaction",
+    "reference": "reference number cheque number transaction id utr instrument number code",
+    "debit": "debit withdrawal money going out paid out outgoing spent amount reducing balance",
+    "credit": "credit deposit money coming in paid in incoming received income amount increasing balance",
+    "balance": "account balance remaining after the transaction closing running available balance",
+    "amount": "single signed transaction amount positive or negative value",
+}
+# --------------------------------------------------------------------------
+# Zero-dependency offline fallback (no model download)
+# --------------------------------------------------------------------------
+class HashingEmbeddingFallback:
+    """Char-ngram cosine similarity. No torch, no download, fully offline.
+    Lexical only, so weaker than the AI matcher — but it needs no API, no model
+    file, and runs fully air-gapped / in CI. Same interface & contract.
+    Calling the instance scores the header (plus samples) against each allowed
+    field's description and returns the best field, or None when the best score
+    is below `min_similarity`.
+    """
+    def __init__(self, min_similarity: float = 0.18, ngram: int = 3,
+                 field_descriptions: Optional[dict] = None):
+        """min_similarity: lowest cosine score accepted as a match.
+        ngram: length of the character n-grams.
+        field_descriptions: {field: description}; defaults to FIELD_DESCRIPTIONS.
+        """
+        self.min_similarity = min_similarity
+        self.ngram = ngram
+        self.field_descriptions = field_descriptions or FIELD_DESCRIPTIONS
+        # Description vectors are fixed per instance, so build them once.
+        self._field_vecs = {
+            f: self._vec(d) for f, d in self.field_descriptions.items()
+        }
+    def _vec(self, text: str) -> Counter:
+        """Bag of whole-word tokens plus space-padded character n-grams."""
+        t = re.sub(r"[^a-z0-9]", " ", text.lower())
+        grams: Counter = Counter()
+        for tok in t.split():
+            grams[tok] += 1  # word tokens
+            padded = f" {tok} "  # padding lets n-grams capture word boundaries
+            for i in range(len(padded) - self.ngram + 1):
+                grams[padded[i:i + self.ngram]] += 1
+        return grams
+    @staticmethod
+    def _cos(a: Counter, b: Counter) -> float:
+        """Cosine similarity of two count vectors; 0.0 if either is empty."""
+        common = set(a) & set(b)
+        num = sum(a[k] * b[k] for k in common)
+        na = math.sqrt(sum(v * v for v in a.values()))
+        nb = math.sqrt(sum(v * v for v in b.values()))
+        return num / (na * nb) if na and nb else 0.0
+    def __call__(self, header: str, samples: list[str],
+                 allowed_fields: list[str]) -> Optional[str]:
+        """Return the best-scoring field from `allowed_fields`, or None.
+        Ties go to the field listed first. A field with no stored description
+        is scored against its own name.
+        """
+        # Tolerate raw cell values: a None/numeric header or sample is coerced
+        # (None is skipped) instead of raising TypeError during concatenation.
+        parts = ["" if header is None else str(header)]
+        parts.extend(str(s) for s in (samples or ()) if s is not None)
+        qv = self._vec(" ".join(parts))
+        best_field, best_sim = None, -1.0
+        for fld in allowed_fields:
+            fv = self._field_vecs.get(fld) or self._vec(fld)
+            sim = self._cos(qv, fv)
+            if sim > best_sim:
+                best_field, best_sim = fld, sim
+        return best_field if best_sim >= self.min_similarity else None
+# --------------------------------------------------------------------------
+# Optional hosted small-model adapter (OFF by default)
+# --------------------------------------------------------------------------
+def make_llm_fallback(client_call: Callable[[str], str]) -> Callable:
+    """Wrap any 'text -> text' small-model call into the fallback interface.
+    `client_call(prompt)` should return a single field name. This never sends
+    transaction rows — only the header + samples + allowed fields.
+    The returned callable has the signature
+    (header, samples, allowed_fields) -> field | None and always returns either
+    an element of `allowed_fields` or None. An empty/None reply, 'none', or any
+    reply that does not name an allowed field yields None.
+    """
+    def _fallback(header: str, samples: list[str],
+                  allowed_fields: list[str]) -> Optional[str]:
+        prompt = (
+            "You map one spreadsheet column header to exactly one field.\n"
+            f"Allowed fields: {', '.join(allowed_fields)}\n"
+            f"Header: {header!r}\n"
+            f"Sample values: {samples}\n"
+            "Reply with ONLY the single best field name, or 'none'."
+        )
+        reply = (client_call(prompt) or "").strip().lower()
+        # First the long-standing rule: keep letters only and match verbatim, so
+        # every reply that used to be accepted still is.
+        letters = re.sub(r"[^a-z]", "", reply)
+        if letters in allowed_fields:
+            return letters
+        # Field keys are caller-supplied and may contain '_', digits or
+        # upper-case (e.g. "value_date"), which the letters-only rule can never
+        # produce. Compare on a normalised key and return the caller's own
+        # spelling of the field.
+        token = re.sub(r"[^a-z0-9_]", "", reply)
+        if not token:
+            return None
+        by_key: dict = {}
+        for fld in allowed_fields:
+            # setdefault: on a key collision the first listed field wins
+            by_key.setdefault(re.sub(r"[^a-z0-9_]", "", str(fld).lower()), fld)
+        return by_key.get(token)
+    return _fallback

tabularmapper/mapping_cache.py ADDED Viewed

@@ -0,0 +1,73 @@
+"""
+mapping_cache.py — persistent {header_fingerprint: field_mapping} cache.
+A repeat bank format skips detection/mapping entirely -> true 100% on seen
+formats. The fingerprint is a hash of the normalized header cell strings, so
+the same header layout always resolves to the same cached mapping regardless
+of row content.
+Storage is pluggable via a URL (see stores.open_store):
+    MappingCache()                              # env TABULARMAPPER_CACHE, else in-memory default
+    MappingCache("sqlite:///mapping_cache.db")  # file, no server, concurrency-safe
+    MappingCache("redis://localhost:6379/0")    # multi-worker
+    MappingCache("memory://")                   # tests
+    MappingCache(path="legacy.json")            # legacy JSON file (back-compat)
+"""
+from __future__ import annotations
+import hashlib
+import os
+import re
+from typing import Optional
+from .engine import ColumnMap
+from .stores import open_store
+# In-memory by default — creates NO files. Persistence is opt-in: set
+# TABULARMAPPER_CACHE (or pass a URL) to a path / redis:// / valkey:// /
+# postgresql://. In-memory still caches within a process (lost on restart).
+# MappingCache falls back to this only when no source, path, or env value is given.
+_DEFAULT_URL = "memory://"
+def _fingerprint(header: list, namespace: str = "") -> str:
+    """SHA-1 hex digest identifying a header row within a namespace.
+    Each cell is stringified, trimmed, lower-cased and whitespace-collapsed
+    (None becomes ""); the cells are joined with "|" and prefixed with
+    `namespace` and a NUL separator before hashing.
+    """
+    cells = [
+        "" if cell is None else re.sub(r"\s+", " ", str(cell).strip().lower())
+        for cell in header
+    ]
+    # `namespace` scopes the key to the active schema, so a config change (e.g.
+    # adding a field) does NOT return a stale mapping for the same header.
+    payload = "\x00".join((namespace, "|".join(cells)))
+    return hashlib.sha1(payload.encode("utf-8")).hexdigest()
+class MappingCache:
+    """Cache of column mappings keyed by header fingerprint, on a pluggable store."""
+    def __init__(self, source: Optional[str] = None, *, path: Optional[str] = None):
+        """Open the backing store.
+        Precedence: explicit `source` > legacy `path` kwarg > env
+        TABULARMAPPER_CACHE > the in-memory module default.
+        """
+        url = source or path
+        if not url:
+            url = os.getenv("TABULARMAPPER_CACHE") or _DEFAULT_URL
+        self.url = url
+        self._store = open_store(url)
+    def get(self, header: list, namespace: str = "") -> Optional[list[ColumnMap]]:
+        """Return the cached ColumnMaps for this header, or None on a miss.
+        Restored maps carry the method "cache" in place of the method that was
+        stored with them."""
+        key = _fingerprint(header, namespace)
+        entry = self._store.get(key)
+        if not entry:
+            return None
+        restored = []
+        for col in entry["columns"]:
+            restored.append(ColumnMap(col["col_index"], col["raw_header"],
+                                      col["field"], col["confidence"], "cache"))
+        return restored
+    def put(self, header: list, col_maps: list[ColumnMap],
+            namespace: str = "") -> None:
+        """Store `col_maps` under this header's fingerprint, together with a
+        stringified preview of the raw header cells."""
+        key = _fingerprint(header, namespace)
+        preview = [("" if c is None else str(c)) for c in header]
+        columns = [
+            {"col_index": m.col_index, "raw_header": m.raw_header,
+             "field": m.field, "confidence": m.confidence, "method": m.method}
+            for m in col_maps
+        ]
+        self._store.put(key, {"header_preview": preview, "columns": columns})
+    def close(self) -> None:
+        """Close the backing store."""
+        self._store.close()