PyPI - tabularmapper - Versions diffs - 1.0.0__py3-none-any.whl - Mend

tabularmapper 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (16) hide show

tabularmapper/__init__.py +75 -0
tabularmapper/ai_matcher.py +247 -0
tabularmapper/api.py +186 -0
tabularmapper/cli.py +233 -0
tabularmapper/engine.py +938 -0
tabularmapper/learn.py +203 -0
tabularmapper/llm_fallback.py +118 -0
tabularmapper/mapping_cache.py +73 -0
tabularmapper/schema.py +341 -0
tabularmapper/stores.py +238 -0
tabularmapper-1.0.0.dist-info/METADATA +455 -0
tabularmapper-1.0.0.dist-info/RECORD +16 -0
tabularmapper-1.0.0.dist-info/WHEEL +5 -0
tabularmapper-1.0.0.dist-info/entry_points.txt +2 -0
tabularmapper-1.0.0.dist-info/licenses/LICENSE +21 -0
tabularmapper-1.0.0.dist-info/top_level.txt +1 -0

tabularmapper/__init__.py ADDED Viewed

@@ -0,0 +1,75 @@
+"""
+tabularmapper — map any spreadsheet (.xlsx) to a schema you define.
+Two-stage, auditable pipeline: deterministic header detection + synonym/fuzzy
+column mapping, with an optional AI table matcher and a self-learning vocabulary.
+The engine is domain-agnostic; "bank statements" is just a built-in preset.
+Quick start:
+    from tabularmapper import process_file, configure, config_from_dict
+    configure(config_from_dict({"output_schema": [...], "synonyms": {...}}))
+    res = process_file("file.xlsx")
+    print(res.records)          # list[dict], ready for JSON / DB
+    # or the ready-made bank layout:
+    from tabularmapper import bank_preset, configure
+    configure(config=bank_preset())
+Heavier pieces are kept as submodules so importing this package stays light:
+    from tabularmapper.ai_matcher import OpenAICompatibleMatcher
+    from tabularmapper.api import router   # needs [api] extra
+"""
+from .engine import (
+    ALLOWED_FIELDS,
+    OUTPUT_SCHEMA,
+    ColumnMap,
+    OutputResult,
+    ProcessResult,
+    apply_learned,
+    configure,
+    detect_header_row,
+    map_columns,
+    normalize_amount,
+    normalize_date,
+    process_file,
+    process_stream,
+    records_to_csv_bytes,
+)
+from .learn import LearnStore, harvest_folder, learn_from_result
+from .mapping_cache import MappingCache
+from .schema import (
+    Config, bank_preset, config_from_dict, default_config, load_config,
+)
+from .stores import open_store
+# Version string exposed as `tabularmapper.__version__`.
+__version__ = "1.0.0"
+# Public API: every name below is re-exported from the submodule imports above.
+# Order is unchanged; entries are only grouped by theme for readability.
+__all__ = [
+    # file / stream processing entry points
+    "process_file", "process_stream", "records_to_csv_bytes",
+    # runtime configuration and learned-vocabulary activation
+    "configure", "apply_learned",
+    # mapping cache and learning helpers
+    "MappingCache", "LearnStore", "learn_from_result", "harvest_folder",
+    # config loading / presets
+    "load_config", "config_from_dict", "default_config", "bank_preset",
+    "Config",
+    # storage backend factory
+    "open_store",
+    # result types and schema constants
+    "ProcessResult", "ColumnMap", "OutputResult",
+    "OUTPUT_SCHEMA", "ALLOWED_FIELDS",
+    # lower-level building blocks
+    "detect_header_row", "map_columns", "normalize_amount", "normalize_date",
+    "__version__",
+]

tabularmapper/ai_matcher.py ADDED Viewed

@@ -0,0 +1,247 @@
+"""
+ai_matcher.py — LLM-based, table-level column matcher for NEW bank layouts.
+This is the high-accuracy path for layouts the synonym table does not recognize:
+when a statement's header is unknown, one LLM call maps the whole header row
+to the output fields and the result is written straight into mapping_cache.json,
+so that bank is "known" from then on (it never hits the LLM again).
+PRIVACY — the model matches the TABLE, never the data
+-----------------------------------------------------
+The prompt contains ONLY:
+  * column header strings (e.g. "Withdrawals", "Value Dt")
+  * a structural profile per column computed locally (dtype, sign, fill-rate,
+    which columns are mutually exclusive) — this is metadata, NOT cell contents
+  * the list of allowed output fields + short descriptions
+It NEVER contains transaction amounts, dates, names, narrations or references.
+No real statement data leaves the machine. (You can opt into sending a couple of
+sanitized sample values with include_samples=True, but it is OFF by default.)
+Provider — OpenAI-compatible
+----------------------------
+Works with any endpoint that speaks the OpenAI /chat/completions API: OpenAI,
+Azure OpenAI, Together, Groq, or a local vLLM / Ollama / LM Studio server. Set
+base_url + api_key + model. Uses only the Python standard library (urllib), so
+there is no SDK dependency to install or pin.
+"""
+from __future__ import annotations
+import datetime as _dt
+import json
+import os
+import re
+import urllib.request
+from typing import Callable, Optional
+# Default field descriptions, copied into OpenAICompatibleMatcher when the caller
+# passes no `field_defs`. Deliberately empty — no hardcoded field definitions:
+# descriptions come from the config (each output field may carry a
+# `description`). When a field has none, the matcher falls back to the field
+# name itself, so this works for ANY domain, not just banking.
+# Pass `field_defs={field: description}` to override.
+FIELD_DEFS: dict[str, str] = {}
+# --------------------------------------------------------------------------
+# Structural profiling — deterministic, no cell contents leave this function
+# --------------------------------------------------------------------------
+def _classify(v) -> str:
+    """Classify one cell as "empty", "date", "number" or "text".
+    None and blank strings are "empty"; date/datetime objects are "date"; bools
+    are "text" (tested before int/float because bool is an int subclass);
+    int/float are "number". Any other value is stringified and matched against
+    a numeric pattern (optional leading "-" or "(", digits/commas, optional
+    decimals, optional ")" and dr/cr suffix) and two simple date patterns.
+    """
+    if v is None or (isinstance(v, str) and v.strip() == ""):
+        return "empty"
+    if isinstance(v, (_dt.datetime, _dt.date)):
+        return "date"
+    if isinstance(v, bool):
+        return "text"
+    if isinstance(v, (int, float)):
+        return "number"
+    s = str(v).strip()
+    if re.match(r"^[-(]?[\d,]+\.?\d*\)?\s*(dr|cr)?$", s, re.I):
+        return "number"
+    if re.search(r"\d{1,4}[-/.]\d{1,2}[-/.]\d{1,4}", s) or \
+       re.match(r"\d{1,2}\s*[A-Za-z]{3,9}\s*\d{2,4}", s):
+        return "date"
+    return "text"
+def _is_negative(v) -> bool:
+    """Return True when a cell looks like a negative amount.
+    Non-bool numbers are negative when < 0. Strings are negative when they
+    start with "-", contain both "(" and ")", or end with "dr"
+    (case-insensitive). This is a loose textual test, so callers should only
+    apply it to cells already known to be numeric.
+    """
+    if isinstance(v, (int, float)) and not isinstance(v, bool):
+        return v < 0
+    if isinstance(v, str):
+        s = v.strip().lower()
+        return s.startswith("-") or ("(" in s and ")" in s) or s.endswith("dr")
+    return False
+def profile_columns(header_row: list, data_rows: list[list],
+                    max_rows: int = 40) -> list[dict]:
+    """Return a per-column STRUCTURAL profile — no raw cell values.
+    Only the first `max_rows` rows of `data_rows` are inspected; rows shorter
+    than the header are treated as empty in the missing columns.
+    Returns one dict per header column with the keys:
+      index                    column position in `header_row`
+      name                     stripped header text ("" for a None header)
+      dtype                    most frequent non-empty cell class, "empty" if
+                               the column has no values; ties go to the class
+                               seen first, so the result is reproducible
+      fill_rate                share of inspected rows with a value (2 decimals)
+      has_negative             True if any NUMERIC cell looks negative
+      mutually_exclusive_with  indices of columns that have values but are
+                               never filled in the same row as this one
+                               (-> debit/credit pairs)
+    This is exactly the signal a human uses to tell debit from credit
+    without reading the numbers.
+    """
+    ncols = len(header_row)
+    rows = data_rows[:max_rows]
+    # filled_rows[c] holds the indices of the inspected rows where column c has a value
+    filled_rows: list[set[int]] = [set() for _ in range(ncols)]
+    dtypes: list[list[str]] = [[] for _ in range(ncols)]
+    neg = [False] * ncols
+    for r_i, row in enumerate(rows):
+        for c in range(ncols):
+            v = row[c] if c < len(row) else None
+            t = _classify(v)
+            if t == "empty":
+                continue
+            filled_rows[c].add(r_i)
+            dtypes[c].append(t)
+            # Sign is only meaningful for numeric cells. The textual test in
+            # _is_negative would otherwise flag free text such as
+            # "Refund (partial)" or "UPI-DR" and mark a description column as
+            # containing negative values.
+            if t == "number" and _is_negative(v):
+                neg[c] = True
+    profiles = []
+    for c in range(ncols):
+        types = dtypes[c]
+        # dict.fromkeys keeps first-seen order and max() returns the first
+        # maximal item, so a tie never depends on set/hash ordering.
+        majority = max(dict.fromkeys(types), key=types.count) if types else "empty"
+        mine = filled_rows[c]
+        fill_rate = (len(mine) / len(rows)) if rows else 0.0
+        # mutual exclusivity: both columns have values, but never in the same row
+        excl = [
+            d for d in range(ncols)
+            if d != c and mine and filled_rows[d] and mine.isdisjoint(filled_rows[d])
+        ]
+        profiles.append({
+            "index": c,
+            "name": ("" if header_row[c] is None else str(header_row[c]).strip()),
+            "dtype": majority,
+            "fill_rate": round(fill_rate, 2),
+            "has_negative": neg[c],
+            "mutually_exclusive_with": excl,
+        })
+    return profiles
+# --------------------------------------------------------------------------
+# OpenAI-compatible table matcher
+# --------------------------------------------------------------------------
+class OpenAICompatibleMatcher:
+    """Map an unknown header row to output fields with one LLM call.
+    Transport is any OpenAI-compatible /chat/completions endpoint. Inject a
+    custom `transport` (messages -> assistant_text) to unit-test without network.
+    Instances are callable: matcher(header_row, data_rows, allowed_fields)
+    returns {col_index: field}, or {} when the call or the reply is unusable.
+    """
+    def __init__(self,
+                 base_url: Optional[str] = None,
+                 api_key: Optional[str] = None,
+                 model: Optional[str] = None,
+                 field_defs: Optional[dict] = None,
+                 include_samples: bool = False,
+                 timeout: float = 30.0,
+                 temperature: float = 0.0,
+                 transport: Optional[Callable[[list], str]] = None):
+        """Store connection settings; unset values fall back to the environment.
+        base_url: falls back to OPENAI_BASE_URL, then "https://api.openai.com/v1";
+            trailing slashes are removed.
+        api_key: falls back to OPENAI_API_KEY ("" if unset).
+        model: falls back to OPENAI_MODEL, then "gpt-4o-mini".
+        field_defs: {field: description} used in the prompt; None means a copy
+            of the module-level FIELD_DEFS.
+        include_samples: stored on the instance; no method of this class reads it.
+        timeout: seconds passed to urlopen for the HTTP call.
+        temperature: sampling temperature sent in the request payload.
+        transport: optional callable replacing the built-in HTTP call.
+        """
+        self.base_url = (base_url or os.getenv("OPENAI_BASE_URL")
+                         or "https://api.openai.com/v1").rstrip("/")
+        self.api_key = api_key or os.getenv("OPENAI_API_KEY", "")
+        self.model = model or os.getenv("OPENAI_MODEL", "gpt-4o-mini")
+        self.field_defs = field_defs if field_defs is not None else dict(FIELD_DEFS)
+        self.include_samples = include_samples
+        self.timeout = timeout
+        self.temperature = temperature
+        self._transport = transport  # for tests / custom clients
+    # -- prompt construction (structure only) --
+    def _build_messages(self, profiles: list[dict], allowed_fields: list[str]) -> list:
+        """Build the system + user chat messages from column profiles.
+        Each allowed field is listed with its description from `field_defs`
+        (the field name itself when none is configured); each column is listed
+        with its index, header name, dtype, fill rate and, when present, its
+        negative-value and mutual-exclusivity hints.
+        """
+        field_lines = "\n".join(
+            f"  - {f}: {self.field_defs.get(f, f)}"
+            for f in allowed_fields
+        )
+        col_lines = []
+        for p in profiles:
+            excl = (f", mutually-exclusive with columns {p['mutually_exclusive_with']}"
+                    if p["mutually_exclusive_with"] else "")
+            neg = ", contains negative values" if p["has_negative"] else ""
+            col_lines.append(
+                f"  [{p['index']}] name={p['name']!r} "
+                f"type={p['dtype']} fill={p['fill_rate']}{neg}{excl}"
+            )
+        cols = "\n".join(col_lines)
+        system = (
+            "You map bank-statement spreadsheet COLUMNS to a fixed schema. "
+            "You are given only column headers and structural metadata (data "
+            "types, fill rates, sign, and which columns are mutually exclusive) "
+            "— never the actual transaction values. Use the header wording plus "
+            "these structural hints. Two money columns that are mutually "
+            "exclusive are almost always a debit/credit pair; decide direction "
+            "from the header wording. A single signed money column (has negative "
+            "values, not mutually exclusive with another money column) is "
+            "'amount'. Respond with ONLY a JSON object mapping the column index "
+            "(as a string) to one field name, or null if a column matches no "
+            "field. Do not invent fields."
+        )
+        user = (
+            f"Allowed fields:\n{field_lines}\n\n"
+            f"Columns:\n{cols}\n\n"
+            "Return JSON like {\"0\": \"date\", \"1\": \"description\", "
+            "\"4\": null}. Every column index must appear exactly once."
+        )
+        return [{"role": "system", "content": system},
+                {"role": "user", "content": user}]
+    # -- HTTP transport (stdlib) --
+    def _http(self, messages: list) -> str:
+        """POST the messages to {base_url}/chat/completions and return the
+        first choice's message content. Network, HTTP and response-shape
+        errors propagate to the caller."""
+        payload = {
+            "model": self.model,
+            "messages": messages,
+            "temperature": self.temperature,
+            "response_format": {"type": "json_object"},
+        }
+        req = urllib.request.Request(
+            f"{self.base_url}/chat/completions",
+            data=json.dumps(payload).encode("utf-8"),
+            headers={
+                "Content-Type": "application/json",
+                "Authorization": f"Bearer {self.api_key}",
+            },
+            method="POST",
+        )
+        with urllib.request.urlopen(req, timeout=self.timeout) as resp:
+            body = json.loads(resp.read().decode("utf-8"))
+        return body["choices"][0]["message"]["content"]
+    # -- parse + validate --
+    @staticmethod
+    def _parse(text: str, ncols: int, allowed_fields: list[str]) -> dict:
+        """Turn the model's reply into a validated {col_index: field} dict.
+        The first {...} span in `text` is parsed as JSON (the whole text when
+        there is none). Entries are dropped when the key is not an integer in
+        [0, ncols), when the value is not one of `allowed_fields`, or when the
+        column or the field has already been assigned. A reply that is valid
+        JSON but not an object yields {}.
+        Raises json.JSONDecodeError / TypeError for unparseable input.
+        """
+        m = re.search(r"\{.*\}", text, re.S)
+        raw = json.loads(m.group(0) if m else text)
+        if not isinstance(raw, dict):
+            # e.g. "[]" or "null": nothing to map, and .items() would raise an
+            # AttributeError that the caller does not catch
+            return {}
+        # single-slot fields: keep only the first (highest-priority) assignment
+        result: dict[int, str] = {}
+        seen: set[str] = set()
+        for k, v in raw.items():
+            try:
+                ci = int(k)
+            except (ValueError, TypeError):
+                continue
+            if not (0 <= ci < ncols):
+                continue
+            if ci in result:
+                # a second spelling of the same index ("1" vs "01") must not
+                # overwrite the first assignment
+                continue
+            if isinstance(v, str) and v in allowed_fields and v not in seen:
+                result[ci] = v
+                seen.add(v)
+        return result
+    def __call__(self, header_row: list, data_rows: list[list],
+                 allowed_fields: list[str]) -> dict:
+        """Return {col_index: field} for the header. Empty dict on any failure
+        (caller then leaves those columns unmapped -> needs_review)."""
+        profiles = profile_columns(header_row, data_rows)
+        messages = self._build_messages(profiles, allowed_fields)
+        try:
+            text = self._transport(messages) if self._transport else self._http(messages)
+        except Exception:  # noqa: BLE001 — network/parse errors must not crash the pipeline
+            return {}
+        try:
+            return self._parse(text, len(header_row), allowed_fields)
+        except (json.JSONDecodeError, ValueError, TypeError):
+            return {}

tabularmapper/api.py ADDED Viewed

@@ -0,0 +1,186 @@
+"""
+api.py — drop-in FastAPI router for tabularmapper.
+Two ways to use it from your existing backend:
+  A) Mount the router on your app (prefix defaults to /mapper):
+        from fastapi import FastAPI
+        from tabularmapper.api import router, lifespan
+        app = FastAPI(lifespan=lifespan)      # builds cache + matcher once
+        app.include_router(router)
+        # -> POST /mapper/map, GET /mapper/health,
+        #    GET /mapper/learn/pending, POST /mapper/learn/approve|reject
+     Custom prefix: `make_router("/catalog")`, or set TABULARMAPPER_ROUTE_PREFIX.
+  B) Run it standalone:
+        uvicorn tabularmapper.api:app --reload
+Design notes:
+  * The MappingCache and the (optional) AI matcher are built ONCE in `lifespan`
+    and reused across requests — not per call.
+  * `process_file` is synchronous (openpyxl + a possible blocking LLM HTTP call),
+    so it runs in a threadpool to avoid blocking the event loop.
+  * If OPENAI_API_KEY is unset, the AI matcher is simply off: known banks still
+    map deterministically; unknown ones come back with needs_review=True.
+"""
+from __future__ import annotations
+import os
+from contextlib import asynccontextmanager
+from typing import Any, Optional
+from fastapi import APIRouter, FastAPI, File, HTTPException, UploadFile
+from fastapi.concurrency import run_in_threadpool
+from pydantic import BaseModel
+from . import engine                    # imported as a module so OUTPUT_SCHEMA is read
+from .engine import process_stream  # dynamically (after configure), never a stale copy
+from .mapping_cache import MappingCache
+# --------------------------------------------------------------------------
+# Shared singletons (built once at startup)
+# --------------------------------------------------------------------------
+def build_matcher():
+    """Create the AI table matcher, or return None for deterministic-only mode.
+    A matcher is built only when the OPENAI_API_KEY environment variable holds
+    a non-empty value. Its field descriptions are taken from the config that is
+    active in `engine` at the time of this call (nothing is hardcoded here).
+    """
+    ai_configured = bool(os.getenv("OPENAI_API_KEY"))
+    if ai_configured:
+        # imported lazily so the matcher module is only loaded when it is used
+        from .ai_matcher import OpenAICompatibleMatcher
+        descriptions = engine._ACTIVE_CONFIG.field_descriptions
+        return OpenAICompatibleMatcher(field_defs=descriptions)
+    return None
+def build_learn_store():
+    """Create the self-learning vocabulary store with its default settings
+    (store URL via TABULARMAPPER_LEARN_STORE)."""
+    # lazy import, mirroring build_matcher
+    from .learn import LearnStore
+    store = LearnStore()
+    return store
+class _State:
+    """Holder for the singletons shared by every request; filled in `lifespan`."""
+    # mapping cache; None until `lifespan` assigns a MappingCache
+    cache: Optional[MappingCache] = None
+    # result of build_matcher(): a matcher object, or None when AI is off
+    matcher: Any = None
+    # result of build_learn_store(); None until `lifespan` has run
+    learn: Any = None
+# single module-level instance read by the handlers and written by `lifespan`
+state = _State()
+@asynccontextmanager
+async def lifespan(app: FastAPI):
+    """Application lifespan hook: build the shared cache, matcher and learn
+    store once at startup and publish them on the module-level `state`."""
+    # Load the output template + synonyms from TABULARMAPPER_CONFIG (file / URL /
+    # s3:// / dict) only when the env var is set. Otherwise whatever config is
+    # already active is kept, so a manual `configure("config.json")` before
+    # startup is NOT overwritten.
+    if (config_source := os.getenv("TABULARMAPPER_CONFIG")):
+        engine.configure(config_source)
+    state.cache = MappingCache()   # reads TABULARMAPPER_CACHE (URL) or the sqlite default
+    state.matcher = build_matcher()
+    learn_store = build_learn_store()
+    state.learn = learn_store
+    # activate already-learned synonyms before the first request is served
+    engine.apply_learned(learn_store)
+    yield
+    # nothing to tear down
+# --------------------------------------------------------------------------
+# Response schema
+# --------------------------------------------------------------------------
+class ColumnMapOut(BaseModel):
+    """One source column's mapping as returned by the /map endpoint.
+    Every attribute is copied from the same-named attribute of an entry in the
+    engine result's `column_maps`.
+    """
+    col_index: int
+    raw_header: str
+    # NOTE(review): None presumably means the column was left unmapped — confirm
+    field: Optional[str]
+    confidence: int
+    method: str
+class MapResponse(BaseModel):
+    """Response body of the /map endpoint."""
+    # header_index, needs_review and review_reasons are passed through from
+    # the engine's processing result
+    header_index: int
+    needs_review: bool
+    review_reasons: list[str]
+    # display names taken from the second item of each engine.OUTPUT_SCHEMA pair
+    schema_columns: list[str]
+    # one entry per item in the result's column_maps
+    columns: list[ColumnMapOut]
+    # the result's `records`, returned as-is
+    transactions: list[dict]
+# --------------------------------------------------------------------------
+# Endpoint handlers (plain functions so the router prefix can be configured)
+# --------------------------------------------------------------------------
+async def health() -> dict:
+    """Liveness probe that also reports whether an AI matcher is configured."""
+    ai_enabled = state.matcher is not None
+    return {"status": "ok", "ai_enabled": ai_enabled}
+async def map_statement(file: UploadFile = File(...)) -> MapResponse:
+    """Upload a spreadsheet (.xlsx/.xls); get the standardized mapping + rows.
+    The file name must end in .xlsx or .xls (case-insensitive), otherwise the
+    request is rejected with HTTP 400. Any exception raised while processing
+    is reported as HTTP 422 and chained to the original error so the cause
+    stays visible in server-side tracebacks.
+    """
+    name = (file.filename or "").lower()
+    if not name.endswith((".xlsx", ".xls")):
+        raise HTTPException(status_code=400, detail="expected an .xlsx/.xls file")
+    # NOTE(review): legacy .xls passes the check above; the module notes mention
+    # openpyxl, so confirm process_stream can actually parse it — TODO confirm
+    data = await file.read()          # raw bytes, parsed in memory (never hits disk)
+    try:
+        # blocking work -> threadpool; process_stream reads straight from bytes
+        res = await run_in_threadpool(
+            process_stream, data,
+            table_matcher=state.matcher, cache=state.cache,
+            learn_store=state.learn,
+            source_label=file.filename or "<upload>",
+        )
+    except Exception as exc:  # noqa: BLE001
+        raise HTTPException(
+            status_code=422, detail=f"could not process file: {exc}") from exc
+    columns = [
+        ColumnMapOut(
+            col_index=m.col_index, raw_header=m.raw_header,
+            field=m.field, confidence=m.confidence, method=m.method,
+        )
+        for m in res.column_maps
+    ]
+    return MapResponse(
+        header_index=res.header_index,
+        needs_review=res.needs_review,
+        review_reasons=res.review_reasons,
+        # read from the engine module at call time so a later configure() is honoured
+        schema_columns=[disp for _, disp in engine.OUTPUT_SCHEMA],
+        columns=columns,
+        transactions=res.records,
+    )
+async def learn_pending() -> dict:
+    """Return the learn store's pending entries together with its stats.
+    Both store calls run in the threadpool, as the store call in the approve
+    and reject handlers does, so a slow store backend cannot stall the event
+    loop.
+    """
+    pending = await run_in_threadpool(state.learn.pending)
+    stats = await run_in_threadpool(state.learn.stats)
+    return {"pending": pending, "stats": stats}
+async def learn_approve(phrase: str, field: Optional[str] = None) -> dict:
+    """Approve a learned phrase (optionally for one field) and report the outcome.
+    When the store reports success the learned vocabulary is re-applied to the
+    engine straight away. Store calls run in the threadpool so they do not
+    block the event loop.
+    """
+    ok = await run_in_threadpool(state.learn.approve, phrase, field)
+    if ok:
+        engine.apply_learned(state.learn)   # activate immediately
+    stats = await run_in_threadpool(state.learn.stats)
+    return {"approved": ok, "stats": stats}
+async def learn_reject(phrase: str, field: Optional[str] = None) -> dict:
+    """Reject a learned phrase (optionally for one field) and report the outcome.
+    Store calls run in the threadpool so they do not block the event loop.
+    """
+    ok = await run_in_threadpool(state.learn.reject, phrase, field)
+    stats = await run_in_threadpool(state.learn.stats)
+    return {"rejected": ok, "stats": stats}
+# --------------------------------------------------------------------------
+# Router factory — the prefix is configurable (default "/mapper", or the env
+# var TABULARMAPPER_ROUTE_PREFIX). This is a general table->schema mapper, so the
+# route name isn't bank-specific and you can set your own.
+# --------------------------------------------------------------------------
+def make_router(prefix: Optional[str] = None, tags: Optional[list] = None) -> APIRouter:
+    """Build an APIRouter exposing the mapper endpoints under `prefix`.
+    prefix: route prefix; None means TABULARMAPPER_ROUTE_PREFIX, else "/mapper".
+        Surrounding whitespace and slashes are normalized, so "catalog",
+        "/catalog" and "/catalog/" all mount at /catalog, and "/" or "" mount
+        at the root.
+    tags: OpenAPI tags for the routes; defaults to ["mapper"].
+    Routes: GET /health, POST /map, GET /learn/pending, POST /learn/approve,
+    POST /learn/reject.
+    """
+    if prefix is None:
+        prefix = os.getenv("TABULARMAPPER_ROUTE_PREFIX", "/mapper")
+    # APIRouter rejects a non-empty prefix that lacks a leading "/" or ends
+    # with one; normalize instead of failing at import time on e.g. "catalog".
+    trimmed = prefix.strip().strip("/")
+    normalized = f"/{trimmed}" if trimmed else ""
+    r = APIRouter(prefix=normalized, tags=tags or ["mapper"])
+    r.add_api_route("/health", health, methods=["GET"])
+    r.add_api_route("/map", map_statement, methods=["POST"], response_model=MapResponse)
+    r.add_api_route("/learn/pending", learn_pending, methods=["GET"])
+    r.add_api_route("/learn/approve", learn_approve, methods=["POST"])
+    r.add_api_route("/learn/reject", learn_reject, methods=["POST"])
+    return r
+# Default router instance -> /mapper/*  (or TABULARMAPPER_ROUTE_PREFIX).
+# Built at import time, so the env var is read when this module is first imported.
+router = make_router()
+# Standalone app (uvicorn tabularmapper.api:app); `lifespan` builds the shared
+# cache / matcher / learn store at startup.
+app = FastAPI(title="Tabular Mapper", lifespan=lifespan)
+app.include_router(router)