PyPI - tabularmapper - Versions diffs - 1.0.0__py3-none-any.whl - Mend

tabularmapper 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (16) hide show

tabularmapper/__init__.py +75 -0
tabularmapper/ai_matcher.py +247 -0
tabularmapper/api.py +186 -0
tabularmapper/cli.py +233 -0
tabularmapper/engine.py +938 -0
tabularmapper/learn.py +203 -0
tabularmapper/llm_fallback.py +118 -0
tabularmapper/mapping_cache.py +73 -0
tabularmapper/schema.py +341 -0
tabularmapper/stores.py +238 -0
tabularmapper-1.0.0.dist-info/METADATA +455 -0
tabularmapper-1.0.0.dist-info/RECORD +16 -0
tabularmapper-1.0.0.dist-info/WHEEL +5 -0
tabularmapper-1.0.0.dist-info/entry_points.txt +2 -0
tabularmapper-1.0.0.dist-info/licenses/LICENSE +21 -0
tabularmapper-1.0.0.dist-info/top_level.txt +1 -0

tabularmapper/schema.py ADDED Viewed

@@ -0,0 +1,341 @@
+"""
+schema.py — externalized, loadable configuration for the mapper.
+Everything that used to be a hardcoded constant in `engine.py` — the output
+template (`OUTPUT_SCHEMA`), the header vocabulary (`SYNONYMS`), and the critical
+fields — lives here as data, and can be loaded from a JSON file, an HTTP(S) URL,
+an S3 object, or an in-memory dict. Change the template by editing JSON in a
+bucket; no code change, no redeploy.
+Config JSON shape (all keys optional; a missing key is simply left empty — no bank defaults are merged in):
+    {
+      "version": 1,
+      "output_schema": [
+        {"field": "date",        "header": "Date",             "type": "date"},
+        {"field": "description", "header": "Narration",        "type": "text"},
+        {"field": "debit",       "header": "Debit",            "type": "money"},
+        {"field": "credit",      "header": "Credit",           "type": "money"}
+      ],
+      "critical_fields": ["date"],
+      "synonyms": { "date": ["date", "txn date"], "debit": ["withdrawal"] }
+    }
+`type` ∈ {"date", "money", "text"} (plus the aliases listed in VALID_TYPES) drives generic extraction, so adding a NEW
+column is a config-only change. The field keys `debit`, `credit`, `amount` keep
+their special money-reconciliation behavior (a single signed `amount` column is
+split into debit/credit).
+"""
+from __future__ import annotations
+import json
+import logging
+import os
+import urllib.request
+from dataclasses import dataclass, field as _field
+from typing import Optional, Union
+# Module-level logger (registered under the name "engine.schema").
+_log = logging.getLogger("engine.schema")
+# Type vocabulary the engine understands, grouped by parsing family. Several
+# aliases are accepted so configs read naturally ("string", "integer", …).
+INTEGER_TYPES = {"integer", "int"}          # coerced to int when whole
+DATE_TYPES = {"date", "datetime"}
+TEXT_TYPES = {"text", "string", "str"}
+# Every integer alias is also a numeric alias.
+NUMERIC_TYPES = INTEGER_TYPES | {"money", "number", "currency", "numeric",
+                                 "decimal", "float"}
+# Union of all families: anything outside this set is not a recognised type.
+VALID_TYPES = DATE_TYPES | NUMERIC_TYPES | TEXT_TYPES
+# --------------------------------------------------------------------------
+# Bank preset data — copied VERBATIM from the original engine.py constants.
+# Consumed only by bank_preset(); default_config() itself is empty.
+# --------------------------------------------------------------------------
+# Output columns of the bank preset, in output order. Each entry is expanded
+# with FieldSpec(**entry) by bank_preset(), so the keys must match FieldSpec's
+# attribute names (field, header, type, description).
+BANK_SCHEMA: list[dict] = [
+    {"field": "date", "header": "Date", "type": "date",
+     "description": "the transaction date (post/value/booking date)"},
+    {"field": "description", "header": "Narration", "type": "text",
+     "description": "free-text narration / particulars / details of the transaction"},
+    {"field": "reference", "header": "Reference Number", "type": "text",
+     "description": "reference or cheque/UTR/instrument number identifying the entry"},
+    {"field": "debit", "header": "Debit", "type": "money",
+     "description": "money leaving the account (withdrawal / paid out); a debit-only column"},
+    {"field": "credit", "header": "Credit", "type": "money",
+     "description": "money entering the account (deposit / paid in); a credit-only column"},
+    {"field": "balance", "header": "Balance", "type": "money",
+     "description": "running account balance after the transaction"},
+]
+# Field keys the bank preset marks as critical (copied into Config.critical_fields).
+BANK_CRITICAL_FIELDS: list[str] = ["date"]
+# --- Bank preset behavior (all data, not engine logic) -------------------
+# reconcile: a single signed `amount` column is split into debit(-)/credit(+);
+#   when debit/credit are their own columns they're taken as positive.
+BANK_RECONCILE: dict = {"signed": "amount", "negative": "debit", "positive": "credit"}
+# require_any: each group needs >=1 mapped field or the statement is flagged.
+BANK_REQUIRE_ANY: list = [["debit", "credit", "amount"]]
+# row_keep_if_any: a row is a real record if >=1 of these has a value.
+BANK_ROW_KEEP_IF_ANY: list = ["date", "debit", "credit"]
+# continuation_field: a row with only this field folds into the row above it.
+BANK_CONTINUATION_FIELD: Optional[str] = "description"
+# descriptions for fields the AI matcher may see but that aren't output columns
+# ("amount" is the reconcile `signed` field above and has no BANK_SCHEMA entry)
+BANK_FIELD_DESCRIPTIONS: dict = {
+    "amount": "a SINGLE signed amount column (one column, +credit / -debit)",
+}
+# Header vocabulary of the bank preset: {field key: [header spellings]}.
+# bank_preset() copies each list, so a Config built from it never aliases
+# these module-level lists. Besides the six BANK_SCHEMA fields there is an
+# "amount" entry — the signed column named by BANK_RECONCILE["signed"].
+# NOTE(review): all entries are lowercase; presumably the engine normalizes
+# incoming headers before comparing — confirm against engine.py.
+BANK_SYNONYMS: dict[str, list[str]] = {
+    "date": [
+        "date", "txn date", "transaction date", "value date", "posting date",
+        "post date", "tran date", "date of transaction", "trans date", "dt",
+        "booking date", "entry date",
+    ],
+    "description": [
+        "description", "narration", "particulars", "details", "remarks",
+        "transaction details", "transaction remarks", "narrative", "memo",
+        "transaction description", "txn description", "notes", "purpose",
+    ],
+    "reference": [
+        "reference", "reference number", "reference no", "ref no", "ref no.",
+        "ref no./cheque no", "ref no./cheque no.", "cheque no", "cheque no.",
+        "chq no", "chq no.", "ref", "reference id", "utr", "utr no",
+        "instrument no", "cheque/ref no", "chq/ref no", "transaction id",
+        "ref/cheque no",
+    ],
+    "debit": [
+        "debit", "withdrawal", "withdrawals", "withdrawal amt", "withdrawal amount",
+        "withdrawal (dr)", "dr", "dr amount", "debit amount", "debit amt",
+        "paid out", "payments", "money out", "amount debited", "outflow",
+        "debit(dr)", "withdrawal amt.",
+    ],
+    "credit": [
+        "credit", "deposit", "deposits", "deposit amt", "deposit amount",
+        "deposit (cr)", "cr", "cr amount", "credit amount", "credit amt",
+        "paid in", "receipts", "money in", "amount credited", "inflow",
+        "credit(cr)", "deposit amt.",
+    ],
+    "balance": [
+        "balance", "closing balance", "running balance", "available balance",
+        "balance amount", "bal", "closing bal", "ledger balance", "book balance",
+        "balance (inr)",
+    ],
+    "amount": [
+        "amount", "transaction amount", "txn amount", "amt", "value",
+        "signed amount", "amount (inr)", "amount(dr/cr)", "transaction amt",
+    ],
+}
+# --------------------------------------------------------------------------
+# Data classes
+# --------------------------------------------------------------------------
+@dataclass
+class FieldSpec:
+    """One column of the output template."""
+    # Internal key: date, description, debit, ...
+    field: str
+    # Display name written to the output file.
+    header: str
+    # Parse family: date | number/money | text.
+    type: str = "text"
+    # Optional free-text hint; used by the AI matcher.
+    description: str = ""
+@dataclass
+class Config:
+    """Complete mapper configuration: output template, vocabulary and the
+    optional data-driven domain rules, plus the derived views built from them."""
+    output_schema: list[FieldSpec]
+    synonyms: dict[str, list[str]]
+    critical_fields: list[str]
+    # Domain behavior — purely data, and empty unless a config supplies it.
+    reconcile: dict = _field(default_factory=dict)          # {signed,negative,positive}
+    require_any: list = _field(default_factory=list)        # [[field, ...], ...]
+    row_keep_if_any: list = _field(default_factory=list)    # keep row if any has a value
+    continuation_field: Optional[str] = None                # multi-line fold target
+    extra_field_descriptions: dict = _field(default_factory=dict)  # non-output field defs
+    # -- derived, read-only views --
+    @property
+    def fields(self) -> list[str]:
+        """Field keys of the output columns, in schema order."""
+        return [spec.field for spec in self.output_schema]
+    @property
+    def headers(self) -> list[tuple[str, str]]:
+        """Back-compat shape: (field_key, display_header) pairs in schema order."""
+        pairs = []
+        for spec in self.output_schema:
+            pairs.append((spec.field, spec.header))
+        return pairs
+    @property
+    def field_types(self) -> dict[str, str]:
+        """{field key: declared type} for the output columns."""
+        types = {}
+        for spec in self.output_schema:
+            types[spec.field] = spec.type
+        return types
+    @property
+    def field_descriptions(self) -> dict[str, str]:
+        """{field: description} for the AI matcher (output fields + extras).
+        A column without a description is described by its own key; entries
+        in `extra_field_descriptions` override same-named output fields."""
+        described = {}
+        for spec in self.output_schema:
+            described[spec.field] = spec.description if spec.description else spec.field
+        described.update(self.extra_field_descriptions)
+        return described
+    @property
+    def reconcile_fields(self) -> list[str]:
+        """Fields named by the reconcile rule, ordered signed, negative,
+        positive; roles that are missing or empty are skipped."""
+        rule = self.reconcile or {}
+        picked = []
+        for role in ("signed", "negative", "positive"):
+            target = rule.get(role)
+            if target:
+                picked.append(target)
+        return picked
+    @property
+    def allowed_fields(self) -> list[str]:
+        """Output fields followed by any extra-description and reconcile
+        fields that are not already listed (first occurrence wins)."""
+        names = list(self.fields)
+        # Snapshot both sources up front, then append unseen names in order.
+        candidates = list(self.extra_field_descriptions) + self.reconcile_fields
+        for name in candidates:
+            if name in names:
+                continue
+            names.append(name)
+        return names
+# --------------------------------------------------------------------------
+# Builders
+# --------------------------------------------------------------------------
+def _infer_type(field_key: str) -> str:
+    """Guess a column type from a well-known field key.
+    "date" maps to "date"; debit/credit/balance/amount map to "money";
+    every other key is treated as "text".
+    """
+    known = {
+        "date": "date",
+        "debit": "money",
+        "credit": "money",
+        "balance": "money",
+        "amount": "money",
+    }
+    return known.get(field_key, "text")
+def default_config() -> Config:
+    """Return the built-in default configuration, which is EMPTY.
+    This is a general mapper, so with no config it maps nothing — you must
+    provide an output_schema + synonyms (a file/URL named by the
+    TABULARMAPPER_CONFIG environment variable, a dict, or configure()). Use
+    `bank_preset()` for the ready-made bank-statement schema.
+    Returns:
+        A Config with no output columns, no synonyms and no critical fields;
+        the remaining attributes keep their empty dataclass defaults.
+    """
+    return Config(output_schema=[], synonyms={}, critical_fields=[])
+def bank_preset() -> Config:
+    """Build the ready-made bank-statement Config (Date, Narration, Reference,
+    Debit, Credit, Balance) including debit/credit reconciliation. Also in
+    config.example.json.
+        from tabularmapper import bank_preset, configure
+        configure(config=bank_preset())
+    Every list/dict is copied from the BANK_* constants, so mutating the
+    returned Config does not touch the module-level data.
+    """
+    columns = []
+    for entry in BANK_SCHEMA:
+        columns.append(FieldSpec(**entry))
+    vocabulary = {}
+    for key, words in BANK_SYNONYMS.items():
+        vocabulary[key] = list(words)
+    groups = []
+    for group in BANK_REQUIRE_ANY:
+        groups.append(list(group))
+    return Config(
+        output_schema=columns,
+        synonyms=vocabulary,
+        critical_fields=list(BANK_CRITICAL_FIELDS),
+        reconcile=dict(BANK_RECONCILE),
+        require_any=groups,
+        row_keep_if_any=list(BANK_ROW_KEEP_IF_ANY),
+        continuation_field=BANK_CONTINUATION_FIELD,
+        extra_field_descriptions=dict(BANK_FIELD_DESCRIPTIONS),
+    )
+def config_from_dict(d: dict, _origin: str = "<dict>") -> Config:
+    """Build a Config from a parsed JSON dict. This is the GENERIC path — nothing
+    bank-specific is assumed; declare what you want.
+    Args:
+        d: the parsed config. All keys are optional; a missing key yields an
+            empty value in the resulting Config.
+        _origin: label used in log messages (the source path/URL).
+    Returns:
+        The Config described by `d`.
+    Raises:
+        KeyError: an `output_schema` dict entry has no "field" key.
+    `output_schema` entries may be dicts ({"field", "header", "type",
+    "description"}) or [field, header] pairs. `type` is matched against
+    VALID_TYPES ignoring case and surrounding whitespace; an unknown or
+    missing type is inferred from the field key. A synonym value given as a
+    bare string is treated as a one-item list.
+    """
+    if not d.get("output_schema"):
+        _log.warning(
+            "config %s has no non-empty 'output_schema' — nothing will be mapped. "
+            "Provide output_schema (or use bank_preset() for the bank layout).",
+            _origin)
+    specs: list[FieldSpec] = []
+    for item in d.get("output_schema") or []:
+        if isinstance(item, dict):
+            key = item["field"]
+            specs.append(FieldSpec(
+                field=key,
+                header=item.get("header", key),
+                type=item.get("type") or _infer_type(key),
+                description=item.get("description", ""),
+            ))
+        elif isinstance(item, (list, tuple)) and len(item) >= 2:
+            specs.append(FieldSpec(field=item[0], header=item[1],
+                                   type=_infer_type(item[0])))
+        else:
+            # Previously dropped without a trace; still skipped, but say so.
+            _log.warning("config %s: ignoring malformed output_schema entry %r",
+                         _origin, item)
+    for s in specs:
+        declared = s.type
+        # "Date" / " money " should mean the same as "date" / "money".
+        normalized = declared.strip().lower() if isinstance(declared, str) else None
+        if normalized in VALID_TYPES:
+            s.type = normalized
+        else:
+            s.type = _infer_type(s.field)
+            _log.warning(
+                "config %s: field %r has unknown type %r — using %r instead",
+                _origin, s.field, declared, s.type)
+    # Synonyms are exactly what you declare — no bank defaults are merged in.
+    syn: dict[str, list[str]] = {}
+    for k, v in (d.get("synonyms") or {}).items():
+        # list("txn date") would explode the string into single characters.
+        syn[k] = [v] if isinstance(v, str) else list(v)
+    crit = d.get("critical_fields") or []
+    return Config(
+        output_schema=specs,
+        synonyms=syn,
+        critical_fields=list(crit),
+        reconcile=dict(d.get("reconcile") or {}),
+        require_any=[list(g) for g in (d.get("require_any") or [])],
+        row_keep_if_any=list(d.get("row_keep_if_any") or []),
+        continuation_field=d.get("continuation_field"),
+        extra_field_descriptions=dict(d.get("field_descriptions") or {}),
+    )
+# --------------------------------------------------------------------------
+# Loading — file / http(s) / s3 / dict, with a fail-safe to defaults
+# --------------------------------------------------------------------------
+def _read_source(source: str, timeout: float = 10.0) -> bytes:
+    """Fetch the raw bytes of a config source.
+    Dispatches on the prefix of `source`: "s3://" goes to `_read_s3`,
+    "http://" / "https://" is fetched with urllib (honouring `timeout`),
+    and anything else is opened as a local file after an optional leading
+    "file://" has been removed.
+    """
+    if source.startswith("s3://"):
+        return _read_s3(source)
+    is_web = source.startswith(("http://", "https://"))
+    if is_web:
+        with urllib.request.urlopen(source, timeout=timeout) as resp:
+            return resp.read()
+    file_scheme = "file://"
+    local_path = source[len(file_scheme):] if source.startswith(file_scheme) else source
+    with open(local_path, "rb") as handle:
+        payload = handle.read()
+    return payload
+def _read_s3(uri: str) -> bytes:
+    """Download an `s3://bucket/key` object and return its bytes.
+    boto3 is imported lazily so it stays an optional dependency; when it is
+    missing, an ImportError explaining the alternatives is raised.
+    """
+    from urllib.parse import urlparse
+    try:
+        import boto3  # optional; only for s3:// sources — or use a presigned https URL
+    except ImportError as exc:
+        raise ImportError(
+            "Loading config from s3:// needs the 'boto3' package (pip install "
+            "boto3), or pass a presigned https:// URL instead (no dependency)."
+        ) from exc
+    location = urlparse(uri)
+    bucket = location.netloc
+    object_key = location.path.lstrip("/")   # the S3 key has no leading slash
+    response = boto3.client("s3").get_object(Bucket=bucket, Key=object_key)
+    return response["Body"].read()
+def load_config(source: Optional[Union[str, dict]] = None,
+                strict: bool = False) -> Config:
+    """Load configuration.
+    source:
+      * None      -> env TABULARMAPPER_CONFIG, else the built-in defaults
+                     (an empty or whitespace-only env value counts as unset)
+      * dict      -> used directly
+      * "s3://…"  -> S3 object (needs boto3) OR use a presigned https URL instead
+      * "http(s)://…" / path / "file://…" -> fetched via stdlib urllib
+    On any load/parse error of a str source, falls back to the defaults (so a
+    bad or unreachable config never takes the service down) unless
+    `strict=True`, in which case the original exception propagates. A dict
+    source is built outside that safety net: errors raised while converting it
+    always propagate.
+    """
+    if source is None:
+        # `export TABULARMAPPER_CONFIG=` must mean "not configured", not
+        # "open the file named ''" (which only produced a misleading warning).
+        source = (os.getenv("TABULARMAPPER_CONFIG") or "").strip() or None
+    if source is None:
+        return default_config()
+    if isinstance(source, dict):
+        return config_from_dict(source)
+    try:
+        raw = _read_source(str(source))
+        return config_from_dict(json.loads(raw), _origin=str(source))
+    except Exception as exc:
+        # Deliberately broad: this is the service's fail-safe boundary.
+        if strict:
+            raise
+        _log.warning(
+            "TABULARMAPPER config %r failed to load (%s: %s) — falling back to "
+            "built-in defaults", source, type(exc).__name__, exc)
+        return default_config()
+def config_to_dict(cfg: Config) -> dict:
+    """Turn a Config back into the JSON-friendly dict shape (for saving/harvest).
+    Lists and dicts are copied, and a column's "description" key is written
+    only when the description is non-empty.
+    """
+    schema_rows = []
+    for spec in cfg.output_schema:
+        row = {"field": spec.field, "header": spec.header, "type": spec.type}
+        if spec.description:
+            row["description"] = spec.description
+        schema_rows.append(row)
+    groups = []
+    for group in cfg.require_any:
+        groups.append(list(group))
+    vocabulary = {}
+    for key, words in cfg.synonyms.items():
+        vocabulary[key] = list(words)
+    return {
+        "version": 1,
+        "output_schema": schema_rows,
+        "critical_fields": list(cfg.critical_fields),
+        "reconcile": dict(cfg.reconcile),
+        "require_any": groups,
+        "row_keep_if_any": list(cfg.row_keep_if_any),
+        "continuation_field": cfg.continuation_field,
+        "field_descriptions": dict(cfg.extra_field_descriptions),
+        "synonyms": vocabulary,
+    }

tabularmapper/stores.py ADDED Viewed

@@ -0,0 +1,238 @@
+"""
+stores.py — pluggable key/value backends behind one URL convention.
+Every persistent store in the package (the mapping cache today, the learned
+synonyms next) is a `KeyValueStore`. You pick the backend with a URL, exactly
+like SQLAlchemy / Celery — swap it with an env var, no code change:
+    memory://                         in-process dict (tests, single worker)
+    ./mapping_cache.db  /  sqlite:///mapping_cache.db
+                                      SQLite file — no server, concurrency-safe (DEFAULT)
+    ./mapping_cache.json / file://... legacy JSON file (NOT multi-worker safe)
+    redis://host:6379/0               Redis            (pip install ...[redis])
+    valkey://host:6379/0              Valkey           (pip install ...[valkey])
+    postgresql://user@host/db         Postgres         (pip install ...[postgres])
+Escape hatch: any object with get()/put() works — pass your own to open_store's
+consumers directly if you have a backend we don't ship.
+"""
+from __future__ import annotations
+import json
+import os
+import threading
+from typing import Optional
+try:                       # typing only; Protocol may be absent on very old pythons
+    from typing import Protocol
+except ImportError:        # pragma: no cover
+    Protocol = object      # type: ignore
+class KeyValueStore(Protocol):
+    """Structural interface that every backend in this module implements."""
+    def get(self, key: str) -> Optional[dict]:
+        """Return the value stored under `key`, or None when there is none."""
+    def put(self, key: str, value: dict) -> None:
+        """Store `value` under `key`, replacing any previous value."""
+    def close(self) -> None:
+        """Release whatever resources the backend holds."""
+# --------------------------------------------------------------------------
+# In-memory
+# --------------------------------------------------------------------------
+class MemoryStore:
+    """Key/value store backed by a plain in-process dict; nothing is persisted."""
+    def __init__(self) -> None:
+        self._d: dict[str, dict] = dict()
+    def get(self, key: str) -> Optional[dict]:
+        """Return the stored object itself (not a copy), or None if absent."""
+        try:
+            return self._d[key]
+        except KeyError:
+            return None
+    def put(self, key: str, value: dict) -> None:
+        """Store `value` under `key`, overwriting any earlier entry."""
+        self._d.update({key: value})
+    def close(self) -> None:
+        """Nothing to release."""
+        return None
+# --------------------------------------------------------------------------
+# JSON file (legacy default; whole-file rewrite, NOT multi-worker safe)
+# --------------------------------------------------------------------------
+class JsonFileStore:
+    """Key/value store persisted as one JSON object in a single file.
+    The whole file is rewritten on every `put`, so this backend is NOT safe
+    for several workers sharing one file; a lock only serialises writers
+    inside this process.
+    """
+    def __init__(self, path: str) -> None:
+        """Load `path` if it exists; an unreadable, undecodable or non-object
+        file is treated as an empty store instead of failing."""
+        self.path = path
+        self._data: dict[str, dict] = {}
+        self._lock = threading.Lock()
+        if os.path.exists(path):
+            try:
+                with open(path, "r", encoding="utf-8") as fh:
+                    loaded = json.load(fh)
+            except (ValueError, OSError):
+                # ValueError covers JSONDecodeError and UnicodeDecodeError.
+                loaded = None
+            # Valid JSON that is not an object (e.g. a list) would break get().
+            if isinstance(loaded, dict):
+                self._data = loaded
+    def get(self, key: str) -> Optional[dict]:
+        """Return the value stored under `key`, or None when absent."""
+        return self._data.get(key)
+    def put(self, key: str, value: dict) -> None:
+        """Store `value` under `key` and rewrite the file.
+        Raises:
+            TypeError / ValueError: `value` is not JSON-serialisable.
+            OSError: the file could not be written.
+        On any failure the store (in memory and on disk) keeps its previous
+        contents, so one bad value cannot poison later writes.
+        """
+        with self._lock:
+            updated = dict(self._data)
+            updated[key] = value
+            # Serialise before touching anything: a failure here leaves no
+            # temp file behind and no bad value in memory.
+            payload = json.dumps(updated, indent=2)
+            tmp = f"{self.path}.tmp"
+            try:
+                with open(tmp, "w", encoding="utf-8") as fh:
+                    fh.write(payload)
+                os.replace(tmp, self.path)   # atomic-ish within a single process
+            except OSError:
+                try:
+                    os.remove(tmp)
+                except OSError:
+                    pass                     # temp file was never created
+                raise
+            self._data = updated
+    def close(self) -> None:
+        """Nothing to release: every `put` already writes through to disk."""
+        pass
+# --------------------------------------------------------------------------
+# SQLite (default) — file-based, no server, concurrency-safe via WAL
+# --------------------------------------------------------------------------
+class SqliteStore:
+    """Key/value store in a SQLite file: one `kv` table with JSON text values.
+    A single connection (opened with check_same_thread=False) is shared;
+    writes are serialised with a lock, reads are issued without it.
+    """
+    def __init__(self, path: str) -> None:
+        """Open (or create) the database at `path`, switch it to WAL
+        journaling with a 5000 ms busy timeout, and make sure `kv` exists."""
+        import sqlite3
+        self.path = path
+        self._lock = threading.Lock()
+        conn = sqlite3.connect(path, check_same_thread=False)
+        setup = (
+            "PRAGMA journal_mode=WAL",
+            "PRAGMA busy_timeout=5000",
+            "CREATE TABLE IF NOT EXISTS kv (key TEXT PRIMARY KEY, value TEXT NOT NULL)",
+        )
+        for statement in setup:
+            conn.execute(statement)
+        conn.commit()
+        self._conn = conn
+    def get(self, key: str) -> Optional[dict]:
+        """Return the decoded value stored under `key`, or None when absent."""
+        row = self._conn.execute(
+            "SELECT value FROM kv WHERE key = ?", (key,)).fetchone()
+        if not row:
+            return None
+        return json.loads(row[0])
+    def put(self, key: str, value: dict) -> None:
+        """Insert or overwrite `key` with the JSON encoding of `value`."""
+        upsert = ("INSERT INTO kv (key, value) VALUES (?, ?) "
+                  "ON CONFLICT(key) DO UPDATE SET value = excluded.value")
+        with self._lock:
+            self._conn.execute(upsert, (key, json.dumps(value)))
+            self._conn.commit()
+    def close(self) -> None:
+        """Close the underlying connection."""
+        self._conn.close()
+# --------------------------------------------------------------------------
+# Redis / Valkey (optional deps, lazy import).
+# Valkey is the open-source Redis fork; the two speak the same wire protocol,
+# so a single client resolver + get/put serves both. Per the Aiven docs, the
+# Valkey client is built with the module-level `valkey.from_url(uri)`.
+# --------------------------------------------------------------------------
+def _redis_proto_client(url: str, prefer: str = "redis"):
+    """Build a client for any redis-protocol server (Redis or Valkey).
+    Both drivers are wire-compatible and either can serve either scheme, so we
+    try the preferred driver first, then the other, normalizing the URL scheme
+    for whichever library is used. Managed Valkey (e.g. Aiven) hands out a TLS
+    URI — pass it straight through as valkey:// / valkeys:// / rediss://.
+    Args:
+        url: connection URL with a redis://, rediss://, valkey:// or
+            valkeys:// scheme.
+        prefer: "valkey" tries valkey-py first; anything else tries redis-py
+            first.
+    Returns:
+        Whatever the chosen library's module-level `from_url` returns.
+    Raises:
+        ImportError: neither `redis` nor `valkey` can be imported.
+    """
+    # Scheme rewrites per driver. Only the leading scheme is rewritten, so
+    # "redis://" appearing later in the URL (e.g. in a password) is untouched.
+    rewrites = {
+        "redis": (("valkeys://", "rediss://"), ("valkey://", "redis://")),
+        "valkey": (("rediss://", "valkeys://"), ("redis://", "valkey://")),
+    }
+    order = ["valkey", "redis"] if prefer == "valkey" else ["redis", "valkey"]
+    last_err = None
+    for lib in order:
+        try:
+            mod = __import__(lib)              # valkey-py or redis-py
+        except ImportError as exc:
+            last_err = exc
+            continue
+        u = url
+        for old, new in rewrites[lib]:
+            if url.startswith(old):
+                u = new + url[len(old):]
+                break
+        return mod.from_url(u)                  # module-level from_url (both expose it)
+    raise ImportError(
+        "This cache backend needs the 'valkey' or 'redis' package. Install one "
+        "with:  pip install tabularmapper[valkey]   (or [redis]). Both "
+        "are optional — the default SQLite backend needs nothing extra."
+    ) from last_err
+class _RedisProtocolStore:
+    """Shared get/put logic for any client exposing redis-style get()/set().
+    Values are stored as JSON strings under `prefix + key`.
+    """
+    def __init__(self, client, prefix: str = "bankmap:") -> None:
+        self._r = client
+        self._prefix = prefix
+    def get(self, key: str) -> Optional[dict]:
+        """Return the decoded value, or None when the client returns nothing."""
+        raw = self._r.get(self._prefix + key)
+        if not raw:
+            return None
+        return json.loads(raw)   # json.loads accepts bytes
+    def put(self, key: str, value: dict) -> None:
+        """JSON-encode `value` and SET it under the prefixed key."""
+        full_key = self._prefix + key
+        payload = json.dumps(value)
+        self._r.set(full_key, payload)
+    def close(self) -> None:
+        """No-op: the client is left open."""
+        pass
+class RedisStore(_RedisProtocolStore):
+    """Redis-backed store. The client comes from `_redis_proto_client` with
+    redis-py preferred, so valkey-py is used when redis-py is not installed."""
+    def __init__(self, url: str, prefix: str = "bankmap:") -> None:
+        # `prefix` is prepended to every key this store reads or writes.
+        super().__init__(_redis_proto_client(url, prefer="redis"), prefix)
+class ValkeyStore(_RedisProtocolStore):
+    """Valkey (the open-source Redis fork). Uses valkey-py (`valkey.from_url`)
+    if installed, else falls back to the wire-compatible redis-py. Keys are
+    stored under `prefix + key`, exactly as in the shared base class."""
+    def __init__(self, url: str, prefix: str = "bankmap:") -> None:
+        # prefer="valkey" makes the resolver try valkey-py before redis-py.
+        super().__init__(_redis_proto_client(url, prefer="valkey"), prefix)
+# --------------------------------------------------------------------------
+# Postgres (optional dep, lazy import)
+# --------------------------------------------------------------------------
+class PostgresStore:
+    """Key/value store in a Postgres table (`key TEXT PRIMARY KEY, value JSONB`).
+    psycopg is imported lazily so it stays an optional dependency. The
+    connection is opened with autocommit=True, so each statement is committed
+    on its own.
+    """
+    def __init__(self, url: str, table: str = "engine_kv") -> None:
+        """Connect to `url` and create `table` if it does not exist.
+        Raises:
+            ImportError: the `psycopg` package is not installed.
+        """
+        try:
+            import psycopg
+        except ImportError as exc:
+            raise ImportError(
+                "The postgres cache backend needs the 'psycopg' package. Install "
+                "it with:  pip install tabularmapper[postgres]. It is "
+                "optional — the default SQLite backend needs nothing extra."
+            ) from exc
+        # NOTE(review): `table` is interpolated into SQL text below (identifiers
+        # cannot be bound as parameters). It must come from trusted
+        # configuration only, never from user input.
+        self._table = table
+        self._conn = psycopg.connect(url, autocommit=True)
+        self._conn.execute(
+            f"CREATE TABLE IF NOT EXISTS {table} "
+            "(key TEXT PRIMARY KEY, value JSONB NOT NULL)")
+    def get(self, key: str) -> Optional[dict]:
+        """Return the value stored under `key`, or None when absent.
+        The column value is returned as the driver hands it over —
+        presumably psycopg already decodes JSONB into Python objects; confirm.
+        """
+        cur = self._conn.execute(
+            f"SELECT value FROM {self._table} WHERE key = %s", (key,))
+        row = cur.fetchone()
+        return row[0] if row else None
+    def put(self, key: str, value: dict) -> None:
+        """Insert or overwrite `key` with the JSON encoding of `value`."""
+        self._conn.execute(
+            f"INSERT INTO {self._table} (key, value) VALUES (%s, %s) "
+            "ON CONFLICT (key) DO UPDATE SET value = EXCLUDED.value",
+            (key, json.dumps(value)))
+    def close(self) -> None:
+        """Close the database connection."""
+        self._conn.close()
+# --------------------------------------------------------------------------
+# The factory
+# --------------------------------------------------------------------------
+def open_store(url: Optional[str]) -> KeyValueStore:
+    """Return a KeyValueStore for a URL/path. `None` -> in-memory.
+    Resolution order: empty / "memory://" / "memory:" -> MemoryStore; then
+    the valkey, redis and postgres schemes; then "sqlite://"; finally a
+    bare path (after an optional "file://" is removed), where a .db /
+    .sqlite / .sqlite3 suffix selects SQLite and anything else the JSON file
+    store.
+    """
+    if not url or url in ("memory://", "memory:"):
+        return MemoryStore()
+    networked = (
+        (("valkey://", "valkeys://"), ValkeyStore),
+        (("redis://", "rediss://"), RedisStore),
+        (("postgresql://", "postgres://"), PostgresStore),
+    )
+    for schemes, backend in networked:
+        if url.startswith(schemes):
+            return backend(url)
+    if url.startswith("sqlite://"):
+        # "sqlite:///" (three slashes) is removed whole, so sqlite:///rel.db is
+        # a relative path and sqlite:////abs.db an absolute one; a bare
+        # "sqlite://" leaves nothing and selects an in-memory database.
+        scheme = "sqlite:///" if url.startswith("sqlite:///") else "sqlite://"
+        return SqliteStore(url[len(scheme):] or ":memory:")
+    if url.startswith("file://"):
+        url = url[len("file://"):]
+    # bare path: choose by extension
+    wants_sqlite = url.endswith((".db", ".sqlite", ".sqlite3"))
+    return SqliteStore(url) if wants_sqlite else JsonFileStore(url)