tabularmapper 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,75 @@
1
+ """
2
+ tabularmapper — map any spreadsheet (.xlsx) to a schema you define.
3
+
4
+ Two-stage, auditable pipeline: deterministic header detection + synonym/fuzzy
5
+ column mapping, with an optional AI table matcher and a self-learning vocabulary.
6
+ The engine is domain-agnostic; "bank statements" is just a built-in preset.
7
+
8
+ Quick start:
9
+
10
+ from tabularmapper import process_file, configure, config_from_dict
11
+ configure(config_from_dict({"output_schema": [...], "synonyms": {...}}))
12
+ res = process_file("file.xlsx")
13
+ print(res.records) # list[dict], ready for JSON / DB
14
+
15
+ # or the ready-made bank layout:
16
+ from tabularmapper import bank_preset, configure
17
+ configure(config=bank_preset())
18
+
19
+ Heavier pieces are kept as submodules so importing this package stays light:
20
+ from tabularmapper.ai_matcher import OpenAICompatibleMatcher
21
+ from tabularmapper.api import router # needs [api] extra
22
+ """
23
+
24
+ from .engine import (
25
+ ALLOWED_FIELDS,
26
+ OUTPUT_SCHEMA,
27
+ ColumnMap,
28
+ OutputResult,
29
+ ProcessResult,
30
+ apply_learned,
31
+ configure,
32
+ detect_header_row,
33
+ map_columns,
34
+ normalize_amount,
35
+ normalize_date,
36
+ process_file,
37
+ process_stream,
38
+ records_to_csv_bytes,
39
+ )
40
+ from .learn import LearnStore, harvest_folder, learn_from_result
41
+ from .mapping_cache import MappingCache
42
+ from .schema import (
43
+ Config, bank_preset, config_from_dict, default_config, load_config,
44
+ )
45
+ from .stores import open_store
46
+
47
# Package version string.
__version__ = "1.0.0"

# Names exported by `from tabularmapper import *`, grouped by concern.
__all__ = [
    # pipeline entry points
    "process_file", "process_stream", "records_to_csv_bytes",
    "configure", "apply_learned",
    # cache and self-learning vocabulary
    "MappingCache", "LearnStore", "learn_from_result", "harvest_folder",
    # configuration helpers
    "load_config", "config_from_dict", "default_config", "bank_preset",
    "Config", "open_store",
    # result types and schema constants
    "ProcessResult", "ColumnMap", "OutputResult",
    "OUTPUT_SCHEMA", "ALLOWED_FIELDS",
    # lower-level building blocks
    "detect_header_row", "map_columns", "normalize_amount", "normalize_date",
    "__version__",
]
@@ -0,0 +1,247 @@
1
+ """
2
+ ai_matcher.py — LLM-based, table-level column matcher for NEW bank layouts.
3
+
4
+ This is the high-accuracy fallback path: when a statement's
5
+ header is unknown to the synonym table, one LLM call maps the whole header row
6
+ to the output fields and the result is written straight into mapping_cache.json,
7
+ so that bank is "known" forever after (never hits the LLM again).
8
+
9
+ PRIVACY — the model matches the TABLE, never the data
10
+ -----------------------------------------------------
11
+ The prompt contains ONLY:
12
+ * column header strings (e.g. "Withdrawals", "Value Dt")
13
+ * a structural profile per column computed locally (dtype, sign, fill-rate,
14
+ which columns are mutually exclusive) — this is metadata, NOT cell contents
15
+ * the list of allowed output fields + short descriptions
16
+
17
+ It NEVER contains transaction amounts, dates, names, narrations or references.
18
+ No real statement data leaves the machine. (The constructor accepts an
18
+ include_samples flag, OFF by default, but the prompt builder does not read it.)
20
+
21
+ Provider — OpenAI-compatible
22
+ ----------------------------
23
+ Works with any endpoint that speaks the OpenAI /chat/completions API: OpenAI,
24
+ Azure OpenAI, Together, Groq, or a local vLLM / Ollama / LM Studio server. Set
25
+ base_url + api_key + model. Uses only the Python standard library (urllib), so
26
+ there is no SDK dependency to install or pin.
27
+ """
28
+
29
+ from __future__ import annotations
30
+
31
+ import datetime as _dt
32
+ import json
33
+ import os
34
+ import re
35
+ import urllib.request
36
+ from typing import Callable, Optional
37
+
38
+ # No hardcoded field definitions — descriptions come from the config (each
39
+ # output field may carry a `description`). When a field has none, the matcher
40
+ # falls back to the field name itself, so this works for ANY domain, not just
41
+ # banking. Pass `field_defs={field: description}` to override.
42
FIELD_DEFS: dict[str, str] = dict()  # empty by default: no built-in descriptions
43
+
44
+
45
+ # --------------------------------------------------------------------------
46
+ # Structural profiling — deterministic, no cell contents leave this function
47
+ # --------------------------------------------------------------------------
48
def _classify(v) -> str:
    """Bucket one cell into ``"empty"``, ``"date"``, ``"number"`` or ``"text"``.

    Only the bucket label is returned; the cell value itself is not kept.
    """
    # Blank cells: None, or a string made only of whitespace.
    if v is None:
        return "empty"
    if isinstance(v, str) and not v.strip():
        return "empty"

    # Native Python values. bool is tested before int/float because bool is an
    # int subclass and would otherwise be reported as a number.
    if isinstance(v, (_dt.datetime, _dt.date)):
        return "date"
    if isinstance(v, bool):
        return "text"
    if isinstance(v, (int, float)):
        return "number"

    # Everything else is judged by its stripped string form.
    text = str(v).strip()
    looks_numeric = re.match(r"^[-(]?[\d,]+\.?\d*\)?\s*(dr|cr)?$", text, re.I)
    if looks_numeric:
        return "number"
    looks_like_date = (
        re.search(r"\d{1,4}[-/.]\d{1,2}[-/.]\d{1,4}", text)
        or re.match(r"\d{1,2}\s*[A-Za-z]{3,9}\s*\d{2,4}", text)
    )
    return "date" if looks_like_date else "text"
64
+
65
+
66
def _is_negative(v) -> bool:
    """Return True when *v* is a negative amount.

    Numbers (bool excluded) are negative when below zero. A string counts only
    when it looks like an amount -- at least one digit and no letters apart
    from an optional trailing ``dr``/``cr`` marker -- and carries a negative
    marker: a leading ``-``, a pair of parentheses, or a trailing ``dr``.

    Free text such as ``"ATM WDL (CARD 1234)"``, a word that merely ends in
    "dr", or a bare ``"-"`` placeholder is therefore NOT reported as negative.
    Any other type returns False.
    """
    if isinstance(v, bool):
        return False
    if isinstance(v, (int, float)):
        return v < 0
    if not isinstance(v, str):
        return False

    s = v.strip().lower()
    # No digit at all -> cannot be an amount (covers "" and the "-" placeholder).
    if not any(ch.isdigit() for ch in s):
        return False

    is_debit = s.endswith("dr")
    # Drop the dr/cr marker before the letter check so "250.00 Dr" still passes.
    core = s[:-2] if s.endswith(("dr", "cr")) else s
    if any(ch.isalpha() for ch in core):
        return False  # narration / reference text, not an amount

    return s.startswith("-") or ("(" in core and ")" in core) or is_debit
73
+
74
+
75
def profile_columns(header_row: list, data_rows: list[list],
                    max_rows: int = 40) -> list[dict]:
    """Return a per-column STRUCTURAL profile -- no raw cell values.

    Only the first ``max_rows`` data rows are inspected. Rows shorter than the
    header are treated as having empty cells in the missing positions.

    Each profile dict contains:
      * ``index``: column position in ``header_row``
      * ``name``: stripped header text ("" when the header cell is None)
      * ``dtype``: most frequent ``_classify`` bucket among filled cells, or
        ``"empty"`` when the column has none; ties go to the bucket seen first
      * ``fill_rate``: filled cells / inspected rows, rounded to 2 decimals
        (0.0 when there are no rows)
      * ``has_negative``: True if any filled cell is negative per ``_is_negative``
      * ``mutually_exclusive_with``: indices of other columns that also hold
        data but are never filled in the same row as this one (the signal for
        debit/credit pairs)
    """
    # Local import keeps the module's import block untouched.
    from collections import Counter

    ncols = len(header_row)
    rows = data_rows[:max_rows]
    filled = [[False] * ncols for _ in rows]
    dtypes: list[list[str]] = [[] for _ in range(ncols)]
    neg = [False] * ncols

    for r_i, row in enumerate(rows):
        for c in range(ncols):
            v = row[c] if c < len(row) else None
            t = _classify(v)
            if t == "empty":
                continue
            filled[r_i][c] = True
            dtypes[c].append(t)
            if _is_negative(v):
                neg[c] = True

    # "Column holds at least one value" is needed for every column pair below,
    # so compute it once here rather than inside the pair loop.
    col_has = [bool(dtypes[c]) for c in range(ncols)]
    nrows = len(rows)

    profiles = []
    for c in range(ncols):
        types = dtypes[c]
        # Counter.most_common orders equal counts by first occurrence, so the
        # majority type no longer depends on set iteration order.
        majority = Counter(types).most_common(1)[0][0] if types else "empty"
        # One dtype entry is appended per filled cell, so len(types) is the
        # number of filled rows for this column.
        fill_rate = (len(types) / nrows) if nrows else 0.0

        # mutual exclusivity: both columns hold data, never in the same row
        excl = []
        if col_has[c]:
            for d in range(ncols):
                if d == c or not col_has[d]:
                    continue
                if not any(r[c] and r[d] for r in filled):
                    excl.append(d)

        profiles.append({
            "index": c,
            "name": ("" if header_row[c] is None else str(header_row[c]).strip()),
            "dtype": majority,
            "fill_rate": round(fill_rate, 2),
            "has_negative": neg[c],
            "mutually_exclusive_with": excl,
        })
    return profiles
124
+
125
+
126
+ # --------------------------------------------------------------------------
127
+ # OpenAI-compatible table matcher
128
+ # --------------------------------------------------------------------------
129
class OpenAICompatibleMatcher:
    """Map an unknown header row to output fields with one LLM call.

    Transport is any OpenAI-compatible /chat/completions endpoint. Inject a
    custom `transport` (messages -> assistant_text) to unit-test without network.

    Instances are callable: ``matcher(header_row, data_rows, allowed_fields)``
    returns ``{col_index: field}``. Transport errors and malformed replies
    yield an empty dict instead of an exception.
    """

    def __init__(self,
                 base_url: Optional[str] = None,
                 api_key: Optional[str] = None,
                 model: Optional[str] = None,
                 field_defs: Optional[dict] = None,
                 include_samples: bool = False,
                 timeout: float = 30.0,
                 temperature: float = 0.0,
                 transport: Optional[Callable[[list], str]] = None):
        """Store connection settings, falling back to environment variables.

        Args:
            base_url: API root. Falls back to OPENAI_BASE_URL, then
                ``https://api.openai.com/v1``; trailing "/" is removed.
            api_key: Bearer token. Falls back to OPENAI_API_KEY, then "".
            model: Model name. Falls back to OPENAI_MODEL, then "gpt-4o-mini".
            field_defs: ``{field: description}`` used in the prompt. When
                omitted, a copy of the module-level FIELD_DEFS is used.
            include_samples: Stored on the instance; no method of this class
                reads it, so it currently has no effect on the prompt.
            timeout: Seconds passed to ``urlopen``.
            temperature: Sampling temperature sent with the request.
            transport: Optional callable ``messages -> assistant_text`` used
                instead of the built-in HTTP call.
        """
        self.base_url = (base_url or os.getenv("OPENAI_BASE_URL")
                         or "https://api.openai.com/v1").rstrip("/")
        self.api_key = api_key or os.getenv("OPENAI_API_KEY", "")
        self.model = model or os.getenv("OPENAI_MODEL", "gpt-4o-mini")
        self.field_defs = field_defs if field_defs is not None else dict(FIELD_DEFS)
        self.include_samples = include_samples
        self.timeout = timeout
        self.temperature = temperature
        self._transport = transport  # for tests / custom clients

    # -- prompt construction (structure only) --
    def _build_messages(self, profiles: list[dict], allowed_fields: list[str]) -> list:
        """Build the system + user chat messages from column profiles.

        Each allowed field is listed with its description from
        ``self.field_defs`` (the field name itself when none is given). Each
        column is described by index, header name, dtype, fill rate and the
        optional negative / mutually-exclusive hints.
        """
        field_lines = "\n".join(
            f" - {f}: {self.field_defs.get(f, f)}"
            for f in allowed_fields
        )
        col_lines = []
        for p in profiles:
            excl = (f", mutually-exclusive with columns {p['mutually_exclusive_with']}"
                    if p["mutually_exclusive_with"] else "")
            neg = ", contains negative values" if p["has_negative"] else ""
            col_lines.append(
                f" [{p['index']}] name={p['name']!r} "
                f"type={p['dtype']} fill={p['fill_rate']}{neg}{excl}"
            )
        cols = "\n".join(col_lines)
        system = (
            "You map bank-statement spreadsheet COLUMNS to a fixed schema. "
            "You are given only column headers and structural metadata (data "
            "types, fill rates, sign, and which columns are mutually exclusive) "
            "— never the actual transaction values. Use the header wording plus "
            "these structural hints. Two money columns that are mutually "
            "exclusive are almost always a debit/credit pair; decide direction "
            "from the header wording. A single signed money column (has negative "
            "values, not mutually exclusive with another money column) is "
            "'amount'. Respond with ONLY a JSON object mapping the column index "
            "(as a string) to one field name, or null if a column matches no "
            "field. Do not invent fields."
        )
        user = (
            f"Allowed fields:\n{field_lines}\n\n"
            f"Columns:\n{cols}\n\n"
            "Return JSON like {\"0\": \"date\", \"1\": \"description\", "
            "\"4\": null}. Every column index must appear exactly once."
        )
        return [{"role": "system", "content": system},
                {"role": "user", "content": user}]

    # -- HTTP transport (stdlib) --
    def _http(self, messages: list) -> str:
        """POST the messages to ``{base_url}/chat/completions``.

        Returns the first choice's message content. Network, HTTP and
        response-shape errors propagate to the caller.
        """
        payload = {
            "model": self.model,
            "messages": messages,
            "temperature": self.temperature,
            "response_format": {"type": "json_object"},
        }
        req = urllib.request.Request(
            f"{self.base_url}/chat/completions",
            data=json.dumps(payload).encode("utf-8"),
            headers={
                "Content-Type": "application/json",
                "Authorization": f"Bearer {self.api_key}",
            },
            method="POST",
        )
        with urllib.request.urlopen(req, timeout=self.timeout) as resp:
            body = json.loads(resp.read().decode("utf-8"))
        return body["choices"][0]["message"]["content"]

    # -- parse + validate --
    @staticmethod
    def _parse(text: str, ncols: int, allowed_fields: list[str]) -> dict:
        """Turn the model's reply into a validated ``{col_index: field}`` dict.

        The outermost ``{...}`` span of *text* is decoded as JSON (the whole
        text when no braces are found). Entries are dropped when the key is
        not an integer in ``[0, ncols)`` or the value is not a string listed
        in *allowed_fields*. Each field is assigned at most once: the first
        entry in the reply wins.

        Returns an empty dict when the decoded JSON is not an object.

        Raises:
            json.JSONDecodeError: the reply is not valid JSON.
            TypeError: *text* is not a string.
        """
        m = re.search(r"\{.*\}", text, re.S)
        raw = json.loads(m.group(0) if m else text)
        # A list or scalar reply has no .items(); treat it as "no mapping"
        # rather than letting AttributeError escape __call__.
        if not isinstance(raw, dict):
            return {}

        allowed = set(allowed_fields)
        # single-slot fields: keep only the first (highest-priority) assignment
        result: dict[int, str] = {}
        seen: set[str] = set()
        for k, v in raw.items():
            try:
                ci = int(k)
            except (ValueError, TypeError):
                continue
            if not (0 <= ci < ncols):
                continue
            # isinstance guard keeps unhashable values (lists, dicts) away
            # from the set membership tests.
            if isinstance(v, str) and v in allowed and v not in seen:
                result[ci] = v
                seen.add(v)
        return result

    def __call__(self, header_row: list, data_rows: list[list],
                 allowed_fields: list[str]) -> dict:
        """Return {col_index: field} for the header. Empty dict on any failure
        (caller then leaves those columns unmapped -> needs_review)."""
        profiles = profile_columns(header_row, data_rows)
        messages = self._build_messages(profiles, allowed_fields)
        try:
            text = self._transport(messages) if self._transport else self._http(messages)
        except Exception:  # noqa: BLE001 — network/parse errors must not crash the pipeline
            return {}
        try:
            return self._parse(text, len(header_row), allowed_fields)
        except (json.JSONDecodeError, ValueError, TypeError):
            return {}
tabularmapper/api.py ADDED
@@ -0,0 +1,186 @@
1
+ """
2
+ api.py — drop-in FastAPI router for tabularmapper.
3
+
4
+ Two ways to use it from your existing backend:
5
+
6
+ A) Mount the router on your app (prefix defaults to /mapper):
7
+
8
+ from fastapi import FastAPI
9
+ from tabularmapper.api import router, lifespan
10
+ app = FastAPI(lifespan=lifespan) # builds cache + matcher once
11
+ app.include_router(router)
12
+ # -> POST /mapper/map , GET /mapper/health
13
+
14
+ Custom prefix: `make_router("/catalog")`, or set TABULARMAPPER_ROUTE_PREFIX.
15
+
16
+ B) Run it standalone:
17
+
18
+ uvicorn tabularmapper.api:app --reload
19
+
20
+ Design notes:
21
+ * The MappingCache and the (optional) AI matcher are built ONCE in `lifespan`
22
+ and reused across requests — not per call.
23
+ * `process_file` is synchronous (openpyxl + a possible blocking LLM HTTP call),
24
+ so it runs in a threadpool to avoid blocking the event loop.
25
+ * If OPENAI_API_KEY is unset, the AI matcher is simply off: known banks still
26
+ map deterministically; unknown ones come back with needs_review=True.
27
+ """
28
+
29
+ from __future__ import annotations
30
+
31
+ import os
32
+ from contextlib import asynccontextmanager
33
+ from typing import Any, Optional
34
+
35
+ from fastapi import APIRouter, FastAPI, File, HTTPException, UploadFile
36
+ from fastapi.concurrency import run_in_threadpool
37
+ from pydantic import BaseModel
38
+
39
+ from . import engine # imported as a module so OUTPUT_SCHEMA is read
40
+ from .engine import process_stream # dynamically (after configure), never a stale copy
41
+ from .mapping_cache import MappingCache
42
+
43
+
44
+ # --------------------------------------------------------------------------
45
+ # Shared singletons (built once at startup)
46
+ # --------------------------------------------------------------------------
47
def build_matcher():
    """Create the AI table matcher, or return None when no API key is set.

    With OPENAI_API_KEY unset or empty the service stays in
    deterministic-only mode.
    """
    api_key = os.environ.get("OPENAI_API_KEY")
    if api_key:
        # Imported here so the matcher module is only loaded when AI is on.
        from .ai_matcher import OpenAICompatibleMatcher

        # Field descriptions are read from the engine's active config.
        descriptions = engine._ACTIVE_CONFIG.field_descriptions
        return OpenAICompatibleMatcher(field_defs=descriptions)
    return None
56
+
57
+
58
def build_learn_store():
    """Create the self-learning vocabulary store with default arguments.

    NOTE(review): the original note says the store location comes from
    TABULARMAPPER_LEARN_STORE; that lookup happens inside LearnStore, not here.
    """
    # Imported at call time rather than at module import.
    from .learn import LearnStore

    store = LearnStore()
    return store
62
+
63
+
64
class _State:
    # Holder for the shared singletons. Every slot starts as None and is
    # assigned by `lifespan` at application startup.
    cache: Optional[MappingCache] = None  # MappingCache instance
    learn: Any = None                     # learn store from build_learn_store()
    matcher: Any = None                   # AI matcher, or None when AI is off


# Single module-level instance read by the endpoint handlers.
state = _State()
71
+
72
+
73
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Build the shared cache, matcher and learn store once at startup.

    Nothing is torn down on shutdown.
    """
    # TABULARMAPPER_CONFIG is handed to engine.configure as-is, and only when
    # set: a configuration applied manually before startup is left untouched.
    config_source = os.getenv("TABULARMAPPER_CONFIG")
    if config_source:
        engine.configure(config_source)

    state.cache = MappingCache()
    state.matcher = build_matcher()

    learn_store = build_learn_store()
    state.learn = learn_store
    # Activate whatever the store has already learned.
    engine.apply_learned(learn_store)

    yield
88
+
89
+
90
+ # --------------------------------------------------------------------------
91
+ # Response schema
92
+ # --------------------------------------------------------------------------
93
class ColumnMapOut(BaseModel):
    # Response model for one column-mapping entry. map_statement fills each
    # attribute from the same-named attribute of an item in res.column_maps.
    col_index: int
    raw_header: str
    field: Optional[str]
    confidence: int
    method: str
99
+
100
+
101
class MapResponse(BaseModel):
    # Response body of the /map endpoint, populated by map_statement.
    header_index: int          # res.header_index
    needs_review: bool         # res.needs_review
    review_reasons: list[str]  # res.review_reasons
    schema_columns: list[str]  # second item of each engine.OUTPUT_SCHEMA pair
    columns: list[ColumnMapOut]  # one entry per item in res.column_maps
    transactions: list[dict]   # res.records
108
+
109
+
110
+ # --------------------------------------------------------------------------
111
+ # Endpoint handlers (plain functions so the router prefix can be configured)
112
+ # --------------------------------------------------------------------------
113
async def health() -> dict:
    """Report service status and whether an AI matcher is configured."""
    ai_enabled = state.matcher is not None
    return {"status": "ok", "ai_enabled": ai_enabled}
115
+
116
+
117
async def map_statement(file: UploadFile = File(...)) -> MapResponse:
    """Upload a spreadsheet (.xlsx); get the standardized mapping + rows.

    The upload is read fully into memory and handed to ``process_stream`` in
    a worker thread, together with the shared matcher, cache and learn store.

    Raises:
        HTTPException: 400 when the filename does not end in .xlsx/.xls;
            422 when processing raises (the original exception is chained
            as ``__cause__`` and its text is included in the detail).
    """
    name = (file.filename or "").lower()
    if not name.endswith((".xlsx", ".xls")):
        raise HTTPException(status_code=400, detail="expected an .xlsx/.xls file")

    data = await file.read()  # raw bytes, parsed in memory
    try:
        # blocking work -> threadpool; process_stream reads straight from bytes
        res = await run_in_threadpool(
            process_stream, data,
            table_matcher=state.matcher, cache=state.cache,
            learn_store=state.learn,
            source_label=file.filename or "<upload>",
        )
    except Exception as exc:  # noqa: BLE001
        # Chain the cause so the real traceback is visible in server logs.
        raise HTTPException(
            status_code=422, detail=f"could not process file: {exc}"
        ) from exc

    columns = [
        ColumnMapOut(
            col_index=m.col_index,
            raw_header=m.raw_header,
            field=m.field,
            confidence=m.confidence,
            method=m.method,
        )
        for m in res.column_maps
    ]
    return MapResponse(
        header_index=res.header_index,
        needs_review=res.needs_review,
        review_reasons=res.review_reasons,
        # Read through the module so a later configure() is reflected.
        schema_columns=[disp for _, disp in engine.OUTPUT_SCHEMA],
        columns=columns,
        transactions=res.records,
    )
146
+
147
+
148
async def learn_pending() -> dict:
    """Return the learn store's pending entries and its stats.

    Both store calls run in the threadpool, matching how approve/reject call
    the store, so store access does not run on the event loop.
    """
    pending = await run_in_threadpool(state.learn.pending)
    stats = await run_in_threadpool(state.learn.stats)
    return {"pending": pending, "stats": stats}
150
+
151
+
152
async def learn_approve(phrase: str, field: Optional[str] = None) -> dict:
    """Approve a learned phrase and, on success, activate it immediately.

    Returns ``{"approved": <store result>, "stats": <store stats>}``. Store
    calls run in the threadpool so they do not run on the event loop.
    """
    ok = await run_in_threadpool(state.learn.approve, phrase, field)
    if ok:
        # NOTE(review): apply_learned presumably reads the store too --
        # confirm whether it should also be offloaded.
        engine.apply_learned(state.learn)  # activate immediately
    stats = await run_in_threadpool(state.learn.stats)
    return {"approved": ok, "stats": stats}
157
+
158
+
159
async def learn_reject(phrase: str, field: Optional[str] = None) -> dict:
    """Reject a learned phrase.

    Returns ``{"rejected": <store result>, "stats": <store stats>}``. Store
    calls run in the threadpool so they do not run on the event loop.
    """
    ok = await run_in_threadpool(state.learn.reject, phrase, field)
    stats = await run_in_threadpool(state.learn.stats)
    return {"rejected": ok, "stats": stats}
162
+
163
+
164
+ # --------------------------------------------------------------------------
165
+ # Router factory — the prefix is configurable (default "/mapper", or the env
166
+ # var TABULARMAPPER_ROUTE_PREFIX). This is a general table->schema mapper, so the
167
+ # route name isn't bank-specific and you can set your own.
168
+ # --------------------------------------------------------------------------
169
def _normalize_prefix(prefix: str) -> str:
    """Return *prefix* as '' or '/segment...' with no trailing '/'."""
    cleaned = prefix.strip().rstrip("/")
    if cleaned and not cleaned.startswith("/"):
        cleaned = "/" + cleaned
    return cleaned


def make_router(prefix: Optional[str] = None, tags: Optional[list] = None) -> APIRouter:
    """Build an APIRouter exposing the mapper endpoints under *prefix*.

    Args:
        prefix: Route prefix. When None, TABULARMAPPER_ROUTE_PREFIX is used,
            defaulting to "/mapper". A missing leading "/" is added and
            trailing "/" removed, so a value such as "mapper" or "/mapper/"
            no longer fails when the router is constructed.
        tags: OpenAPI tags; defaults to ["mapper"].

    Routes: GET /health, POST /map, GET /learn/pending, POST /learn/approve,
    POST /learn/reject.
    """
    if prefix is None:
        prefix = os.getenv("TABULARMAPPER_ROUTE_PREFIX", "/mapper")
    r = APIRouter(prefix=_normalize_prefix(prefix), tags=tags or ["mapper"])
    r.add_api_route("/health", health, methods=["GET"])
    r.add_api_route("/map", map_statement, methods=["POST"], response_model=MapResponse)
    r.add_api_route("/learn/pending", learn_pending, methods=["GET"])
    r.add_api_route("/learn/approve", learn_approve, methods=["POST"])
    r.add_api_route("/learn/reject", learn_reject, methods=["POST"])
    return r
179
+
180
+
181
+ # Default router instance -> /mapper/* (or TABULARMAPPER_ROUTE_PREFIX)
182
router = make_router()

# Standalone ASGI application (run with: uvicorn tabularmapper.api:app)
app = FastAPI(lifespan=lifespan, title="Tabular Mapper")
app.include_router(router)