tabularmapper 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
tabularmapper/learn.py ADDED
@@ -0,0 +1,203 @@
1
+ """
2
+ learn.py — self-learning synonym vocabulary.
3
+
4
+ When the AI (or a human) confirms that a header maps to a field, that phrase is
5
+ recorded here. Next time any bank uses that header it's a deterministic EXACT
6
+ match — the AI never fires for it again, and nobody edits code. Over time the
7
+ AI-call rate trends to zero.
8
+
9
+ Two halves, cleanly split:
10
+ * CONFIG (schema.py / config.json) = seed vocabulary, read-only, from S3/URL.
11
+ * LEARNED (this store) = mutable, grows from real traffic.
12
+ Effective synonyms at match time = seed + learned (seed wins on conflict).
13
+
14
+ Storage uses the same URL convention as the cache (stores.open_store):
15
+ LearnStore() # env TABULARMAPPER_LEARN_STORE, else in-memory (memory://)
16
+ LearnStore("redis://localhost:6379/0")
17
+ LearnStore("memory://") # tests
18
+
19
+ Trust policy (financial-data safe by default):
20
+ * date / description / reference / balance / amount -> auto-applied.
21
+ * debit / credit -> held in `pending` for a
22
+ human to approve(), because a wrong debit/credit direction is the one costly
23
+ error. Set auto_apply_gated=True to skip review (fully unattended).
24
+ """
25
+
26
+ from __future__ import annotations
27
+
28
+ import os
29
+ import re
30
+ import time
31
+ from typing import Optional
32
+
33
+ from .stores import open_store
34
+
35
# In-memory by default — creates NO files. Set TABULARMAPPER_LEARN_STORE (or pass a
# URL) to a path / redis:// / valkey:// / postgresql:// for persistence.
_DEFAULT_URL = "memory://"
# Key under which LearnStore reads and writes its single record in the backing store.
_KEY = "learned"
# Fields held for human review by default when proposed by an "ai" or "harvest" source.
_DEFAULT_GATED = frozenset({"debit", "credit"})
40
+
41
+
42
def _norm(s) -> str:
    """Canonicalise a header phrase for comparison.

    The value is converted with ``str()``, trimmed, lower-cased, and every run
    of whitespace is collapsed to a single space. ``None`` yields ``""``.
    """
    if s is None:
        return ""
    lowered = str(s).strip().lower()
    return re.sub(r"\s+", " ", lowered)
44
+
45
+
46
def _now() -> str:
    """Return the current UTC time formatted as ``YYYY-MM-DDTHH:MM:SSZ``."""
    utc_struct = time.gmtime()
    return time.strftime("%Y-%m-%dT%H:%M:%SZ", utc_struct)
48
+
49
+
50
class LearnStore:
    """Mutable store of learned header->field synonyms with a review queue.

    The whole vocabulary lives in one record (stored under ``_KEY``) with three
    parts:

    * ``fields``    — applied synonyms, ``{field: [entry, ...]}``.
    * ``pending``   — gated proposals waiting for ``approve()`` / ``reject()``.
    * ``conflicts`` — proposals that disagreed with an already-applied phrase.

    Args:
        source: Store URL. Falls back to the ``TABULARMAPPER_LEARN_STORE``
            environment variable, then to ``_DEFAULT_URL``.
        gated_fields: Fields whose "ai"/"harvest" proposals are queued for
            review instead of being applied.
        auto_apply_gated: When true, gated fields are applied immediately.
    """

    def __init__(self, source: Optional[str] = None, *,
                 gated_fields=_DEFAULT_GATED, auto_apply_gated: bool = False):
        url = source or os.getenv("TABULARMAPPER_LEARN_STORE") or _DEFAULT_URL
        self.url = url
        self._store = open_store(url)
        # Copy into a plain set so callers may pass any iterable of names.
        self.gated_fields = set(gated_fields)
        self.auto_apply_gated = auto_apply_gated

    # -- persistence (single provenance-rich record) --
    def _load(self) -> dict:
        """Return the stored record, or a fresh empty one if nothing is stored."""
        return self._store.get(_KEY) or {
            "version": 1, "fields": {}, "pending": [], "conflicts": []}

    def _save(self, blob: dict) -> None:
        """Write the whole record back to the backing store."""
        self._store.put(_KEY, blob)

    @staticmethod
    def _field_of(blob: dict, phrase: str) -> Optional[str]:
        """Return the field `phrase` is already applied to, or None."""
        for fld, entries in blob["fields"].items():
            if any(e["phrase"] == phrase for e in entries):
                return fld
        return None

    # -- read views --
    def synonyms(self) -> dict:
        """Applied learned synonyms as {field: [phrases]} (for matching)."""
        blob = self._load()
        return {f: [e["phrase"] for e in es] for f, es in blob["fields"].items() if es}

    def pending(self) -> list:
        """Return the entries currently waiting for review."""
        return self._load()["pending"]

    def conflicts(self) -> list:
        """Return the recorded conflicts."""
        return self._load()["conflicts"]

    def stats(self) -> dict:
        """Return counts of applied, pending and conflicting entries."""
        b = self._load()
        return {"applied": sum(len(v) for v in b["fields"].values()),
                "pending": len(b["pending"]), "conflicts": len(b["conflicts"])}

    # -- write path --
    def add(self, header: str, field: str, *, source: str = "ai",
            bank: Optional[str] = None) -> str:
        """Record a header->field mapping. Returns one of:
        'learned' | 'pending' | 'exists' | 'conflict' | 'skip'.

        * 'skip'     — the normalised header or the field is empty.
        * 'exists'   — the phrase is already applied to this field.
        * 'conflict' — the phrase is applied to a different field; the proposal
          is logged in ``conflicts`` and nothing else changes.
        * 'pending'  — the field is gated, the source is "ai" or "harvest" and
          ``auto_apply_gated`` is off; the entry is queued once per phrase.
        * 'learned'  — the entry was applied.
        """
        phrase = _norm(header)
        if not phrase or not field:
            return "skip"
        blob = self._load()
        existing = self._field_of(blob, phrase)
        if existing == field:
            return "exists"
        if existing is not None:  # phrase already means something else
            blob["conflicts"].append({
                "phrase": phrase, "existing": existing, "proposed": field,
                "source": source, "ts": _now()})
            self._save(blob)
            return "conflict"
        entry = {"phrase": phrase, "field": field, "source": source,
                 "bank": bank, "ts": _now()}
        gated = (field in self.gated_fields and source in ("ai", "harvest")
                 and not self.auto_apply_gated)
        if gated:
            # Queue at most one pending entry per phrase.
            if not any(p["phrase"] == phrase for p in blob["pending"]):
                blob["pending"].append(entry)
                self._save(blob)
            return "pending"
        blob["fields"].setdefault(field, []).append(entry)
        self._save(blob)
        return "learned"

    # -- human review of gated entries --
    def approve(self, phrase: str, field: Optional[str] = None) -> bool:
        """Resolve pending entries for `phrase` (optionally only for `field`).

        Each matching entry leaves the queue. It is applied with
        ``source="human"`` unless the phrase was applied in the meantime:

        * already applied to the same field — nothing is appended (no duplicate);
        * already applied to a different field — the proposal is logged in
          ``conflicts`` and is NOT applied, so a phrase never ends up under
          two fields.

        Returns True if at least one matching entry is now applied to its
        field, False otherwise. The store is written only if something matched.
        """
        phrase = _norm(phrase)
        blob = self._load()
        keep, moved, touched = [], False, False
        for p in blob["pending"]:
            if not (p["phrase"] == phrase and (field is None or p["field"] == field)):
                keep.append(p)
                continue
            touched = True
            # Re-check on every entry: an earlier iteration may have applied it.
            applied = self._field_of(blob, phrase)
            if applied is None:
                blob["fields"].setdefault(p["field"], []).append(
                    dict(p, source="human", ts=_now()))
                moved = True
            elif applied == p["field"]:
                moved = True  # mapping already in effect; just drop the duplicate
            else:
                blob["conflicts"].append({
                    "phrase": phrase, "existing": applied,
                    "proposed": p["field"], "source": "human", "ts": _now()})
        if touched:
            blob["pending"] = keep
            self._save(blob)
        return moved

    def reject(self, phrase: str, field: Optional[str] = None) -> bool:
        """Drop pending entries for `phrase` (optionally only for `field`).

        Returns True if anything was removed. The store is written only then.
        """
        phrase = _norm(phrase)
        blob = self._load()
        remaining = [p for p in blob["pending"]
                     if not (p["phrase"] == phrase and
                             (field is None or p["field"] == field))]
        removed = len(remaining) < len(blob["pending"])
        if removed:
            blob["pending"] = remaining
            self._save(blob)
        return removed

    def close(self) -> None:
        """Close the backing store."""
        self._store.close()
150
+
151
+
152
def learn_from_result(result, store: LearnStore, *, min_confidence: int = 85,
                      methods=("ai",), source: str = "ai",
                      bank: Optional[str] = None) -> dict:
    """Teach `store` the confident, non-exact column mappings in `result`.

    A column map is forwarded to ``store.add`` only when it has a field, its
    method is not "exact", its method is listed in `methods`, and its
    confidence is not below `min_confidence`.

    Returns a dict keyed by the outcome ``store.add`` reported ('learned',
    'pending', 'exists', 'conflict', 'skip'), each holding a list of
    ``(raw_header, field)`` tuples.
    """
    outcome_names = ("learned", "pending", "exists", "conflict", "skip")
    summary: dict[str, list] = {name: [] for name in outcome_names}
    for col in result.column_maps:
        eligible = (bool(col.field) and col.method != "exact"
                    and col.method in methods)
        if eligible and not (col.confidence < min_confidence):
            verdict = store.add(col.raw_header, col.field,
                                source=source, bank=bank)
            summary[verdict].append((col.raw_header, col.field))
    return summary
167
+
168
+
169
def harvest_folder(folder: str, store: LearnStore, *,
                   table_matcher=None, min_confidence: int = 85,
                   methods=("ai", "fuzzy"), recursive: bool = False) -> dict:
    """Bootstrap the vocabulary from a folder of past statements.

    Runs the mapper over every .xlsx in `folder`, and teaches the store each
    confident header->field pair that the seed synonyms didn't already resolve
    exactly (fuzzy + AI matches). Gated fields (debit/credit) land in the pending
    queue for a quick one-time review. Returns a report.

    Pass a `table_matcher` (OpenAICompatibleMatcher) to also resolve headers that
    fuzzy can't place; omit it to harvest deterministically only.

    Args:
        folder: Directory to scan. The name is taken literally: glob
            metacharacters in it (``[``, ``]``, ``*``, ``?``) are escaped.
        store: Store that receives the mappings.
        table_matcher: Passed through to ``process_file``.
        min_confidence: Minimum confidence for a mapping to be taught.
        methods: Match methods eligible for learning.
        recursive: Also scan sub-directories.

    Returns:
        A dict with ``files`` (number processed), one list per outcome holding
        ``(file_name, raw_header, field)`` tuples, ``errors`` holding
        ``(file_name, message)`` tuples, and ``stats`` from ``store.stats()``.
    """
    import glob
    from .engine import process_file

    # Escape the directory part so a folder such as "statements [2024]" is not
    # interpreted as a character class; only the "*.xlsx" tail stays a pattern.
    root = glob.escape(os.fspath(folder))
    pattern = os.path.join(root, "**", "*.xlsx") if recursive \
        else os.path.join(root, "*.xlsx")
    report: dict = {"files": 0, "learned": [], "pending": [],
                    "exists": [], "conflict": [], "skip": [], "errors": []}
    for path in sorted(glob.glob(pattern, recursive=recursive)):
        file_name = os.path.basename(path)
        # The file stem is recorded as the `bank` provenance of each entry.
        bank = os.path.splitext(file_name)[0]
        try:
            res = process_file(path, table_matcher=table_matcher)
        except Exception as exc:  # noqa: BLE001 — a bad file shouldn't abort the batch
            report["errors"].append((file_name, str(exc)))
            continue
        report["files"] += 1
        summ = learn_from_result(res, store, min_confidence=min_confidence,
                                 methods=methods, source="harvest", bank=bank)
        for outcome, pairs in summ.items():
            report.setdefault(outcome, []).extend(
                (file_name, h, f) for h, f in pairs)
    report["stats"] = store.stats()
    return report
@@ -0,0 +1,118 @@
1
+ """
2
+ llm_fallback.py — pluggable fallback adapters for map_columns().
3
+
4
+ Interface (matches engine.map_columns' `llm_fallback` param):
5
+
6
+ fallback(header: str, samples: list[str], allowed_fields: list[str]) -> str | None
7
+
8
+ Contract: the callable receives ONLY a header string, up to 3 sample cell
9
+ strings, and the list of allowed field keys. It never sees full transaction
10
+ rows. It returns one of `allowed_fields` or None. This keeps bank data local
11
+ and auditable.
12
+
13
+ Adapters here are the PER-COLUMN, offline degraded path:
14
+ * HashingEmbeddingFallback — zero-dependency char-ngram cosine. Lexical only
15
+ (weak), but runs fully air-gapped with no model file
16
+ and no API. A last resort when the AI matcher is off
17
+ or unreachable.
18
+ * make_llm_fallback — wrap any 'text -> text' model into the per-column
19
+ interface (OFF by default).
20
+
21
+ For the primary, high-accuracy path — a real LLM that reads the whole header row
22
+ plus structural profiles and returns a full mapping — see `ai_matcher.py`
23
+ (OpenAICompatibleMatcher). That is the recommended way to auto-map new banks.
24
+
25
+ None of these fire unless the deterministic exact+fuzzy matcher fails a header.
26
+ """
27
+
28
+ from __future__ import annotations
29
+
30
+ import math
31
+ import re
32
+ from collections import Counter
33
+ from typing import Callable, Optional
34
+
35
# Human-readable descriptions per field. The embedding models compare the raw
# header against these, so richer phrasing -> better cosine separation.
# Keys are the field names; HashingEmbeddingFallback uses this dict when no
# `field_descriptions` argument is supplied.
FIELD_DESCRIPTIONS: dict[str, str] = {
    "date": "date of the transaction when it was posted or valued",
    "description": "narration particulars description details remarks of the transaction",
    "reference": "reference number cheque number transaction id utr instrument number code",
    "debit": "debit withdrawal money going out paid out outgoing spent amount reducing balance",
    "credit": "credit deposit money coming in paid in incoming received income amount increasing balance",
    "balance": "account balance remaining after the transaction closing running available balance",
    "amount": "single signed transaction amount positive or negative value",
}
46
+
47
+
48
+ # --------------------------------------------------------------------------
49
+ # 2) Zero-dependency offline fallback (no model download)
50
+ # --------------------------------------------------------------------------
51
class HashingEmbeddingFallback:
    """Offline per-column fallback based on character n-gram cosine similarity.

    Purely lexical: a header (plus its samples) is turned into a bag of word
    tokens and character n-grams and compared against one bag per field
    description. Needs no model file and no network access.

    Args:
        min_similarity: Lowest cosine score that still counts as a match.
        ngram: Length of the character n-grams.
        field_descriptions: ``{field: description text}``; defaults to
            ``FIELD_DESCRIPTIONS`` when omitted or empty.
    """

    def __init__(self, min_similarity: float = 0.18, ngram: int = 3,
                 field_descriptions: Optional[dict] = None):
        self.min_similarity = min_similarity
        self.ngram = ngram
        self.field_descriptions = field_descriptions or FIELD_DESCRIPTIONS
        # Pre-compute one vector per described field.
        vectors = {}
        for name, text in self.field_descriptions.items():
            vectors[name] = self._vec(text)
        self._field_vecs = vectors

    def _vec(self, text: str) -> Counter:
        """Build the bag of word tokens and space-padded character n-grams."""
        counts: Counter = Counter()
        width = self.ngram
        cleaned = re.sub(r"[^a-z0-9]", " ", text.lower())
        for word in cleaned.split():
            counts[word] += 1  # the whole word is a feature as well
            framed = " " + word + " "
            counts.update(framed[start:start + width]
                          for start in range(len(framed) - width + 1))
        return counts

    @staticmethod
    def _cos(a: Counter, b: Counter) -> float:
        """Cosine similarity of two count vectors; 0.0 if either is empty."""
        norm_a = math.sqrt(sum(c * c for c in a.values()))
        norm_b = math.sqrt(sum(c * c for c in b.values()))
        if not (norm_a and norm_b):
            return 0.0
        dot = sum(a[key] * b[key] for key in set(a) & set(b))
        return dot / (norm_a * norm_b)

    def __call__(self, header: str, samples: list[str],
                 allowed_fields: list[str]) -> Optional[str]:
        """Return the best-scoring allowed field, or None below the threshold.

        On equal scores the field listed first in `allowed_fields` wins.
        """
        query = self._vec(header + " " + " ".join(samples))
        winner, top = None, -1.0
        for candidate in allowed_fields:
            # Fields without a description are compared by their own name.
            target = self._field_vecs.get(candidate) or self._vec(candidate)
            score = self._cos(query, target)
            if score > top:
                winner, top = candidate, score
        if top < self.min_similarity:
            return None
        return winner
95
+
96
+
97
+ # --------------------------------------------------------------------------
98
+ # Optional hosted small-model adapter (OFF by default)
99
+ # --------------------------------------------------------------------------
100
def make_llm_fallback(client_call: Callable[[str], str]) -> Callable:
    """Wrap any 'text -> text' small-model call into the fallback interface.

    `client_call(prompt)` should return a single field name. This never sends
    transaction rows — only the header + samples + allowed fields.

    The reply is matched against `allowed_fields` in two steps:

    1. The trimmed reply, with surrounding quotes/punctuation removed, is
       compared case-insensitively to each allowed field. This lets field
       names containing underscores, digits or upper-case letters match.
    2. Otherwise every character outside ``a-z`` is removed from the
       lower-cased reply and the result is looked up as-is, so a reply such
       as ``"1. debit"`` still resolves.

    Returns:
        A callable ``(header, samples, allowed_fields) -> field | None`` that
        returns one of `allowed_fields`, or None when the reply names none.
    """
    def _fallback(header: str, samples: list[str],
                  allowed_fields: list[str]) -> Optional[str]:
        prompt = (
            "You map one spreadsheet column header to exactly one field.\n"
            f"Allowed fields: {', '.join(allowed_fields)}\n"
            f"Header: {header!r}\n"
            f"Sample values: {samples}\n"
            "Reply with ONLY the single best field name, or 'none'."
        )
        reply = (client_call(prompt) or "").strip()

        # Step 1: case-insensitive match on the reply as written.
        by_lower: dict = {}
        for name in allowed_fields:
            by_lower.setdefault(name.lower(), name)  # first spelling wins
        trimmed = reply.strip(" \t\r\n\"'`.,;:!*").lower()
        if trimmed in by_lower:
            return by_lower[trimmed]

        # Step 2: letters-only reduction for decorated replies.
        letters = re.sub(r"[^a-z]", "", reply.lower())
        return letters if letters in allowed_fields else None
    return _fallback
@@ -0,0 +1,73 @@
1
+ """
2
+ mapping_cache.py — persistent {header_fingerprint: field_mapping} cache.
3
+
4
+ A repeat bank format skips detection/mapping entirely -> true 100% on seen
5
+ formats. The fingerprint is a hash of the normalized header cell strings, so
6
+ the same header layout always resolves to the same cached mapping regardless
7
+ of row content.
8
+
9
+ Storage is pluggable via a URL (see stores.open_store):
10
+ MappingCache() # env TABULARMAPPER_CACHE, else in-memory default (memory://)
11
+ MappingCache("sqlite:///mapping_cache.db") # file, no server, concurrency-safe
12
+ MappingCache("redis://localhost:6379/0") # multi-worker
13
+ MappingCache("memory://") # tests
14
+ MappingCache(path="legacy.json") # legacy JSON file (back-compat)
15
+ """
16
+
17
+ from __future__ import annotations
18
+
19
+ import hashlib
20
+ import os
21
+ import re
22
+ from typing import Optional
23
+
24
+ from .engine import ColumnMap
25
+ from .stores import open_store
26
+
27
# In-memory by default — creates NO files. Persistence is opt-in: set
# TABULARMAPPER_CACHE (or pass a URL) to a path / redis:// / valkey:// /
# postgresql://. In-memory still caches within a process (lost on restart).
# Used by MappingCache when no source, no legacy path and no env var is given.
_DEFAULT_URL = "memory://"
31
+
32
+
33
def _fingerprint(header: list, namespace: str = "") -> str:
    """Return a SHA-1 hex digest identifying a header layout.

    Each cell is normalised (``None`` -> ``""``; otherwise ``str()``, trimmed,
    lower-cased, whitespace runs collapsed to one space) and the cells are
    joined with ``|``. Backslashes and pipes inside a cell are escaped first,
    so ``["a|b"]`` and ``["a", "b"]`` no longer share a fingerprint. Headers
    containing neither character hash exactly as before.

    Args:
        header: The header row's cell values.
        namespace: Prefix scoping the key to the active schema.
    """
    parts = []
    for c in header:
        s = "" if c is None else re.sub(r"\s+", " ", str(c).strip().lower())
        # Escape the escape character first, then the separator.
        parts.append(s.replace("\\", "\\\\").replace("|", "\\|"))
    # `namespace` scopes the key to the active schema, so a config change (e.g.
    # adding a field) does NOT return a stale mapping for the same header.
    raw = namespace + "\x00" + "|".join(parts)
    return hashlib.sha1(raw.encode("utf-8")).hexdigest()
42
+
43
+
44
class MappingCache:
    """Cache of column mappings keyed by a header fingerprint.

    Args:
        source: Store URL passed to ``open_store``.
        path: Legacy keyword, used when `source` is not given.
    """

    def __init__(self, source: Optional[str] = None, *, path: Optional[str] = None):
        # precedence: explicit source > legacy path kwarg > env > in-memory default
        chosen = source or path
        if not chosen:
            chosen = os.getenv("TABULARMAPPER_CACHE") or _DEFAULT_URL
        self.url = chosen
        self._store = open_store(chosen)

    def get(self, header: list, namespace: str = "") -> Optional[list[ColumnMap]]:
        """Return the cached column maps for `header`, or None on a miss.

        Every returned ColumnMap carries the method "cache" instead of the
        method that was stored.
        """
        key = _fingerprint(header, namespace)
        cached = self._store.get(key)
        if cached:
            return [
                ColumnMap(col["col_index"], col["raw_header"], col["field"],
                          col["confidence"], "cache")
                for col in cached["columns"]
            ]
        return None

    def put(self, header: list, col_maps: list[ColumnMap],
            namespace: str = "") -> None:
        """Store `col_maps` under the fingerprint of `header`."""
        key = _fingerprint(header, namespace)
        preview = ["" if cell is None else str(cell) for cell in header]
        columns = [
            {"col_index": cm.col_index, "raw_header": cm.raw_header,
             "field": cm.field, "confidence": cm.confidence, "method": cm.method}
            for cm in col_maps
        ]
        self._store.put(key, {"header_preview": preview, "columns": columns})

    def close(self) -> None:
        """Close the backing store."""
        self._store.close()