bank-statement-mapper 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,66 @@
1
+ """
2
+ bank_statement_mapper — map any bank statement .xlsx to a standard schema.
3
+
4
+ Two-stage, auditable pipeline: deterministic header detection + synonym/fuzzy
5
+ column mapping, with an optional AI table matcher and a self-learning vocabulary.
6
+
7
+ Quick start:
8
+
9
+ from bank_statement_mapper import process_file, MappingCache
10
+ res = process_file("statement.xlsx", cache=MappingCache())
11
+ print(res.records) # list[dict], ready for JSON / DB
12
+
13
+ Heavier pieces are kept as submodules so importing this package stays light:
14
+ from bank_statement_mapper.ai_matcher import OpenAICompatibleMatcher
15
+ from bank_statement_mapper.bank_mapper_api import router # needs [api] extra
16
+ """
17
+
18
+ from .bank_mapper import (
19
+ ALLOWED_FIELDS,
20
+ OUTPUT_SCHEMA,
21
+ ColumnMap,
22
+ OutputResult,
23
+ ProcessResult,
24
+ apply_learned,
25
+ configure,
26
+ detect_header_row,
27
+ map_columns,
28
+ normalize_amount,
29
+ normalize_date,
30
+ process_file,
31
+ process_stream,
32
+ records_to_csv_bytes,
33
+ )
34
+ from .learn import LearnStore, harvest_folder, learn_from_result
35
+ from .mapping_cache import MappingCache
36
+ from .schema import Config, config_from_dict, default_config, load_config
37
+ from .stores import open_store
38
+
39
# Package version string; it is also listed in __all__ below.
__version__ = "0.1.0"

# Public API of the package: the names exported by
# `from bank_statement_mapper import *`. Every entry is either imported from a
# submodule earlier in this module or defined here (__version__).
__all__ = [
    "process_file",
    "process_stream",
    "records_to_csv_bytes",
    "configure",
    "apply_learned",
    "MappingCache",
    "LearnStore",
    "learn_from_result",
    "harvest_folder",
    "load_config",
    "config_from_dict",
    "default_config",
    "Config",
    "open_store",
    "ProcessResult",
    "ColumnMap",
    "OutputResult",
    "OUTPUT_SCHEMA",
    "ALLOWED_FIELDS",
    "detect_header_row",
    "map_columns",
    "normalize_amount",
    "normalize_date",
    "__version__",
]
@@ -0,0 +1,253 @@
1
+ """
2
+ ai_matcher.py — LLM-based, table-level column matcher for NEW bank layouts.
3
+
4
+ This is the high-accuracy path: when a statement's
5
+ header is unknown to the synonym table, one LLM call maps the whole header row
6
+ to the output fields and the result is written straight into mapping_cache.json,
7
+ so that bank is "known" forever after (never hits the LLM again).
8
+
9
+ PRIVACY — the model matches the TABLE, never the data
10
+ -----------------------------------------------------
11
+ The prompt contains ONLY:
12
+ * column header strings (e.g. "Withdrawals", "Value Dt")
13
+ * a structural profile per column computed locally (dtype, sign, fill-rate,
14
+ which columns are mutually exclusive) — this is metadata, NOT cell contents
15
+ * the list of allowed output fields + short descriptions
16
+
17
+ It NEVER contains transaction amounts, dates, names, narrations or references.
18
+ No real statement data leaves the machine. (You can opt into sending a couple of
19
+ sanitized sample values with include_samples=True, but it is OFF by default.)
20
+
21
+ Provider — OpenAI-compatible
22
+ ----------------------------
23
+ Works with any endpoint that speaks the OpenAI /chat/completions API: OpenAI,
24
+ Azure OpenAI, Together, Groq, or a local vLLM / Ollama / LM Studio server. Set
25
+ base_url + api_key + model. Uses only the Python standard library (urllib), so
26
+ there is no SDK dependency to install or pin.
27
+ """
28
+
29
+ from __future__ import annotations
30
+
31
+ import datetime as _dt
32
+ import json
33
+ import os
34
+ import re
35
+ import urllib.request
36
+ from typing import Callable, Optional
37
+
38
# Concise field definitions the LLM maps onto. Kept separate from the embedding
# descriptions because an instruct model wants crisp semantics, not keyword soup.
#
# Keys are output-field names; values are the one-line descriptions that
# OpenAICompatibleMatcher._build_messages prints next to each allowed field.
# The matcher uses this table when no `field_defs` argument is supplied, and
# falls back to the bare field name for any allowed field without an entry.
FIELD_DEFS: dict[str, str] = {
    "date": "the transaction date (post/value/booking date)",
    "description": "free-text narration / particulars / details of the transaction",
    "reference": "reference or cheque/UTR/instrument number identifying the entry",
    "debit": "money leaving the account (withdrawal / paid out); a debit-only column",
    "credit": "money entering the account (deposit / paid in); a credit-only column",
    "balance": "running account balance after the transaction",
    "amount": "a SINGLE signed amount column (one column, +credit / -debit)",
}
49
+
50
+
51
+ # --------------------------------------------------------------------------
52
+ # Structural profiling — deterministic, no cell contents leave this function
53
+ # --------------------------------------------------------------------------
54
+ def _classify(v) -> str:
55
+ if v is None or (isinstance(v, str) and v.strip() == ""):
56
+ return "empty"
57
+ if isinstance(v, (_dt.datetime, _dt.date)):
58
+ return "date"
59
+ if isinstance(v, bool):
60
+ return "text"
61
+ if isinstance(v, (int, float)):
62
+ return "number"
63
+ s = str(v).strip()
64
+ if re.match(r"^[-(]?[\d,]+\.?\d*\)?\s*(dr|cr)?$", s, re.I):
65
+ return "number"
66
+ if re.search(r"\d{1,4}[-/.]\d{1,2}[-/.]\d{1,4}", s) or \
67
+ re.match(r"\d{1,2}\s*[A-Za-z]{3,9}\s*\d{2,4}", s):
68
+ return "date"
69
+ return "text"
70
+
71
+
72
+ def _is_negative(v) -> bool:
73
+ if isinstance(v, (int, float)) and not isinstance(v, bool):
74
+ return v < 0
75
+ if isinstance(v, str):
76
+ s = v.strip().lower()
77
+ return s.startswith("-") or ("(" in s and ")" in s) or s.endswith("dr")
78
+ return False
79
+
80
+
81
def profile_columns(header_row: list, data_rows: list[list],
                    max_rows: int = 40) -> list[dict]:
    """Return a per-column STRUCTURAL profile — no raw cell values.

    Only the first ``max_rows`` data rows are inspected. Rows shorter than the
    header are treated as having empty cells in the missing positions.

    Args:
        header_row: Header cells; its length fixes the number of columns.
        data_rows: Data rows below the header, each a list of cell values.
        max_rows: Maximum number of data rows to sample.

    Returns:
        One dict per header column with the keys:
          * index — column position
          * name — stripped header text ("" when the header cell is None)
          * dtype — most frequent non-empty cell type ("empty" when the
            column has no filled cells); ties go to the type seen first
          * fill_rate — share of sampled rows with a non-empty cell, rounded
            to two decimals
          * has_negative — True when any numeric cell is negative
          * mutually_exclusive_with — indices of other non-empty columns that
            are never filled in the same row (debit/credit pairs)
    """
    ncols = len(header_row)
    rows = data_rows[:max_rows]
    filled = [[False] * ncols for _ in rows]
    dtypes: list[list[str]] = [[] for _ in range(ncols)]
    neg = [False] * ncols

    for r_i, row in enumerate(rows):
        for c in range(ncols):
            v = row[c] if c < len(row) else None
            t = _classify(v)
            if t == "empty":
                continue
            filled[r_i][c] = True
            dtypes[c].append(t)
            # Sign is only meaningful for amounts; narrations containing
            # parentheses or dash placeholders must not flag the column.
            if t == "number" and _is_negative(v):
                neg[c] = True

    # Filled-cell count per column, computed once and reused below.
    fill_counts = [sum(1 for r in filled if r[c]) for c in range(ncols)]

    profiles = []
    for c in range(ncols):
        types = dtypes[c]
        # dict.fromkeys keeps first-seen order, so ties resolve the same way
        # on every run (set iteration order varies with string hashing).
        majority = max(dict.fromkeys(types), key=types.count) if types else "empty"
        fill_rate = (fill_counts[c] / len(rows)) if rows else 0.0
        # mutual exclusivity: both columns have data, never in the same row
        excl = [
            d for d in range(ncols)
            if d != c
            and fill_counts[c]
            and fill_counts[d]
            and not any(r[c] and r[d] for r in filled)
        ]
        profiles.append({
            "index": c,
            "name": ("" if header_row[c] is None else str(header_row[c]).strip()),
            "dtype": majority,
            "fill_rate": round(fill_rate, 2),
            "has_negative": neg[c],
            "mutually_exclusive_with": excl,
        })
    return profiles
130
+
131
+
132
+ # --------------------------------------------------------------------------
133
+ # OpenAI-compatible table matcher
134
+ # --------------------------------------------------------------------------
135
class OpenAICompatibleMatcher:
    """Map an unknown header row to output fields with one LLM call.

    Transport is any OpenAI-compatible /chat/completions endpoint. Inject a
    custom `transport` (messages -> assistant_text) to unit-test without network.

    Args:
        base_url: API root; defaults to $OPENAI_BASE_URL, then
            "https://api.openai.com/v1". A trailing "/" is removed.
        api_key: Bearer token; defaults to $OPENAI_API_KEY, then "".
        model: Model name; defaults to $OPENAI_MODEL, then "gpt-4o-mini".
        field_defs: Mapping of field name -> description used in the prompt;
            defaults to the module-level FIELD_DEFS.
        include_samples: Stored on the instance.
            NOTE(review): no method of this class reads it, so the prompt
            never contains sample values regardless of this flag.
        timeout: Socket timeout in seconds for the HTTP request.
        temperature: Sampling temperature sent to the endpoint.
        transport: Optional callable replacing the built-in HTTP call.
    """

    def __init__(self,
                 base_url: Optional[str] = None,
                 api_key: Optional[str] = None,
                 model: Optional[str] = None,
                 field_defs: Optional[dict] = None,
                 include_samples: bool = False,
                 timeout: float = 30.0,
                 temperature: float = 0.0,
                 transport: Optional[Callable[[list], str]] = None):
        self.base_url = (base_url or os.getenv("OPENAI_BASE_URL")
                         or "https://api.openai.com/v1").rstrip("/")
        self.api_key = api_key or os.getenv("OPENAI_API_KEY", "")
        self.model = model or os.getenv("OPENAI_MODEL", "gpt-4o-mini")
        self.field_defs = field_defs or FIELD_DEFS
        self.include_samples = include_samples
        self.timeout = timeout
        self.temperature = temperature
        self._transport = transport  # for tests / custom clients

    # -- prompt construction (structure only) --
    def _build_messages(self, profiles: list[dict], allowed_fields: list[str]) -> list:
        """Build the system + user chat messages from column profiles.

        Args:
            profiles: Per-column dicts with the keys index, name, dtype,
                fill_rate, has_negative and mutually_exclusive_with.
            allowed_fields: Field names the model may assign.

        Returns:
            A two-element list: the system message and the user message.
        """
        field_lines = "\n".join(
            f" - {f}: {self.field_defs.get(f, f)}"
            for f in allowed_fields
        )
        col_lines = []
        for p in profiles:
            excl = (f", mutually-exclusive with columns {p['mutually_exclusive_with']}"
                    if p["mutually_exclusive_with"] else "")
            neg = ", contains negative values" if p["has_negative"] else ""
            col_lines.append(
                f" [{p['index']}] name={p['name']!r} "
                f"type={p['dtype']} fill={p['fill_rate']}{neg}{excl}"
            )
        cols = "\n".join(col_lines)
        system = (
            "You map bank-statement spreadsheet COLUMNS to a fixed schema. "
            "You are given only column headers and structural metadata (data "
            "types, fill rates, sign, and which columns are mutually exclusive) "
            "— never the actual transaction values. Use the header wording plus "
            "these structural hints. Two money columns that are mutually "
            "exclusive are almost always a debit/credit pair; decide direction "
            "from the header wording. A single signed money column (has negative "
            "values, not mutually exclusive with another money column) is "
            "'amount'. Respond with ONLY a JSON object mapping the column index "
            "(as a string) to one field name, or null if a column matches no "
            "field. Do not invent fields."
        )
        user = (
            f"Allowed fields:\n{field_lines}\n\n"
            f"Columns:\n{cols}\n\n"
            "Return JSON like {\"0\": \"date\", \"1\": \"description\", "
            "\"4\": null}. Every column index must appear exactly once."
        )
        return [{"role": "system", "content": system},
                {"role": "user", "content": user}]

    # -- HTTP transport (stdlib) --
    def _http(self, messages: list) -> str:
        """POST the messages to {base_url}/chat/completions and return the reply text.

        Raises whatever urllib / json raise on network, HTTP or decoding
        errors, and KeyError/IndexError when the response lacks the expected
        ``choices[0].message.content`` path; ``__call__`` absorbs these.
        """
        payload = {
            "model": self.model,
            "messages": messages,
            "temperature": self.temperature,
            "response_format": {"type": "json_object"},
        }
        headers = {"Content-Type": "application/json"}
        # Keyless local servers get no Authorization header rather than an
        # empty "Bearer " token.
        if self.api_key:
            headers["Authorization"] = f"Bearer {self.api_key}"
        req = urllib.request.Request(
            f"{self.base_url}/chat/completions",
            data=json.dumps(payload).encode("utf-8"),
            headers=headers,
            method="POST",
        )
        with urllib.request.urlopen(req, timeout=self.timeout) as resp:
            body = json.loads(resp.read().decode("utf-8"))
        return body["choices"][0]["message"]["content"]

    # -- parse + validate --
    @staticmethod
    def _parse(text: str, ncols: int, allowed_fields: list[str]) -> dict:
        """Turn the model's reply into a validated {column_index: field} dict.

        Args:
            text: Assistant reply; may wrap the JSON object in extra text.
            ncols: Number of header columns; indices outside [0, ncols) are dropped.
            allowed_fields: Field names that may appear as values.

        Returns:
            Mapping of int column index to field name. Non-integer keys,
            out-of-range indices, null/unknown values and repeated fields
            (only the first assignment of a field is kept) are dropped.
            A non-string reply or a reply whose JSON is not an object
            yields an empty dict.

        Raises:
            json.JSONDecodeError: If the reply contains no decodable JSON.
        """
        if not isinstance(text, str):
            return {}
        # Take everything between the first "{" and the last "}" so replies
        # wrapped in prose or code fences still parse.
        m = re.search(r"\{.*\}", text, re.S)
        raw = json.loads(m.group(0) if m else text)
        # Valid JSON that is not an object (list, null, number, string)
        # carries no mapping.
        if not isinstance(raw, dict):
            return {}
        # single-slot fields: keep only the first (highest-priority) assignment
        result: dict[int, str] = {}
        seen: set[str] = set()
        for k, v in raw.items():
            try:
                ci = int(k)
            except (ValueError, TypeError):
                continue
            if not (0 <= ci < ncols):
                continue
            if isinstance(v, str) and v in allowed_fields and v not in seen:
                result[ci] = v
                seen.add(v)
        return result

    def __call__(self, header_row: list, data_rows: list[list],
                 allowed_fields: list[str]) -> dict:
        """Return {col_index: field} for the header.

        Returns an empty dict when the transport call or the reply parsing
        fails (caller then leaves those columns unmapped -> needs_review).
        """
        profiles = profile_columns(header_row, data_rows)
        messages = self._build_messages(profiles, allowed_fields)
        try:
            text = self._transport(messages) if self._transport else self._http(messages)
        except Exception:  # noqa: BLE001 — network/parse errors must not crash the pipeline
            return {}
        try:
            return self._parse(text, len(header_row), allowed_fields)
        except (json.JSONDecodeError, ValueError, TypeError):
            return {}