tabularmapper 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,938 @@
1
+ """
2
+ engine.py — Bank Statement -> Standard Tabular Mapper (engine)
3
+
4
+ Two-stage, auditable pipeline:
5
+ Stage 1 detect_header_row() deterministic scoring (NO AI)
6
+ Stage 2 map_columns() exact synonym -> fuzzy -> optional llm/embedding fallback
7
+
8
+ Design invariants (see BUILD_PROMPT.md):
9
+ * No LLM/embedding model ever sees transaction rows. It only sees header
10
+ strings + <=3 sample cells per column. All row/date/amount work is
11
+ deterministic Python.
12
+ * Header detection is scoring, never a model call.
13
+ * Anything ambiguous is flagged needs_review instead of silently guessed.
14
+ * Every column decision carries a method (exact/fuzzy/llm/cache) + 0-100
15
+ confidence for human audit.
16
+ """
17
+
18
+
19
+ from __future__ import annotations
20
+
21
+ import base64
22
+ import csv
23
+ import datetime as _dt
24
+ import io
25
+ import json
26
+ import re
27
+ from dataclasses import dataclass, field
28
+ from typing import Callable, Literal, Optional, Union
29
+
30
+ from dateutil import parser as _dateparser
31
+ from rapidfuzz import fuzz
32
+
33
+ # --------------------------------------------------------------------------
34
+ # Active configuration (output template + synonyms + critical fields).
35
+ # Loaded from schema.py — by default the built-in constants (byte-identical to
36
+ # the previous hardcoded values), or from a JSON file / URL / S3 object / dict
37
+ # via env TABULARMAPPER_CONFIG or a call to configure().
38
+ #
39
+ # These module globals are kept for backward compatibility; everything reads
40
+ # them, and configure() swaps them atomically.
41
+ # --------------------------------------------------------------------------
42
+ from .schema import ( # noqa: E402
43
+ Config as _Config, load_config as _load_config,
44
+ DATE_TYPES as _DATE_TYPES, NUMERIC_TYPES as _NUMERIC_TYPES,
45
+ INTEGER_TYPES as _INTEGER_TYPES,
46
+ )
47
+
48
+ def _build_header_vocab(synonyms: dict) -> set:
49
+ """Header-detection vocabulary, derived from the active config's synonyms +
50
+ field names — NOT a hardcoded bank list. Adapts to whatever domain the
51
+ config describes."""
52
+ import re as _re
53
+ vocab: set = set()
54
+ for fld, phrases in synonyms.items():
55
+ for tok in _re.split(r"[^a-z0-9]+", str(fld).lower()):
56
+ if len(tok) >= 2:
57
+ vocab.add(tok)
58
+ for phrase in phrases:
59
+ for tok in _re.split(r"[^a-z0-9]+", str(phrase).lower()):
60
+ if len(tok) >= 2:
61
+ vocab.add(tok)
62
+ return vocab
63
+
64
+
65
# Config loaded at import time via load_config() with no argument; replaced
# later by configure().
_ACTIVE_CONFIG: _Config = _load_config()
_LEARNED_SYNONYMS: dict = {}   # {field: [phrases]} from the learn store

# Views derived from the active config. configure() rebinds these names and
# _apply_synonyms() rebinds SYNONYMS / _HEADER_VOCAB.
OUTPUT_SCHEMA: list[tuple[str, str]] = _ACTIVE_CONFIG.headers   # [(field, header)]
# Per-field copies of the phrase lists, so edits here do not touch the config.
SYNONYMS: dict[str, list[str]] = {k: list(v) for k, v in _ACTIVE_CONFIG.synonyms.items()}
CRITICAL_FIELDS: set = set(_ACTIVE_CONFIG.critical_fields)
ALLOWED_FIELDS: list[str] = _ACTIVE_CONFIG.allowed_fields
_FIELD_TYPES: dict[str, str] = _ACTIVE_CONFIG.field_types
_HEADER_VOCAB: set = _build_header_vocab(SYNONYMS)
74
+
75
+
76
def _apply_synonyms() -> None:
    """Recompute SYNONYMS, the exact-match lookup and the header vocabulary.

    The effective synonym lists are the config (seed) lists with learned
    phrases appended when not already present. In the exact lookup the seed
    is written last, so on a conflicting phrase the config mapping wins and
    learned phrases only extend the vocabulary.
    """
    global SYNONYMS, _EXACT_LOOKUP, _HEADER_VOCAB

    seed = _ACTIVE_CONFIG.synonyms
    combined = {name: list(phrases) for name, phrases in seed.items()}
    for name, learned in _LEARNED_SYNONYMS.items():
        bucket = combined.setdefault(name, [])
        for phrase in learned:
            if phrase not in bucket:
                bucket.append(phrase)
    SYNONYMS = combined

    # Learned entries go in first so that the seed update overrides them.
    exact = _build_exact_lookup(_LEARNED_SYNONYMS)
    exact.update(_build_exact_lookup(seed))
    _EXACT_LOOKUP = exact

    _HEADER_VOCAB = _build_header_vocab(combined)
92
+
93
+
94
def configure(source=None, config: "Optional[_Config]" = None) -> None:
    """Replace the active configuration at runtime.

    Either pass a ready Config object or a `source` understood by
    load_config (path / http(s) URL / s3:// / dict). Both positional and
    keyword use are accepted:

        configure(bank_preset())              # Config, positional
        configure(config_from_dict({...}))    # Config, positional
        configure("config.json")              # source
        configure(config=my_config)           # explicit

    All derived module globals are rebuilt so the new schema is effective
    as soon as this returns.
    """
    global _ACTIVE_CONFIG, OUTPUT_SCHEMA, CRITICAL_FIELDS
    global ALLOWED_FIELDS, _FIELD_TYPES

    # A Config handed over positionally arrives in `source`; move it across.
    if config is None and isinstance(source, _Config):
        config, source = source, None

    if config is None:
        config = _load_config(source)
    _ACTIVE_CONFIG = config

    active = _ACTIVE_CONFIG
    OUTPUT_SCHEMA = active.headers
    CRITICAL_FIELDS = set(active.critical_fields)
    ALLOWED_FIELDS = active.allowed_fields
    _FIELD_TYPES = active.field_types
    _apply_synonyms()
117
+
118
+
119
def apply_learned(store) -> None:
    """Pull learned synonyms from a LearnStore and fold them into matching.

    Passing ``None`` clears any previously loaded learned synonyms. Intended
    to be called once at startup after configure(); _run() calls it again
    after learning so new phrases are picked up.
    """
    global _LEARNED_SYNONYMS
    if store is None:
        _LEARNED_SYNONYMS = {}
    else:
        _LEARNED_SYNONYMS = store.synonyms()
    _apply_synonyms()
126
+
127
+
128
+ # --------------------------------------------------------------------------
129
+ # Output format types
130
+ # --------------------------------------------------------------------------
131
# Serialization choices understood by OutputResult.to_response(); any value
# other than json / bytes / base64 / file falls through to the raw records.
OutputFormat = Literal["records", "json", "bytes", "base64", "file"]
132
+
133
+
134
+ # --------------------------------------------------------------------------
135
+ # Data classes
136
+ # --------------------------------------------------------------------------
137
@dataclass
class HeaderCandidate:
    """A scored header-row candidate, as returned by detect_header_row()."""
    index: int        # position of the candidate row within the scanned rows
    score: float      # weighted sum of the detection signals (rounded to 2 dp)
    cells: list       # copy of the candidate row's cells
    breakdown: dict = field(default_factory=dict)   # per-signal values behind `score`
143
+
144
+
145
@dataclass
class ColumnMap:
    """Mapping decision for one input column."""
    col_index: int          # position of the column in the header row
    raw_header: str         # header text as found (stripped), "" for blank cells
    field: Optional[str]    # output field key, or None when unmapped/demoted
    confidence: int         # 0-100
    # Values assigned in this module:
    #   exact | fuzzy | llm | ai | none | not_in_schema | dup
    # ("cache" is also ranked by merge_ai_mapping but never assigned here.)
    method: str
152
+
153
+
154
+ @dataclass
155
+ class OutputResult:
156
+ """Lazy-evaluated output container supporting multiple serialization formats."""
157
+ records: list[dict]
158
+ format: OutputFormat
159
+ file_path: Optional[str] = None
160
+ _json: Optional[str] = field(default=None, repr=False)
161
+ _bytes: Optional[bytes] = field(default=None, repr=False)
162
+ _base64: Optional[str] = field(default=None, repr=False)
163
+
164
+ @property
165
+ def json(self) -> str:
166
+ """Records as JSON string."""
167
+ if self._json is None:
168
+ self._json = json.dumps(self.records, ensure_ascii=False)
169
+ return self._json
170
+
171
+ @property
172
+ def bytes(self) -> bytes:
173
+ """Records as .xlsx bytes (lazy, cached)."""
174
+ if self._bytes is None:
175
+ self._bytes = _records_to_xlsx_bytes(self.records)
176
+ return self._bytes
177
+
178
+ @property
179
+ def base64(self) -> str:
180
+ """Base64-encoded .xlsx bytes (lazy, cached)."""
181
+ if self._base64 is None:
182
+ self._base64 = base64.b64encode(self.bytes).decode("ascii")
183
+ return self._base64
184
+
185
+ def to_response(self) -> Union[list[dict], str, bytes]:
186
+ """Return the native Python object for the requested format."""
187
+ if self.format == "json":
188
+ return self.json
189
+ if self.format == "bytes":
190
+ return self.bytes
191
+ if self.format == "base64":
192
+ return self.base64
193
+ if self.format == "file":
194
+ if self.file_path is None:
195
+ raise ValueError("file_path required for 'file' output format")
196
+ _write_output(self.file_path, self.records)
197
+ return self.file_path
198
+ return self.records
199
+
200
+ def __repr__(self) -> str:
201
+ return f"<OutputResult format={self.format} records={len(self.records)}>"
202
+
203
+
204
+ @dataclass
205
+ class ProcessResult:
206
+ input_path: str
207
+ output_path: Optional[str]
208
+ header_index: int
209
+ header_score: float
210
+ column_maps: list[ColumnMap]
211
+ records: list[dict]
212
+ needs_review: bool
213
+ review_reasons: list[str]
214
+ header_breakdown: dict = field(default_factory=dict)
215
+ output: Optional[OutputResult] = field(default=None, repr=False)
216
+
217
+
218
+ # --------------------------------------------------------------------------
219
+ # Helpers
220
+ # --------------------------------------------------------------------------
221
# Whole-string pattern for "looks like an amount": optional leading run of
# whitespace / currency symbols / the letters r, s / dots, an optional '-' or
# '(', digits with commas, an optional decimal part, an optional ')', and an
# optional trailing dr/cr marker (case-insensitive).
_NUM_RE = re.compile(r"^[\s₹$€£rs\.]*[-(]?[\d,]+\.?\d*\)?[\s]*(dr|cr)?$", re.I)
222
+
223
+
224
+ def _norm(s) -> str:
225
+ return re.sub(r"\s+", " ", str(s).strip().lower()) if s is not None else ""
226
+
227
+
228
+ def _is_blank(v) -> bool:
229
+ return v is None or (isinstance(v, str) and v.strip() == "")
230
+
231
+
232
def _looks_numeric(v) -> bool:
    """True for real numbers (bool excluded) and for amount-like strings."""
    if isinstance(v, bool):
        # bool is an int subclass but never counts as a number here.
        return False
    if isinstance(v, (int, float)):
        return True
    if not isinstance(v, str):
        return False
    return bool(_NUM_RE.match(v.strip()))
238
+
239
+
240
+ def _looks_datey(v) -> bool:
241
+ if isinstance(v, (_dt.datetime, _dt.date)):
242
+ return True
243
+ if isinstance(v, str):
244
+ s = v.strip()
245
+ if re.search(r"\d{1,4}[-/.]\d{1,2}[-/.]\d{1,4}", s):
246
+ return True
247
+ if re.match(r"\d{1,2}\s*[A-Za-z]{3,9}\s*\d{2,4}", s):
248
+ return True
249
+ return False
250
+
251
+
252
def _text_ratio(cells) -> float:
    """Share of the non-blank cells that are plain words.

    A cell counts as a word when it is a string that neither looks numeric
    nor looks like a date. Returns 0.0 when every cell is blank.
    """
    filled = 0
    wordy = 0
    for c in cells:
        if _is_blank(c):
            continue
        filled += 1
        if isinstance(c, str) and not _looks_numeric(c) and not _looks_datey(c):
            wordy += 1
    if not filled:
        return 0.0
    return wordy / filled
262
+
263
+
264
+ # --------------------------------------------------------------------------
265
+ # Stage 1 — header detection (deterministic)
266
+ # --------------------------------------------------------------------------
267
def detect_header_row(rows: list[list], scan_limit: int = 25) -> HeaderCandidate:
    """Score the first ~scan_limit rows and return the best header candidate.

    Signals (see BUILD_PROMPT.md §5):
      + density        many non-empty cells
      + text_ratio     headers are words, not numbers/dates
      + short_labels   header cells are short strings
      + vocab_hits     tokens found in _HEADER_VOCAB (strongest)
      + data_below     rows below look like transaction data
      - self_penalty   the row itself is mostly numbers/dates

    Fully blank rows are never candidates. On equal scores the earliest row
    wins. If no row qualifies, a zero-score candidate at index 0 is returned
    (with empty cells when `rows` is empty).
    """
    n = min(scan_limit, len(rows))
    best: Optional[HeaderCandidate] = None

    for i in range(n):
        cells = rows[i]
        non_blank = [c for c in cells if not _is_blank(c)]
        if not non_blank:
            continue

        density = min(len(non_blank), 8) / 8.0   # cap so wide junk rows don't win
        text_ratio = _text_ratio(cells)

        # Share of non-blank cells that are strings of at most 25 characters.
        strs = [c for c in non_blank if isinstance(c, str)]
        short = sum(1 for c in strs if len(c.strip()) <= 25)
        short_labels = short / len(non_blank)

        # Distinct alphabetic tokens of the row, matched against the vocabulary.
        toks = set()
        for c in strs:
            for t in re.split(r"[^a-z]+", c.lower()):
                if t:
                    toks.add(t)
        vocab_hits = len(toks & _HEADER_VOCAB)

        # data_below: sample up to 5 rows beneath; reward numeric/date content
        below_scores = []
        for j in range(i + 1, min(i + 6, len(rows))):
            b = rows[j]
            bnb = [c for c in b if not _is_blank(c)]
            if not bnb:
                continue
            datalike = sum(1 for c in bnb if _looks_numeric(c) or _looks_datey(c))
            below_scores.append(datalike / len(bnb))
        data_below = (sum(below_scores) / len(below_scores)) if below_scores else 0.0

        # Fraction of this row's own cells that look like data rather than labels.
        self_numeric = sum(1 for c in non_blank if _looks_numeric(c) or _looks_datey(c))
        self_penalty = self_numeric / len(non_blank)

        score = (
            1.5 * density
            + 2.0 * text_ratio
            + 1.0 * short_labels
            + 3.0 * min(vocab_hits, 6)   # dominant signal
            + 2.0 * data_below
            - 3.0 * self_penalty
        )
        breakdown = {
            "density": round(density, 2),
            "text_ratio": round(text_ratio, 2),
            "short_labels": round(short_labels, 2),
            "vocab_hits": vocab_hits,
            "data_below": round(data_below, 2),
            "self_penalty": round(self_penalty, 2),
            "score": round(score, 2),
        }
        # Strict '>' keeps the earliest row on ties. Note the comparison is
        # against the stored (rounded) score of the current best.
        if best is None or score > best.score:
            best = HeaderCandidate(index=i, score=round(score, 2), cells=list(cells),
                                   breakdown=breakdown)

    if best is None:
        best = HeaderCandidate(index=0, score=0.0,
                               cells=list(rows[0]) if rows else [], breakdown={})
    return best
340
+
341
+
342
+ # --------------------------------------------------------------------------
343
+ # Stage 2 — column mapping (exact -> fuzzy -> fallback)
344
+ # --------------------------------------------------------------------------
345
+ # Build a flat lookup: phrase -> field, for O(1) exact matching.
346
def _build_exact_lookup(synonyms: dict) -> dict:
    """Flatten ``{field: [phrases]}`` into ``{normalized phrase: field}``.

    When the same normalized phrase appears under several fields, the field
    visited last wins.
    """
    return {
        _norm(phrase): name
        for name, phrase_list in synonyms.items()
        for phrase in phrase_list
    }
352
+
353
+
354
# Import-time lookup built from the seed synonyms; _apply_synonyms() replaces
# it whenever the config or the learned synonyms change.
_EXACT_LOOKUP: dict[str, str] = _build_exact_lookup(SYNONYMS)
355
+
356
+
357
def _fuzzy_best(header: str) -> tuple[Optional[str], int]:
    """Find the closest synonym phrase for ``header``.

    Only fields present in ALLOWED_FIELDS are considered, so synonyms for
    undeclared fields can never be returned. The score is the truncated
    mean of rapidfuzz's token_set_ratio and the stricter plain ratio.
    Returns ``(None, 0)`` when nothing scores above zero.
    """
    permitted = set(ALLOWED_FIELDS)
    winner: Optional[str] = None
    top = 0
    for name, phrase_list in SYNONYMS.items():
        if name not in permitted:
            continue
        for phrase in phrase_list:
            loose = fuzz.token_set_ratio(header, phrase)
            # token_set_ratio alone over-rewards partial overlaps.
            blended = int(0.5 * loose + 0.5 * fuzz.ratio(header, phrase))
            if blended > top:
                winner, top = name, blended
    return winner, top
373
+
374
+
375
def map_columns(
    header_row: list,
    sample_rows: Optional[list[list]] = None,
    llm_fallback: Optional[Callable[[str, list, list], Optional[str]]] = None,
    threshold: int = 80,
) -> list[ColumnMap]:
    """Map each header cell to an output field.

    1. exact synonym      -> confidence 100, method 'exact'
    2. fuzzy (rapidfuzz)  -> confidence = score, method 'fuzzy'
    3. if still < threshold and llm_fallback given -> method 'llm'

    The fallback only ever receives the header string + up to 3 sample cells
    for that column + the allowed field list. Never full rows.

    Other outcomes recorded in ``ColumnMap.method``:
      * 'none'           blank header cell
      * 'not_in_schema'  exact synonym hit for a field outside ALLOWED_FIELDS
      * 'fuzzy' with field None  nothing reached `threshold` (unresolved)
      * 'dup'            lost a duplicate-field tie to another column

    Args:
        header_row: cells of the detected header row.
        sample_rows: data rows beneath the header; only the first three are
            sampled for the fallback.
        llm_fallback: optional ``(header, samples, allowed_fields) -> field``.
        threshold: minimum fuzzy score (0-100) to accept a fuzzy match.

    Returns:
        One ColumnMap per header cell, in column order.
    """
    sample_rows = sample_rows or []
    maps: list[ColumnMap] = []

    for ci, raw in enumerate(header_row):
        raw_str = "" if raw is None else str(raw).strip()
        key = _norm(raw)

        if key == "":
            maps.append(ColumnMap(ci, raw_str, None, 0, "none"))
            continue

        # 1. exact
        if key in _EXACT_LOOKUP:
            ef = _EXACT_LOOKUP[key]
            if ef in ALLOWED_FIELDS:
                maps.append(ColumnMap(ci, raw_str, ef, 100, "exact"))
            else:
                # recognized, but that field isn't in your output schema -> skip
                maps.append(ColumnMap(ci, raw_str, None, 0, "not_in_schema"))
            continue

        # 2. fuzzy
        fld, score = _fuzzy_best(key)
        if score >= threshold:
            maps.append(ColumnMap(ci, raw_str, fld, int(score), "fuzzy"))
            continue

        # 3. fallback (llm / embedding), header + <=3 samples only
        if llm_fallback is not None:
            samples = []
            for r in sample_rows[:3]:
                if ci < len(r) and not _is_blank(r[ci]):
                    samples.append(str(r[ci])[:40])   # cap each sample at 40 chars
            guess = llm_fallback(raw_str, samples, list(ALLOWED_FIELDS))
            if guess in ALLOWED_FIELDS:
                # fallback carries a moderate confidence, clearly below exact
                maps.append(ColumnMap(ci, raw_str, guess, max(int(score), 70), "llm"))
                continue

        # unresolved: keep the best fuzzy score for the audit trail
        maps.append(ColumnMap(ci, raw_str, None, int(score), "fuzzy"))

    # Resolve duplicates: if two columns claim the same field, keep the higher
    # confidence one; demote the loser to unresolved (needs_review will catch).
    # On equal confidence the earlier column keeps the field.
    by_field: dict[str, ColumnMap] = {}
    for m in maps:
        if m.field is None:
            continue
        if m.field not in by_field or m.confidence > by_field[m.field].confidence:
            prev = by_field.get(m.field)
            if prev is not None:
                prev.field = None
                prev.method = "dup"
            by_field[m.field] = m
        else:
            m.field = None
            m.method = "dup"
    return maps
450
+
451
+
452
+ # --------------------------------------------------------------------------
453
+ # Normalizers (deterministic)
454
+ # --------------------------------------------------------------------------
455
_YEAR_FIRST_RE = re.compile(r"^\s*\d{4}[-/.]")


def normalize_date(v, dayfirst: Optional[bool] = None) -> Optional[str]:
    """Convert a cell to an ISO 'YYYY-MM-DD' string, or None if it cannot be.

    * datetime / date objects are formatted directly (time part dropped).
    * Strings that start with a 4-digit year and a separator are always
      parsed year-first, regardless of `dayfirst`.
    * Any other string is parsed day-first unless `dayfirst` says otherwise.
    """
    if _is_blank(v):
        return None
    # datetime is a date subclass, so it has to be checked first.
    if isinstance(v, _dt.datetime):
        return v.date().isoformat()
    if isinstance(v, _dt.date):
        return v.isoformat()

    text = str(v).strip()
    if not text:
        return None

    if _YEAR_FIRST_RE.match(text):
        day_first = False
    else:
        day_first = True if dayfirst is None else dayfirst

    try:
        parsed = _dateparser.parse(text, dayfirst=day_first, fuzzy=True)
        return parsed.date().isoformat()
    except (ValueError, OverflowError, TypeError):
        return None
490
+
491
+
492
_AMT_CLEAN_RE = re.compile(r"[^\d.\-()]")
# A run of letters (currency code such as "Rs"/"INR", or a Dr/Cr marker)
# together with an abbreviation dot directly after it. Removed before the
# generic cleanup so that the dot of "Rs." is not mistaken for a decimal point.
_AMT_WORD_RE = re.compile(r"[^\W\d_]+\.?")


def normalize_amount(v) -> Optional[float]:
    """Return a signed float or None.

    Handles: '1,200.50', '(500)' -> -500, '500 Dr' -> -500, '₹500 Cr' -> 500,
    'Rs. 500' -> 500, '$-500' -> -500, trailing minus '500-' -> -500,
    currency symbols, stray spaces. Dr/Cr suffix wins over sign.

    Returns None for None, blank text, and anything that does not reduce to
    a single number (e.g. 'N.A.', '1.2.3', '12-34'). Real int/float inputs
    are returned as float unchanged; bool is not treated as a number.
    """
    if v is None:
        return None
    if isinstance(v, (int, float)) and not isinstance(v, bool):
        return float(v)

    s = str(v).strip()
    if not s:
        return None

    low = s.lower()
    is_dr = bool(re.search(r"\bdr\b|dr$", low))
    is_cr = bool(re.search(r"\bcr\b|cr$", low))
    neg_paren = "(" in s and ")" in s

    # Drop words first ("Rs." -> ""), then every remaining non-numeric symbol
    # (currency signs, commas, spaces), then the parentheses themselves.
    without_words = _AMT_WORD_RE.sub("", s)
    cleaned = _AMT_CLEAN_RE.sub("", without_words).replace("(", "").replace(")", "")

    if cleaned.count("-") > 1:
        # Several minus signs: treat as one negative marker.
        cleaned = cleaned.replace("-", "")
        neg_sign = True
    else:
        # A single minus counts when it leads or trails the digits, wherever
        # it sat relative to a currency symbol in the raw text. A minus in
        # the middle ("12-34") is left in place and fails float() below.
        neg_sign = cleaned.startswith("-") or cleaned.endswith("-")
        cleaned = cleaned.strip("-")

    if cleaned in ("", "."):
        return None
    try:
        val = float(cleaned)
    except ValueError:
        return None

    val = abs(val)
    if is_dr:
        return -val
    if is_cr:
        return val
    if neg_paren or neg_sign:
        return -val
    return val
537
+
538
+
539
+ # --------------------------------------------------------------------------
540
+ # Extraction
541
+ # --------------------------------------------------------------------------
542
+ def _field_col(col_maps: list[ColumnMap], fld: str) -> Optional[int]:
543
+ for m in col_maps:
544
+ if m.field == fld:
545
+ return m.col_index
546
+ return None
547
+
548
+
549
def extract_records(rows: list[list], header_idx: int,
                    col_maps: list[ColumnMap]) -> list[dict]:
    """Turn data rows into standardized dicts.

    Every row after `header_idx` becomes one dict keyed by the active
    config's output fields, with values normalized by field type (date,
    numeric, or stripped text).

    Reconciles the two money layouts when the config declares `reconcile`:
      * separate negative + positive columns -> absolute values of each
      * single signed column (and neither directional column mapped) ->
        negative amounts go to the negative field, positive to the positive
        field; an exact zero is placed in the negative field
    Rows are kept when at least one of the config's `row_keep_if_any` fields
    (or, if unset, any output field) has a value. A row that is not kept but
    has a value in the config's `continuation_field` is appended to that
    field of the previous record instead. Fully blank rows are skipped.
    """
    cfg = _ACTIVE_CONFIG
    fields = cfg.fields          # ordered output field keys
    types = _FIELD_TYPES         # field -> "date"|"money"/"number"|"text"
    col_of = {f: _field_col(col_maps, f) for f in fields}

    # Optional signed/split reconciliation, declared in config (not hardcoded):
    # reconcile = {"signed": <src>, "negative": <fld>, "positive": <fld>}
    rec_cfg = cfg.reconcile or {}
    neg_f, pos_f, sig_f = rec_cfg.get("negative"), rec_cfg.get("positive"), rec_cfg.get("signed")
    ci_neg = _field_col(col_maps, neg_f) if neg_f else None
    ci_pos = _field_col(col_maps, pos_f) if pos_f else None
    ci_sig = _field_col(col_maps, sig_f) if sig_f else None

    # Which fields decide "this row is a real record". Config-driven; if unset,
    # keep any row that has at least one non-empty mapped value (generic).
    keep_fields = cfg.row_keep_if_any or fields
    cont_field = cfg.continuation_field

    def cell(r, ci):
        # Safe lookup: unmapped column or short row yields None.
        return r[ci] if (ci is not None and ci < len(r)) else None

    def _present(v):
        # 0 / 0.0 count as present; only None and "" are "missing".
        return v is not None and v != ""

    records: list[dict] = []
    for r in rows[header_idx + 1:]:
        if all(_is_blank(c) for c in r):
            continue

        # --- reconcile the directional pair, if declared ---
        neg_val = pos_val = None
        if rec_cfg:
            if ci_sig is not None and ci_neg is None and ci_pos is None:
                amt = normalize_amount(cell(r, ci_sig))   # one signed column -> split
                if amt is not None:
                    if amt < 0:
                        neg_val = abs(amt)
                    elif amt > 0:
                        pos_val = amt
                    else:
                        neg_val = 0.0
            else:
                d = normalize_amount(cell(r, ci_neg)) if ci_neg is not None else None
                c = normalize_amount(cell(r, ci_pos)) if ci_pos is not None else None
                neg_val = abs(d) if d is not None else None
                pos_val = abs(c) if c is not None else None

        # --- build the record, one value per schema field, by type ---
        rec: dict = {}
        for f in fields:
            if rec_cfg and f == neg_f:
                rec[f] = neg_val
            elif rec_cfg and f == pos_f:
                rec[f] = pos_val
            else:
                t = types.get(f, "text")
                v = cell(r, col_of.get(f))
                if t in _DATE_TYPES:
                    rec[f] = normalize_date(v)
                elif t in _NUMERIC_TYPES:
                    val = normalize_amount(v)   # signed float
                    if val is not None and t in _INTEGER_TYPES and float(val).is_integer():
                        val = int(val)          # integer type -> int
                    rec[f] = val
                else:
                    rec[f] = str(v).strip() if not _is_blank(v) else ""

        kept = any(_present(rec.get(f)) for f in keep_fields)

        # continuation: a row with only the continuation field (and nothing that
        # would keep it) folds into the record above it.
        if not kept and cont_field and _present(rec.get(cont_field)) and records:
            records[-1][cont_field] = (
                str(records[-1].get(cont_field, "")) + " " + str(rec[cont_field])).strip()
            continue

        if not kept:
            continue

        records.append(rec)
    return records
640
+
641
+
642
+ # --------------------------------------------------------------------------
643
+ # needs_review gate
644
+ # --------------------------------------------------------------------------
645
def evaluate_review(col_maps: list[ColumnMap], records: list[dict],
                    threshold: int = 80) -> tuple[bool, list[str]]:
    """Decide whether a mapping needs a human look, and say why.

    Reasons are collected in a fixed order: missing critical fields
    (alphabetical, so the output is stable across runs), unsatisfied
    `require_any` groups, mapped columns below `threshold` (exact matches
    exempt), columns resolved by the 'llm' fallback, and finally the absence
    of any extracted record.

    Args:
        col_maps: column decisions from map_columns() / merge_ai_mapping().
        records: rows produced by extract_records().
        threshold: minimum confidence (0-100) for a non-exact mapping.

    Returns:
        ``(needs_review, reasons)`` where needs_review is True exactly when
        at least one reason was recorded.
    """
    reasons: list[str] = []
    mapped_fields = {m.field for m in col_maps if m.field}

    # critical: each configured critical field must be mapped.
    # CRITICAL_FIELDS is a set; sort it so the reason order does not depend
    # on per-process string hashing.
    for cf in sorted(CRITICAL_FIELDS, key=str):
        if cf not in mapped_fields:
            reasons.append(f"missing critical field: {cf}")

    # require_any: each configured group needs at least one mapped field
    for group in (_ACTIVE_CONFIG.require_any or []):
        if not (set(group) & mapped_fields):
            reasons.append(f"none of {group} was mapped")

    # low-confidence mapped columns
    for m in col_maps:
        if m.field and m.confidence < threshold and m.method != "exact":
            reasons.append(
                f"low-confidence column '{m.raw_header}' -> {m.field} "
                f"({m.confidence}, {m.method})"
            )

    # any fallback-resolved column is worth a human glance
    for m in col_maps:
        if m.field and m.method == "llm":
            reasons.append(f"fallback-resolved column '{m.raw_header}' -> {m.field}")

    if not records:
        reasons.append("no transaction rows extracted")

    return bool(reasons), reasons
677
+
678
+
679
+ # --------------------------------------------------------------------------
680
+ # Output serializers
681
+ # --------------------------------------------------------------------------
682
def _records_to_xlsx_bytes(records: list[dict]) -> bytes:
    """Render the records as an .xlsx workbook held entirely in memory.

    The single sheet is titled "Standardized"; its first row holds the
    display headers from OUTPUT_SCHEMA and each record follows in schema
    order (missing keys become empty cells).
    """
    import openpyxl

    book = openpyxl.Workbook()
    sheet = book.active
    sheet.title = "Standardized"
    sheet.append([title for _, title in OUTPUT_SCHEMA])
    for row in records:
        sheet.append([row.get(key) for key, _ in OUTPUT_SCHEMA])

    buffer = io.BytesIO()
    book.save(buffer)
    return buffer.getvalue()
695
+
696
+
697
def records_to_csv_bytes(records: list[dict], encoding: str = "utf-8") -> bytes:
    """Serialize records to CSV bytes.

    The header row carries the display headers from OUTPUT_SCHEMA (the same
    ones the .xlsx writers emit); each record is written in schema order.
    Keys missing from a record, and None values, become empty cells; keys
    that are not part of the schema are ignored.

    Args:
        records: rows keyed by output field.
        encoding: text encoding of the returned bytes.

    Returns:
        The CSV document, using the csv module's default dialect.
    """
    field_keys = [fld for fld, _ in OUTPUT_SCHEMA]
    headers = [disp for _, disp in OUTPUT_SCHEMA]

    # newline="" keeps the csv module's own "\r\n" terminators untouched.
    text = io.StringIO(newline="")
    writer = csv.writer(text)
    writer.writerow(headers)
    writer.writerows([rec.get(fld) for fld in field_keys] for rec in records)
    return text.getvalue().encode(encoding)
708
+
709
+
710
def _write_output(path: str, records: list[dict]) -> None:
    """Save the records as an .xlsx workbook at ``path``.

    One sheet titled "Standardized": display headers from OUTPUT_SCHEMA on
    the first row, then one row per record in schema order.
    """
    import openpyxl

    book = openpyxl.Workbook()
    sheet = book.active
    sheet.title = "Standardized"
    sheet.append([title for _, title in OUTPUT_SCHEMA])
    for row in records:
        sheet.append([row.get(key) for key, _ in OUTPUT_SCHEMA])
    book.save(path)
721
+
722
+
723
+ # --------------------------------------------------------------------------
724
+ # AI integration
725
+ # --------------------------------------------------------------------------
726
# Confidence stamped on every column that merge_ai_mapping() assigns.
AI_CONFIDENCE = 85   # below exact(100), at/above the fuzzy gate so it stands on its own
727
+
728
+
729
def _has_critical_gap(col_maps: list[ColumnMap]) -> bool:
    """Report whether the mapping still lacks something the config requires.

    True when any critical field is unmapped, or when any `require_any`
    group has none of its fields mapped. Used to decide whether asking the
    table matcher is worthwhile.
    """
    placed = {m.field for m in col_maps if m.field}
    if not CRITICAL_FIELDS.issubset(placed):
        return True
    groups = _ACTIVE_CONFIG.require_any or []
    return any(not (set(group) & placed) for group in groups)
739
+
740
+
741
def merge_ai_mapping(col_maps: list[ColumnMap], ai: dict) -> list[ColumnMap]:
    """Apply an AI ``{col_index: field}`` proposal on top of ``col_maps``.

    Columns matched exactly are never touched, and the AI may not hand out a
    field that an exact column already holds. Accepted proposals are marked
    method 'ai' with AI_CONFIDENCE. Afterwards each field is left with one
    owner: the column with the best (method rank, confidence) pair, earlier
    columns winning ties; the others are demoted to method 'dup'.

    The ColumnMap objects are modified in place and the same list is returned.
    """
    locked = {m.field for m in col_maps if m.method == "exact" and m.field}
    lookup = {m.col_index: m for m in col_maps}

    for idx, target in ai.items():
        entry = lookup.get(idx)
        if entry is None or target not in ALLOWED_FIELDS:
            continue
        # Exact columns are ground truth, and so is their claim on a field.
        if entry.method == "exact" or target in locked:
            continue
        entry.field = target
        entry.method = "ai"
        entry.confidence = AI_CONFIDENCE

    rank = {"exact": 3, "ai": 2, "cache": 2, "fuzzy": 1}

    def strength(m):
        return (rank.get(m.method, 0), m.confidence)

    # de-dup single-slot fields
    owner: dict[str, ColumnMap] = {}
    for m in col_maps:
        if not m.field:
            continue
        holder = owner.get(m.field)
        if holder is None or strength(m) > strength(holder):
            if holder is not None:
                holder.field, holder.method = None, "dup"
            owner[m.field] = m
        else:
            m.field, m.method = None, "dup"
    return col_maps
777
+
778
+
779
def _schema_signature() -> str:
    """Fingerprint of the active schema, used to namespace the mapping cache.

    Covers the output fields with their types, the allowed fields, the
    reconcile settings and the config synonyms; returns the first 12 hex
    characters of the SHA-1 of their JSON form. A config change therefore
    yields a different namespace, so old cache entries for the same header
    are not replayed. Learned synonyms are left out on purpose so that
    learning does not change the signature.
    """
    import hashlib

    cfg = _ACTIVE_CONFIG
    description = {
        "fields": [[name, kind] for name, kind in _FIELD_TYPES.items()],
        "allowed": sorted(ALLOWED_FIELDS),
        "reconcile": cfg.reconcile,
        "synonyms": {name: sorted(phrases) for name, phrases in cfg.synonyms.items()},
    }
    encoded = json.dumps(description, sort_keys=True, default=str).encode("utf-8")
    digest = hashlib.sha1(encoded).hexdigest()
    return digest[:12]
793
+
794
+
795
+ # --------------------------------------------------------------------------
796
+ # Core runner
797
+ # --------------------------------------------------------------------------
798
def _run(rows: list[list], source_label: str, out_path, llm_fallback,
         table_matcher, scan_limit, threshold, cache,
         output_format: OutputFormat, learn_store=None) -> ProcessResult:
    """Shared core: detect header -> map -> (AI) -> extract -> review -> output.
    Works on already-read `rows` so it is source-agnostic (path or stream).

    Args:
        rows: the sheet as a list of rows (lists of cells).
        source_label: stored as ``ProcessResult.input_path``.
        out_path: output path; only written when ``output_format == "file"``.
        llm_fallback: per-column fallback passed to map_columns().
        table_matcher: optional whole-table matcher, called with the header,
            up to 45 rows beneath it and the allowed fields; only consulted
            when the deterministic mapping has a critical gap.
        scan_limit: passed to detect_header_row().
        threshold: confidence gate for mapping and review.
        cache: optional mapping cache with ``get`` / ``put`` taking a
            ``namespace`` keyword.
        output_format: stored on the OutputResult.
        learn_store: optional store handed to learn_from_result().

    Returns:
        A ProcessResult; for empty `rows` a needs-review result with reason
        "empty sheet" and no output.
    """
    if not rows:
        return ProcessResult(source_label, None, 0, 0.0, [], [], True,
                             ["empty sheet"], {})

    hc = detect_header_row(rows, scan_limit=scan_limit)
    header = hc.cells
    sample_rows = rows[hc.index + 1: hc.index + 6]

    from_cache = False
    col_maps = None
    schema_sig = _schema_signature()   # scope the cache to the active schema
    if cache is not None:
        cached = cache.get(header, namespace=schema_sig)
        if cached is not None:
            col_maps = cached
            from_cache = True

    if col_maps is None:
        col_maps = map_columns(header, sample_rows, llm_fallback=llm_fallback,
                               threshold=threshold)
        # Unknown layout? Ask the AI to map the whole table (structure only).
        if table_matcher is not None and _has_critical_gap(col_maps):
            ai = table_matcher(header, rows[hc.index + 1: hc.index + 46],
                               list(ALLOWED_FIELDS))
            if ai:
                col_maps = merge_ai_mapping(col_maps, ai)

    records = extract_records(rows, hc.index, col_maps)
    needs_review, reasons = evaluate_review(col_maps, records, threshold=threshold)

    # Only cache a freshly-computed mapping if it's trustworthy. Never persist an
    # unconfirmed fallback/low-confidence guess — that would let it be replayed
    # as if approved. (A human-approved mapping can be cached explicitly.)
    if cache is not None and not from_cache and not needs_review:
        cache.put(header, col_maps, namespace=schema_sig)

    result = ProcessResult(
        input_path=source_label, output_path=out_path, header_index=hc.index,
        header_score=hc.score, column_maps=col_maps, records=records,
        needs_review=needs_review, review_reasons=reasons,
        header_breakdown=hc.breakdown,
        output=OutputResult(records=records, format=output_format,
                            file_path=out_path),
    )

    # Learn AI-resolved headers into the vocabulary, then refresh matching so the
    # new phrases take effect immediately (next statement is an exact match).
    # Gated fields (debit/credit) go to the learn store's pending queue.
    if learn_store is not None and not from_cache:
        from .learn import learn_from_result
        learn_from_result(result, learn_store)
        apply_learned(learn_store)

    # For backward compat: still write file if out_path given and format is "file"
    if out_path and output_format == "file":
        _write_output(out_path, records)

    return result
861
+
862
+
863
+ # --------------------------------------------------------------------------
864
+ # Public API
865
+ # --------------------------------------------------------------------------
866
def process_file(
    path: str,
    out_path: Optional[str] = None,
    output_format: OutputFormat = "file",
    llm_fallback: Optional[Callable] = None,
    table_matcher: Optional[Callable] = None,
    scan_limit: int = 25,
    threshold: int = 80,
    cache: Optional["MappingCache"] = None,
    learn_store=None,
) -> ProcessResult:
    """Map the .xlsx found at filesystem `path`.

    The sheet is read with `_read_sheet` and handed to the shared `_run`
    pipeline; `path` doubles as the result's source label.

    `output_format` selects how `result.output` is serialized:
      - "file"    : writes to `out_path` on disk, returns path string
      - "records" : raw Python list[dict] (default for streams)
      - "json"    : JSON string
      - "bytes"   : in-memory .xlsx bytes
      - "base64"  : base64-encoded .xlsx bytes

    `table_matcher(header_row, data_rows, allowed_fields) -> {col_index: field}`
    is the LLM path (see ai_matcher.OpenAICompatibleMatcher). It is consulted
    only when the deterministic pass leaves a critical gap AND the header was
    not served from the cache.
    """
    return _run(
        rows=_read_sheet(path),
        source_label=path,
        out_path=out_path,
        llm_fallback=llm_fallback,
        table_matcher=table_matcher,
        scan_limit=scan_limit,
        threshold=threshold,
        cache=cache,
        output_format=output_format,
        learn_store=learn_store,
    )
893
+
894
+
895
def process_stream(
    data,
    out_path: Optional[str] = None,
    output_format: OutputFormat = "records",
    llm_fallback: Optional[Callable] = None,
    table_matcher: Optional[Callable] = None,
    scan_limit: int = 25,
    threshold: int = 80,
    cache: Optional["MappingCache"] = None,
    source_label: str = "<stream>",
    learn_store=None,
) -> ProcessResult:
    """Map an .xlsx received as raw bytes or a binary file-like object.

    No temp file is created: the workbook is parsed entirely in memory. Ideal
    for a FastAPI UploadFile: pass `await file.read()` (bytes) or `file.file`
    (a stream) straight in. For bank data this is the preferred entry point.

    `data` may be:
      - a bytes-like buffer (`bytes`, `bytearray` or `memoryview`), which is
        wrapped in an in-memory `io.BytesIO`; or
      - anything else, which is assumed to already be a binary file-like
        object and is passed to the sheet reader unchanged.

    Default `output_format` is "records" since streams are typically consumed
    by an API that serializes its own response. Nothing is written to disk
    unless the caller explicitly passes `out_path` together with
    `output_format="file"`.

    `source_label` is recorded on the result in place of a file path. The
    remaining parameters are forwarded to the shared `_run` pipeline.
    """
    # `io` is imported at module level. A memoryview is raw bytes too, but it
    # has no file interface, so it must be wrapped just like bytes/bytearray.
    if isinstance(data, (bytes, bytearray, memoryview)):
        fileobj = io.BytesIO(data)
    else:
        fileobj = data  # already a binary file-like object
    rows = _read_sheet(fileobj)
    return _run(rows, source_label, out_path, llm_fallback, table_matcher,
                scan_limit, threshold, cache, output_format, learn_store)
925
+
926
+
927
+ # --------------------------------------------------------------------------
928
+ # Internal sheet reader
929
+ # --------------------------------------------------------------------------
930
def _read_sheet(path: str) -> list[list]:
    """Return every row of the workbook's active sheet as a list of lists.

    `path` may be a filesystem path OR a binary file-like object (BytesIO) —
    openpyxl accepts both, so uploads can be read straight from memory.

    The workbook is opened read-only with `data_only=True` and is always
    closed again, even if reading the rows fails. A workbook with no active
    sheet yields an empty list.
    """
    import openpyxl
    wb = openpyxl.load_workbook(path, read_only=True, data_only=True)
    try:
        ws = wb.active
        if ws is None:
            # No active sheet: report "no rows" instead of failing on None.
            return []
        return [list(r) for r in ws.iter_rows(values_only=True)]
    finally:
        # Read-only workbooks keep their archive open until closed explicitly.
        wb.close()