tabularmapper 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tabularmapper/__init__.py +75 -0
- tabularmapper/ai_matcher.py +247 -0
- tabularmapper/api.py +186 -0
- tabularmapper/cli.py +233 -0
- tabularmapper/engine.py +938 -0
- tabularmapper/learn.py +203 -0
- tabularmapper/llm_fallback.py +118 -0
- tabularmapper/mapping_cache.py +73 -0
- tabularmapper/schema.py +341 -0
- tabularmapper/stores.py +238 -0
- tabularmapper-1.0.0.dist-info/METADATA +455 -0
- tabularmapper-1.0.0.dist-info/RECORD +16 -0
- tabularmapper-1.0.0.dist-info/WHEEL +5 -0
- tabularmapper-1.0.0.dist-info/entry_points.txt +2 -0
- tabularmapper-1.0.0.dist-info/licenses/LICENSE +21 -0
- tabularmapper-1.0.0.dist-info/top_level.txt +1 -0
tabularmapper/engine.py
ADDED
|
@@ -0,0 +1,938 @@
|
|
|
1
|
+
"""
|
|
2
|
+
engine.py — Bank Statement -> Standard Tabular Mapper (engine)
|
|
3
|
+
|
|
4
|
+
Two-stage, auditable pipeline:
|
|
5
|
+
Stage 1 detect_header_row() deterministic scoring (NO AI)
|
|
6
|
+
Stage 2 map_columns() exact synonym -> fuzzy -> optional llm/embedding fallback
|
|
7
|
+
|
|
8
|
+
Design invariants (see BUILD_PROMPT.md):
|
|
9
|
+
* No LLM/embedding model ever sees transaction rows. It only sees header
|
|
10
|
+
strings + <=3 sample cells per column. All row/date/amount work is
|
|
11
|
+
deterministic Python.
|
|
12
|
+
* Header detection is scoring, never a model call.
|
|
13
|
+
* Anything ambiguous is flagged needs_review instead of silently guessed.
|
|
14
|
+
* Every column decision carries a method (exact/fuzzy/llm/cache) + 0-100
|
|
15
|
+
confidence for human audit.
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
from __future__ import annotations
|
|
20
|
+
|
|
21
|
+
import base64
|
|
22
|
+
import csv
|
|
23
|
+
import datetime as _dt
|
|
24
|
+
import io
|
|
25
|
+
import json
|
|
26
|
+
import re
|
|
27
|
+
from dataclasses import dataclass, field
|
|
28
|
+
from typing import Callable, Literal, Optional, Union
|
|
29
|
+
|
|
30
|
+
from dateutil import parser as _dateparser
|
|
31
|
+
from rapidfuzz import fuzz
|
|
32
|
+
|
|
33
|
+
# --------------------------------------------------------------------------
|
|
34
|
+
# Active configuration (output template + synonyms + critical fields).
|
|
35
|
+
# Loaded from schema.py — by default the built-in constants (byte-identical to
|
|
36
|
+
# the previous hardcoded values), or from a JSON file / URL / S3 object / dict
|
|
37
|
+
# via env TABULARMAPPER_CONFIG or a call to configure().
|
|
38
|
+
#
|
|
39
|
+
# These module globals are kept for backward compatibility; everything reads
|
|
40
|
+
# them, and configure() swaps them atomically.
|
|
41
|
+
# --------------------------------------------------------------------------
|
|
42
|
+
from .schema import ( # noqa: E402
|
|
43
|
+
Config as _Config, load_config as _load_config,
|
|
44
|
+
DATE_TYPES as _DATE_TYPES, NUMERIC_TYPES as _NUMERIC_TYPES,
|
|
45
|
+
INTEGER_TYPES as _INTEGER_TYPES,
|
|
46
|
+
)
|
|
47
|
+
|
|
48
|
+
def _build_header_vocab(synonyms: dict) -> set:
|
|
49
|
+
"""Header-detection vocabulary, derived from the active config's synonyms +
|
|
50
|
+
field names — NOT a hardcoded bank list. Adapts to whatever domain the
|
|
51
|
+
config describes."""
|
|
52
|
+
import re as _re
|
|
53
|
+
vocab: set = set()
|
|
54
|
+
for fld, phrases in synonyms.items():
|
|
55
|
+
for tok in _re.split(r"[^a-z0-9]+", str(fld).lower()):
|
|
56
|
+
if len(tok) >= 2:
|
|
57
|
+
vocab.add(tok)
|
|
58
|
+
for phrase in phrases:
|
|
59
|
+
for tok in _re.split(r"[^a-z0-9]+", str(phrase).lower()):
|
|
60
|
+
if len(tok) >= 2:
|
|
61
|
+
vocab.add(tok)
|
|
62
|
+
return vocab
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
# Configuration resolved once at import time by calling load_config() with no
# arguments; configure() replaces it at runtime.
_ACTIVE_CONFIG: _Config = _load_config()
_LEARNED_SYNONYMS: dict = {}   # {field: [phrases]} from the learn store

# Views derived from the active config. configure() rebinds these, and
# _apply_synonyms() rebinds SYNONYMS and _HEADER_VOCAB.
OUTPUT_SCHEMA: list[tuple[str, str]] = _ACTIVE_CONFIG.headers   # [(field, header)]
# Per-field copies of the phrase lists, so edits here do not touch the config.
SYNONYMS: dict[str, list[str]] = {k: list(v) for k, v in _ACTIVE_CONFIG.synonyms.items()}
CRITICAL_FIELDS: set = set(_ACTIVE_CONFIG.critical_fields)
ALLOWED_FIELDS: list[str] = _ACTIVE_CONFIG.allowed_fields
_FIELD_TYPES: dict[str, str] = _ACTIVE_CONFIG.field_types
# Token vocabulary used by detect_header_row() for its vocab_hits signal.
_HEADER_VOCAB: set = _build_header_vocab(SYNONYMS)
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def _apply_synonyms() -> None:
    """Recompute SYNONYMS, the exact-match lookup and the header vocabulary.

    The effective synonym table is the config's phrases followed by any
    learned phrases not already listed for that field. In the exact lookup a
    phrase claimed by both sources resolves to the config's field: learned
    entries are laid down first and config entries overwrite them.
    """
    global SYNONYMS, _EXACT_LOOKUP, _HEADER_VOCAB
    seed = _ACTIVE_CONFIG.synonyms

    combined = {name: list(phrases) for name, phrases in seed.items()}
    for name, learned in _LEARNED_SYNONYMS.items():
        bucket = combined.setdefault(name, [])
        for phrase in learned:
            if phrase not in bucket:
                bucket.append(phrase)

    # Later unpacking wins, so the config (seed) entries take precedence.
    exact = {**_build_exact_lookup(_LEARNED_SYNONYMS), **_build_exact_lookup(seed)}

    SYNONYMS = combined
    _EXACT_LOOKUP = exact
    _HEADER_VOCAB = _build_header_vocab(combined)
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def configure(source=None, config: "Optional[_Config]" = None) -> None:
    """Replace the active configuration and refresh everything derived from it.

    Either argument may carry the configuration:

        configure(bank_preset())             # Config, positional
        configure(config_from_dict({...}))   # Config, positional
        configure("config.json")             # source handed to load_config
        configure(config=my_config)          # explicit keyword

    An explicit ``config`` always wins. Otherwise a Config instance passed as
    ``source`` is used directly, and anything else is handed to load_config.
    The module globals are rebound so the new schema applies immediately.
    """
    global _ACTIVE_CONFIG, OUTPUT_SCHEMA, CRITICAL_FIELDS
    global ALLOWED_FIELDS, _FIELD_TYPES

    if config is None:
        if isinstance(source, _Config):
            config = source               # a Config passed positionally
        else:
            config = _load_config(source)

    _ACTIVE_CONFIG = config
    OUTPUT_SCHEMA = config.headers
    CRITICAL_FIELDS = set(config.critical_fields)
    ALLOWED_FIELDS = config.allowed_fields
    _FIELD_TYPES = config.field_types
    _apply_synonyms()
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
def apply_learned(store) -> None:
    """Install the learned synonyms from ``store`` and rebuild the lookups.

    ``store.synonyms()`` supplies the ``{field: [phrases]}`` mapping; passing
    ``None`` clears any previously learned phrases. Intended to be called
    after configure().
    """
    global _LEARNED_SYNONYMS
    if store is None:
        _LEARNED_SYNONYMS = {}
    else:
        _LEARNED_SYNONYMS = store.synonyms()
    _apply_synonyms()
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
# --------------------------------------------------------------------------
|
|
129
|
+
# Output format types
|
|
130
|
+
# --------------------------------------------------------------------------
|
|
131
|
+
OutputFormat = Literal["records", "json", "bytes", "base64", "file"]
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
# --------------------------------------------------------------------------
|
|
135
|
+
# Data classes
|
|
136
|
+
# --------------------------------------------------------------------------
|
|
137
|
+
@dataclass
class HeaderCandidate:
    """A scored header-row candidate, as produced by detect_header_row()."""
    index: int    # position of the candidate row within the scanned rows
    score: float  # weighted total of the signals, rounded to 2 decimals
    cells: list   # copy of the candidate row's raw cells
    breakdown: dict = field(default_factory=dict)  # per-signal values behind the score
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
@dataclass
class ColumnMap:
    """The mapping decision for one input column."""
    col_index: int        # index of the column within the header row
    raw_header: str       # stripped header text ("" when the cell was None)
    field: Optional[str]  # output field, or None when unresolved / demoted
    confidence: int       # 0-100
    method: str  # exact | fuzzy | llm | cache | none (this file also sets not_in_schema, dup, ai)
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
@dataclass
class OutputResult:
    """Holds extracted records and serializes them on demand.

    Each serialized form (JSON text, .xlsx bytes, base64 text) is produced the
    first time it is requested and then reused from the matching private slot.
    """
    records: list[dict]
    format: OutputFormat
    file_path: Optional[str] = None
    _json: Optional[str] = field(default=None, repr=False)
    _bytes: Optional[bytes] = field(default=None, repr=False)
    _base64: Optional[str] = field(default=None, repr=False)

    @property
    def json(self) -> str:
        """The records as a JSON string (non-ASCII characters kept as-is)."""
        cached = self._json
        if cached is None:
            cached = self._json = json.dumps(self.records, ensure_ascii=False)
        return cached

    @property
    def bytes(self) -> bytes:
        """The records as .xlsx file content."""
        cached = self._bytes
        if cached is None:
            cached = self._bytes = _records_to_xlsx_bytes(self.records)
        return cached

    @property
    def base64(self) -> str:
        """The .xlsx content encoded as an ASCII base64 string."""
        cached = self._base64
        if cached is None:
            cached = self._base64 = base64.b64encode(self.bytes).decode("ascii")
        return cached

    def to_response(self) -> Union[list[dict], str, bytes]:
        """Return the payload matching ``self.format``.

        For ``"file"`` the workbook is written to ``file_path`` and that path
        is returned; a missing path raises ValueError. Any format other than
        json / bytes / base64 / file yields the raw record list.
        """
        fmt = self.format
        if fmt == "file":
            target = self.file_path
            if target is None:
                raise ValueError("file_path required for 'file' output format")
            _write_output(target, self.records)
            return target
        if fmt == "json":
            return self.json
        if fmt == "bytes":
            return self.bytes
        if fmt == "base64":
            return self.base64
        return self.records

    def __repr__(self) -> str:
        return f"<OutputResult format={self.format} records={len(self.records)}>"
|
|
202
|
+
|
|
203
|
+
|
|
204
|
+
@dataclass
class ProcessResult:
    """Aggregate outcome of processing one input file.

    NOTE(review): the code that builds this object is outside the visible
    source; the field notes below are inferred from the field names and from
    the return shapes of the functions in this module — confirm against the
    caller.
    """
    input_path: str
    output_path: Optional[str]
    header_index: int               # presumably HeaderCandidate.index
    header_score: float             # presumably HeaderCandidate.score
    column_maps: list[ColumnMap]
    records: list[dict]
    needs_review: bool              # presumably the flag from evaluate_review()
    review_reasons: list[str]       # presumably the reasons from evaluate_review()
    header_breakdown: dict = field(default_factory=dict)
    output: Optional[OutputResult] = field(default=None, repr=False)
|
|
216
|
+
|
|
217
|
+
|
|
218
|
+
# --------------------------------------------------------------------------
|
|
219
|
+
# Helpers
|
|
220
|
+
# --------------------------------------------------------------------------
|
|
221
|
+
_NUM_RE = re.compile(r"^[\s₹$€£rs\.]*[-(]?[\d,]+\.?\d*\)?[\s]*(dr|cr)?$", re.I)
|
|
222
|
+
|
|
223
|
+
|
|
224
|
+
def _norm(s) -> str:
|
|
225
|
+
return re.sub(r"\s+", " ", str(s).strip().lower()) if s is not None else ""
|
|
226
|
+
|
|
227
|
+
|
|
228
|
+
def _is_blank(v) -> bool:
|
|
229
|
+
return v is None or (isinstance(v, str) and v.strip() == "")
|
|
230
|
+
|
|
231
|
+
|
|
232
|
+
def _looks_numeric(v) -> bool:
    """True for real numbers (bools excluded) and for strings matching _NUM_RE."""
    if isinstance(v, str):
        return _NUM_RE.match(v.strip()) is not None
    # bool is a subclass of int, so it has to be ruled out explicitly.
    return isinstance(v, (int, float)) and not isinstance(v, bool)
|
|
238
|
+
|
|
239
|
+
|
|
240
|
+
def _looks_datey(v) -> bool:
|
|
241
|
+
if isinstance(v, (_dt.datetime, _dt.date)):
|
|
242
|
+
return True
|
|
243
|
+
if isinstance(v, str):
|
|
244
|
+
s = v.strip()
|
|
245
|
+
if re.search(r"\d{1,4}[-/.]\d{1,2}[-/.]\d{1,4}", s):
|
|
246
|
+
return True
|
|
247
|
+
if re.match(r"\d{1,2}\s*[A-Za-z]{3,9}\s*\d{2,4}", s):
|
|
248
|
+
return True
|
|
249
|
+
return False
|
|
250
|
+
|
|
251
|
+
|
|
252
|
+
def _text_ratio(cells) -> float:
    """Share of the non-blank cells that are plain words.

    A cell counts as a word when it is a string that looks neither numeric nor
    date-like. Returns 0.0 when every cell is blank.
    """
    filled = [c for c in cells if not _is_blank(c)]
    if not filled:
        return 0.0
    wordy = 0
    for c in filled:
        if isinstance(c, str) and not (_looks_numeric(c) or _looks_datey(c)):
            wordy += 1
    return wordy / len(filled)
|
|
262
|
+
|
|
263
|
+
|
|
264
|
+
# --------------------------------------------------------------------------
|
|
265
|
+
# Stage 1 — header detection (deterministic)
|
|
266
|
+
# --------------------------------------------------------------------------
|
|
267
|
+
def detect_header_row(rows: list[list], scan_limit: int = 25) -> HeaderCandidate:
    """Score the first ``scan_limit`` rows and return the best header candidate.

    Fully blank rows are skipped. Each remaining row is scored from:

      + density       non-blank cell count, capped at 8
      + text_ratio    share of cells that are words, not numbers/dates
      + short_labels  share of cells that are strings of <= 25 characters
      + vocab_hits    distinct tokens found in the config-derived header
                      vocabulary, capped at 6 (dominant signal)
      + data_below    average numeric/date share of up to 5 rows beneath
      - self_penalty  share of the row itself that is numbers/dates

    Args:
        rows: The sheet as a list of rows, each a list of raw cell values.
        scan_limit: Maximum number of leading rows to consider.

    Returns:
        The highest-scoring candidate; on a tie the earliest row wins. If no
        scanned row has a non-blank cell, a zero-score candidate for index 0
        (with the first row's cells, or ``[]`` when ``rows`` is empty).
    """
    n = min(scan_limit, len(rows))
    best: Optional[HeaderCandidate] = None
    # The candidate stores a score rounded for display; comparisons must use
    # the unrounded value, otherwise a row can lose to one it actually beats.
    best_raw = float("-inf")

    for i in range(n):
        cells = rows[i]
        non_blank = [c for c in cells if not _is_blank(c)]
        if not non_blank:
            continue

        density = min(len(non_blank), 8) / 8.0   # cap so wide junk rows don't win
        text_ratio = _text_ratio(cells)

        strs = [c for c in non_blank if isinstance(c, str)]
        short = sum(1 for c in strs if len(c.strip()) <= 25)
        short_labels = short / len(non_blank)

        # NOTE(review): tokens are split on non-letters here, while the
        # vocabulary is built by splitting on non-alphanumerics, so vocabulary
        # tokens containing digits can never match.
        toks = set()
        for c in strs:
            for t in re.split(r"[^a-z]+", c.lower()):
                if t:
                    toks.add(t)
        vocab_hits = len(toks & _HEADER_VOCAB)

        # data_below: sample up to 5 rows beneath; reward numeric/date content
        below_scores = []
        for j in range(i + 1, min(i + 6, len(rows))):
            bnb = [c for c in rows[j] if not _is_blank(c)]
            if not bnb:
                continue
            datalike = sum(1 for c in bnb if _looks_numeric(c) or _looks_datey(c))
            below_scores.append(datalike / len(bnb))
        data_below = (sum(below_scores) / len(below_scores)) if below_scores else 0.0

        self_numeric = sum(1 for c in non_blank if _looks_numeric(c) or _looks_datey(c))
        self_penalty = self_numeric / len(non_blank)

        score = (
            1.5 * density
            + 2.0 * text_ratio
            + 1.0 * short_labels
            + 3.0 * min(vocab_hits, 6)        # dominant signal
            + 2.0 * data_below
            - 3.0 * self_penalty
        )
        if best is None or score > best_raw:
            best_raw = score
            breakdown = {
                "density": round(density, 2),
                "text_ratio": round(text_ratio, 2),
                "short_labels": round(short_labels, 2),
                "vocab_hits": vocab_hits,
                "data_below": round(data_below, 2),
                "self_penalty": round(self_penalty, 2),
                "score": round(score, 2),
            }
            best = HeaderCandidate(index=i, score=round(score, 2), cells=list(cells),
                                   breakdown=breakdown)

    if best is None:
        best = HeaderCandidate(index=0, score=0.0,
                               cells=list(rows[0]) if rows else [], breakdown={})
    return best
|
|
340
|
+
|
|
341
|
+
|
|
342
|
+
# --------------------------------------------------------------------------
|
|
343
|
+
# Stage 2 — column mapping (exact -> fuzzy -> fallback)
|
|
344
|
+
# --------------------------------------------------------------------------
|
|
345
|
+
# Build a flat lookup: phrase -> field, for O(1) exact matching.
|
|
346
|
+
def _build_exact_lookup(synonyms: dict) -> dict:
    """Flatten ``{field: [phrases]}`` into ``{normalized phrase: field}``.

    Phrases are normalized with _norm(). When two fields list the same
    normalized phrase, the field that appears later in ``synonyms`` wins.
    """
    return {
        _norm(phrase): fld
        for fld, phrases in synonyms.items()
        for phrase in phrases
    }
|
|
352
|
+
|
|
353
|
+
|
|
354
|
+
_EXACT_LOOKUP: dict[str, str] = _build_exact_lookup(SYNONYMS)
|
|
355
|
+
|
|
356
|
+
|
|
357
|
+
def _fuzzy_best(header: str) -> tuple[Optional[str], int]:
    """Return the closest field for ``header`` and its 0-100 similarity.

    Only fields listed in ALLOWED_FIELDS compete; synonyms of undeclared
    fields are skipped. Each phrase is scored as the truncated mean of
    rapidfuzz's ``token_set_ratio`` and the stricter ``ratio``. Returns
    ``(None, 0)`` when nothing scores above zero.
    """
    permitted = set(ALLOWED_FIELDS)
    winner, top = None, 0
    for name, phrases in SYNONYMS.items():
        if name not in permitted:
            continue
        for phrase in phrases:
            loose = fuzz.token_set_ratio(header, phrase)
            strict = fuzz.ratio(header, phrase)
            # token_set_ratio alone over-rewards partial overlaps.
            blended = int(0.5 * loose + 0.5 * strict)
            if blended > top:
                winner, top = name, blended
    return winner, top
|
|
373
|
+
|
|
374
|
+
|
|
375
|
+
def map_columns(
    header_row: list,
    sample_rows: Optional[list[list]] = None,
    llm_fallback: Optional[Callable[[str, list, list], Optional[str]]] = None,
    threshold: int = 80,
) -> list[ColumnMap]:
    """Map each header cell to an output field.

    Resolution order per column:

      1. exact synonym  -> confidence 100, method 'exact'. If the matched
         field is not in ALLOWED_FIELDS the column is left unmapped with
         method 'not_in_schema'.
      2. fuzzy (rapidfuzz) at or above ``threshold`` -> confidence = score,
         method 'fuzzy'.
      3. otherwise, if ``llm_fallback`` is given and returns an allowed field
         -> method 'llm', confidence ``max(fuzzy score, 70)``.

    Blank headers get method 'none'. Columns nothing could resolve keep
    ``field=None`` with method 'fuzzy' and the best fuzzy score seen.

    The fallback only ever receives the header string, up to 3 sample cells
    for that column (each truncated to 40 characters) and the allowed field
    list. Never full rows.

    After mapping, if several columns claim the same field, the one with the
    highest confidence keeps it (the earliest wins a tie); the others are
    demoted to ``field=None`` with method 'dup'.

    Args:
        header_row: Raw cells of the detected header row.
        sample_rows: Data rows to draw fallback sample cells from.
        llm_fallback: Optional ``(header, samples, allowed_fields) -> field``.
        threshold: Minimum fuzzy score accepted without the fallback.

    Returns:
        One ColumnMap per header cell, in column order.
    """
    sample_rows = sample_rows or []
    maps: list[ColumnMap] = []

    for ci, raw in enumerate(header_row):
        raw_str = "" if raw is None else str(raw).strip()
        key = _norm(raw)

        if key == "":
            maps.append(ColumnMap(ci, raw_str, None, 0, "none"))
            continue

        # 1. exact
        if key in _EXACT_LOOKUP:
            ef = _EXACT_LOOKUP[key]
            if ef in ALLOWED_FIELDS:
                maps.append(ColumnMap(ci, raw_str, ef, 100, "exact"))
            else:
                # recognized, but that field isn't in the output schema -> skip
                maps.append(ColumnMap(ci, raw_str, None, 0, "not_in_schema"))
            continue

        # 2. fuzzy
        fld, score = _fuzzy_best(key)
        if score >= threshold:
            maps.append(ColumnMap(ci, raw_str, fld, int(score), "fuzzy"))
            continue

        # 3. fallback (llm / embedding), header + <=3 samples only
        if llm_fallback is not None:
            samples = [
                str(r[ci])[:40]
                for r in sample_rows[:3]
                if ci < len(r) and not _is_blank(r[ci])
            ]
            guess = llm_fallback(raw_str, samples, list(ALLOWED_FIELDS))
            if guess in ALLOWED_FIELDS:
                # fallback carries a moderate confidence, clearly below exact
                maps.append(ColumnMap(ci, raw_str, guess, max(int(score), 70), "llm"))
                continue

        # unresolved
        maps.append(ColumnMap(ci, raw_str, None, int(score), "fuzzy"))

    # Resolve duplicates: if two columns claim the same field, keep the higher
    # confidence one; demote the loser to unresolved (needs_review will catch).
    by_field: dict[str, ColumnMap] = {}
    for m in maps:
        if m.field is None:
            continue
        prev = by_field.get(m.field)
        if prev is None or m.confidence > prev.confidence:
            if prev is not None:
                prev.field = None
                prev.method = "dup"
            by_field[m.field] = m
        else:
            m.field = None
            m.method = "dup"
    return maps
|
|
450
|
+
|
|
451
|
+
|
|
452
|
+
# --------------------------------------------------------------------------
|
|
453
|
+
# Normalizers (deterministic)
|
|
454
|
+
# --------------------------------------------------------------------------
|
|
455
|
+
_YEAR_FIRST_RE = re.compile(r"^\s*\d{4}[-/.]")
|
|
456
|
+
|
|
457
|
+
|
|
458
|
+
def normalize_date(v, dayfirst: Optional[bool] = None) -> Optional[str]:
    """Convert ``v`` to an ISO ``'YYYY-MM-DD'`` string, or None if it can't be.

    * datetime / date objects are formatted directly (time of day dropped).
    * Strings that begin with a 4-digit year and a separator are always parsed
      month-before-day, whatever ``dayfirst`` says, so they are never flipped.
    * Any other string is parsed day-first unless the caller passes
      ``dayfirst`` explicitly.

    Parsing is delegated to dateutil in fuzzy mode; blank input and
    unparseable text both yield None.
    """
    if _is_blank(v):
        return None
    if isinstance(v, _dt.datetime):
        return v.date().isoformat()
    if isinstance(v, _dt.date):
        return v.isoformat()

    text = str(v).strip()
    if not text:
        return None

    if _YEAR_FIRST_RE.match(text):
        day_first = False
    else:
        day_first = True if dayfirst is None else dayfirst

    try:
        return _dateparser.parse(text, dayfirst=day_first, fuzzy=True).date().isoformat()
    except (ValueError, OverflowError, TypeError):
        return None
|
|
490
|
+
|
|
491
|
+
|
|
492
|
+
_AMT_CLEAN_RE = re.compile(r"[^\d.\-()]")
# A dot directly after a letter is an abbreviation mark ("Rs.", "Dr."), not a
# decimal point. It must go before the numeric characters are extracted,
# otherwise "Rs. 500" collapses to ".500" and parses as 0.5.
_ABBREV_DOT_RE = re.compile(r"(?<=[A-Za-z])\.")


def normalize_amount(v) -> Optional[float]:
    """Return a signed float, or None when ``v`` holds no parseable amount.

    Handles: '1,200.50', '(500)' -> -500, '500 Dr' -> -500, '₹500 Cr' -> 500,
    'Rs. 500' -> 500, a minus sign before or after the digits ('-500',
    '$-500', '500-'), currency symbols and stray spaces.

    Sign precedence: a Dr suffix makes the value negative and a Cr suffix
    makes it positive regardless of any sign; otherwise parentheses or a
    minus sign make it negative.

    Args:
        v: A number, a string, or None. ints/floats (not bools) are returned
           as ``float(v)`` unchanged.

    Returns:
        The amount as a float, or None for blank or non-numeric input.
    """
    if v is None:
        return None
    if isinstance(v, (int, float)) and not isinstance(v, bool):
        return float(v)

    s = str(v).strip()
    if not s:
        return None

    low = s.lower()
    is_dr = bool(re.search(r"\bdr\b|dr$", low))
    is_cr = bool(re.search(r"\bcr\b|cr$", low))
    neg_paren = "(" in s and ")" in s

    # Keep only digits, dots and minus signs (commas, symbols, letters go).
    cleaned = _AMT_CLEAN_RE.sub("", _ABBREV_DOT_RE.sub("", s))
    cleaned = cleaned.replace("(", "").replace(")", "")

    # Take the sign from the cleaned text so a minus that follows a currency
    # symbol ("$-500") or trails the digits ("500-") is honoured. A single
    # minus in the middle is left in place and fails the float() parse below.
    neg_sign = False
    if cleaned.count("-") > 1:
        cleaned = cleaned.replace("-", "")
        neg_sign = True
    elif cleaned.startswith("-"):
        cleaned = cleaned[1:]
        neg_sign = True
    elif cleaned.endswith("-"):
        cleaned = cleaned[:-1]
        neg_sign = True

    if cleaned in ("", "."):
        return None
    try:
        val = abs(float(cleaned))
    except ValueError:
        return None

    if is_dr:
        return -val
    if is_cr:
        return val
    if neg_paren or neg_sign:
        return -val
    return val
|
|
537
|
+
|
|
538
|
+
|
|
539
|
+
# --------------------------------------------------------------------------
|
|
540
|
+
# Extraction
|
|
541
|
+
# --------------------------------------------------------------------------
|
|
542
|
+
def _field_col(col_maps: list[ColumnMap], fld: str) -> Optional[int]:
|
|
543
|
+
for m in col_maps:
|
|
544
|
+
if m.field == fld:
|
|
545
|
+
return m.col_index
|
|
546
|
+
return None
|
|
547
|
+
|
|
548
|
+
|
|
549
|
+
def extract_records(rows: list[list], header_idx: int,
                    col_maps: list[ColumnMap]) -> list[dict]:
    """Convert the rows below the header into one dict per kept row.

    Every record has one key per field in the active config, converted by the
    field's declared type: date types via normalize_date, numeric types via
    normalize_amount (whole values of integer types become ``int``), and
    anything else as stripped text ("" when blank).

    If the config declares ``reconcile = {"signed", "negative", "positive"}``:
      * when only the signed column is mapped, its amount is split — negatives
        go to the negative field as an absolute value, positives to the
        positive field, and zero to the negative field as 0.0;
      * otherwise the negative/positive columns are read directly and stored
        as absolute values.

    Fully blank rows are skipped. A row is kept when any of the config's
    ``row_keep_if_any`` fields (or, if unset, any field) has a value. A row
    that is not kept but has a value in ``continuation_field`` is appended,
    space-separated, to that field of the previous record.
    """
    cfg = _ACTIVE_CONFIG
    fields = cfg.fields                       # ordered output field keys
    types = _FIELD_TYPES                      # field -> declared type name
    col_of = {name: _field_col(col_maps, name) for name in fields}

    # Optional signed/split reconciliation, declared in config (not hardcoded).
    rec_cfg = cfg.reconcile or {}
    neg_f = rec_cfg.get("negative")
    pos_f = rec_cfg.get("positive")
    sig_f = rec_cfg.get("signed")
    ci_neg = _field_col(col_maps, neg_f) if neg_f else None
    ci_pos = _field_col(col_maps, pos_f) if pos_f else None
    ci_sig = _field_col(col_maps, sig_f) if sig_f else None
    signed_only = ci_sig is not None and ci_neg is None and ci_pos is None

    keep_fields = cfg.row_keep_if_any or fields
    cont_field = cfg.continuation_field

    def cell_at(row, idx):
        """Cell ``idx`` of ``row``; None when unmapped or past the row's end."""
        if idx is None or idx >= len(row):
            return None
        return row[idx]

    def has_value(v):
        return not (v is None or v == "")

    def directional_pair(row):
        """(negative, positive) values for ``row`` under the reconcile config."""
        if not rec_cfg:
            return None, None
        if signed_only:
            amt = normalize_amount(cell_at(row, ci_sig))   # one signed column -> split
            if amt is None:
                return None, None
            if amt < 0:
                return abs(amt), None
            if amt > 0:
                return None, amt
            return 0.0, None
        debit = normalize_amount(cell_at(row, ci_neg)) if ci_neg is not None else None
        credit = normalize_amount(cell_at(row, ci_pos)) if ci_pos is not None else None
        return (
            None if debit is None else abs(debit),
            None if credit is None else abs(credit),
        )

    def convert(name, row):
        """Value of field ``name`` for ``row``, converted by declared type."""
        kind = types.get(name, "text")
        raw = cell_at(row, col_of.get(name))
        if kind in _DATE_TYPES:
            return normalize_date(raw)
        if kind in _NUMERIC_TYPES:
            num = normalize_amount(raw)                    # signed float
            if num is not None and kind in _INTEGER_TYPES and float(num).is_integer():
                num = int(num)                             # integer type -> int
            return num
        return "" if _is_blank(raw) else str(raw).strip()

    records: list[dict] = []
    for row in rows[header_idx + 1:]:
        if all(_is_blank(c) for c in row):
            continue

        neg_val, pos_val = directional_pair(row)

        rec: dict = {}
        for name in fields:
            if rec_cfg and name == neg_f:
                rec[name] = neg_val
            elif rec_cfg and name == pos_f:
                rec[name] = pos_val
            else:
                rec[name] = convert(name, row)

        if any(has_value(rec.get(name)) for name in keep_fields):
            records.append(rec)
            continue

        # Not a record of its own: fold continuation text into the row above.
        if cont_field and has_value(rec.get(cont_field)) and records:
            prev = records[-1]
            prev[cont_field] = (
                str(prev.get(cont_field, "")) + " " + str(rec[cont_field])).strip()
    return records
|
|
640
|
+
|
|
641
|
+
|
|
642
|
+
# --------------------------------------------------------------------------
|
|
643
|
+
# needs_review gate
|
|
644
|
+
# --------------------------------------------------------------------------
|
|
645
|
+
def evaluate_review(col_maps: list[ColumnMap], records: list[dict],
                    threshold: int = 80) -> tuple[bool, list[str]]:
    """Decide whether a result needs human review, and say why.

    Reasons are collected in this order:
      1. each critical field that no column maps to;
      2. each ``require_any`` group with none of its fields mapped;
      3. each mapped, non-exact column whose confidence is below ``threshold``;
      4. each mapped column resolved by the 'llm' fallback;
      5. no records extracted at all.

    Returns ``(needs_review, reasons)``; ``needs_review`` is True exactly when
    at least one reason was recorded.
    """
    mapped_fields = {m.field for m in col_maps if m.field}

    reasons: list[str] = [
        f"missing critical field: {cf}"
        for cf in CRITICAL_FIELDS
        if cf not in mapped_fields
    ]
    reasons.extend(
        f"none of {group} was mapped"
        for group in (_ACTIVE_CONFIG.require_any or [])
        if set(group).isdisjoint(mapped_fields)
    )
    reasons.extend(
        f"low-confidence column '{m.raw_header}' -> {m.field} "
        f"({m.confidence}, {m.method})"
        for m in col_maps
        if m.field and m.confidence < threshold and m.method != "exact"
    )
    # any fallback-resolved column is worth a human glance
    reasons.extend(
        f"fallback-resolved column '{m.raw_header}' -> {m.field}"
        for m in col_maps
        if m.field and m.method == "llm"
    )
    if not records:
        reasons.append("no transaction rows extracted")

    return bool(reasons), reasons
|
|
677
|
+
|
|
678
|
+
|
|
679
|
+
# --------------------------------------------------------------------------
|
|
680
|
+
# Output serializers
|
|
681
|
+
# --------------------------------------------------------------------------
|
|
682
|
+
def _records_to_xlsx_bytes(records: list[dict]) -> bytes:
    """Build a one-sheet workbook from ``records`` and return its bytes.

    The sheet is titled "Standardized". Row 1 holds the display headers from
    OUTPUT_SCHEMA; each record follows with its values in schema order
    (missing keys become empty cells). Nothing is written to disk.
    """
    import openpyxl

    keys = [fld for fld, _ in OUTPUT_SCHEMA]
    book = openpyxl.Workbook()
    sheet = book.active
    sheet.title = "Standardized"
    sheet.append([disp for _, disp in OUTPUT_SCHEMA])
    for rec in records:
        sheet.append([rec.get(k) for k in keys])

    buffer = io.BytesIO()
    book.save(buffer)
    return buffer.getvalue()
|
|
695
|
+
|
|
696
|
+
|
|
697
|
+
def records_to_csv_bytes(records: list[dict], encoding: str = "utf-8") -> bytes:
    """Serialize records to CSV bytes.

    The header row carries the display headers from OUTPUT_SCHEMA, matching
    the .xlsx writers. Each record follows with its values in schema order.
    Missing keys and ``None`` values are written as empty cells, and keys that
    are not part of the schema are ignored.

    Args:
        records: Record dicts keyed by output field.
        encoding: Text encoding for the returned bytes.

    Returns:
        The CSV document, encoded with ``encoding``.
    """
    keys = [fld for fld, _ in OUTPUT_SCHEMA]
    headers = [disp for _, disp in OUTPUT_SCHEMA]

    # newline="" keeps the csv module's own line terminators untouched.
    buffer = io.StringIO(newline="")
    writer = csv.writer(buffer)
    writer.writerow(headers)
    writer.writerows([rec.get(k) for k in keys] for rec in records)
    return buffer.getvalue().encode(encoding)
|
|
708
|
+
|
|
709
|
+
|
|
710
|
+
def _write_output(path: str, records: list[dict]) -> None:
    """Save ``records`` as an .xlsx workbook at ``path``.

    The single sheet is titled "Standardized": display headers from
    OUTPUT_SCHEMA on the first row, then one row per record in schema order
    (missing keys become empty cells).
    """
    import openpyxl

    keys = [fld for fld, _ in OUTPUT_SCHEMA]
    book = openpyxl.Workbook()
    sheet = book.active
    sheet.title = "Standardized"
    sheet.append([disp for _, disp in OUTPUT_SCHEMA])
    for rec in records:
        sheet.append([rec.get(k) for k in keys])
    book.save(path)
|
|
721
|
+
|
|
722
|
+
|
|
723
|
+
# --------------------------------------------------------------------------
|
|
724
|
+
# AI integration
|
|
725
|
+
# --------------------------------------------------------------------------
|
|
726
|
+
AI_CONFIDENCE = 85 # below exact(100), at/above the fuzzy gate so it stands on its own
|
|
727
|
+
|
|
728
|
+
|
|
729
|
+
def _has_critical_gap(col_maps: list[ColumnMap]) -> bool:
    """Report whether the mapping still lacks a field the pipeline requires.

    A gap exists when any CRITICAL_FIELDS member is unmapped, or when one of
    the active config's `require_any` groups has none of its fields mapped.
    The caller uses a True result as the signal to consult the AI matcher.
    """
    mapped = {cmap.field for cmap in col_maps if cmap.field}
    if not CRITICAL_FIELDS.issubset(mapped):
        return True
    # Each require_any group is satisfied by at least one mapped member.
    groups = _ACTIVE_CONFIG.require_any or []
    return any(not (set(group) & mapped) for group in groups)
|
|
739
|
+
|
|
740
|
+
|
|
741
|
+
def merge_ai_mapping(col_maps: list[ColumnMap], ai: dict) -> list[ColumnMap]:
    """Apply an AI-proposed {col_index: field} mapping on top of `col_maps`.

    Columns matched by the "exact" method are never touched, and the AI may
    not assign a field that an exact column already holds. Every other column
    named in `ai` (with a field in ALLOWED_FIELDS) is re-labelled with method
    "ai" and confidence AI_CONFIDENCE. Afterwards each field is left on a
    single column: the one with the highest (method priority, confidence);
    the losers get field None and method "dup".

    The ColumnMap objects are modified in place and the same list is returned.
    """
    owned_by_exact = {
        cmap.field for cmap in col_maps if cmap.method == "exact" and cmap.field
    }
    lookup = {cmap.col_index: cmap for cmap in col_maps}

    for col_idx, proposed in ai.items():
        target = lookup.get(col_idx)
        if target is None:
            continue  # the AI named a column that does not exist
        if proposed not in ALLOWED_FIELDS:
            continue
        if target.method == "exact" or proposed in owned_by_exact:
            continue  # exact matches are ground truth and keep their field
        target.field, target.method, target.confidence = (
            proposed, "ai", AI_CONFIDENCE)

    # Resolve collisions: rank by method first, then by confidence. On a tie
    # the column seen first keeps the field.
    rank = {"exact": 3, "ai": 2, "cache": 2, "fuzzy": 1}

    def _strength(cmap):
        return (rank.get(cmap.method, 0), cmap.confidence)

    winners: dict[str, ColumnMap] = {}
    for cmap in col_maps:
        if not cmap.field:
            continue
        holder = winners.get(cmap.field)
        if holder is None:
            winners[cmap.field] = cmap
        elif _strength(cmap) > _strength(holder):
            holder.field, holder.method = None, "dup"
            winners[cmap.field] = cmap
        else:
            cmap.field, cmap.method = None, "dup"
    return col_maps
|
|
777
|
+
|
|
778
|
+
|
|
779
|
+
def _schema_signature() -> str:
    """Return a 12-hex-character fingerprint of the active schema.

    The fingerprint covers the field/type pairs in _FIELD_TYPES, the sorted
    ALLOWED_FIELDS, and the active config's `reconcile` setting and synonyms.
    The mapping cache uses it as a namespace, so a config change (for example
    a new field) stops old entries for the same header from being replayed.
    Only the config's synonyms are hashed; the original note states learned
    synonyms are left out so that learning does not churn the cache.
    """
    import hashlib

    config = _ACTIVE_CONFIG
    snapshot = {
        "fields": [[name, kind] for name, kind in _FIELD_TYPES.items()],
        "allowed": sorted(ALLOWED_FIELDS),
        "reconcile": config.reconcile,
        "synonyms": {name: sorted(words) for name, words in config.synonyms.items()},
    }
    # sort_keys keeps the serialization stable; default=str covers values
    # json cannot encode natively.
    encoded = json.dumps(snapshot, sort_keys=True, default=str).encode("utf-8")
    digest = hashlib.sha1(encoded).hexdigest()
    return digest[:12]
|
|
793
|
+
|
|
794
|
+
|
|
795
|
+
# --------------------------------------------------------------------------
|
|
796
|
+
# Core runner
|
|
797
|
+
# --------------------------------------------------------------------------
|
|
798
|
+
def _run(rows: list[list], source_label: str, out_path, llm_fallback,
         table_matcher, scan_limit, threshold, cache,
         output_format: OutputFormat, learn_store=None) -> ProcessResult:
    """Run the whole pipeline on rows that have already been read.

    Steps: detect the header row, map its columns (replaying a cached mapping
    when one exists, otherwise mapping deterministically and consulting
    `table_matcher` if a critical field is still missing), extract records,
    evaluate whether review is needed, then cache / learn / write output.
    Because it takes `rows` rather than a source, it serves both the path and
    the stream entry points.

    An empty `rows` short-circuits to a result flagged needs_review with the
    reason "empty sheet".
    """
    if not rows:
        return ProcessResult(source_label, None, 0, 0.0, [], [], True,
                             ["empty sheet"], {})

    detected = detect_header_row(rows, scan_limit=scan_limit)
    header_cells = detected.cells
    body_start = detected.index + 1  # first row after the header

    # Cache entries are namespaced by the schema fingerprint.
    namespace = _schema_signature()
    mapping = None
    if cache is not None:
        mapping = cache.get(header_cells, namespace=namespace)
    replayed = mapping is not None

    if not replayed:
        mapping = map_columns(header_cells, rows[body_start:body_start + 5],
                              llm_fallback=llm_fallback, threshold=threshold)
        # A critical field is still unmapped: let the table matcher propose a
        # mapping from the header plus up to 45 rows below it.
        if table_matcher is not None and _has_critical_gap(mapping):
            proposal = table_matcher(header_cells,
                                     rows[body_start:body_start + 45],
                                     list(ALLOWED_FIELDS))
            if proposal:
                mapping = merge_ai_mapping(mapping, proposal)

    records = extract_records(rows, detected.index, mapping)
    needs_review, reasons = evaluate_review(mapping, records, threshold=threshold)

    # Persist only a freshly computed mapping that passed review, so an
    # unconfirmed guess is never replayed later as if it had been approved.
    skip_caching = cache is None or replayed or needs_review
    if not skip_caching:
        cache.put(header_cells, mapping, namespace=namespace)

    output = OutputResult(records=records, format=output_format,
                          file_path=out_path)
    result = ProcessResult(
        input_path=source_label,
        output_path=out_path,
        header_index=detected.index,
        header_score=detected.score,
        column_maps=mapping,
        records=records,
        needs_review=needs_review,
        review_reasons=reasons,
        header_breakdown=detected.breakdown,
        output=output,
    )

    # Feed this run's result to the learn store, then re-apply the store so
    # newly learned phrases take effect for the next statement. Skipped when
    # the mapping was replayed from the cache.
    if learn_store is not None and not replayed:
        from .learn import learn_from_result
        learn_from_result(result, learn_store)
        apply_learned(learn_store)

    # Backward compatibility: the "file" format with an out_path still writes
    # the workbook to disk here.
    if out_path and output_format == "file":
        _write_output(out_path, records)

    return result
|
|
861
|
+
|
|
862
|
+
|
|
863
|
+
# --------------------------------------------------------------------------
|
|
864
|
+
# Public API
|
|
865
|
+
# --------------------------------------------------------------------------
|
|
866
|
+
def process_file(
    path: str,
    out_path: Optional[str] = None,
    output_format: OutputFormat = "file",
    llm_fallback: Optional[Callable] = None,
    table_matcher: Optional[Callable] = None,
    scan_limit: int = 25,
    threshold: int = 80,
    cache: Optional["MappingCache"] = None,
    learn_store=None,
) -> ProcessResult:
    """Load the .xlsx at filesystem `path` and run the mapping pipeline on it.

    `output_format` selects how `result.output` is serialized:
      - "file"    : written to `out_path` on disk, returns the path string
      - "records" : raw Python list[dict] (the default for streams)
      - "json"    : JSON string
      - "bytes"   : in-memory .xlsx bytes
      - "base64"  : base64-encoded .xlsx bytes

    `table_matcher(header_row, data_rows, allowed_fields) -> {col_index: field}`
    is the LLM path (see ai_matcher.OpenAICompatibleMatcher). It is consulted
    only when the deterministic pass leaves a critical gap and the header was
    not served from the cache.
    """
    return _run(
        rows=_read_sheet(path),
        source_label=path,
        out_path=out_path,
        llm_fallback=llm_fallback,
        table_matcher=table_matcher,
        scan_limit=scan_limit,
        threshold=threshold,
        cache=cache,
        output_format=output_format,
        learn_store=learn_store,
    )
|
|
893
|
+
|
|
894
|
+
|
|
895
|
+
def process_stream(
    data,
    out_path: Optional[str] = None,
    output_format: OutputFormat = "records",
    llm_fallback: Optional[Callable] = None,
    table_matcher: Optional[Callable] = None,
    scan_limit: int = 25,
    threshold: int = 80,
    cache: Optional["MappingCache"] = None,
    source_label: str = "<stream>",
    learn_store=None,
) -> ProcessResult:
    """Run the mapping pipeline on an .xlsx supplied as bytes or a binary stream.

    `data` may be raw bytes/bytearray (wrapped in a BytesIO here) or an
    already-open binary file-like object, which is passed through unchanged.
    No temp file is created for reading, so a FastAPI UploadFile can be handed
    over directly as `await file.read()` (bytes) or `file.file` (a stream).

    For bank data this is the preferred entry point: the statement is parsed
    in memory. Nothing is written to disk unless the caller passes `out_path`
    together with `output_format="file"`.

    `output_format` defaults to "records" because a stream is typically
    consumed by an API that serializes its own response. `source_label` is
    the name recorded as the result's input path.
    """
    import io

    source = io.BytesIO(data) if isinstance(data, (bytes, bytearray)) else data
    return _run(
        rows=_read_sheet(source),
        source_label=source_label,
        out_path=out_path,
        llm_fallback=llm_fallback,
        table_matcher=table_matcher,
        scan_limit=scan_limit,
        threshold=threshold,
        cache=cache,
        output_format=output_format,
        learn_store=learn_store,
    )
|
|
925
|
+
|
|
926
|
+
|
|
927
|
+
# --------------------------------------------------------------------------
|
|
928
|
+
# Internal sheet reader
|
|
929
|
+
# --------------------------------------------------------------------------
|
|
930
|
+
def _read_sheet(path) -> list[list]:
    """Read every row of a workbook's active sheet into lists of cell values.

    Args:
        path: A filesystem path OR a binary file-like object (e.g. BytesIO).
            openpyxl accepts both, so uploads can be read straight from
            memory. The parameter keeps the name `path` for compatibility.

    Returns:
        One list per worksheet row, in sheet order. An empty list is returned
        when the workbook has no active sheet.

    The workbook is opened with read_only=True and data_only=True, so formula
    cells yield their stored values rather than the formula text.
    """
    import openpyxl

    wb = openpyxl.load_workbook(path, read_only=True, data_only=True)
    try:
        ws = wb.active
        if ws is None:
            return []
        return [list(r) for r in ws.iter_rows(values_only=True)]
    finally:
        # Read-only workbooks keep the source open until closed explicitly;
        # release it even if reading the rows fails.
        wb.close()
|