tabularmapper 1.0.2__tar.gz → 1.0.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {tabularmapper-1.0.2/src/tabularmapper.egg-info → tabularmapper-1.0.3}/PKG-INFO +14 -1
- {tabularmapper-1.0.2 → tabularmapper-1.0.3}/README.md +13 -0
- {tabularmapper-1.0.2 → tabularmapper-1.0.3}/pyproject.toml +1 -1
- {tabularmapper-1.0.2 → tabularmapper-1.0.3}/src/tabularmapper/__init__.py +1 -1
- {tabularmapper-1.0.2 → tabularmapper-1.0.3}/src/tabularmapper/api.py +23 -1
- {tabularmapper-1.0.2 → tabularmapper-1.0.3}/src/tabularmapper/engine.py +4 -1
- {tabularmapper-1.0.2 → tabularmapper-1.0.3}/src/tabularmapper/schema.py +1 -1
- {tabularmapper-1.0.2 → tabularmapper-1.0.3/src/tabularmapper.egg-info}/PKG-INFO +14 -1
- {tabularmapper-1.0.2 → tabularmapper-1.0.3}/tests/test_api.py +44 -0
- {tabularmapper-1.0.2 → tabularmapper-1.0.3}/LICENSE +0 -0
- {tabularmapper-1.0.2 → tabularmapper-1.0.3}/setup.cfg +0 -0
- {tabularmapper-1.0.2 → tabularmapper-1.0.3}/src/tabularmapper/ai_matcher.py +0 -0
- {tabularmapper-1.0.2 → tabularmapper-1.0.3}/src/tabularmapper/cli.py +0 -0
- {tabularmapper-1.0.2 → tabularmapper-1.0.3}/src/tabularmapper/learn.py +0 -0
- {tabularmapper-1.0.2 → tabularmapper-1.0.3}/src/tabularmapper/llm_fallback.py +0 -0
- {tabularmapper-1.0.2 → tabularmapper-1.0.3}/src/tabularmapper/mapping_cache.py +0 -0
- {tabularmapper-1.0.2 → tabularmapper-1.0.3}/src/tabularmapper/stores.py +0 -0
- {tabularmapper-1.0.2 → tabularmapper-1.0.3}/src/tabularmapper.egg-info/SOURCES.txt +0 -0
- {tabularmapper-1.0.2 → tabularmapper-1.0.3}/src/tabularmapper.egg-info/dependency_links.txt +0 -0
- {tabularmapper-1.0.2 → tabularmapper-1.0.3}/src/tabularmapper.egg-info/entry_points.txt +0 -0
- {tabularmapper-1.0.2 → tabularmapper-1.0.3}/src/tabularmapper.egg-info/requires.txt +0 -0
- {tabularmapper-1.0.2 → tabularmapper-1.0.3}/src/tabularmapper.egg-info/top_level.txt +0 -0
- {tabularmapper-1.0.2 → tabularmapper-1.0.3}/tests/test_learn.py +0 -0
- {tabularmapper-1.0.2 → tabularmapper-1.0.3}/tests/test_mapper.py +0 -0
- {tabularmapper-1.0.2 → tabularmapper-1.0.3}/tests/test_schema.py +0 -0
- {tabularmapper-1.0.2 → tabularmapper-1.0.3}/tests/test_stores.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: tabularmapper
|
|
3
|
-
Version: 1.0.2
|
|
3
|
+
Version: 1.0.3
|
|
4
4
|
Summary: Map any spreadsheet (.xlsx) to a schema you define — deterministic column mapping with an optional AI matcher
|
|
5
5
|
Author-email: Karthikeyan Duraisamy <karthikeyanduraisamy@kultivateindia.com>
|
|
6
6
|
License-Expression: MIT
|
|
@@ -165,6 +165,7 @@ All are optional; sensible defaults apply.
|
|
|
165
165
|
| `TABULARMAPPER_LEARN_STORE` | `memory://` (no files) | where self-learned header synonyms live |
|
|
166
166
|
| `TABULARMAPPER_CONFIG` | *(none — required)* | output template + synonyms JSON (file / `https://` / `s3://`) |
|
|
167
167
|
| `TABULARMAPPER_ROUTE_PREFIX` | `/mapper` | FastAPI router path prefix |
|
|
168
|
+
| `TABULARMAPPER_THRESHOLD` | `80` | fuzzy-accept gate (0–100); raise it to push borderline fuzzy matches to the AI matcher |
|
|
168
169
|
| `OPENAI_API_KEY` | *(unset → AI off)* | enables the AI column matcher |
|
|
169
170
|
| `OPENAI_BASE_URL` | `https://api.openai.com/v1` | any OpenAI-compatible endpoint |
|
|
170
171
|
| `OPENAI_MODEL` | `gpt-4o-mini` | model name |
|
|
@@ -280,6 +281,18 @@ app.include_router(router)
|
|
|
280
281
|
blocking work in a threadpool. Store the original file to S3 in your own endpoint
|
|
281
282
|
if you need it — the mapper stays out of AWS.
|
|
282
283
|
|
|
284
|
+
Two query params shape the request:
|
|
285
|
+
|
|
286
|
+
```bash
|
|
287
|
+
curl -F file=@f.xlsx "http://localhost:8000/mapper/map?format=base64" # json + a mapped .xlsx in file_base64
|
|
288
|
+
curl -F file=@f.xlsx "http://localhost:8000/mapper/map?format=file" -OJ # download the mapped .xlsx
|
|
289
|
+
curl -F file=@f.xlsx "http://localhost:8000/mapper/map?threshold=90" # stricter fuzzy gate for this call
|
|
290
|
+
```
|
|
291
|
+
|
|
292
|
+
`format` is `json` (default) / `base64` / `file`. `threshold` (0–100) overrides
|
|
293
|
+
`TABULARMAPPER_THRESHOLD` for one request — raise it to send borderline fuzzy
|
|
294
|
+
matches to the AI matcher instead of trusting them.
|
|
295
|
+
|
|
283
296
|
The `/mapper` prefix is configurable (this is a general table→schema mapper, not
|
|
284
297
|
just banks): set `TABULARMAPPER_ROUTE_PREFIX`, or build the router yourself:
|
|
285
298
|
|
|
@@ -125,6 +125,7 @@ All are optional; sensible defaults apply.
|
|
|
125
125
|
| `TABULARMAPPER_LEARN_STORE` | `memory://` (no files) | where self-learned header synonyms live |
|
|
126
126
|
| `TABULARMAPPER_CONFIG` | *(none — required)* | output template + synonyms JSON (file / `https://` / `s3://`) |
|
|
127
127
|
| `TABULARMAPPER_ROUTE_PREFIX` | `/mapper` | FastAPI router path prefix |
|
|
128
|
+
| `TABULARMAPPER_THRESHOLD` | `80` | fuzzy-accept gate (0–100); raise it to push borderline fuzzy matches to the AI matcher |
|
|
128
129
|
| `OPENAI_API_KEY` | *(unset → AI off)* | enables the AI column matcher |
|
|
129
130
|
| `OPENAI_BASE_URL` | `https://api.openai.com/v1` | any OpenAI-compatible endpoint |
|
|
130
131
|
| `OPENAI_MODEL` | `gpt-4o-mini` | model name |
|
|
@@ -240,6 +241,18 @@ app.include_router(router)
|
|
|
240
241
|
blocking work in a threadpool. Store the original file to S3 in your own endpoint
|
|
241
242
|
if you need it — the mapper stays out of AWS.
|
|
242
243
|
|
|
244
|
+
Two query params shape the request:
|
|
245
|
+
|
|
246
|
+
```bash
|
|
247
|
+
curl -F file=@f.xlsx "http://localhost:8000/mapper/map?format=base64" # json + a mapped .xlsx in file_base64
|
|
248
|
+
curl -F file=@f.xlsx "http://localhost:8000/mapper/map?format=file" -OJ # download the mapped .xlsx
|
|
249
|
+
curl -F file=@f.xlsx "http://localhost:8000/mapper/map?threshold=90" # stricter fuzzy gate for this call
|
|
250
|
+
```
|
|
251
|
+
|
|
252
|
+
`format` is `json` (default) / `base64` / `file`. `threshold` (0–100) overrides
|
|
253
|
+
`TABULARMAPPER_THRESHOLD` for one request — raise it to send borderline fuzzy
|
|
254
|
+
matches to the AI matcher instead of trusting them.
|
|
255
|
+
|
|
243
256
|
The `/mapper` prefix is configurable (this is a general table→schema mapper, not
|
|
244
257
|
just banks): set `TABULARMAPPER_ROUTE_PREFIX`, or build the router yourself:
|
|
245
258
|
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "tabularmapper"
|
|
7
|
-
version = "1.0.2"
|
|
7
|
+
version = "1.0.3"
|
|
8
8
|
description = "Map any spreadsheet (.xlsx) to a schema you define — deterministic column mapping with an optional AI matcher"
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
requires-python = ">=3.9"
|
|
@@ -45,6 +45,17 @@ from .mapping_cache import MappingCache
|
|
|
45
45
|
_XLSX_MIME = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
|
|
46
46
|
|
|
47
47
|
|
|
48
|
+
def _default_threshold() -> int:
|
|
49
|
+
"""The fuzzy-accept gate (0-100). Below this, a column is left unmapped and,
|
|
50
|
+
if it's a critical field, the AI matcher is asked to fill it. Raise it to
|
|
51
|
+
push borderline fuzzy matches to the AI instead of trusting them. Read from
|
|
52
|
+
TABULARMAPPER_THRESHOLD at request time; falls back to 80."""
|
|
53
|
+
try:
|
|
54
|
+
return max(0, min(100, int(os.getenv("TABULARMAPPER_THRESHOLD", "80"))))
|
|
55
|
+
except (TypeError, ValueError):
|
|
56
|
+
return 80
|
|
57
|
+
|
|
58
|
+
|
|
48
59
|
class OutFormat(str, Enum):
|
|
49
60
|
"""Response shape for POST /map — rendered as a dropdown in the docs."""
|
|
50
61
|
json = "json" # rows inline (default)
|
|
@@ -136,6 +147,13 @@ async def map_statement(
|
|
|
136
147
|
".xlsx encoded in file_base64; file = download the .xlsx "
|
|
137
148
|
"directly (binary, no JSON body).",
|
|
138
149
|
),
|
|
150
|
+
threshold: Optional[int] = Query(
|
|
151
|
+
None,
|
|
152
|
+
ge=0, le=100,
|
|
153
|
+
description="Fuzzy-accept gate 0-100. Overrides TABULARMAPPER_THRESHOLD "
|
|
154
|
+
"(default 80) for this request. Raise it to send borderline "
|
|
155
|
+
"fuzzy matches to the AI matcher instead of trusting them.",
|
|
156
|
+
),
|
|
139
157
|
):
|
|
140
158
|
"""Upload a spreadsheet (.xlsx); get the standardized mapping + rows.
|
|
141
159
|
|
|
@@ -143,18 +161,22 @@ async def map_statement(
|
|
|
143
161
|
* json -> MapResponse with the rows in `transactions`
|
|
144
162
|
* base64 -> same MapResponse, plus a mapped .xlsx in `file_base64`
|
|
145
163
|
* file -> the mapped .xlsx as a downloadable attachment
|
|
164
|
+
|
|
165
|
+
`threshold` (query) overrides the fuzzy gate for this one call; otherwise the
|
|
166
|
+
server default (TABULARMAPPER_THRESHOLD, else 80) is used.
|
|
146
167
|
"""
|
|
147
168
|
name = (file.filename or "").lower()
|
|
148
169
|
if not name.endswith((".xlsx", ".xls")):
|
|
149
170
|
raise HTTPException(status_code=400, detail="expected an .xlsx/.xls file")
|
|
150
171
|
|
|
172
|
+
gate = threshold if threshold is not None else _default_threshold()
|
|
151
173
|
data = await file.read() # raw bytes, parsed in memory (never hits disk)
|
|
152
174
|
try:
|
|
153
175
|
# blocking work -> threadpool; process_stream reads straight from bytes
|
|
154
176
|
res = await run_in_threadpool(
|
|
155
177
|
process_stream, data,
|
|
156
178
|
table_matcher=state.matcher, cache=state.cache,
|
|
157
|
-
learn_store=state.learn,
|
|
179
|
+
learn_store=state.learn, threshold=gate,
|
|
158
180
|
source_label=file.filename or "<upload>",
|
|
159
181
|
)
|
|
160
182
|
except Exception as exc: # noqa: BLE001
|
|
@@ -810,7 +810,10 @@ def _run(rows: list[list], source_label: str, out_path, llm_fallback,
|
|
|
810
810
|
|
|
811
811
|
from_cache = False
|
|
812
812
|
col_maps = None
|
|
813
|
-
|
|
813
|
+
# Scope the cache to the active schema AND the fuzzy gate: a different
|
|
814
|
+
# threshold can change which columns map, so it must not reuse a mapping
|
|
815
|
+
# computed at another threshold.
|
|
816
|
+
schema_sig = f"{_schema_signature()}:t{threshold}"
|
|
814
817
|
if cache is not None:
|
|
815
818
|
cached = cache.get(header, namespace=schema_sig)
|
|
816
819
|
if cached is not None:
|
|
@@ -198,7 +198,7 @@ def _infer_type(field_key: str) -> str:
|
|
|
198
198
|
def default_config() -> Config:
|
|
199
199
|
"""The built-in default: EMPTY. This is a general mapper, so with no config
|
|
200
200
|
it maps nothing — you must provide an output_schema + synonyms (a file/URL via
|
|
201
|
-
|
|
201
|
+
TABULARMAPPER_CONFIG, a dict, or configure()). Use `bank_preset()` for the
|
|
202
202
|
ready-made bank-statement schema."""
|
|
203
203
|
return Config(output_schema=[], synonyms={}, critical_fields=[])
|
|
204
204
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: tabularmapper
|
|
3
|
-
Version: 1.0.2
|
|
3
|
+
Version: 1.0.3
|
|
4
4
|
Summary: Map any spreadsheet (.xlsx) to a schema you define — deterministic column mapping with an optional AI matcher
|
|
5
5
|
Author-email: Karthikeyan Duraisamy <karthikeyanduraisamy@kultivateindia.com>
|
|
6
6
|
License-Expression: MIT
|
|
@@ -165,6 +165,7 @@ All are optional; sensible defaults apply.
|
|
|
165
165
|
| `TABULARMAPPER_LEARN_STORE` | `memory://` (no files) | where self-learned header synonyms live |
|
|
166
166
|
| `TABULARMAPPER_CONFIG` | *(none — required)* | output template + synonyms JSON (file / `https://` / `s3://`) |
|
|
167
167
|
| `TABULARMAPPER_ROUTE_PREFIX` | `/mapper` | FastAPI router path prefix |
|
|
168
|
+
| `TABULARMAPPER_THRESHOLD` | `80` | fuzzy-accept gate (0–100); raise it to push borderline fuzzy matches to the AI matcher |
|
|
168
169
|
| `OPENAI_API_KEY` | *(unset → AI off)* | enables the AI column matcher |
|
|
169
170
|
| `OPENAI_BASE_URL` | `https://api.openai.com/v1` | any OpenAI-compatible endpoint |
|
|
170
171
|
| `OPENAI_MODEL` | `gpt-4o-mini` | model name |
|
|
@@ -280,6 +281,18 @@ app.include_router(router)
|
|
|
280
281
|
blocking work in a threadpool. Store the original file to S3 in your own endpoint
|
|
281
282
|
if you need it — the mapper stays out of AWS.
|
|
282
283
|
|
|
284
|
+
Two query params shape the request:
|
|
285
|
+
|
|
286
|
+
```bash
|
|
287
|
+
curl -F file=@f.xlsx "http://localhost:8000/mapper/map?format=base64" # json + a mapped .xlsx in file_base64
|
|
288
|
+
curl -F file=@f.xlsx "http://localhost:8000/mapper/map?format=file" -OJ # download the mapped .xlsx
|
|
289
|
+
curl -F file=@f.xlsx "http://localhost:8000/mapper/map?threshold=90" # stricter fuzzy gate for this call
|
|
290
|
+
```
|
|
291
|
+
|
|
292
|
+
`format` is `json` (default) / `base64` / `file`. `threshold` (0–100) overrides
|
|
293
|
+
`TABULARMAPPER_THRESHOLD` for one request — raise it to send borderline fuzzy
|
|
294
|
+
matches to the AI matcher instead of trusting them.
|
|
295
|
+
|
|
283
296
|
The `/mapper` prefix is configurable (this is a general table→schema mapper, not
|
|
284
297
|
just banks): set `TABULARMAPPER_ROUTE_PREFIX`, or build the router yourself:
|
|
285
298
|
|
|
@@ -107,6 +107,50 @@ def test_map_rejects_non_xlsx(client):
|
|
|
107
107
|
assert r.status_code == 400
|
|
108
108
|
|
|
109
109
|
|
|
110
|
+
def _fuzzy_xlsx_bytes():
|
|
111
|
+
"""A tiny bank sheet whose 'Descriptn' header only fuzzy-matches (score 90)."""
|
|
112
|
+
from openpyxl import Workbook
|
|
113
|
+
wb = Workbook(); ws = wb.active
|
|
114
|
+
ws.append(["Date", "Descriptn", "Debit", "Credit"])
|
|
115
|
+
ws.append(["01-06-2026", "Coffee", "150", ""])
|
|
116
|
+
ws.append(["02-06-2026", "Salary", "", "45000"])
|
|
117
|
+
buf = io.BytesIO(); wb.save(buf); return buf.getvalue()
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
def test_map_threshold_query_changes_mapping(client):
|
|
121
|
+
payload = _fuzzy_xlsx_bytes()
|
|
122
|
+
|
|
123
|
+
# default gate (80): 'Descriptn' (score 90) is accepted as fuzzy
|
|
124
|
+
r = client.post("/mapper/map", files={"file": ("s.xlsx", io.BytesIO(payload))})
|
|
125
|
+
cols = {c["raw_header"]: c for c in r.json()["columns"]}
|
|
126
|
+
assert cols["Descriptn"]["field"] == "description"
|
|
127
|
+
assert cols["Descriptn"]["method"] == "fuzzy"
|
|
128
|
+
|
|
129
|
+
# raise the gate above 90: the same column now falls through -> unmapped
|
|
130
|
+
r = client.post("/mapper/map", params={"threshold": 95},
|
|
131
|
+
files={"file": ("s.xlsx", io.BytesIO(payload))})
|
|
132
|
+
cols = {c["raw_header"]: c for c in r.json()["columns"]}
|
|
133
|
+
assert cols["Descriptn"]["field"] is None
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
def test_map_threshold_out_of_range(client):
|
|
137
|
+
payload = _fuzzy_xlsx_bytes()
|
|
138
|
+
for bad in (150, -1):
|
|
139
|
+
r = client.post("/mapper/map", params={"threshold": bad},
|
|
140
|
+
files={"file": ("s.xlsx", io.BytesIO(payload))})
|
|
141
|
+
assert r.status_code == 422 # ge=0 / le=100 validation
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
def test_default_threshold_reads_env(monkeypatch):
|
|
145
|
+
import tabularmapper.api as api
|
|
146
|
+
monkeypatch.setenv("TABULARMAPPER_THRESHOLD", "90")
|
|
147
|
+
assert api._default_threshold() == 90
|
|
148
|
+
monkeypatch.setenv("TABULARMAPPER_THRESHOLD", "banana") # invalid -> falls back
|
|
149
|
+
assert api._default_threshold() == 80
|
|
150
|
+
monkeypatch.delenv("TABULARMAPPER_THRESHOLD", raising=False)
|
|
151
|
+
assert api._default_threshold() == 80
|
|
152
|
+
|
|
153
|
+
|
|
110
154
|
def test_router_prefix_default_and_custom():
|
|
111
155
|
import tabularmapper.api as api
|
|
112
156
|
assert {r.path for r in api.router.routes} == {
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|