tabularmapper 1.0.1__tar.gz → 1.0.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (26)
  1. {tabularmapper-1.0.1/src/tabularmapper.egg-info → tabularmapper-1.0.3}/PKG-INFO +16 -1
  2. {tabularmapper-1.0.1 → tabularmapper-1.0.3}/README.md +13 -0
  3. {tabularmapper-1.0.1 → tabularmapper-1.0.3}/pyproject.toml +3 -1
  4. {tabularmapper-1.0.1 → tabularmapper-1.0.3}/src/tabularmapper/__init__.py +1 -1
  5. {tabularmapper-1.0.1 → tabularmapper-1.0.3}/src/tabularmapper/api.py +33 -4
  6. {tabularmapper-1.0.1 → tabularmapper-1.0.3}/src/tabularmapper/engine.py +4 -1
  7. {tabularmapper-1.0.1 → tabularmapper-1.0.3}/src/tabularmapper/schema.py +1 -1
  8. {tabularmapper-1.0.1 → tabularmapper-1.0.3}/src/tabularmapper/stores.py +2 -2
  9. {tabularmapper-1.0.1 → tabularmapper-1.0.3/src/tabularmapper.egg-info}/PKG-INFO +16 -1
  10. {tabularmapper-1.0.1 → tabularmapper-1.0.3}/tests/test_api.py +44 -0
  11. {tabularmapper-1.0.1 → tabularmapper-1.0.3}/LICENSE +0 -0
  12. {tabularmapper-1.0.1 → tabularmapper-1.0.3}/setup.cfg +0 -0
  13. {tabularmapper-1.0.1 → tabularmapper-1.0.3}/src/tabularmapper/ai_matcher.py +0 -0
  14. {tabularmapper-1.0.1 → tabularmapper-1.0.3}/src/tabularmapper/cli.py +0 -0
  15. {tabularmapper-1.0.1 → tabularmapper-1.0.3}/src/tabularmapper/learn.py +0 -0
  16. {tabularmapper-1.0.1 → tabularmapper-1.0.3}/src/tabularmapper/llm_fallback.py +0 -0
  17. {tabularmapper-1.0.1 → tabularmapper-1.0.3}/src/tabularmapper/mapping_cache.py +0 -0
  18. {tabularmapper-1.0.1 → tabularmapper-1.0.3}/src/tabularmapper.egg-info/SOURCES.txt +0 -0
  19. {tabularmapper-1.0.1 → tabularmapper-1.0.3}/src/tabularmapper.egg-info/dependency_links.txt +0 -0
  20. {tabularmapper-1.0.1 → tabularmapper-1.0.3}/src/tabularmapper.egg-info/entry_points.txt +0 -0
  21. {tabularmapper-1.0.1 → tabularmapper-1.0.3}/src/tabularmapper.egg-info/requires.txt +0 -0
  22. {tabularmapper-1.0.1 → tabularmapper-1.0.3}/src/tabularmapper.egg-info/top_level.txt +0 -0
  23. {tabularmapper-1.0.1 → tabularmapper-1.0.3}/tests/test_learn.py +0 -0
  24. {tabularmapper-1.0.1 → tabularmapper-1.0.3}/tests/test_mapper.py +0 -0
  25. {tabularmapper-1.0.1 → tabularmapper-1.0.3}/tests/test_schema.py +0 -0
  26. {tabularmapper-1.0.1 → tabularmapper-1.0.3}/tests/test_stores.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: tabularmapper
3
- Version: 1.0.1
3
+ Version: 1.0.3
4
4
  Summary: Map any spreadsheet (.xlsx) to a schema you define — deterministic column mapping with an optional AI matcher
5
5
  Author-email: Karthikeyan Duraisamy <karthikeyanduraisamy@kultivateindia.com>
6
6
  License-Expression: MIT
@@ -15,6 +15,8 @@ Classifier: Programming Language :: Python :: 3.9
15
15
  Classifier: Programming Language :: Python :: 3.10
16
16
  Classifier: Programming Language :: Python :: 3.11
17
17
  Classifier: Programming Language :: Python :: 3.12
18
+ Classifier: Programming Language :: Python :: 3.13
19
+ Classifier: Programming Language :: Python :: 3.14
18
20
  Classifier: Topic :: Office/Business
19
21
  Classifier: Topic :: Software Development :: Libraries :: Python Modules
20
22
  Requires-Python: >=3.9
@@ -163,6 +165,7 @@ All are optional; sensible defaults apply.
163
165
  | `TABULARMAPPER_LEARN_STORE` | `memory://` (no files) | where self-learned header synonyms live |
164
166
  | `TABULARMAPPER_CONFIG` | *(none — required)* | output template + synonyms JSON (file / `https://` / `s3://`) |
165
167
  | `TABULARMAPPER_ROUTE_PREFIX` | `/mapper` | FastAPI router path prefix |
168
+ | `TABULARMAPPER_THRESHOLD` | `80` | fuzzy-accept gate (0–100); raise it to push borderline fuzzy matches to the AI matcher |
166
169
  | `OPENAI_API_KEY` | *(unset → AI off)* | enables the AI column matcher |
167
170
  | `OPENAI_BASE_URL` | `https://api.openai.com/v1` | any OpenAI-compatible endpoint |
168
171
  | `OPENAI_MODEL` | `gpt-4o-mini` | model name |
@@ -278,6 +281,18 @@ app.include_router(router)
278
281
  blocking work in a threadpool. Store the original file to S3 in your own endpoint
279
282
  if you need it — the mapper stays out of AWS.
280
283
 
284
+ Two query params shape the request:
285
+
286
+ ```bash
287
+ curl -F file=@f.xlsx "http://localhost:8000/mapper/map?format=base64" # json + a mapped .xlsx in file_base64
288
+ curl -F file=@f.xlsx "http://localhost:8000/mapper/map?format=file" -OJ # download the mapped .xlsx
289
+ curl -F file=@f.xlsx "http://localhost:8000/mapper/map?threshold=90" # stricter fuzzy gate for this call
290
+ ```
291
+
292
+ `format` is `json` (default) / `base64` / `file`. `threshold` (0–100) overrides
293
+ `TABULARMAPPER_THRESHOLD` for one request — raise it to send borderline fuzzy
294
+ matches to the AI matcher instead of trusting them.
295
+
281
296
  The `/mapper` prefix is configurable (this is a general table→schema mapper, not
282
297
  just banks): set `TABULARMAPPER_ROUTE_PREFIX`, or build the router yourself:
283
298
 
@@ -125,6 +125,7 @@ All are optional; sensible defaults apply.
125
125
  | `TABULARMAPPER_LEARN_STORE` | `memory://` (no files) | where self-learned header synonyms live |
126
126
  | `TABULARMAPPER_CONFIG` | *(none — required)* | output template + synonyms JSON (file / `https://` / `s3://`) |
127
127
  | `TABULARMAPPER_ROUTE_PREFIX` | `/mapper` | FastAPI router path prefix |
128
+ | `TABULARMAPPER_THRESHOLD` | `80` | fuzzy-accept gate (0–100); raise it to push borderline fuzzy matches to the AI matcher |
128
129
  | `OPENAI_API_KEY` | *(unset → AI off)* | enables the AI column matcher |
129
130
  | `OPENAI_BASE_URL` | `https://api.openai.com/v1` | any OpenAI-compatible endpoint |
130
131
  | `OPENAI_MODEL` | `gpt-4o-mini` | model name |
@@ -240,6 +241,18 @@ app.include_router(router)
240
241
  blocking work in a threadpool. Store the original file to S3 in your own endpoint
241
242
  if you need it — the mapper stays out of AWS.
242
243
 
244
+ Two query params shape the request:
245
+
246
+ ```bash
247
+ curl -F file=@f.xlsx "http://localhost:8000/mapper/map?format=base64" # json + a mapped .xlsx in file_base64
248
+ curl -F file=@f.xlsx "http://localhost:8000/mapper/map?format=file" -OJ # download the mapped .xlsx
249
+ curl -F file=@f.xlsx "http://localhost:8000/mapper/map?threshold=90" # stricter fuzzy gate for this call
250
+ ```
251
+
252
+ `format` is `json` (default) / `base64` / `file`. `threshold` (0–100) overrides
253
+ `TABULARMAPPER_THRESHOLD` for one request — raise it to send borderline fuzzy
254
+ matches to the AI matcher instead of trusting them.
255
+
243
256
  The `/mapper` prefix is configurable (this is a general table→schema mapper, not
244
257
  just banks): set `TABULARMAPPER_ROUTE_PREFIX`, or build the router yourself:
245
258
 
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "tabularmapper"
7
- version = "1.0.1"
7
+ version = "1.0.3"
8
8
  description = "Map any spreadsheet (.xlsx) to a schema you define — deterministic column mapping with an optional AI matcher"
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.9"
@@ -22,6 +22,8 @@ classifiers = [
22
22
  "Programming Language :: Python :: 3.10",
23
23
  "Programming Language :: Python :: 3.11",
24
24
  "Programming Language :: Python :: 3.12",
25
+ "Programming Language :: Python :: 3.13",
26
+ "Programming Language :: Python :: 3.14",
25
27
  "Topic :: Office/Business",
26
28
  "Topic :: Software Development :: Libraries :: Python Modules",
27
29
  ]
@@ -44,7 +44,7 @@ from .schema import (
44
44
  )
45
45
  from .stores import open_store
46
46
 
47
- __version__ = "1.0.1"
47
+ __version__ = "1.0.3"
48
48
 
49
49
  __all__ = [
50
50
  "process_file",
@@ -30,6 +30,7 @@ from __future__ import annotations
30
30
 
31
31
  import os
32
32
  from contextlib import asynccontextmanager
33
+ from enum import Enum
33
34
  from typing import Any, Optional
34
35
 
35
36
  from fastapi import APIRouter, FastAPI, File, HTTPException, Query, UploadFile
@@ -44,6 +45,24 @@ from .mapping_cache import MappingCache
44
45
  _XLSX_MIME = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
45
46
 
46
47
 
48
+ def _default_threshold() -> int:
49
+ """The fuzzy-accept gate (0-100). Below this, a column is left unmapped and,
50
+ if it's a critical field, the AI matcher is asked to fill it. Raise it to
51
+ push borderline fuzzy matches to the AI instead of trusting them. Read from
52
+ TABULARMAPPER_THRESHOLD at request time; falls back to 80."""
53
+ try:
54
+ return max(0, min(100, int(os.getenv("TABULARMAPPER_THRESHOLD", "80"))))
55
+ except (TypeError, ValueError):
56
+ return 80
57
+
58
+
59
+ class OutFormat(str, Enum):
60
+ """Response shape for POST /map — rendered as a dropdown in the docs."""
61
+ json = "json" # rows inline (default)
62
+ base64 = "base64" # rows inline + a mapped .xlsx in file_base64
63
+ file = "file" # download the .xlsx directly (binary, no JSON body)
64
+
65
+
47
66
  # --------------------------------------------------------------------------
48
67
  # Shared singletons (built once at startup)
49
68
  # --------------------------------------------------------------------------
@@ -122,13 +141,19 @@ async def health() -> dict:
122
141
 
123
142
  async def map_statement(
124
143
  file: UploadFile = File(...),
125
- format: str = Query(
126
- "json",
127
- pattern="^(json|base64|file)$",
144
+ format: OutFormat = Query(
145
+ OutFormat.json,
128
146
  description="json = rows inline (default); base64 = rows inline + an "
129
147
  ".xlsx encoded in file_base64; file = download the .xlsx "
130
148
  "directly (binary, no JSON body).",
131
149
  ),
150
+ threshold: Optional[int] = Query(
151
+ None,
152
+ ge=0, le=100,
153
+ description="Fuzzy-accept gate 0-100. Overrides TABULARMAPPER_THRESHOLD "
154
+ "(default 80) for this request. Raise it to send borderline "
155
+ "fuzzy matches to the AI matcher instead of trusting them.",
156
+ ),
132
157
  ):
133
158
  """Upload a spreadsheet (.xlsx); get the standardized mapping + rows.
134
159
 
@@ -136,18 +161,22 @@ async def map_statement(
136
161
  * json -> MapResponse with the rows in `transactions`
137
162
  * base64 -> same MapResponse, plus a mapped .xlsx in `file_base64`
138
163
  * file -> the mapped .xlsx as a downloadable attachment
164
+
165
+ `threshold` (query) overrides the fuzzy gate for this one call; otherwise the
166
+ server default (TABULARMAPPER_THRESHOLD, else 80) is used.
139
167
  """
140
168
  name = (file.filename or "").lower()
141
169
  if not name.endswith((".xlsx", ".xls")):
142
170
  raise HTTPException(status_code=400, detail="expected an .xlsx/.xls file")
143
171
 
172
+ gate = threshold if threshold is not None else _default_threshold()
144
173
  data = await file.read() # raw bytes, parsed in memory (never hits disk)
145
174
  try:
146
175
  # blocking work -> threadpool; process_stream reads straight from bytes
147
176
  res = await run_in_threadpool(
148
177
  process_stream, data,
149
178
  table_matcher=state.matcher, cache=state.cache,
150
- learn_store=state.learn,
179
+ learn_store=state.learn, threshold=gate,
151
180
  source_label=file.filename or "<upload>",
152
181
  )
153
182
  except Exception as exc: # noqa: BLE001
@@ -810,7 +810,10 @@ def _run(rows: list[list], source_label: str, out_path, llm_fallback,
810
810
 
811
811
  from_cache = False
812
812
  col_maps = None
813
- schema_sig = _schema_signature() # scope the cache to the active schema
813
+ # Scope the cache to the active schema AND the fuzzy gate: a different
814
+ # threshold can change which columns map, so it must not reuse a mapping
815
+ # computed at another threshold.
816
+ schema_sig = f"{_schema_signature()}:t{threshold}"
814
817
  if cache is not None:
815
818
  cached = cache.get(header, namespace=schema_sig)
816
819
  if cached is not None:
@@ -198,7 +198,7 @@ def _infer_type(field_key: str) -> str:
198
198
  def default_config() -> Config:
199
199
  """The built-in default: EMPTY. This is a general mapper, so with no config
200
200
  it maps nothing — you must provide an output_schema + synonyms (a file/URL via
201
- BANK_MAPPER_CONFIG, a dict, or configure()). Use `bank_preset()` for the
201
+ TABULARMAPPER_CONFIG, a dict, or configure()). Use `bank_preset()` for the
202
202
  ready-made bank-statement schema."""
203
203
  return Config(output_schema=[], synonyms={}, critical_fields=[])
204
204
 
@@ -145,7 +145,7 @@ def _redis_proto_client(url: str, prefer: str = "redis"):
145
145
  return mod.from_url(u) # module-level from_url (both expose it)
146
146
  raise ImportError(
147
147
  "This cache backend needs the 'valkey' or 'redis' package. Install one "
148
- "with: pip install bank-statement-mapper[valkey] (or [redis]). Both "
148
+ "with: pip install tabularmapper[valkey] (or [redis]). Both "
149
149
  "are optional — the default SQLite backend needs nothing extra."
150
150
  ) from last_err
151
151
 
@@ -188,7 +188,7 @@ class PostgresStore:
188
188
  except ImportError as exc:
189
189
  raise ImportError(
190
190
  "The postgres cache backend needs the 'psycopg' package. Install "
191
- "it with: pip install bank-statement-mapper[postgres]. It is "
191
+ "it with: pip install tabularmapper[postgres]. It is "
192
192
  "optional — the default SQLite backend needs nothing extra."
193
193
  ) from exc
194
194
  self._table = table
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: tabularmapper
3
- Version: 1.0.1
3
+ Version: 1.0.3
4
4
  Summary: Map any spreadsheet (.xlsx) to a schema you define — deterministic column mapping with an optional AI matcher
5
5
  Author-email: Karthikeyan Duraisamy <karthikeyanduraisamy@kultivateindia.com>
6
6
  License-Expression: MIT
@@ -15,6 +15,8 @@ Classifier: Programming Language :: Python :: 3.9
15
15
  Classifier: Programming Language :: Python :: 3.10
16
16
  Classifier: Programming Language :: Python :: 3.11
17
17
  Classifier: Programming Language :: Python :: 3.12
18
+ Classifier: Programming Language :: Python :: 3.13
19
+ Classifier: Programming Language :: Python :: 3.14
18
20
  Classifier: Topic :: Office/Business
19
21
  Classifier: Topic :: Software Development :: Libraries :: Python Modules
20
22
  Requires-Python: >=3.9
@@ -163,6 +165,7 @@ All are optional; sensible defaults apply.
163
165
  | `TABULARMAPPER_LEARN_STORE` | `memory://` (no files) | where self-learned header synonyms live |
164
166
  | `TABULARMAPPER_CONFIG` | *(none — required)* | output template + synonyms JSON (file / `https://` / `s3://`) |
165
167
  | `TABULARMAPPER_ROUTE_PREFIX` | `/mapper` | FastAPI router path prefix |
168
+ | `TABULARMAPPER_THRESHOLD` | `80` | fuzzy-accept gate (0–100); raise it to push borderline fuzzy matches to the AI matcher |
166
169
  | `OPENAI_API_KEY` | *(unset → AI off)* | enables the AI column matcher |
167
170
  | `OPENAI_BASE_URL` | `https://api.openai.com/v1` | any OpenAI-compatible endpoint |
168
171
  | `OPENAI_MODEL` | `gpt-4o-mini` | model name |
@@ -278,6 +281,18 @@ app.include_router(router)
278
281
  blocking work in a threadpool. Store the original file to S3 in your own endpoint
279
282
  if you need it — the mapper stays out of AWS.
280
283
 
284
+ Two query params shape the request:
285
+
286
+ ```bash
287
+ curl -F file=@f.xlsx "http://localhost:8000/mapper/map?format=base64" # json + a mapped .xlsx in file_base64
288
+ curl -F file=@f.xlsx "http://localhost:8000/mapper/map?format=file" -OJ # download the mapped .xlsx
289
+ curl -F file=@f.xlsx "http://localhost:8000/mapper/map?threshold=90" # stricter fuzzy gate for this call
290
+ ```
291
+
292
+ `format` is `json` (default) / `base64` / `file`. `threshold` (0–100) overrides
293
+ `TABULARMAPPER_THRESHOLD` for one request — raise it to send borderline fuzzy
294
+ matches to the AI matcher instead of trusting them.
295
+
281
296
  The `/mapper` prefix is configurable (this is a general table→schema mapper, not
282
297
  just banks): set `TABULARMAPPER_ROUTE_PREFIX`, or build the router yourself:
283
298
 
@@ -107,6 +107,50 @@ def test_map_rejects_non_xlsx(client):
107
107
  assert r.status_code == 400
108
108
 
109
109
 
110
+ def _fuzzy_xlsx_bytes():
111
+ """A tiny bank sheet whose 'Descriptn' header only fuzzy-matches (score 90)."""
112
+ from openpyxl import Workbook
113
+ wb = Workbook(); ws = wb.active
114
+ ws.append(["Date", "Descriptn", "Debit", "Credit"])
115
+ ws.append(["01-06-2026", "Coffee", "150", ""])
116
+ ws.append(["02-06-2026", "Salary", "", "45000"])
117
+ buf = io.BytesIO(); wb.save(buf); return buf.getvalue()
118
+
119
+
120
+ def test_map_threshold_query_changes_mapping(client):
121
+ payload = _fuzzy_xlsx_bytes()
122
+
123
+ # default gate (80): 'Descriptn' (score 90) is accepted as fuzzy
124
+ r = client.post("/mapper/map", files={"file": ("s.xlsx", io.BytesIO(payload))})
125
+ cols = {c["raw_header"]: c for c in r.json()["columns"]}
126
+ assert cols["Descriptn"]["field"] == "description"
127
+ assert cols["Descriptn"]["method"] == "fuzzy"
128
+
129
+ # raise the gate above 90: the same column now falls through -> unmapped
130
+ r = client.post("/mapper/map", params={"threshold": 95},
131
+ files={"file": ("s.xlsx", io.BytesIO(payload))})
132
+ cols = {c["raw_header"]: c for c in r.json()["columns"]}
133
+ assert cols["Descriptn"]["field"] is None
134
+
135
+
136
+ def test_map_threshold_out_of_range(client):
137
+ payload = _fuzzy_xlsx_bytes()
138
+ for bad in (150, -1):
139
+ r = client.post("/mapper/map", params={"threshold": bad},
140
+ files={"file": ("s.xlsx", io.BytesIO(payload))})
141
+ assert r.status_code == 422 # ge=0 / le=100 validation
142
+
143
+
144
+ def test_default_threshold_reads_env(monkeypatch):
145
+ import tabularmapper.api as api
146
+ monkeypatch.setenv("TABULARMAPPER_THRESHOLD", "90")
147
+ assert api._default_threshold() == 90
148
+ monkeypatch.setenv("TABULARMAPPER_THRESHOLD", "banana") # invalid -> falls back
149
+ assert api._default_threshold() == 80
150
+ monkeypatch.delenv("TABULARMAPPER_THRESHOLD", raising=False)
151
+ assert api._default_threshold() == 80
152
+
153
+
110
154
  def test_router_prefix_default_and_custom():
111
155
  import tabularmapper.api as api
112
156
  assert {r.path for r in api.router.routes} == {
File without changes
File without changes