tabularmapper 1.0.2__tar.gz → 1.0.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (26)
  1. {tabularmapper-1.0.2/src/tabularmapper.egg-info → tabularmapper-1.0.3}/PKG-INFO +14 -1
  2. {tabularmapper-1.0.2 → tabularmapper-1.0.3}/README.md +13 -0
  3. {tabularmapper-1.0.2 → tabularmapper-1.0.3}/pyproject.toml +1 -1
  4. {tabularmapper-1.0.2 → tabularmapper-1.0.3}/src/tabularmapper/__init__.py +1 -1
  5. {tabularmapper-1.0.2 → tabularmapper-1.0.3}/src/tabularmapper/api.py +23 -1
  6. {tabularmapper-1.0.2 → tabularmapper-1.0.3}/src/tabularmapper/engine.py +4 -1
  7. {tabularmapper-1.0.2 → tabularmapper-1.0.3}/src/tabularmapper/schema.py +1 -1
  8. {tabularmapper-1.0.2 → tabularmapper-1.0.3/src/tabularmapper.egg-info}/PKG-INFO +14 -1
  9. {tabularmapper-1.0.2 → tabularmapper-1.0.3}/tests/test_api.py +44 -0
  10. {tabularmapper-1.0.2 → tabularmapper-1.0.3}/LICENSE +0 -0
  11. {tabularmapper-1.0.2 → tabularmapper-1.0.3}/setup.cfg +0 -0
  12. {tabularmapper-1.0.2 → tabularmapper-1.0.3}/src/tabularmapper/ai_matcher.py +0 -0
  13. {tabularmapper-1.0.2 → tabularmapper-1.0.3}/src/tabularmapper/cli.py +0 -0
  14. {tabularmapper-1.0.2 → tabularmapper-1.0.3}/src/tabularmapper/learn.py +0 -0
  15. {tabularmapper-1.0.2 → tabularmapper-1.0.3}/src/tabularmapper/llm_fallback.py +0 -0
  16. {tabularmapper-1.0.2 → tabularmapper-1.0.3}/src/tabularmapper/mapping_cache.py +0 -0
  17. {tabularmapper-1.0.2 → tabularmapper-1.0.3}/src/tabularmapper/stores.py +0 -0
  18. {tabularmapper-1.0.2 → tabularmapper-1.0.3}/src/tabularmapper.egg-info/SOURCES.txt +0 -0
  19. {tabularmapper-1.0.2 → tabularmapper-1.0.3}/src/tabularmapper.egg-info/dependency_links.txt +0 -0
  20. {tabularmapper-1.0.2 → tabularmapper-1.0.3}/src/tabularmapper.egg-info/entry_points.txt +0 -0
  21. {tabularmapper-1.0.2 → tabularmapper-1.0.3}/src/tabularmapper.egg-info/requires.txt +0 -0
  22. {tabularmapper-1.0.2 → tabularmapper-1.0.3}/src/tabularmapper.egg-info/top_level.txt +0 -0
  23. {tabularmapper-1.0.2 → tabularmapper-1.0.3}/tests/test_learn.py +0 -0
  24. {tabularmapper-1.0.2 → tabularmapper-1.0.3}/tests/test_mapper.py +0 -0
  25. {tabularmapper-1.0.2 → tabularmapper-1.0.3}/tests/test_schema.py +0 -0
  26. {tabularmapper-1.0.2 → tabularmapper-1.0.3}/tests/test_stores.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: tabularmapper
3
- Version: 1.0.2
3
+ Version: 1.0.3
4
4
  Summary: Map any spreadsheet (.xlsx) to a schema you define — deterministic column mapping with an optional AI matcher
5
5
  Author-email: Karthikeyan Duraisamy <karthikeyanduraisamy@kultivateindia.com>
6
6
  License-Expression: MIT
@@ -165,6 +165,7 @@ All are optional; sensible defaults apply.
165
165
  | `TABULARMAPPER_LEARN_STORE` | `memory://` (no files) | where self-learned header synonyms live |
166
166
  | `TABULARMAPPER_CONFIG` | *(none — required)* | output template + synonyms JSON (file / `https://` / `s3://`) |
167
167
  | `TABULARMAPPER_ROUTE_PREFIX` | `/mapper` | FastAPI router path prefix |
168
+ | `TABULARMAPPER_THRESHOLD` | `80` | fuzzy-accept gate (0–100); raise it to push borderline fuzzy matches to the AI matcher |
168
169
  | `OPENAI_API_KEY` | *(unset → AI off)* | enables the AI column matcher |
169
170
  | `OPENAI_BASE_URL` | `https://api.openai.com/v1` | any OpenAI-compatible endpoint |
170
171
  | `OPENAI_MODEL` | `gpt-4o-mini` | model name |
@@ -280,6 +281,18 @@ app.include_router(router)
280
281
  blocking work in a threadpool. Store the original file to S3 in your own endpoint
281
282
  if you need it — the mapper stays out of AWS.
282
283
 
284
+ Two query params shape the request:
285
+
286
+ ```bash
287
+ curl -F file=@f.xlsx "http://localhost:8000/mapper/map?format=base64" # json + a mapped .xlsx in file_base64
288
+ curl -F file=@f.xlsx "http://localhost:8000/mapper/map?format=file" -OJ # download the mapped .xlsx
289
+ curl -F file=@f.xlsx "http://localhost:8000/mapper/map?threshold=90" # stricter fuzzy gate for this call
290
+ ```
291
+
292
+ `format` is `json` (default) / `base64` / `file`. `threshold` (0–100) overrides
293
+ `TABULARMAPPER_THRESHOLD` for one request — raise it to send borderline fuzzy
294
+ matches to the AI matcher instead of trusting them.
295
+
283
296
  The `/mapper` prefix is configurable (this is a general table→schema mapper, not
284
297
  just banks): set `TABULARMAPPER_ROUTE_PREFIX`, or build the router yourself:
285
298
 
@@ -125,6 +125,7 @@ All are optional; sensible defaults apply.
125
125
  | `TABULARMAPPER_LEARN_STORE` | `memory://` (no files) | where self-learned header synonyms live |
126
126
  | `TABULARMAPPER_CONFIG` | *(none — required)* | output template + synonyms JSON (file / `https://` / `s3://`) |
127
127
  | `TABULARMAPPER_ROUTE_PREFIX` | `/mapper` | FastAPI router path prefix |
128
+ | `TABULARMAPPER_THRESHOLD` | `80` | fuzzy-accept gate (0–100); raise it to push borderline fuzzy matches to the AI matcher |
128
129
  | `OPENAI_API_KEY` | *(unset → AI off)* | enables the AI column matcher |
129
130
  | `OPENAI_BASE_URL` | `https://api.openai.com/v1` | any OpenAI-compatible endpoint |
130
131
  | `OPENAI_MODEL` | `gpt-4o-mini` | model name |
@@ -240,6 +241,18 @@ app.include_router(router)
240
241
  blocking work in a threadpool. Store the original file to S3 in your own endpoint
241
242
  if you need it — the mapper stays out of AWS.
242
243
 
244
+ Two query params shape the request:
245
+
246
+ ```bash
247
+ curl -F file=@f.xlsx "http://localhost:8000/mapper/map?format=base64" # json + a mapped .xlsx in file_base64
248
+ curl -F file=@f.xlsx "http://localhost:8000/mapper/map?format=file" -OJ # download the mapped .xlsx
249
+ curl -F file=@f.xlsx "http://localhost:8000/mapper/map?threshold=90" # stricter fuzzy gate for this call
250
+ ```
251
+
252
+ `format` is `json` (default) / `base64` / `file`. `threshold` (0–100) overrides
253
+ `TABULARMAPPER_THRESHOLD` for one request — raise it to send borderline fuzzy
254
+ matches to the AI matcher instead of trusting them.
255
+
243
256
  The `/mapper` prefix is configurable (this is a general table→schema mapper, not
244
257
  just banks): set `TABULARMAPPER_ROUTE_PREFIX`, or build the router yourself:
245
258
 
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "tabularmapper"
7
- version = "1.0.2"
7
+ version = "1.0.3"
8
8
  description = "Map any spreadsheet (.xlsx) to a schema you define — deterministic column mapping with an optional AI matcher"
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.9"
@@ -44,7 +44,7 @@ from .schema import (
44
44
  )
45
45
  from .stores import open_store
46
46
 
47
- __version__ = "1.0.2"
47
+ __version__ = "1.0.3"
48
48
 
49
49
  __all__ = [
50
50
  "process_file",
@@ -45,6 +45,17 @@ from .mapping_cache import MappingCache
45
45
  _XLSX_MIME = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
46
46
 
47
47
 
48
+ def _default_threshold() -> int:
49
+ """The fuzzy-accept gate (0-100). Below this, a column is left unmapped and,
50
+ if it's a critical field, the AI matcher is asked to fill it. Raise it to
51
+ push borderline fuzzy matches to the AI instead of trusting them. Read from
52
+ TABULARMAPPER_THRESHOLD at request time; falls back to 80."""
53
+ try:
54
+ return max(0, min(100, int(os.getenv("TABULARMAPPER_THRESHOLD", "80"))))
55
+ except (TypeError, ValueError):
56
+ return 80
57
+
58
+
48
59
  class OutFormat(str, Enum):
49
60
  """Response shape for POST /map — rendered as a dropdown in the docs."""
50
61
  json = "json" # rows inline (default)
@@ -136,6 +147,13 @@ async def map_statement(
136
147
  ".xlsx encoded in file_base64; file = download the .xlsx "
137
148
  "directly (binary, no JSON body).",
138
149
  ),
150
+ threshold: Optional[int] = Query(
151
+ None,
152
+ ge=0, le=100,
153
+ description="Fuzzy-accept gate 0-100. Overrides TABULARMAPPER_THRESHOLD "
154
+ "(default 80) for this request. Raise it to send borderline "
155
+ "fuzzy matches to the AI matcher instead of trusting them.",
156
+ ),
139
157
  ):
140
158
  """Upload a spreadsheet (.xlsx); get the standardized mapping + rows.
141
159
 
@@ -143,18 +161,22 @@ async def map_statement(
143
161
  * json -> MapResponse with the rows in `transactions`
144
162
  * base64 -> same MapResponse, plus a mapped .xlsx in `file_base64`
145
163
  * file -> the mapped .xlsx as a downloadable attachment
164
+
165
+ `threshold` (query) overrides the fuzzy gate for this one call; otherwise the
166
+ server default (TABULARMAPPER_THRESHOLD, else 80) is used.
146
167
  """
147
168
  name = (file.filename or "").lower()
148
169
  if not name.endswith((".xlsx", ".xls")):
149
170
  raise HTTPException(status_code=400, detail="expected an .xlsx/.xls file")
150
171
 
172
+ gate = threshold if threshold is not None else _default_threshold()
151
173
  data = await file.read() # raw bytes, parsed in memory (never hits disk)
152
174
  try:
153
175
  # blocking work -> threadpool; process_stream reads straight from bytes
154
176
  res = await run_in_threadpool(
155
177
  process_stream, data,
156
178
  table_matcher=state.matcher, cache=state.cache,
157
- learn_store=state.learn,
179
+ learn_store=state.learn, threshold=gate,
158
180
  source_label=file.filename or "<upload>",
159
181
  )
160
182
  except Exception as exc: # noqa: BLE001
@@ -810,7 +810,10 @@ def _run(rows: list[list], source_label: str, out_path, llm_fallback,
810
810
 
811
811
  from_cache = False
812
812
  col_maps = None
813
- schema_sig = _schema_signature() # scope the cache to the active schema
813
+ # Scope the cache to the active schema AND the fuzzy gate: a different
814
+ # threshold can change which columns map, so it must not reuse a mapping
815
+ # computed at another threshold.
816
+ schema_sig = f"{_schema_signature()}:t{threshold}"
814
817
  if cache is not None:
815
818
  cached = cache.get(header, namespace=schema_sig)
816
819
  if cached is not None:
@@ -198,7 +198,7 @@ def _infer_type(field_key: str) -> str:
198
198
  def default_config() -> Config:
199
199
  """The built-in default: EMPTY. This is a general mapper, so with no config
200
200
  it maps nothing — you must provide an output_schema + synonyms (a file/URL via
201
- BANK_MAPPER_CONFIG, a dict, or configure()). Use `bank_preset()` for the
201
+ TABULARMAPPER_CONFIG, a dict, or configure()). Use `bank_preset()` for the
202
202
  ready-made bank-statement schema."""
203
203
  return Config(output_schema=[], synonyms={}, critical_fields=[])
204
204
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: tabularmapper
3
- Version: 1.0.2
3
+ Version: 1.0.3
4
4
  Summary: Map any spreadsheet (.xlsx) to a schema you define — deterministic column mapping with an optional AI matcher
5
5
  Author-email: Karthikeyan Duraisamy <karthikeyanduraisamy@kultivateindia.com>
6
6
  License-Expression: MIT
@@ -165,6 +165,7 @@ All are optional; sensible defaults apply.
165
165
  | `TABULARMAPPER_LEARN_STORE` | `memory://` (no files) | where self-learned header synonyms live |
166
166
  | `TABULARMAPPER_CONFIG` | *(none — required)* | output template + synonyms JSON (file / `https://` / `s3://`) |
167
167
  | `TABULARMAPPER_ROUTE_PREFIX` | `/mapper` | FastAPI router path prefix |
168
+ | `TABULARMAPPER_THRESHOLD` | `80` | fuzzy-accept gate (0–100); raise it to push borderline fuzzy matches to the AI matcher |
168
169
  | `OPENAI_API_KEY` | *(unset → AI off)* | enables the AI column matcher |
169
170
  | `OPENAI_BASE_URL` | `https://api.openai.com/v1` | any OpenAI-compatible endpoint |
170
171
  | `OPENAI_MODEL` | `gpt-4o-mini` | model name |
@@ -280,6 +281,18 @@ app.include_router(router)
280
281
  blocking work in a threadpool. Store the original file to S3 in your own endpoint
281
282
  if you need it — the mapper stays out of AWS.
282
283
 
284
+ Two query params shape the request:
285
+
286
+ ```bash
287
+ curl -F file=@f.xlsx "http://localhost:8000/mapper/map?format=base64" # json + a mapped .xlsx in file_base64
288
+ curl -F file=@f.xlsx "http://localhost:8000/mapper/map?format=file" -OJ # download the mapped .xlsx
289
+ curl -F file=@f.xlsx "http://localhost:8000/mapper/map?threshold=90" # stricter fuzzy gate for this call
290
+ ```
291
+
292
+ `format` is `json` (default) / `base64` / `file`. `threshold` (0–100) overrides
293
+ `TABULARMAPPER_THRESHOLD` for one request — raise it to send borderline fuzzy
294
+ matches to the AI matcher instead of trusting them.
295
+
283
296
  The `/mapper` prefix is configurable (this is a general table→schema mapper, not
284
297
  just banks): set `TABULARMAPPER_ROUTE_PREFIX`, or build the router yourself:
285
298
 
@@ -107,6 +107,50 @@ def test_map_rejects_non_xlsx(client):
107
107
  assert r.status_code == 400
108
108
 
109
109
 
110
+ def _fuzzy_xlsx_bytes():
111
+ """A tiny bank sheet whose 'Descriptn' header only fuzzy-matches (score 90)."""
112
+ from openpyxl import Workbook
113
+ wb = Workbook(); ws = wb.active
114
+ ws.append(["Date", "Descriptn", "Debit", "Credit"])
115
+ ws.append(["01-06-2026", "Coffee", "150", ""])
116
+ ws.append(["02-06-2026", "Salary", "", "45000"])
117
+ buf = io.BytesIO(); wb.save(buf); return buf.getvalue()
118
+
119
+
120
+ def test_map_threshold_query_changes_mapping(client):
121
+ payload = _fuzzy_xlsx_bytes()
122
+
123
+ # default gate (80): 'Descriptn' (score 90) is accepted as fuzzy
124
+ r = client.post("/mapper/map", files={"file": ("s.xlsx", io.BytesIO(payload))})
125
+ cols = {c["raw_header"]: c for c in r.json()["columns"]}
126
+ assert cols["Descriptn"]["field"] == "description"
127
+ assert cols["Descriptn"]["method"] == "fuzzy"
128
+
129
+ # raise the gate above 90: the same column now falls through -> unmapped
130
+ r = client.post("/mapper/map", params={"threshold": 95},
131
+ files={"file": ("s.xlsx", io.BytesIO(payload))})
132
+ cols = {c["raw_header"]: c for c in r.json()["columns"]}
133
+ assert cols["Descriptn"]["field"] is None
134
+
135
+
136
+ def test_map_threshold_out_of_range(client):
137
+ payload = _fuzzy_xlsx_bytes()
138
+ for bad in (150, -1):
139
+ r = client.post("/mapper/map", params={"threshold": bad},
140
+ files={"file": ("s.xlsx", io.BytesIO(payload))})
141
+ assert r.status_code == 422 # ge=0 / le=100 validation
142
+
143
+
144
+ def test_default_threshold_reads_env(monkeypatch):
145
+ import tabularmapper.api as api
146
+ monkeypatch.setenv("TABULARMAPPER_THRESHOLD", "90")
147
+ assert api._default_threshold() == 90
148
+ monkeypatch.setenv("TABULARMAPPER_THRESHOLD", "banana") # invalid -> falls back
149
+ assert api._default_threshold() == 80
150
+ monkeypatch.delenv("TABULARMAPPER_THRESHOLD", raising=False)
151
+ assert api._default_threshold() == 80
152
+
153
+
110
154
  def test_router_prefix_default_and_custom():
111
155
  import tabularmapper.api as api
112
156
  assert {r.path for r in api.router.routes} == {
Files 10–26 in the list above: no content changes (+0 -0).