tabularmapper 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tabularmapper/__init__.py +75 -0
- tabularmapper/ai_matcher.py +247 -0
- tabularmapper/api.py +186 -0
- tabularmapper/cli.py +233 -0
- tabularmapper/engine.py +938 -0
- tabularmapper/learn.py +203 -0
- tabularmapper/llm_fallback.py +118 -0
- tabularmapper/mapping_cache.py +73 -0
- tabularmapper/schema.py +341 -0
- tabularmapper/stores.py +238 -0
- tabularmapper-1.0.0.dist-info/METADATA +455 -0
- tabularmapper-1.0.0.dist-info/RECORD +16 -0
- tabularmapper-1.0.0.dist-info/WHEEL +5 -0
- tabularmapper-1.0.0.dist-info/entry_points.txt +2 -0
- tabularmapper-1.0.0.dist-info/licenses/LICENSE +21 -0
- tabularmapper-1.0.0.dist-info/top_level.txt +1 -0
tabularmapper/cli.py
ADDED
|
@@ -0,0 +1,233 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
cli.py — command-line runner for tabularmapper.
|
|
4
|
+
|
|
5
|
+
python -m tabularmapper.cli <input.xlsx> [output.xlsx] [options]
|
|
6
|
+
|
|
7
|
+
Options:
|
|
8
|
+
--format {file,json,bytes,base64,records}
|
|
9
|
+
output format (default: file)
|
|
10
|
+
--ai use the LLM table matcher for unknown
|
|
11
|
+
headers (OpenAI-compatible; structure
|
|
12
|
+
only, never transaction data)
|
|
13
|
+
--model NAME LLM model (or env OPENAI_MODEL)
|
|
14
|
+
--fallback {none,hashing} offline per-column fallback
|
|
15
|
+
(default: none -> zero network calls)
|
|
16
|
+
--config PATH output template + synonyms JSON
|
|
17
|
+
(file / URL / s3://; or env TABULARMAPPER_CONFIG)
|
|
18
|
+
--cache URL cache backend: sqlite:/// | redis:// |
|
|
19
|
+
postgresql:// | memory:// (or env
|
|
20
|
+
TABULARMAPPER_CACHE; use env for secrets)
|
|
21
|
+
--no-cache disable the mapping cache
|
|
22
|
+
--learn [URL] enable self-learning (store URL optional;
|
|
23
|
+
env TABULARMAPPER_LEARN_STORE / sqlite default)
|
|
24
|
+
--harvest DIR seed the learned vocabulary from a folder
|
|
25
|
+
of .xlsx statements, then exit
|
|
26
|
+
--threshold N fuzzy confidence gate (default 80)
|
|
27
|
+
|
|
28
|
+
Env for --ai: OPENAI_API_KEY, OPENAI_BASE_URL (default OpenAI), OPENAI_MODEL.
|
|
29
|
+
Works with any OpenAI-compatible endpoint (OpenAI, Azure, vLLM, Ollama, ...).
|
|
30
|
+
|
|
31
|
+
Prints: the detected header row, full column mapping with confidences and
|
|
32
|
+
method, transaction count, and any review flags.
|
|
33
|
+
|
|
34
|
+
Output format notes:
|
|
35
|
+
file — writes .xlsx to disk (original behavior)
|
|
36
|
+
json — prints JSON string to stdout
|
|
37
|
+
bytes — writes raw .xlsx bytes to stdout (pipe to file: > out.xlsx)
|
|
38
|
+
base64 — prints base64-encoded .xlsx string to stdout
|
|
39
|
+
records — prints the records list as indented JSON to stdout
|
|
40
|
+
"""
|
|
41
|
+
|
|
42
|
+
from __future__ import annotations
|
|
43
|
+
|
|
44
|
+
import argparse
|
|
45
|
+
import os
|
|
46
|
+
import sys
|
|
47
|
+
|
|
48
|
+
from .engine import process_file
|
|
49
|
+
from .mapping_cache import MappingCache
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def _build_fallback(kind: str):
    """Return the offline per-column fallback selected by ``--fallback``.

    Args:
        kind: ``"none"`` or ``"hashing"``.

    Returns:
        ``None`` for ``"none"``; a ``HashingEmbeddingFallback`` instance for
        ``"hashing"``.

    Raises:
        ValueError: if ``kind`` is any other value (the value itself is the
            exception argument).
    """
    if kind == "hashing":
        # Imported lazily so the module is only loaded when it is requested.
        from .llm_fallback import HashingEmbeddingFallback
        return HashingEmbeddingFallback()
    if kind != "none":
        raise ValueError(kind)
    return None
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def _maybe_matcher(args):
    """Build the LLM table matcher used when ``--ai`` is given.

    Emits a warning on stderr when ``OPENAI_API_KEY`` is unset or empty, but
    still returns a matcher.

    Args:
        args: parsed CLI namespace; only ``args.model`` is read.

    Returns:
        An ``OpenAICompatibleMatcher`` constructed with ``model=args.model``.
    """
    # Lazy import: the AI matcher module is only loaded when actually needed.
    from .ai_matcher import OpenAICompatibleMatcher

    api_key = os.getenv("OPENAI_API_KEY")
    if not api_key:
        warning = ("warning: --ai set but OPENAI_API_KEY is empty; the AI call "
                   "will fail and columns stay unmapped.")
        print(warning, file=sys.stderr)

    matcher = OpenAICompatibleMatcher(model=args.model)
    return matcher
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def _write_output_file(res, out_path: str) -> None:
    """Persist ``res.output`` at ``out_path`` according to ``res.output.format``.

    ``"file"`` writes nothing here (the comment in the original notes the file
    is already written by ``process_file``) and only prints the confirmation.
    ``"json"`` and ``"base64"`` write the corresponding text attribute,
    ``"bytes"`` writes the raw bytes, and ``"records"`` dumps the records as
    indented JSON. Any other format value is silently ignored.

    Args:
        res: result object exposing an ``output`` attribute with ``format``
            plus the payload attribute for that format.
        out_path: destination path on disk.
    """
    payload = res.output
    kind = payload.format

    # Unknown formats are a no-op: nothing written, nothing printed.
    if kind not in ("file", "json", "bytes", "base64", "records"):
        return

    if kind == "json":
        with open(out_path, "w", encoding="utf-8") as handle:
            handle.write(payload.json)
    elif kind == "bytes":
        with open(out_path, "wb") as handle:
            handle.write(payload.bytes)
    elif kind == "base64":
        with open(out_path, "w", encoding="ascii") as handle:
            handle.write(payload.base64)
    elif kind == "records":
        import json
        with open(out_path, "w", encoding="utf-8") as handle:
            json.dump(payload.records, handle, indent=2, ensure_ascii=False)
    # kind == "file": nothing to write, only the confirmation below.

    print(f" written: {out_path}")
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def main(argv=None) -> int:
    """Run the tabularmapper command line and return a process exit code.

    Parses ``argv`` (``sys.argv[1:]`` when ``None``), loads the schema from
    ``--preset`` or ``--config`` / ``TABULARMAPPER_CONFIG``, optionally opens
    the learning store, and then either harvests a folder (``--harvest``) or
    maps a single input workbook through ``process_file``.

    With ``--format file`` the human-readable report is printed to stdout.
    For every other format the serialized payload is the only thing written
    to stdout and the report goes to stderr, so redirecting stdout
    (e.g. ``--format bytes > out.xlsx``) yields a usable file.

    Args:
        argv: argument list to parse; ``None`` lets argparse read
            ``sys.argv``.

    Returns:
        0 on success (including after ``--harvest``), 2 when no input path
        was given or the input path does not exist. Argument errors and
        ``--help`` exit through argparse by raising ``SystemExit``.
    """
    # Auto-load a local .env if python-dotenv is available (optional convenience),
    # so TABULARMAPPER_CACHE / _CONFIG / _LEARN_STORE / OPENAI_* are picked up
    # without exporting. No-op if the package isn't installed.
    try:
        from dotenv import load_dotenv
        load_dotenv()
    except ImportError:
        pass

    ap = argparse.ArgumentParser(description="Map any spreadsheet (.xlsx) to a schema you define")
    ap.add_argument("input", nargs="?", default=None,
                    help="input .xlsx (omit when using --harvest)")
    ap.add_argument("output", nargs="?", default=None)
    ap.add_argument("--format", choices=["file", "json", "bytes", "base64", "records"],
                    default="file",
                    help="output format (default: file)")
    ap.add_argument("--ai", action="store_true",
                    help="LLM table matcher for unknown headers")
    ap.add_argument("--model", default=None, help="LLM model (or env OPENAI_MODEL)")
    ap.add_argument("--fallback", choices=["none", "hashing"], default="none")
    ap.add_argument("--config", default=None,
                    help="config JSON (file / URL / s3://); or env TABULARMAPPER_CONFIG")
    ap.add_argument("--cache", default=None,
                    help="cache backend URL: sqlite:///f.db | redis://… | "
                         "postgresql://… | memory:// (or env TABULARMAPPER_CACHE). "
                         "Prefer the env var for URLs containing secrets.")
    ap.add_argument("--no-cache", action="store_true")
    ap.add_argument("--learn", nargs="?", const="", default=None,
                    help="enable self-learning; optional store URL "
                         "(or env TABULARMAPPER_LEARN_STORE / sqlite default)")
    ap.add_argument("--harvest", default=None, metavar="DIR",
                    help="bootstrap the learned vocabulary from a folder of "
                         ".xlsx statements, then exit")
    ap.add_argument("--preset", choices=["bank"], default=None,
                    help="use a built-in preset instead of a config (e.g. bank)")
    ap.add_argument("--threshold", type=int, default=80)
    args = ap.parse_args(argv)

    # Load the schema: --preset, then --config / TABULARMAPPER_CONFIG. With none
    # of these, the default is EMPTY and nothing is mapped.
    from .engine import configure, apply_learned
    if args.preset == "bank":
        from .schema import bank_preset
        configure(config=bank_preset())
    else:
        configure(args.config or os.getenv("TABULARMAPPER_CONFIG"))

    # Learning: enabled by --learn or --harvest. `--learn` with no value uses the
    # env/sqlite default; `--learn URL` overrides.
    learn_store = None
    if args.learn is not None or args.harvest:
        from .learn import LearnStore
        learn_store = LearnStore(args.learn or None)
        apply_learned(learn_store)

    if args.harvest:
        from .learn import harvest_folder
        matcher = _maybe_matcher(args) if args.ai else None
        report = harvest_folder(args.harvest, learn_store, table_matcher=matcher)
        print(f"Harvested {report['files']} file(s) from {args.harvest}")
        print(f" learned : {len(report['learned'])}")
        print(f" pending : {len(report['pending'])} (debit/credit await approval)")
        print(f" conflicts: {len(report['conflict'])} errors: {len(report['errors'])}")
        print(f" store stats: {report['stats']}")
        return 0

    if not args.input:
        print("error: input file required (or use --harvest DIR)", file=sys.stderr)
        return 2
    if not os.path.exists(args.input):
        print(f"error: input not found: {args.input}", file=sys.stderr)
        return 2

    out = args.output
    if out is None and args.format == "file":
        base, _ = os.path.splitext(args.input)
        out = base + ".standardized.xlsx"

    fallback = _build_fallback(args.fallback)
    # args.cache is None unless --cache is passed; MappingCache(None) then falls
    # back to TABULARMAPPER_CACHE or the sqlite default. Never hardcode a secret URL.
    cache = None if args.no_cache else MappingCache(args.cache)

    table_matcher = _maybe_matcher(args) if args.ai else None

    res = process_file(args.input, out_path=out, output_format=args.format,
                       llm_fallback=fallback, table_matcher=table_matcher,
                       threshold=args.threshold, cache=cache,
                       learn_store=learn_store)

    # When the serialized payload is written to stdout, keep the diagnostic
    # report off that stream so `> file` / pipes receive only the payload.
    info = sys.stdout if args.format == "file" else sys.stderr

    print(f"\nInput : {res.input_path}", file=info)
    print(f"Output: {res.output_path}", file=info)
    print(f"\nHeader row detected at index {res.header_index} "
          f"(score {res.header_score}) — 0-based", file=info)
    print(f" breakdown: {res.header_breakdown}", file=info)

    # Normalize once: a missing column list is treated as empty, and a missing
    # raw header as "", so the listing and the table below agree.
    column_maps = res.column_maps or []
    hdr = ["" if m.raw_header is None else str(m.raw_header) for m in column_maps]
    print(f" cells: {hdr}", file=info)

    print("\nColumn mapping:", file=info)
    print(f" {'col':>3} {'raw header':<28} {'-> field':<14} {'conf':>4} method", file=info)
    for m, raw in zip(column_maps, hdr):
        fld = m.field if m.field else "(unmapped)"
        print(f" {m.col_index:>3} {raw[:28]:<28} {fld:<14} "
              f"{m.confidence:>4} {m.method}", file=info)

    print(f"\nTransactions extracted: {len(res.records)}", file=info)
    if res.records:
        r = res.records[0]
        print(f" first: {r}", file=info)

    # Output the serialized result
    print(f"\nOutput format: {args.format}", file=info)
    if args.format == "json":
        print(res.output.json)
    elif args.format == "base64":
        print(res.output.base64)
    elif args.format == "bytes":
        # Write raw bytes to stdout (binary). Flush the text layer first so
        # nothing buffered there can land in the middle of the binary stream.
        sys.stdout.flush()
        sys.stdout.buffer.write(res.output.bytes)
        sys.stdout.buffer.flush()
    elif args.format == "records":
        import json
        print(json.dumps(res.output.records, indent=2, ensure_ascii=False))
    elif out:
        # file format — already written by process_file
        print(f" written: {out}", file=info)

    if res.needs_review:
        print("\n⚠ NEEDS REVIEW:", file=info)
        for reason in res.review_reasons:
            print(f" - {reason}", file=info)
    else:
        print("\n✓ Clean — no review flags.", file=info)

    return 0
|
|
230
|
+
|
|
231
|
+
|
|
232
|
+
# Script entry point: hand main()'s return value to the interpreter as the
# process exit status.
if __name__ == "__main__":
    sys.exit(main())
|