tabularmapper 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
tabularmapper/cli.py ADDED
@@ -0,0 +1,233 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ cli.py — command-line runner for tabularmapper.
4
+
5
+ python cli.py <input.xlsx> [output.xlsx] [options]
6
+
7
+ Options:
8
+ --format {file,json,bytes,base64,records}
9
+ output format (default: file)
10
+ --ai use the LLM table matcher for unknown
11
+ headers (OpenAI-compatible; structure
12
+ only, never transaction data)
13
+ --model NAME LLM model (or env OPENAI_MODEL)
14
+ --fallback {none,hashing} offline per-column fallback
15
+ (default: none -> zero network calls)
16
+ --config PATH output template + synonyms JSON
17
+ (file / URL / s3://; or env TABULARMAPPER_CONFIG)
18
+ --cache URL cache backend: sqlite:/// | redis:// |
19
+ postgresql:// | memory:// (or env
20
+ TABULARMAPPER_CACHE; use env for secrets)
21
+ --no-cache disable the mapping cache
22
+ --learn [URL] enable self-learning (store URL optional;
23
+ env TABULARMAPPER_LEARN_STORE / sqlite default)
24
+ --harvest DIR seed the learned vocabulary from a folder
25
+ of .xlsx statements, then exit
26
+ --threshold N fuzzy confidence gate (default 80)
27
+
28
+ Env for --ai: OPENAI_API_KEY, OPENAI_BASE_URL (default OpenAI), OPENAI_MODEL.
29
+ Works with any OpenAI-compatible endpoint (OpenAI, Azure, vLLM, Ollama, ...).
30
+
31
+ Prints: the detected header row, full column mapping with confidences and
32
+ method, transaction count, and any review flags.
33
+
34
+ Output format notes:
35
+ file — writes .xlsx to disk (original behavior)
36
+ json — prints JSON string to stdout
37
+ bytes — writes raw .xlsx bytes to stdout (pipe to file: > out.xlsx)
38
+ base64 — prints base64-encoded .xlsx string to stdout
39
+ records — prints the records list as indented JSON
40
+ """
41
+
42
+ from __future__ import annotations
43
+
44
+ import argparse
45
+ import os
46
+ import sys
47
+
48
+ from .engine import process_file
49
+ from .mapping_cache import MappingCache
50
+
51
+
52
+ def _build_fallback(kind: str):
53
+ if kind == "none":
54
+ return None
55
+ if kind == "hashing":
56
+ from .llm_fallback import HashingEmbeddingFallback
57
+ return HashingEmbeddingFallback()
58
+ raise ValueError(kind)
59
+
60
+
61
def _maybe_matcher(args):
    """Build the OpenAI-compatible table matcher used by ``--ai``.

    Emits a warning on stderr when ``OPENAI_API_KEY`` is unset or empty, but
    still returns the matcher, constructed with ``args.model``.
    """
    # Lazy import: the matcher module is only needed when --ai is requested.
    from .ai_matcher import OpenAICompatibleMatcher

    api_key = os.getenv("OPENAI_API_KEY")
    if api_key is None or api_key == "":
        warning = ("warning: --ai set but OPENAI_API_KEY is empty; the AI call "
                   "will fail and columns stay unmapped.")
        print(warning, file=sys.stderr)

    return OpenAICompatibleMatcher(model=args.model)
67
+
68
+
69
+ def _write_output_file(res, out_path: str) -> None:
70
+ """Write the result to disk in the requested format."""
71
+ fmt = res.output.format
72
+ if fmt == "file":
73
+ # Already written by process_file; just confirm
74
+ print(f" written: {out_path}")
75
+ elif fmt == "json":
76
+ with open(out_path, "w", encoding="utf-8") as f:
77
+ f.write(res.output.json)
78
+ print(f" written: {out_path}")
79
+ elif fmt == "bytes":
80
+ with open(out_path, "wb") as f:
81
+ f.write(res.output.bytes)
82
+ print(f" written: {out_path}")
83
+ elif fmt == "base64":
84
+ with open(out_path, "w", encoding="ascii") as f:
85
+ f.write(res.output.base64)
86
+ print(f" written: {out_path}")
87
+ elif fmt == "records":
88
+ import json
89
+ with open(out_path, "w", encoding="utf-8") as f:
90
+ json.dump(res.output.records, f, indent=2, ensure_ascii=False)
91
+ print(f" written: {out_path}")
92
+
93
+
94
def _build_arg_parser() -> argparse.ArgumentParser:
    """Build the argument parser for the tabularmapper command-line runner."""
    ap = argparse.ArgumentParser(description="Map any spreadsheet (.xlsx) to a schema you define")
    ap.add_argument("input", nargs="?", default=None,
                    help="input .xlsx (omit when using --harvest)")
    ap.add_argument("output", nargs="?", default=None)
    ap.add_argument("--format", choices=["file", "json", "bytes", "base64", "records"],
                    default="file",
                    help="output format (default: file)")
    ap.add_argument("--ai", action="store_true",
                    help="LLM table matcher for unknown headers")
    ap.add_argument("--model", default=None, help="LLM model (or env OPENAI_MODEL)")
    ap.add_argument("--fallback", choices=["none", "hashing"], default="none")
    ap.add_argument("--config", default=None,
                    help="config JSON (file / URL / s3://); or env TABULARMAPPER_CONFIG")
    ap.add_argument("--cache", default=None,
                    help="cache backend URL: sqlite:///f.db | redis://… | "
                         "postgresql://… | memory:// (or env TABULARMAPPER_CACHE). "
                         "Prefer the env var for URLs containing secrets.")
    ap.add_argument("--no-cache", action="store_true")
    ap.add_argument("--learn", nargs="?", const="", default=None,
                    help="enable self-learning; optional store URL "
                         "(or env TABULARMAPPER_LEARN_STORE / sqlite default)")
    ap.add_argument("--harvest", default=None, metavar="DIR",
                    help="bootstrap the learned vocabulary from a folder of "
                         ".xlsx statements, then exit")
    ap.add_argument("--preset", choices=["bank"], default=None,
                    help="use a built-in preset instead of a config (e.g. bank)")
    ap.add_argument("--threshold", type=int, default=80)
    return ap


def _print_mapping_report(res, stream) -> None:
    """Print header detection, the column mapping table and record count to *stream*."""
    print(f"\nInput : {res.input_path}", file=stream)
    print(f"Output: {res.output_path}", file=stream)
    print(f"\nHeader row detected at index {res.header_index} "
          f"(score {res.header_score}) — 0-based", file=stream)
    print(f" breakdown: {res.header_breakdown}", file=stream)

    # Normalise headers once so the "cells" line and the table agree, and a
    # None / non-string header cell cannot break the slicing below.
    column_maps = res.column_maps or []
    headers = ["" if m.raw_header is None else str(m.raw_header) for m in column_maps]
    print(f" cells: {headers}", file=stream)

    print("\nColumn mapping:", file=stream)
    print(f" {'col':>3} {'raw header':<28} {'-> field':<14} {'conf':>4} method",
          file=stream)
    for m, header in zip(column_maps, headers):
        fld = m.field if m.field else "(unmapped)"
        print(f" {m.col_index:>3} {header[:28]:<28} {fld:<14} "
              f"{m.confidence:>4} {m.method}", file=stream)

    print(f"\nTransactions extracted: {len(res.records)}", file=stream)
    if res.records:
        print(f" first: {res.records[0]}", file=stream)


def main(argv=None) -> int:
    """Run the tabularmapper command line.

    Args:
        argv: argument list to parse; ``None`` lets argparse read ``sys.argv``.

    Returns:
        ``0`` on success (including ``--harvest``), ``2`` when the input file
        is missing or was not supplied.

    For ``--format file`` the human-readable report is printed to stdout. For
    the formats whose payload is written to stdout (json, bytes, base64,
    records) the report goes to stderr instead, so that stdout carries only the
    payload and can be piped or redirected safely.
    """
    # Auto-load a local .env if python-dotenv is available (optional convenience),
    # so TABULARMAPPER_CACHE / _CONFIG / _LEARN_STORE / OPENAI_* are picked up
    # without exporting. No-op if the package isn't installed.
    try:
        from dotenv import load_dotenv
    except ImportError:
        pass
    else:
        load_dotenv()

    args = _build_arg_parser().parse_args(argv)

    # Load the schema: --preset, then --config / TABULARMAPPER_CONFIG. With none
    # of these, the default is EMPTY and nothing is mapped.
    from .engine import configure, apply_learned
    if args.preset == "bank":
        from .schema import bank_preset
        configure(config=bank_preset())
    else:
        configure(args.config or os.getenv("TABULARMAPPER_CONFIG"))

    # Learning: enabled by --learn or --harvest. `--learn` with no value uses the
    # env/sqlite default; `--learn URL` overrides.
    learn_store = None
    if args.learn is not None or args.harvest:
        from .learn import LearnStore
        learn_store = LearnStore(args.learn or None)
        apply_learned(learn_store)

    if args.harvest:
        from .learn import harvest_folder
        matcher = _maybe_matcher(args) if args.ai else None
        report = harvest_folder(args.harvest, learn_store, table_matcher=matcher)
        print(f"Harvested {report['files']} file(s) from {args.harvest}")
        print(f" learned : {len(report['learned'])}")
        print(f" pending : {len(report['pending'])} (debit/credit await approval)")
        print(f" conflicts: {len(report['conflict'])} errors: {len(report['errors'])}")
        print(f" store stats: {report['stats']}")
        return 0

    if not args.input:
        print("error: input file required (or use --harvest DIR)", file=sys.stderr)
        return 2
    if not os.path.exists(args.input):
        print(f"error: input not found: {args.input}", file=sys.stderr)
        return 2

    out = args.output
    if out is None and args.format == "file":
        base, _ = os.path.splitext(args.input)
        out = base + ".standardized.xlsx"

    fallback = _build_fallback(args.fallback)
    # args.cache is None unless --cache is passed; MappingCache(None) then falls
    # back to TABULARMAPPER_CACHE or the sqlite default. Never hardcode a secret URL.
    cache = None if args.no_cache else MappingCache(args.cache)

    table_matcher = _maybe_matcher(args) if args.ai else None

    res = process_file(args.input, out_path=out, output_format=args.format,
                       llm_fallback=fallback, table_matcher=table_matcher,
                       threshold=args.threshold, cache=cache,
                       learn_store=learn_store)

    # Keep stdout clean whenever it carries the serialized payload: mixing the
    # report into it would corrupt `> out.xlsx` and make JSON output unparseable.
    report_stream = sys.stdout if args.format == "file" else sys.stderr
    _print_mapping_report(res, report_stream)

    # Output the serialized result
    print(f"\nOutput format: {args.format}", file=report_stream)
    if args.format == "json":
        print(res.output.json)
    elif args.format == "base64":
        print(res.output.base64)
    elif args.format == "bytes":
        # Write raw bytes to stdout (binary). Flush the text layer first so
        # nothing buffered there can land after/inside the binary payload.
        sys.stdout.flush()
        sys.stdout.buffer.write(res.output.bytes)
        sys.stdout.buffer.flush()
    elif args.format == "records":
        import json
        print(json.dumps(res.output.records, indent=2, ensure_ascii=False))
    elif out:
        # file format — already written by process_file
        print(f" written: {out}", file=report_stream)

    if res.needs_review:
        print("\n⚠ NEEDS REVIEW:", file=report_stream)
        for reason in res.review_reasons:
            print(f" - {reason}", file=report_stream)
    else:
        print("\n✓ Clean — no review flags.", file=report_stream)

    return 0
230
+
231
+
232
# Script entry point: hand main()'s return value to the interpreter as the
# process exit status.
if __name__ == "__main__":
    sys.exit(main())