grepxcel 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45) hide show
  1. grepxcel-0.1.0/LICENSE +21 -0
  2. grepxcel-0.1.0/PKG-INFO +672 -0
  3. grepxcel-0.1.0/README.md +625 -0
  4. grepxcel-0.1.0/grepxcel/__init__.py +47 -0
  5. grepxcel-0.1.0/grepxcel/__main__.py +3 -0
  6. grepxcel-0.1.0/grepxcel/cli.py +1068 -0
  7. grepxcel-0.1.0/grepxcel/color.py +59 -0
  8. grepxcel-0.1.0/grepxcel/docs_generator.py +245 -0
  9. grepxcel-0.1.0/grepxcel/doctor.py +289 -0
  10. grepxcel-0.1.0/grepxcel/drafter.py +1368 -0
  11. grepxcel-0.1.0/grepxcel/engine.py +1005 -0
  12. grepxcel-0.1.0/grepxcel/examples/01_simple_invoice/data.xlsx +0 -0
  13. grepxcel-0.1.0/grepxcel/examples/01_simple_invoice/pattern.xlsx +0 -0
  14. grepxcel-0.1.0/grepxcel/examples/02_product_catalog/data.xlsx +0 -0
  15. grepxcel-0.1.0/grepxcel/examples/02_product_catalog/pattern.xlsx +0 -0
  16. grepxcel-0.1.0/grepxcel/examples/03_expense_report/data.xlsx +0 -0
  17. grepxcel-0.1.0/grepxcel/examples/03_expense_report/pattern.xlsx +0 -0
  18. grepxcel-0.1.0/grepxcel/examples/04_loan_schedule/data.xlsx +0 -0
  19. grepxcel-0.1.0/grepxcel/examples/04_loan_schedule/pattern.xlsx +0 -0
  20. grepxcel-0.1.0/grepxcel/examples/__init__.py +0 -0
  21. grepxcel-0.1.0/grepxcel/examples_generator.py +123 -0
  22. grepxcel-0.1.0/grepxcel/lint.py +267 -0
  23. grepxcel-0.1.0/grepxcel/logger.py +799 -0
  24. grepxcel-0.1.0/grepxcel/mcp_config.py +76 -0
  25. grepxcel-0.1.0/grepxcel/mcp_server.py +226 -0
  26. grepxcel-0.1.0/grepxcel/model_manager.py +552 -0
  27. grepxcel-0.1.0/grepxcel/models.py +62 -0
  28. grepxcel-0.1.0/grepxcel/pattern_check.py +194 -0
  29. grepxcel-0.1.0/grepxcel/pattern_colors.py +118 -0
  30. grepxcel-0.1.0/grepxcel/pattern_parser.py +687 -0
  31. grepxcel-0.1.0/grepxcel/proxy_support.py +195 -0
  32. grepxcel-0.1.0/grepxcel/sbom.py +214 -0
  33. grepxcel-0.1.0/grepxcel/schema.py +253 -0
  34. grepxcel-0.1.0/grepxcel/security.py +316 -0
  35. grepxcel-0.1.0/grepxcel/skill.py +126 -0
  36. grepxcel-0.1.0/grepxcel/suggester.py +11 -0
  37. grepxcel-0.1.0/grepxcel/utils.py +156 -0
  38. grepxcel-0.1.0/grepxcel.egg-info/PKG-INFO +672 -0
  39. grepxcel-0.1.0/grepxcel.egg-info/SOURCES.txt +43 -0
  40. grepxcel-0.1.0/grepxcel.egg-info/dependency_links.txt +1 -0
  41. grepxcel-0.1.0/grepxcel.egg-info/entry_points.txt +2 -0
  42. grepxcel-0.1.0/grepxcel.egg-info/requires.txt +23 -0
  43. grepxcel-0.1.0/grepxcel.egg-info/top_level.txt +1 -0
  44. grepxcel-0.1.0/pyproject.toml +82 -0
  45. grepxcel-0.1.0/setup.cfg +4 -0
grepxcel-0.1.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 SCPG
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,672 @@
1
+ Metadata-Version: 2.4
2
+ Name: grepxcel
3
+ Version: 0.1.0
4
+ Summary: Pattern-based data extraction for Excel
5
+ Author: scpg
6
+ License-Expression: MIT
7
+ Project-URL: Homepage, https://github.com/scpg/grepxcel
8
+ Project-URL: Repository, https://github.com/scpg/grepxcel
9
+ Project-URL: Issues, https://github.com/scpg/grepxcel/issues
10
+ Project-URL: Changelog, https://github.com/scpg/grepxcel/blob/main/CHANGELOG.md
11
+ Project-URL: Funding, https://buymeacoffee.com/scpg.dev
12
+ Keywords: excel,xlsx,spreadsheet,data-extraction,pattern,etl,openpyxl
13
+ Classifier: Development Status :: 4 - Beta
14
+ Classifier: Intended Audience :: Developers
15
+ Classifier: Intended Audience :: End Users/Desktop
16
+ Classifier: Programming Language :: Python :: 3
17
+ Classifier: Programming Language :: Python :: 3.11
18
+ Classifier: Programming Language :: Python :: 3.12
19
+ Classifier: Programming Language :: Python :: 3.13
20
+ Classifier: Programming Language :: Python :: 3.14
21
+ Classifier: Topic :: Office/Business :: Financial :: Spreadsheet
22
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
23
+ Classifier: Operating System :: OS Independent
24
+ Requires-Python: >=3.11
25
+ Description-Content-Type: text/markdown
26
+ License-File: LICENSE
27
+ Requires-Dist: openpyxl>=3.1
28
+ Requires-Dist: defusedxml>=0.7
29
+ Requires-Dist: regex>=2024.11.6
30
+ Requires-Dist: structlog>=24.1
31
+ Provides-Extra: suggest
32
+ Requires-Dist: llama-cpp-python>=0.2.90; extra == "suggest"
33
+ Requires-Dist: huggingface_hub>=0.23; extra == "suggest"
34
+ Requires-Dist: platformdirs>=4.0; extra == "suggest"
35
+ Requires-Dist: truststore>=0.9; extra == "suggest"
36
+ Provides-Extra: draft-cloud
37
+ Requires-Dist: anthropic>=0.40; extra == "draft-cloud"
38
+ Requires-Dist: openai>=1.40; extra == "draft-cloud"
39
+ Requires-Dist: google-genai>=0.3; extra == "draft-cloud"
40
+ Requires-Dist: truststore>=0.9; extra == "draft-cloud"
41
+ Provides-Extra: mcp
42
+ Requires-Dist: mcp<2,>=1.23; extra == "mcp"
43
+ Provides-Extra: dev
44
+ Requires-Dist: pytest>=8.0; extra == "dev"
45
+ Requires-Dist: jsonschema>=4.20; extra == "dev"
46
+ Dynamic: license-file
47
+
48
+ # grepxcel
49
+
50
+ Extract structured data from Excel files — reliably, without writing custom code for every template.
51
+
52
+ If you have ever written Python to parse an Excel file from a supplier, a client, or another department — and had it silently produce wrong output the moment they moved a column or renamed a header — grepxcel is built for exactly this kind of problem.
53
+
54
+ You describe the layout of the file once in a **pattern file** (an `.xlsx` workbook or a plain `.csv`). The engine reads any conforming data file and extracts cells and tables into clean, hierarchical JSON. When a file does not match the pattern, grepxcel fails loudly with a clear error — it is never silently wrong.
55
+
56
+ Think of it as *[grep](https://en.wikipedia.org/wiki/Grep) for [Excel](https://en.wikipedia.org/wiki/Microsoft_Excel)*: a repeatable extraction layer you define once and run on any number of files.
57
+
58
+ ![CI](https://github.com/scpg/grepxcel/actions/workflows/ci.yml/badge.svg)
59
+
60
+ ---
61
+
62
+ ## Common use cases
63
+
64
+ - **Supplier invoices and price lists** — each supplier sends Excel files in their own format. Define a pattern once per template, extract data from every new file automatically.
65
+ - **Timesheets and HR reports** — team leads use slightly different layouts. grepxcel anchors on cell labels, not row numbers, so minor layout drift does not break extraction.
66
+ - **Data pipelines ingesting Excel** — replace brittle `openpyxl` or `pandas` parsing code that breaks when a column shifts. The pattern file lives in git; extraction is deterministic and testable.
67
+ - **Research and clinical data** — field workers submit data in Excel templates. Any file that deviates from the expected structure is rejected immediately, before it can corrupt your dataset.
68
+
69
+ *The engineering philosophy behind grepxcel: [docs/MINDSET.md](docs/MINDSET.md).*
70
+
71
+ ---
72
+
73
+ ## What it does
74
+
75
+ You describe the layout of your Excel sheet in a **pattern file** — either an Excel workbook (`.xlsx`) or a plain `.csv` (handy for hand-editing and git diffs). The engine reads any matching data file and extracts cells and tables into clean, hierarchical JSON — no coding required to define new patterns.
76
+
77
+ ---
78
+
79
+ ## How it works
80
+
81
+ ```
82
+ pattern.xlsx + data.xlsx → { "po": { "number": "PO-2026" }, "line": [ {…} ] }
83
+ ```
84
+
85
+ The pattern file has four row types:
86
+
87
+ | Row type | Purpose |
88
+ |---|---|
89
+ | `config:` | Global settings: read direction, currency symbol, empty-cell aliases, case-insensitive matching (`ignore.case`) |
90
+ | `lbl:` | Anchor label — matched for position, **never written to output JSON** |
91
+ | `var:` | Data field — extracted and written to output JSON |
92
+ | `doc:` | Comment / documentation row — ignored by the engine |
93
+
94
+ Between `START:` and `END:` you list the extraction sequence:
95
+
96
+ - `cell:next fieldName` — read the next non-empty cell into a field (alias: `cell:1`)
97
+ - `cell:B5 fieldName` — jump directly to an absolute cell (A1-notation reference)
98
+ - `seek:G5` — reposition the cursor to G5 **without** reading it; the next `cell:next` starts from there
99
+ - `dir:LR` / `dir:TD` — switch the scan direction partway through (left-to-right / top-down); affects subsequent `cell:next`
100
+ - `table:*` — match all instances of a repeating mini-table block
101
+
102
+ Dot notation in `var:` field names creates nested output: `po.number` → `{"po": {"number": …}}`.
103
+
104
+ ### Pattern file formats
105
+
106
+ A pattern can be authored as **`.xlsx`** or **`.csv`** — both are read into the
107
+ same internal grid, so they behave identically. CSV is convenient for
108
+ hand-editing and produces clean git diffs. When writing CSV:
109
+
110
+ - One pattern row per CSV line; column A is the keyword (`config:`, `lbl:`, `var:`, `cell:…`).
111
+ - Table-template rows start with an **empty first field** (blank column A), e.g. `,HEADER:1,col_a,col_b`.
112
+ - **Quote any regex containing a comma**, e.g. `var,line.qty,integer,"\d{1,3}"`.
113
+ - Plain text only — formulas (a leading `=`) are rejected, exactly as in `.xlsx`.
114
+
115
+ ---
116
+
117
+ ## Writing the match pattern (regex) — the simple version
118
+
119
+ The last column of a `lbl:` or `var:` row is a **regex**: a little pattern that
120
+ describes *what the cell should look like*. You don't need to know regex to start —
121
+ here's everything most people need.
122
+
123
+ **Two rules to remember:**
124
+
125
+ 1. **The whole cell must match.** If your pattern is `\d{4}` (four digits), the cell
126
+ `2026` matches but `2026-05` does **not** (the `-05` is left over). You don't add
127
+ `^` or `$` anchors — grepxcel does that for you.
128
+ 2. **Leave it blank to accept anything.** No pattern in the last column = "any value
129
+ is fine". For `date`/`datetime` fields, always leave it blank (the type is checked,
130
+ not the text).
131
+
132
+ **The building blocks you'll actually use:**
133
+
134
+ | You want to match… | Write this | Matches |
135
+ |---|---|---|
136
+ | Any value at all | *(leave blank)* or `.*` | anything |
137
+ | A whole number | `\d+` | `7`, `2026`, `100` |
138
+ | Exactly 4 digits | `\d{4}` | `2026` (not `26`) |
139
+ | Between 1 and 3 digits | `\d{1,3}` | `5`, `42`, `999` |
140
+ | Letters only | `[A-Za-z]+` | `Acme` |
141
+ | A code like `PO-2026` | `PO-\d+` | `PO-1`, `PO-2026` |
142
+ | One of a few words | `paid\|unpaid\|pending` | `paid` |
143
+ | A price like `19.99` | `\d+\.\d{2}` | `19.99` |
144
+
145
+ Cheat-sheet: `\d` = a digit, `[A-Z]` = one capital letter, `+` = "one or more",
146
+ `{4}` = "exactly four", `{1,3}` = "between one and three", `.` = any character,
147
+ `.*` = "anything", `|` = "or".
148
+
149
+ **Tips for beginners:**
150
+
151
+ - **Start loose, then tighten.** Begin with a blank pattern (accept anything), run
152
+ `extract`, see what comes out, then add a pattern only where you need to be strict.
153
+ - **Use `[A-Z]+`, not `([A-Z])+`.** Both look similar, but the bracket form is faster
154
+ and the parentheses-with-a-`+` form is rejected by grepxcel as unsafe (it can make
155
+ matching hang). The tool will tell you if you hit this.
156
+ - **Case doesn't matter?** Add one config row at the top of the pattern file:
157
+ `config: | ignore.case | yes`. Then `PAID`, `Paid`, and `paid` all match the same
158
+ pattern. (Default is case-sensitive.)
159
+ - **CSV pattern files:** if your pattern contains a comma (like `\d{1,3}`), wrap it in
160
+ quotes — `"\d{1,3}"` — so the comma isn't read as a new column.
161
+
162
+ > Under the hood these are standard [Python `re`](https://docs.python.org/3/library/re.html)
163
+ > patterns, so anything from that syntax works if you already know regex.
164
+
165
+ ---
166
+
167
+ ## Quick start
168
+
169
+ ### Install
170
+
171
+ ```bash
172
+ git clone https://github.com/scpg/grepxcel.git
173
+ cd grepxcel
174
+ python -m venv .venv
175
+ .venv/bin/pip install -e . # core (extract + docs)
176
+ .venv/bin/python3 scripts/install_llm_deps.py # optional: local draft (auto-detects GPU)
177
+ ```
178
+
179
+ **What you actually need** — grepxcel ships in layers, so you only install what you use:
180
+
181
+ | You want to… | Install |
182
+ |---|---|
183
+ | Extract data / generate the docs reference | `pip install grepxcel` *(core — nothing extra)* |
184
+ | Draft patterns with the **local** model (offline) | `pip install 'grepxcel[suggest]'` |
185
+ | Draft patterns with a **cloud** model (Claude, GitHub Models) | `pip install 'grepxcel[draft-cloud]'` |
186
+ | Use as an **MCP server** for AI agents | `pip install 'grepxcel[mcp]'` |
187
+
188
+ The `extract` and `docs` commands work with the core install alone. The `draft`
189
+ command is optional; if its dependencies are missing, grepxcel tells you exactly
190
+ what to install (and points you at the cloud option as an alternative).
191
+
192
+ > **Windows (PowerShell):** use `.venv\Scripts\` instead of `.venv/bin/`, e.g.
193
+ > `.venv\Scripts\pip install -e .` and `.venv\Scripts\grepxcel ...`.
194
+
195
+ Requires Python **3.11+**.
196
+
197
+ ### Try it — bundled examples
198
+
199
+ Don't have your own Excel files yet? Generate ready-to-run examples:
200
+
201
+ ```bash
202
+ .venv/bin/grepxcel generate-examples # creates ./grepxcel-examples/
203
+ cd grepxcel-examples/01_simple_invoice
204
+ ../../.venv/bin/grepxcel extract -p pattern.xlsx data.xlsx
205
+ ```
206
+
207
+ Each example includes a pattern, a data file, and a README with commands to try.
208
+
209
+ ### CLI usage
210
+
211
+ ```bash
212
+ # Extract data from an Excel file using a pattern
213
+ .venv/bin/grepxcel extract -p pattern.xlsx data.xlsx
214
+
215
+ # Write output to a directory instead of stdout
216
+ .venv/bin/grepxcel extract -p pattern.xlsx data.xlsx -o output/
217
+
218
+ # Process a specific sheet, or every sheet at once
219
+ .venv/bin/grepxcel extract -p pattern.xlsx data.xlsx --sheet 2025
220
+ .venv/bin/grepxcel extract -p pattern.xlsx data.xlsx --all-sheets
221
+
222
+ # Verbose mode (step-by-step match log)
223
+ .venv/bin/grepxcel extract -p pattern.xlsx data.xlsx -v
224
+
225
+ # Legacy flat output format ({"cells":{}, "tables":[]})
226
+ .venv/bin/grepxcel extract -p pattern.xlsx data.xlsx --format legacy
227
+
228
+ # Generate a colour-coded pattern reference file
229
+ .venv/bin/grepxcel docs -o pattern-reference.xlsx
230
+
231
+ # Inspect an Excel file before extraction (format, encryption, extent, merges)
232
+ .venv/bin/grepxcel lint data.xlsx
233
+
234
+ # Generate a JSON Schema describing the extraction output of a pattern
235
+ .venv/bin/grepxcel schema pattern.xlsx -o schema.json
236
+
237
+ # Use a local LLM to draft a starter pattern for an unseen Excel file
238
+ .venv/bin/grepxcel draft data.xlsx -o draft-pattern.xlsx
239
+ ```
240
+
241
+ ### Python API
242
+
243
+ The one-call entry point is `grepxcel.extract()`:
244
+
245
+ ```python
246
+ import grepxcel
247
+
248
+ result = grepxcel.extract("pattern.xlsx", "data.xlsx")
249
+
250
+ # Fields are grouped by dot-notation prefix
251
+ print(result["po"]["number"]) # "PO-2026"
252
+ print(result["vendor"]["name"]) # "Acme Supplies"
253
+
254
+ # Tables are arrays of instance objects
255
+ for row in result["line"][0]["data"]:
256
+ print(row["item"], row["qty"])
257
+
258
+ # Options: a specific sheet, every sheet, or the legacy output shape
259
+ result = grepxcel.extract("pattern.xlsx", "data.xlsx", sheet="Q1")
260
+ by_sheet = grepxcel.extract("pattern.xlsx", "data.xlsx", all_sheets=True)
261
+ ```
262
+
263
+ `extract()` is silent by default. For progress/warnings, or to reuse one engine
264
+ across many calls, use the underlying `Engine` directly:
265
+
266
+ ```python
267
+ from grepxcel import Engine, Logger, VerbosityLevel
268
+
269
+ logger = Logger(level=VerbosityLevel.NORMAL)
270
+ result = Engine().process("pattern.xlsx", "data.xlsx", logger=logger)
271
+ ```
272
+
273
+ ---
274
+
275
+ ## Output format
276
+
277
+ Fields defined with dot notation (`po.number`, `po.date`) are grouped into nested objects.
278
+ Tables always produce an array of instance objects, each containing `data`, and optionally
279
+ `header` and `footer` sections.
280
+
281
+ ```json
282
+ {
283
+ "po": { "number": "PO-2026", "date": "2026-05-01" },
284
+ "vendor": { "name": "Acme Supplies" },
285
+ "line": [
286
+ {
287
+ "data": [
288
+ { "item": "Laptop", "qty": 2, "price": 1200.00, "total": 2400.00 },
289
+ { "item": "Dock", "qty": 6, "price": 75.00, "total": 450.00 }
290
+ ],
291
+ "footer": { "label": "Grand Total", "value": 2850.00 }
292
+ }
293
+ ]
294
+ }
295
+ ```
296
+
297
+ Label fields (`lbl:`) are used only for positional anchoring and are never included in output.
298
+
299
+ ---
300
+
301
+ ## CLI reference
302
+
303
+ ### `grepxcel extract`
304
+
305
+ | Flag | Default | Purpose |
306
+ |---|---|---|
307
+ | `-p FILE` | required | Pattern file — `.xlsx` or `.csv` |
308
+ | `--format` | `nested` | Output format: `nested` (default) or `legacy` |
309
+ | `-o DIR` | — | Write JSON to directory (stdout if omitted) |
310
+ | `-l FILE` | — | Append structured log to file |
311
+ | `--log-format` | `text` | Log format: `text` (human) or `json` (NDJSON for SIEM/cloud) |
312
+ | `--meta` | off | Add `_meta` block to JSON output (run_id, stats, issues) |
313
+ | `-v` / `-vv` | off | Verbosity: per-field trace (`field ← B1 = value ✓/✗`) / anchor probes |
314
+ | `-d` / `--debug` | off | Same as `-vv` |
315
+ | `--max-rows N` | 2048 | Max data-sheet rows (raise up to Excel's 1,048,576; untested above default) |
316
+ | `--max-columns N` | 1024 | Max data-sheet columns (raise up to Excel's 16,384; untested above default) |
317
+ | `--max-size MB` | 5 | Compressed file size limit |
318
+ | `--max-uncompressed MB` | 50 | Uncompressed ZIP content limit (ZIP bomb guard) |
319
+ | `--max-cell-len N` | 1000 | Max cell chars fed to regex (ReDoS guard) |
320
+ | `--sheet NAME_OR_INDEX` | active | Sheet name or 0-based index to process |
321
+ | `--all-sheets` | off | Process every sheet; output is a dict keyed by sheet name (mutually exclusive with `--sheet`) |
322
+
323
+ ### `grepxcel validate-pattern`
324
+
325
+ Check that a pattern file (`.xlsx` or `.csv`) is valid to use — **without** running
326
+ an extraction. It applies the same rules extraction does (structure, field types,
327
+ multiplicities, regex safety, comments) and additionally flags an empty extraction
328
+ sequence and references to undefined fields. **Exits non-zero** if any file is invalid.
329
+
330
+ ```bash
331
+ grepxcel validate-pattern pattern.xlsx
332
+ grepxcel validate-pattern pattern.csv -v # + parsed config, fields, and steps
333
+ grepxcel validate-pattern a.xlsx b.csv # validate several at once
334
+ ```
335
+
336
+ | Flag | Purpose |
337
+ |---|---|
338
+ | `FILE...` | One or more pattern files to validate |
339
+ | `-v`, `--verbose` | Print the parsed config, fields, and extraction sequence |
340
+
341
+ ### `grepxcel docs`
342
+
343
+ | Flag | Default | Purpose |
344
+ |---|---|---|
345
+ | `-o FILE` | `pattern-reference.xlsx` | Output path for the reference file |
346
+
347
+ ### `grepxcel generate-examples`
348
+
349
+ Creates 4 ready-to-run example sets (pattern + data + README) in a local directory.
350
+
351
+ | Flag | Default | Purpose |
352
+ |---|---|---|
353
+ | `-o DIR` | `./grepxcel-examples/` | Directory to create |
354
+
355
+ Examples included: simple invoice (KV), product catalog (table), expense report
356
+ (KV + table + footer), loan schedule (KV header + amortization table).
357
+
358
+ ### `grepxcel draft`
359
+
360
+ Drafts a starter pattern file for an unseen Excel file using an LLM. The output is
361
+ a *starting point* — review and refine the generated regexes before use.
362
+ `grepxcel suggest` is a backward-compatible alias for this command.
363
+
364
+ > Curious which model to use, and how good `draft` actually is? See
365
+ > [docs/EVALUATION.md](docs/EVALUATION.md) for the methodology, model
366
+ > comparisons (local vs cloud), and honest notes on where it succeeds and fails.
367
+
368
+ | Flag | Default | Purpose |
369
+ |---|---|---|
370
+ | `-o FILE` | `draft_pattern.xlsx` | Write draft pattern to this path |
371
+ | `-v` | off | Print the Excel analysis sent to the model + update status |
372
+ | `--dry-run` | off | Print the analysis that would be sent to the model, then exit (no inference) |
373
+ | `--backend local\|claude\|github\|server\|gemini` | `local` | Inference backend (see below) |
374
+ | `--sheet NAME_OR_INDEX` | active | Sheet to analyse |
375
+ | `--max-size MB` | 5 | Compressed file size limit |
376
+ | `--max-uncompressed MB` | 50 | Uncompressed ZIP content limit (ZIP bomb guard) |
377
+
378
+ #### Backends
379
+
380
+ | Backend | Install | Notes |
381
+ |---|---|---|
382
+ | `local` *(default)* | `python3 scripts/install_llm_deps.py` | Runs Gemma-4-E4B (GGUF) in-process. **No data leaves your machine.** Model is pinned to a revision and downloaded once (~5 GB); set `GREPXCEL_MODEL_AUTOUPDATE=1` to track upstream, `GREPXCEL_MODEL_DIR` to relocate the cache. |
383
+ | `github` | `pip install -e '.[draft-cloud]'` | **GitHub Models** — free with a GitHub subscription (quota-limited, no per-token charge), and the **highest-quality option** in our eval. Requires `GITHUB_TOKEN` (fine-grained, `Models: read`). Pick a model with `--github-model`, e.g. `openai/gpt-4.1`, `openai/gpt-4o`, `meta/llama-3.3-70b-instruct`. Per-draft token usage + remaining quota are printed. |
384
+ | `claude` | `pip install -e '.[draft-cloud]'` | Anthropic API. Requires `ANTHROPIC_API_KEY`. Prints a one-line privacy notice and per-call token cost (~$0.003–0.04/draft). |
385
+ | `server` | `pip install openai` | Any **OpenAI-compatible server** (LM Studio, Ollama, vLLM, text-generation-inference). Default URL: `http://localhost:1234/v1` (override with `--server-url` or `GREPXCEL_SERVER_URL`). Model is auto-discovered unless `--server-model` is set. Data stays local. |
386
+ | `gemini` | — | **Planned for a future release** — not yet available. Selecting it prints a notice and exits. |
387
+
388
+ Backends read keys from a `.env` file in the project (or any parent) directory,
389
+ so you don't have to export them. Real environment variables take precedence.
390
+ On draft quality (execution-based eval, all fixtures): the free **`github`**
391
+ models (`openai/gpt-4.1`, `openai/gpt-4o`, `meta/llama-3.3-70b-instruct`) lead by
392
+ a wide margin, ahead of the `local` model — see
393
+ [docs/EVALUATION.md](docs/EVALUATION.md).
394
+
395
+ > **Faster first download (`local` backend):** the model is fetched once from
396
+ > HuggingFace. Anonymous downloads work but are rate-limited and can be slow or
397
+ > stall on the ~5 GB file. For a faster, more reliable first run, set a free
398
+ > [HuggingFace token](https://huggingface.co/settings/tokens) (read scope):
399
+ > `HF_TOKEN=hf_... grepxcel draft data.xlsx`. It's optional and one-time — the
400
+ > model is cached afterwards. If a download stalls, grepxcel prints this tip too.
401
+
402
+ > **Privacy:** the `claude` cloud backend sends only the *structure description*
403
+ > of your sheet (column types, sample values, labels) — never the raw file. The
404
+ > `local` backend sends nothing over the network during inference.
405
+
406
+ ```bash
407
+ grepxcel draft data.xlsx # local model (default)
408
+ grepxcel draft data.xlsx --dry-run # inspect the analysis, no inference
409
+ ANTHROPIC_API_KEY=sk-... grepxcel draft data.xlsx --backend claude
410
+ grepxcel draft data.xlsx --backend server # LM Studio / Ollama on localhost:1234
411
+ grepxcel draft data.xlsx --backend server --server-url http://host:8080/v1
412
+ ```
413
+
414
+ #### Model cache & offline / alternative downloads
415
+
416
+ The local model is stored once in the platform-appropriate per-user cache
417
+ (resolved with [`platformdirs`](https://pypi.org/project/platformdirs/), the same
418
+ convention pip uses):
419
+
420
+ | OS | Default cache |
421
+ |---|---|
422
+ | Linux | `$XDG_CACHE_HOME/grepxcel/models/` (default `~/.cache/grepxcel/models/`) |
423
+ | macOS | `~/Library/Caches/grepxcel/models/` |
424
+ | Windows | `%LOCALAPPDATA%\grepxcel\Cache\models\` |
425
+
426
+ Override the location with `GREPXCEL_MODEL_DIR` (handy for Docker volumes or a
427
+ shared model dir) — it takes precedence over the defaults above.
428
+
429
+ If the HuggingFace download is slow or blocked, you don't have to let grepxcel
430
+ fetch it — **grepxcel only downloads when the file isn't already in the cache**,
431
+ so you can supply it yourself from any source:
432
+
433
+ ```bash
434
+ mkdir -p ~/.cache/grepxcel/models # or your $GREPXCEL_MODEL_DIR
435
+ # download the GGUF anywhere (HF website, a mirror, ModelScope, …), then place it
436
+ # with this EXACT name so grepxcel finds it and skips the download:
437
+ mv gemma-4-E4B-it-Q4_K_M.gguf ~/.cache/grepxcel/models/
438
+ grepxcel draft data.xlsx # uses the local file — no download
439
+ ```
440
+
441
+ Other options:
442
+
443
+ - **Mirror:** `huggingface_hub` honors `HF_ENDPOINT`, e.g.
444
+ `HF_ENDPOINT=https://hf-mirror.com grepxcel draft data.xlsx` (third-party mirror).
445
+ - **Token:** `HF_TOKEN=hf_...` removes the anonymous rate limit (fastest fix).
446
+
447
+ > The normal HuggingFace download is **hash-verified** against the pinned
448
+ > revision. A **manually-placed file is also trusted automatically when its
449
+ > sha256 matches the pinned build** (grepxcel ships the known-good hash, so the
450
+ > trust is portable across machines). A file that doesn't match a known-good
451
+ > hash and has no recorded fingerprint is used on trust with a warning — make
452
+ > sure such a source is trustworthy, or pass `--allow-unverified-model` to
453
+ > silence the warning.
454
+
455
+ #### Behind a corporate proxy / TLS inspection (experimental)
456
+
457
+ Corporate networks often route HTTPS through an inspection proxy (NetSkope,
458
+ Zscaler, …) that re-signs traffic with a **company CA** Python doesn't trust by
459
+ default — so the model download and the cloud backends fail with
460
+ `CERTIFICATE_VERIFY_FAILED`. grepxcel can trust that CA (TLS verification stays
461
+ on — it is never disabled):
462
+
463
+ - **Best:** have IT install the corporate root CA in your OS trust store, then
464
+ `pip install truststore` (it ships with the `suggest` / `draft-cloud` extras).
465
+ grepxcel then uses the OS store automatically — no env vars needed.
466
+ - **Or** point grepxcel at the CA `.pem` file:
467
+ ```bash
468
+ grepxcel draft data.xlsx --ca-bundle /path/to/corporate-ca.pem
469
+ # or: export GREPXCEL_CA_BUNDLE=/path/to/corporate-ca.pem
470
+ # (REQUESTS_CA_BUNDLE / SSL_CERT_FILE are also honored)
471
+ ```
472
+ - If your network requires a proxy, set `HTTPS_PROXY` / `NO_PROXY` as usual
473
+ (both `requests` and `httpx` read them automatically).
474
+ - Run `grepxcel doctor` to verify the setup — it does a live TLS handshake.
475
+
476
+ > ⚠️ This support has **not been tested against a real intercept proxy**;
477
+ > grepxcel prints a one-time notice when proxy/CA settings are in effect.
478
+
479
+ ### `grepxcel lint`
480
+
481
+ Inspect an Excel data file before extraction — catches issues that would cause
482
+ extraction to fail or produce unexpected results.
483
+
484
+ ```bash
485
+ grepxcel lint data.xlsx
486
+ grepxcel lint jan.xlsx feb.xlsx # lint several files
487
+ ```
488
+
489
+ It prints a `✓/⚠/✗/ℹ` checklist covering:
490
+ - **File format** — extension, ZIP integrity, encryption/IRM detection
491
+ - **Microsoft Information Protection (MIP)** — detects OLE Compound Documents
492
+ (encrypted by sensitivity labels) and explains how to obtain an extractable copy
493
+ - **Sheet dimensions** — declared vs real used extent (detects styling inflation)
494
+ - **Merged cells** — listed with a note on how grepxcel handles them
495
+ - **Formula cells** — warns about potentially stale cached values
496
+ - **Empty sheets** — nothing to extract
497
+ - **Advisory notes** — known limitations not yet auto-detected (password
498
+ protection, conditional formatting, pivot tables, VBA)
499
+
500
+ ### `grepxcel schema`
501
+
502
+ Generate a [JSON Schema](https://json-schema.org/) (draft 2020-12) that describes
503
+ the extraction output for a pattern. Use it to validate extracted JSON with any
504
+ standard JSON Schema validator — handy when importing thousands of files that must
505
+ all follow the same structure.
506
+
507
+ ```bash
508
+ grepxcel schema pattern.xlsx # print schema to stdout
509
+ grepxcel schema pattern.xlsx -o schema.json
510
+ grepxcel schema pattern.csv # CSV patterns work too
511
+ ```
512
+
513
+ The schema mirrors the nested output shape:
514
+ - **Field types** map to JSON types (`currency`/`percentage` → `number`,
515
+ `date`/`datetime` → `string` with `format: date-time`, etc.)
516
+ - **Dot-notation** fields (`po.number`) become nested objects
517
+ - **Tables** become arrays of objects with `_source`/`header`/`data`/`footer`
518
+ sub-objects
519
+ - All fields are **nullable** (`["string", "null"]`) since an empty cell extracts
520
+ as `null`
521
+
522
+ Validate an extraction against it with any tool, e.g. Python's `jsonschema`:
523
+
524
+ ```bash
525
+ grepxcel schema pattern.xlsx -o schema.json
526
+ grepxcel extract -p pattern.xlsx data.xlsx -o out/
527
+ python -m jsonschema -i out/data.json schema.json
528
+ ```
529
+
530
+ ### `grepxcel doctor`
531
+
532
+ Preflight check that tells you exactly what's needed to run the tool.
533
+
534
+ ```bash
535
+ grepxcel doctor # everything: extract + draft + proxy/TLS
536
+ grepxcel doctor extract # only what extract needs (offline)
537
+ grepxcel doctor draft # only what draft needs (deps, keys, model, proxy/TLS)
538
+ ```
539
+
540
+ It prints a `✓/⚠/✗` checklist — Python version, dependencies, API keys, the
541
+ local-model cache + disk space, and proxy/CA config with a live handshake — and
542
+ **exits non-zero** if the selected area has a blocking (`✗`) problem.
543
+
544
+ | Flag | Purpose |
545
+ |---|---|
546
+ | `area` | `extract`, `draft`, or `all` (default `all`) |
547
+ | `--no-probe` | Skip the live TLS handshake (offline / faster) |
548
+
549
+ ### `grepxcel mcp`
550
+
551
+ Starts grepxcel as a [Model Context Protocol](https://modelcontextprotocol.io)
552
+ (MCP) server so AI agents can call grepxcel tools directly.
553
+
554
+ ```bash
555
+ pip install 'grepxcel[mcp]' # install the MCP dependency
556
+ grepxcel mcp-config # print the config for your AI agent
557
+ grepxcel mcp # start the server (stdio transport)
558
+ ```
559
+
560
+ **Exposed tools:** `extract`, `validate_pattern`, `lint`, `schema`, `docs`,
561
+ `doctor`, `generate_examples`.
562
+
563
+ | Flag | Default | Purpose |
564
+ |---|---|---|
565
+ | `--target` (mcp-config) | `claude-code` | Config format: `claude-code`, `claude-desktop`, `cursor` |
566
+
567
+ ---
568
+
569
+ ## Verbosity levels
570
+
571
+ | Level | What you see |
572
+ |---|---|
573
+ | `QUIET` | Nothing — records still collected in memory |
574
+ | `NORMAL` | Summary + all validation warnings with hints *(default)* |
575
+ | `VERBOSE` | + every cell and table match step by step |
576
+ | `DEBUG` | + every anchor probe and rejection reason |
577
+
578
+ ---
579
+
580
+ ## Logging model & data safety
581
+
582
+ grepxcel's logging is designed so that **no extracted Excel cell value ever reaches
583
+ a structured log record** — by construction, not by redaction.
584
+
585
+ ### How it works
586
+
587
+ - **Console / text logs** (`--log-format text`, default) show the full diagnostic
588
+ detail (including cell values) on stderr — intended for a human operator sitting
589
+ at the terminal.
590
+ - **Structured logs** (`--log-format json`) write one NDJSON record per event to
591
+ the `--log` file. Each record is built from an **allow-list of safe keys** only:
592
+ `ts`, `level`, `category`, `event`, `cell`, `field`, `field_type`, `value_len`,
593
+ `value_sha8`, `run_id`, `source`, `schema_version`. Free-form fields (`message`,
594
+ `hint`, `expected`, `found`) are never serialized.
595
+ - A **non-reversible value fingerprint** (`value_len` + `value_sha8`, a truncated
596
+ SHA-256) is included for diagnostics without revealing the cell content.
597
+ - An end-of-run **summary** event carries extraction statistics — counts and field
598
+ names only, never values.
599
+ - There is **no opt-out** from the allow-list model: structured logs cannot be configured to include cell values; only the text log, intended for human operators, shows them.
600
+
601
+ ### GDPR / PII considerations
602
+
603
+ Because structured JSON logs never contain cell values, they can be shipped to a
604
+ SIEM or cloud log aggregator without risk of leaking personal data that may be
605
+ present in the Excel files being processed.
606
+
607
+ The `--meta` flag adds a `_meta` block to the extracted JSON output with `run_id`,
608
+ statistics, and safe issue records — suitable for pipeline auto-verification. This
609
+ block also follows the allow-list model and never contains cell values.
610
+
611
+ **Operators are responsible for** controlling access to the text-mode log files and
612
+ the extracted JSON output, which do contain the actual data.
613
+
614
+ ---
615
+
616
+
617
+ ## Project layout
618
+
619
+ ```text
620
+ grepxcel/ ← importable Python package (engine, parser, models, security, cli, drafter)
621
+ scripts/ ← reusable utility scripts for contributors
622
+ tests/
623
+ fixtures/ ← pattern + data xlsx pairs (one folder per scenario)
624
+ unit/ ← pytest unit tests
625
+ integration/ ← pytest integration tests
626
+ docs/ ← additional documentation
627
+ tmp.local/ ← throwaway scripts, never synced (gitignored)
628
+ samples.local/ ← local-only Excel files, never synced (gitignored)
629
+ logs/ ← log file output (gitignored)
630
+ output/ ← JSON extraction results (gitignored)
631
+ ```
632
+
633
+ ---
634
+
635
+ ## Running the tests
636
+
637
+ ```bash
638
+ .venv/bin/pytest tests/ -q
639
+ ```
640
+
641
+ 1200+ unit and integration tests across 22 fixture scenarios — all green.
642
+
643
+ ---
644
+
645
+ ## Requirements
646
+
647
+ - Python 3.11+
648
+ - `openpyxl >= 3.1`
649
+ - `defusedxml >= 0.7`
650
+ - `llama-cpp-python >= 0.2.90` and `huggingface_hub >= 0.23` — only for `grepxcel draft`
651
+
652
+ ---
653
+
654
+ ## Support
655
+
656
+ grepxcel is free and open source. If it saves you time and you'd like to say thanks,
657
+ you can [buy me a coffee](https://buymeacoffee.com/scpg.dev) ☕ — entirely optional and
658
+ always appreciated.
659
+
660
+ <a href="https://buymeacoffee.com/scpg.dev" target="_blank"><img src="https://cdn.buymeacoffee.com/buttons/v2/default-yellow.png" alt="Buy Me A Coffee" height="41" width="174"></a>
661
+
662
+ ## Contributing
663
+
664
+ See [CONTRIBUTING.md](CONTRIBUTING.md). All PRs target the `dev` branch.
665
+
666
+ ## Security
667
+
668
+ See [SECURITY.md](SECURITY.md) for how to report vulnerabilities privately.
669
+
670
+ ## License
671
+
672
+ MIT — see [LICENSE](LICENSE).