xlsx-for-ai 1.5.0 → 1.5.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -94,6 +94,7 @@ npx xlsx-for-ai data.xlsx "Sheet1" --stdout --max-rows 50 --compact
94
94
  | `[sheetName]` | Positional: dump only this sheet |
95
95
  | `--range A1:D50` | Dump only this rectangular range |
96
96
  | `--named-range NAME` | Dump only the cells covered by a workbook-defined name |
97
+ | `--region` | Auto-detect the dominant contiguous data block (Excel "current region" / Ctrl+Shift+*). Picks the largest region by populated-cell count when multiple disjoint blocks exist. Compatible with `--max-rows` / `--max-cols`. |
97
98
  | `--max-rows N` | Cap at the first N rows per sheet |
98
99
  | `--max-cols N` | Cap at the first N columns per sheet |
99
100
 
@@ -318,7 +319,22 @@ A future release may apply these dep upgrades via `patch-package` so they travel
318
319
 
319
320
  ## Reporting bugs
320
321
 
321
- **The privacy contract: we never auto-send your data.** xlsx-for-ai has no telemetry endpoint and no consent dialog to maintain — there's nothing to opt out of, because nothing leaves your machine unless you choose to attach it to a GitHub issue.
322
+ **The privacy contract: we never auto-send workbook data.** Anonymous crash telemetry is opt-in via `--enable-telemetry`; even then, we receive only error type, error message (sanitized — paths scrubbed, capped at 200 chars), tool version, Node version, and OS/arch. No paths, no cell values, no identifiers.
323
+
324
+ To enable or manage crash telemetry:
325
+
326
+ ```bash
327
+ # Opt in — prints the exact payload schema so you can see what gets sent
328
+ xlsx-for-ai --enable-telemetry
329
+
330
+ # Opt out
331
+ xlsx-for-ai --disable-telemetry
332
+
333
+ # Check current state and config path
334
+ xlsx-for-ai --telemetry-status
335
+ ```
336
+
337
+ Consent is stored at `~/.xlsx-for-ai/config.json` and persists across `npm install -g xlsx-for-ai@latest` upgrades. If the telemetry shape ever changes, the tool pauses sending and prompts you to re-opt-in — we never silently expand what we collect under old consent.
322
338
 
323
339
  When something breaks on a real workbook, two flags help us reproduce locally without asking you to share the original file:
324
340
 
package/WHY.md CHANGED
@@ -92,3 +92,7 @@ Spreadsheet libraries are designed for developers building software *on top of*
92
92
  `xlsx-for-ai` is the first one built specifically for that. The output is shaped for an LLM's context window — markdown tables when the model just needs to read, structured JSON when it needs to reason, token-aware truncation when the spreadsheet is too big to fit, and a real `.xlsx` writer that produces a file you can hand back to a human along with a built-in note explaining everything that changed.
93
93
 
94
94
  It's a small tool. It just happens to fix the one thing standing between AI assistants and the file format most knowledge work actually lives in.
95
+
96
+ ## Privacy contract
97
+
98
+ We never auto-send workbook data. Anonymous crash telemetry is opt-in via `xlsx-for-ai --enable-telemetry`; even then, we receive only error type, error message (sanitized — paths scrubbed, capped at 200 chars), and tool/Node/OS version — no paths, no cell values, no identifiers. Nothing leaves your machine unless you choose to enable it.
package/index.js CHANGED
@@ -22,10 +22,11 @@ if (!process.env.XLSX_FOR_AI_RESPAWNED) {
22
22
  const path = require('path');
23
23
  const fs = require('fs');
24
24
  // All xlsx-engine access goes through the engine abstraction in lib/engine.js
25
- // — never require the underlying engine directly. To swap engines (fork,
26
- // different library, server-side service), replace lib/engine.js. Nothing
27
- // else changes. Current engine: @protobi/exceljs (drop-in fork of exceljs
28
- // with active maintenance + preservation patches; see ROADMAP for rationale).
25
+ // — lib/engine.js is the ONLY place in lib/ that requires @protobi/exceljs.
26
+ // To swap engines (fork, different library, server-side service), replace
27
+ // lib/engine.js; nothing else changes. Current engine: @protobi/exceljs
28
+ // (drop-in fork of exceljs with active maintenance + preservation patches;
29
+ // see ROADMAP for rationale).
29
30
  const engine = require('./lib/engine');
30
31
 
31
32
  // Lazy-load heavy deps only when their feature is used (keeps cold start fast
@@ -55,12 +56,16 @@ function parseArgs(argv) {
55
56
  diff: null,
56
57
  range: null,
57
58
  namedRange: null,
59
+ region: false,
58
60
  maxRows: null,
59
61
  maxCols: null,
60
62
  maxTokens: null,
61
63
  reportBug: null,
62
64
  exportRedactedWorkbook: null,
63
65
  help: false,
66
+ enableTelemetry: false,
67
+ disableTelemetry: false,
68
+ telemetryStatus: false,
64
69
  };
65
70
  let i = 0;
66
71
  while (i < argv.length) {
@@ -77,11 +82,15 @@ function parseArgs(argv) {
77
82
  else if (arg === '--diff') { opts.diff = argv[++i]; }
78
83
  else if (arg === '--range') { opts.range = argv[++i]; }
79
84
  else if (arg === '--named-range') { opts.namedRange = argv[++i]; }
85
+ else if (arg === '--region') opts.region = true;
80
86
  else if (arg === '--max-rows') { opts.maxRows = parseInt(argv[++i], 10); }
81
87
  else if (arg === '--max-cols') { opts.maxCols = parseInt(argv[++i], 10); }
82
88
  else if (arg === '--max-tokens') { opts.maxTokens = parseInt(argv[++i], 10); }
83
89
  else if (arg === '--report-bug') { opts.reportBug = argv[++i]; }
84
90
  else if (arg === '--export-redacted-workbook'){ opts.exportRedactedWorkbook = argv[++i]; }
91
+ else if (arg === '--enable-telemetry') opts.enableTelemetry = true;
92
+ else if (arg === '--disable-telemetry') opts.disableTelemetry = true;
93
+ else if (arg === '--telemetry-status') opts.telemetryStatus = true;
85
94
  else if (arg === '-h' || arg === '--help') opts.help = true;
86
95
  else opts.positional.push(arg);
87
96
  i++;
@@ -111,6 +120,10 @@ Selection:
111
120
  [sheetName] Positional second arg, dump only this sheet
112
121
  --range A1:D50 Dump only this rectangular range
113
122
  --named-range NM Dump only the cells covered by this defined name
123
+ --region Auto-detect the dominant contiguous data block (Excel
124
+ "current region" semantics); picks the largest region
125
+ by populated-cell count when multiple disjoint blocks
126
+ exist. Compatible with --max-rows / --max-cols.
114
127
  --max-rows N Limit to first N rows per sheet
115
128
  --max-cols N Limit to first N columns per sheet
116
129
 
@@ -141,6 +154,21 @@ Bug reporting (privacy-by-design — no data leaves your machine):
141
154
  structure, styles, named ranges preserved. Optional
142
155
  attachment for hard-to-repro bugs.
143
156
 
157
+ Crash telemetry (opt-in only):
158
+ --enable-telemetry
159
+ Opt in to anonymous crash telemetry. Only error type,
160
+ sanitized error message (paths scrubbed, ≤200 chars),
161
+ tool version, Node version, and OS/arch are sent.
162
+ No paths, no cell values, no identifiers.
163
+ Payload: { v, ts, error_type, error_message, command,
164
+ xlsx_for_ai_version, node_version, os_arch }
165
+ Consent persists at ~/.xlsx-for-ai/config.json across
166
+ upgrades.
167
+ --disable-telemetry
168
+ Opt out. Config file is kept (explicit "no" is recorded).
169
+ --telemetry-status
170
+ Show current state and config path.
171
+
144
172
  Misc:
145
173
  -h, --help Show this help
146
174
 
@@ -150,6 +178,8 @@ Examples:
150
178
  npx xlsx-for-ai data.xlsx --json --max-tokens 8000 --stdout
151
179
  npx xlsx-for-ai data.csv --md --stdout
152
180
  npx xlsx-for-ai data.xlsx --range B2:F100 --stdout
181
+ npx xlsx-for-ai data.xlsx --region --stdout
182
+ npx xlsx-for-ai data.xlsx --region --max-rows 50 --stdout
153
183
  npx xlsx-for-ai data.xlsx --named-range MyTotals --stdout
154
184
  npx xlsx-for-ai data.xlsx --sql --stdout > schema.sql
155
185
  npx xlsx-for-ai old.xlsx --diff new.xlsx --stdout
@@ -325,6 +355,115 @@ function resolveNamedRange(wb, name) {
325
355
  return { sheet: sheetName, range: parseRange(rangeStr) };
326
356
  }
327
357
 
358
+ // ---------------------------------------------------------------------------
359
+ // Region detection — "current region" semantics (Excel Ctrl+Shift+*)
360
+ //
361
+ // Finds the dominant contiguous data block on a worksheet. Algorithm:
362
+ // 1. Scan the sheet to collect all populated cells.
363
+ // 2. Build connected components using 8-neighbor flood fill (cells that
364
+ // share a corner or edge are in the same region).
365
+ // 3. For each component, compute the bounding rectangle and the count of
366
+ // populated cells inside it.
367
+ // 4. Return the bounding box of the component with the most populated cells
368
+ // (tie-break: the first component found wins).
369
+ //
370
+ // Returns {startRow, startCol, endRow, endCol} (1-indexed), or null if the
371
+ // sheet has no populated cells.
372
+ // ---------------------------------------------------------------------------
373
+
374
+ function detectRegion(ws) {
375
+ // Step 1: collect all populated cells into a Set for O(1) lookup.
376
+ // We store them as "row,col" strings and also keep a list for iteration.
377
+ const populated = new Set();
378
+ const cells = [];
379
+
380
+ const rowCount = ws.rowCount;
381
+ const colCount = ws.columnCount;
382
+ if (rowCount === 0 || colCount === 0) return null;
383
+
384
+ // ExcelJS reports rowCount/columnCount as the highest USED row/column,
385
+ // not actual storage. A workbook with one cell at XFD1048576 reports
386
+ // 1048576 × 16384 = ~17B coordinates. Refuse the scan past 5M cells —
387
+ // pathological/malicious inputs would otherwise hang the CLI.
388
+ if (rowCount * colCount > 5_000_000) {
389
+ console.warn(
390
+ `detectRegion: workbook reports ${rowCount}×${colCount} cell dimensions, ` +
391
+ `exceeds 5M-cell scan cap; skipping region detection`
392
+ );
393
+ return null;
394
+ }
395
+
396
+ for (let r = 1; r <= rowCount; r++) {
397
+ const row = ws.getRow(r);
398
+ for (let c = 1; c <= colCount; c++) {
399
+ const v = row.getCell(c).value;
400
+ if (v != null && v !== '') {
401
+ const key = `${r},${c}`;
402
+ populated.add(key);
403
+ cells.push([r, c]);
404
+ }
405
+ }
406
+ }
407
+
408
+ if (cells.length === 0) return null;
409
+
410
+ // Step 2: flood-fill connected components (8-neighbor).
411
+ const visited = new Set();
412
+ const components = [];
413
+
414
+ for (const [startR, startC] of cells) {
415
+ const key = `${startR},${startC}`;
416
+ if (visited.has(key)) continue;
417
+
418
+ // BFS from this seed cell.
419
+ const component = [];
420
+ const queue = [[startR, startC]];
421
+ visited.add(key);
422
+
423
+ while (queue.length > 0) {
424
+ const [r, c] = queue.shift();
425
+ component.push([r, c]);
426
+ // 8 neighbors
427
+ for (let dr = -1; dr <= 1; dr++) {
428
+ for (let dc = -1; dc <= 1; dc++) {
429
+ if (dr === 0 && dc === 0) continue;
430
+ const nr = r + dr;
431
+ const nc = c + dc;
432
+ if (nr < 1 || nc < 1) continue;
433
+ const nk = `${nr},${nc}`;
434
+ if (!visited.has(nk) && populated.has(nk)) {
435
+ visited.add(nk);
436
+ queue.push([nr, nc]);
437
+ }
438
+ }
439
+ }
440
+ }
441
+ components.push(component);
442
+ }
443
+
444
+ // Step 3: pick the component with the most populated cells.
445
+ let best = null;
446
+ let bestCount = -1;
447
+ for (const comp of components) {
448
+ if (comp.length > bestCount) {
449
+ bestCount = comp.length;
450
+ best = comp;
451
+ }
452
+ }
453
+
454
+ // Step 4: compute bounding rectangle of the winning component.
455
+ let minR = Infinity, maxR = -Infinity;
456
+ let minC = Infinity, maxC = -Infinity;
457
+ for (const [r, c] of best) {
458
+ if (r < minR) minR = r;
459
+ if (r > maxR) maxR = r;
460
+ if (c < minC) minC = c;
461
+ if (c > maxC) maxC = c;
462
+ }
463
+
464
+ return { startRow: minR, endRow: maxR, startCol: minC, endCol: maxC };
465
+ }
466
+
328
467
  // ---------------------------------------------------------------------------
329
468
  // Selection bounds — combines --range, --named-range, --max-rows/cols, sheet
330
469
  // dimensions into a single {startRow, startCol, endRow, endCol}.
@@ -336,6 +475,9 @@ function selectionBounds(ws, opts) {
336
475
  bounds = parseRange(opts.range);
337
476
  } else if (opts.namedRangeBounds) {
338
477
  bounds = opts.namedRangeBounds;
478
+ } else if (opts.region) {
479
+ bounds = detectRegion(ws);
480
+ // bounds may be null (empty sheet); handled below by falling back to sheet dimensions.
339
481
  }
340
482
  const startRow = bounds ? bounds.startRow : 1;
341
483
  const startCol = bounds ? bounds.startCol : 1;
@@ -1709,6 +1851,56 @@ async function main() {
1709
1851
 
1710
1852
  if (opts.help) { printHelp(); process.exit(0); }
1711
1853
 
1854
+ // ---------------------------------------------------------------------------
1855
+ // Telemetry management flags — handled before crash hooks are registered.
1856
+ // ---------------------------------------------------------------------------
1857
+ if (opts.enableTelemetry || opts.disableTelemetry || opts.telemetryStatus) {
1858
+ const telCfg = require('./lib/telemetry-config');
1859
+
1860
+ if (opts.enableTelemetry) {
1861
+ telCfg.enableTelemetry();
1862
+ console.log('Crash telemetry enabled.');
1863
+ console.log('');
1864
+ console.log('When a crash occurs, this payload will be sent:');
1865
+ console.log(JSON.stringify({
1866
+ v: 1,
1867
+ ts: '<ISO-timestamp>',
1868
+ error_type: '<e.g. TypeError>',
1869
+ error_message: '<sanitized, ≤200 chars — paths scrubbed>',
1870
+ command: '<first CLI arg from allowlist, or "<other>">',
1871
+ xlsx_for_ai_version: require('./package.json').version,
1872
+ node_version: process.version,
1873
+ os_arch: `${process.platform}-${process.arch}`,
1874
+ }, null, 2));
1875
+ console.log('');
1876
+ console.log('No paths, no cell values, no identifiers. Consent stored at:');
1877
+ console.log(telCfg.configPath());
1878
+ return;
1879
+ }
1880
+
1881
+ if (opts.disableTelemetry) {
1882
+ telCfg.disableTelemetry();
1883
+ console.log('Crash telemetry disabled.');
1884
+ console.log('Config kept at: ' + telCfg.configPath());
1885
+ return;
1886
+ }
1887
+
1888
+ if (opts.telemetryStatus) {
1889
+ const status = telCfg.telemetryStatus();
1890
+ console.log(`Telemetry status: ${status}`);
1891
+ console.log(`Config path: ${telCfg.configPath()}`);
1892
+ return;
1893
+ }
1894
+ }
1895
+
1896
+ // ---------------------------------------------------------------------------
1897
+ // Register process-level crash hooks (no-op if user hasn't opted in).
1898
+ // ---------------------------------------------------------------------------
1899
+ {
1900
+ const { registerCrashHooks } = require('./lib/telemetry-hooks');
1901
+ registerCrashHooks(require('./package.json').version);
1902
+ }
1903
+
1712
1904
  // Bug-report and redacted-workbook modes consume their input via the
1713
1905
  // flag itself, so they bypass the normal positional / loader path.
1714
1906
  if (opts.reportBug) {
@@ -1838,6 +2030,16 @@ async function main() {
1838
2030
 
1839
2031
  const baseName = path.basename(filePath, path.extname(filePath));
1840
2032
 
2033
+ // --region: warn per-sheet if no data block was found (empty sheet).
2034
+ if (opts.region) {
2035
+ for (const ws of sheets) {
2036
+ const r = detectRegion(ws);
2037
+ if (!r) {
2038
+ console.error(`note: --region: no data found in sheet "${ws.name}"; dumping full sheet dimensions.`);
2039
+ }
2040
+ }
2041
+ }
2042
+
1841
2043
  // Pick output formatter.
1842
2044
  const renderText = (ws) => dumpSheet(ws, wb, perSheetOpts);
1843
2045
  const renderMd = (ws) => dumpSheetMarkdown(ws, wb, perSheetOpts);
@@ -1965,4 +2167,7 @@ module.exports = {
1965
2167
  trySimpleEval,
1966
2168
  // budget
1967
2169
  applyTokenBudget,
2170
+ // region detection
2171
+ detectRegion,
2172
+ selectionBounds,
1968
2173
  };
package/lib/bugReport.js CHANGED
@@ -21,7 +21,7 @@ const fs = require('fs');
21
21
  const path = require('path');
22
22
  const os = require('os');
23
23
  const JSZip = require('jszip');
24
- const ExcelJS = require('@protobi/exceljs');
24
+ const engine = require('./engine');
25
25
 
26
26
  const PKG_VERSION = require('../package.json').version;
27
27
 
@@ -117,21 +117,6 @@ function inventoryFeatures(filenames) {
117
117
  return out;
118
118
  }
119
119
 
120
- // Given the workbook.xml, extract the sheet relationship Ids and order
121
- // without reading any user content. We just need names and rIds so we
122
- // can pair them with worksheet parts to compute per-sheet stats.
123
- function listSheetPartNames(zip) {
124
- // Resolve via workbook rels: xl/_rels/workbook.xml.rels.
125
- const out = [];
126
- const relsFile = zip.file('xl/_rels/workbook.xml.rels');
127
- if (!relsFile) return out;
128
- // Sync — we already have the file in memory inside JSZip.
129
- // We use a lightweight regex; structural only, no values inside.
130
- // Each Relationship: <Relationship Id="rId1" Type="..." Target="worksheets/sheet1.xml"/>
131
- // We can't do sync read without loading; caller already loaded.
132
- return out;
133
- }
134
-
135
120
  async function generateBugReport(filePath) {
136
121
  if (!fs.existsSync(filePath)) {
137
122
  throw new Error(`File not found: ${filePath}`);
@@ -167,8 +152,7 @@ async function generateBugReport(filePath) {
167
152
  let exceljsError = null;
168
153
 
169
154
  try {
170
- const wb = new ExcelJS.Workbook();
171
- await wb.xlsx.readFile(filePath);
155
+ const wb = await engine.loadWorkbook(filePath);
172
156
  sheetCount = wb.worksheets.length;
173
157
  for (const ws of wb.worksheets) {
174
158
  const merges = ws.model && ws.model.merges ? ws.model.merges.length : 0;
package/lib/engine.js CHANGED
@@ -23,7 +23,7 @@ class ExcelJSEngine {
23
23
  /** Engine identifier — useful for diagnostics. */
24
24
  get name() { return 'exceljs'; }
25
25
  get version() {
26
- try { return require('exceljs/package.json').version; } catch (_) { return 'unknown'; }
26
+ try { return require('@protobi/exceljs/package.json').version; } catch (_) { return 'unknown'; }
27
27
  }
28
28
 
29
29
  /**
@@ -117,19 +117,138 @@ function redactSharedStringsXml(xml) {
117
117
  }
118
118
 
119
119
  // Comments: <comment><text><r>...<t>USER TEXT</t></r></text></comment>
120
- // Replace every <t> payload with "x".
120
+ // Replace every <t> payload with "x". Also strips <author>NAME</author>
121
+ // display names in <authors>; the numeric authorId on each <comment>
122
+ // references the (now redacted) author entry.
121
123
  function redactCommentsXml(xml) {
122
- return xml.replace(/(<t\b[^>]*>)([\s\S]*?)(<\/t>)/g, (m, open, payload, close) => {
124
+ let out = xml.replace(/(<t\b[^>]*>)([\s\S]*?)(<\/t>)/g, (m, open, payload, close) => {
125
+ return open + (payload === '' ? '' : 'x') + close;
126
+ });
127
+ out = out.replace(/(<author\b[^>]*>)([\s\S]*?)(<\/author>)/g, (m, open, payload, close) => {
123
128
  return open + (payload === '' ? '' : 'x') + close;
124
129
  });
130
+ return out;
125
131
  }
126
132
 
127
133
  // Threaded comments: <threadedComment ... text="USER TEXT" .../>
128
- // Excel encodes the body as an attribute — must redact in place.
134
+ // Excel encodes the body as an attribute — must redact in place. Both
135
+ // double-quoted and single-quoted attribute values are valid XML and we
136
+ // must scrub both forms.
129
137
  function redactThreadedCommentsXml(xml) {
130
- return xml.replace(/\btext="[^"]*"/g, 'text="x"');
138
+ return xml.replace(/\btext=("[^"]*"|'[^']*')/g, 'text="x"');
131
139
  }
132
140
 
141
+ // xl/persons/person.xml — author registry for threaded comments.
142
+ // <person displayName="Alice" id="..." userId="alice@co.com" providerId="AzureAD"/>
143
+ // Strip the three identifying attributes; leave id (a UUID) so threaded comment
144
+ // authorId references still resolve.
145
+ function redactPersonsXml(xml) {
146
+ return xml
147
+ .replace(/\bdisplayName="[^"]*"/g, 'displayName="x"')
148
+ .replace(/\buserId="[^"]*"/g, 'userId="x"')
149
+ .replace(/\bproviderId="[^"]*"/g, 'providerId="x"');
150
+ }
151
+
152
+ // docProps/core.xml — strip author, title, subject, description, keywords,
153
+ // category, lastModifiedBy, and any other user-text elements.
154
+ // The timestamp elements (dcterms:created / dcterms:modified) and structural
155
+ // elements (the xmlns declarations, DocSecurity, etc.) are left alone because
156
+ // they're non-identifying metadata needed for round-trip fidelity.
157
+ //
158
+ // Elements scrubbed:
159
+ // dc:creator → the file's original author name
160
+ // dc:title → document title set by author
161
+ // dc:subject → subject field
162
+ // dc:description → description field
163
+ // cp:keywords → keyword tags
164
+ // cp:category → category field
165
+ // cp:lastModifiedBy → last editor's name
166
+ // cp:contentStatus → rarely set, but can contain user text
167
+ const CORE_SCRUB_TAGS = [
168
+ 'dc:creator',
169
+ 'dc:title',
170
+ 'dc:subject',
171
+ 'dc:description',
172
+ 'cp:keywords',
173
+ 'cp:category',
174
+ 'cp:lastModifiedBy',
175
+ 'cp:contentStatus',
176
+ ];
177
+
178
+ function redactCoreXml(xml) {
179
+ let out = xml;
180
+ for (const tag of CORE_SCRUB_TAGS) {
181
+ // Replace inner content: <dc:creator>...</dc:creator> → <dc:creator></dc:creator>
182
+ // Handles attributes on the opening tag and multi-line content.
183
+ out = out.replace(
184
+ new RegExp(`(<${tag}\\b[^>]*>)[\\s\\S]*?(<\\/${tag}>)`, 'g'),
185
+ '$1$2'
186
+ );
187
+ }
188
+ return out;
189
+ }
190
+
191
+ // docProps/app.xml — strip Company, Manager, and HyperlinkBase which can
192
+ // contain user-identifying strings. The Application, AppVersion, DocSecurity,
193
+ // HeadingPairs, and TitlesOfParts (sheet names) fields are structural and left
194
+ // alone — sheet names are part of workbook structure, not cell values.
195
+ const APP_SCRUB_TAGS = [
196
+ 'Company',
197
+ 'Manager',
198
+ 'HyperlinkBase',
199
+ ];
200
+
201
+ function redactAppXml(xml) {
202
+ let out = xml;
203
+ for (const tag of APP_SCRUB_TAGS) {
204
+ out = out.replace(
205
+ new RegExp(`(<${tag}\\b[^>]*>)[\\s\\S]*?(<\\/${tag}>)`, 'g'),
206
+ '$1$2'
207
+ );
208
+ }
209
+ return out;
210
+ }
211
+
212
+ // docProps/custom.xml — custom properties are arbitrary user-defined key/value
213
+ // pairs. Strip the value payloads; keep the property names so the file remains
214
+ // structurally valid.
215
+ function redactCustomPropsXml(xml) {
216
+ // Custom property values live inside <vt:*> typed-value elements.
217
+ // Replace their inner text with empty string (preserves type nodes).
218
+ //
219
+ // The character class includes digits so OOXML numeric type names
220
+ // (vt:r4, vt:r8, vt:i1/i2/i4/i8, vt:ui1/ui2/ui4/ui8, vt:filetime) match.
221
+ // The \2 backreference forces the open and close tag names to match,
222
+ // so a payload that contains nested elements (e.g.
223
+ // <vt:variant><vt:lpwstr>X</vt:lpwstr></vt:variant>) doesn't produce
224
+ // mangled output. The inner text class [^<] keeps the match strictly
225
+ // text-only; the outer wrapper is left structurally intact and any
226
+ // nested vt:* elements get scrubbed by the same regex on overlapping
227
+ // passes.
228
+ return xml.replace(/(<(vt:[a-zA-Z0-9]+)\b[^>]*>)[^<]*(<\/\2>)/g, '$1$3');
229
+ }
230
+
231
+ // 1×1 transparent PNG — minimum valid PNG bytes. Used as a safe placeholder
232
+ // when stripping xl/media/ binary blobs so the ZIP remains structurally valid
233
+ // and drawing relationships don't point to missing entries.
234
+ // (67 bytes: PNG sig + IHDR + IDAT with one transparent pixel + IEND)
235
+ const TRANSPARENT_1X1_PNG = Buffer.from(
236
+ '89504e470d0a1a0a' + // PNG signature
237
+ '0000000d49484452' + // IHDR length + type
238
+ '00000001' + // width = 1
239
+ '00000001' + // height = 1
240
+ '08060000' + // 8-bit RGBA
241
+ '001f15c4' + // IHDR CRC
242
+ '89' + // IHDR chunk footer padding
243
+ '0000000a49444154' + // IDAT length + type
244
+ '789c6260' + // zlib header + deflate block
245
+ '0000000200' + // deflate end
246
+ '01e221bc33' + // IDAT CRC
247
+ '0000000049454e44' + // IEND length + type
248
+ 'ae426082', // IEND CRC
249
+ 'hex',
250
+ );
251
+
133
252
  async function exportRedactedWorkbook(inputPath, outputPath) {
134
253
  if (!fs.existsSync(inputPath)) {
135
254
  throw new Error(`File not found: ${inputPath}`);
@@ -160,6 +279,23 @@ async function exportRedactedWorkbook(inputPath, outputPath) {
160
279
  } else if (/^xl\/threadedComments\/threadedComment\d+\.xml$/i.test(name)) {
161
280
  const xml = await file.async('string');
162
281
  zip.file(name, redactThreadedCommentsXml(xml));
282
+ } else if (/^docProps\/core\.xml$/i.test(name)) {
283
+ const xml = await file.async('string');
284
+ zip.file(name, redactCoreXml(xml));
285
+ } else if (/^docProps\/app\.xml$/i.test(name)) {
286
+ const xml = await file.async('string');
287
+ zip.file(name, redactAppXml(xml));
288
+ } else if (/^docProps\/custom\.xml$/i.test(name)) {
289
+ const xml = await file.async('string');
290
+ zip.file(name, redactCustomPropsXml(xml));
291
+ } else if (/^xl\/persons\/person\.xml$/i.test(name)) {
292
+ const xml = await file.async('string');
293
+ zip.file(name, redactPersonsXml(xml));
294
+ } else if (/^xl\/media\//i.test(name)) {
295
+ // Embedded images / media — replace with a 1×1 transparent PNG so
296
+ // drawing relationships remain valid and the ZIP is structurally intact,
297
+ // but no user-supplied binary data survives in the output.
298
+ zip.file(name, TRANSPARENT_1X1_PNG);
163
299
  }
164
300
  // All other parts pass through untouched.
165
301
  }
@@ -180,4 +316,11 @@ module.exports = {
180
316
  // exported for unit testing
181
317
  _redactSheetXml: redactSheetXml,
182
318
  _redactSharedStringsXml: redactSharedStringsXml,
319
+ _redactCommentsXml: redactCommentsXml,
320
+ _redactThreadedCommentsXml: redactThreadedCommentsXml,
321
+ _redactCoreXml: redactCoreXml,
322
+ _redactAppXml: redactAppXml,
323
+ _redactCustomPropsXml: redactCustomPropsXml,
324
+ _redactPersonsXml: redactPersonsXml,
325
+ _TRANSPARENT_1X1_PNG: TRANSPARENT_1X1_PNG,
183
326
  };
@@ -0,0 +1,115 @@
1
+ 'use strict';
2
+
3
+ /**
4
+ * Persistent user-level telemetry config at ~/.xlsx-for-ai/config.json.
5
+ *
6
+ * Stored outside node_modules so consent survives `npm install -g xlsx-for-ai@latest`
7
+ * upgrades. Path is resolved via os.homedir() for cross-platform support.
8
+ *
9
+ * Config shape:
10
+ * { "telemetry": true, "consented_at": "ISO-string", "consent_version": 1 }
11
+ *
12
+ * consent_version: bump CURRENT_CONSENT_VERSION when the telemetry shape changes.
13
+ * If the file's version is older, telemetry is PAUSED until the user re-runs
14
+ * --enable-telemetry. Never silently expand data shape under old consent.
15
+ */
16
+
17
+ const fs = require('fs');
18
+ const path = require('path');
19
+ const os = require('os');
20
+
21
+ const CURRENT_CONSENT_VERSION = 1;
22
+
23
+ /**
24
+ * Return the directory that holds the config file. Uses XFA_CONFIG_DIR env var for test
25
+ * isolation; otherwise defaults to ~/.xlsx-for-ai (config.json lives inside it).
26
+ */
27
+ function configDir() {
28
+ return process.env.XFA_CONFIG_DIR || path.join(os.homedir(), '.xlsx-for-ai');
29
+ }
30
+
31
+ function configPath() {
32
+ return path.join(configDir(), 'config.json');
33
+ }
34
+
35
+ /**
36
+ * Read config from disk. Returns null if file doesn't exist or is unreadable.
37
+ */
38
+ function readConfig() {
39
+ try {
40
+ const raw = fs.readFileSync(configPath(), 'utf8');
41
+ return JSON.parse(raw);
42
+ } catch (_) {
43
+ return null;
44
+ }
45
+ }
46
+
47
+ /**
48
+ * Write config to disk (direct overwrite via writeFileSync, not an atomic rename). Creates the directory if needed.
49
+ */
50
+ function writeConfig(data) {
51
+ const dir = configDir();
52
+ fs.mkdirSync(dir, { recursive: true });
53
+ fs.writeFileSync(configPath(), JSON.stringify(data, null, 2) + '\n', 'utf8');
54
+ }
55
+
56
+ /**
57
+ * Telemetry status as one of:
58
+ * 'enabled' - opt-in, consent_version matches
59
+ * 'disabled' - explicitly opted out
60
+ * 'not configured' - no config file yet
61
+ * 'paused (consent_version mismatch)' - opted in but consent_version is stale
62
+ */
63
+ function telemetryStatus() {
64
+ const cfg = readConfig();
65
+ if (!cfg) return 'not configured';
66
+ if (cfg.telemetry === false) return 'disabled';
67
+ if (cfg.telemetry === true) {
68
+ if (cfg.consent_version !== CURRENT_CONSENT_VERSION) {
69
+ return 'paused (consent_version mismatch)';
70
+ }
71
+ return 'enabled';
72
+ }
73
+ return 'not configured';
74
+ }
75
+
76
+ /**
77
+ * Returns true only if telemetry is fully active (opted in AND version matches).
78
+ */
79
+ function isTelemetryActive() {
80
+ return telemetryStatus() === 'enabled';
81
+ }
82
+
83
+ /**
84
+ * Enable telemetry — write consent with current version.
85
+ * Idempotent.
86
+ */
87
+ function enableTelemetry() {
88
+ const existing = readConfig() || {};
89
+ writeConfig({
90
+ ...existing,
91
+ telemetry: true,
92
+ consented_at: new Date().toISOString(),
93
+ consent_version: CURRENT_CONSENT_VERSION,
94
+ });
95
+ }
96
+
97
+ /**
98
+ * Disable telemetry — write explicit false (keeps the file so we can distinguish
99
+ * "user said no" from "never asked").
100
+ */
101
+ function disableTelemetry() {
102
+ const existing = readConfig() || {};
103
+ writeConfig({ ...existing, telemetry: false });
104
+ }
105
+
106
+ module.exports = {
107
+ CURRENT_CONSENT_VERSION,
108
+ configPath,
109
+ readConfig,
110
+ writeConfig,
111
+ telemetryStatus,
112
+ isTelemetryActive,
113
+ enableTelemetry,
114
+ disableTelemetry,
115
+ };
@@ -0,0 +1,138 @@
1
+ 'use strict';
2
+
3
+ /**
4
+ * Process-level crash telemetry hooks for xlsx-for-ai.
5
+ *
6
+ * Registers uncaughtException + unhandledRejection handlers only when the user
7
+ * has opted in (isTelemetryActive() === true). On crash, sends a minimal,
8
+ * sanitized payload, then prints the original stack trace to stderr and exits
9
+ * with code 1, so the user still sees the error and gets a non-zero exit code.
10
+ *
11
+ * Endpoint: XLSX_FOR_AI_TELEMETRY_ENDPOINT env var if set, else default below.
12
+ * // Endpoint deployment tracked separately — see project memory
13
+ * // project_xlsx_for_ai_telemetry_endpoint.md (TBD).
14
+ */
15
+
16
+ const https = require('https');
17
+ const http = require('http');
18
+ const { URL } = require('url');
19
+
20
+ const { isTelemetryActive, telemetryStatus } = require('./telemetry-config');
21
+ const { buildPayload } = require('./telemetry-sanitize');
22
+
23
+ const SEND_TIMEOUT_MS = 2000; // Milliseconds sendPayload waits before abandoning a send.
24
+
25
+ // Endpoint deployment tracked separately — see project memory
26
+ // project_xlsx_for_ai_telemetry_endpoint.md (TBD).
27
+ const DEFAULT_ENDPOINT = 'https://telemetry.xlsx-for-ai.dev/v1/crash'; // Fallback when XLSX_FOR_AI_TELEMETRY_ENDPOINT is unset or empty.
28
+
29
/**
 * Pick the URL crash reports are posted to.
 *
 * @returns {string} The XLSX_FOR_AI_TELEMETRY_ENDPOINT environment variable
 *   when it is set to a non-empty value, otherwise DEFAULT_ENDPOINT.
 */
function resolveEndpoint() {
  const override = process.env.XLSX_FOR_AI_TELEMETRY_ENDPOINT;
  if (override) {
    return override;
  }
  return DEFAULT_ENDPOINT;
}
32
+
33
/**
 * POST a crash payload to the telemetry endpoint on a best-effort basis.
 *
 * The returned Promise ALWAYS resolves (with undefined) and never rejects:
 *   - when the response has been fully received, whatever its status code
 *   - when the request or the response errors or closes early
 *   - when the payload cannot be serialized or the endpoint URL is invalid
 *   - when SEND_TIMEOUT_MS elapses, covering connect, upload AND response body
 *
 * Failures are silent by design: a hung or broken send must never block exit
 * or mask the crash being reported.
 *
 * @param {object} payload - JSON-serializable crash payload.
 * @returns {Promise<void>}
 */
function sendPayload(payload) {
  return new Promise((resolve) => {
    let req = null;
    let timer = null;
    let settled = false;

    // Single exit point: disarm the timer and resolve exactly once.
    const finish = () => {
      if (settled) return;
      settled = true;
      if (timer !== null) clearTimeout(timer);
      resolve();
    };

    // Timeout path: tear the request down so the socket cannot keep the
    // process alive, then resolve.
    const abort = () => {
      if (req !== null) {
        try {
          req.destroy();
        } catch (_) {
          // Already torn down; nothing left to release.
        }
      }
      finish();
    };

    try {
      // Everything that can throw synchronously (serialization, URL parsing,
      // request construction) lives inside this try so that the executor
      // never turns a telemetry problem into a rejected Promise.
      const body = JSON.stringify(payload);
      const parsed = new URL(resolveEndpoint());
      const isHttp = parsed.protocol === 'http:';
      const transport = isHttp ? http : https;
      const options = {
        // WHATWG URL keeps the brackets around IPv6 literals; the http
        // modules expect the bare address.
        hostname: parsed.hostname.replace(/^\[|\]$/g, ''),
        port: parsed.port || (isHttp ? 80 : 443),
        path: `${parsed.pathname}${parsed.search}`,
        method: 'POST',
        headers: {
          'Content-Type': 'application/json',
          'Content-Length': Buffer.byteLength(body),
        },
      };

      // The timer stays armed until the response has been fully consumed, so
      // a server that sends headers and then stalls cannot hang the caller.
      timer = setTimeout(abort, SEND_TIMEOUT_MS);

      req = transport.request(options, (res) => {
        // Drain the response body to free the socket.
        res.resume();
        res.on('end', finish);
        res.on('error', finish);
        res.on('close', finish);
      });
      req.on('error', finish);
      req.end(body);
    } catch (_) {
      abort();
    }
  });
}
88
+
89
/**
 * Register process-level crash handlers if telemetry is active.
 * Call once at startup.
 *
 * - Status 'enabled': installs 'uncaughtException' and 'unhandledRejection'
 *   listeners. On a crash they send one sanitized payload, print the original
 *   stack to stderr and exit with code 1.
 * - Status 'paused (consent_version mismatch)': prints a one-line notice to
 *   stderr and installs nothing.
 * - Any other status: silent no-op.
 *
 * @param {string} version - Tool version, forwarded to buildPayload.
 */
function registerCrashHooks(version) {
  const status = telemetryStatus();

  if (status === 'paused (consent_version mismatch)') {
    process.stderr.write(
      'xlsx-for-ai: telemetry has been updated. Run `xlsx-for-ai --enable-telemetry`' +
        ' to resume on the new shape, or `--telemetry-status` for details.\n'
    );
    return;
  }

  if (status !== 'enabled') return;

  // Render an error the way the user expects to see it: stack if available,
  // else message, else its string form.
  const describe = (err) => `${(err && (err.stack || err.message)) || String(err)}\n`;

  // Set once the first crash is being reported. Later crashes (including any
  // raised while the send is in flight) are printed but never re-reported.
  let handling = false;

  async function handleCrash(err) {
    if (handling) {
      process.stderr.write(describe(err));
      return;
    }
    handling = true;

    try {
      // Payload construction sits inside the try as well: if it threw outside,
      // this async function would reject, fire 'unhandledRejection' and
      // re-enter this handler in a loop.
      await sendPayload(buildPayload(err, version));
    } catch (_) {
      // Never let telemetry mask the real error.
    }

    // Installing an 'uncaughtException' listener suppresses Node's default
    // "print the stack and exit" behaviour, so reproduce it explicitly rather
    // than re-throwing (which would only re-enter the handlers).
    try {
      process.stderr.write(describe(err));
    } finally {
      process.exit(1);
    }
  }

  process.on('uncaughtException', (err) => {
    handleCrash(err);
  });

  process.on('unhandledRejection', (reason) => {
    handleCrash(reason instanceof Error ? reason : new Error(String(reason)));
  });
}
131
+
132
+ module.exports = { // Public API of this module: hook registration plus the send helper and its settings.
133
+ registerCrashHooks,
134
+ sendPayload,
135
+ resolveEndpoint,
136
+ DEFAULT_ENDPOINT,
137
+ SEND_TIMEOUT_MS,
138
+ };
@@ -0,0 +1,180 @@
1
+ 'use strict';
2
+
3
+ /**
4
+ * Sanitization for crash telemetry payloads.
5
+ *
6
+ * INVARIANTS (non-negotiable):
7
+ * - No file paths: scrub /Users/<x>/..., C:\Users\<x>\..., /home/<x>/...
8
+ * - Cap error_message at 200 chars (after scrubbing)
9
+ * - No cell values, no workbook structure (not available post-crash anyway)
10
+ * - No env vars, no argv beyond a hardcoded allowlist
11
+ * - No machine identifier (no hostname, MAC, install ID)
12
+ *
13
+ * Future maintainers: do NOT enrich this payload. The consent_version gates
14
+ * any shape expansion. Bump CURRENT_CONSENT_VERSION in telemetry-config.js
15
+ * before adding new fields.
16
+ */
17
+
18
+ const os = require('os');
19
+
20
+ const MAX_MESSAGE_LENGTH = 200; // Max characters of error_message kept by sanitizeMessage (applied after scrubbing).
21
+
22
+ // Allowlisted first-arg values for the command field.
23
+ // Everything else becomes '<other>'.
24
+ const ALLOWED_COMMANDS = new Set([ // buildPayload checks process.argv[2] against this set.
25
+ 'xlsx-for-ai',
26
+ 'cursor-reads-xlsx',
27
+ 'write',
28
+ '--json',
29
+ '--md',
30
+ '--stdout',
31
+ '--sql',
32
+ '--schema',
33
+ '--compact',
34
+ '--evaluate',
35
+ '--stream',
36
+ '--list-sheets',
37
+ '--diff',
38
+ '--range',
39
+ '--named-range',
40
+ '--max-rows',
41
+ '--max-cols',
42
+ '--max-tokens',
43
+ '--report-bug',
44
+ '--export-redacted-workbook',
45
+ '--enable-telemetry',
46
+ '--disable-telemetry',
47
+ '--telemetry-status',
48
+ '--help',
49
+ '--version',
50
+ '-h',
51
+ '-v',
52
+ ]);
53
+
54
/**
 * Scrub filesystem paths from a string, replacing each one with '<path>'.
 *
 * Covers:
 *   /Users/<name>/...        (macOS)
 *   /home/<name>/...         (Linux)
 *   C:\Users\<name>\...      (Windows, forward or back slash variants)
 *   /var/folders/...         (macOS temp)
 *   /tmp/...                 (Linux/macOS tmp)
 *   /private/tmp/...         (macOS private tmp — e.g. worktrees)
 *   URL-encoded forms of the above (%2FUsers%2F<name>%2F...)
 *   $HOME and %USERPROFILE% references
 *
 * The raw string is scrubbed first. Percent-escapes that survive are then
 * decoded run by run — so a stray '%' elsewhere in the message cannot disable
 * decoding — and the decoded copy is scrubbed again. The decoded copy is
 * returned only when that second pass actually removed a path; otherwise the
 * caller gets the scrubbed string in its original encoding.
 *
 * @param {*} str - Text to scrub. Non-string values are returned untouched.
 * @returns {*} The scrubbed string, or the input itself if it is not a string.
 */
function scrubPaths(str) {
  if (typeof str !== 'string') return str;

  // Applied in this order; every match is replaced by the '<path>' marker.
  const pathPatterns = [
    // Windows: C:\Users\<name>\... or C:/Users/<name>/...
    /[A-Za-z]:[/\\][Uu]sers[/\\][^/\\:\s]+([/\\][^\s]*)*/g,
    // Unix home dirs: /Users/<name>/... or /home/<name>/...
    /\/(Users|home)\/[^/\s:]+([^\s:])*/g,
    // /tmp/... and /private/tmp/...
    /\/(?:private\/)?tmp\/[^\s:]+/g,
    // /var/folders/...
    /\/var\/folders\/[^\s:]+/g,
    // $HOME/... or ${HOME}/... (env-var style references to home dir)
    /\$\{?HOME\}?\/[^\s]*/gi,
    // %USERPROFILE%\... (Windows env-var style)
    /%USERPROFILE%[/\\][^\s]*/gi,
  ];

  const scrubLiteral = (s) =>
    pathPatterns.reduce((out, pattern) => out.replace(pattern, '<path>'), s);

  // Decode each run of %XX escapes independently. A run that is not valid
  // UTF-8 is left as-is instead of aborting the decode of the whole string.
  const decodeEscapes = (s) =>
    s.replace(/(?:%[0-9A-Fa-f]{2})+/g, (run) => {
      try {
        return decodeURIComponent(run);
      } catch (_) {
        return run;
      }
    });

  const scrubbed = scrubLiteral(str);
  const decoded = decodeEscapes(scrubbed);
  if (decoded === scrubbed) return scrubbed;

  // Prefer the decoded form only if decoding exposed a path to remove;
  // otherwise keep the message in its original encoding.
  const scrubbedDecoded = scrubLiteral(decoded);
  return scrubbedDecoded === decoded ? scrubbed : scrubbedDecoded;
}
134
+
135
/**
 * Produce the error_message value that is safe to transmit.
 *
 * @param {*} message - Raw error message; coerced with String().
 * @returns {string} '' for any falsy input, otherwise the path-scrubbed
 *   message truncated to MAX_MESSAGE_LENGTH characters (truncation happens
 *   after scrubbing).
 */
function sanitizeMessage(message) {
  if (message) {
    const withoutPaths = scrubPaths(String(message));
    return withoutPaths.slice(0, MAX_MESSAGE_LENGTH);
  }
  return '';
}
143
+
144
/**
 * Assemble the crash payload for an error. Only the fields listed in the
 * returned object are ever included.
 *
 * @param {*} err - The thrown value; normally an Error.
 * @param {string} [version] - Tool version; reported as 'unknown' when falsy.
 * @returns {{v: number, ts: string, error_type: string, error_message: string,
 *   command: string, xlsx_for_ai_version: string, node_version: string,
 *   os_arch: string}} Plain payload object.
 */
function buildPayload(err, version) {
  const constructorName = err && err.constructor && err.constructor.name;
  const rawMessage = err && err.message ? err.message : String(err);
  const errorMessage = sanitizeMessage(rawMessage);

  // Report process.argv[2] only when it is allowlisted; anything else
  // (file names, sheet names, ...) collapses to '<other>'.
  let command;
  try {
    const firstArg = process.argv[2];
    command = firstArg && ALLOWED_COMMANDS.has(firstArg) ? firstArg : '<other>';
  } catch (_) {
    command = '<other>';
  }

  const payload = {
    v: 1,
    ts: new Date().toISOString(),
    error_type: constructorName || 'Error',
    error_message: errorMessage,
    command,
    xlsx_for_ai_version: version || 'unknown',
    node_version: process.version,
    os_arch: `${process.platform}-${process.arch}`,
  };
  return payload;
}
173
+
174
+ module.exports = { // Public API of this module: sanitizers, payload builder and their limits.
175
+ scrubPaths,
176
+ sanitizeMessage,
177
+ buildPayload,
178
+ MAX_MESSAGE_LENGTH,
179
+ ALLOWED_COMMANDS,
180
+ };
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "xlsx-for-ai",
3
- "version": "1.5.0",
3
+ "version": "1.5.2",
4
4
  "description": "CLI that converts .xlsx files into rich text or JSON dumps that AI coding agents (Claude, Cursor, Copilot, ChatGPT, etc.) can read — preserving values, formulas, formatting, colors, column widths, frozen panes, named ranges, tables, and more.",
5
5
  "main": "index.js",
6
6
  "bin": {
@@ -17,7 +17,7 @@
17
17
  "LICENSE"
18
18
  ],
19
19
  "scripts": {
20
- "test": "node --test test/round-trip.test.js test/output-matrix.test.js test/unit/*.test.js"
20
+ "test": "node --test test/round-trip.test.js test/output-matrix.test.js test/unit/*.test.js tests/telemetry-sanitize.test.js tests/telemetry-config.test.js tests/telemetry-consent-version.test.js tests/telemetry-flags.test.js tests/redactWorkbook-leak-check.test.js"
21
21
  },
22
22
  "keywords": [
23
23
  "xlsx",
@@ -50,6 +50,7 @@
50
50
  "@formulajs/formulajs": "^4.6.0",
51
51
  "@protobi/exceljs": "^4.4.0-protobi.9",
52
52
  "gpt-tokenizer": "^3.4.0",
53
+ "jszip": "^3.10.1",
53
54
  "papaparse": "^5.5.3",
54
55
  "xlsx": "^0.18.5"
55
56
  },