xlsx-for-ai 1.5.0 → 1.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -94,6 +94,7 @@ npx xlsx-for-ai data.xlsx "Sheet1" --stdout --max-rows 50 --compact
94
94
  | `[sheetName]` | Positional: dump only this sheet |
95
95
  | `--range A1:D50` | Dump only this rectangular range |
96
96
  | `--named-range NAME` | Dump only the cells covered by a workbook-defined name |
97
+ | `--region` | Auto-detect the dominant contiguous data block (Excel "current region" / Ctrl+Shift+*). Picks the largest region by populated-cell count when multiple disjoint blocks exist. Compatible with `--max-rows` / `--max-cols`. |
97
98
  | `--max-rows N` | Cap at the first N rows per sheet |
98
99
  | `--max-cols N` | Cap at the first N columns per sheet |
99
100
 
@@ -318,7 +319,22 @@ A future release may apply these dep upgrades via `patch-package` so they travel
318
319
 
319
320
  ## Reporting bugs
320
321
 
321
- **The privacy contract: we never auto-send your data.** xlsx-for-ai has no telemetry endpoint and no consent dialog to maintain — there's nothing to opt out of, because nothing leaves your machine unless you choose to attach it to a GitHub issue.
322
+ **The privacy contract: we never auto-send workbook data.** Anonymous crash telemetry is opt-in via `--enable-telemetry`; even then, we receive only error type, error message (sanitized — paths scrubbed, capped at 200 chars), tool version, Node version, and OS/arch. No paths, no cell values, no identifiers.
323
+
324
+ To enable or manage crash telemetry:
325
+
326
+ ```bash
327
+ # Opt in — prints the exact payload schema so you can see what gets sent
328
+ xlsx-for-ai --enable-telemetry
329
+
330
+ # Opt out
331
+ xlsx-for-ai --disable-telemetry
332
+
333
+ # Check current state and config path
334
+ xlsx-for-ai --telemetry-status
335
+ ```
336
+
337
+ Consent is stored at `~/.xlsx-for-ai/config.json` and persists across `npm install -g xlsx-for-ai@latest` upgrades. If the telemetry shape ever changes, the tool pauses sending and prompts you to re-opt-in — we never silently expand what we collect under old consent.
322
338
 
323
339
  When something breaks on a real workbook, two flags help us reproduce locally without asking you to share the original file:
324
340
 
package/WHY.md CHANGED
@@ -92,3 +92,7 @@ Spreadsheet libraries are designed for developers building software *on top of*
92
92
  `xlsx-for-ai` is the first one built specifically for that. The output is shaped for an LLM's context window — markdown tables when the model just needs to read, structured JSON when it needs to reason, token-aware truncation when the spreadsheet is too big to fit, and a real `.xlsx` writer that produces a file you can hand back to a human along with a built-in note explaining everything that changed.
93
93
 
94
94
  It's a small tool. It just happens to fix the one thing standing between AI assistants and the file format most knowledge work actually lives in.
95
+
96
+ ## Privacy contract
97
+
98
+ We never auto-send workbook data. Anonymous crash telemetry is opt-in via `xlsx-for-ai --enable-telemetry`; even then, we receive only error type, error message (sanitized — paths scrubbed, capped at 200 chars), and tool/Node/OS version — no paths, no cell values, no identifiers. Nothing leaves your machine unless you choose to enable it.
package/index.js CHANGED
@@ -22,10 +22,11 @@ if (!process.env.XLSX_FOR_AI_RESPAWNED) {
22
22
  const path = require('path');
23
23
  const fs = require('fs');
24
24
  // All xlsx-engine access goes through the engine abstraction in lib/engine.js
25
- // — never require the underlying engine directly. To swap engines (fork,
26
- // different library, server-side service), replace lib/engine.js. Nothing
27
- // else changes. Current engine: @protobi/exceljs (drop-in fork of exceljs
28
- // with active maintenance + preservation patches; see ROADMAP for rationale).
25
+ // — lib/engine.js is the ONLY place in lib/ that requires @protobi/exceljs.
26
+ // To swap engines (fork, different library, server-side service), replace
27
+ // lib/engine.js; nothing else changes. Current engine: @protobi/exceljs
28
+ // (drop-in fork of exceljs with active maintenance + preservation patches;
29
+ // see ROADMAP for rationale).
29
30
  const engine = require('./lib/engine');
30
31
 
31
32
  // Lazy-load heavy deps only when their feature is used (keeps cold start fast
@@ -55,12 +56,16 @@ function parseArgs(argv) {
55
56
  diff: null,
56
57
  range: null,
57
58
  namedRange: null,
59
+ region: false,
58
60
  maxRows: null,
59
61
  maxCols: null,
60
62
  maxTokens: null,
61
63
  reportBug: null,
62
64
  exportRedactedWorkbook: null,
63
65
  help: false,
66
+ enableTelemetry: false,
67
+ disableTelemetry: false,
68
+ telemetryStatus: false,
64
69
  };
65
70
  let i = 0;
66
71
  while (i < argv.length) {
@@ -77,11 +82,15 @@ function parseArgs(argv) {
77
82
  else if (arg === '--diff') { opts.diff = argv[++i]; }
78
83
  else if (arg === '--range') { opts.range = argv[++i]; }
79
84
  else if (arg === '--named-range') { opts.namedRange = argv[++i]; }
85
+ else if (arg === '--region') opts.region = true;
80
86
  else if (arg === '--max-rows') { opts.maxRows = parseInt(argv[++i], 10); }
81
87
  else if (arg === '--max-cols') { opts.maxCols = parseInt(argv[++i], 10); }
82
88
  else if (arg === '--max-tokens') { opts.maxTokens = parseInt(argv[++i], 10); }
83
89
  else if (arg === '--report-bug') { opts.reportBug = argv[++i]; }
84
90
  else if (arg === '--export-redacted-workbook'){ opts.exportRedactedWorkbook = argv[++i]; }
91
+ else if (arg === '--enable-telemetry') opts.enableTelemetry = true;
92
+ else if (arg === '--disable-telemetry') opts.disableTelemetry = true;
93
+ else if (arg === '--telemetry-status') opts.telemetryStatus = true;
85
94
  else if (arg === '-h' || arg === '--help') opts.help = true;
86
95
  else opts.positional.push(arg);
87
96
  i++;
@@ -111,6 +120,10 @@ Selection:
111
120
  [sheetName] Positional second arg, dump only this sheet
112
121
  --range A1:D50 Dump only this rectangular range
113
122
  --named-range NM Dump only the cells covered by this defined name
123
+ --region Auto-detect the dominant contiguous data block (Excel
124
+ "current region" semantics); picks the largest region
125
+ by populated-cell count when multiple disjoint blocks
126
+ exist. Compatible with --max-rows / --max-cols.
114
127
  --max-rows N Limit to first N rows per sheet
115
128
  --max-cols N Limit to first N columns per sheet
116
129
 
@@ -141,6 +154,21 @@ Bug reporting (privacy-by-design — no data leaves your machine):
141
154
  structure, styles, named ranges preserved. Optional
142
155
  attachment for hard-to-repro bugs.
143
156
 
157
+ Crash telemetry (opt-in only):
158
+ --enable-telemetry
159
+ Opt in to anonymous crash telemetry. Only error type,
160
+ sanitized error message (paths scrubbed, ≤200 chars),
161
+ tool version, Node version, and OS/arch are sent.
162
+ No paths, no cell values, no identifiers.
163
+ Payload: { v, ts, error_type, error_message, command,
164
+ xlsx_for_ai_version, node_version, os_arch }
165
+ Consent persists at ~/.xlsx-for-ai/config.json across
166
+ upgrades.
167
+ --disable-telemetry
168
+ Opt out. Config file is kept (explicit "no" is recorded).
169
+ --telemetry-status
170
+ Show current state and config path.
171
+
144
172
  Misc:
145
173
  -h, --help Show this help
146
174
 
@@ -150,6 +178,8 @@ Examples:
150
178
  npx xlsx-for-ai data.xlsx --json --max-tokens 8000 --stdout
151
179
  npx xlsx-for-ai data.csv --md --stdout
152
180
  npx xlsx-for-ai data.xlsx --range B2:F100 --stdout
181
+ npx xlsx-for-ai data.xlsx --region --stdout
182
+ npx xlsx-for-ai data.xlsx --region --max-rows 50 --stdout
153
183
  npx xlsx-for-ai data.xlsx --named-range MyTotals --stdout
154
184
  npx xlsx-for-ai data.xlsx --sql --stdout > schema.sql
155
185
  npx xlsx-for-ai old.xlsx --diff new.xlsx --stdout
@@ -325,6 +355,103 @@ function resolveNamedRange(wb, name) {
325
355
  return { sheet: sheetName, range: parseRange(rangeStr) };
326
356
  }
327
357
 
358
+ // ---------------------------------------------------------------------------
359
+ // Region detection — "current region" semantics (Excel Ctrl+Shift+*)
360
+ //
361
+ // Finds the dominant contiguous data block on a worksheet. Algorithm:
362
+ // 1. Scan the sheet to collect all populated cells.
363
+ // 2. Build connected components using 8-neighbor flood fill (cells that
364
+ // share a corner or edge are in the same region).
365
+ // 3. For each component, compute the bounding rectangle and the count of
366
+ // populated cells inside it.
367
+ // 4. Return the bounding box of the component with the most populated cells
368
+ // (ties go to the component found first in row-major scan order).
369
+ //
370
+ // Returns {startRow, startCol, endRow, endCol} (1-indexed), or null if the
371
+ // sheet has no populated cells.
372
+ // ---------------------------------------------------------------------------
373
+
374
/**
 * Find the bounding box of the largest 8-connected group of populated cells.
 *
 * A cell counts as populated when its `value` is neither null/undefined nor
 * the empty string. Components are discovered in row-major order and the
 * first component with the highest populated-cell count wins.
 *
 * @param {object} ws - Worksheet exposing `rowCount`, `columnCount` and
 *   `getRow(r).getCell(c).value` (1-indexed).
 * @returns {{startRow: number, endRow: number, startCol: number, endCol: number}|null}
 *   1-indexed inclusive bounds, or null when the sheet has no populated cells.
 */
function detectRegion(ws) {
  const rowCount = ws.rowCount;
  const colCount = ws.columnCount;
  if (rowCount === 0 || colCount === 0) return null;

  const keyOf = (r, c) => `${r},${c}`;

  // Pass 1: index every populated cell for O(1) membership checks, and keep
  // the row-major list so component discovery order is deterministic.
  const populated = new Set();
  const seeds = [];
  for (let r = 1; r <= rowCount; r++) {
    const row = ws.getRow(r);
    for (let c = 1; c <= colCount; c++) {
      const v = row.getCell(c).value;
      if (v != null && v !== '') {
        populated.add(keyOf(r, c));
        seeds.push([r, c]);
      }
    }
  }
  if (seeds.length === 0) return null;

  // Pass 2: breadth-first flood fill per unvisited seed. The bounding box is
  // accumulated while traversing, so no per-component cell list is retained.
  const visited = new Set();
  let bestCount = -1;
  let bestBox = null;

  for (const [seedR, seedC] of seeds) {
    const seedKey = keyOf(seedR, seedC);
    if (visited.has(seedKey)) continue;
    visited.add(seedKey);

    const box = { startRow: seedR, endRow: seedR, startCol: seedC, endCol: seedC };
    const queue = [[seedR, seedC]];

    // Walk the queue with a cursor instead of Array#shift(): shift() is O(n)
    // per call, which turns the fill quadratic on large regions.
    for (let head = 0; head < queue.length; head++) {
      const [r, c] = queue[head];
      if (r < box.startRow) box.startRow = r;
      if (r > box.endRow) box.endRow = r;
      if (c < box.startCol) box.startCol = c;
      if (c > box.endCol) box.endCol = c;

      for (let dr = -1; dr <= 1; dr++) {
        for (let dc = -1; dc <= 1; dc++) {
          if (dr === 0 && dc === 0) continue;
          const nr = r + dr;
          const nc = c + dc;
          if (nr < 1 || nc < 1) continue;
          const nk = keyOf(nr, nc);
          if (!visited.has(nk) && populated.has(nk)) {
            visited.add(nk);
            queue.push([nr, nc]);
          }
        }
      }
    }

    // Strict '>' keeps the earliest component on ties.
    if (queue.length > bestCount) {
      bestCount = queue.length;
      bestBox = box;
    }
  }

  return bestBox;
}
454
+
328
455
  // ---------------------------------------------------------------------------
329
456
  // Selection bounds — combines --range, --named-range, --max-rows/cols, sheet
330
457
  // dimensions into a single {startRow, startCol, endRow, endCol}.
@@ -336,6 +463,9 @@ function selectionBounds(ws, opts) {
336
463
  bounds = parseRange(opts.range);
337
464
  } else if (opts.namedRangeBounds) {
338
465
  bounds = opts.namedRangeBounds;
466
+ } else if (opts.region) {
467
+ bounds = detectRegion(ws);
468
+ // bounds may be null (empty sheet); handled below by falling back to sheet dimensions.
339
469
  }
340
470
  const startRow = bounds ? bounds.startRow : 1;
341
471
  const startCol = bounds ? bounds.startCol : 1;
@@ -1709,6 +1839,56 @@ async function main() {
1709
1839
 
1710
1840
  if (opts.help) { printHelp(); process.exit(0); }
1711
1841
 
1842
+ // ---------------------------------------------------------------------------
1843
+ // Telemetry management flags — handled before crash hooks are registered.
1844
+ // ---------------------------------------------------------------------------
1845
+ if (opts.enableTelemetry || opts.disableTelemetry || opts.telemetryStatus) {
1846
+ const telCfg = require('./lib/telemetry-config');
1847
+
1848
+ if (opts.enableTelemetry) {
1849
+ telCfg.enableTelemetry();
1850
+ console.log('Crash telemetry enabled.');
1851
+ console.log('');
1852
+ console.log('When a crash occurs, this payload will be sent:');
1853
+ console.log(JSON.stringify({
1854
+ v: 1,
1855
+ ts: '<ISO-timestamp>',
1856
+ error_type: '<e.g. TypeError>',
1857
+ error_message: '<sanitized, ≤200 chars — paths scrubbed>',
1858
+ command: '<first CLI arg from allowlist, or "<other>">',
1859
+ xlsx_for_ai_version: require('./package.json').version,
1860
+ node_version: process.version,
1861
+ os_arch: `${process.platform}-${process.arch}`,
1862
+ }, null, 2));
1863
+ console.log('');
1864
+ console.log('No paths, no cell values, no identifiers. Consent stored at:');
1865
+ console.log(telCfg.configPath());
1866
+ return;
1867
+ }
1868
+
1869
+ if (opts.disableTelemetry) {
1870
+ telCfg.disableTelemetry();
1871
+ console.log('Crash telemetry disabled.');
1872
+ console.log('Config kept at: ' + telCfg.configPath());
1873
+ return;
1874
+ }
1875
+
1876
+ if (opts.telemetryStatus) {
1877
+ const status = telCfg.telemetryStatus();
1878
+ console.log(`Telemetry status: ${status}`);
1879
+ console.log(`Config path: ${telCfg.configPath()}`);
1880
+ return;
1881
+ }
1882
+ }
1883
+
1884
+ // ---------------------------------------------------------------------------
1885
+ // Register process-level crash hooks (no-op if user hasn't opted in).
1886
+ // ---------------------------------------------------------------------------
1887
+ {
1888
+ const { registerCrashHooks } = require('./lib/telemetry-hooks');
1889
+ registerCrashHooks(require('./package.json').version);
1890
+ }
1891
+
1712
1892
  // Bug-report and redacted-workbook modes consume their input via the
1713
1893
  // flag itself, so they bypass the normal positional / loader path.
1714
1894
  if (opts.reportBug) {
@@ -1838,6 +2018,16 @@ async function main() {
1838
2018
 
1839
2019
  const baseName = path.basename(filePath, path.extname(filePath));
1840
2020
 
2021
+ // --region: warn per-sheet if no data block was found (empty sheet).
2022
+ if (opts.region) {
2023
+ for (const ws of sheets) {
2024
+ const r = detectRegion(ws);
2025
+ if (!r) {
2026
+ console.error(`note: --region: no data found in sheet "${ws.name}"; dumping full sheet dimensions.`);
2027
+ }
2028
+ }
2029
+ }
2030
+
1841
2031
  // Pick output formatter.
1842
2032
  const renderText = (ws) => dumpSheet(ws, wb, perSheetOpts);
1843
2033
  const renderMd = (ws) => dumpSheetMarkdown(ws, wb, perSheetOpts);
@@ -1965,4 +2155,7 @@ module.exports = {
1965
2155
  trySimpleEval,
1966
2156
  // budget
1967
2157
  applyTokenBudget,
2158
+ // region detection
2159
+ detectRegion,
2160
+ selectionBounds,
1968
2161
  };
package/lib/bugReport.js CHANGED
@@ -21,7 +21,7 @@ const fs = require('fs');
21
21
  const path = require('path');
22
22
  const os = require('os');
23
23
  const JSZip = require('jszip');
24
- const ExcelJS = require('@protobi/exceljs');
24
+ const engine = require('./engine');
25
25
 
26
26
  const PKG_VERSION = require('../package.json').version;
27
27
 
@@ -117,21 +117,6 @@ function inventoryFeatures(filenames) {
117
117
  return out;
118
118
  }
119
119
 
120
- // Given the workbook.xml, extract the sheet relationship Ids and order
121
- // without reading any user content. We just need names and rIds so we
122
- // can pair them with worksheet parts to compute per-sheet stats.
123
- function listSheetPartNames(zip) {
124
- // Resolve via workbook rels: xl/_rels/workbook.xml.rels.
125
- const out = [];
126
- const relsFile = zip.file('xl/_rels/workbook.xml.rels');
127
- if (!relsFile) return out;
128
- // Sync — we already have the file in memory inside JSZip.
129
- // We use a lightweight regex; structural only, no values inside.
130
- // Each Relationship: <Relationship Id="rId1" Type="..." Target="worksheets/sheet1.xml"/>
131
- // We can't do sync read without loading; caller already loaded.
132
- return out;
133
- }
134
-
135
120
  async function generateBugReport(filePath) {
136
121
  if (!fs.existsSync(filePath)) {
137
122
  throw new Error(`File not found: ${filePath}`);
@@ -167,8 +152,7 @@ async function generateBugReport(filePath) {
167
152
  let exceljsError = null;
168
153
 
169
154
  try {
170
- const wb = new ExcelJS.Workbook();
171
- await wb.xlsx.readFile(filePath);
155
+ const wb = await engine.loadWorkbook(filePath);
172
156
  sheetCount = wb.worksheets.length;
173
157
  for (const ws of wb.worksheets) {
174
158
  const merges = ws.model && ws.model.merges ? ws.model.merges.length : 0;
@@ -130,6 +130,75 @@ function redactThreadedCommentsXml(xml) {
130
130
  return xml.replace(/\btext="[^"]*"/g, 'text="x"');
131
131
  }
132
132
 
133
+ // docProps/core.xml — strip author, title, subject, description, keywords,
134
+ // category, lastModifiedBy, and any other user-text elements.
135
+ // The timestamp elements (dcterms:created / dcterms:modified) and structural
136
+ // elements (the xmlns declarations, DocSecurity, etc.) are left alone because
137
+ // they're non-identifying metadata needed for round-trip fidelity.
138
+ //
139
+ // Elements scrubbed:
140
+ // dc:creator → the file's original author name
141
+ // dc:title → document title set by author
142
+ // dc:subject → subject field
143
+ // dc:description → description field
144
+ // cp:keywords → keyword tags
145
+ // cp:category → category field
146
+ // cp:lastModifiedBy → last editor's name
147
+ // cp:contentStatus → rarely set, but can contain user text
148
+ const CORE_SCRUB_TAGS = [
149
+ 'dc:creator',
150
+ 'dc:title',
151
+ 'dc:subject',
152
+ 'dc:description',
153
+ 'cp:keywords',
154
+ 'cp:category',
155
+ 'cp:lastModifiedBy',
156
+ 'cp:contentStatus',
157
+ ];
158
+
159
/**
 * Blank the inner content of every element named in CORE_SCRUB_TAGS.
 *
 * Opening tags (with any attributes) and closing tags are kept; everything
 * between them, including multi-line content, is removed.
 *
 * @param {string} xml - Contents of docProps/core.xml.
 * @returns {string} The XML with the listed elements emptied.
 */
function redactCoreXml(xml) {
  return CORE_SCRUB_TAGS.reduce((acc, tag) => {
    // e.g. <dc:creator>...</dc:creator> → <dc:creator></dc:creator>
    const pattern = new RegExp(`(<${tag}\\b[^>]*>)[\\s\\S]*?(<\\/${tag}>)`, 'g');
    return acc.replace(pattern, '$1$2');
  }, xml);
}
171
+
172
+ // docProps/app.xml — strip Company, Manager, and HyperlinkBase which can
173
+ // contain user-identifying strings. The Application, AppVersion, DocSecurity,
174
+ // HeadingPairs, and TitlesOfParts (sheet names) fields are structural and left
175
+ // alone — sheet names are part of workbook structure, not cell values.
176
+ const APP_SCRUB_TAGS = [
177
+ 'Company',
178
+ 'Manager',
179
+ 'HyperlinkBase',
180
+ ];
181
+
182
/**
 * Blank the inner content of every element named in APP_SCRUB_TAGS.
 *
 * Opening and closing tags are preserved; only the text between them is
 * dropped. Elements not in the list pass through unchanged.
 *
 * @param {string} xml - Contents of docProps/app.xml.
 * @returns {string} The XML with the listed elements emptied.
 */
function redactAppXml(xml) {
  return APP_SCRUB_TAGS.reduce((acc, tag) => {
    const pattern = new RegExp(`(<${tag}\\b[^>]*>)[\\s\\S]*?(<\\/${tag}>)`, 'g');
    return acc.replace(pattern, '$1$2');
  }, xml);
}
192
+
193
// docProps/custom.xml — custom properties are arbitrary user-defined key/value
// pairs. Strip the value payloads; keep the property names and the typed-value
// element structure.
//
// Only *leaf* <vt:*> elements (text and/or CDATA content, no child elements)
// are emptied. Container types such as <vt:vector> are left in place and their
// leaf children are emptied individually, so the output stays well-formed.
// The type name may contain digits (vt:i4, vt:r8, vt:ui4, ...), and the closing
// tag must match the opening tag.
//
// NOTE(review): an emptied numeric/boolean/date element may not validate
// against its declared type in strict consumers — confirm that is acceptable
// for redacted exports.
function redactCustomPropsXml(xml) {
  const leafTypedValue =
    /<(vt:[A-Za-z0-9]+)\b([^>]*)>(?:[^<]|<!\[CDATA\[[\s\S]*?\]\]>)*<\/\1>/g;
  return xml.replace(leafTypedValue, '<$1$2></$1>');
}
201
+
133
202
  async function exportRedactedWorkbook(inputPath, outputPath) {
134
203
  if (!fs.existsSync(inputPath)) {
135
204
  throw new Error(`File not found: ${inputPath}`);
@@ -160,6 +229,15 @@ async function exportRedactedWorkbook(inputPath, outputPath) {
160
229
  } else if (/^xl\/threadedComments\/threadedComment\d+\.xml$/i.test(name)) {
161
230
  const xml = await file.async('string');
162
231
  zip.file(name, redactThreadedCommentsXml(xml));
232
+ } else if (/^docProps\/core\.xml$/i.test(name)) {
233
+ const xml = await file.async('string');
234
+ zip.file(name, redactCoreXml(xml));
235
+ } else if (/^docProps\/app\.xml$/i.test(name)) {
236
+ const xml = await file.async('string');
237
+ zip.file(name, redactAppXml(xml));
238
+ } else if (/^docProps\/custom\.xml$/i.test(name)) {
239
+ const xml = await file.async('string');
240
+ zip.file(name, redactCustomPropsXml(xml));
163
241
  }
164
242
  // All other parts pass through untouched.
165
243
  }
@@ -180,4 +258,7 @@ module.exports = {
180
258
  // exported for unit testing
181
259
  _redactSheetXml: redactSheetXml,
182
260
  _redactSharedStringsXml: redactSharedStringsXml,
261
+ _redactCoreXml: redactCoreXml,
262
+ _redactAppXml: redactAppXml,
263
+ _redactCustomPropsXml: redactCustomPropsXml,
183
264
  };
@@ -0,0 +1,115 @@
1
+ 'use strict';
2
+
3
+ /**
4
+ * Persistent user-level telemetry config at ~/.xlsx-for-ai/config.json.
5
+ *
6
+ * Stored outside node_modules so consent survives `npm install -g xlsx-for-ai@latest`
7
+ * upgrades. Path is resolved via os.homedir() for cross-platform support.
8
+ *
9
+ * Config shape:
10
+ * { "telemetry": true, "consented_at": "ISO-string", "consent_version": 1 }
11
+ *
12
+ * consent_version: bump CURRENT_CONSENT_VERSION when the telemetry shape changes.
13
+ * If the file's version is older, telemetry is PAUSED until the user re-runs
14
+ * --enable-telemetry. Never silently expand data shape under old consent.
15
+ */
16
+
17
+ const fs = require('fs');
18
+ const path = require('path');
19
+ const os = require('os');
20
+
21
+ const CURRENT_CONSENT_VERSION = 1;
22
+
23
/**
 * Resolve the directory that holds the config file.
 *
 * A non-empty XFA_CONFIG_DIR environment variable takes precedence (used for
 * test isolation); otherwise the directory is `.xlsx-for-ai` under the
 * current user's home directory.
 *
 * @returns {string} Directory path.
 */
function configDir() {
  const override = process.env.XFA_CONFIG_DIR;
  if (override) return override;
  return path.join(os.homedir(), '.xlsx-for-ai');
}
30
+
31
/**
 * Full path of the config file: `config.json` inside configDir().
 *
 * @returns {string} File path.
 */
function configPath() {
  const dir = configDir();
  return path.join(dir, 'config.json');
}
34
+
35
/**
 * Read config from disk.
 *
 * Returns null if the file doesn't exist, is unreadable, is not valid JSON,
 * or parses to anything other than a plain JSON object. The last case
 * matters because callers spread the result into a new config object; a
 * stray string or array would otherwise be copied in as index keys.
 *
 * @returns {object|null} Parsed config object, or null.
 */
function readConfig() {
  try {
    const parsed = JSON.parse(fs.readFileSync(configPath(), 'utf8'));
    if (parsed === null || typeof parsed !== 'object' || Array.isArray(parsed)) {
      return null;
    }
    return parsed;
  } catch (_) {
    // Missing/unreadable/corrupt config is treated as "no config".
    return null;
  }
}
46
+
47
/**
 * Write config to disk atomically. Creates the directory if needed.
 *
 * The JSON is first written to a sibling temp file and then renamed over the
 * target, so a crash or concurrent read never observes a half-written file.
 *
 * @param {object} data - Config object to serialize (pretty-printed JSON
 *   followed by a newline).
 * @throws Propagates any filesystem error after removing the temp file.
 */
function writeConfig(data) {
  const dir = configDir();
  fs.mkdirSync(dir, { recursive: true });

  const target = configPath();
  // Per-process temp name avoids two concurrent writers clobbering each
  // other's partial output.
  const tmp = `${target}.${process.pid}.tmp`;
  try {
    fs.writeFileSync(tmp, JSON.stringify(data, null, 2) + '\n', 'utf8');
    fs.renameSync(tmp, target);
  } catch (err) {
    // Best-effort cleanup; the original error is what the caller needs.
    try { fs.unlinkSync(tmp); } catch (_) { /* temp file may not exist */ }
    throw err;
  }
}
55
+
56
/**
 * Summarize the stored consent as a status string:
 *   'enabled'                            telemetry === true and consent_version is current
 *   'disabled'                           telemetry === false
 *   'paused (consent_version mismatch)'  telemetry === true but consent_version differs
 *   'not configured'                     no readable config, or no boolean telemetry flag
 *
 * @returns {string} One of the four status strings above.
 */
function telemetryStatus() {
  const cfg = readConfig();
  if (!cfg) return 'not configured';

  switch (cfg.telemetry) {
    case false:
      return 'disabled';
    case true:
      return cfg.consent_version === CURRENT_CONSENT_VERSION
        ? 'enabled'
        : 'paused (consent_version mismatch)';
    default:
      return 'not configured';
  }
}
75
+
76
/**
 * Whether telemetry may be sent right now, i.e. telemetryStatus() reports
 * 'enabled'.
 *
 * @returns {boolean}
 */
function isTelemetryActive() {
  const status = telemetryStatus();
  return status === 'enabled';
}
82
+
83
/**
 * Record opt-in consent: merges `telemetry: true`, a fresh `consented_at`
 * timestamp and the current consent version over whatever config already
 * exists, then writes it back.
 */
function enableTelemetry() {
  const previous = readConfig() || {};
  const consent = {
    telemetry: true,
    consented_at: new Date().toISOString(),
    consent_version: CURRENT_CONSENT_VERSION,
  };
  writeConfig(Object.assign({}, previous, consent));
}
96
+
97
/**
 * Record an explicit opt-out: sets `telemetry: false` on the existing config
 * (other keys are kept) and writes it back, so the file persists and "user
 * said no" stays distinguishable from "never asked".
 */
function disableTelemetry() {
  const previous = readConfig() || {};
  const next = Object.assign({}, previous, { telemetry: false });
  writeConfig(next);
}
105
+
106
+ module.exports = {
107
+ CURRENT_CONSENT_VERSION,
108
+ configPath,
109
+ readConfig,
110
+ writeConfig,
111
+ telemetryStatus,
112
+ isTelemetryActive,
113
+ enableTelemetry,
114
+ disableTelemetry,
115
+ };
@@ -0,0 +1,138 @@
1
+ 'use strict';
2
+
3
+ /**
4
+ * Process-level crash telemetry hooks for xlsx-for-ai.
5
+ *
6
+ * Registers uncaughtException + unhandledRejection handlers only when the user
7
+ * has opted in (isTelemetryActive() === true). On crash, sends a minimal,
8
+ * sanitized payload and then re-throws the original error so the user still
9
+ * sees the stack trace and gets a non-zero exit code.
10
+ *
11
+ * Endpoint: XLSX_FOR_AI_TELEMETRY_ENDPOINT env var if set, else default below.
12
+ * // Endpoint deployment tracked separately — see project memory
13
+ * // project_xlsx_for_ai_telemetry_endpoint.md (TBD).
14
+ */
15
+
16
+ const https = require('https');
17
+ const http = require('http');
18
+ const { URL } = require('url');
19
+
20
+ const { isTelemetryActive, telemetryStatus } = require('./telemetry-config');
21
+ const { buildPayload } = require('./telemetry-sanitize');
22
+
23
+ const SEND_TIMEOUT_MS = 2000;
24
+
25
+ // Endpoint deployment tracked separately — see project memory
26
+ // project_xlsx_for_ai_telemetry_endpoint.md (TBD).
27
+ const DEFAULT_ENDPOINT = 'https://telemetry.xlsx-for-ai.dev/v1/crash';
28
+
29
/**
 * Pick the telemetry endpoint URL: a non-empty
 * XLSX_FOR_AI_TELEMETRY_ENDPOINT environment variable wins, otherwise
 * DEFAULT_ENDPOINT.
 *
 * @returns {string} Endpoint URL string (not validated here).
 */
function resolveEndpoint() {
  const override = process.env.XLSX_FOR_AI_TELEMETRY_ENDPOINT;
  return override ? override : DEFAULT_ENDPOINT;
}
32
+
33
/**
 * POST the payload as JSON to the telemetry endpoint.
 *
 * The returned Promise ALWAYS resolves (with undefined) and never rejects:
 *   - when the response has been fully received or its socket closes
 *   - on any request/response error, invalid endpoint URL, or serialization
 *     failure
 *   - after SEND_TIMEOUT_MS at the latest, measured from the start of the
 *     send and covering the response body as well as the headers
 *
 * A hung send must not block process exit.
 *
 * @param {object} payload - JSON-serializable crash payload.
 * @returns {Promise<void>}
 */
function sendPayload(payload) {
  return new Promise((resolve) => {
    let req = null;
    let timer = null;
    let settled = false;

    // Single exit point: resolves once and always releases the timer.
    const finish = () => {
      if (settled) return;
      settled = true;
      if (timer !== null) clearTimeout(timer);
      resolve();
    };

    try {
      const body = JSON.stringify(payload);
      const parsed = new URL(resolveEndpoint());
      const isPlainHttp = parsed.protocol === 'http:';
      const transport = isPlainHttp ? http : https;
      const options = {
        hostname: parsed.hostname,
        port: parsed.port || (isPlainHttp ? 80 : 443),
        path: parsed.pathname + parsed.search,
        method: 'POST',
        headers: {
          'Content-Type': 'application/json',
          'Content-Length': Buffer.byteLength(body),
        },
      };

      // The timer stays armed until the exchange is completely finished, so
      // a server that sends headers and then stalls cannot hang us.
      timer = setTimeout(() => {
        try { if (req) req.destroy(); } catch (_) { /* ignore */ }
        finish();
      }, SEND_TIMEOUT_MS);

      req = transport.request(options, (res) => {
        // Drain response body to free the socket.
        res.resume();
        res.on('end', finish);
        res.on('error', finish);
        res.on('close', finish);
      });
      req.on('error', finish);

      req.write(body);
      req.end();
    } catch (_) {
      // Bad URL, unserializable payload, or a synchronous transport error:
      // telemetry is best-effort, so give up quietly.
      try { if (req) req.destroy(); } catch (__) { /* ignore */ }
      finish();
    }
  });
}
88
+
89
/**
 * Register process-level crash handlers if telemetry is active.
 * Call once at startup.
 *
 * - Status 'paused (consent_version mismatch)': prints a one-line notice to
 *   stderr and registers nothing.
 * - Any status other than 'enabled': no-op.
 * - Status 'enabled': installs 'uncaughtException' and 'unhandledRejection'
 *   listeners that send one sanitized payload, print the original error to
 *   stderr and exit with code 1.
 *
 * @param {string} version - Tool version to embed in the payload.
 */
function registerCrashHooks(version) {
  const status = telemetryStatus();

  if (status === 'paused (consent_version mismatch)') {
    process.stderr.write(
      'xlsx-for-ai: telemetry has been updated. Run `xlsx-for-ai --enable-telemetry`' +
      ' to resume on the new shape, or `--telemetry-status` for details.\n'
    );
    return;
  }

  if (status !== 'enabled') return;

  // Set once the first crash is being reported; later crashes that arrive
  // while the send is in flight are ignored so only one payload goes out and
  // the first error is the one shown to the user.
  let reporting = false;

  async function handleCrash(err) {
    if (reporting) return;
    reporting = true;
    try {
      // Payload construction is inside the try as well: a failure here must
      // not surface as a new unhandled rejection that re-enters this handler.
      await sendPayload(buildPayload(err, version));
    } catch (_) {
      // Never let telemetry mask the real error.
    }
    // We exit explicitly rather than re-throwing: throwing again from an
    // uncaughtException handler would invoke the handler again.
    try {
      process.stderr.write((err && (err.stack || err.message)) ? (err.stack || err.message) + '\n' : String(err) + '\n');
    } finally {
      process.exit(1);
    }
  }

  process.on('uncaughtException', (err) => {
    // Fire-and-forget by design; handleCrash handles its own failures.
    void handleCrash(err);
  });

  process.on('unhandledRejection', (reason) => {
    void handleCrash(reason instanceof Error ? reason : new Error(String(reason)));
  });
}
131
+
132
+ module.exports = {
133
+ registerCrashHooks,
134
+ sendPayload,
135
+ resolveEndpoint,
136
+ DEFAULT_ENDPOINT,
137
+ SEND_TIMEOUT_MS,
138
+ };
@@ -0,0 +1,180 @@
1
+ 'use strict';
2
+
3
+ /**
4
+ * Sanitization for crash telemetry payloads.
5
+ *
6
+ * INVARIANTS (non-negotiable):
7
+ * - No file paths: scrub /Users/<x>/..., C:\Users\<x>\..., /home/<x>/...
8
+ * - Cap error_message at 200 chars (after scrubbing)
9
+ * - No cell values, no workbook structure (not available post-crash anyway)
10
+ * - No env vars, no argv beyond a hardcoded allowlist
11
+ * - No machine identifier (no hostname, MAC, install ID)
12
+ *
13
+ * Future maintainers: do NOT enrich this payload. The consent_version gates
14
+ * any shape expansion. Bump CURRENT_CONSENT_VERSION in telemetry-config.js
15
+ * before adding new fields.
16
+ */
17
+
18
+ const os = require('os');
19
+
20
+ const MAX_MESSAGE_LENGTH = 200;
21
+
22
+ // Allowlisted first-arg values for the command field.
23
+ // Everything else becomes '<other>'.
24
+ const ALLOWED_COMMANDS = new Set([
25
+ 'xlsx-for-ai',
26
+ 'cursor-reads-xlsx',
27
+ 'write',
28
+ '--json',
29
+ '--md',
30
+ '--stdout',
31
+ '--sql',
32
+ '--schema',
33
+ '--compact',
34
+ '--evaluate',
35
+ '--stream',
36
+ '--list-sheets',
37
+ '--diff',
38
+ '--range',
39
+ '--named-range',
40
+ '--max-rows',
41
+ '--max-cols',
42
+ '--max-tokens',
43
+ '--report-bug',
44
+ '--export-redacted-workbook',
45
+ '--enable-telemetry',
46
+ '--disable-telemetry',
47
+ '--telemetry-status',
48
+ '--help',
49
+ '--version',
50
+ '-h',
51
+ '-v',
52
+ ]);
53
+
54
/**
 * Scrub filesystem paths from a string, replacing each with '<path>'.
 *
 * Covers:
 *   /Users/<name>/...       (macOS)
 *   /home/<name>/...        (Linux)
 *   C:\Users\<name>\...     (Windows, forward or back slash variants)
 *   /var/folders/...        (macOS temp)
 *   /tmp/...                (Linux/macOS tmp)
 *   /private/tmp/...        (macOS private tmp)
 *   URL-encoded forms of the above (%2FUsers%2F<name>%2F...)
 *   $HOME/..., ${HOME}/... and %USERPROFILE%\... references
 *
 * Two passes are run — one on the raw string and one on a URL-decoded copy —
 * and the shorter (more aggressively scrubbed) result is returned. Decoding
 * is lenient: if the string as a whole is not valid percent-encoding (e.g.
 * it contains a stray '%'), each run of %XX escapes is decoded on its own so
 * an encoded path elsewhere in the message is still found.
 *
 * Known limitation: matching stops at whitespace, so the tail of a path that
 * contains spaces is not removed.
 *
 * @param {*} str - Value to scrub; non-strings are returned unchanged.
 * @returns {*} Scrubbed string, or the input itself when it is not a string.
 */
function scrubPaths(str) {
  if (typeof str !== 'string') return str;

  // Decode percent-escapes without letting one malformed escape disable
  // decoding for the whole message.
  function decodeLeniently(s) {
    try {
      return decodeURIComponent(s);
    } catch (_) {
      return s.replace(/(?:%[0-9A-Fa-f]{2})+/g, (run) => {
        try {
          return decodeURIComponent(run);
        } catch (__) {
          return run; // not valid UTF-8 — leave this run as-is
        }
      });
    }
  }

  function scrubLiteral(s) {
    return s
      // Windows: C:\Users\<name>\... or C:/Users/<name>/...
      .replace(/[A-Za-z]:[/\\][Uu]sers[/\\][^/\\:\s]+([/\\][^\s]*)*/g, '<path>')
      // Unix home dirs: /Users/<name>/... or /home/<name>/...
      .replace(/\/(Users|home)\/[^/\s:]+([^\s:])*/g, '<path>')
      // /tmp/... and /private/tmp/...
      .replace(/\/(?:private\/)?tmp\/[^\s:]+/g, '<path>')
      // /var/folders/...
      .replace(/\/var\/folders\/[^\s:]+/g, '<path>')
      // $HOME/... or ${HOME}/... (env-var style references to home dir)
      .replace(/\$\{?HOME\}?\/[^\s]*/gi, '<path>')
      // %USERPROFILE%\... (Windows env-var style)
      .replace(/%USERPROFILE%[/\\][^\s]*/gi, '<path>');
  }

  const scrubbed = scrubLiteral(str);
  const scrubbedDecoded = scrubLiteral(decodeLeniently(str));

  // Prefer the decoded pass when it removed more (i.e. a URL-encoded path
  // was hiding in the raw string).
  return scrubbedDecoded.length < scrubbed.length ? scrubbedDecoded : scrubbed;
}
134
+
135
/**
 * Prepare an error message for the payload: coerce to string, scrub paths,
 * then truncate to MAX_MESSAGE_LENGTH characters. Falsy input yields ''.
 *
 * @param {*} message - Raw error message.
 * @returns {string} Sanitized, length-capped message.
 */
function sanitizeMessage(message) {
  return message
    ? scrubPaths(String(message)).slice(0, MAX_MESSAGE_LENGTH)
    : '';
}
143
+
144
/**
 * Assemble the crash payload for an error.
 *
 * Fields: schema version `v`, ISO timestamp `ts`, the error's constructor
 * name (falling back to 'Error'), the sanitized message, the first CLI
 * argument if it is in ALLOWED_COMMANDS (otherwise '<other>'), the tool
 * version (or 'unknown'), the Node version and `<platform>-<arch>`.
 *
 * @param {*} err - The thrown value, normally an Error.
 * @param {string} [version] - Tool version string.
 * @returns {object} Plain object containing only the fields listed above.
 */
function buildPayload(err, version) {
  const ctorName = err && err.constructor && err.constructor.name;
  const rawMessage = err && err.message ? err.message : String(err);
  const errorMessage = sanitizeMessage(rawMessage);

  // Only an allowlisted first argument is ever reported.
  let command = '<other>';
  try {
    const candidate = process.argv[2];
    if (candidate && ALLOWED_COMMANDS.has(candidate)) command = candidate;
  } catch (_) { /* ignore */ }

  return {
    v: 1,
    ts: new Date().toISOString(),
    error_type: ctorName || 'Error',
    error_message: errorMessage,
    command,
    xlsx_for_ai_version: version || 'unknown',
    node_version: process.version,
    os_arch: `${process.platform}-${process.arch}`,
  };
}
173
+
174
+ module.exports = {
175
+ scrubPaths,
176
+ sanitizeMessage,
177
+ buildPayload,
178
+ MAX_MESSAGE_LENGTH,
179
+ ALLOWED_COMMANDS,
180
+ };
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "xlsx-for-ai",
3
- "version": "1.5.0",
3
+ "version": "1.5.1",
4
4
  "description": "CLI that converts .xlsx files into rich text or JSON dumps that AI coding agents (Claude, Cursor, Copilot, ChatGPT, etc.) can read — preserving values, formulas, formatting, colors, column widths, frozen panes, named ranges, tables, and more.",
5
5
  "main": "index.js",
6
6
  "bin": {
@@ -17,7 +17,7 @@
17
17
  "LICENSE"
18
18
  ],
19
19
  "scripts": {
20
- "test": "node --test test/round-trip.test.js test/output-matrix.test.js test/unit/*.test.js"
20
+ "test": "node --test test/round-trip.test.js test/output-matrix.test.js test/unit/*.test.js tests/telemetry-sanitize.test.js tests/telemetry-config.test.js tests/telemetry-consent-version.test.js tests/telemetry-flags.test.js"
21
21
  },
22
22
  "keywords": [
23
23
  "xlsx",
@@ -50,6 +50,7 @@
50
50
  "@formulajs/formulajs": "^4.6.0",
51
51
  "@protobi/exceljs": "^4.4.0-protobi.9",
52
52
  "gpt-tokenizer": "^3.4.0",
53
+ "jszip": "^3.10.1",
53
54
  "papaparse": "^5.5.3",
54
55
  "xlsx": "^0.18.5"
55
56
  },