xlsx-for-ai 1.5.0 → 1.5.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +17 -1
- package/WHY.md +4 -0
- package/index.js +209 -4
- package/lib/bugReport.js +2 -18
- package/lib/engine.js +1 -1
- package/lib/redactWorkbook.js +147 -4
- package/lib/telemetry-config.js +115 -0
- package/lib/telemetry-hooks.js +138 -0
- package/lib/telemetry-sanitize.js +180 -0
- package/package.json +3 -2
package/README.md
CHANGED
|
@@ -94,6 +94,7 @@ npx xlsx-for-ai data.xlsx "Sheet1" --stdout --max-rows 50 --compact
|
|
|
94
94
|
| `[sheetName]` | Positional: dump only this sheet |
|
|
95
95
|
| `--range A1:D50` | Dump only this rectangular range |
|
|
96
96
|
| `--named-range NAME` | Dump only the cells covered by a workbook-defined name |
|
|
97
|
+
| `--region` | Auto-detect the dominant contiguous data block (Excel "current region" / Ctrl+Shift+*). Picks the largest region by populated-cell count when multiple disjoint blocks exist. Compatible with `--max-rows` / `--max-cols`. |
|
|
97
98
|
| `--max-rows N` | Cap at the first N rows per sheet |
|
|
98
99
|
| `--max-cols N` | Cap at the first N columns per sheet |
|
|
99
100
|
|
|
@@ -318,7 +319,22 @@ A future release may apply these dep upgrades via `patch-package` so they travel
|
|
|
318
319
|
|
|
319
320
|
## Reporting bugs
|
|
320
321
|
|
|
321
|
-
**The privacy contract: we never auto-send
|
|
322
|
+
**The privacy contract: we never auto-send workbook data.** Anonymous crash telemetry is opt-in via `--enable-telemetry`; even then, we receive only a payload schema version, a timestamp, error type, error message (sanitized — paths scrubbed, capped at 200 chars), the invoked command (from a fixed allowlist, otherwise `<other>`), tool version, Node version, and OS/arch. No paths, no cell values, no identifiers.
|
|
323
|
+
|
|
324
|
+
To enable or manage crash telemetry:
|
|
325
|
+
|
|
326
|
+
```bash
|
|
327
|
+
# Opt in — prints the exact payload schema so you can see what gets sent
|
|
328
|
+
xlsx-for-ai --enable-telemetry
|
|
329
|
+
|
|
330
|
+
# Opt out
|
|
331
|
+
xlsx-for-ai --disable-telemetry
|
|
332
|
+
|
|
333
|
+
# Check current state and config path
|
|
334
|
+
xlsx-for-ai --telemetry-status
|
|
335
|
+
```
|
|
336
|
+
|
|
337
|
+
Consent is stored at `~/.xlsx-for-ai/config.json` and persists across `npm install -g xlsx-for-ai@latest` upgrades. If the telemetry shape ever changes, the tool pauses sending and prompts you to re-opt-in — we never silently expand what we collect under old consent.
|
|
322
338
|
|
|
323
339
|
When something breaks on a real workbook, two flags help us reproduce locally without asking you to share the original file:
|
|
324
340
|
|
package/WHY.md
CHANGED
|
@@ -92,3 +92,7 @@ Spreadsheet libraries are designed for developers building software *on top of*
|
|
|
92
92
|
`xlsx-for-ai` is the first one built specifically for that. The output is shaped for an LLM's context window — markdown tables when the model just needs to read, structured JSON when it needs to reason, token-aware truncation when the spreadsheet is too big to fit, and a real `.xlsx` writer that produces a file you can hand back to a human along with a built-in note explaining everything that changed.
|
|
93
93
|
|
|
94
94
|
It's a small tool. It just happens to fix the one thing standing between AI assistants and the file format most knowledge work actually lives in.
|
|
95
|
+
|
|
96
|
+
## Privacy contract
|
|
97
|
+
|
|
98
|
+
We never auto-send workbook data. Anonymous crash telemetry is opt-in via `xlsx-for-ai --enable-telemetry`; even then, we receive only a payload schema version, a timestamp, error type, error message (sanitized — paths scrubbed, capped at 200 chars), the invoked command (from a fixed allowlist, otherwise `<other>`), and tool/Node/OS version — no paths, no cell values, no identifiers. Nothing leaves your machine unless you choose to enable it.
|
package/index.js
CHANGED
|
@@ -22,10 +22,11 @@ if (!process.env.XLSX_FOR_AI_RESPAWNED) {
|
|
|
22
22
|
const path = require('path');
|
|
23
23
|
const fs = require('fs');
|
|
24
24
|
// All xlsx-engine access goes through the engine abstraction in lib/engine.js
|
|
25
|
-
// —
|
|
26
|
-
// different library, server-side service), replace
|
|
27
|
-
// else changes. Current engine: @protobi/exceljs
|
|
28
|
-
// with active maintenance + preservation patches;
|
|
25
|
+
// — lib/engine.js is the ONLY place in lib/ that requires @protobi/exceljs.
|
|
26
|
+
// To swap engines (fork, different library, server-side service), replace
|
|
27
|
+
// lib/engine.js; nothing else changes. Current engine: @protobi/exceljs
|
|
28
|
+
// (drop-in fork of exceljs with active maintenance + preservation patches;
|
|
29
|
+
// see ROADMAP for rationale).
|
|
29
30
|
const engine = require('./lib/engine');
|
|
30
31
|
|
|
31
32
|
// Lazy-load heavy deps only when their feature is used (keeps cold start fast
|
|
@@ -55,12 +56,16 @@ function parseArgs(argv) {
|
|
|
55
56
|
diff: null,
|
|
56
57
|
range: null,
|
|
57
58
|
namedRange: null,
|
|
59
|
+
region: false,
|
|
58
60
|
maxRows: null,
|
|
59
61
|
maxCols: null,
|
|
60
62
|
maxTokens: null,
|
|
61
63
|
reportBug: null,
|
|
62
64
|
exportRedactedWorkbook: null,
|
|
63
65
|
help: false,
|
|
66
|
+
enableTelemetry: false,
|
|
67
|
+
disableTelemetry: false,
|
|
68
|
+
telemetryStatus: false,
|
|
64
69
|
};
|
|
65
70
|
let i = 0;
|
|
66
71
|
while (i < argv.length) {
|
|
@@ -77,11 +82,15 @@ function parseArgs(argv) {
|
|
|
77
82
|
else if (arg === '--diff') { opts.diff = argv[++i]; }
|
|
78
83
|
else if (arg === '--range') { opts.range = argv[++i]; }
|
|
79
84
|
else if (arg === '--named-range') { opts.namedRange = argv[++i]; }
|
|
85
|
+
else if (arg === '--region') opts.region = true;
|
|
80
86
|
else if (arg === '--max-rows') { opts.maxRows = parseInt(argv[++i], 10); }
|
|
81
87
|
else if (arg === '--max-cols') { opts.maxCols = parseInt(argv[++i], 10); }
|
|
82
88
|
else if (arg === '--max-tokens') { opts.maxTokens = parseInt(argv[++i], 10); }
|
|
83
89
|
else if (arg === '--report-bug') { opts.reportBug = argv[++i]; }
|
|
84
90
|
else if (arg === '--export-redacted-workbook'){ opts.exportRedactedWorkbook = argv[++i]; }
|
|
91
|
+
else if (arg === '--enable-telemetry') opts.enableTelemetry = true;
|
|
92
|
+
else if (arg === '--disable-telemetry') opts.disableTelemetry = true;
|
|
93
|
+
else if (arg === '--telemetry-status') opts.telemetryStatus = true;
|
|
85
94
|
else if (arg === '-h' || arg === '--help') opts.help = true;
|
|
86
95
|
else opts.positional.push(arg);
|
|
87
96
|
i++;
|
|
@@ -111,6 +120,10 @@ Selection:
|
|
|
111
120
|
[sheetName] Positional second arg, dump only this sheet
|
|
112
121
|
--range A1:D50 Dump only this rectangular range
|
|
113
122
|
--named-range NM Dump only the cells covered by this defined name
|
|
123
|
+
--region Auto-detect the dominant contiguous data block (Excel
|
|
124
|
+
"current region" semantics); picks the largest region
|
|
125
|
+
by populated-cell count when multiple disjoint blocks
|
|
126
|
+
exist. Compatible with --max-rows / --max-cols.
|
|
114
127
|
--max-rows N Limit to first N rows per sheet
|
|
115
128
|
--max-cols N Limit to first N columns per sheet
|
|
116
129
|
|
|
@@ -141,6 +154,21 @@ Bug reporting (privacy-by-design — no data leaves your machine):
|
|
|
141
154
|
structure, styles, named ranges preserved. Optional
|
|
142
155
|
attachment for hard-to-repro bugs.
|
|
143
156
|
|
|
157
|
+
Crash telemetry (opt-in only):
|
|
158
|
+
--enable-telemetry
|
|
159
|
+
Opt in to anonymous crash telemetry. Only error type,
|
|
160
|
+
sanitized error message (paths scrubbed, ≤200 chars),
|
|
161
|
+
tool version, Node version, and OS/arch are sent.
|
|
162
|
+
No paths, no cell values, no identifiers.
|
|
163
|
+
Payload: { v, ts, error_type, error_message, command,
|
|
164
|
+
xlsx_for_ai_version, node_version, os_arch }
|
|
165
|
+
Consent persists at ~/.xlsx-for-ai/config.json across
|
|
166
|
+
upgrades.
|
|
167
|
+
--disable-telemetry
|
|
168
|
+
Opt out. Config file is kept (explicit "no" is recorded).
|
|
169
|
+
--telemetry-status
|
|
170
|
+
Show current state and config path.
|
|
171
|
+
|
|
144
172
|
Misc:
|
|
145
173
|
-h, --help Show this help
|
|
146
174
|
|
|
@@ -150,6 +178,8 @@ Examples:
|
|
|
150
178
|
npx xlsx-for-ai data.xlsx --json --max-tokens 8000 --stdout
|
|
151
179
|
npx xlsx-for-ai data.csv --md --stdout
|
|
152
180
|
npx xlsx-for-ai data.xlsx --range B2:F100 --stdout
|
|
181
|
+
npx xlsx-for-ai data.xlsx --region --stdout
|
|
182
|
+
npx xlsx-for-ai data.xlsx --region --max-rows 50 --stdout
|
|
153
183
|
npx xlsx-for-ai data.xlsx --named-range MyTotals --stdout
|
|
154
184
|
npx xlsx-for-ai data.xlsx --sql --stdout > schema.sql
|
|
155
185
|
npx xlsx-for-ai old.xlsx --diff new.xlsx --stdout
|
|
@@ -325,6 +355,115 @@ function resolveNamedRange(wb, name) {
|
|
|
325
355
|
return { sheet: sheetName, range: parseRange(rangeStr) };
|
|
326
356
|
}
|
|
327
357
|
|
|
358
|
+
// ---------------------------------------------------------------------------
// Region detection — "current region" semantics (Excel Ctrl+Shift+*)
//
// Finds the dominant contiguous data block on a worksheet. Algorithm:
//   1. Scan rows 1..ws.rowCount × columns 1..ws.columnCount and record every
//      populated cell (value is neither null/undefined nor '').
//   2. Flood-fill connected components using 8-neighbor adjacency (cells that
//      share a corner or an edge belong to the same region).
//   3. While filling, track each component's populated-cell count and its
//      bounding rectangle.
//   4. Return the bounding box of the component with the most populated cells
//      (ties go to the component found first in row-major scan order).
//
// Returns {startRow, endRow, startCol, endCol} (1-indexed), or null when the
// sheet has no populated cells or its reported dimensions exceed the scan cap.
// ---------------------------------------------------------------------------

/**
 * @param {object} ws Worksheet exposing `rowCount`, `columnCount` and
 *   `getRow(r).getCell(c).value`.
 * @returns {{startRow: number, endRow: number, startCol: number, endCol: number} | null}
 */
function detectRegion(ws) {
  const MAX_SCAN_CELLS = 5_000_000;

  const rowCount = ws.rowCount;
  const colCount = ws.columnCount;
  if (rowCount === 0 || colCount === 0) return null;

  // ExcelJS reports rowCount/columnCount as the highest USED row/column,
  // not actual storage. A workbook with one cell at XFD1048576 reports
  // 1048576 × 16384 = ~17B coordinates. Refuse the scan past the cap —
  // pathological/malicious inputs would otherwise hang the CLI.
  if (rowCount * colCount > MAX_SCAN_CELLS) {
    console.warn(
      `detectRegion: workbook reports ${rowCount}×${colCount} cell dimensions, ` +
      `exceeds 5M-cell scan cap; skipping region detection`
    );
    return null;
  }

  // Step 1: collect populated cells. Each cell is encoded as one integer
  // (row * stride + col); stride > colCount keeps the encoding collision-free.
  // A Set iterates in insertion order, so seeds are visited row-major.
  const stride = colCount + 1;
  const populated = new Set();
  for (let r = 1; r <= rowCount; r++) {
    const row = ws.getRow(r);
    for (let c = 1; c <= colCount; c++) {
      const v = row.getCell(c).value;
      if (v != null && v !== '') populated.add(r * stride + c);
    }
  }
  if (populated.size === 0) return null;

  // Steps 2–4: flood-fill each unvisited seed, keeping only the best box.
  const visited = new Set();
  let bestCount = -1;
  let bestBox = null;

  for (const seed of populated) {
    if (visited.has(seed)) continue;
    visited.add(seed);

    // The queue is consumed with a moving head index: Array.prototype.shift()
    // is O(n) per call and would make the fill quadratic on large regions.
    const queue = [seed];
    let minR = Infinity;
    let maxR = -Infinity;
    let minC = Infinity;
    let maxC = -Infinity;

    for (let head = 0; head < queue.length; head++) {
      const key = queue[head];
      const c = key % stride;
      const r = (key - c) / stride;

      if (r < minR) minR = r;
      if (r > maxR) maxR = r;
      if (c < minC) minC = c;
      if (c > maxC) maxC = c;

      for (let dr = -1; dr <= 1; dr++) {
        for (let dc = -1; dc <= 1; dc++) {
          if (dr === 0 && dc === 0) continue;
          const nr = r + dr;
          const nc = c + dc;
          if (nr < 1 || nc < 1 || nr > rowCount || nc > colCount) continue;
          const nk = nr * stride + nc;
          if (visited.has(nk) || !populated.has(nk)) continue;
          visited.add(nk);
          queue.push(nk);
        }
      }
    }

    // Strictly-greater comparison: the first component found wins a tie.
    if (queue.length > bestCount) {
      bestCount = queue.length;
      bestBox = { startRow: minR, endRow: maxR, startCol: minC, endCol: maxC };
    }
  }

  return bestBox;
}
|
|
466
|
+
|
|
328
467
|
// ---------------------------------------------------------------------------
|
|
329
468
|
// Selection bounds — combines --range, --named-range, --max-rows/cols, sheet
|
|
330
469
|
// dimensions into a single {startRow, startCol, endRow, endCol}.
|
|
@@ -336,6 +475,9 @@ function selectionBounds(ws, opts) {
|
|
|
336
475
|
bounds = parseRange(opts.range);
|
|
337
476
|
} else if (opts.namedRangeBounds) {
|
|
338
477
|
bounds = opts.namedRangeBounds;
|
|
478
|
+
} else if (opts.region) {
|
|
479
|
+
bounds = detectRegion(ws);
|
|
480
|
+
// bounds may be null (empty sheet); handled below by falling back to sheet dimensions.
|
|
339
481
|
}
|
|
340
482
|
const startRow = bounds ? bounds.startRow : 1;
|
|
341
483
|
const startCol = bounds ? bounds.startCol : 1;
|
|
@@ -1709,6 +1851,56 @@ async function main() {
|
|
|
1709
1851
|
|
|
1710
1852
|
if (opts.help) { printHelp(); process.exit(0); }
|
|
1711
1853
|
|
|
1854
|
+
// ---------------------------------------------------------------------------
|
|
1855
|
+
// Telemetry management flags — handled before crash hooks are registered.
|
|
1856
|
+
// ---------------------------------------------------------------------------
|
|
1857
|
+
if (opts.enableTelemetry || opts.disableTelemetry || opts.telemetryStatus) {
|
|
1858
|
+
const telCfg = require('./lib/telemetry-config');
|
|
1859
|
+
|
|
1860
|
+
if (opts.enableTelemetry) {
|
|
1861
|
+
telCfg.enableTelemetry();
|
|
1862
|
+
console.log('Crash telemetry enabled.');
|
|
1863
|
+
console.log('');
|
|
1864
|
+
console.log('When a crash occurs, this payload will be sent:');
|
|
1865
|
+
console.log(JSON.stringify({
|
|
1866
|
+
v: 1,
|
|
1867
|
+
ts: '<ISO-timestamp>',
|
|
1868
|
+
error_type: '<e.g. TypeError>',
|
|
1869
|
+
error_message: '<sanitized, ≤200 chars — paths scrubbed>',
|
|
1870
|
+
command: '<first CLI arg from allowlist, or "<other>">',
|
|
1871
|
+
xlsx_for_ai_version: require('./package.json').version,
|
|
1872
|
+
node_version: process.version,
|
|
1873
|
+
os_arch: `${process.platform}-${process.arch}`,
|
|
1874
|
+
}, null, 2));
|
|
1875
|
+
console.log('');
|
|
1876
|
+
console.log('No paths, no cell values, no identifiers. Consent stored at:');
|
|
1877
|
+
console.log(telCfg.configPath());
|
|
1878
|
+
return;
|
|
1879
|
+
}
|
|
1880
|
+
|
|
1881
|
+
if (opts.disableTelemetry) {
|
|
1882
|
+
telCfg.disableTelemetry();
|
|
1883
|
+
console.log('Crash telemetry disabled.');
|
|
1884
|
+
console.log('Config kept at: ' + telCfg.configPath());
|
|
1885
|
+
return;
|
|
1886
|
+
}
|
|
1887
|
+
|
|
1888
|
+
if (opts.telemetryStatus) {
|
|
1889
|
+
const status = telCfg.telemetryStatus();
|
|
1890
|
+
console.log(`Telemetry status: ${status}`);
|
|
1891
|
+
console.log(`Config path: ${telCfg.configPath()}`);
|
|
1892
|
+
return;
|
|
1893
|
+
}
|
|
1894
|
+
}
|
|
1895
|
+
|
|
1896
|
+
// ---------------------------------------------------------------------------
|
|
1897
|
+
// Register process-level crash hooks (no-op if user hasn't opted in).
|
|
1898
|
+
// ---------------------------------------------------------------------------
|
|
1899
|
+
{
|
|
1900
|
+
const { registerCrashHooks } = require('./lib/telemetry-hooks');
|
|
1901
|
+
registerCrashHooks(require('./package.json').version);
|
|
1902
|
+
}
|
|
1903
|
+
|
|
1712
1904
|
// Bug-report and redacted-workbook modes consume their input via the
|
|
1713
1905
|
// flag itself, so they bypass the normal positional / loader path.
|
|
1714
1906
|
if (opts.reportBug) {
|
|
@@ -1838,6 +2030,16 @@ async function main() {
|
|
|
1838
2030
|
|
|
1839
2031
|
const baseName = path.basename(filePath, path.extname(filePath));
|
|
1840
2032
|
|
|
2033
|
+
// --region: warn per-sheet if no data block was found (empty sheet).
|
|
2034
|
+
if (opts.region) {
|
|
2035
|
+
for (const ws of sheets) {
|
|
2036
|
+
const r = detectRegion(ws);
|
|
2037
|
+
if (!r) {
|
|
2038
|
+
console.error(`note: --region: no data found in sheet "${ws.name}"; dumping full sheet dimensions.`);
|
|
2039
|
+
}
|
|
2040
|
+
}
|
|
2041
|
+
}
|
|
2042
|
+
|
|
1841
2043
|
// Pick output formatter.
|
|
1842
2044
|
const renderText = (ws) => dumpSheet(ws, wb, perSheetOpts);
|
|
1843
2045
|
const renderMd = (ws) => dumpSheetMarkdown(ws, wb, perSheetOpts);
|
|
@@ -1965,4 +2167,7 @@ module.exports = {
|
|
|
1965
2167
|
trySimpleEval,
|
|
1966
2168
|
// budget
|
|
1967
2169
|
applyTokenBudget,
|
|
2170
|
+
// region detection
|
|
2171
|
+
detectRegion,
|
|
2172
|
+
selectionBounds,
|
|
1968
2173
|
};
|
package/lib/bugReport.js
CHANGED
|
@@ -21,7 +21,7 @@ const fs = require('fs');
|
|
|
21
21
|
const path = require('path');
|
|
22
22
|
const os = require('os');
|
|
23
23
|
const JSZip = require('jszip');
|
|
24
|
-
const
|
|
24
|
+
const engine = require('./engine');
|
|
25
25
|
|
|
26
26
|
const PKG_VERSION = require('../package.json').version;
|
|
27
27
|
|
|
@@ -117,21 +117,6 @@ function inventoryFeatures(filenames) {
|
|
|
117
117
|
return out;
|
|
118
118
|
}
|
|
119
119
|
|
|
120
|
-
// Given the workbook.xml, extract the sheet relationship Ids and order
|
|
121
|
-
// without reading any user content. We just need names and rIds so we
|
|
122
|
-
// can pair them with worksheet parts to compute per-sheet stats.
|
|
123
|
-
function listSheetPartNames(zip) {
|
|
124
|
-
// Resolve via workbook rels: xl/_rels/workbook.xml.rels.
|
|
125
|
-
const out = [];
|
|
126
|
-
const relsFile = zip.file('xl/_rels/workbook.xml.rels');
|
|
127
|
-
if (!relsFile) return out;
|
|
128
|
-
// Sync — we already have the file in memory inside JSZip.
|
|
129
|
-
// We use a lightweight regex; structural only, no values inside.
|
|
130
|
-
// Each Relationship: <Relationship Id="rId1" Type="..." Target="worksheets/sheet1.xml"/>
|
|
131
|
-
// We can't do sync read without loading; caller already loaded.
|
|
132
|
-
return out;
|
|
133
|
-
}
|
|
134
|
-
|
|
135
120
|
async function generateBugReport(filePath) {
|
|
136
121
|
if (!fs.existsSync(filePath)) {
|
|
137
122
|
throw new Error(`File not found: ${filePath}`);
|
|
@@ -167,8 +152,7 @@ async function generateBugReport(filePath) {
|
|
|
167
152
|
let exceljsError = null;
|
|
168
153
|
|
|
169
154
|
try {
|
|
170
|
-
const wb =
|
|
171
|
-
await wb.xlsx.readFile(filePath);
|
|
155
|
+
const wb = await engine.loadWorkbook(filePath);
|
|
172
156
|
sheetCount = wb.worksheets.length;
|
|
173
157
|
for (const ws of wb.worksheets) {
|
|
174
158
|
const merges = ws.model && ws.model.merges ? ws.model.merges.length : 0;
|
package/lib/engine.js
CHANGED
|
@@ -23,7 +23,7 @@ class ExcelJSEngine {
|
|
|
23
23
|
/** Engine identifier — useful for diagnostics. */
|
|
24
24
|
get name() { return 'exceljs'; }
|
|
25
25
|
get version() {
|
|
26
|
-
try { return require('exceljs/package.json').version; } catch (_) { return 'unknown'; }
|
|
26
|
+
try { return require('@protobi/exceljs/package.json').version; } catch (_) { return 'unknown'; }
|
|
27
27
|
}
|
|
28
28
|
|
|
29
29
|
/**
|
package/lib/redactWorkbook.js
CHANGED
|
@@ -117,19 +117,138 @@ function redactSharedStringsXml(xml) {
|
|
|
117
117
|
}
|
|
118
118
|
|
|
119
119
|
// Comments: <comment><text><r>...<t>USER TEXT</t></r></text></comment>
// Replace every non-empty <t> payload with "x". Also masks the
// <author>NAME</author> display names inside <authors>; the numeric authorId
// on each <comment> keeps pointing at the (now redacted) author entry.

/**
 * Redact user text from a legacy comments part.
 *
 * @param {string} xml Raw XML of the comments part.
 * @returns {string} The same XML with <t> and <author> bodies masked. Empty
 *   bodies and self-closing elements are left as they are.
 */
function redactCommentsXml(xml) {
  // Non-empty bodies collapse to "x"; empty bodies stay empty.
  const maskBody = (match, openTag, body, closeTag) =>
    openTag + (body === '' ? '' : 'x') + closeTag;

  // The opening-tag patterns refuse to end in "/>". Otherwise a self-closing
  // <t/> would be taken as an opening tag and the lazy body would run on to
  // the NEXT </t>, swallowing the markup in between and leaving unbalanced XML.
  return xml
    .replace(/(<t\b(?:[^>]*[^>\/])?>)([\s\S]*?)(<\/t>)/g, maskBody)
    .replace(/(<author\b(?:[^>]*[^>\/])?>)([\s\S]*?)(<\/author>)/g, maskBody);
}
|
|
126
132
|
|
|
127
133
|
// Threaded comments. The comment body can appear in two shapes and both are
// scrubbed:
//   <threadedComment ...><text>USER TEXT</text></threadedComment>   (child element)
//   <threadedComment ... text="USER TEXT" .../>                      (attribute)
// Attribute values may be double- or single-quoted and XML permits whitespace
// around "=", so all of those spellings are matched.

/**
 * Redact the body of every threaded comment in a threadedComment part.
 *
 * @param {string} xml Raw XML of the threaded-comments part.
 * @returns {string} The same XML with comment bodies replaced by "x" (an empty
 *   <text></text> element stays empty).
 */
function redactThreadedCommentsXml(xml) {
  return xml
    // Attribute form; output is normalised to text="x".
    .replace(/\btext\s*=\s*("[^"]*"|'[^']*')/g, 'text="x"')
    // Element form. The opening-tag pattern refuses to end in "/>" so a
    // self-closing <text/> is never paired with a later </text>.
    .replace(
      /(<text\b(?:[^>]*[^>\/])?>)([\s\S]*?)(<\/text>)/g,
      (match, openTag, body, closeTag) => openTag + (body === '' ? '' : 'x') + closeTag
    );
}
|
|
132
140
|
|
|
141
|
+
// xl/persons/person.xml — author registry for threaded comments.
//   <person displayName="Alice" id="..." userId="alice@co.com" providerId="AzureAD"/>
// Mask the three identifying attributes; leave id untouched so threaded
// comment references to it still resolve.

/**
 * Redact identifying attributes from the persons part.
 *
 * @param {string} xml Raw XML of the persons part.
 * @returns {string} The same XML with displayName / userId / providerId values
 *   replaced by "x" (always emitted double-quoted).
 */
function redactPersonsXml(xml) {
  // Accept both quote styles and optional whitespace around "=": all are valid
  // XML, and missing any of them would let a real name or e-mail through.
  return xml.replace(
    /\b(displayName|userId|providerId)\s*=\s*("[^"]*"|'[^']*')/g,
    '$1="x"'
  );
}
|
|
151
|
+
|
|
152
|
+
// docProps/core.xml — strip author, title, subject, description, keywords,
|
|
153
|
+
// category, lastModifiedBy, and any other user-text elements.
|
|
154
|
+
// The timestamp elements (dcterms:created / dcterms:modified) and structural
|
|
155
|
+
// elements (the xmlns declarations, DocSecurity, etc.) are left alone because
|
|
156
|
+
// they're non-identifying metadata needed for round-trip fidelity.
|
|
157
|
+
//
|
|
158
|
+
// Elements scrubbed:
|
|
159
|
+
// dc:creator → the file's original author name
|
|
160
|
+
// dc:title → document title set by author
|
|
161
|
+
// dc:subject → subject field
|
|
162
|
+
// dc:description → description field
|
|
163
|
+
// cp:keywords → keyword tags
|
|
164
|
+
// cp:category → category field
|
|
165
|
+
// cp:lastModifiedBy → last editor's name
|
|
166
|
+
// cp:contentStatus → rarely set, but can contain user text
|
|
167
|
+
const CORE_SCRUB_TAGS = [
  'dc:creator',
  'dc:title',
  'dc:subject',
  'dc:description',
  'cp:keywords',
  'cp:category',
  'cp:lastModifiedBy',
  'cp:contentStatus',
];

/**
 * Empty the body of every element named in CORE_SCRUB_TAGS.
 *
 * @param {string} xml Raw XML of docProps/core.xml.
 * @returns {string} The same XML with each listed element reduced to its
 *   opening tag (attributes kept) immediately followed by its closing tag.
 */
function redactCoreXml(xml) {
  return CORE_SCRUB_TAGS.reduce((scrubbed, tagName) => {
    // Opening tag (attributes allowed) … lazy body, newlines included … closing tag.
    const element = new RegExp(`(<${tagName}\\b[^>]*>)[\\s\\S]*?(<\\/${tagName}>)`, 'g');
    return scrubbed.replace(element, '$1$2');
  }, xml);
}
|
|
190
|
+
|
|
191
|
+
// docProps/app.xml — strip Company, Manager, and HyperlinkBase which can
|
|
192
|
+
// contain user-identifying strings. The Application, AppVersion, DocSecurity,
|
|
193
|
+
// HeadingPairs, and TitlesOfParts (sheet names) fields are structural and left
|
|
194
|
+
// alone — sheet names are part of workbook structure, not cell values.
|
|
195
|
+
const APP_SCRUB_TAGS = [
  'Company',
  'Manager',
  'HyperlinkBase',
];

/**
 * Empty the body of every element named in APP_SCRUB_TAGS.
 *
 * @param {string} xml Raw XML of docProps/app.xml.
 * @returns {string} The same XML with each listed element reduced to its
 *   opening tag (attributes kept) immediately followed by its closing tag.
 */
function redactAppXml(xml) {
  return APP_SCRUB_TAGS.reduce((scrubbed, tagName) => {
    // Opening tag (attributes allowed) … lazy body, newlines included … closing tag.
    const element = new RegExp(`(<${tagName}\\b[^>]*>)[\\s\\S]*?(<\\/${tagName}>)`, 'g');
    return scrubbed.replace(element, '$1$2');
  }, xml);
}
|
|
211
|
+
|
|
212
|
+
// docProps/custom.xml — custom properties are arbitrary user-defined key/value
|
|
213
|
+
// pairs. Strip the value payloads; keep the property names so the file remains
|
|
214
|
+
// structurally valid.
|
|
215
|
+
/**
 * Blank the payload of every typed-value (<vt:*>) leaf element.
 *
 * @param {string} xml Raw XML of docProps/custom.xml.
 * @returns {string} The same XML with each text-only <vt:*> element emptied;
 *   the element itself (and therefore the declared type) is kept.
 */
function redactCustomPropsXml(xml) {
  // How the pattern reads:
  //  - the name class includes digits, so numeric type names such as vt:r4,
  //    vt:i8 or vt:ui4 match as well as vt:lpwstr / vt:filetime;
  //  - \2 forces the closing tag to carry the same name as the opening tag;
  //  - the body is [^<]*, i.e. text only. A wrapper such as
  //    <vt:variant><vt:lpwstr>X</vt:lpwstr></vt:variant> is therefore not
  //    matched itself; only the inner text-only element is, so the wrapper
  //    survives intact around an emptied leaf.
  const typedLeaf = /(<(vt:[a-zA-Z0-9]+)\b[^>]*>)[^<]*(<\/\2>)/g;
  return xml.replace(typedLeaf, (whole, openTag, tagName, closeTag) => openTag + closeTag);
}
|
|
230
|
+
|
|
231
|
+
// 1×1 transparent PNG placeholder. Written over xl/media/ entries when their
// binary content is stripped, so the ZIP keeps an entry at every path that
// drawing relationships point to.
// 67 bytes in total: 8-byte signature + 25-byte IHDR chunk + 22-byte IDAT
// chunk + 12-byte IEND chunk. Segments below follow PNG field boundaries.
// NOTE(review): the last four IDAT payload bytes (00 02 00 01) sit where
// zlib's Adler-32 trailer goes — confirm with a PNG decoder that this stream
// inflates to the 5-byte scanline (filter byte + RGBA) a 1×1 image needs.
const TRANSPARENT_1X1_PNG = Buffer.from(
  [
    '89504e470d0a1a0a',     // PNG signature
    '0000000d',             // IHDR data length (13)
    '49484452',             // "IHDR"
    '00000001',             // width = 1
    '00000001',             // height = 1
    '0806000000',           // bit depth 8, colour type 6 (RGBA), compression 0, filter 0, interlace 0
    '1f15c489',             // IHDR CRC field
    '0000000a',             // IDAT data length (10)
    '49444154',             // "IDAT"
    '789c6260000000020001', // IDAT payload (zlib stream)
    'e221bc33',             // IDAT CRC field
    '00000000',             // IEND data length (0)
    '49454e44',             // "IEND"
    'ae426082',             // IEND CRC field
  ].join(''),
  'hex',
);
|
|
251
|
+
|
|
133
252
|
async function exportRedactedWorkbook(inputPath, outputPath) {
|
|
134
253
|
if (!fs.existsSync(inputPath)) {
|
|
135
254
|
throw new Error(`File not found: ${inputPath}`);
|
|
@@ -160,6 +279,23 @@ async function exportRedactedWorkbook(inputPath, outputPath) {
|
|
|
160
279
|
} else if (/^xl\/threadedComments\/threadedComment\d+\.xml$/i.test(name)) {
|
|
161
280
|
const xml = await file.async('string');
|
|
162
281
|
zip.file(name, redactThreadedCommentsXml(xml));
|
|
282
|
+
} else if (/^docProps\/core\.xml$/i.test(name)) {
|
|
283
|
+
const xml = await file.async('string');
|
|
284
|
+
zip.file(name, redactCoreXml(xml));
|
|
285
|
+
} else if (/^docProps\/app\.xml$/i.test(name)) {
|
|
286
|
+
const xml = await file.async('string');
|
|
287
|
+
zip.file(name, redactAppXml(xml));
|
|
288
|
+
} else if (/^docProps\/custom\.xml$/i.test(name)) {
|
|
289
|
+
const xml = await file.async('string');
|
|
290
|
+
zip.file(name, redactCustomPropsXml(xml));
|
|
291
|
+
} else if (/^xl\/persons\/person\.xml$/i.test(name)) {
|
|
292
|
+
const xml = await file.async('string');
|
|
293
|
+
zip.file(name, redactPersonsXml(xml));
|
|
294
|
+
} else if (/^xl\/media\//i.test(name)) {
|
|
295
|
+
// Embedded images / media — replace with a 1×1 transparent PNG so
|
|
296
|
+
// drawing relationships remain valid and the ZIP is structurally intact,
|
|
297
|
+
// but no user-supplied binary data survives in the output.
|
|
298
|
+
zip.file(name, TRANSPARENT_1X1_PNG);
|
|
163
299
|
}
|
|
164
300
|
// All other parts pass through untouched.
|
|
165
301
|
}
|
|
@@ -180,4 +316,11 @@ module.exports = {
|
|
|
180
316
|
// exported for unit testing
|
|
181
317
|
_redactSheetXml: redactSheetXml,
|
|
182
318
|
_redactSharedStringsXml: redactSharedStringsXml,
|
|
319
|
+
_redactCommentsXml: redactCommentsXml,
|
|
320
|
+
_redactThreadedCommentsXml: redactThreadedCommentsXml,
|
|
321
|
+
_redactCoreXml: redactCoreXml,
|
|
322
|
+
_redactAppXml: redactAppXml,
|
|
323
|
+
_redactCustomPropsXml: redactCustomPropsXml,
|
|
324
|
+
_redactPersonsXml: redactPersonsXml,
|
|
325
|
+
_TRANSPARENT_1X1_PNG: TRANSPARENT_1X1_PNG,
|
|
183
326
|
};
|
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
/**
|
|
4
|
+
* Persistent user-level telemetry config at ~/.xlsx-for-ai/config.json.
|
|
5
|
+
*
|
|
6
|
+
* Stored outside node_modules so consent survives `npm install -g xlsx-for-ai@latest`
|
|
7
|
+
* upgrades. Path is resolved via os.homedir() for cross-platform support.
|
|
8
|
+
*
|
|
9
|
+
* Config shape:
|
|
10
|
+
* { "telemetry": true, "consented_at": "ISO-string", "consent_version": 1 }
|
|
11
|
+
*
|
|
12
|
+
* consent_version: bump CURRENT_CONSENT_VERSION when the telemetry shape changes.
|
|
13
|
+
* If the file's version is older, telemetry is PAUSED until the user re-runs
|
|
14
|
+
* --enable-telemetry. Never silently expand data shape under old consent.
|
|
15
|
+
*/
|
|
16
|
+
|
|
17
|
+
const fs = require('fs');
|
|
18
|
+
const path = require('path');
|
|
19
|
+
const os = require('os');
|
|
20
|
+
|
|
21
|
+
const CURRENT_CONSENT_VERSION = 1;
|
|
22
|
+
|
|
23
|
+
/**
|
|
24
|
+
* Return the path to the config file. Uses XFA_CONFIG_DIR env var for test
|
|
25
|
+
* isolation; otherwise defaults to ~/.xlsx-for-ai/config.json.
|
|
26
|
+
*/
|
|
27
|
+
function configDir() {
|
|
28
|
+
return process.env.XFA_CONFIG_DIR || path.join(os.homedir(), '.xlsx-for-ai');
|
|
29
|
+
}
|
|
30
|
+
|
|
31
|
+
function configPath() {
|
|
32
|
+
return path.join(configDir(), 'config.json');
|
|
33
|
+
}
|
|
34
|
+
|
|
35
|
+
/**
|
|
36
|
+
* Read config from disk. Returns null if file doesn't exist or is unreadable.
|
|
37
|
+
*/
|
|
38
|
+
function readConfig() {
|
|
39
|
+
try {
|
|
40
|
+
const raw = fs.readFileSync(configPath(), 'utf8');
|
|
41
|
+
return JSON.parse(raw);
|
|
42
|
+
} catch (_) {
|
|
43
|
+
return null;
|
|
44
|
+
}
|
|
45
|
+
}
|
|
46
|
+
|
|
47
|
+
/**
|
|
48
|
+
* Write config to disk atomically. Creates the directory if needed.
|
|
49
|
+
*/
|
|
50
|
+
function writeConfig(data) {
|
|
51
|
+
const dir = configDir();
|
|
52
|
+
fs.mkdirSync(dir, { recursive: true });
|
|
53
|
+
fs.writeFileSync(configPath(), JSON.stringify(data, null, 2) + '\n', 'utf8');
|
|
54
|
+
}
|
|
55
|
+
|
|
56
|
+
/**
 * Classify the stored telemetry preference.
 *
 * @returns {string} One of:
 *   'enabled'                           - `telemetry` is true and consent_version matches
 *   'disabled'                          - `telemetry` is explicitly false
 *   'paused (consent_version mismatch)' - `telemetry` is true but consent_version differs
 *   'not configured'                    - no readable config, or `telemetry` is neither true nor false
 */
function telemetryStatus() {
  const config = readConfig();
  if (!config) {
    return 'not configured';
  }

  // `switch` compares with ===, so only the real booleans are recognised.
  switch (config.telemetry) {
    case false:
      return 'disabled';
    case true:
      return config.consent_version === CURRENT_CONSENT_VERSION
        ? 'enabled'
        : 'paused (consent_version mismatch)';
    default:
      return 'not configured';
  }
}
|
|
75
|
+
|
|
76
|
+
/**
 * Whether telemetry may be sent right now.
 *
 * @returns {boolean} true only when telemetryStatus() reports 'enabled'.
 */
function isTelemetryActive() {
  const status = telemetryStatus();
  return status === 'enabled';
}
|
|
82
|
+
|
|
83
|
+
/**
 * Opt in to telemetry.
 *
 * Keeps whatever else is already stored in the config and overwrites the
 * three consent fields: `telemetry` (true), `consented_at` (current time,
 * refreshed on every call) and `consent_version` (the current version).
 */
function enableTelemetry() {
  const previous = readConfig() || {};
  const consent = {
    telemetry: true,
    consented_at: new Date().toISOString(),
    consent_version: CURRENT_CONSENT_VERSION,
  };
  writeConfig({ ...previous, ...consent });
}
|
|
96
|
+
|
|
97
|
+
/**
 * Opt out of telemetry.
 *
 * Stores an explicit `telemetry: false` (other stored fields are kept) rather
 * than deleting the file, so "user said no" stays distinguishable from
 * "never asked".
 */
function disableTelemetry() {
  const previous = readConfig() || {};
  const next = Object.assign({}, previous, { telemetry: false });
  writeConfig(next);
}
|
|
105
|
+
|
|
106
|
+
// Public surface of the telemetry config module: the consent version constant,
// config file helpers, and the status / opt-in / opt-out functions.
module.exports = {
  CURRENT_CONSENT_VERSION,
  configPath,
  readConfig,
  writeConfig,
  telemetryStatus,
  isTelemetryActive,
  enableTelemetry,
  disableTelemetry,
};
|
|
@@ -0,0 +1,138 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
/**
|
|
4
|
+
* Process-level crash telemetry hooks for xlsx-for-ai.
|
|
5
|
+
*
|
|
6
|
+
* Registers uncaughtException + unhandledRejection handlers only when the user
|
|
7
|
+
* has opted in (isTelemetryActive() === true). On crash, sends a minimal,
|
|
8
|
+
* sanitized payload, then prints the original error and exits with code 1 so the user still
|
|
9
|
+
* sees the stack trace and gets a non-zero exit code.
|
|
10
|
+
*
|
|
11
|
+
* Endpoint: XLSX_FOR_AI_TELEMETRY_ENDPOINT env var if set, else default below.
|
|
12
|
+
* // Endpoint deployment tracked separately — see project memory
|
|
13
|
+
* // project_xlsx_for_ai_telemetry_endpoint.md (TBD).
|
|
14
|
+
*/
|
|
15
|
+
|
|
16
|
+
const https = require('https');
|
|
17
|
+
const http = require('http');
|
|
18
|
+
const { URL } = require('url');
|
|
19
|
+
|
|
20
|
+
const { isTelemetryActive, telemetryStatus } = require('./telemetry-config');
|
|
21
|
+
const { buildPayload } = require('./telemetry-sanitize');
|
|
22
|
+
|
|
23
|
+
// Upper bound (milliseconds) that sendPayload() waits before giving up on a send.
const SEND_TIMEOUT_MS = 2000;
|
|
24
|
+
|
|
25
|
+
// Fallback URL used by resolveEndpoint() when XLSX_FOR_AI_TELEMETRY_ENDPOINT is unset.
// Endpoint deployment tracked separately — see project memory
// project_xlsx_for_ai_telemetry_endpoint.md (TBD).
const DEFAULT_ENDPOINT = 'https://telemetry.xlsx-for-ai.dev/v1/crash';
|
|
28
|
+
|
|
29
|
+
/**
 * Pick the URL crash reports are sent to.
 *
 * @returns {string} The XLSX_FOR_AI_TELEMETRY_ENDPOINT environment variable
 *   when it is set to a non-empty value, otherwise DEFAULT_ENDPOINT.
 */
function resolveEndpoint() {
  const override = process.env.XLSX_FOR_AI_TELEMETRY_ENDPOINT;
  if (override) {
    return override;
  }
  return DEFAULT_ENDPOINT;
}
|
|
32
|
+
|
|
33
|
+
/**
 * POST a payload as JSON to the telemetry endpoint.
 *
 * The returned Promise ALWAYS resolves (with undefined) and never rejects:
 *   - when the response has been fully received, whatever its status code
 *   - when the request or response errors, or the connection closes early
 *   - when the payload cannot be serialized or the endpoint URL is invalid
 *   - after SEND_TIMEOUT_MS, measured from the start of the send and covering
 *     the whole exchange (including a stalled response body)
 *
 * A hung or failing send must never block or break process exit.
 *
 * @param {object} payload - JSON-serializable crash payload.
 * @returns {Promise<void>}
 */
function sendPayload(payload) {
  return new Promise((resolve) => {
    let req;
    let timer;
    let settled = false;

    // Single exit point: resolve once and stop the timeout.
    const finish = () => {
      if (settled) return;
      settled = true;
      clearTimeout(timer);
      resolve();
    };

    try {
      // Anything that throws synchronously in here (cyclic payload, bad URL,
      // invalid request options) must resolve, not reject.
      const body = JSON.stringify(payload);
      const parsed = new URL(resolveEndpoint());
      const isPlainHttp = parsed.protocol === 'http:';
      const transport = isPlainHttp ? http : https;
      const options = {
        hostname: parsed.hostname,
        port: parsed.port || (isPlainHttp ? 80 : 443),
        path: parsed.pathname + parsed.search,
        method: 'POST',
        headers: {
          'Content-Type': 'application/json',
          'Content-Length': Buffer.byteLength(body),
        },
      };

      timer = setTimeout(() => {
        try { if (req) req.destroy(); } catch (_) { /* ignore */ }
        finish();
      }, SEND_TIMEOUT_MS);

      req = transport.request(options, (res) => {
        // The timeout stays armed until the body has been consumed, so a
        // server that sends headers and then stalls cannot hang the exit.
        res.on('end', finish);
        res.on('error', finish);
        res.on('close', finish);
        // Drain response body to free the socket.
        res.resume();
      });

      req.on('error', finish);

      req.write(body);
      req.end();
    } catch (_) {
      finish();
    }
  });
}
|
|
88
|
+
|
|
89
|
+
/**
 * Register process-level crash handlers if telemetry is active.
 * Call once at startup. No-op if telemetry is not enabled.
 *
 * When telemetry was opted in but the stored consent_version is stale, a
 * one-line notice is printed to stderr and no handlers are registered.
 *
 * Once registered, an uncaught exception or unhandled rejection triggers one
 * best-effort telemetry send, after which the error is printed to stderr and
 * the process exits with code 1.
 *
 * @param {string} version - Tool version forwarded to buildPayload().
 */
function registerCrashHooks(version) {
  const status = telemetryStatus();

  if (status === 'paused (consent_version mismatch)') {
    process.stderr.write(
      'xlsx-for-ai: telemetry has been updated. Run `xlsx-for-ai --enable-telemetry`' +
      ' to resume on the new shape, or `--telemetry-status` for details.\n'
    );
    return;
  }

  if (status !== 'enabled') return;

  // Same text Node would show: stack if present, else message, else String(err).
  const describe = (err) => ((err && (err.stack || err.message)) || String(err)) + '\n';

  let handling = false;

  async function handleCrash(err) {
    if (handling) {
      // A crash is already being reported and will exit the process; just
      // surface this one so nothing is hidden, without a second send.
      process.stderr.write(describe(err));
      return;
    }
    handling = true;

    try {
      // Building the payload is inside the try as well: never let telemetry
      // mask the real error.
      await sendPayload(buildPayload(err, version));
    } catch (_) {
      // Ignored on purpose; telemetry is best effort.
    }

    // The handler replaces Node's default crash behavior, so print the error
    // and exit non-zero explicitly. Re-throwing here instead would surface as
    // another unhandled rejection and re-enter this handler.
    process.stderr.write(describe(err));
    process.exit(1);
  }

  process.on('uncaughtException', (err) => {
    handleCrash(err);
  });

  process.on('unhandledRejection', (reason) => {
    handleCrash(reason instanceof Error ? reason : new Error(String(reason)));
  });
}
|
|
131
|
+
|
|
132
|
+
// Public surface of the crash-hook module: the hook installer, the sender,
// and the endpoint / timeout settings it uses.
module.exports = {
  registerCrashHooks,
  sendPayload,
  resolveEndpoint,
  DEFAULT_ENDPOINT,
  SEND_TIMEOUT_MS,
};
|
|
@@ -0,0 +1,180 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
/**
|
|
4
|
+
* Sanitization for crash telemetry payloads.
|
|
5
|
+
*
|
|
6
|
+
* INVARIANTS (non-negotiable):
|
|
7
|
+
* - No file paths: scrub /Users/<x>/..., C:\Users\<x>\..., /home/<x>/...
|
|
8
|
+
* - Cap error_message at 200 chars (after scrubbing)
|
|
9
|
+
* - No cell values, no workbook structure (not available post-crash anyway)
|
|
10
|
+
* - No env vars, no argv beyond a hardcoded allowlist
|
|
11
|
+
* - No machine identifier (no hostname, MAC, install ID)
|
|
12
|
+
*
|
|
13
|
+
* Future maintainers: do NOT enrich this payload. The consent_version gates
|
|
14
|
+
* any shape expansion. Bump CURRENT_CONSENT_VERSION in telemetry-config.js
|
|
15
|
+
* before adding new fields.
|
|
16
|
+
*/
|
|
17
|
+
|
|
18
|
+
const os = require('os');
|
|
19
|
+
|
|
20
|
+
// Maximum length of the sanitized error message; sanitizeMessage() truncates to this.
const MAX_MESSAGE_LENGTH = 200;
|
|
21
|
+
|
|
22
|
+
// Allowlisted first-arg values for the command field.
// Everything else becomes '<other>'.
// buildPayload() checks process.argv[2] against this set with an exact match,
// so file names and other free-form arguments are never reported.
const ALLOWED_COMMANDS = new Set([
  'xlsx-for-ai',
  'cursor-reads-xlsx',
  'write',
  '--json',
  '--md',
  '--stdout',
  '--sql',
  '--schema',
  '--compact',
  '--evaluate',
  '--stream',
  '--list-sheets',
  '--diff',
  '--range',
  '--named-range',
  '--max-rows',
  '--max-cols',
  '--max-tokens',
  '--report-bug',
  '--export-redacted-workbook',
  '--enable-telemetry',
  '--disable-telemetry',
  '--telemetry-status',
  '--help',
  '--version',
  '-h',
  '-v',
]);
|
|
53
|
+
|
|
54
|
+
/**
 * Scrub filesystem paths from a string, replacing each one with `<path>`.
 *
 * Covers:
 *   /Users/<name>/...          (macOS)
 *   /home/<name>/...           (Linux)
 *   C:\Users\<name>\...        (Windows, forward or back slash variants)
 *   /var/folders/...           (macOS temp)
 *   /tmp/...                   (Linux/macOS tmp)
 *   /private/tmp/...           (macOS private tmp — e.g. worktrees)
 *   URL-encoded forms of the above (%2FUsers%2F<name>%2F...)
 *   $HOME and %USERPROFILE% references
 *
 * A match ends at the first whitespace character, so only the part of a path
 * before an embedded space is removed.
 *
 * @param {*} str - Text to scrub. Non-strings are returned unchanged.
 * @returns {*} The scrubbed string (or the original non-string value).
 */
function scrubPaths(str) {
  if (typeof str !== 'string') return str;

  // Applied in order. The Windows drive-letter form runs first so the drive
  // prefix is consumed together with the rest of the path.
  const patterns = [
    // Windows: C:\Users\<name>\... or C:/Users/<name>/...
    /[A-Za-z]:[/\\][Uu]sers[/\\][^/\\:\s]+([/\\][^\s]*)*/g,
    // Unix home dirs: /Users/<name>/... or /home/<name>/...
    /\/(Users|home)\/[^/\s:]+([^\s:])*/g,
    // /tmp/... and /private/tmp/...
    /\/(?:private\/)?tmp\/[^\s:]+/g,
    // /var/folders/...
    /\/var\/folders\/[^\s:]+/g,
    // $HOME/... or ${HOME}/... (env-var style references to home dir)
    /\$\{?HOME\}?\/[^\s]*/gi,
    // %USERPROFILE%\... (Windows env-var style)
    /%USERPROFILE%[/\\][^\s]*/gi,
  ];

  // Replace every pattern match and report how many replacements were made.
  function scrubLiteral(s) {
    let hits = 0;
    let text = s;
    for (const pattern of patterns) {
      text = text.replace(pattern, () => {
        hits += 1;
        return '<path>';
      });
    }
    return { text, hits };
  }

  // Decode %XX escapes run by run. Decoding the whole string at once would
  // throw on any stray '%' (e.g. "50% of ...") and skip the encoded-path
  // check entirely, leaking the path.
  function decodePercentRuns(s) {
    return s.replace(/(?:%[0-9A-Fa-f]{2})+/g, (run) => {
      try {
        return decodeURIComponent(run);
      } catch (_) {
        // The run is not valid UTF-8 as a whole; still decode the plain
        // ASCII escapes (such as %2F) so an encoded path cannot hide in it.
        return run.replace(/%[0-7][0-9A-Fa-f]/g, (esc) =>
          String.fromCharCode(Number.parseInt(esc.slice(1), 16)));
      }
    });
  }

  // Pass 1: scrub the text as written.
  const direct = scrubLiteral(str);

  // Pass 2: decode what is left and scrub again. The decoded form is returned
  // only when decoding actually exposed a path; otherwise the caller keeps
  // the original encoding.
  const decoded = decodePercentRuns(direct.text);
  if (decoded === direct.text) return direct.text;

  const revealed = scrubLiteral(decoded);
  return revealed.hits > 0 ? revealed.text : direct.text;
}
|
|
134
|
+
|
|
135
|
+
/**
 * Prepare an error message for the telemetry payload.
 *
 * Falsy input yields an empty string. Anything else is converted to a string,
 * has filesystem paths scrubbed, and is then truncated to MAX_MESSAGE_LENGTH
 * characters (truncation happens after scrubbing).
 *
 * @param {*} message - Raw error message.
 * @returns {string} Sanitized, length-capped message.
 */
function sanitizeMessage(message) {
  if (!message) {
    return '';
  }
  const text = String(message);
  const withoutPaths = scrubPaths(text);
  return withoutPaths.slice(0, MAX_MESSAGE_LENGTH);
}
|
|
143
|
+
|
|
144
|
+
/**
 * Assemble the crash payload sent to the telemetry endpoint.
 *
 * Only the fields listed in the returned object are included: payload version,
 * timestamp, error constructor name, sanitized message, allowlisted command,
 * tool version, Node version and platform/arch.
 *
 * @param {*} err - The thrown value (normally an Error).
 * @param {string} [version] - Tool version; reported as 'unknown' when falsy.
 * @returns {object} Plain payload object.
 */
function buildPayload(err, version) {
  const ctorName = err && err.constructor && err.constructor.name;
  const rawMessage = err && err.message ? err.message : String(err);
  const errorMessage = sanitizeMessage(rawMessage);

  // Report the first CLI argument only when it is a known flag/command;
  // anything else (file names included) is collapsed to '<other>'.
  const pickCommand = () => {
    try {
      const firstArg = process.argv[2];
      if (firstArg && ALLOWED_COMMANDS.has(firstArg)) {
        return firstArg;
      }
    } catch (_) { /* ignore */ }
    return '<other>';
  };
  const command = pickCommand();

  return {
    v: 1,
    ts: new Date().toISOString(),
    error_type: ctorName || 'Error',
    error_message: errorMessage,
    command,
    xlsx_for_ai_version: version || 'unknown',
    node_version: process.version,
    os_arch: `${process.platform}-${process.arch}`,
  };
}
|
|
173
|
+
|
|
174
|
+
// Public surface of the sanitization module: the scrubbing helpers, the
// payload builder, and the limits/allowlist they apply.
module.exports = {
  scrubPaths,
  sanitizeMessage,
  buildPayload,
  MAX_MESSAGE_LENGTH,
  ALLOWED_COMMANDS,
};
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "xlsx-for-ai",
|
|
3
|
-
"version": "1.5.0",
|
|
3
|
+
"version": "1.5.2",
|
|
4
4
|
"description": "CLI that converts .xlsx files into rich text or JSON dumps that AI coding agents (Claude, Cursor, Copilot, ChatGPT, etc.) can read — preserving values, formulas, formatting, colors, column widths, frozen panes, named ranges, tables, and more.",
|
|
5
5
|
"main": "index.js",
|
|
6
6
|
"bin": {
|
|
@@ -17,7 +17,7 @@
|
|
|
17
17
|
"LICENSE"
|
|
18
18
|
],
|
|
19
19
|
"scripts": {
|
|
20
|
-
"test": "node --test test/round-trip.test.js test/output-matrix.test.js test/unit/*.test.js"
|
|
20
|
+
"test": "node --test test/round-trip.test.js test/output-matrix.test.js test/unit/*.test.js tests/telemetry-sanitize.test.js tests/telemetry-config.test.js tests/telemetry-consent-version.test.js tests/telemetry-flags.test.js tests/redactWorkbook-leak-check.test.js"
|
|
21
21
|
},
|
|
22
22
|
"keywords": [
|
|
23
23
|
"xlsx",
|
|
@@ -50,6 +50,7 @@
|
|
|
50
50
|
"@formulajs/formulajs": "^4.6.0",
|
|
51
51
|
"@protobi/exceljs": "^4.4.0-protobi.9",
|
|
52
52
|
"gpt-tokenizer": "^3.4.0",
|
|
53
|
+
"jszip": "^3.10.1",
|
|
53
54
|
"papaparse": "^5.5.3",
|
|
54
55
|
"xlsx": "^0.18.5"
|
|
55
56
|
},
|