xlsx-for-ai 1.5.1 → 1.5.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +21 -0
- package/index.js +12 -0
- package/lib/engine.js +1 -1
- package/lib/redactWorkbook.js +67 -5
- package/package.json +2 -2
package/README.md
CHANGED
|
@@ -317,6 +317,27 @@ Run `rm -rf node_modules package-lock.json && npm install` and the warnings will
|
|
|
317
317
|
|
|
318
318
|
A future release may apply these dep upgrades via `patch-package` so they travel through the dep graph automatically. The infrastructure is in place; the patches haven't been needed urgently because most installs are CLI-direct.
|
|
319
319
|
|
|
320
|
+
### Audit findings on install (what's inherited from upstream)
|
|
321
|
+
|
|
322
|
+
When you `npm install xlsx-for-ai` (especially as a library dep, not the top-level project), `npm audit` may surface one or more advisories. Most are inherited transitively from `@protobi/exceljs` and the legacy `xlsx` fallback parser. Each one has been triaged and is documented in [`.github/audit-allowlist.json`](.github/audit-allowlist.json), which is the canonical list our CI's `audit.yml` job reads.
|
|
323
|
+
|
|
324
|
+
Each allowlist entry includes:
|
|
325
|
+
|
|
326
|
+
- **`ghsa`** — the advisory ID (e.g. `GHSA-w5hq-g745-h8pq`).
|
|
327
|
+
- **`package`** — the dependency the advisory lives on.
|
|
328
|
+
- **`severity`** — the advisory's published severity.
|
|
329
|
+
- **`reason`** — why the finding is accepted, including the code path's reachability in our usage.
|
|
330
|
+
- **`reassess`** — the date by which we will re-evaluate (typically a quarterly cadence).
|
|
331
|
+
- **`owner`** — who owns the re-evaluation.
|
|
332
|
+
|
|
333
|
+
The current set covers two `xlsx` advisories (the npm-published 0.18.5 line is unmaintained; we carry it as a fallback parser only) and one `uuid` advisory inherited from ExcelJS (`v4()` call sites in ExcelJS do not pass a pre-allocated buffer, so the bounds-check gap is unreachable here). An upstream gift PR is open to bump uuid in the protobi fork; once merged and released, the `uuid` line will drop on the next `@protobi/exceljs` update.
|
|
334
|
+
|
|
335
|
+
If you embed xlsx-for-ai in a product with stricter audit policies than ours, you have three clean options:
|
|
336
|
+
|
|
337
|
+
1. **Mirror the allowlist entries** into your own audit configuration (e.g. `npm audit --omit=dev` filters, Snyk policy file, GitHub Dependabot ignore rules) using the same `ghsa` IDs.
|
|
338
|
+
2. **Pin to a future xlsx-for-ai release** that bumps `@protobi/exceljs` past the upstream uuid bump (will drop the `uuid` advisory automatically; tracked in the allowlist's `reassess` date).
|
|
339
|
+
3. **Vendor the parser path you actually use** — if you only need the modern `@protobi/exceljs` engine and not the legacy `xlsx` fallback, you can disable the fallback in your wrapper and the `xlsx` advisories cease to apply to your dep graph.
|
|
340
|
+
|
|
320
341
|
## Reporting bugs
|
|
321
342
|
|
|
322
343
|
**The privacy contract: we never auto-send workbook data.** Anonymous crash telemetry is opt-in via `--enable-telemetry`; even then, we receive only error type, error message (sanitized — paths scrubbed, capped at 200 chars), tool version, Node version, and OS/arch. No paths, no cell values, no identifiers.
|
package/index.js
CHANGED
|
@@ -381,6 +381,18 @@ function detectRegion(ws) {
|
|
|
381
381
|
const colCount = ws.columnCount;
|
|
382
382
|
if (rowCount === 0 || colCount === 0) return null;
|
|
383
383
|
|
|
384
|
+
// ExcelJS reports rowCount/columnCount as the highest USED row/column,
|
|
385
|
+
// not actual storage. A workbook with one cell at XFD1048576 reports
|
|
386
|
+
// 1048576 × 16384 = ~17B coordinates. Refuse the scan past 5M cells —
|
|
387
|
+
// pathological/malicious inputs would otherwise hang the CLI.
|
|
388
|
+
if (rowCount * colCount > 5_000_000) {
|
|
389
|
+
console.warn(
|
|
390
|
+
`detectRegion: workbook reports ${rowCount}×${colCount} cell dimensions, ` +
|
|
391
|
+
`exceeds 5M-cell scan cap; skipping region detection`
|
|
392
|
+
);
|
|
393
|
+
return null;
|
|
394
|
+
}
|
|
395
|
+
|
|
384
396
|
for (let r = 1; r <= rowCount; r++) {
|
|
385
397
|
const row = ws.getRow(r);
|
|
386
398
|
for (let c = 1; c <= colCount; c++) {
|
package/lib/engine.js
CHANGED
|
@@ -23,7 +23,7 @@ class ExcelJSEngine {
|
|
|
23
23
|
/** Engine identifier — useful for diagnostics. */
|
|
24
24
|
get name() { return 'exceljs'; }
|
|
25
25
|
get version() {
|
|
26
|
-
try { return require('exceljs/package.json').version; } catch (_) { return 'unknown'; }
|
|
26
|
+
try { return require('@protobi/exceljs/package.json').version; } catch (_) { return 'unknown'; }
|
|
27
27
|
}
|
|
28
28
|
|
|
29
29
|
/**
|
package/lib/redactWorkbook.js
CHANGED
|
@@ -117,17 +117,36 @@ function redactSharedStringsXml(xml) {
|
|
|
117
117
|
}
|
|
118
118
|
|
|
119
119
|
// Comments: <comment><text><r>...<t>USER TEXT</t></r></text></comment>
|
|
120
|
-
// Replace every <t> payload with "x".
|
|
120
|
+
// Replace every <t> payload with "x". Also strips <author>NAME</author>
|
|
121
|
+
// display names in <authors>; the numeric authorId on each <comment>
|
|
122
|
+
// references the (now redacted) author entry.
|
|
121
123
|
function redactCommentsXml(xml) {
|
|
122
|
-
|
|
124
|
+
let out = xml.replace(/(<t\b[^>]*>)([\s\S]*?)(<\/t>)/g, (m, open, payload, close) => {
|
|
125
|
+
return open + (payload === '' ? '' : 'x') + close;
|
|
126
|
+
});
|
|
127
|
+
out = out.replace(/(<author\b[^>]*>)([\s\S]*?)(<\/author>)/g, (m, open, payload, close) => {
|
|
123
128
|
return open + (payload === '' ? '' : 'x') + close;
|
|
124
129
|
});
|
|
130
|
+
return out;
|
|
125
131
|
}
|
|
126
132
|
|
|
127
133
|
// Threaded comments: <threadedComment ... text="USER TEXT" .../>
|
|
128
|
-
// Excel encodes the body as an attribute — must redact in place.
|
|
134
|
+
// Excel encodes the body as an attribute — must redact in place. Both
|
|
135
|
+
// double-quoted and single-quoted attribute values are valid XML and we
|
|
136
|
+
// must scrub both forms.
|
|
129
137
|
function redactThreadedCommentsXml(xml) {
|
|
130
|
-
return xml.replace(/\btext="[^"]*"/g, 'text="x"');
|
|
138
|
+
return xml.replace(/\btext=("[^"]*"|'[^']*')/g, 'text="x"');
|
|
139
|
+
}
|
|
140
|
+
|
|
141
|
+
// xl/persons/person.xml — author registry for threaded comments.
|
|
142
|
+
// <person displayName="Alice" id="..." userId="alice@co.com" providerId="AzureAD"/>
|
|
143
|
+
// Strip the three identifying attributes; leave id (a UUID) so threaded comment
|
|
144
|
+
// authorId references still resolve.
|
|
145
|
+
function redactPersonsXml(xml) {
|
|
146
|
+
return xml
|
|
147
|
+
.replace(/\bdisplayName="[^"]*"/g, 'displayName="x"')
|
|
148
|
+
.replace(/\buserId="[^"]*"/g, 'userId="x"')
|
|
149
|
+
.replace(/\bproviderId="[^"]*"/g, 'providerId="x"');
|
|
131
150
|
}
|
|
132
151
|
|
|
133
152
|
// docProps/core.xml — strip author, title, subject, description, keywords,
|
|
@@ -196,9 +215,40 @@ function redactAppXml(xml) {
|
|
|
196
215
|
function redactCustomPropsXml(xml) {
|
|
197
216
|
// Custom property values live inside <vt:*> typed-value elements.
|
|
198
217
|
// Replace their inner text with empty string (preserves type nodes).
|
|
199
|
-
|
|
218
|
+
//
|
|
219
|
+
// The character class includes digits so OOXML numeric type names
|
|
220
|
+
// (vt:r4, vt:r8, vt:i1/i2/i4/i8, vt:ui1/ui2/ui4/ui8, vt:filetime) match.
|
|
221
|
+
// The \2 backreference forces the open and close tag names to match,
|
|
222
|
+
// so a payload that contains nested elements (e.g.
|
|
223
|
+
// <vt:variant><vt:lpwstr>X</vt:lpwstr></vt:variant>) doesn't produce
|
|
224
|
+
// mangled output. The inner text class [^<] keeps the match strictly
|
|
225
|
+
// text-only; the outer wrapper is left structurally intact and any
|
|
226
|
+
// nested vt:* elements get scrubbed by the same regex on overlapping
|
|
227
|
+
// passes.
|
|
228
|
+
return xml.replace(/(<(vt:[a-zA-Z0-9]+)\b[^>]*>)[^<]*(<\/\2>)/g, '$1$3');
|
|
200
229
|
}
|
|
201
230
|
|
|
231
|
+
// 1×1 transparent PNG — minimum valid PNG bytes. Used as a safe placeholder
|
|
232
|
+
// when stripping xl/media/ binary blobs so the ZIP remains structurally valid
|
|
233
|
+
// and drawing relationships don't point to missing entries.
|
|
234
|
+
// (67 bytes: PNG sig + IHDR + IDAT with one transparent pixel + IEND)
|
|
235
|
+
const TRANSPARENT_1X1_PNG = Buffer.from(
|
|
236
|
+
'89504e470d0a1a0a' + // PNG signature
|
|
237
|
+
'0000000d49484452' + // IHDR length + type
|
|
238
|
+
'00000001' + // width = 1
|
|
239
|
+
'00000001' + // height = 1
|
|
240
|
+
'08060000' + // 8-bit RGBA
|
|
241
|
+
'001f15c4' + // IHDR CRC
|
|
242
|
+
'89' + // IHDR chunk footer padding
|
|
243
|
+
'0000000a49444154' + // IDAT length + type
|
|
244
|
+
'789c6260' + // zlib header + deflate block
|
|
245
|
+
'0000000200' + // deflate end
|
|
246
|
+
'01e221bc33' + // IDAT CRC
|
|
247
|
+
'0000000049454e44' + // IEND length + type
|
|
248
|
+
'ae426082', // IEND CRC
|
|
249
|
+
'hex',
|
|
250
|
+
);
|
|
251
|
+
|
|
202
252
|
async function exportRedactedWorkbook(inputPath, outputPath) {
|
|
203
253
|
if (!fs.existsSync(inputPath)) {
|
|
204
254
|
throw new Error(`File not found: ${inputPath}`);
|
|
@@ -238,6 +288,14 @@ async function exportRedactedWorkbook(inputPath, outputPath) {
|
|
|
238
288
|
} else if (/^docProps\/custom\.xml$/i.test(name)) {
|
|
239
289
|
const xml = await file.async('string');
|
|
240
290
|
zip.file(name, redactCustomPropsXml(xml));
|
|
291
|
+
} else if (/^xl\/persons\/person\.xml$/i.test(name)) {
|
|
292
|
+
const xml = await file.async('string');
|
|
293
|
+
zip.file(name, redactPersonsXml(xml));
|
|
294
|
+
} else if (/^xl\/media\//i.test(name)) {
|
|
295
|
+
// Embedded images / media — replace with a 1×1 transparent PNG so
|
|
296
|
+
// drawing relationships remain valid and the ZIP is structurally intact,
|
|
297
|
+
// but no user-supplied binary data survives in the output.
|
|
298
|
+
zip.file(name, TRANSPARENT_1X1_PNG);
|
|
241
299
|
}
|
|
242
300
|
// All other parts pass through untouched.
|
|
243
301
|
}
|
|
@@ -258,7 +316,11 @@ module.exports = {
|
|
|
258
316
|
// exported for unit testing
|
|
259
317
|
_redactSheetXml: redactSheetXml,
|
|
260
318
|
_redactSharedStringsXml: redactSharedStringsXml,
|
|
319
|
+
_redactCommentsXml: redactCommentsXml,
|
|
320
|
+
_redactThreadedCommentsXml: redactThreadedCommentsXml,
|
|
261
321
|
_redactCoreXml: redactCoreXml,
|
|
262
322
|
_redactAppXml: redactAppXml,
|
|
263
323
|
_redactCustomPropsXml: redactCustomPropsXml,
|
|
324
|
+
_redactPersonsXml: redactPersonsXml,
|
|
325
|
+
_TRANSPARENT_1X1_PNG: TRANSPARENT_1X1_PNG,
|
|
264
326
|
};
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "xlsx-for-ai",
|
|
3
|
-
"version": "1.5.1",
|
|
3
|
+
"version": "1.5.3",
|
|
4
4
|
"description": "CLI that converts .xlsx files into rich text or JSON dumps that AI coding agents (Claude, Cursor, Copilot, ChatGPT, etc.) can read — preserving values, formulas, formatting, colors, column widths, frozen panes, named ranges, tables, and more.",
|
|
5
5
|
"main": "index.js",
|
|
6
6
|
"bin": {
|
|
@@ -17,7 +17,7 @@
|
|
|
17
17
|
"LICENSE"
|
|
18
18
|
],
|
|
19
19
|
"scripts": {
|
|
20
|
-
"test": "node --test test/round-trip.test.js test/output-matrix.test.js test/unit/*.test.js tests/telemetry-sanitize.test.js tests/telemetry-config.test.js tests/telemetry-consent-version.test.js tests/telemetry-flags.test.js"
|
|
20
|
+
"test": "node --test test/round-trip.test.js test/output-matrix.test.js test/unit/*.test.js tests/telemetry-sanitize.test.js tests/telemetry-config.test.js tests/telemetry-consent-version.test.js tests/telemetry-flags.test.js tests/redactWorkbook-leak-check.test.js"
|
|
21
21
|
},
|
|
22
22
|
"keywords": [
|
|
23
23
|
"xlsx",
|