npm - @fuguejs/xlsx - Versions diffs - 0.1.0 - Mend

@fuguejs/xlsx 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (3) hide show

package/README.md ADDED Viewed

@@ -0,0 +1,53 @@
+# @fuguejs/xlsx
+Pure workbook parsing for Fugue DAGs. `parseWorkbook` turns `.xlsx` bytes into
+Zod-validated typed rows. It is a pure function (deterministic, no I/O) — fetching
+the bytes is a [`documents`](../document-source) capability concern
+([`@fuguejs/fs`](../adapter-fs), [`@fuguejs/ms-graph`](../adapter-ms-graph)); parsing
+stays here so it is fixture-testable and provider-agnostic (ADR-0052).
+- **AI/usage guide:** [`docs/llm-document-source.md`](../../docs/llm-document-source.md)
+## Usage
+```ts
+import { z } from "zod";
+import { parseWorkbook } from "@fuguejs/xlsx";
+const RowSchema = z.object({ customerId: z.string(), revenue: z.coerce.number() });
+// inside a createFetchNode `fetch`, after ctx.documents.getContent(ref):
+const parsed = await parseWorkbook(bytes, RowSchema);
+//    Result<{ rows: readonly { customerId: string; revenue: number }[] }, FrameworkError>
+```
+## API
+```ts
+parseWorkbook<T>(
+  bytes: Uint8Array,
+  rowSchema: z.ZodType<T>,
+  opts?: { sheet?: string | number; headerRow?: number },  // default: first sheet, header row 1
+): Promise<Result<{ rows: readonly T[] }, FrameworkError>>
+normalizeCell(value: unknown): string | number | boolean | Date | null  // exported for testing
+```
+- Rows are objects keyed by the header-row cells.
+- Cells are normalised to primitives: formula → its result, rich text /
+  hyperlink → text, error cells → `null`, dates kept as `Date`. Use
+  `z.coerce.*` for columns stored as text. Fully-blank rows are skipped.
+## Errors
+| Situation | `FrameworkError` |
+|---|---|
+| Bytes are not a readable workbook | non-retriable `node-crash` |
+| Requested worksheet absent | non-retriable `node-crash` |
+| Header row repeats a non-empty column name | non-retriable `node-crash` |
+| A row violates `rowSchema` | `validation` (message names the row) |
+## Tests
+Unit tests build `.xlsx` fixtures in memory (no committed binaries). An
+end-to-end test reads a real file from disk through `@fuguejs/fs` and parses it,
+proving the `getContent → parseWorkbook` path.

package/package.json ADDED Viewed

@@ -0,0 +1,39 @@
+{
+  "name": "@fuguejs/xlsx",
+  "version": "0.1.0",
+  "type": "module",
+  "main": "src/index.ts",
+  "exports": {
+    ".": "./src/index.ts"
+  },
+  "scripts": {
+    "build": "tsc",
+    "typecheck": "tsc --noEmit",
+    "test": "bun test"
+  },
+  "dependencies": {
+    "@fuguejs/framework": "0.1.0",
+    "exceljs": "^4.4.0",
+    "jszip": "^3.10.1"
+  },
+  "peerDependencies": {
+    "zod": "^4.3.6"
+  },
+  "peerDependenciesMeta": {
+    "zod": {
+      "optional": false
+    }
+  },
+  "devDependencies": {
+    "@types/bun": "latest",
+    "@fuguejs/fs": "0.1.0",
+    "zod": "^4.3.6"
+  },
+  "publishConfig": {
+    "access": "public"
+  },
+  "files": [
+    "src",
+    "!src/__tests__"
+  ]
+}

package/src/index.ts ADDED Viewed

@@ -0,0 +1,199 @@
+/**
+ * @fuguejs/xlsx — pure workbook parsing for Fugue DAGs.
+ *
+ * `parseWorkbook` turns `.xlsx` bytes into Zod-validated typed rows. It is a
+ * pure function (deterministic, no I/O) — the byte fetching is a `documents`
+ * capability concern (`@fuguejs/ms-graph`, `@fuguejs/fs`), and parsing stays here
+ * so it is fixture-testable and provider-agnostic. See ADR-0052.
+ *
+ * ## Usage
+ *
+ * ```ts
+ * import { z } from "zod";
+ * import { parseWorkbook } from "@fuguejs/xlsx";
+ *
+ * const RowSchema = z.object({ customerId: z.string(), revenue: z.coerce.number() });
+ *
+ * // inside a createFetchNode `fetch`, after ctx.documents.getContent(ref):
+ * const parsed = await parseWorkbook(bytes, RowSchema);   // Result<{ rows }, FrameworkError>
+ * ```
+ *
+ * Rows are objects keyed by the header-row cells. Cells are normalised to
+ * primitives (formula → result, rich text / hyperlink → text, dates kept as
+ * `Date`); pair numeric or date columns with `z.coerce.*` if your source stores
+ * them as text.
+ */
+import ExcelJS from "exceljs";
+import type { z } from "zod";
+import type { Result, FrameworkError } from "@fuguejs/framework";
+import { ok, err, nodeId } from "@fuguejs/framework";
+/** Node ID stamped on every error built here — parsing is a library call, not a DAG node. */
+const XLSX_NODE_ID = nodeId("xlsx-parse");
+/** Text of a caught value: an `Error` contributes its message, anything else is stringified. */
+const msg = (e: unknown): string => {
+  if (e instanceof Error) return e.message;
+  return String(e);
+};
+/** Build the non-retriable `node-crash` error carrying `message`. */
+const crashErr = (message: string): FrameworkError => {
+  return { kind: "node-crash", nodeId: XLSX_NODE_ID, message, retriability: "non-retriable" };
+};
+/** Build a `validation` error; the `path` key is present only when a path is supplied. */
+const validationErr = (message: string, path?: string): FrameworkError => {
+  if (path === undefined) return { kind: "validation", nodeId: XLSX_NODE_ID, message };
+  return { kind: "validation", nodeId: XLSX_NODE_ID, message, path };
+};
+/**
+ * Options for `parseWorkbook`. Both fields are optional; with none set the
+ * first worksheet is read and row 1 is taken as the header row.
+ */
+export interface ParseWorkbookOpts {
+  /**
+   * Worksheet to read: a name (string, looked up with `getWorksheet`) or a
+   * 1-based position in the workbook's `worksheets` list (number).
+   * Default: first sheet.
+   */
+  readonly sheet?: string | number;
+  /**
+   * 1-based row holding the column headers. Data rows are read starting on
+   * the row immediately after it. Default: 1.
+   */
+  readonly headerRow?: number;
+}
+/**
+ * Matches one `dateGroupItem` element in either XML serialisation:
+ * self-closing (`<dateGroupItem … />`) or an empty open/close pair
+ * (`<dateGroupItem …></dateGroupItem>`). Hoisted so it is compiled once.
+ */
+const DATE_GROUP_ITEM_RE = /<dateGroupItem\b[^>]*?(?:\/>|>\s*<\/dateGroupItem\s*>)/g;
+/**
+ * Strip `dateGroupItem` elements from `xl/tables/*.xml`.
+ *
+ * Real-world exports (Dynamics 365, BI tools) save date-grouped table
+ * autofilters as `dateGroupItem` nodes, which ExcelJS's table parser does not
+ * know and crashes on ("Unexpected xml node in parseOpen"). The nodes only
+ * describe a UI filter selection — never cell data — so removing them is
+ * lossless for row extraction. `ignoreNodes` can't reach them (tables are
+ * parsed outside the worksheet xform), hence the zip-level rewrite. Uses
+ * jszip, which ExcelJS already depends on.
+ *
+ * @param bytes - Raw `.xlsx` (zip) content.
+ * @returns The archive with every table part cleaned. When no table part
+ *   contained a `dateGroupItem`, the input `bytes` are returned as-is.
+ */
+const stripDateGroupItems = async (bytes: Uint8Array): Promise<Uint8Array> => {
+  // Loaded lazily: only workbooks that hit the ExcelJS crash pay for jszip.
+  const { default: JSZip } = await import("jszip");
+  const zip = await JSZip.loadAsync(bytes);
+  let changed = false;
+  for (const file of zip.file(/^xl\/tables\/.*\.xml$/)) {
+    const xml = await file.async("string");
+    const cleaned = xml.replace(DATE_GROUP_ITEM_RE, "");
+    if (cleaned !== xml) {
+      zip.file(file.name, cleaned);
+      changed = true;
+    }
+  }
+  // Nothing stripped: hand back the input instead of re-compressing an identical archive.
+  return changed ? zip.generateAsync({ type: "uint8array" }) : bytes;
+};
+/** True when `e` is an `Error` whose message mentions both "parseOpen" and "dateGroupItem". */
+const isDateGroupItemCrash = (e: unknown): boolean => {
+  if (!(e instanceof Error)) return false;
+  const text = e.message;
+  return text.includes("parseOpen") && text.includes("dateGroupItem");
+};
+/**
+ * Normalise an ExcelJS cell value to a primitive (or `null`).
+ *
+ * - `null` / `undefined` → `null`
+ * - `string` / `number` / `boolean` / `Date` → returned unchanged
+ * - rich text (`{ richText }`) → the runs' text concatenated
+ * - hyperlink (`{ text, hyperlink }`) → its text; when the text is itself a
+ *   nested value object (e.g. rich text) it is normalised recursively rather
+ *   than dropped
+ * - formula (`{ formula, result }`) → the normalised computed result
+ * - error cells (`#REF!`, `#DIV/0!`, …) and any other object → `null`
+ *
+ * @param value - Raw cell value as exposed by ExcelJS.
+ * @returns The primitive equivalent, or `null` when there is none.
+ */
+export const normalizeCell = (value: unknown): string | number | boolean | Date | null => {
+  if (value === null || value === undefined) return null;
+  if (value instanceof Date) return value;
+  if (typeof value === "string" || typeof value === "number" || typeof value === "boolean") {
+    return value;
+  }
+  if (typeof value !== "object") return null; // bigint / symbol / function: no primitive mapping
+  const v = value as Record<string, unknown>;
+  if (Array.isArray(v.richText)) {
+    return (v.richText as { text?: string }[]).map((p) => p.text ?? "").join("");
+  }
+  if (typeof v.text === "string") return v.text; // hyperlink cell with plain text
+  // Hyperlink whose display text is a nested value (typically rich text):
+  // normalise that instead of falling through to `null` and losing the text.
+  if ("hyperlink" in v && typeof v.text === "object" && v.text !== null) {
+    return normalizeCell(v.text);
+  }
+  if ("result" in v) return normalizeCell(v.result); // formula → its computed result
+  // Error cells (`{ error }`) and unrecognised object shapes carry no usable value.
+  return null;
+};
+/**
+ * Parse `.xlsx` bytes into rows validated against `rowSchema`.
+ *
+ * @param bytes - Raw `.xlsx` file content.
+ * @param rowSchema - Zod schema applied to every data row. A row is an object
+ *   keyed by the trimmed, non-empty header cells; columns with a blank header
+ *   are ignored.
+ * @param opts - Worksheet / header-row selection; see `ParseWorkbookOpts`.
+ * @returns
+ * - `node-crash` (non-retriable) when the bytes aren't a readable workbook,
+ *   the requested worksheet is absent, `opts.headerRow` is not a positive
+ *   integer, or two columns share the same non-empty header — all
+ *   deterministic, so not retried.
+ * - `validation` when a row does not match `rowSchema` (message names the
+ *   row; `path` is the first issue's path). Parsing stops at the first
+ *   failing row.
+ * - `ok({ rows })` otherwise. Rows are skipped when every cell normalises to
+ *   empty — this includes rows whose cells are all error cells (`#REF!`,
+ *   `#DIV/0!`, …), since `normalizeCell` maps those to `null`. A row mixing an
+ *   error cell with real values is kept and fails `rowSchema` validation unless
+ *   the offending column is nullable.
+ */
+export const parseWorkbook = async <T>(
+  bytes: Uint8Array,
+  rowSchema: z.ZodType<T>,
+  opts: ParseWorkbookOpts = {},
+): Promise<Result<{ rows: readonly T[] }, FrameworkError>> => {
+  const wb = new ExcelJS.Workbook();
+  // exceljs types `load` as the global Buffer; recent @types/node makes
+  // Buffer.from return Buffer<ArrayBuffer> — cast to exceljs's exact param.
+  const load = (data: Uint8Array) =>
+    wb.xlsx.load(Buffer.from(data) as unknown as Parameters<typeof wb.xlsx.load>[0]);
+  try {
+    await load(bytes);
+  } catch (e) {
+    if (!isDateGroupItemCrash(e)) {
+      return err(crashErr(`failed to parse workbook: ${msg(e)}`));
+    }
+    // Date-grouped table autofilter (Dynamics/BI exports) — strip the
+    // UI-only filter nodes and retry once. See stripDateGroupItems.
+    try {
+      await load(await stripDateGroupItems(bytes));
+    } catch (e2) {
+      return err(crashErr(`failed to parse workbook: ${msg(e2)}`));
+    }
+  }
+  const ws =
+    typeof opts.sheet === "string"
+      ? wb.getWorksheet(opts.sheet)
+      : wb.worksheets[(typeof opts.sheet === "number" ? opts.sheet : 1) - 1];
+  if (!ws) {
+    const which = opts.sheet ?? "(first)";
+    return err(crashErr(`worksheet not found: ${which}`));
+  }
+  const headerRowNum = opts.headerRow ?? 1;
+  // Rows are 1-based. Zero, negative or fractional values would address a
+  // non-existent row and silently produce an empty result, so reject them.
+  if (!Number.isInteger(headerRowNum) || headerRowNum < 1) {
+    return err(crashErr(`invalid headerRow: ${headerRowNum} (expected an integer >= 1)`));
+  }
+  const colCount = ws.columnCount;
+  const headerRow = ws.getRow(headerRowNum);
+  // Columns that carry a usable (non-blank) header, in sheet order.
+  const columns: { readonly col: number; readonly key: string }[] = [];
+  const seenHeaders = new Set<string>();
+  for (let c = 1; c <= colCount; c++) {
+    const h = normalizeCell(headerRow.getCell(c).value);
+    const key = h === null ? "" : String(h).trim();
+    // Blank headers are legitimately skipped (multiple empty columns are fine).
+    if (key === "") continue;
+    // A duplicate non-empty header would silently overwrite the earlier
+    // column when rows are keyed by header, dropping a whole column of data.
+    // Fail loudly instead.
+    if (seenHeaders.has(key)) {
+      return err(crashErr(`duplicate header column: '${key}' (header row ${headerRowNum})`));
+    }
+    seenHeaders.add(key);
+    columns.push({ col: c, key });
+  }
+  const rows: T[] = [];
+  for (let r = headerRowNum + 1; r <= ws.rowCount; r++) {
+    const row = ws.getRow(r);
+    const entries: [string, unknown][] = [];
+    let hasValue = false;
+    for (const { col, key } of columns) {
+      const val = normalizeCell(row.getCell(col).value);
+      if (val !== null && val !== "") hasValue = true;
+      entries.push([key, val]);
+    }
+    if (!hasValue) continue; // skip fully-blank rows
+    // Object.fromEntries defines own data properties, so a header literally
+    // named `__proto__` stays a normal column instead of hitting the
+    // prototype setter (which would drop the value or swap the prototype).
+    const parsed = rowSchema.safeParse(Object.fromEntries(entries));
+    if (!parsed.success) {
+      const issue = parsed.error.issues[0];
+      return err(validationErr(`row ${r}: ${parsed.error.message}`, issue?.path.join(".")));
+    }
+    rows.push(parsed.data);
+  }
+  return ok({ rows });
+};