npm - @mkterswingman/5mghost-wonder - Versions diffs - 0.0.6 → 0.0.8 - Mend

@mkterswingman/5mghost-wonder 0.0.6 → 0.0.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (5) hide show

package/dist/xlsx/format.js +171 -0
package/dist/xlsx/parse-tab.js +25 -0
package/dist/xlsx/sheet.js +9 -1
package/package.json +2 -2
package/skills/use-5mghost-wonder/SKILL.md +36 -14

package/dist/xlsx/format.js ADDED Viewed

@@ -0,0 +1,171 @@
+// src/xlsx/format.ts
+// Convert Excel number values to display strings using their numFmt format
+// codes. Currently handles date / time / datetime formats — the most common
+// source of "?" placeholders in WeCom xlsx output (date columns, time slots).
+//
+// Excel serial date semantics: integer = days since 1899-12-30, fractional
+// part = time-of-day fraction. There is a legacy "1900 leap-year bug" where
+// serial 60 represents the (fictional) 1900-02-29; standard practice is to
+// pin the epoch to 1899-12-30 so the off-by-one cancels out for all serials
+// >= 61. Below 61 the pinned epoch is one day behind what Excel displays;
+// such serials are rare, and the WeCom date columns we target are all 2025+.
+const DAY_MS = 24 * 60 * 60 * 1000; // milliseconds in one day
+const EPOCH_UTC = Date.UTC(1899, 11, 30); // 1899-12-30 (Date.UTC months are 0-based, so 11 = December)
+/**
+ * Detect whether a number-format code contains date/time tokens. Tokens
+ * inside double-quoted literals (e.g. `"年"`, `"月"`) and inside escaped
+ * characters (`\.`) are ignored.
+ *
+ * Date tokens: y, m, d (also full names `mm`, `mmm`, `mmmm`, `dd`, `ddd`).
+ * Time tokens: h, s, AM/PM. Note: lowercase `m` is ambiguous (month vs.
+ * minute) — we keep it simple: any unquoted `m` adjacent to `h` or `s` is
+ * still classed as date-or-time, which is fine for our purposes.
+ */
+export function classifyFormat(format) {
+    let inQuote = false;
+    let escape = false;
+    let hasDate = false;
+    let hasTime = false;
+    const toks = [];
+    for (let i = 0; i < format.length; i++) {
+        const ch = format[i];
+        if (escape) {
+            escape = false;
+            continue;
+        }
+        if (ch === "\\") {
+            escape = true;
+            continue;
+        }
+        if (ch === '"') {
+            inQuote = !inQuote;
+            continue;
+        }
+        if (inQuote)
+            continue;
+        toks.push({ ch, pos: i });
+    }
+    // Indices that belong to AM/PM literals — they are time markers and
+    // must not be treated as month/minute when we walk through tokens.
+    const ampmIdx = new Set();
+    for (let i = 0; i < toks.length; i++) {
+        const ch = toks[i].ch;
+        if (ch.toLowerCase() !== "a")
+            continue;
+        // Try AM/PM first (5 tokens), then AM/P (4 tokens).
+        const slice5 = toks
+            .slice(i, i + 5)
+            .map((t) => t.ch.toUpperCase())
+            .join("");
+        const slice4 = toks
+            .slice(i, i + 4)
+            .map((t) => t.ch.toUpperCase())
+            .join("");
+        if (slice5 === "AM/PM") {
+            hasTime = true;
+            for (let j = i; j < i + 5; j++)
+                ampmIdx.add(j);
+        }
+        else if (slice4 === "AM/P") {
+            hasTime = true;
+            for (let j = i; j < i + 4; j++)
+                ampmIdx.add(j);
+        }
+    }
+    for (let i = 0; i < toks.length; i++) {
+        if (ampmIdx.has(i))
+            continue;
+        const t = toks[i];
+        const lower = t.ch.toLowerCase();
+        if (lower === "y" || lower === "d") {
+            hasDate = true;
+        }
+        else if (lower === "h" || lower === "s") {
+            hasTime = true;
+        }
+        else if (lower === "m") {
+            // Excel's lowercase `m` is ambiguous between month and minute.
+            // Disambiguation rule (matches Excel's own behaviour): `m` is
+            // minutes when adjacent to an h/hh token (immediately before, run
+            // of m's allowed) or immediately before s/ss. Otherwise month.
+            // We search through runs of m and across simple separators
+            // (`:` is the canonical time separator) but stop at any other
+            // letter — so `AM/PM` with its `/` separator does not pollute.
+            const prev = findAdjacentLetter(toks, i, -1, ampmIdx);
+            const next = findAdjacentLetter(toks, i, 1, ampmIdx);
+            const prevH = prev != null && prev.ch.toLowerCase() === "h";
+            const nextS = next != null && next.ch.toLowerCase() === "s";
+            if (prevH || nextS)
+                hasTime = true;
+            else
+                hasDate = true;
+        }
+        // AM/PM is handled in the pre-pass above.
+    }
+    return { hasDate, hasTime };
+}
+/**
+ * Find the nearest letter token that can disambiguate an `m` (month vs.
+ * minute), scanning from index `from` in direction `step`.
+ *
+ * Skipped while scanning: indices contained in `skip` (classifyFormat
+ * passes the AM/PM marker indices), other `m` characters, the `:` time
+ * separator, and spaces / tabs. The first token that is none of those
+ * decides the result: an ASCII letter is returned, anything else (e.g.
+ * `/`, `-`) ends the search with `null`.
+ *
+ * Consequently `mm/yyyy` finds no neighbour for `m` (the slash stops the
+ * search), while in `h:mm:ss AM/PM` and `h mm` the backward search from
+ * `m` reaches `h`.
+ *
+ * @param {{ch: string, pos: number}[]} toks  Token list to scan.
+ * @param {number} from  Index of the `m` token the search starts from (not examined itself).
+ * @param {number} step  Scan direction: `1` forward, `-1` backward.
+ * @param {Set<number>} skip  Token indices to step over.
+ * @returns {{ch: string, pos: number} | null}  The letter token found, or
+ *   `null` when a hard separator or the end of the list is reached first.
+ */
+function findAdjacentLetter(toks, from, step, skip) {
+    // The nearest letter token decides month-vs-minute. Walk away from
+    // `from`, stepping over m's, time separators (`:`), and whitespace.
+    // Any other separator (e.g. `/`, `-`) ends the walk, so `mm/yyyy` keeps
+    // `m` as month while `h:mm:ss` and `h mm` keep it as minute.
+    for (let i = from + step; i >= 0 && i < toks.length; i += step) {
+        if (skip.has(i))
+            continue;
+        const ch = toks[i].ch;
+        if (ch.toLowerCase() === "m")
+            continue;
+        if (ch === ":")
+            continue;
+        if (ch === " " || ch === "\t")
+            continue;
+        if (/[a-zA-Z]/.test(ch))
+            return toks[i];
+        return null; // hard separator stops the search
+    }
+    return null;
+}
+function pad2(n) { // Render `n` as a string, prefixing "0" when it is below 10.
+    return n < 10 ? `0${n}` : String(n); // callers in this file pass month/day/hour/minute values
+}
+/**
+ * Render an Excel serial number as a display string using a sensible
+ * default — `YYYY-MM-DD` for pure dates, `HH:MM` for pure times,
+ * `YYYY-MM-DD HH:MM` for combined. We deliberately do not honour the
+ * exact format string (e.g. `m"月"d"日"`) because the goal is for the
+ * AI / user to see *some* readable date, not a pixel-perfect Excel render.
+ */
+export function formatSerial(value, kind) {
+    const ms = EPOCH_UTC + Math.round(value * DAY_MS);
+    const d = new Date(ms);
+    const Y = d.getUTCFullYear();
+    const M = pad2(d.getUTCMonth() + 1);
+    const D = pad2(d.getUTCDate());
+    const h = pad2(d.getUTCHours());
+    const m = pad2(d.getUTCMinutes());
+    if (kind.hasDate && kind.hasTime)
+        return `${Y}-${M}-${D} ${h}:${m}`;
+    if (kind.hasDate)
+        return `${Y}-${M}-${D}`;
+    if (kind.hasTime)
+        return `${h}:${m}`;
+    return String(value);
+}
+/**
+ * Convenience: returns a rendered text string when `format` is a date/time
+ * format, undefined otherwise. The caller should fall back to the raw
+ * numeric value when undefined is returned.
+ */
+export function renderNumberByFormat(value, format) {
+    const kind = classifyFormat(format);
+    if (!kind.hasDate && !kind.hasTime)
+        return undefined;
+    return formatSerial(value, kind);
+}

package/dist/xlsx/parse-tab.js CHANGED Viewed

@@ -99,6 +99,31 @@ export async function parseTab(xlsxPath, tabName, saveDir) {
         if (m.endCol > maxCol)
             maxCol = m.endCol;
     }
+    // Step 7c.5: annotate top-left anchor cells with their merge span. We do
+    // this after image merging so even cells that exist only because of an
+    // image anchor still receive the span if they happen to be a merge anchor
+    // (rare but possible — e.g. a merged cell whose only content is a picture).
+    for (const m of sheetData.merges) {
+        const rows = m.endRow - m.startRow + 1;
+        const cols = m.endCol - m.startCol + 1;
+        if (rows === 1 && cols === 1)
+            continue; // not really a merge
+        const key = cellKey(m.startRow, m.startCol);
+        const anchor = cellMap.get(key);
+        if (anchor) {
+            anchor.mergeSpan = { rows, cols };
+        }
+        else {
+            // Anchor has no content of its own; create a stub so the merge span
+            // is still surfaced. Without this, an empty merged-anchor cell would
+            // be invisible in cells[] even though it represents a 4-row block.
+            cellMap.set(key, {
+                row: m.startRow,
+                col: m.startCol,
+                mergeSpan: { rows, cols },
+            });
+        }
+    }
     // Step 7d: sort cells row-asc, col-asc
     const cells = Array.from(cellMap.values()).sort((a, b) => a.row !== b.row ? a.row - b.row : a.col - b.col);
     return {

package/dist/xlsx/sheet.js CHANGED Viewed

@@ -12,6 +12,7 @@
 // still injected: `sharedStrings[]` from shared-strings.ts and
 // `getFormatCode()` from styles.ts.
 import { SaxesParser } from "saxes";
+import { renderNumberByFormat } from "./format.js";
 // ---------------------------------------------------------------------------
 // Cell ref helpers (pure functions, reused)
 // ---------------------------------------------------------------------------
@@ -207,8 +208,15 @@ function buildCell(ref, cType, cStyleIdx, vRaw, isText, sharedStrings, getFormat
             const cell = { row, col, value };
             if (cStyleIdx >= 0) {
                 const format = getFormatCode(cStyleIdx);
-                if (format !== undefined)
+                if (format !== undefined) {
                     cell.format = format;
+                    // Render dates/times to a readable text so consumers don't have
+                    // to translate Excel serial numbers themselves. Other formats
+                    // (currency, percentages, etc.) are left numeric.
+                    const rendered = renderNumberByFormat(value, format);
+                    if (rendered !== undefined)
+                        cell.text = rendered;
+                }
             }
             return cell;
         }

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@mkterswingman/5mghost-wonder",
-  "version": "0.0.6",
+  "version": "0.0.8",
   "description": "企微文档读取 CLI — WeCom document reader",
   "type": "module",
   "engines": {
@@ -25,7 +25,7 @@
   "scripts": {
     "build": "rm -rf dist && tsc && chmod +x dist/cli.js",
     "typecheck": "tsc --noEmit",
-    "test": "node dist/wecom/url.test.js && node --test tests/sheet-parity.test.mjs && node --test tests/export-sanitize.test.mjs",
+    "test": "node dist/wecom/url.test.js && node --test tests/sheet-parity.test.mjs && node --test tests/export-sanitize.test.mjs && node --test tests/format.test.mjs",
     "smoke": "npm run build && node dist/cli.js help > /dev/null",
     "postinstall": "node scripts/postinstall.mjs"
   },

package/skills/use-5mghost-wonder/SKILL.md CHANGED Viewed

@@ -96,6 +96,18 @@ Output is a structured JSON:
 Consume this JSON directly to answer the user's question.
+### ⚠️ Merged cells — always check `mergeSpan`
+A cell may carry `mergeSpan: {rows, cols}` when it is the top-left anchor of a merged range. The cell's content is shared across `rows × cols` grid positions starting at `(row, col)`. **When the user asks about a cell's content, always factor `mergeSpan` into the answer.**
+Concrete example: in a half-hour-grid time table, a `mergeSpan: {rows: 4, cols: 1}` anchored at `(row=39, col=40)` saying "★直播 + Poach" means the activity actually occupies **4 half-hour slots = 2 hours**, not 30 minutes. Quoting only the anchor row's time would mislead the user.
+The complete merge list is also available in the top-level `merges[]` array if you need to reason about every merge in the sheet.
+### ⚠️ Date / time cells — read `text`, not `value`
+Number cells with date or time formats (e.g. `m"月"d"日"`, `h:mm`) are rendered into a readable `text` string (`"2026-04-30"`, `"12:00"`) alongside the raw `value` and `format`. **Always use `text` when reporting dates and times to the user.** The raw `value` is an Excel serial number (days since 1899-12-30) and is meaningless to humans.
 ### ⚠️ Row/column indexing — JSON is 0-based, Excel is 1-based
 In wonder's JSON output, `row` and `col` (and `startRow`/`endRow`/`startCol`/`endCol` in `merges`) are **0-based**. Excel addresses the user sees in the WeCom UI are **1-based**.
@@ -118,19 +130,20 @@ Read("/Users/<you>/Downloads/5mghost-wonder/media/image3.png")
 **Note:** Images are full-resolution originals (up to several MB each). Only load images the user specifically asks about.
-### Viewing visual layout (optional)
+### Viewing visual layout — required when colour or layout carries meaning
+The JSON does not carry cell colours, font colours, or rendered borders. When colour or position is part of the answer, you cannot recover it from JSON — you must read the rendered sheet.
-Use when the cell JSON alone can't answer the question because the sheet's meaning comes from **visual structure** — not from the cell values themselves. Typical signals:
+Render to PDF and Read it whenever any of these are true:
-- Gantt chart (date columns × task rows, coloured blocks across cell ranges)
-- Calendar (week grid with merged day cells or coloured categories)
-- Status board / roadmap (colour-coded cells indicating stage, owner, priority)
-- Large merge-to-cell ratio in the JSON (`merges.length` is a non-trivial fraction of `cells.length`)
-- User explicitly asks about "how it looks", "颜色", "排版", "这个图表", "这张表的结构"
+- The user asks about "how it looks", "颜色", "排版", "这个图表", "这张表的结构", or refers to a visible highlight
+- The sheet is a gantt chart, calendar, status board, or roadmap (colour = stage / owner / priority / "this week")
+- A column or row in the user's question is highlighted (yellow / red / green) in the WeCom UI
+- The merge-to-cell ratio in the JSON is non-trivial (e.g. `merges.length / cells.length > 0.1`) — likely a layout-driven sheet
-Do **not** run render for plain data tables, lookup sheets, or when the user just wants a value. The render costs ~30 s and ~10+ MB of PDF per file.
+You cannot detect colour from JSON alone, so when in doubt about a sheet that mixes data with visual cues, render. The cost is ~30 s and ~10 MB; the cost of guessing wrong is worse.
-Render the whole xlsx (one PDF page per tab, preserves layout, merges, fills, borders):
+Render command (one PDF page per tab, preserves layout, merges, fills, borders):
 ```bash
 soffice --headless \
@@ -140,6 +153,8 @@ soffice --headless \
 Then use the Read tool on the generated PDF. Page N corresponds to the Nth tab in workbook order (same as `tabs[]` in the metadata output).
+Skip rendering only when the user clearly wants a single cell value or a numeric lookup from a plain data table.
 ---
 ## docx Workflow (`doc/w3_`, `doc/e2_`)
@@ -196,15 +211,17 @@ Output:
 { "type": "slide", "path": "/Users/<you>/Downloads/5mghost-wonder/filename.pptx" }
 ```
-### Step 2 — Read content
+### Step 2 — Always extract both text and visual layout
-**Read text** (recommended first step):
+For pptx, run **both** extractions every time. Most WeCom slides are layout-driven (timelines, image collages, status boards, recap pages) — pure text loses critical meaning, and pure-PDF visual reading can mis-OCR text that pandoc captures cleanly. You cannot tell a "complex" slide from a "simple" slide without first looking at it, so don't try to decide; just run both and use whichever the question calls for.
+**1. Text** (for exact wording, fast keyword scanning, copy-quoting):
 ```bash
 pandoc <path> -o /tmp/wonder-slide-output.md && cat /tmp/wonder-slide-output.md
 ```
-**View slide layout**:
+**2. Visual layout** (for image-text relationships, timelines, colour, embedded screenshots whose text pandoc cannot reach — e.g. Korean / Japanese chat captures):
 ```bash
 soffice --headless --convert-to pdf --outdir /tmp/ <path>
@@ -212,7 +229,11 @@ soffice --headless --convert-to pdf --outdir /tmp/ <path>
 Then use the Read tool on the generated PDF.
-**Access embedded images**:
+When answering, combine: lean on the PDF for "what's on the slide and how it's organised", lean on the markdown for exact-wording quotes. Don't answer from text-only when a slide visibly relies on layout — the user will spot the gap immediately.
+If `soffice` is not installed (`wonder check` reports it as optional/missing), fall back to pandoc-only and tell the user upfront that visual cues, embedded screenshot text, and image-text relationships will be missing from your answer.
+### Optional: access embedded images directly
 ```bash
 mkdir -p /tmp/wonder-pptx-unpack && cp <path> /tmp/wonder-pptx-unpack/slide.zip && unzip -o /tmp/wonder-pptx-unpack/slide.zip -d /tmp/wonder-pptx-unpack/
@@ -241,7 +262,8 @@ Then use Read tool on files in `/tmp/wonder-pptx-unpack/ppt/media/`.
 | pptx slice crash | `prs.slides[:N]` → `AttributeError: 'list' object has no attribute 'rId'` | Use `for slide in prs.slides` |
 | Cookie expiry | Cookie valid for 7–30 days | Run `wonder wecom cookie` to refresh |
 | xlsx images are full-size | Original images can be up to 6 MB each | Only read images when user specifically needs them |
-| xlsx visual layout needs soffice | Gantt/calendar/coloured boards lose meaning in JSON alone | Run the optional soffice render step in the xlsx section; CLI does not auto-render |
+| xlsx colour / visual layout | JSON has no fill colour, font colour, or rendered borders | Render to PDF (xlsx section) when colour or layout carries meaning |
+| pptx layout-driven slides | Pure pandoc loses image-text relationships, timelines, embedded screenshot text (e.g. Korean chats) | pptx workflow now runs pandoc + soffice→pdf together by default |
 | smartpage unsupported | Export API returns 0% progress forever | Manual browser export |
 ---