@mkterswingman/5mghost-wonder 0.0.6 → 0.0.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,171 @@
1
+ // src/xlsx/format.ts
2
+ // Convert Excel number values to display strings using their numFmt format
3
+ // codes. Currently handles date / time / datetime formats — the most common
4
+ // source of "?" placeholders in WeCom xlsx output (date columns, time slots).
5
+ //
6
+ // Excel serial date semantics: integer = days since 1899-12-30, fractional
7
+ // part = time-of-day fraction. There is a legacy "1900 leap-year bug" where
8
+ // serial 60 represents the (fictional) 1900-02-29; standard practice is to
9
+ // pin the epoch to 1899-12-30 so the off-by-one cancels out for all serials
10
+ // >= 61. Serials < 61 are rare in real workbooks and are left to fall back
11
+ // to numeric formatting; the WeCom date columns we target are all in 2025+.
12
// Number of milliseconds in one calendar day (ms * s * min * h).
const DAY_MS = 1000 * 60 * 60 * 24;
// Excel serial 0 expressed as a UTC timestamp: 1899-12-30T00:00:00Z.
// (JavaScript months are 0-based, so 11 is December.)
const EPOCH_UTC = Date.UTC(1899, 11, 30, 0, 0, 0, 0);
14
/**
 * Classify an Excel number-format code by the date/time tokens it contains.
 *
 * Literal text never counts as a token:
 *   - double-quoted runs (`"年"`, `" days"`) — everything up to the next
 *     quote is literal, including backslashes;
 *   - the character following a backslash escape (`\.`);
 *   - the character following `_` (width-padding directive, e.g. `_)`);
 *   - bracketed sections such as colours (`[Red]`), locale / currency tags
 *     (`[$-409]`) and conditions (`[>100]`).
 * The one exception is an elapsed-time bracket (`[h]`, `[mm]`, `[ss]`),
 * which is a time token.
 *
 * Date tokens: y, d, and m when it is a month. Time tokens: h, s, the
 * meridiem markers `AM/PM` / `A/P`, and m when it is a minute. `m` counts
 * as minutes when the nearest letter before it is `h` or the nearest letter
 * after it is `s` (see findAdjacentLetter); otherwise it is a month.
 *
 * @param {string} format Number-format code, e.g. `yyyy-mm-dd` or `h:mm`.
 * @returns {{hasDate: boolean, hasTime: boolean}} Which token families
 *   were found. Both are false for non-date formats and non-string input.
 */
export function classifyFormat(format) {
    let hasDate = false;
    let hasTime = false;
    if (typeof format !== "string")
        return { hasDate, hasTime };
    // Pass 1: reduce the format to the characters that can act as tokens.
    const toks = [];
    let pos = 0;
    while (pos < format.length) {
        const ch = format[pos];
        if (ch === '"') {
            // Quoted literal: skip to the closing quote (or to the end of the
            // string when the quote is never closed).
            const close = format.indexOf('"', pos + 1);
            pos = close === -1 ? format.length : close + 1;
            continue;
        }
        if (ch === "\\" || ch === "_") {
            // Escape / padding directive: the next character is literal.
            pos += 2;
            continue;
        }
        if (ch === "[") {
            const close = format.indexOf("]", pos + 1);
            if (close !== -1) {
                const inner = format.slice(pos + 1, close).toLowerCase();
                if (/^(?:h+|m+|s+)$/.test(inner)) {
                    // Elapsed-time token. Keep an h/s marker in the token
                    // stream so a following `mm` still resolves to minutes;
                    // a bracketed m must not be re-read as a month.
                    hasTime = true;
                    if (inner[0] !== "m")
                        toks.push(inner[0]);
                }
                // Anything else in brackets (colour, locale, condition) is
                // metadata, never a date/time token.
                pos = close + 1;
                continue;
            }
            // Unterminated `[`: fall through and treat it as a plain character.
        }
        toks.push(ch);
        pos += 1;
    }
    // Pass 2: find meridiem markers. Their letters are time markers and must
    // not be read as month/minute tokens in pass 3. Longest form first;
    // `AM/P` is kept only for backward compatibility with earlier releases.
    const meridiemForms = ["AM/PM", "AM/P", "A/P"];
    const meridiemIdx = new Set();
    for (let k = 0; k < toks.length; k++) {
        if (toks[k].toLowerCase() !== "a")
            continue;
        const tail = toks.slice(k, k + 5).join("").toUpperCase();
        const form = meridiemForms.find((f) => tail.startsWith(f));
        if (form === undefined)
            continue;
        hasTime = true;
        for (let j = k; j < k + form.length; j++)
            meridiemIdx.add(j);
    }
    // Pass 3: classify the remaining letters.
    for (let k = 0; k < toks.length; k++) {
        if (meridiemIdx.has(k))
            continue;
        const lower = toks[k].toLowerCase();
        if (lower === "y" || lower === "d") {
            hasDate = true;
        }
        else if (lower === "h" || lower === "s") {
            hasTime = true;
        }
        else if (lower === "m") {
            // Ambiguous: minutes when preceded by h or followed by s,
            // otherwise month.
            const before = findAdjacentLetter(toks, k, -1, meridiemIdx);
            const after = findAdjacentLetter(toks, k, 1, meridiemIdx);
            const afterHour = before !== null && before.toLowerCase() === "h";
            const beforeSecond = after !== null && after.toLowerCase() === "s";
            if (afterHour || beforeSecond)
                hasTime = true;
            else
                hasDate = true;
        }
    }
    return { hasDate, hasTime };
}
/**
 * Find the nearest letter that can disambiguate an `m` token.
 *
 * Starting next to index `from` and moving by `step` (-1 or +1), skips
 * indices listed in `skip`, other `m` characters, `:` and spaces/tabs.
 * Returns the first ASCII letter reached, or null when a hard separator
 * (e.g. `/`, `-`) or the end of the token list comes first — so `mm/yyyy`
 * keeps `m` as month while `h:mm:ss` and `h mm` keep it as minute.
 *
 * @param {string[]} toks Token characters produced by classifyFormat.
 * @param {number} from Index of the `m` being classified.
 * @param {number} step Direction of travel: -1 (backwards) or +1 (forwards).
 * @param {Set<number>} skip Indices to step over (meridiem marker letters).
 * @returns {string|null} The letter found, or null.
 */
function findAdjacentLetter(toks, from, step, skip) {
    for (let i = from + step; i >= 0 && i < toks.length; i += step) {
        if (skip.has(i))
            continue;
        const ch = toks[i];
        if (ch.toLowerCase() === "m" || ch === ":" || ch === " " || ch === "\t")
            continue;
        // First significant character decides: a letter is the answer,
        // anything else is a hard separator that ends the search.
        return /[a-zA-Z]/.test(ch) ? ch : null;
    }
    return null;
}
135
/** Left-pad a number below 10 with a single "0"; other values are stringified as-is. */
function pad2(n) {
    const digits = String(n);
    if (n < 10) {
        return `0${digits}`;
    }
    return digits;
}
138
/**
 * Turn an Excel serial number into a fixed-layout display string.
 *
 * The layout depends only on `kind`, not on the cell's actual format code:
 *   - date and time  -> `YYYY-MM-DD HH:MM`
 *   - date only      -> `YYYY-MM-DD`
 *   - time only      -> `HH:MM`
 *   - neither        -> `String(value)`
 * The aim is a readable date for the consumer rather than an exact
 * reproduction of Excel's rendering. All fields are read in UTC, so the
 * output does not depend on the host time zone.
 *
 * @param {number} value Excel serial (whole days plus day fraction).
 * @param {{hasDate: boolean, hasTime: boolean}} kind Result of classifyFormat.
 * @returns {string} The rendered text.
 */
export function formatSerial(value, kind) {
    const { hasDate, hasTime } = kind;
    if (!hasDate && !hasTime) {
        return String(value);
    }
    // Offset from the serial epoch, rounded to the nearest millisecond.
    const when = new Date(EPOCH_UTC + Math.round(value * DAY_MS));
    const datePart = [
        when.getUTCFullYear(),
        pad2(when.getUTCMonth() + 1), // getUTCMonth() is 0-based
        pad2(when.getUTCDate()),
    ].join("-");
    const timePart = [pad2(when.getUTCHours()), pad2(when.getUTCMinutes())].join(":");
    if (hasDate && hasTime) {
        return `${datePart} ${timePart}`;
    }
    return hasDate ? datePart : timePart;
}
161
/**
 * Convenience wrapper: render `value` as date/time text when `format` is a
 * date/time number format.
 *
 * Returns undefined — meaning "keep the raw value" — when:
 *   - `value` is not a finite number (e.g. a string or boolean cell that
 *     merely carries a date style); rendering it would yield "NaN-NaN-NaN";
 *   - `value` is negative, which has no date/time meaning as a serial;
 *   - `format` contains no date/time tokens;
 *   - the format shows a date and the serial is below 61, where the fixed
 *     1899-12-30 epoch is off by one day because of Excel's fictional
 *     1900-02-29. Time-only formats are unaffected by this rule.
 *
 * @param {*} value Raw cell value; only finite numbers are rendered.
 * @param {string} format Number-format code of the cell's style.
 * @returns {string|undefined} Rendered text, or undefined to fall back.
 */
export function renderNumberByFormat(value, format) {
    if (typeof value !== "number" || !Number.isFinite(value) || value < 0)
        return undefined;
    const kind = classifyFormat(format);
    if (!kind.hasDate && !kind.hasTime)
        return undefined;
    // Dates before 1900-03-01 cannot be rendered correctly from this epoch.
    if (kind.hasDate && value < 61)
        return undefined;
    return formatSerial(value, kind);
}
@@ -99,6 +99,31 @@ export async function parseTab(xlsxPath, tabName, saveDir) {
99
99
  if (m.endCol > maxCol)
100
100
  maxCol = m.endCol;
101
101
  }
102
+ // Step 7c.5: annotate top-left anchor cells with their merge span. We do
103
+ // this after image merging so even cells that exist only because of an
104
+ // image anchor still receive the span if they happen to be a merge anchor
105
+ // (rare but possible — e.g. a merged cell whose only content is a picture).
106
+ for (const m of sheetData.merges) {
107
+ const rows = m.endRow - m.startRow + 1;
108
+ const cols = m.endCol - m.startCol + 1;
109
+ if (rows === 1 && cols === 1)
110
+ continue; // not really a merge
111
+ const key = cellKey(m.startRow, m.startCol);
112
+ const anchor = cellMap.get(key);
113
+ if (anchor) {
114
+ anchor.mergeSpan = { rows, cols };
115
+ }
116
+ else {
117
+ // Anchor has no content of its own; create a stub so the merge span
118
+ // is still surfaced. Without this, an empty merged-anchor cell would
119
+ // be invisible in cells[] even though it represents a 4-row block.
120
+ cellMap.set(key, {
121
+ row: m.startRow,
122
+ col: m.startCol,
123
+ mergeSpan: { rows, cols },
124
+ });
125
+ }
126
+ }
102
127
  // Step 7d: sort cells row-asc, col-asc
103
128
  const cells = Array.from(cellMap.values()).sort((a, b) => a.row !== b.row ? a.row - b.row : a.col - b.col);
104
129
  return {
@@ -12,6 +12,7 @@
12
12
  // still injected: `sharedStrings[]` from shared-strings.ts and
13
13
  // `getFormatCode()` from styles.ts.
14
14
  import { SaxesParser } from "saxes";
15
+ import { renderNumberByFormat } from "./format.js";
15
16
  // ---------------------------------------------------------------------------
16
17
  // Cell ref helpers (pure functions, reused)
17
18
  // ---------------------------------------------------------------------------
@@ -207,8 +208,15 @@ function buildCell(ref, cType, cStyleIdx, vRaw, isText, sharedStrings, getFormat
207
208
  const cell = { row, col, value };
208
209
  if (cStyleIdx >= 0) {
209
210
  const format = getFormatCode(cStyleIdx);
210
- if (format !== undefined)
211
+ if (format !== undefined) {
211
212
  cell.format = format;
213
+ // Render dates/times to a readable text so consumers don't have
214
+ // to translate Excel serial numbers themselves. Other formats
215
+ // (currency, percentages, etc.) are left numeric.
216
+ const rendered = renderNumberByFormat(value, format);
217
+ if (rendered !== undefined)
218
+ cell.text = rendered;
219
+ }
212
220
  }
213
221
  return cell;
214
222
  }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@mkterswingman/5mghost-wonder",
3
- "version": "0.0.6",
3
+ "version": "0.0.8",
4
4
  "description": "企微文档读取 CLI — WeCom document reader",
5
5
  "type": "module",
6
6
  "engines": {
@@ -25,7 +25,7 @@
25
25
  "scripts": {
26
26
  "build": "rm -rf dist && tsc && chmod +x dist/cli.js",
27
27
  "typecheck": "tsc --noEmit",
28
- "test": "node dist/wecom/url.test.js && node --test tests/sheet-parity.test.mjs && node --test tests/export-sanitize.test.mjs",
28
+ "test": "node dist/wecom/url.test.js && node --test tests/sheet-parity.test.mjs && node --test tests/export-sanitize.test.mjs && node --test tests/format.test.mjs",
29
29
  "smoke": "npm run build && node dist/cli.js help > /dev/null",
30
30
  "postinstall": "node scripts/postinstall.mjs"
31
31
  },
@@ -96,6 +96,18 @@ Output is a structured JSON:
96
96
 
97
97
  Consume this JSON directly to answer the user's question.
98
98
 
99
+ ### ⚠️ Merged cells — always check `mergeSpan`
100
+
101
+ A cell may carry `mergeSpan: {rows, cols}` when it is the top-left anchor of a merged range. The cell's content is shared across `rows × cols` grid positions starting at `(row, col)`. **When the user asks about a cell's content, always factor `mergeSpan` into the answer.**
102
+
103
+ Concrete example: in a half-hour-grid time table, a `mergeSpan: {rows: 4, cols: 1}` anchored at `(row=39, col=40)` saying "★直播 + Poach" means the activity actually occupies **4 half-hour slots = 2 hours**, not 30 minutes. Quoting only the anchor row's time would mislead the user.
104
+
105
+ The complete merge list is also available in the top-level `merges[]` array if you need to reason about every merge in the sheet.
106
+
107
+ ### ⚠️ Date / time cells — read `text`, not `value`
108
+
109
+ Number cells with date or time formats (e.g. `m"月"d"日"`, `h:mm`) are rendered into a readable `text` string (`"2026-04-30"`, `"12:00"`) alongside the raw `value` and `format`. **Always use `text` when reporting dates and times to the user.** The raw `value` is an Excel serial number (days since 1899-12-30) and is meaningless to humans.
110
+
99
111
  ### ⚠️ Row/column indexing — JSON is 0-based, Excel is 1-based
100
112
 
101
113
  In wonder's JSON output, `row` and `col` (and `startRow`/`endRow`/`startCol`/`endCol` in `merges`) are **0-based**. Excel addresses the user sees in the WeCom UI are **1-based**.
@@ -118,19 +130,20 @@ Read("/Users/<you>/Downloads/5mghost-wonder/media/image3.png")
118
130
 
119
131
  **Note:** Images are full-resolution originals (up to several MB each). Only load images the user specifically asks about.
120
132
 
121
- ### Viewing visual layout (optional)
133
+ ### Viewing visual layout — required when colour or layout carries meaning
134
+
135
+ The JSON does not carry cell colours, font colours, or rendered borders. When colour or position is part of the answer, you cannot recover it from JSON — you must read the rendered sheet.
122
136
 
123
- Use when the cell JSON alone can't answer the question because the sheet's meaning comes from **visual structure** — not from the cell values themselves. Typical signals:
137
+ Render to PDF and Read it whenever any of these are true:
124
138
 
125
- - Gantt chart (date columns × task rows, coloured blocks across cell ranges)
126
- - Calendar (week grid with merged day cells or coloured categories)
127
- - Status board / roadmap (colour-coded cells indicating stage, owner, priority)
128
- - Large merge-to-cell ratio in the JSON (`merges.length` is a non-trivial fraction of `cells.length`)
129
- - User explicitly asks about "how it looks", "颜色", "排版", "这个图表", "这张表的结构"
139
+ - The user asks about "how it looks", "颜色", "排版", "这个图表", "这张表的结构", or refers to a visible highlight
140
+ - The sheet is a gantt chart, calendar, status board, or roadmap (colour = stage / owner / priority / "this week")
141
+ - A column or row in the user's question is highlighted (yellow / red / green) in the WeCom UI
142
+ - The merge-to-cell ratio in the JSON is non-trivial (e.g. `merges.length / cells.length > 0.1`) — likely a layout-driven sheet
130
143
 
131
- Do **not** run render for plain data tables, lookup sheets, or when the user just wants a value. The render costs ~30 s and ~10+ MB of PDF per file.
144
+ You cannot detect colour from JSON alone, so when in doubt about a sheet that mixes data with visual cues, render. The cost is ~30 s and ~10 MB; the cost of guessing wrong is worse.
132
145
 
133
- Render the whole xlsx (one PDF page per tab, preserves layout, merges, fills, borders):
146
+ Render command (one PDF page per tab, preserves layout, merges, fills, borders):
134
147
 
135
148
  ```bash
136
149
  soffice --headless \
@@ -140,6 +153,8 @@ soffice --headless \
140
153
 
141
154
  Then use the Read tool on the generated PDF. Page N corresponds to the Nth tab in workbook order (same as `tabs[]` in the metadata output).
142
155
 
156
+ Skip rendering only when the user clearly wants a single cell value or a numeric lookup from a plain data table.
157
+
143
158
  ---
144
159
 
145
160
  ## docx Workflow (`doc/w3_`, `doc/e2_`)
@@ -196,15 +211,17 @@ Output:
196
211
  { "type": "slide", "path": "/Users/<you>/Downloads/5mghost-wonder/filename.pptx" }
197
212
  ```
198
213
 
199
- ### Step 2 — Read content
214
+ ### Step 2 — Always extract both text and visual layout
200
215
 
201
- **Read text** (recommended first step):
216
+ For pptx, run **both** extractions every time. Most WeCom slides are layout-driven (timelines, image collages, status boards, recap pages) — pure text loses critical meaning, and pure-PDF visual reading can mis-OCR text that pandoc captures cleanly. You cannot tell a "complex" slide from a "simple" slide without first looking at it, so don't try to decide; just run both and use whichever the question calls for.
217
+
218
+ **1. Text** (for exact wording, fast keyword scanning, copy-quoting):
202
219
 
203
220
  ```bash
204
221
  pandoc <path> -o /tmp/wonder-slide-output.md && cat /tmp/wonder-slide-output.md
205
222
  ```
206
223
 
207
- **View slide layout**:
224
+ **2. Visual layout** (for image-text relationships, timelines, colour, embedded screenshots whose text pandoc cannot reach — e.g. Korean / Japanese chat captures):
208
225
 
209
226
  ```bash
210
227
  soffice --headless --convert-to pdf --outdir /tmp/ <path>
@@ -212,7 +229,11 @@ soffice --headless --convert-to pdf --outdir /tmp/ <path>
212
229
 
213
230
  Then use the Read tool on the generated PDF.
214
231
 
215
- **Access embedded images**:
232
+ When answering, combine: lean on the PDF for "what's on the slide and how it's organised", lean on the markdown for exact-wording quotes. Don't answer from text-only when a slide visibly relies on layout — the user will spot the gap immediately.
233
+
234
+ If `soffice` is not installed (`wonder check` reports it as optional/missing), fall back to pandoc-only and tell the user upfront that visual cues, embedded screenshot text, and image-text relationships will be missing from your answer.
235
+
236
+ ### Optional: access embedded images directly
216
237
 
217
238
  ```bash
218
239
  mkdir -p /tmp/wonder-pptx-unpack && cp <path> /tmp/wonder-pptx-unpack/slide.zip && unzip -o /tmp/wonder-pptx-unpack/slide.zip -d /tmp/wonder-pptx-unpack/
@@ -241,7 +262,8 @@ Then use Read tool on files in `/tmp/wonder-pptx-unpack/ppt/media/`.
241
262
  | pptx slice crash | `prs.slides[:N]` → `AttributeError: 'list' object has no attribute 'rId'` | Use `for slide in prs.slides` |
242
263
  | Cookie expiry | Cookie valid for 7–30 days | Run `wonder wecom cookie` to refresh |
243
264
  | xlsx images are full-size | Original images can be up to 6 MB each | Only read images when user specifically needs them |
244
- | xlsx visual layout needs soffice | Gantt/calendar/coloured boards lose meaning in JSON alone | Run the optional soffice render step in the xlsx section; CLI does not auto-render |
265
+ | xlsx colour / visual layout | JSON has no fill colour, font colour, or rendered borders | Render to PDF (xlsx section) when colour or layout carries meaning |
266
+ | pptx layout-driven slides | Pure pandoc loses image-text relationships, timelines, embedded screenshot text (e.g. Korean chats) | pptx workflow now runs pandoc + soffice→pdf together by default |
245
267
  | smartpage unsupported | Export API returns 0% progress forever | Manual browser export |
246
268
 
247
269
  ---