@mkterswingman/5mghost-wonder 0.0.6 → 0.0.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,171 @@
1
+ // src/xlsx/format.ts
2
+ // Convert Excel number values to display strings using their numFmt format
3
+ // codes. Currently handles date / time / datetime formats — the most common
4
+ // source of "?" placeholders in WeCom xlsx output (date columns, time slots).
5
+ //
6
+ // Excel serial date semantics: integer = days since 1899-12-30, fractional
7
+ // part = time-of-day fraction. There is a legacy "1900 leap-year bug" where
8
+ // serial 60 represents the (fictional) 1900-02-29; standard practice is to
9
+ // pin the epoch to 1899-12-30 so the off-by-one cancels out for all serials
10
+ // >= 61. Serials < 61 are rare in real workbooks and are not special-cased:
11
+ // their date part renders one day earlier than Excel shows it (time-of-day is unaffected); the WeCom date columns we target are all in 2025+.
12
// Length of one day in milliseconds; one whole Excel serial unit is one day.
const DAY_MS = 1000 * 60 * 60 * 24;
// UTC timestamp of serial 0 (1899-12-30T00:00:00Z).
const EPOCH_UTC = Date.UTC(1899, 11, 30, 0, 0, 0, 0);
14
/**
 * Detect whether a number-format code contains date and/or time tokens.
 *
 * Characters that are literals or metadata rather than tokens are ignored:
 *   - double-quoted literals (e.g. `"年"`, `"月"`) and backslash-escaped
 *     characters (`\.`);
 *   - the single character following `_` (width padding) or `*` (fill);
 *   - bracketed sections such as colours (`[Red]`), locale / currency tags
 *     (`[$-409]`, `[$USD]`) and conditions (`[>=100]`).
 * The only bracketed sections that count are elapsed-time tokens (`[h]`,
 * `[mm]`, `[ss]`), which always mean time.
 *
 * Date tokens: y, d, and `m` when it means month.
 * Time tokens: h, s, AM/PM, and `m` when it means minute. An `m` is a
 * minute when the nearest letter before it is `h` or the nearest letter
 * after it is `s` (see findAdjacentLetter); otherwise it is a month.
 *
 * @param {string} format Number-format code (numFmt) of the cell style.
 * @returns {{ hasDate: boolean, hasTime: boolean }} Which kinds of token
 *   were found; both false means "not a date/time format".
 */
export function classifyFormat(format) {
    const elapsedToken = /^(?:h+|m+|s+)$/;
    let inQuote = false;
    let escape = false;
    let hasDate = false;
    let hasTime = false;
    // Unquoted, unescaped characters that may be format tokens.
    const toks = [];
    for (let i = 0; i < format.length; i++) {
        const ch = format[i];
        if (escape) {
            escape = false;
            continue;
        }
        if (ch === "\\") {
            escape = true;
            continue;
        }
        if (ch === '"') {
            inQuote = !inQuote;
            continue;
        }
        if (inQuote)
            continue;
        if (ch === "_" || ch === "*") {
            // `_x` pads by the width of x and `*x` repeats x: in both cases
            // x is a literal, never a date/time token.
            escape = true;
            continue;
        }
        if (ch === "[") {
            const close = format.indexOf("]", i + 1);
            if (close !== -1) {
                const inner = format.slice(i + 1, close).toLowerCase();
                if (elapsedToken.test(inner)) {
                    hasTime = true;
                    if (inner[0] === "h") {
                        // Keep the hour letters so that a following `mm`
                        // (as in `[h]:mm`) is recognised as minutes.
                        for (const letter of inner)
                            toks.push(letter);
                    }
                }
                // Colours, locale tags and conditions carry no tokens;
                // skip the whole bracketed section.
                i = close;
                continue;
            }
            // Unbalanced `[`: fall through and treat it as an ordinary
            // character.
        }
        toks.push(ch);
    }
    // Indices that belong to AM/PM literals — they are time markers and
    // must not be treated as month/minute when we walk through tokens.
    const ampmIdx = new Set();
    for (let i = 0; i < toks.length; i++) {
        // Try AM/PM first (5 tokens), then AM/P (4 tokens).
        let width = 0;
        if (matchesLiteralAt(toks, i, "AM/PM"))
            width = 5;
        else if (matchesLiteralAt(toks, i, "AM/P"))
            width = 4;
        if (width === 0)
            continue;
        hasTime = true;
        for (let j = i; j < i + width; j++)
            ampmIdx.add(j);
    }
    for (let i = 0; i < toks.length; i++) {
        if (ampmIdx.has(i))
            continue;
        const lower = toks[i].toLowerCase();
        if (lower === "y" || lower === "d") {
            hasDate = true;
        }
        else if (lower === "h" || lower === "s") {
            hasTime = true;
        }
        else if (lower === "m") {
            // Lowercase `m` is ambiguous between month and minute: it is a
            // minute when the nearest preceding letter is `h` or the
            // nearest following letter is `s`, otherwise a month.
            const prev = findAdjacentLetter(toks, i, -1, ampmIdx);
            const next = findAdjacentLetter(toks, i, 1, ampmIdx);
            const prevH = prev != null && prev.toLowerCase() === "h";
            const nextS = next != null && next.toLowerCase() === "s";
            if (prevH || nextS)
                hasTime = true;
            else
                hasDate = true;
        }
    }
    return { hasDate, hasTime };
}
/**
 * Case-insensitively test whether the tokens starting at `start` spell
 * `literal` (which must be given in upper case), one token per character.
 */
function matchesLiteralAt(toks, start, literal) {
    if (start + literal.length > toks.length)
        return false;
    for (let k = 0; k < literal.length; k++) {
        if (toks[start + k].toUpperCase() !== literal[k])
            return false;
    }
    return true;
}
/**
 * Find the nearest disambiguating letter to the `m` at index `from`,
 * walking in direction `step` (-1 or +1). Skips other `m`s, `:`, spaces,
 * tabs and any index in `skip` (AM/PM literals). Stops on any other
 * non-letter separator — so `mm/yyyy` keeps `m` as month (the slash stops
 * the search) while `h:mm:ss` and `h mm` keep it as minute.
 *
 * @returns {string | null} The letter found, or null when a hard separator
 *   or the end of the token list is reached first.
 */
function findAdjacentLetter(toks, from, step, skip) {
    for (let i = from + step; i >= 0 && i < toks.length; i += step) {
        if (skip.has(i))
            continue;
        const ch = toks[i];
        if (ch.toLowerCase() === "m")
            continue;
        if (ch === ":")
            continue;
        if (ch === " " || ch === "\t")
            continue;
        if (/[a-zA-Z]/.test(ch))
            return ch;
        return null; // hard separator stops the search
    }
    return null;
}
135
// Left-pad a number with a single "0" when it is below 10 (e.g. 7 -> "07").
function pad2(n) {
    const digits = String(n);
    if (n < 10) {
        return "0" + digits;
    }
    return digits;
}
138
/**
 * Render an Excel serial number as a display string using a sensible
 * default — `YYYY-MM-DD` for pure dates, `HH:MM` for pure times,
 * `YYYY-MM-DD HH:MM` for combined. The exact format string (e.g.
 * `m"月"d"日"`) is deliberately not honoured: the goal is a readable date,
 * not a pixel-perfect Excel render.
 *
 * The serial is converted relative to 1899-12-30 UTC and rounded to the
 * nearest millisecond; seconds are dropped, not rounded. Serials below 61
 * are not special-cased, so their date part is one day earlier than Excel
 * displays.
 *
 * @param {number} value Excel serial (days, fractional part = time of day).
 * @param {{ hasDate: boolean, hasTime: boolean }} kind Result of
 *   classifyFormat for the cell's format code.
 * @returns {string} The rendered text. Falls back to `String(value)` when
 *   `kind` has neither flag set, or when `value` is not a finite number or
 *   lies outside the range a JS Date can represent.
 */
export function formatSerial(value, kind) {
    const serial = Number(value);
    // Without this guard a NaN serial renders as "NaN-NaN-NaN".
    if (!Number.isFinite(serial))
        return String(value);
    const d = new Date(EPOCH_UTC + Math.round(serial * DAY_MS));
    // Finite but absurdly large serials overflow Date and become invalid.
    if (Number.isNaN(d.getTime()))
        return String(value);
    const Y = d.getUTCFullYear();
    const M = pad2(d.getUTCMonth() + 1);
    const D = pad2(d.getUTCDate());
    const h = pad2(d.getUTCHours());
    const m = pad2(d.getUTCMinutes());
    if (kind.hasDate && kind.hasTime)
        return `${Y}-${M}-${D} ${h}:${m}`;
    if (kind.hasDate)
        return `${Y}-${M}-${D}`;
    if (kind.hasTime)
        return `${h}:${m}`;
    return String(value);
}
161
/**
 * Convenience: returns a rendered text string when `format` is a date/time
 * format and `value` is a usable serial, undefined otherwise. The caller
 * should fall back to the raw value when undefined is returned.
 *
 * `value` is accepted when it is a finite number or a non-blank string that
 * parses to a finite number. Anything else (text, blank strings, booleans,
 * null/undefined, NaN) returns undefined, so a text cell that merely
 * carries a date style does not get a bogus rendering such as
 * "NaN-NaN-NaN".
 *
 * @param {number | string} value Raw cell value.
 * @param {string} format Number-format code of the cell's style.
 * @returns {string | undefined}
 */
export function renderNumberByFormat(value, format) {
    const isNumber = typeof value === "number";
    const isNonBlankString = typeof value === "string" && value.trim() !== "";
    if (!isNumber && !isNonBlankString)
        return undefined;
    if (!Number.isFinite(Number(value)))
        return undefined;
    if (typeof format !== "string")
        return undefined;
    const kind = classifyFormat(format);
    if (!kind.hasDate && !kind.hasTime)
        return undefined;
    return formatSerial(value, kind);
}
@@ -99,6 +99,31 @@ export async function parseTab(xlsxPath, tabName, saveDir) {
99
99
  if (m.endCol > maxCol)
100
100
  maxCol = m.endCol;
101
101
  }
102
+ // Step 7c.5: annotate top-left anchor cells with their merge span. We do
103
+ // this after image merging so even cells that exist only because of an
104
+ // image anchor still receive the span if they happen to be a merge anchor
105
+ // (rare but possible — e.g. a merged cell whose only content is a picture).
106
+ for (const m of sheetData.merges) {
107
+ const rows = m.endRow - m.startRow + 1;
108
+ const cols = m.endCol - m.startCol + 1;
109
+ if (rows === 1 && cols === 1)
110
+ continue; // not really a merge
111
+ const key = cellKey(m.startRow, m.startCol);
112
+ const anchor = cellMap.get(key);
113
+ if (anchor) {
114
+ anchor.mergeSpan = { rows, cols };
115
+ }
116
+ else {
117
+ // Anchor has no content of its own; create a stub so the merge span
118
+ // is still surfaced. Without this, an empty merged-anchor cell would
119
+ // be invisible in cells[] even though it represents a 4-row block.
120
+ cellMap.set(key, {
121
+ row: m.startRow,
122
+ col: m.startCol,
123
+ mergeSpan: { rows, cols },
124
+ });
125
+ }
126
+ }
102
127
  // Step 7d: sort cells row-asc, col-asc
103
128
  const cells = Array.from(cellMap.values()).sort((a, b) => a.row !== b.row ? a.row - b.row : a.col - b.col);
104
129
  return {
@@ -12,6 +12,7 @@
12
12
  // still injected: `sharedStrings[]` from shared-strings.ts and
13
13
  // `getFormatCode()` from styles.ts.
14
14
  import { SaxesParser } from "saxes";
15
+ import { renderNumberByFormat } from "./format.js";
15
16
  // ---------------------------------------------------------------------------
16
17
  // Cell ref helpers (pure functions, reused)
17
18
  // ---------------------------------------------------------------------------
@@ -207,8 +208,15 @@ function buildCell(ref, cType, cStyleIdx, vRaw, isText, sharedStrings, getFormat
207
208
  const cell = { row, col, value };
208
209
  if (cStyleIdx >= 0) {
209
210
  const format = getFormatCode(cStyleIdx);
210
- if (format !== undefined)
211
+ if (format !== undefined) {
211
212
  cell.format = format;
213
+ // Render dates/times to a readable text so consumers don't have
214
+ // to translate Excel serial numbers themselves. Other formats
215
+ // (currency, percentages, etc.) are left numeric.
216
+ const rendered = renderNumberByFormat(value, format);
217
+ if (rendered !== undefined)
218
+ cell.text = rendered;
219
+ }
212
220
  }
213
221
  return cell;
214
222
  }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@mkterswingman/5mghost-wonder",
3
- "version": "0.0.6",
3
+ "version": "0.0.7",
4
4
  "description": "企微文档读取 CLI — WeCom document reader",
5
5
  "type": "module",
6
6
  "engines": {
@@ -25,7 +25,7 @@
25
25
  "scripts": {
26
26
  "build": "rm -rf dist && tsc && chmod +x dist/cli.js",
27
27
  "typecheck": "tsc --noEmit",
28
- "test": "node dist/wecom/url.test.js && node --test tests/sheet-parity.test.mjs && node --test tests/export-sanitize.test.mjs",
28
+ "test": "node dist/wecom/url.test.js && node --test tests/sheet-parity.test.mjs && node --test tests/export-sanitize.test.mjs && node --test tests/format.test.mjs",
29
29
  "smoke": "npm run build && node dist/cli.js help > /dev/null",
30
30
  "postinstall": "node scripts/postinstall.mjs"
31
31
  },
@@ -96,6 +96,18 @@ Output is a structured JSON:
96
96
 
97
97
  Consume this JSON directly to answer the user's question.
98
98
 
99
+ ### ⚠️ Merged cells — always check `mergeSpan`
100
+
101
+ A cell may carry `mergeSpan: {rows, cols}` when it is the top-left anchor of a merged range. The cell's content is shared across `rows × cols` grid positions starting at `(row, col)`. **When the user asks about a cell's content, always factor `mergeSpan` into the answer.**
102
+
103
+ Concrete example: in a half-hour-grid time table, a `mergeSpan: {rows: 4, cols: 1}` anchored at `(row=39, col=40)` saying "★直播 + Poach" means the activity actually occupies **4 half-hour slots = 2 hours**, not 30 minutes. Quoting only the anchor row's time would mislead the user.
104
+
105
+ The complete merge list is also available in the top-level `merges[]` array if you need to reason about every merge in the sheet.
106
+
107
+ ### ⚠️ Date / time cells — read `text`, not `value`
108
+
109
+ Number cells with date or time formats (e.g. `m"月"d"日"`, `h:mm`) are rendered into a readable `text` string (`"2026-04-30"`, `"12:00"`) alongside the raw `value` and `format`. **Always use `text` when reporting dates and times to the user.** The raw `value` is an Excel serial number (days since 1899-12-30) and is meaningless to humans.
110
+
99
111
  ### ⚠️ Row/column indexing — JSON is 0-based, Excel is 1-based
100
112
 
101
113
  In wonder's JSON output, `row` and `col` (and `startRow`/`endRow`/`startCol`/`endCol` in `merges`) are **0-based**. Excel addresses the user sees in the WeCom UI are **1-based**.