@mkterswingman/5mghost-wonder 0.0.6 → 0.0.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/xlsx/format.js +171 -0
- package/dist/xlsx/parse-tab.js +25 -0
- package/dist/xlsx/sheet.js +9 -1
- package/package.json +2 -2
- package/skills/use-5mghost-wonder/SKILL.md +36 -14
|
@@ -0,0 +1,171 @@
|
|
|
1
|
+
// src/xlsx/format.ts
|
|
2
|
+
// Convert Excel number values to display strings using their numFmt format
|
|
3
|
+
// codes. Currently handles date / time / datetime formats — the most common
|
|
4
|
+
// source of "?" placeholders in WeCom xlsx output (date columns, time slots).
|
|
5
|
+
//
|
|
6
|
+
// Excel serial date semantics: integer = days since 1899-12-30, fractional
|
|
7
|
+
// part = time-of-day fraction. There is a legacy "1900 leap-year bug" where
|
|
8
|
+
// serial 60 represents the (fictional) 1900-02-29; standard practice is to
|
|
9
|
+
// pin the epoch to 1899-12-30 so the off-by-one cancels out for all serials
|
|
10
|
+
// >= 61. Serials < 61 are rare in real workbooks and are left to fall back
|
|
11
|
+
// to numeric formatting; the WeCom date columns we target are all in 2025+.
|
|
12
|
+
// Number of milliseconds in one calendar day; converts fractional Excel
// serials (1.0 === one day) into JavaScript timestamps.
const DAY_MS = 1000 * 60 * 60 * 24;
// Timestamp of serial 0, i.e. 1899-12-30T00:00:00Z (month index 11 is
// December). Using the 30th rather than the 31st absorbs Excel's fictional
// 1900-02-29 for every serial >= 61.
const EPOCH_UTC = Date.UTC(1899, 11, 30);
|
|
14
|
+
// A bracket section whose whole content is a run of h, m or s is one of
// Excel's elapsed-time tokens (`[h]`, `[mm]`, `[ss]`). Every other bracket
// section is a modifier (colour, locale/currency, condition, numeral system)
// whose letters must never be read as date/time tokens.
const ELAPSED_TOKEN_RE = /^(?:h+|m+|s+)$/i;

/**
 * Detect whether a number-format code contains date and/or time tokens.
 *
 * Characters that are literal text are ignored:
 *   - anything inside double quotes (e.g. `"年"`, `"月"`),
 *   - the character following a backslash (`\.`),
 *   - the character following `_` (width skip) or `*` (fill character),
 *   - bracket modifiers such as `[Red]`, `[$USD]`, `[$-409]`, `[>=100]`.
 *
 * Date tokens: y, d, and m when it means month.
 * Time tokens: h, s, AM/PM, A/P, elapsed tokens (`[h]`, `[mm]`, `[ss]`),
 * and m when it means minute. A run of `m` is minutes when the nearest
 * letter before it is `h`, or the nearest letter after it is `s`, looking
 * only across `:` and whitespace; otherwise it is a month.
 *
 * @param {string} format - Excel number-format code, e.g. `yyyy-mm-dd`.
 * @returns {{ hasDate: boolean, hasTime: boolean }} which token families
 *   appear in the format; both false for purely numeric formats.
 */
export function classifyFormat(format) {
    let inQuote = false;
    let skipNext = false;
    let hasDate = false;
    let hasTime = false;
    const toks = [];
    for (let i = 0; i < format.length; i++) {
        const ch = format[i];
        if (skipNext) {
            skipNext = false;
            continue;
        }
        if (ch === "\\") {
            skipNext = true;
            continue;
        }
        if (ch === '"') {
            inQuote = !inQuote;
            continue;
        }
        if (inQuote) {
            continue;
        }
        if (ch === "_" || ch === "*") {
            // `_x` reserves the width of x, `*x` repeats x as fill; in both
            // cases x is a literal, not a format token.
            skipNext = true;
            continue;
        }
        if (ch === "[") {
            const close = format.indexOf("]", i + 1);
            if (close !== -1) {
                const inner = format.slice(i + 1, close);
                if (ELAPSED_TOKEN_RE.test(inner)) {
                    hasTime = true;
                    // Keep the letters as tokens so a following `mm` can
                    // still see the `h` (`[h]:mm`), but flag them so they
                    // are never re-classified as month.
                    for (const letter of inner) {
                        toks.push({ ch: letter, elapsed: true });
                    }
                }
                i = close; // jump past the closing bracket
                continue;
            }
            // Unterminated `[`: fall through and treat it as a plain char.
        }
        toks.push({ ch, elapsed: false });
    }
    // Indices that belong to AM/PM-style markers — they are time markers and
    // must not be treated as month/minute when we walk through tokens.
    // Longest candidate first; `AM/P` is kept for backward compatibility.
    const meridiemMarkers = ["AM/PM", "AM/P", "A/P"];
    const ampmIdx = new Set();
    for (let i = 0; i < toks.length; i++) {
        if (toks[i].ch.toLowerCase() !== "a") {
            continue;
        }
        const upcoming = toks
            .slice(i, i + 5)
            .map((t) => t.ch.toUpperCase())
            .join("");
        const marker = meridiemMarkers.find((m) => upcoming.startsWith(m));
        if (marker === undefined) {
            continue;
        }
        hasTime = true;
        for (let j = i; j < i + marker.length; j++) {
            ampmIdx.add(j);
        }
    }
    for (let i = 0; i < toks.length; i++) {
        if (ampmIdx.has(i)) {
            continue;
        }
        const t = toks[i];
        if (t.elapsed) {
            continue; // already counted as time when the bracket was read
        }
        const lower = t.ch.toLowerCase();
        if (lower === "y" || lower === "d") {
            hasDate = true;
        }
        else if (lower === "h" || lower === "s") {
            hasTime = true;
        }
        else if (lower === "m") {
            // Month vs. minute: minute when the nearest letter before the
            // m-run is `h` or the nearest letter after it is `s`.
            const prev = findAdjacentLetter(toks, i, -1, ampmIdx);
            const next = findAdjacentLetter(toks, i, 1, ampmIdx);
            const prevH = prev != null && prev.ch.toLowerCase() === "h";
            const nextS = next != null && next.ch.toLowerCase() === "s";
            if (prevH || nextS) {
                hasTime = true;
            }
            else {
                hasDate = true;
            }
        }
    }
    return { hasDate, hasTime };
}
/**
 * Find the nearest letter token that can disambiguate an `m`.
 *
 * Starting next to index `from` and moving by `step` (-1 or +1), skips
 * indices listed in `skip`, other `m` tokens, `:` and whitespace. Returns
 * the first letter token reached, or null when the walk hits any other
 * character (e.g. `/`, `-`) or runs off either end. Hence `mm/yyyy` keeps
 * `m` as month while `h:mm:ss` and `h mm` keep it as minute.
 *
 * @param {{ ch: string }[]} toks - unquoted, unescaped format characters.
 * @param {number} from - index of the `m` being classified.
 * @param {number} step - direction of the walk: -1 backwards, +1 forwards.
 * @param {Set<number>} skip - token indices to step over (AM/PM markers).
 * @returns {{ ch: string } | null} the disambiguating token, if any.
 */
function findAdjacentLetter(toks, from, step, skip) {
    for (let i = from + step; i >= 0 && i < toks.length; i += step) {
        if (skip.has(i)) {
            continue;
        }
        const ch = toks[i].ch;
        if (ch.toLowerCase() === "m" || ch === ":" || ch === " " || ch === "\t") {
            continue;
        }
        // Any letter ends the walk with a result; anything else is a hard
        // separator that ends it without one.
        return /[a-zA-Z]/.test(ch) ? toks[i] : null;
    }
    return null;
}
|
|
135
|
+
/**
 * Render a number as text, prefixing a single `0` when it is below 10
 * (so 7 becomes "07" and 12 stays "12").
 */
function pad2(n) {
    if (n < 10) {
        return `0${n}`;
    }
    return String(n);
}
|
|
138
|
+
/**
 * Render an Excel serial number as a display string using a sensible
 * default — `YYYY-MM-DD` for pure dates, `HH:MM` for pure times,
 * `YYYY-MM-DD HH:MM` for combined. The exact format string (e.g.
 * `m"月"d"日"`) is deliberately not honoured: the goal is *some* readable
 * date, not a pixel-perfect Excel render.
 *
 * Falls back to `String(value)` when the value cannot be rendered
 * faithfully:
 *   - `kind` requests neither a date nor a time,
 *   - `value` is not a finite, non-negative number,
 *   - a date is requested for a serial below 61, where the 1899-12-30
 *     epoch is off by one because of Excel's fictional 1900-02-29,
 *   - the serial lies outside the range a JavaScript Date can represent.
 *
 * @param {number} value - Excel serial (days since 1899-12-30, fraction =
 *   time of day).
 * @param {{ hasDate: boolean, hasTime: boolean }} kind - which parts to
 *   render, as returned by `classifyFormat`.
 * @returns {string} the rendered date/time, or `String(value)` on fallback.
 */
export function formatSerial(value, kind) {
    // First serial for which the pinned epoch yields the date Excel shows.
    const firstReliableDateSerial = 61;
    const wantsDate = Boolean(kind.hasDate);
    const wantsTime = Boolean(kind.hasTime);
    if (!wantsDate && !wantsTime) {
        return String(value);
    }
    if (typeof value !== "number" || !Number.isFinite(value) || value < 0) {
        return String(value);
    }
    if (wantsDate && value < firstReliableDateSerial) {
        return String(value);
    }
    const d = new Date(EPOCH_UTC + Math.round(value * DAY_MS));
    if (Number.isNaN(d.getTime())) {
        return String(value); // serial beyond the representable Date range
    }
    const datePart = `${d.getUTCFullYear()}-${pad2(d.getUTCMonth() + 1)}-${pad2(d.getUTCDate())}`;
    const timePart = `${pad2(d.getUTCHours())}:${pad2(d.getUTCMinutes())}`;
    if (wantsDate && wantsTime) {
        return `${datePart} ${timePart}`;
    }
    return wantsDate ? datePart : timePart;
}
|
|
161
|
+
/**
 * Convenience: returns a rendered text string when `format` is a date/time
 * format and `value` is a serial that can be rendered, undefined otherwise.
 * The caller should fall back to the raw value when undefined is returned.
 *
 * Undefined is returned when:
 *   - `value` is not a finite, non-negative number (e.g. a text cell that
 *     happens to carry a date style),
 *   - `format` is missing, empty, or contains no date/time tokens,
 *   - a date is requested for a serial below 61, which the 1899-12-30
 *     epoch cannot represent correctly.
 *
 * @param {number} value - raw cell value; only numbers are rendered.
 * @param {string} format - Excel number-format code of the cell.
 * @returns {string | undefined} display text, or undefined for fallback.
 */
export function renderNumberByFormat(value, format) {
    // First serial for which the pinned epoch yields the date Excel shows.
    const firstReliableDateSerial = 61;
    if (typeof value !== "number" || !Number.isFinite(value) || value < 0) {
        return undefined;
    }
    if (typeof format !== "string" || format === "") {
        return undefined;
    }
    const kind = classifyFormat(format);
    if (!kind.hasDate && !kind.hasTime) {
        return undefined;
    }
    if (kind.hasDate && value < firstReliableDateSerial) {
        return undefined;
    }
    return formatSerial(value, kind);
}
|
package/dist/xlsx/parse-tab.js
CHANGED
|
@@ -99,6 +99,31 @@ export async function parseTab(xlsxPath, tabName, saveDir) {
|
|
|
99
99
|
if (m.endCol > maxCol)
|
|
100
100
|
maxCol = m.endCol;
|
|
101
101
|
}
|
|
102
|
+
// Step 7c.5: annotate top-left anchor cells with their merge span. We do
|
|
103
|
+
// this after image merging so even cells that exist only because of an
|
|
104
|
+
// image anchor still receive the span if they happen to be a merge anchor
|
|
105
|
+
// (rare but possible — e.g. a merged cell whose only content is a picture).
|
|
106
|
+
for (const m of sheetData.merges) {
|
|
107
|
+
const rows = m.endRow - m.startRow + 1;
|
|
108
|
+
const cols = m.endCol - m.startCol + 1;
|
|
109
|
+
if (rows === 1 && cols === 1)
|
|
110
|
+
continue; // not really a merge
|
|
111
|
+
const key = cellKey(m.startRow, m.startCol);
|
|
112
|
+
const anchor = cellMap.get(key);
|
|
113
|
+
if (anchor) {
|
|
114
|
+
anchor.mergeSpan = { rows, cols };
|
|
115
|
+
}
|
|
116
|
+
else {
|
|
117
|
+
// Anchor has no content of its own; create a stub so the merge span
|
|
118
|
+
// is still surfaced. Without this, an empty merged-anchor cell would
|
|
119
|
+
// be invisible in cells[] even though it represents a 4-row block.
|
|
120
|
+
cellMap.set(key, {
|
|
121
|
+
row: m.startRow,
|
|
122
|
+
col: m.startCol,
|
|
123
|
+
mergeSpan: { rows, cols },
|
|
124
|
+
});
|
|
125
|
+
}
|
|
126
|
+
}
|
|
102
127
|
// Step 7d: sort cells row-asc, col-asc
|
|
103
128
|
const cells = Array.from(cellMap.values()).sort((a, b) => a.row !== b.row ? a.row - b.row : a.col - b.col);
|
|
104
129
|
return {
|
package/dist/xlsx/sheet.js
CHANGED
|
@@ -12,6 +12,7 @@
|
|
|
12
12
|
// still injected: `sharedStrings[]` from shared-strings.ts and
|
|
13
13
|
// `getFormatCode()` from styles.ts.
|
|
14
14
|
import { SaxesParser } from "saxes";
|
|
15
|
+
import { renderNumberByFormat } from "./format.js";
|
|
15
16
|
// ---------------------------------------------------------------------------
|
|
16
17
|
// Cell ref helpers (pure functions, reused)
|
|
17
18
|
// ---------------------------------------------------------------------------
|
|
@@ -207,8 +208,15 @@ function buildCell(ref, cType, cStyleIdx, vRaw, isText, sharedStrings, getFormat
|
|
|
207
208
|
const cell = { row, col, value };
|
|
208
209
|
if (cStyleIdx >= 0) {
|
|
209
210
|
const format = getFormatCode(cStyleIdx);
|
|
210
|
-
if (format !== undefined)
|
|
211
|
+
if (format !== undefined) {
|
|
211
212
|
cell.format = format;
|
|
213
|
+
// Render dates/times to a readable text so consumers don't have
|
|
214
|
+
// to translate Excel serial numbers themselves. Other formats
|
|
215
|
+
// (currency, percentages, etc.) are left numeric.
|
|
216
|
+
const rendered = renderNumberByFormat(value, format);
|
|
217
|
+
if (rendered !== undefined)
|
|
218
|
+
cell.text = rendered;
|
|
219
|
+
}
|
|
212
220
|
}
|
|
213
221
|
return cell;
|
|
214
222
|
}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@mkterswingman/5mghost-wonder",
|
|
3
|
-
"version": "0.0.6",
|
|
3
|
+
"version": "0.0.8",
|
|
4
4
|
"description": "企微文档读取 CLI — WeCom document reader",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"engines": {
|
|
@@ -25,7 +25,7 @@
|
|
|
25
25
|
"scripts": {
|
|
26
26
|
"build": "rm -rf dist && tsc && chmod +x dist/cli.js",
|
|
27
27
|
"typecheck": "tsc --noEmit",
|
|
28
|
-
"test": "node dist/wecom/url.test.js && node --test tests/sheet-parity.test.mjs && node --test tests/export-sanitize.test.mjs",
|
|
28
|
+
"test": "node dist/wecom/url.test.js && node --test tests/sheet-parity.test.mjs && node --test tests/export-sanitize.test.mjs && node --test tests/format.test.mjs",
|
|
29
29
|
"smoke": "npm run build && node dist/cli.js help > /dev/null",
|
|
30
30
|
"postinstall": "node scripts/postinstall.mjs"
|
|
31
31
|
},
|
|
@@ -96,6 +96,18 @@ Output is a structured JSON:
|
|
|
96
96
|
|
|
97
97
|
Consume this JSON directly to answer the user's question.
|
|
98
98
|
|
|
99
|
+
### ⚠️ Merged cells — always check `mergeSpan`
|
|
100
|
+
|
|
101
|
+
A cell may carry `mergeSpan: {rows, cols}` when it is the top-left anchor of a merged range. The cell's content is shared across `rows × cols` grid positions starting at `(row, col)`. **When the user asks about a cell's content, always factor `mergeSpan` into the answer.**
|
|
102
|
+
|
|
103
|
+
Concrete example: in a half-hour-grid time table, a `mergeSpan: {rows: 4, cols: 1}` anchored at `(row=39, col=40)` saying "★直播 + Poach" means the activity actually occupies **4 half-hour slots = 2 hours**, not 30 minutes. Quoting only the anchor row's time would mislead the user.
|
|
104
|
+
|
|
105
|
+
The complete merge list is also available in the top-level `merges[]` array if you need to reason about every merge in the sheet.
|
|
106
|
+
|
|
107
|
+
### ⚠️ Date / time cells — read `text`, not `value`
|
|
108
|
+
|
|
109
|
+
Number cells with date or time formats (e.g. `m"月"d"日"`, `h:mm`) are rendered into a readable `text` string (`"2026-04-30"`, `"12:00"`) alongside the raw `value` and `format`. **Always use `text` when reporting dates and times to the user.** The raw `value` is an Excel serial number (days since 1899-12-30) and is meaningless to humans.
|
|
110
|
+
|
|
99
111
|
### ⚠️ Row/column indexing — JSON is 0-based, Excel is 1-based
|
|
100
112
|
|
|
101
113
|
In wonder's JSON output, `row` and `col` (and `startRow`/`endRow`/`startCol`/`endCol` in `merges`) are **0-based**. Excel addresses the user sees in the WeCom UI are **1-based**.
|
|
@@ -118,19 +130,20 @@ Read("/Users/<you>/Downloads/5mghost-wonder/media/image3.png")
|
|
|
118
130
|
|
|
119
131
|
**Note:** Images are full-resolution originals (up to several MB each). Only load images the user specifically asks about.
|
|
120
132
|
|
|
121
|
-
### Viewing visual layout
|
|
133
|
+
### Viewing visual layout — required when colour or layout carries meaning
|
|
134
|
+
|
|
135
|
+
The JSON does not carry cell colours, font colours, or rendered borders. When colour or position is part of the answer, you cannot recover it from JSON — you must read the rendered sheet.
|
|
122
136
|
|
|
123
|
-
|
|
137
|
+
Render to PDF and Read it whenever any of these are true:
|
|
124
138
|
|
|
125
|
-
-
|
|
126
|
-
-
|
|
127
|
-
-
|
|
128
|
-
-
|
|
129
|
-
- User explicitly asks about "how it looks", "颜色", "排版", "这个图表", "这张表的结构"
|
|
139
|
+
- The user asks about "how it looks", "颜色", "排版", "这个图表", "这张表的结构", or refers to a visible highlight
|
|
140
|
+
- The sheet is a gantt chart, calendar, status board, or roadmap (colour = stage / owner / priority / "this week")
|
|
141
|
+
- A column or row in the user's question is highlighted (yellow / red / green) in the WeCom UI
|
|
142
|
+
- The merge-to-cell ratio in the JSON is non-trivial (e.g. `merges.length / cells.length > 0.1`) — likely a layout-driven sheet
|
|
130
143
|
|
|
131
|
-
|
|
144
|
+
You cannot detect colour from JSON alone, so when in doubt about a sheet that mixes data with visual cues, render. The cost is ~30 s and ~10 MB; the cost of guessing wrong is worse.
|
|
132
145
|
|
|
133
|
-
Render
|
|
146
|
+
Render command (one PDF page per tab, preserves layout, merges, fills, borders):
|
|
134
147
|
|
|
135
148
|
```bash
|
|
136
149
|
soffice --headless \
|
|
@@ -140,6 +153,8 @@ soffice --headless \
|
|
|
140
153
|
|
|
141
154
|
Then use the Read tool on the generated PDF. Page N corresponds to the Nth tab in workbook order (same as `tabs[]` in the metadata output).
|
|
142
155
|
|
|
156
|
+
Skip rendering only when the user clearly wants a single cell value or a numeric lookup from a plain data table.
|
|
157
|
+
|
|
143
158
|
---
|
|
144
159
|
|
|
145
160
|
## docx Workflow (`doc/w3_`, `doc/e2_`)
|
|
@@ -196,15 +211,17 @@ Output:
|
|
|
196
211
|
{ "type": "slide", "path": "/Users/<you>/Downloads/5mghost-wonder/filename.pptx" }
|
|
197
212
|
```
|
|
198
213
|
|
|
199
|
-
### Step 2 —
|
|
214
|
+
### Step 2 — Always extract both text and visual layout
|
|
200
215
|
|
|
201
|
-
|
|
216
|
+
For pptx, run **both** extractions every time. Most WeCom slides are layout-driven (timelines, image collages, status boards, recap pages) — pure text loses critical meaning, and pure-PDF visual reading can mis-OCR text that pandoc captures cleanly. You cannot tell a "complex" slide from a "simple" slide without first looking at it, so don't try to decide; just run both and use whichever the question calls for.
|
|
217
|
+
|
|
218
|
+
**1. Text** (for exact wording, fast keyword scanning, copy-quoting):
|
|
202
219
|
|
|
203
220
|
```bash
|
|
204
221
|
pandoc <path> -o /tmp/wonder-slide-output.md && cat /tmp/wonder-slide-output.md
|
|
205
222
|
```
|
|
206
223
|
|
|
207
|
-
**
|
|
224
|
+
**2. Visual layout** (for image-text relationships, timelines, colour, embedded screenshots whose text pandoc cannot reach — e.g. Korean / Japanese chat captures):
|
|
208
225
|
|
|
209
226
|
```bash
|
|
210
227
|
soffice --headless --convert-to pdf --outdir /tmp/ <path>
|
|
@@ -212,7 +229,11 @@ soffice --headless --convert-to pdf --outdir /tmp/ <path>
|
|
|
212
229
|
|
|
213
230
|
Then use the Read tool on the generated PDF.
|
|
214
231
|
|
|
215
|
-
|
|
232
|
+
When answering, combine: lean on the PDF for "what's on the slide and how it's organised", lean on the markdown for exact-wording quotes. Don't answer from text-only when a slide visibly relies on layout — the user will spot the gap immediately.
|
|
233
|
+
|
|
234
|
+
If `soffice` is not installed (`wonder check` reports it as optional/missing), fall back to pandoc-only and tell the user upfront that visual cues, embedded screenshot text, and image-text relationships will be missing from your answer.
|
|
235
|
+
|
|
236
|
+
### Optional: access embedded images directly
|
|
216
237
|
|
|
217
238
|
```bash
|
|
218
239
|
mkdir -p /tmp/wonder-pptx-unpack && cp <path> /tmp/wonder-pptx-unpack/slide.zip && unzip -o /tmp/wonder-pptx-unpack/slide.zip -d /tmp/wonder-pptx-unpack/
|
|
@@ -241,7 +262,8 @@ Then use Read tool on files in `/tmp/wonder-pptx-unpack/ppt/media/`.
|
|
|
241
262
|
| pptx slice crash | `prs.slides[:N]` → `AttributeError: 'list' object has no attribute 'rId'` | Use `for slide in prs.slides` |
|
|
242
263
|
| Cookie expiry | Cookie valid for 7–30 days | Run `wonder wecom cookie` to refresh |
|
|
243
264
|
| xlsx images are full-size | Original images can be up to 6 MB each | Only read images when user specifically needs them |
|
|
244
|
-
| xlsx visual layout
|
|
265
|
+
| xlsx colour / visual layout | JSON has no fill colour, font colour, or rendered borders | Render to PDF (xlsx section) when colour or layout carries meaning |
|
|
266
|
+
| pptx layout-driven slides | Pure pandoc loses image-text relationships, timelines, embedded screenshot text (e.g. Korean chats) | pptx workflow now runs pandoc + soffice→pdf together by default |
|
|
245
267
|
| smartpage unsupported | Export API returns 0% progress forever | Manual browser export |
|
|
246
268
|
|
|
247
269
|
---
|