@mkterswingman/5mghost-wonder 0.0.14 → 0.0.16
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/commands/browser.js +16 -5
- package/dist/commands/help.js +1 -0
- package/dist/wecom/browser-probe.js +384 -5
- package/package.json +2 -2
- package/skills/setup-5mghost-wonder/SKILL.md +2 -2
- package/skills/update-5mghost-wonder/SKILL.md +1 -1
- package/skills/use-5mghost-wonder/SKILL.md +2 -2
- package/skills/use-5mghost-wonder/references/cli-reference.md +3 -2
- package/skills/use-5mghost-wonder/references/read-no-export-browser.md +16 -6
package/dist/commands/browser.js
CHANGED
|
@@ -2,19 +2,27 @@
|
|
|
2
2
|
// Experimental browser runtime commands.
|
|
3
3
|
import { resolveWonderPaths } from "../platform/paths.js";
|
|
4
4
|
import { parseWecomUrl } from "../wecom/url.js";
|
|
5
|
-
import { runBrowserNoExportProbe } from "../wecom/browser-probe.js";
|
|
5
|
+
import { runBrowserNoExportProbe, runBrowserNoExportRead, } from "../wecom/browser-probe.js";
|
|
6
6
|
/**
 * Dispatch a `wonder browser <subcommand>` invocation.
 *
 * @param {string[]} argv - Arguments following `browser`; the first entry is the subcommand.
 * @param {object} context - Command context; `context.io.stderr` receives the usage text
 *   when the subcommand is not recognised.
 * @returns {Promise<object>} The handler's result, or `{ exitCode: 1 }` for an unknown subcommand.
 */
export async function runBrowserCommand(argv, context) {
    const subcommand = argv[0];
    const remaining = argv.slice(1);
    if (subcommand === "probe") {
        return runBrowserProbe(remaining, context);
    }
    if (subcommand === "read") {
        return runBrowserRead(remaining, context);
    }
    // Anything else (including no subcommand at all) is a usage error.
    const shownName = subcommand ?? "(none)";
    context.io.stderr(`Unknown subcommand: browser ${shownName}\n` +
        "Usage: wonder browser probe|read <url> [--headed] [--timeout-ms <ms>] [--save <dir>]");
    return { exitCode: 1 };
}
|
|
19
|
+
/**
 * Handle `wonder browser read` by running the shared browser action in "read" mode.
 *
 * @param {string[]} args - Arguments following the `read` subcommand.
 * @param {object} context - Command context forwarded unchanged.
 * @returns {Promise<object>} Whatever `runBrowserAction` resolves to.
 */
async function runBrowserRead(args, context) {
    const outcome = await runBrowserAction(args, context, "read");
    return outcome;
}
|
|
17
22
|
/**
 * Handle `wonder browser probe` by running the shared browser action in "probe" mode.
 *
 * @param {string[]} args - Arguments following the `probe` subcommand.
 * @param {object} context - Command context forwarded unchanged.
 * @returns {Promise<object>} Whatever `runBrowserAction` resolves to.
 */
async function runBrowserProbe(args, context) {
    const outcome = await runBrowserAction(args, context, "probe");
    return outcome;
}
|
|
25
|
+
async function runBrowserAction(args, context, action) {
|
|
18
26
|
let url;
|
|
19
27
|
let saveDir;
|
|
20
28
|
let timeoutMs;
|
|
@@ -37,7 +45,7 @@ async function runBrowserProbe(args, context) {
|
|
|
37
45
|
if (!url) {
|
|
38
46
|
context.io.stderr(JSON.stringify({
|
|
39
47
|
error: "missing_url",
|
|
40
|
-
message:
|
|
48
|
+
message: `用法:wonder browser ${action} <url> [--headed] [--timeout-ms <ms>] [--save <dir>]`,
|
|
41
49
|
}));
|
|
42
50
|
return { exitCode: 1, telemetry: { outcome: "failure", errorKind: "missing_url" } };
|
|
43
51
|
}
|
|
@@ -57,14 +65,17 @@ async function runBrowserProbe(args, context) {
|
|
|
57
65
|
return { exitCode: 1, telemetry: { outcome: "failure", errorKind: "invalid_timeout" } };
|
|
58
66
|
}
|
|
59
67
|
const paths = resolveWonderPaths({ homeDir: context.homeDir });
|
|
60
|
-
const
|
|
68
|
+
const options = {
|
|
61
69
|
url,
|
|
62
70
|
chromeProfilePath: paths.chromeProfilePath,
|
|
63
71
|
saveDir,
|
|
64
72
|
headed,
|
|
65
73
|
timeoutMs,
|
|
66
74
|
io: context.io,
|
|
67
|
-
}
|
|
75
|
+
};
|
|
76
|
+
const result = action === "read"
|
|
77
|
+
? await runBrowserNoExportRead(options)
|
|
78
|
+
: await runBrowserNoExportProbe(options);
|
|
68
79
|
context.io.stdout(JSON.stringify(result));
|
|
69
80
|
return {
|
|
70
81
|
exitCode: result.status === "fail" ? 1 : 0,
|
package/dist/commands/help.js
CHANGED
|
@@ -34,6 +34,7 @@ export function renderHelpText() {
|
|
|
34
34
|
"Experimental browser runtime:",
|
|
35
35
|
" browser probe <url> Open in Wonder browser profile and capture",
|
|
36
36
|
" no-export evidence summaries",
|
|
37
|
+
" browser read <url> Experimental no-export text read",
|
|
37
38
|
" browser probe <url> --headed Show the browser while probing",
|
|
38
39
|
].join("\n");
|
|
39
40
|
}
|
|
@@ -10,6 +10,8 @@ const CDP_CALL_TIMEOUT_MS = 5000;
|
|
|
10
10
|
const BROWSER_START_TIMEOUT_MS = 20_000;
|
|
11
11
|
const DEFAULT_CAPTURE_MS = 8_000;
|
|
12
12
|
const MAX_NETWORK_CANDIDATES = 40;
|
|
13
|
+
const MAX_IMAGE_DOWNLOADS = 20;
|
|
14
|
+
const MAX_IMAGE_BYTES = 25 * 1024 * 1024;
|
|
13
15
|
let cdpNextId = 1;
|
|
14
16
|
export async function runBrowserNoExportProbe(options) {
|
|
15
17
|
const timeoutMs = options.timeoutMs ?? DEFAULT_CAPTURE_MS;
|
|
@@ -52,7 +54,8 @@ export async function runBrowserNoExportProbe(options) {
|
|
|
52
54
|
await sendCdpCommand(socket, "Page.navigate", { url: options.url });
|
|
53
55
|
await delay(timeoutMs);
|
|
54
56
|
const pageInfo = await evaluatePageInfo(socket);
|
|
55
|
-
const
|
|
57
|
+
const networkArtifacts = await collectNetworkArtifacts(socket, responses);
|
|
58
|
+
const networkCandidates = summarizeNetworkArtifacts(networkArtifacts);
|
|
56
59
|
const evidence = ["browser-page"];
|
|
57
60
|
if (pageInfo.visibleTextSample)
|
|
58
61
|
evidence.push("visible-text");
|
|
@@ -107,6 +110,115 @@ export async function runBrowserNoExportProbe(options) {
|
|
|
107
110
|
await terminateBrowserProcess(child);
|
|
108
111
|
}
|
|
109
112
|
}
|
|
113
|
+
/**
 * Return the text of a captured response body, decoding it first when the
 * artifact is flagged as base64-encoded.
 */
function decodeOpendocArtifactBody(artifact) {
    return artifact.base64Encoded
        ? Buffer.from(artifact.body, "base64").toString("utf8")
        : artifact.body;
}

/**
 * Run `extractTextFromOpendoc` over each candidate artifact in order and
 * return the first non-null extraction, or null when none is readable.
 */
function extractFirstReadableOpendoc(artifacts) {
    for (const artifact of artifacts) {
        try {
            const extracted = extractTextFromOpendoc(decodeOpendocArtifactBody(artifact));
            if (extracted)
                return extracted;
        }
        catch {
            // Best effort: one malformed candidate must not stop later ones from being tried.
        }
    }
    return null;
}

/**
 * Launch a browser on the given profile, load `options.url`, and try to read
 * the document text from captured `/dop-api/opendoc` responses.
 *
 * After navigating, the function waits `timeoutMs`, collects the recorded
 * network responses, and decodes the first opendoc response that yields text.
 * When `options.saveDir` is set, detected image URLs are downloaded and the
 * result is also written there as JSON.
 *
 * @param {object} options
 * @param {string} options.url - Document URL to open.
 * @param {string} options.chromeProfilePath - Browser profile directory.
 * @param {string} [options.executablePath] - Passed to `findInstalledBrowser`.
 * @param {string} [options.saveDir] - Directory for the JSON result and images.
 * @param {boolean} [options.headed=false] - Show the browser window.
 * @param {number} [options.timeoutMs] - Capture wait; defaults to DEFAULT_CAPTURE_MS.
 * @returns {Promise<object>} Result with `status` "partial" when text was
 *   extracted and "fail" otherwise. Errors raised after the browser is spawned
 *   are reported as a "fail" result whose `missing` holds the error message.
 */
export async function runBrowserNoExportRead(options) {
    const timeoutMs = options.timeoutMs ?? DEFAULT_CAPTURE_MS;
    const browser = findInstalledBrowser(options.executablePath);
    const port = await getFreePort();
    const endpoint = `http://127.0.0.1:${port}`;
    const child = spawnBrowser({
        browser,
        port,
        profilePath: options.chromeProfilePath,
        url: options.url,
        headed: options.headed ?? false,
    });
    try {
        await waitForDebugger(endpoint, BROWSER_START_TIMEOUT_MS);
        const pageWsUrl = await waitForPageWebSocket(endpoint, options.url, BROWSER_START_TIMEOUT_MS);
        const socket = await openDevToolsSocket(pageWsUrl);
        const responses = [];
        try {
            // Record every interesting response so its body can be fetched after the wait.
            socket.addEventListener("message", (event) => {
                const payload = safeJsonParse(String(event.data));
                if (payload?.method !== "Network.responseReceived")
                    return;
                const response = payload.params?.response;
                const requestId = payload.params?.requestId;
                if (!response?.url || !requestId)
                    return;
                if (!isInterestingNetworkUrl(response.url, response.mimeType ?? ""))
                    return;
                responses.push({
                    requestId,
                    url: response.url,
                    status: response.status ?? 0,
                    mimeType: response.mimeType ?? "",
                });
            });
            await sendCdpCommand(socket, "Network.enable");
            await sendCdpCommand(socket, "Page.enable");
            await sendCdpCommand(socket, "Runtime.enable");
            await sendCdpCommand(socket, "Page.navigate", { url: options.url });
            await delay(timeoutMs);
            const pageInfo = await evaluatePageInfo(socket);
            const networkArtifacts = await collectNetworkArtifacts(socket, responses);
            // Try every captured opendoc response, not just the first: an early
            // response may be unreadable while a later one carries the text.
            const opendocArtifacts = networkArtifacts.filter((artifact) => artifact.rawUrl.includes("/dop-api/opendoc") && artifact.body);
            const extracted = extractFirstReadableOpendoc(opendocArtifacts);
            const evidence = ["browser-page", "network-response-summary"];
            if (opendocArtifacts.length > 0)
                evidence.push("opendoc-response");
            if (extracted?.text)
                evidence.push("initial-attributed-text");
            const images = options.saveDir && extracted?.imageUrls
                ? await downloadImageResources(extracted.imageUrls, options.saveDir)
                : (extracted?.imageUrls ?? []).map((url) => ({
                    url,
                    ...parseImageDimensionsFromUrl(url),
                    status: "skipped",
                    error: "pass --save <dir> to download image resources",
                }));
            const anyImageDownloaded = images.some((image) => image.status === "downloaded");
            if (anyImageDownloaded)
                evidence.push("image-resource-download");
            const result = {
                mode: "browser-no-export-read",
                status: extracted?.text ? "partial" : "fail",
                url: redactUrl(options.url),
                finalUrl: pageInfo.finalUrl ? redactUrl(pageInfo.finalUrl) : undefined,
                title: extracted?.title ?? pageInfo.title,
                padType: extracted?.padType,
                text: extracted?.text,
                textLength: extracted?.text.length,
                imageUrls: extracted?.imageUrls,
                images,
                extraction: extracted?.text ? "opendoc-initial-attributed-text" : "none",
                evidence,
                warnings: extracted?.warnings ?? [
                    "No readable opendoc initialAttributedText was captured.",
                ],
                missing: [
                    "table merge ranges",
                    ...(anyImageDownloaded ? [] : ["image original resources"]),
                    "image anchors",
                    "floating vs fixed image classification",
                ],
            };
            if (options.saveDir) {
                mkdirSync(options.saveDir, { recursive: true });
                const savedPath = resolve(options.saveDir, `wonder-browser-read-${Date.now()}.json`);
                writeFileSync(savedPath, JSON.stringify(result, null, 2), { mode: 0o600 });
                // Added after writing, so the saved file itself does not contain savedPath.
                result.savedPath = savedPath;
            }
            return result;
        }
        finally {
            socket.close();
        }
    }
    catch (err) {
        return {
            mode: "browser-no-export-read",
            status: "fail",
            url: redactUrl(options.url),
            extraction: "none",
            evidence: [],
            warnings: [],
            missing: [err instanceof Error ? err.message : String(err)],
        };
    }
    finally {
        await terminateBrowserProcess(child);
    }
}
|
|
110
222
|
async function evaluatePageInfo(socket) {
|
|
111
223
|
const expression = `(() => {
|
|
112
224
|
const text = (document.body && document.body.innerText || "").replace(/\\s+/g, " ").trim();
|
|
@@ -136,14 +248,14 @@ async function evaluatePageInfo(socket) {
|
|
|
136
248
|
runtimeGlobals: value?.runtimeGlobals ?? [],
|
|
137
249
|
};
|
|
138
250
|
}
|
|
139
|
-
async function
|
|
251
|
+
async function collectNetworkArtifacts(socket, responses) {
|
|
140
252
|
const unique = new Map();
|
|
141
253
|
for (const response of responses) {
|
|
142
254
|
if (unique.size >= MAX_NETWORK_CANDIDATES)
|
|
143
255
|
break;
|
|
144
256
|
unique.set(response.requestId, response);
|
|
145
257
|
}
|
|
146
|
-
const
|
|
258
|
+
const artifacts = [];
|
|
147
259
|
for (const response of unique.values()) {
|
|
148
260
|
let body = "";
|
|
149
261
|
let base64Encoded = false;
|
|
@@ -156,16 +268,283 @@ async function collectNetworkCandidates(socket, responses) {
|
|
|
156
268
|
// Some responses are streamed, cached, too large, or not retained.
|
|
157
269
|
}
|
|
158
270
|
const signals = detectBodySignals(body, response.mimeType);
|
|
159
|
-
|
|
271
|
+
artifacts.push({
|
|
272
|
+
requestId: response.requestId,
|
|
273
|
+
rawUrl: response.url,
|
|
160
274
|
url: redactUrl(response.url),
|
|
161
275
|
status: response.status,
|
|
162
276
|
mimeType: response.mimeType,
|
|
163
277
|
bodyLength: body ? body.length : undefined,
|
|
164
278
|
base64Encoded: body ? base64Encoded : undefined,
|
|
165
279
|
signals,
|
|
280
|
+
body,
|
|
166
281
|
});
|
|
167
282
|
}
|
|
168
|
-
return
|
|
283
|
+
return artifacts;
|
|
284
|
+
}
|
|
285
|
+
/**
 * Reduce captured network artifacts to their reportable summary fields,
 * leaving out the request id, the raw URL and the response body.
 *
 * @param {object[]} artifacts - Artifacts as produced by `collectNetworkArtifacts`.
 * @returns {object[]} One summary object per artifact, in the same order.
 */
function summarizeNetworkArtifacts(artifacts) {
    const summaries = [];
    for (const { url, status, mimeType, bodyLength, base64Encoded, signals } of artifacts) {
        summaries.push({ url, status, mimeType, bodyLength, base64Encoded, signals });
    }
    return summaries;
}
|
|
295
|
+
/**
 * Decode document text from the JSON body of an `opendoc` response.
 *
 * Reads `clientVars.collab_client_vars.initialAttributedText.text`, decodes
 * each string chunk into candidate strings, and normalises the best candidate.
 *
 * @param {string} body - Raw response body, expected to be JSON.
 * @returns {{title: *, padType: *, text: string, imageUrls: string[], warnings: string[]} | null}
 *   The extraction, or null when the body is not a JSON object or yields no text.
 */
export function extractTextFromOpendoc(body) {
    let payload;
    try {
        payload = JSON.parse(body);
    }
    catch {
        return null;
    }
    // JSON.parse accepts "null" and bare scalars; those carry no clientVars and
    // dereferencing null would throw instead of reporting "nothing readable".
    if (payload === null || typeof payload !== "object")
        return null;
    const initialText = payload.clientVars?.collab_client_vars?.initialAttributedText?.text;
    const chunks = Array.isArray(initialText) ? initialText : [];
    const decodedChunks = chunks
        .filter((chunk) => typeof chunk === "string")
        .flatMap((chunk) => extractInitialAttributedTextCandidates(chunk));
    const textCandidate = chooseTextCandidate(decodedChunks);
    const text = normalizeExtractedText(textCandidate ?? decodedChunks.join("\n"));
    if (!text)
        return null;
    // Image URLs are searched across every decoded candidate, not only the chosen text.
    const imageUrls = extractImageUrls(decodedChunks.join("\n"));
    return {
        title: payload.clientVars?.title ?? payload.clientVars?.initialTitle,
        padType: payload.clientVars?.padType ?? payload.padType,
        text,
        imageUrls,
        warnings: [
            "No-export text is decoded from opendoc initialAttributedText; rich styles are not reconstructed yet.",
            ...(imageUrls.length > 0
                ? ["Image resource URLs were detected, but image anchors and fixed/floating placement are not reconstructed yet."]
                : []),
        ],
    };
}
|
|
326
|
+
/**
 * Turn one `initialAttributedText` chunk into candidate text strings.
 *
 * The chunk is decoded as base64 when it is shaped like base64, otherwise its
 * UTF-8 bytes are used as-is. The bytes are then scanned for protobuf
 * length-delimited fields; if none look like text, the whole buffer decoded
 * as UTF-8 is returned as the only candidate.
 *
 * @param {string} chunk - One string entry of the `text` array.
 * @returns {string[]} Candidate strings; always at least one entry.
 */
function extractInitialAttributedTextCandidates(chunk) {
    // Buffer.from(str, "base64") never throws on malformed input (it just
    // decodes what it can), so the UTF-8 fallback has to be chosen explicitly.
    const compact = chunk.replace(/\s+/g, "");
    const dataLength = compact.replace(/=+$/, "").length;
    const looksLikeBase64 = dataLength > 0 &&
        dataLength % 4 !== 1 &&
        /^[A-Za-z0-9+/_-]+={0,2}$/.test(compact);
    const decoded = looksLikeBase64
        ? Buffer.from(compact, "base64")
        : Buffer.from(chunk, "utf8");
    const candidates = collectProtobufStringCandidates(decoded);
    if (candidates.length > 0)
        return candidates.map((candidate) => candidate.text);
    return [decoded.toString("utf8")];
}
|
|
339
|
+
/**
 * Pick the candidate string most likely to be document text.
 *
 * Score = length, +200 if it contains a bracketed token like `[x]`, +200 if it
 * contains a character in U+4E00–U+9FFF, −100 per `http://` / `https://`
 * occurrence. Empty strings are ignored; on equal scores the earlier
 * candidate wins.
 *
 * @param {string[]} candidates
 * @returns {string | null} The highest-scoring non-empty candidate, or null.
 */
function chooseTextCandidate(candidates) {
    let winner = null;
    let winnerScore = -Infinity;
    for (const text of candidates) {
        if (text.length === 0)
            continue;
        const bracketBonus = /\[[^\]]+\]/.test(text) ? 200 : 0;
        const cjkBonus = /[\u4e00-\u9fff]/.test(text) ? 200 : 0;
        const urlCount = text.match(/https?:\/\//g)?.length ?? 0;
        const score = text.length + bracketBonus + cjkBonus - (urlCount * 100);
        // Strictly greater keeps the first of several equally scored candidates.
        if (score > winnerScore) {
            winnerScore = score;
            winner = text;
        }
    }
    return winner;
}
|
|
352
|
+
/**
 * Scan a buffer as protobuf wire data and return the length-delimited fields
 * that look like text, longest first.
 *
 * A field is kept when it is at least 2 bytes long, has a printable ratio of
 * at least 0.8, and contains a letter, a digit or `[`.
 *
 * @param {Buffer} buffer
 * @returns {{text: string, byteLength: number, printableRatio: number}[]}
 */
function collectProtobufStringCandidates(buffer) {
    const found = [];
    walkProtobuf(buffer, 0, buffer.length, 0, found);
    const plausible = found.filter((entry) => {
        if (!(entry.byteLength >= 2))
            return false;
        if (!(entry.printableRatio >= 0.8))
            return false;
        return /[\p{L}\p{N}\[]/u.test(entry.text);
    });
    plausible.sort((left, right) => right.byteLength - left.byteLength);
    return plausible;
}
|
|
361
|
+
/**
 * Walk `buffer[start, end)` as a sequence of protobuf fields and record every
 * length-delimited field as a text candidate, recursing into each such field
 * longer than 2 bytes in case it is a nested message.
 *
 * The walk stops silently on anything it cannot parse (truncated varint,
 * field running past `end`, unsupported wire type), beyond nesting depth 6,
 * or after 10,000 fields at one level.
 *
 * @param {Buffer} buffer
 * @param {number} start - Inclusive start offset.
 * @param {number} end - Exclusive end offset.
 * @param {number} depth - Current nesting depth (0 at the top level).
 * @param {object[]} candidates - Output array, appended to in place.
 */
function walkProtobuf(buffer, start, end, depth, candidates) {
    if (depth > 6)
        return;
    let cursor = start;
    for (let seen = 0; cursor < end && seen < 10_000; seen += 1) {
        const tag = readVarint(buffer, cursor, end);
        if (!tag)
            return;
        cursor = tag.nextOffset;
        // The low three bits of the tag are the wire type.
        switch (tag.value & 7) {
            case 0: {
                // Varint scalar: skip over it.
                const scalar = readVarint(buffer, cursor, end);
                if (!scalar)
                    return;
                cursor = scalar.nextOffset;
                break;
            }
            case 1:
                // Fixed 64-bit value.
                cursor += 8;
                break;
            case 5:
                // Fixed 32-bit value.
                cursor += 4;
                break;
            case 2: {
                // Length-delimited: a string, bytes, or a nested message.
                const size = readVarint(buffer, cursor, end);
                if (!size)
                    return;
                cursor = size.nextOffset;
                const stop = cursor + size.value;
                if (stop > end)
                    return;
                const text = buffer.subarray(cursor, stop).toString("utf8");
                candidates.push({
                    text,
                    byteLength: size.value,
                    printableRatio: printableRatio(text),
                });
                if (size.value > 2) {
                    walkProtobuf(buffer, cursor, stop, depth + 1, candidates);
                }
                cursor = stop;
                break;
            }
            default:
                return;
        }
    }
}
|
|
410
|
+
/**
 * Read one base-128 varint from `buffer` starting at `offset`.
 *
 * @param {Buffer} buffer
 * @param {number} offset - Index of the first varint byte.
 * @param {number} end - Exclusive upper bound for reading.
 * @returns {{value: number, nextOffset: number} | null} The decoded value and
 *   the index just past it, or null when the varint is cut off at `end` or
 *   runs longer than ten bytes.
 */
function readVarint(buffer, offset, end) {
    let accumulated = 0n;
    let bitShift = 0n;
    let position = offset;
    while (position < end) {
        const octet = buffer[position];
        position += 1;
        // Low seven bits carry payload; the high bit marks "more bytes follow".
        accumulated |= BigInt(octet & 0x7f) << bitShift;
        const isFinalByte = (octet & 0x80) === 0;
        if (isFinalByte) {
            return { value: Number(accumulated), nextOffset: position };
        }
        bitShift += 7n;
        if (bitShift > 63n)
            return null;
    }
    return null;
}
|
|
426
|
+
/**
 * Fraction of characters in `text` that are letters, numbers, punctuation,
 * space separators, or CR/LF/TAB.
 *
 * Both the numerator and the denominator are counted in Unicode code points,
 * so characters outside the Basic Multilingual Plane (two UTF-16 units each)
 * are not under-weighted.
 *
 * @param {string} text
 * @returns {number} A value in [0, 1]; 0 for an empty string.
 */
function printableRatio(text) {
    if (!text)
        return 0;
    const printablePattern = /[\p{L}\p{N}\p{P}\p{Zs}\r\n\t]/u;
    let total = 0;
    let printable = 0;
    // for...of iterates code points, so `total` must be counted the same way
    // rather than taken from text.length (UTF-16 units).
    for (const char of text) {
        total += 1;
        if (printablePattern.test(char))
            printable += 1;
    }
    return printable / total;
}
|
|
436
|
+
/**
 * Find http(s) URLs in `text` whose host belongs to an allowed domain
 * (`qpic.cn` or `weixin.qq.com`, including subdomains such as
 * `doc.weixin.qq.com`).
 *
 * The domain check is made against the parsed hostname rather than the whole
 * URL string, so an unrelated host that merely mentions an allowed domain in
 * its path or query is not returned.
 *
 * @param {string} text - Decoded document text to scan.
 * @returns {string[]} Unique matching URLs in order of first appearance, with
 *   trailing punctuation removed.
 */
function extractImageUrls(text) {
    const allowedDomains = ["qpic.cn", "weixin.qq.com"];
    const hasAllowedHost = (candidate) => {
        let hostname;
        try {
            hostname = new URL(candidate).hostname.toLowerCase();
        }
        catch {
            // Not a parseable URL, so it could not be fetched anyway.
            return false;
        }
        return allowedDomains.some((domain) => hostname === domain || hostname.endsWith(`.${domain}`));
    };
    const matches = text.match(/https?:\/\/[^\s"'<>\\\u0000-\u001f]+/g) ?? [];
    const cleaned = matches.map((url) => url.replace(/[)*,.;:]+$/g, ""));
    return [...new Set(cleaned.filter(hasAllowedHost))];
}
|
|
442
|
+
/**
 * Download up to MAX_IMAGE_DOWNLOADS images into `<saveDir>/images`.
 *
 * Each URL produces one result entry: "downloaded" (with `path`,
 * `contentType`, `sizeBytes`), "failed" (with `error`), or "skipped" for URLs
 * beyond the download limit. Failures never abort the remaining downloads.
 * A response whose Content-Length already exceeds MAX_IMAGE_BYTES is rejected
 * before its body is read; the size is checked again after download.
 *
 * @param {string[]} imageUrls - URLs to fetch, in document order.
 * @param {string} saveDir - Base output directory; `images/` is created inside it.
 * @returns {Promise<object[]>} One result per input URL, in input order.
 */
async function downloadImageResources(imageUrls, saveDir) {
    const imageDir = resolve(saveDir, "images");
    mkdirSync(imageDir, { recursive: true });
    // Release an unread response body so the connection is not held open.
    const discardBody = async (res) => {
        try {
            await res.body?.cancel();
        }
        catch {
            // Cleanup only; the per-image result already records the real failure.
        }
    };
    const results = [];
    for (const [index, url] of imageUrls.slice(0, MAX_IMAGE_DOWNLOADS).entries()) {
        const dimensions = parseImageDimensionsFromUrl(url);
        try {
            const res = await fetch(url);
            if (!res.ok) {
                await discardBody(res);
                results.push({
                    url,
                    ...dimensions,
                    status: "failed",
                    error: `HTTP ${res.status}`,
                });
                continue;
            }
            const contentType = res.headers.get("content-type") ?? undefined;
            // Reject oversized responses up front instead of buffering them first.
            const declaredBytes = Number(res.headers.get("content-length"));
            if (Number.isFinite(declaredBytes) && declaredBytes > MAX_IMAGE_BYTES) {
                await discardBody(res);
                results.push({
                    url,
                    ...dimensions,
                    contentType,
                    sizeBytes: declaredBytes,
                    status: "failed",
                    error: `image exceeds ${MAX_IMAGE_BYTES} bytes`,
                });
                continue;
            }
            const bytes = Buffer.from(await res.arrayBuffer());
            // Content-Length may be absent or understate the decoded size, so check again.
            if (bytes.length > MAX_IMAGE_BYTES) {
                results.push({
                    url,
                    ...dimensions,
                    contentType,
                    sizeBytes: bytes.length,
                    status: "failed",
                    error: `image exceeds ${MAX_IMAGE_BYTES} bytes`,
                });
                continue;
            }
            const fileName = `image-${String(index + 1).padStart(3, "0")}${extensionForContentType(contentType)}`;
            const filePath = resolve(imageDir, fileName);
            writeFileSync(filePath, bytes, { mode: 0o600 });
            results.push({
                url,
                path: filePath,
                contentType,
                sizeBytes: bytes.length,
                ...dimensions,
                status: "downloaded",
            });
        }
        catch (err) {
            results.push({
                url,
                ...dimensions,
                status: "failed",
                error: err instanceof Error ? err.message : String(err),
            });
        }
    }
    // Everything past the limit is reported, not silently dropped.
    for (const url of imageUrls.slice(MAX_IMAGE_DOWNLOADS)) {
        results.push({
            url,
            ...parseImageDimensionsFromUrl(url),
            status: "skipped",
            error: `only first ${MAX_IMAGE_DOWNLOADS} images are downloaded`,
        });
    }
    return results;
}
|
|
504
|
+
/**
 * Read image dimensions from the `w` and `h` query parameters of a URL.
 *
 * @param {string} url
 * @returns {{width?: number, height?: number}} `width` / `height` are set only
 *   when the parameter is a finite number greater than zero (otherwise
 *   undefined). An unparseable URL yields an empty object.
 */
export function parseImageDimensionsFromUrl(url) {
    let query;
    try {
        query = new URL(url).searchParams;
    }
    catch {
        return {};
    }
    const positiveOrUndefined = (name) => {
        const parsedNumber = Number(query.get(name));
        if (!Number.isFinite(parsedNumber) || parsedNumber <= 0)
            return undefined;
        return parsedNumber;
    };
    return {
        width: positiveOrUndefined("w"),
        height: positiveOrUndefined("h"),
    };
}
|
|
518
|
+
/**
 * Choose a file extension for a downloaded image from its Content-Type.
 *
 * Matching is case-insensitive because MIME type names are case-insensitive.
 *
 * @param {string | undefined} contentType - Content-Type header value, if any.
 * @returns {string} ".png", ".jpg", ".gif", ".webp" or ".svg"; ".bin" when the
 *   type is missing or not recognised.
 */
function extensionForContentType(contentType) {
    if (!contentType)
        return ".bin";
    const normalized = contentType.toLowerCase();
    // Checked in order; the first marker found in the type decides the extension.
    const extensionByMarker = [
        ["png", ".png"],
        ["jpeg", ".jpg"],
        ["jpg", ".jpg"],
        ["gif", ".gif"],
        ["webp", ".webp"],
        ["svg", ".svg"],
    ];
    for (const [marker, extension] of extensionByMarker) {
        if (normalized.includes(marker))
            return extension;
    }
    return ".bin";
}
|
|
533
|
+
/**
 * Clean decoded document text into trimmed, non-empty lines joined by "\n".
 *
 * Removes `p.<8+ digits>` and `<12+ digits>@eJ` tokens, collapses runs of
 * non-newline whitespace to one space, and turns control characters into line
 * breaks. A line is kept only if it is longer than one character or contains
 * a character in U+4E00–U+9FFF, has no U+FFFD replacement character, and
 * contains a letter, a digit, `]` or `)`.
 *
 * @param {string} text
 * @returns {string} The cleaned text; may be empty.
 */
function normalizeExtractedText(text) {
    const withoutMarkers = text
        .replace(/p\.\d{8,}/g, "")
        .replace(/\b\d{12,}@eJ\b/g, "");
    const withLineBreaks = withoutMarkers
        .replace(/[^\S\r\n]+/g, " ")
        .replace(/[\u0000-\u0008\u000B\u000C\u000E-\u001F\u007F-\u009F]+/g, "\n");
    const keptLines = [];
    for (const rawLine of withLineBreaks.split(/\r?\n/)) {
        const line = rawLine.trim();
        if (line.length === 0)
            continue;
        // Single characters are noise unless they are a CJK ideograph.
        if (line.length <= 1 && !/[\u4e00-\u9fff]/.test(line))
            continue;
        // A replacement character means the bytes were not valid UTF-8 text.
        if (line.includes("\uFFFD"))
            continue;
        if (!/[\p{L}\p{N}\]\)]/u.test(line))
            continue;
        keptLines.push(line);
    }
    return keptLines
        .join("\n")
        .replace(/\n{3,}/g, "\n\n")
        .trim();
}
|
|
170
549
|
function detectBodySignals(body, mimeType) {
|
|
171
550
|
const haystack = `${mimeType}\n${body.slice(0, 200_000)}`;
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@mkterswingman/5mghost-wonder",
|
|
3
|
-
"version": "0.0.
|
|
3
|
+
"version": "0.0.16",
|
|
4
4
|
"description": "企微文档读取 CLI — WeCom document reader",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"engines": {
|
|
@@ -26,7 +26,7 @@
|
|
|
26
26
|
"build": "rm -rf dist && tsc && chmod +x dist/cli.js",
|
|
27
27
|
"typecheck": "tsc --noEmit",
|
|
28
28
|
"check:skills": "node scripts/check-skills.mjs",
|
|
29
|
-
"test": "npm run check:skills && node dist/wecom/url.test.js && node --test tests/sheet-parity.test.mjs && node --test tests/export-sanitize.test.mjs && node --test tests/format.test.mjs && node --test tests/cookies-validation.test.mjs",
|
|
29
|
+
"test": "npm run check:skills && node dist/wecom/url.test.js && node --test tests/sheet-parity.test.mjs && node --test tests/export-sanitize.test.mjs && node --test tests/format.test.mjs && node --test tests/cookies-validation.test.mjs && node --test tests/browser-read.test.mjs",
|
|
30
30
|
"smoke": "npm run build && node dist/cli.js help > /dev/null",
|
|
31
31
|
"postinstall": "node scripts/postinstall.mjs"
|
|
32
32
|
},
|
|
@@ -7,9 +7,9 @@ description: Use this skill when the user wants to install or set up wonder, say
|
|
|
7
7
|
|
|
8
8
|
## Skill version
|
|
9
9
|
|
|
10
|
-
This skill matches **wonder 0.0.
|
|
10
|
+
This skill matches **wonder 0.0.16**.
|
|
11
11
|
|
|
12
|
-
Once the CLI is installed in Step 1, run `wonder --version`. If the output does not equal `0.0.
|
|
12
|
+
Once the CLI is installed in Step 1, run `wonder --version`. If the output does not equal `0.0.16`, the CLI on disk has drifted from the skill text loaded in this session. Ask the user to run `/update-5mghost-wonder`, then **start a fresh AI session** (`/exit` and re-enter, or open a new chat) — skill text already loaded into a running session does not refresh after `wonder update`, even though the file on disk has been replaced.
|
|
13
13
|
|
|
14
14
|
After a successful first install, also remind the user to start a fresh AI session before invoking `/use-5mghost-wonder` for the first time. The skill files were just written to disk; the current session never loaded them.
|
|
15
15
|
|
|
@@ -10,10 +10,10 @@ the referenced workflow files needed for the current task.
|
|
|
10
10
|
|
|
11
11
|
## Version Gate
|
|
12
12
|
|
|
13
|
-
This skill matches **wonder 0.0.
|
|
13
|
+
This skill matches **wonder 0.0.16**.
|
|
14
14
|
|
|
15
15
|
On first use in a session, follow `references/session-init.md`. If the installed
|
|
16
|
-
CLI version differs from `0.0.
|
|
16
|
+
CLI version differs from `0.0.16`, stop and ask the user to run
|
|
17
17
|
`/update-5mghost-wonder`, then start a fresh AI session.
|
|
18
18
|
|
|
19
19
|
## Hard Rules
|
|
@@ -7,6 +7,7 @@ wonder read <url> --save <dir> # Output directory
|
|
|
7
7
|
wonder read <url> --no-cache # Force fresh export
|
|
8
8
|
|
|
9
9
|
wonder browser probe <url> # Experimental no-export browser evidence probe
|
|
10
|
+
wonder browser read <url> # Experimental no-export text read
|
|
10
11
|
wonder browser probe <url> --headed
|
|
11
12
|
wonder browser probe <url> --save <dir>
|
|
12
13
|
|
|
@@ -22,5 +23,5 @@ wonder auth login # Browser OAuth login
|
|
|
22
23
|
wonder version # Print version
|
|
23
24
|
```
|
|
24
25
|
|
|
25
|
-
Browser editor runtime remains experimental. No-export read
|
|
26
|
-
|
|
26
|
+
Browser editor runtime remains experimental. No-export read has experimental
|
|
27
|
+
browser commands, but it is not a lossless reader yet.
|
|
@@ -30,7 +30,17 @@ Do not describe screenshot/OCR output as lossless.
|
|
|
30
30
|
## Procedure
|
|
31
31
|
|
|
32
32
|
1. Confirm cookies are valid. If not, follow `cookie-recovery.md`.
|
|
33
|
-
2. Start with the built-in
|
|
33
|
+
2. Start with the built-in browser read:
|
|
34
|
+
|
|
35
|
+
```bash
|
|
36
|
+
wonder browser read <url> --save /tmp/wonder-browser-read
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
This currently attempts text extraction from the browser `opendoc`
|
|
40
|
+
`initialAttributedText`. Treat it as `partial` unless it also proves image and
|
|
41
|
+
table structure.
|
|
42
|
+
|
|
43
|
+
3. If the read is insufficient, run the evidence probe:
|
|
34
44
|
|
|
35
45
|
```bash
|
|
36
46
|
wonder browser probe <url> --save /tmp/wonder-browser-probe
|
|
@@ -38,9 +48,9 @@ wonder browser probe <url> --save /tmp/wonder-browser-probe
|
|
|
38
48
|
|
|
39
49
|
Use `--headed` only when login/debug visibility is needed.
|
|
40
50
|
|
|
41
|
-
|
|
51
|
+
4. If the probe is insufficient, continue with the available browser automation
|
|
42
52
|
runtime.
|
|
43
|
-
|
|
53
|
+
5. Capture non-destructive evidence:
|
|
44
54
|
- page URL and document type
|
|
45
55
|
- visible title
|
|
46
56
|
- loaded script/runtime globals that look like document stores
|
|
@@ -48,16 +58,16 @@ Use `--headed` only when login/debug visibility is needed.
|
|
|
48
58
|
media URLs
|
|
49
59
|
- WebSocket messages if they expose structured document operations
|
|
50
60
|
- screenshots only as a fallback or visual cross-check
|
|
51
|
-
|
|
61
|
+
6. For tables, specifically look for:
|
|
52
62
|
- cell coordinates
|
|
53
63
|
- displayed text
|
|
54
64
|
- merge ranges
|
|
55
65
|
- row/column sizes if available
|
|
56
66
|
- fixed cell images
|
|
57
67
|
- floating images with position, size, and anchor
|
|
58
|
-
|
|
68
|
+
7. For image-heavy docs, verify whether each image can be fetched as an original
|
|
59
69
|
resource or only appears in a raster screenshot.
|
|
60
|
-
|
|
70
|
+
8. Report `pass`, `partial`, or `fail` with the missing fields.
|
|
61
71
|
|
|
62
72
|
## Output Contract
|
|
63
73
|
|