onenote-cli 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.env.example +10 -0
- package/LICENSE +21 -0
- package/README.md +142 -0
- package/SKILL.md +77 -0
- package/bun.lock +92 -0
- package/docs/azure-app-registration.md +80 -0
- package/docs/development-notes.md +84 -0
- package/docs/graph-api-endpoints.md +95 -0
- package/docs/local-search-architecture.md +132 -0
- package/docs/onen0te-cli-analysis.md +130 -0
- package/docs/setup.md +141 -0
- package/docs/zhihu-intro.md +92 -0
- package/package.json +22 -0
- package/src/auth.ts +179 -0
- package/src/cache.ts +763 -0
- package/src/graph.ts +264 -0
- package/src/index.ts +408 -0
- package/tsconfig.json +14 -0
package/src/cache.ts
ADDED
|
@@ -0,0 +1,763 @@
|
|
|
1
|
+
import { getAccessToken } from "./auth";
|
|
2
|
+
import { listNotebooks } from "./graph";
|
|
3
|
+
import { readFile, writeFile, mkdir, readdir, stat } from "node:fs/promises";
|
|
4
|
+
import { join } from "node:path";
|
|
5
|
+
|
|
6
|
+
import { homedir } from "node:os";
|
|
7
|
+
import { dirname } from "node:path";
|
|
8
|
+
|
|
9
|
+
// Cache directory: defaults to <package root>/.onenote/cache (../.onenote/cache
// relative to src/, so the cache lives alongside .env.local). Override with the
// ONENOTE_CACHE_DIR environment variable.
// NOTE(review): an earlier comment promised a fallback to ~/.onenote-cli/cache when
// the package root is not writable, but no such fallback is implemented here
// (`homedir` is imported but unused on this path) — confirm intended behavior.
// NOTE(review): `import.meta.dir` is Bun-specific (directory of this module).
const PKG_ROOT = dirname(import.meta.dir);
const CACHE_DIR = process.env.ONENOTE_CACHE_DIR
  || join(PKG_ROOT, ".onenote", "cache");
|
|
15
|
+
|
|
16
|
+
// One cached page / search hit, as returned by searchLocal().
interface CachedPage {
  title: string; // page title (official OneNote API title when available, else extracted)
  body: string; // extracted text content, or the match context for binary-search hits
  section: string; // section display name (the .one file name without its extension)
  notebook: string; // notebook display name
  webUrl: string; // OneNote Online URL for this page (section + page GUID)
  pageGuid?: string; // page navigation GUID recovered from the .one binary, if known
}
|
|
24
|
+
|
|
25
|
+
// Shape of a top-level cache index document.
// NOTE(review): this interface is not referenced anywhere in this module —
// confirm whether it is still used elsewhere or is dead.
interface CacheIndex {
  updatedAt: string; // ISO timestamp of the last sync
  notebooks: {
    id: string;
    displayName: string;
    sections: {
      driveItemId: string;
      displayName: string;
      webUrl: string;
      drivePath: string; // path of the .one file under the user's OneDrive root
      cachedAt: string; // ISO timestamp when this section was cached
    }[];
  }[];
}
|
|
39
|
+
|
|
40
|
+
function getNotebookDrivePath(notebook: any): string | null {
|
|
41
|
+
const webUrl = notebook.links?.oneNoteWebUrl?.href;
|
|
42
|
+
if (!webUrl) return null;
|
|
43
|
+
const match = decodeURIComponent(new URL(webUrl).pathname).match(/Documents\/(.+)/);
|
|
44
|
+
return match?.[1] ?? null;
|
|
45
|
+
}
|
|
46
|
+
|
|
47
|
+
async function graphFetchRaw(path: string): Promise<Response> {
|
|
48
|
+
const token = await getAccessToken();
|
|
49
|
+
const url = path.startsWith("http")
|
|
50
|
+
? path
|
|
51
|
+
: `https://graph.microsoft.com/v1.0${path}`;
|
|
52
|
+
return fetch(url, {
|
|
53
|
+
headers: { Authorization: `Bearer ${token}` },
|
|
54
|
+
});
|
|
55
|
+
}
|
|
56
|
+
|
|
57
|
+
function isReadableChar(code: number): boolean {
|
|
58
|
+
return (
|
|
59
|
+
(code >= 0x20 && code <= 0x7e) || // ASCII printable
|
|
60
|
+
code === 0x0a || code === 0x0d || code === 0x09 || // whitespace
|
|
61
|
+
(code >= 0xa0 && code <= 0x024f) || // Latin Extended
|
|
62
|
+
(code >= 0x0370 && code <= 0x058f) || // Greek, Cyrillic, Armenian
|
|
63
|
+
(code >= 0x0600 && code <= 0x06ff) || // Arabic
|
|
64
|
+
(code >= 0x0900 && code <= 0x097f) || // Devanagari
|
|
65
|
+
(code >= 0x0e00 && code <= 0x0e7f) || // Thai
|
|
66
|
+
(code >= 0x1100 && code <= 0x11ff) || // Hangul Jamo
|
|
67
|
+
(code >= 0x2000 && code <= 0x206f) || // General Punctuation
|
|
68
|
+
(code >= 0x2100 && code <= 0x214f) || // Letterlike Symbols
|
|
69
|
+
(code >= 0x2190 && code <= 0x21ff) || // Arrows
|
|
70
|
+
(code >= 0x2200 && code <= 0x22ff) || // Mathematical Operators
|
|
71
|
+
(code >= 0x2500 && code <= 0x257f) || // Box Drawing
|
|
72
|
+
(code >= 0x3000 && code <= 0x303f) || // CJK Symbols and Punctuation
|
|
73
|
+
(code >= 0x3040 && code <= 0x309f) || // Hiragana
|
|
74
|
+
(code >= 0x30a0 && code <= 0x30ff) || // Katakana
|
|
75
|
+
(code >= 0x3100 && code <= 0x312f) || // Bopomofo
|
|
76
|
+
(code >= 0x3400 && code <= 0x4dbf) || // CJK Extension A
|
|
77
|
+
(code >= 0x4e00 && code <= 0x9fff) || // CJK Unified Ideographs
|
|
78
|
+
(code >= 0xac00 && code <= 0xd7af) || // Hangul Syllables
|
|
79
|
+
(code >= 0xf900 && code <= 0xfaff) || // CJK Compatibility Ideographs
|
|
80
|
+
(code >= 0xfe30 && code <= 0xfe4f) || // CJK Compatibility Forms
|
|
81
|
+
(code >= 0xff00 && code <= 0xffef) // Fullwidth Forms
|
|
82
|
+
);
|
|
83
|
+
}
|
|
84
|
+
|
|
85
|
+
function extractTextBlocks(buf: Buffer): { offset: number; text: string }[] {
|
|
86
|
+
const blocks: { offset: number; text: string }[] = [];
|
|
87
|
+
|
|
88
|
+
// Extract UTF-8 text blocks
|
|
89
|
+
const utf8 = buf.toString("utf-8");
|
|
90
|
+
let start = -1;
|
|
91
|
+
let chars = "";
|
|
92
|
+
for (let i = 0; i < utf8.length; i++) {
|
|
93
|
+
if (isReadableChar(utf8.charCodeAt(i))) {
|
|
94
|
+
if (start < 0) start = i;
|
|
95
|
+
chars += utf8[i];
|
|
96
|
+
} else {
|
|
97
|
+
if (chars.trim().length >= 6) blocks.push({ offset: start, text: chars.trim() });
|
|
98
|
+
chars = "";
|
|
99
|
+
start = -1;
|
|
100
|
+
}
|
|
101
|
+
}
|
|
102
|
+
if (chars.trim().length >= 6) blocks.push({ offset: start, text: chars.trim() });
|
|
103
|
+
|
|
104
|
+
// Extract UTF-16LE text blocks at both even and odd byte alignments
|
|
105
|
+
// (OneNote may have UTF-16LE strings starting at either alignment)
|
|
106
|
+
for (const startOffset of [0, 1]) {
|
|
107
|
+
start = -1;
|
|
108
|
+
chars = "";
|
|
109
|
+
for (let i = startOffset; i < buf.length - 1; i += 2) {
|
|
110
|
+
const code = buf[i] | (buf[i + 1] << 8);
|
|
111
|
+
if (isReadableChar(code)) {
|
|
112
|
+
if (start < 0) start = i;
|
|
113
|
+
chars += String.fromCharCode(code);
|
|
114
|
+
} else {
|
|
115
|
+
if (chars.trim().length >= 4) blocks.push({ offset: start, text: chars.trim() });
|
|
116
|
+
chars = "";
|
|
117
|
+
start = -1;
|
|
118
|
+
}
|
|
119
|
+
}
|
|
120
|
+
if (chars.trim().length >= 4) blocks.push({ offset: start, text: chars.trim() });
|
|
121
|
+
}
|
|
122
|
+
|
|
123
|
+
// Sort by offset
|
|
124
|
+
blocks.sort((a, b) => a.offset - b.offset);
|
|
125
|
+
|
|
126
|
+
// Filter noise: require blocks to have a reasonable ratio of common characters
|
|
127
|
+
return blocks.filter((b) => {
|
|
128
|
+
const common = b.text.replace(
|
|
129
|
+
/[^a-zA-Z0-9\u3040-\u30ff\u4e00-\u9fff\u3000-\u303f\uff00-\uffef\s.,;:!?@#\-_()[\]{}'"\/\\]/g,
|
|
130
|
+
""
|
|
131
|
+
);
|
|
132
|
+
if (common.length / b.text.length <= 0.6 || b.text.length < 4) return false;
|
|
133
|
+
|
|
134
|
+
// Detect misaligned UTF-16LE reading of ASCII: characters where low byte = 0x00
|
|
135
|
+
// and code is in the "shifted ASCII" range (0x2000-0x7E00 typically)
|
|
136
|
+
let shiftedAsciiCount = 0;
|
|
137
|
+
let cjkCount = 0;
|
|
138
|
+
for (const ch of b.text) {
|
|
139
|
+
const code = ch.charCodeAt(0);
|
|
140
|
+
if ((code & 0xff) === 0 && code >= 0x2000 && code <= 0x7f00) {
|
|
141
|
+
shiftedAsciiCount++;
|
|
142
|
+
}
|
|
143
|
+
if (code >= 0x4e00 && code <= 0x9fff) cjkCount++;
|
|
144
|
+
}
|
|
145
|
+
// If most characters look like shifted-ASCII garbage, reject
|
|
146
|
+
if (shiftedAsciiCount > 3 && shiftedAsciiCount / b.text.length > 0.5) return false;
|
|
147
|
+
return true;
|
|
148
|
+
});
|
|
149
|
+
}
|
|
150
|
+
|
|
151
|
+
function groupIntoPages(blocks: { offset: number; text: string }[]): { title: string; body: string }[] {
|
|
152
|
+
const pages: { title: string; body: string }[] = [];
|
|
153
|
+
let group: typeof blocks = [];
|
|
154
|
+
|
|
155
|
+
for (const block of blocks) {
|
|
156
|
+
if (group.length > 0) {
|
|
157
|
+
const prevEnd = group[group.length - 1].offset + group[group.length - 1].text.length * 2;
|
|
158
|
+
const gap = block.offset - prevEnd;
|
|
159
|
+
if (gap > 500) {
|
|
160
|
+
const body = group.map((b) => b.text).join("\n");
|
|
161
|
+
if (body.length > 10) {
|
|
162
|
+
const lines = body.split("\n").filter((l) => l.trim().length > 0);
|
|
163
|
+
pages.push({ title: lines[0]?.slice(0, 200) || "(untitled)", body });
|
|
164
|
+
}
|
|
165
|
+
group = [];
|
|
166
|
+
}
|
|
167
|
+
}
|
|
168
|
+
group.push(block);
|
|
169
|
+
}
|
|
170
|
+
if (group.length > 0) {
|
|
171
|
+
const body = group.map((b) => b.text).join("\n");
|
|
172
|
+
if (body.length > 10) {
|
|
173
|
+
const lines = body.split("\n").filter((l) => l.trim().length > 0);
|
|
174
|
+
pages.push({ title: lines[0]?.slice(0, 200) || "(untitled)", body });
|
|
175
|
+
}
|
|
176
|
+
}
|
|
177
|
+
|
|
178
|
+
return pages;
|
|
179
|
+
}
|
|
180
|
+
|
|
181
|
+
function bufToGuid(b: Buffer, off: number): string {
|
|
182
|
+
return [
|
|
183
|
+
b.readUInt32LE(off).toString(16).padStart(8, "0"),
|
|
184
|
+
b.readUInt16LE(off + 4).toString(16).padStart(4, "0"),
|
|
185
|
+
b.readUInt16LE(off + 6).toString(16).padStart(4, "0"),
|
|
186
|
+
b.slice(off + 8, off + 10).toString("hex"),
|
|
187
|
+
b.slice(off + 10, off + 16).toString("hex"),
|
|
188
|
+
].join("-");
|
|
189
|
+
}
|
|
190
|
+
|
|
191
|
+
/**
 * Extract (pageGuid, title, offset) tuples from a raw .one binary.
 *
 * Heuristic: page metadata appears as
 *   [UTF-16LE title] 00 00 [10 00 00 00] [16-byte GUID]
 * so we scan for the 16-byte size marker followed by a plausible UUIDv4 and
 * then walk backwards over UTF-16LE code units to recover the title.
 *
 * @param buf raw section (.one) file contents
 * @returns unique GUIDs with their recovered titles and byte offsets, in scan order
 */
export function extractPageGuids(
  buf: Buffer
): { guid: string; title: string; offset: number }[] {
  const results: { guid: string; title: string; offset: number }[] = [];
  // Each GUID is reported once, at its first occurrence.
  const seen = new Set<string>();

  for (let i = 0; i < buf.length - 20; i++) {
    // Size marker 10 00 00 00 (uint32 LE = 16)
    if (buf[i] !== 0x10 || buf[i + 1] !== 0 || buf[i + 2] !== 0 || buf[i + 3] !== 0) continue;

    // Check GUID validity (UUIDv4: version=4, variant=8-B)
    const v = (buf[i + 4 + 7] >> 4) & 0xf;
    const vr = (buf[i + 4 + 8] >> 4) & 0xf;
    if (v !== 4 || vr < 8 || vr > 0xb) continue;

    const guid = bufToGuid(buf, i + 4);
    if (seen.has(guid)) continue;

    // Walk backwards from i to find UTF-16LE title
    let j = i - 2;
    if (j >= 0 && buf[j] === 0 && buf[j + 1] === 0) j -= 2; // skip terminator
    let chars = "";
    while (j >= 0 && chars.length < 200) {
      const code = buf[j] | (buf[j + 1] << 8);
      // Accept printable ASCII and most BMP letters/symbols as title characters.
      if ((code >= 0x20 && code <= 0x7e) || (code >= 0xa0 && code <= 0xffef)) {
        chars = String.fromCharCode(code) + chars;
        j -= 2;
      } else {
        break;
      }
    }

    if (chars.length >= 3 && chars.length < 200) {
      // Filter out garbage titles: must contain a meaningful ratio of "real" characters
      // (ASCII letters/digits, common CJK ideographs, hiragana/katakana, punctuation)
      const meaningful = chars.replace(
        /[^a-zA-Z0-9\u3040-\u30ff\u4e00-\u9fff\u3000-\u303f\uff00-\uffef\s.,;:!?@#\-_()[\]{}'"\/\\]/g,
        ""
      );
      if (meaningful.length / chars.length < 0.7) continue;

      // Reject titles where most CJK chars are likely shifted-ASCII garbage
      // (low byte 0x00 with a code point at or above 0x2000).
      let shiftedCount = 0;
      for (const ch of chars) {
        const code = ch.charCodeAt(0);
        if ((code & 0xff) === 0 && code >= 0x2000) shiftedCount++;
      }
      if (shiftedCount > 2 && shiftedCount / chars.length > 0.3) continue;

      // Reject embedded object titles like ".jpg", ".png", ".pdf" that are
      // attachment GUIDs, not page GUIDs
      const trimmed = chars.trim();
      if (/^\.[a-z0-9]{2,5}$/i.test(trimmed)) continue;

      seen.add(guid);
      results.push({ guid, title: trimmed, offset: i });
    }
  }

  return results;
}
|
|
256
|
+
|
|
257
|
+
/**
 * Extract logical pages (title + body + GUID when known) from a .one binary.
 *
 * When GUID anchors are present, text blocks are attributed to pages two ways:
 * a block that equals (or starts with) a known page title switches the current
 * page; all other blocks are attached to the page of the nearest preceding
 * anchor offset. When no anchors are found, falls back to gap-based grouping.
 *
 * @param buf raw section (.one) file contents
 * @returns pages with extracted titles/bodies and (when known) page GUIDs
 */
export function extractPages(
  buf: Buffer
): { title: string; body: string; pageGuid?: string }[] {
  const blocks = extractTextBlocks(buf);
  const guidEntries = extractPageGuids(buf);

  // No anchors: group text blocks purely by binary gaps.
  if (guidEntries.length === 0) {
    return groupIntoPages(blocks).map((p) => ({ ...p, pageGuid: undefined }));
  }

  // Sort guid entries by offset and dedupe to get unique pages with their FIRST offset
  guidEntries.sort((a, b) => a.offset - b.offset);
  const firstOffsetByGuid = new Map<string, { title: string; offset: number }>();
  const titleByGuid = new Map<string, string>();
  for (const e of guidEntries) {
    if (!firstOffsetByGuid.has(e.guid)) {
      firstOffsetByGuid.set(e.guid, { title: e.title.trim(), offset: e.offset });
      titleByGuid.set(e.guid, e.title.trim());
    }
  }

  // Build sorted list of (offset, guid) anchors using ALL occurrences
  const anchors = guidEntries
    .map((e) => ({ offset: e.offset, guid: e.guid }))
    .sort((a, b) => a.offset - b.offset);

  // Build a title -> guid map for boundary detection
  const titleToGuidMap = new Map<string, string>();
  for (const [guid, title] of titleByGuid) {
    if (title.length >= 4) titleToGuidMap.set(title, guid);
  }
  // Sort known titles by length desc for greedy match
  const knownTitlesSorted = [...titleToGuidMap.keys()].sort((a, b) => b.length - a.length);

  // For each text block:
  // - If it matches a known page title, switch to that page's GUID
  // - Otherwise, append to the current page's body
  const bodiesByGuid = new Map<string, string[]>();
  let anchorIdx = 0;
  let currentGuid: string | undefined;
  for (const block of blocks) {
    // Check if this block IS a known page title (boundary)
    let titleMatch: string | undefined;
    for (const t of knownTitlesSorted) {
      if (block.text === t || block.text.startsWith(t)) {
        titleMatch = t;
        break;
      }
    }
    if (titleMatch) {
      currentGuid = titleToGuidMap.get(titleMatch);
      // Skip pushing the title text into the body to avoid noise
      continue;
    }

    // Otherwise advance anchor by offset
    while (anchorIdx < anchors.length && anchors[anchorIdx].offset <= block.offset) {
      currentGuid = anchors[anchorIdx].guid;
      anchorIdx++;
    }
    if (!currentGuid) continue;
    const arr = bodiesByGuid.get(currentGuid) || [];
    arr.push(block.text);
    bodiesByGuid.set(currentGuid, arr);
  }

  // Build final pages; drop entries with neither meaningful body nor usable title.
  const pages: { title: string; body: string; pageGuid?: string }[] = [];
  for (const [guid, info] of firstOffsetByGuid) {
    const body = (bodiesByGuid.get(guid) || []).join("\n");
    if (body.length < 5 && info.title.length < 3) continue;
    pages.push({ title: info.title || "(untitled)", body: body || info.title, pageGuid: guid });
  }
  return pages;
}
|
|
332
|
+
|
|
333
|
+
async function ensureDir(dir: string) {
|
|
334
|
+
await mkdir(dir, { recursive: true });
|
|
335
|
+
}
|
|
336
|
+
|
|
337
|
+
async function downloadSection(
|
|
338
|
+
drivePath: string
|
|
339
|
+
): Promise<Buffer | null> {
|
|
340
|
+
const encoded = drivePath
|
|
341
|
+
.split("/")
|
|
342
|
+
.map((s) => encodeURIComponent(s))
|
|
343
|
+
.join("/");
|
|
344
|
+
const res = await graphFetchRaw(`/me/drive/root:/${encoded}:/content`);
|
|
345
|
+
if (!res.ok) return null;
|
|
346
|
+
return Buffer.from(await res.arrayBuffer());
|
|
347
|
+
}
|
|
348
|
+
|
|
349
|
+
async function getSectionWebUrl(drivePath: string): Promise<string> {
|
|
350
|
+
const encoded = drivePath
|
|
351
|
+
.split("/")
|
|
352
|
+
.map((s) => encodeURIComponent(s))
|
|
353
|
+
.join("/");
|
|
354
|
+
try {
|
|
355
|
+
const res = await graphFetchRaw(
|
|
356
|
+
`/me/drive/root:/${encoded}?$select=webUrl`
|
|
357
|
+
);
|
|
358
|
+
if (res.ok) {
|
|
359
|
+
const item = (await res.json()) as any;
|
|
360
|
+
return item.webUrl?.split("&mobileredirect")[0] ?? "";
|
|
361
|
+
}
|
|
362
|
+
} catch {}
|
|
363
|
+
return "";
|
|
364
|
+
}
|
|
365
|
+
|
|
366
|
+
/**
|
|
367
|
+
* Get OneNote pages for a section via Graph API.
|
|
368
|
+
* Uses the `0-{guid}` ID prefix which works even when the 5,000-item limit
|
|
369
|
+
* blocks listing endpoints.
|
|
370
|
+
*/
|
|
371
|
+
async function getOneNotePagesForSection(
|
|
372
|
+
sectionGuid: string
|
|
373
|
+
): Promise<{ id: string; title: string; webUrl: string }[]> {
|
|
374
|
+
try {
|
|
375
|
+
const res = await graphFetchRaw(
|
|
376
|
+
`/me/onenote/sections/0-${sectionGuid}/pages?$select=id,title,links&$top=100`
|
|
377
|
+
);
|
|
378
|
+
if (!res.ok) return [];
|
|
379
|
+
const data = (await res.json()) as any;
|
|
380
|
+
return (data.value ?? []).map((p: any) => ({
|
|
381
|
+
id: p.id,
|
|
382
|
+
title: p.title ?? "",
|
|
383
|
+
webUrl: p.links?.oneNoteWebUrl?.href ?? "",
|
|
384
|
+
}));
|
|
385
|
+
} catch {
|
|
386
|
+
return [];
|
|
387
|
+
}
|
|
388
|
+
}
|
|
389
|
+
|
|
390
|
+
/**
|
|
391
|
+
* Extract the page navigation GUID from the OneNote oneNoteWebUrl.
|
|
392
|
+
* The webUrl contains `wd=target(...|{lastGuid}/)` where lastGuid is the page GUID
|
|
393
|
+
* used for navigation (matches what we extract from the binary).
|
|
394
|
+
*/
|
|
395
|
+
function pageGuidFromWebUrl(webUrl: string): string | null {
|
|
396
|
+
if (!webUrl) return null;
|
|
397
|
+
// Find the LAST GUID in the URL (the page-level one, after the section group)
|
|
398
|
+
// Pattern: ...{guid1}/{title}|{pageGuid}/)
|
|
399
|
+
const decoded = decodeURIComponent(webUrl);
|
|
400
|
+
const matches = decoded.match(/[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}/gi);
|
|
401
|
+
if (!matches || matches.length === 0) return null;
|
|
402
|
+
return matches[matches.length - 1].toLowerCase();
|
|
403
|
+
}
|
|
404
|
+
|
|
405
|
+
async function listSectionFiles(
|
|
406
|
+
notebook: any
|
|
407
|
+
): Promise<{ name: string; drivePath: string }[]> {
|
|
408
|
+
const nbPath = getNotebookDrivePath(notebook);
|
|
409
|
+
if (!nbPath) return [];
|
|
410
|
+
const encoded = nbPath
|
|
411
|
+
.split("/")
|
|
412
|
+
.map((s) => encodeURIComponent(s))
|
|
413
|
+
.join("/");
|
|
414
|
+
try {
|
|
415
|
+
const res = await graphFetchRaw(
|
|
416
|
+
`/me/drive/root:/${encoded}:/children?$select=name,id,file,size&$top=200`
|
|
417
|
+
);
|
|
418
|
+
if (!res.ok) return [];
|
|
419
|
+
const data = (await res.json()) as any;
|
|
420
|
+
return (data.value ?? [])
|
|
421
|
+
.filter((f: any) => f.name?.endsWith(".one"))
|
|
422
|
+
.map((f: any) => ({
|
|
423
|
+
name: f.name.replace(/\.one$/, ""),
|
|
424
|
+
drivePath: `${nbPath}/${f.name}`,
|
|
425
|
+
}));
|
|
426
|
+
} catch {
|
|
427
|
+
return [];
|
|
428
|
+
}
|
|
429
|
+
}
|
|
430
|
+
|
|
431
|
+
/**
 * Sync all notebooks/sections into the local cache directory.
 *
 * For each section: skips it when a cache file newer than one hour exists,
 * otherwise downloads the raw .one file, extracts pages and GUID anchors,
 * fetches the official OneNote page list when possible, and writes a JSON
 * cache file (plus the raw .one binary for sections under 50MB).
 *
 * @param onProgress optional progress logger; defaults to console.log
 */
export async function syncCache(
  onProgress?: (msg: string) => void
): Promise<void> {
  await ensureDir(CACHE_DIR);
  const log = onProgress ?? console.log;

  const notebooks = await listNotebooks();
  log(`Found ${notebooks.length} notebooks`);

  for (const nb of notebooks) {
    // One cache subdirectory per notebook, named by display name.
    const nbDir = join(CACHE_DIR, nb.displayName);
    await ensureDir(nbDir);

    const sections = await listSectionFiles(nb);
    log(`  ${nb.displayName}: ${sections.length} sections`);

    for (const sec of sections) {
      const cachePath = join(nbDir, `${sec.name}.json`);

      // Check if cache exists and is recent (< 1 hour)
      try {
        const s = await stat(cachePath);
        const age = Date.now() - s.mtimeMs;
        if (age < 3600_000) continue;
      } catch {}

      log(`    [downloading] ${sec.name}...`);
      const buf = await downloadSection(sec.drivePath);
      if (!buf) {
        log(`    [failed] ${sec.name}`);
        continue;
      }

      const pages = extractPages(buf);
      const guidEntries = extractPageGuids(buf).sort((a, b) => a.offset - b.offset);
      const webUrl = await getSectionWebUrl(sec.drivePath);

      // Extract section's sourcedoc GUID from webUrl, then fetch official page list via OneNote API
      const sourcedocMatch = webUrl.match(/sourcedoc=%7B([0-9a-f-]+)%7D/i);
      const sectionGuid = sourcedocMatch?.[1]?.toLowerCase();
      const officialPages = sectionGuid
        ? await getOneNotePagesForSection(sectionGuid)
        : [];
      // Map: pageGuid -> official webUrl (extract GUID from webUrl, not from id)
      const officialUrlByGuid = new Map<string, { url: string; title: string }>();
      for (const op of officialPages) {
        const guid = pageGuidFromWebUrl(op.webUrl);
        if (guid && op.webUrl) officialUrlByGuid.set(guid, { url: op.webUrl, title: op.title });
      }

      // Cache the raw .one binary alongside the JSON for accurate
      // position-based search; only for sections under 50MB to control
      // disk usage. (The binary is written verbatim, not base64.)
      const includeRaw = buf.length < 50 * 1024 * 1024;

      const cacheData = {
        section: sec.name,
        notebook: nb.displayName,
        webUrl,
        pages: pages.map((p: any) => ({
          title: p.title,
          body: p.body,
          pageGuid: p.pageGuid,
          officialUrl: p.pageGuid ? officialUrlByGuid.get(p.pageGuid)?.url : undefined,
        })),
        // Use OneNote API page list as authoritative anchor list (with official URLs)
        officialPages: officialPages.map((p) => ({
          guid: pageGuidFromWebUrl(p.webUrl),
          title: p.title,
          webUrl: p.webUrl,
        })),
        anchors: guidEntries.map((e) => ({ offset: e.offset, guid: e.guid, title: e.title })),
        rawSize: buf.length,
        cachedAt: new Date().toISOString(),
      };

      // Save raw .one file alongside JSON for binary search
      if (includeRaw) {
        const binPath = cachePath.replace(/\.json$/, ".one");
        await writeFile(binPath, buf);
      }

      await writeFile(cachePath, JSON.stringify(cacheData));
      log(`    [ok] ${sec.name} (${pages.length} pages)`);
    }
  }
  log("Sync complete.");
}
|
|
518
|
+
|
|
519
|
+
/**
|
|
520
|
+
* Build a page-level OneNote Online URL.
|
|
521
|
+
* Format: {sectionUrl}&wd=target({escapedTitle}|{pageGuid}/)
|
|
522
|
+
*
|
|
523
|
+
* Note: OneNote Online caches the user's last-viewed page within a section.
|
|
524
|
+
* When opened in a session that has previously viewed the section, OneNote may
|
|
525
|
+
* redirect to the cached page instead of honoring the wd=target parameter.
|
|
526
|
+
* The URL is still a correct page-level permalink.
|
|
527
|
+
*/
|
|
528
|
+
function buildPageUrl(
|
|
529
|
+
sectionUrl: string,
|
|
530
|
+
pageTitle: string,
|
|
531
|
+
pageGuid?: string
|
|
532
|
+
): string {
|
|
533
|
+
if (!pageGuid || !sectionUrl) return sectionUrl;
|
|
534
|
+
// OneNote escapes only `)` and `|` in titles with `\`
|
|
535
|
+
const escapedTitle = pageTitle
|
|
536
|
+
.replace(/\\/g, "\\\\")
|
|
537
|
+
.replace(/\)/g, "\\)")
|
|
538
|
+
.replace(/\|/g, "\\|");
|
|
539
|
+
const wd = `target(${escapedTitle}|${pageGuid}/)`;
|
|
540
|
+
// Strict encoding (also encode parens) to match OneNote's own URL format
|
|
541
|
+
const encoded = encodeURIComponent(wd).replace(
|
|
542
|
+
/[!'()*]/g,
|
|
543
|
+
(c) => "%" + c.charCodeAt(0).toString(16).toUpperCase()
|
|
544
|
+
);
|
|
545
|
+
const separator = sectionUrl.includes("?") ? "&" : "?";
|
|
546
|
+
return `${sectionUrl}${separator}wd=${encoded}`;
|
|
547
|
+
}
|
|
548
|
+
|
|
549
|
+
export async function isCacheEmpty(): Promise<boolean> {
|
|
550
|
+
try {
|
|
551
|
+
const nbDirs = await readdir(CACHE_DIR);
|
|
552
|
+
for (const nb of nbDirs) {
|
|
553
|
+
const nbDir = join(CACHE_DIR, nb);
|
|
554
|
+
const s = await stat(nbDir);
|
|
555
|
+
if (!s.isDirectory()) continue;
|
|
556
|
+
const files = await readdir(nbDir);
|
|
557
|
+
if (files.some((f) => f.endsWith(".json"))) return false;
|
|
558
|
+
}
|
|
559
|
+
} catch {}
|
|
560
|
+
return true;
|
|
561
|
+
}
|
|
562
|
+
|
|
563
|
+
/**
|
|
564
|
+
* Find all binary positions where `needle` appears in `buf`.
|
|
565
|
+
* Searches for both UTF-8 and UTF-16LE encodings.
|
|
566
|
+
*/
|
|
567
|
+
function findAllInBinary(buf: Buffer, needle: string): number[] {
|
|
568
|
+
const results: number[] = [];
|
|
569
|
+
const utf8 = Buffer.from(needle, "utf-8");
|
|
570
|
+
for (let i = 0; i < buf.length - utf8.length; i++) {
|
|
571
|
+
let m = true;
|
|
572
|
+
for (let j = 0; j < utf8.length; j++) if (buf[i + j] !== utf8[j]) { m = false; break; }
|
|
573
|
+
if (m) results.push(i);
|
|
574
|
+
}
|
|
575
|
+
const utf16 = Buffer.from(needle, "utf16le");
|
|
576
|
+
for (let i = 0; i < buf.length - utf16.length; i++) {
|
|
577
|
+
let m = true;
|
|
578
|
+
for (let j = 0; j < utf16.length; j++) if (buf[i + j] !== utf16[j]) { m = false; break; }
|
|
579
|
+
if (m) results.push(i);
|
|
580
|
+
}
|
|
581
|
+
return results;
|
|
582
|
+
}
|
|
583
|
+
|
|
584
|
+
function getNearestPrecedingAnchor(
|
|
585
|
+
anchors: { offset: number; guid: string; title: string }[],
|
|
586
|
+
pos: number
|
|
587
|
+
): { guid: string; title: string } | null {
|
|
588
|
+
let best: { guid: string; title: string } | null = null;
|
|
589
|
+
for (const a of anchors) {
|
|
590
|
+
if (a.offset <= pos) best = { guid: a.guid, title: a.title };
|
|
591
|
+
else break;
|
|
592
|
+
}
|
|
593
|
+
return best;
|
|
594
|
+
}
|
|
595
|
+
|
|
596
|
+
/**
|
|
597
|
+
* Find the page that owns this binary position by checking which page title
|
|
598
|
+
* appears in the surrounding context. Falls back to nearest preceding anchor.
|
|
599
|
+
*/
|
|
600
|
+
function findOwnerPage(
|
|
601
|
+
buf: Buffer,
|
|
602
|
+
anchors: { offset: number; guid: string; title: string }[],
|
|
603
|
+
pos: number
|
|
604
|
+
): { guid: string; title: string } | null {
|
|
605
|
+
// Get a 20KB context around the match
|
|
606
|
+
const ctxStart = Math.max(0, pos - 10000);
|
|
607
|
+
const ctxEnd = Math.min(buf.length, pos + 10000);
|
|
608
|
+
const ctx = buf.slice(ctxStart, ctxEnd);
|
|
609
|
+
// Decode as both UTF-8 and UTF-16LE to catch all titles
|
|
610
|
+
const ctxUtf8 = ctx.toString("utf-8");
|
|
611
|
+
const ctxUtf16 = ctx.toString("utf16le");
|
|
612
|
+
|
|
613
|
+
// For each known anchor, check if its title appears in context.
|
|
614
|
+
// Prefer the longest matching title (more specific = more likely correct).
|
|
615
|
+
const candidates: { anchor: typeof anchors[number]; matchLen: number }[] = [];
|
|
616
|
+
for (const a of anchors) {
|
|
617
|
+
const t = a.title.trim();
|
|
618
|
+
if (t.length < 4) continue;
|
|
619
|
+
// Use a substring of the title (first 30 chars) for matching
|
|
620
|
+
const probe = t.slice(0, 30);
|
|
621
|
+
if (ctxUtf8.includes(probe) || ctxUtf16.includes(probe)) {
|
|
622
|
+
candidates.push({ anchor: a, matchLen: probe.length });
|
|
623
|
+
}
|
|
624
|
+
}
|
|
625
|
+
|
|
626
|
+
if (candidates.length > 0) {
|
|
627
|
+
// Return the candidate with the longest matched title
|
|
628
|
+
candidates.sort((a, b) => b.matchLen - a.matchLen);
|
|
629
|
+
return { guid: candidates[0].anchor.guid, title: candidates[0].anchor.title };
|
|
630
|
+
}
|
|
631
|
+
|
|
632
|
+
// Fallback: nearest preceding anchor
|
|
633
|
+
return getNearestPrecedingAnchor(anchors, pos);
|
|
634
|
+
}
|
|
635
|
+
|
|
636
|
+
function extractContextFromBinary(
|
|
637
|
+
buf: Buffer,
|
|
638
|
+
pos: number,
|
|
639
|
+
query: string
|
|
640
|
+
): string {
|
|
641
|
+
// Try both UTF-8 and UTF-16LE; pick the one that contains the query
|
|
642
|
+
// Use a wider context to be sure
|
|
643
|
+
const queryByteLen = Math.max(
|
|
644
|
+
Buffer.byteLength(query, "utf-8"),
|
|
645
|
+
Buffer.byteLength(query, "utf16le")
|
|
646
|
+
);
|
|
647
|
+
const start = Math.max(0, pos - 200);
|
|
648
|
+
const end = Math.min(buf.length, pos + queryByteLen + 200);
|
|
649
|
+
const segment = buf.slice(start, end);
|
|
650
|
+
|
|
651
|
+
const cleanText = (s: string): string =>
|
|
652
|
+
s.replace(/[\x00-\x1F\x7F\uFFFD]/g, " ").replace(/\s+/g, " ").trim();
|
|
653
|
+
|
|
654
|
+
// Try UTF-8
|
|
655
|
+
const utf8Text = cleanText(segment.toString("utf-8"));
|
|
656
|
+
if (utf8Text.includes(query)) return utf8Text;
|
|
657
|
+
|
|
658
|
+
// Try UTF-16LE at this offset and offset+1
|
|
659
|
+
const utf16Text = cleanText(segment.toString("utf16le"));
|
|
660
|
+
if (utf16Text.includes(query)) return utf16Text;
|
|
661
|
+
|
|
662
|
+
// Try UTF-16LE with shifted alignment
|
|
663
|
+
if (segment.length > 1) {
|
|
664
|
+
const shifted = cleanText(segment.slice(1).toString("utf16le"));
|
|
665
|
+
if (shifted.includes(query)) return shifted;
|
|
666
|
+
}
|
|
667
|
+
|
|
668
|
+
// Fallback: return UTF-8 even if it doesn't contain the query
|
|
669
|
+
return utf8Text || utf16Text;
|
|
670
|
+
}
|
|
671
|
+
|
|
672
|
+
/**
 * Search the local cache for `query`.
 *
 * For each cached section: when the raw .one binary and GUID anchors are
 * available, performs a byte-level search (case-sensitive, UTF-8 and
 * UTF-16LE) and attributes hits to pages via context/anchor lookup;
 * otherwise falls back to a case-insensitive substring search over the
 * extracted page bodies. Unreadable entries are skipped silently.
 *
 * @param query search text
 * @returns one CachedPage per matching page, with a page-level URL when known
 */
export async function searchLocal(query: string): Promise<CachedPage[]> {
  const results: CachedPage[] = [];

  let nbDirs: string[];
  try {
    nbDirs = await readdir(CACHE_DIR);
  } catch {
    // No cache directory yet -> no results.
    return results;
  }

  for (const nbName of nbDirs) {
    const nbDir = join(CACHE_DIR, nbName);
    let files: string[];
    try {
      const s = await stat(nbDir);
      if (!s.isDirectory()) continue;
      files = await readdir(nbDir);
    } catch {
      continue;
    }

    for (const file of files) {
      if (!file.endsWith(".json")) continue;
      try {
        const jsonPath = join(nbDir, file);
        const raw = await readFile(jsonPath, "utf-8");
        const data = JSON.parse(raw);

        // Try binary-based search first if .one is cached
        const binPath = jsonPath.replace(/\.json$/, ".one");
        let binBuf: Buffer | null = null;
        try {
          binBuf = await readFile(binPath);
        } catch {}

        if (binBuf && data.anchors) {
          const positions = findAllInBinary(binBuf, query);
          // Group positions by owner page (using context-based lookup)
          const byGuid = new Map<string, { title: string; positions: number[] }>();
          for (const pos of positions) {
            const anchor = findOwnerPage(binBuf, data.anchors, pos);
            if (!anchor) continue;
            const existing = byGuid.get(anchor.guid);
            if (existing) existing.positions.push(pos);
            else byGuid.set(anchor.guid, { title: anchor.title, positions: [pos] });
          }
          // Build map of official URLs by GUID (from OneNote API)
          const officialByGuid = new Map<string, { url: string; title: string }>();
          for (const op of data.officialPages ?? []) {
            if (op.guid && op.webUrl) officialByGuid.set(op.guid, { url: op.webUrl, title: op.title });
          }

          // One result per matched page; context is taken around the first hit.
          for (const [guid, info] of byGuid) {
            const firstPos = info.positions[0];
            const context = extractContextFromBinary(binBuf, firstPos, query);
            const official = officialByGuid.get(guid);
            // Prefer the official OneNote URL/title when the API provided one.
            const pageUrl = official?.url ?? buildPageUrl(data.webUrl, info.title, guid);
            const displayTitle = official?.title ?? info.title;
            results.push({
              title: displayTitle,
              body: context,
              section: data.section,
              notebook: data.notebook,
              webUrl: pageUrl,
              pageGuid: guid,
            });
          }
        } else {
          // Fallback: search in extracted page bodies
          const lowerQuery = query.toLowerCase();
          for (const page of data.pages ?? []) {
            if (page.body?.toLowerCase().includes(lowerQuery)) {
              const pageUrl = buildPageUrl(data.webUrl, page.title, page.pageGuid);
              results.push({
                title: page.title,
                body: page.body,
                section: data.section,
                notebook: data.notebook,
                webUrl: pageUrl,
                pageGuid: page.pageGuid,
              });
            }
          }
        }
      } catch {
        continue;
      }
    }
  }

  return results;
}
|