onenote-cli 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/cache.ts ADDED
@@ -0,0 +1,763 @@
1
+ import { getAccessToken } from "./auth";
2
+ import { listNotebooks } from "./graph";
3
+ import { readFile, writeFile, mkdir, readdir, stat } from "node:fs/promises";
4
+ import { join } from "node:path";
5
+
6
+ import { homedir } from "node:os";
7
+ import { dirname } from "node:path";
8
+
9
// Cache directory resolution, in precedence order:
//   1. ONENOTE_CACHE_DIR environment variable (explicit override)
//   2. <package root>/.onenote/cache, so the cache lives alongside .env.local
// NOTE(review): import.meta.dir is Bun-specific (Node exposes import.meta.url /
// import.meta.dirname instead) — confirm the intended runtime is Bun.
// NOTE(review): no home-directory fallback is implemented here, although
// `homedir` is imported above — only the env override and package root exist.
const PKG_ROOT = dirname(import.meta.dir);
const CACHE_DIR = process.env.ONENOTE_CACHE_DIR
  || join(PKG_ROOT, ".onenote", "cache");
15
+
16
// One search hit returned by searchLocal(): an extracted page (or the match
// context within a page) plus enough metadata to display and open it.
interface CachedPage {
  title: string;    // page title (from binary extraction or the OneNote API)
  body: string;     // extracted page text, or the context snippet around a binary match
  section: string;  // section display name (the .one file name without extension)
  notebook: string; // notebook display name
  webUrl: string; // OneNote Online URL for this page (section + page GUID)
  pageGuid?: string; // page GUID recovered from the .one binary, when found
}
24
+
25
// Shape of a cache index document describing all cached notebooks/sections.
// NOTE(review): this interface is not referenced anywhere in this file —
// confirm it is consumed elsewhere before relying on (or removing) it.
interface CacheIndex {
  updatedAt: string; // ISO timestamp of the last successful sync
  notebooks: {
    id: string;          // Graph notebook id
    displayName: string; // notebook display name
    sections: {
      driveItemId: string; // OneDrive item id of the .one file
      displayName: string; // section display name
      webUrl: string;      // OneNote Online URL for the section
      drivePath: string;   // path under /me/drive/root used for download
      cachedAt: string;    // ISO timestamp when this section was cached
    }[];
  }[];
}
39
+
40
+ function getNotebookDrivePath(notebook: any): string | null {
41
+ const webUrl = notebook.links?.oneNoteWebUrl?.href;
42
+ if (!webUrl) return null;
43
+ const match = decodeURIComponent(new URL(webUrl).pathname).match(/Documents\/(.+)/);
44
+ return match?.[1] ?? null;
45
+ }
46
+
47
+ async function graphFetchRaw(path: string): Promise<Response> {
48
+ const token = await getAccessToken();
49
+ const url = path.startsWith("http")
50
+ ? path
51
+ : `https://graph.microsoft.com/v1.0${path}`;
52
+ return fetch(url, {
53
+ headers: { Authorization: `Bearer ${token}` },
54
+ });
55
+ }
56
+
57
+ function isReadableChar(code: number): boolean {
58
+ return (
59
+ (code >= 0x20 && code <= 0x7e) || // ASCII printable
60
+ code === 0x0a || code === 0x0d || code === 0x09 || // whitespace
61
+ (code >= 0xa0 && code <= 0x024f) || // Latin Extended
62
+ (code >= 0x0370 && code <= 0x058f) || // Greek, Cyrillic, Armenian
63
+ (code >= 0x0600 && code <= 0x06ff) || // Arabic
64
+ (code >= 0x0900 && code <= 0x097f) || // Devanagari
65
+ (code >= 0x0e00 && code <= 0x0e7f) || // Thai
66
+ (code >= 0x1100 && code <= 0x11ff) || // Hangul Jamo
67
+ (code >= 0x2000 && code <= 0x206f) || // General Punctuation
68
+ (code >= 0x2100 && code <= 0x214f) || // Letterlike Symbols
69
+ (code >= 0x2190 && code <= 0x21ff) || // Arrows
70
+ (code >= 0x2200 && code <= 0x22ff) || // Mathematical Operators
71
+ (code >= 0x2500 && code <= 0x257f) || // Box Drawing
72
+ (code >= 0x3000 && code <= 0x303f) || // CJK Symbols and Punctuation
73
+ (code >= 0x3040 && code <= 0x309f) || // Hiragana
74
+ (code >= 0x30a0 && code <= 0x30ff) || // Katakana
75
+ (code >= 0x3100 && code <= 0x312f) || // Bopomofo
76
+ (code >= 0x3400 && code <= 0x4dbf) || // CJK Extension A
77
+ (code >= 0x4e00 && code <= 0x9fff) || // CJK Unified Ideographs
78
+ (code >= 0xac00 && code <= 0xd7af) || // Hangul Syllables
79
+ (code >= 0xf900 && code <= 0xfaff) || // CJK Compatibility Ideographs
80
+ (code >= 0xfe30 && code <= 0xfe4f) || // CJK Compatibility Forms
81
+ (code >= 0xff00 && code <= 0xffef) // Fullwidth Forms
82
+ );
83
+ }
84
+
85
/**
 * Scan a .one binary for runs of readable text.
 * Three passes: one over the UTF-8 decoding, and two over UTF-16LE at even
 * and odd byte alignments (OneNote strings can start at either). Runs are
 * trimmed, length-filtered, sorted by offset, and noise-filtered.
 * NOTE(review): the UTF-8 pass records JS string indices as `offset`, which
 * only equals the byte offset for pure-ASCII content — confirm callers
 * tolerate this for multibyte text.
 */
function extractTextBlocks(buf: Buffer): { offset: number; text: string }[] {
  const blocks: { offset: number; text: string }[] = [];

  // Pass 1: runs of readable characters in the UTF-8 decoding.
  const utf8 = buf.toString("utf-8");
  let start = -1;
  let chars = "";
  for (let i = 0; i < utf8.length; i++) {
    if (isReadableChar(utf8.charCodeAt(i))) {
      if (start < 0) start = i;
      chars += utf8[i];
    } else {
      // Run ended: keep it only if it has at least 6 non-space characters.
      if (chars.trim().length >= 6) blocks.push({ offset: start, text: chars.trim() });
      chars = "";
      start = -1;
    }
  }
  // Flush a run that extends to the end of the buffer.
  if (chars.trim().length >= 6) blocks.push({ offset: start, text: chars.trim() });

  // Pass 2+3: UTF-16LE runs at both even and odd byte alignments
  // (OneNote may have UTF-16LE strings starting at either alignment).
  for (const startOffset of [0, 1]) {
    start = -1;
    chars = "";
    for (let i = startOffset; i < buf.length - 1; i += 2) {
      const code = buf[i] | (buf[i + 1] << 8); // little-endian code unit
      if (isReadableChar(code)) {
        if (start < 0) start = i;
        chars += String.fromCharCode(code);
      } else {
        // Shorter minimum (4) than UTF-8: UTF-16 runs are rarer/noisier.
        if (chars.trim().length >= 4) blocks.push({ offset: start, text: chars.trim() });
        chars = "";
        start = -1;
      }
    }
    if (chars.trim().length >= 4) blocks.push({ offset: start, text: chars.trim() });
  }

  // Merge the three passes into a single offset-ordered stream.
  blocks.sort((a, b) => a.offset - b.offset);

  // Filter noise: require blocks to have a reasonable ratio of common characters
  // (ASCII alphanumerics, common CJK/kana ranges, punctuation, whitespace).
  return blocks.filter((b) => {
    const common = b.text.replace(
      /[^a-zA-Z0-9\u3040-\u30ff\u4e00-\u9fff\u3000-\u303f\uff00-\uffef\s.,;:!?@#\-_()[\]{}'"\/\\]/g,
      ""
    );
    if (common.length / b.text.length <= 0.6 || b.text.length < 4) return false;

    // Detect misaligned UTF-16LE reading of ASCII: characters where low byte = 0x00
    // and code is in the "shifted ASCII" range (0x2000-0x7E00 typically).
    let shiftedAsciiCount = 0;
    let cjkCount = 0;
    for (const ch of b.text) {
      const code = ch.charCodeAt(0);
      if ((code & 0xff) === 0 && code >= 0x2000 && code <= 0x7f00) {
        shiftedAsciiCount++;
      }
      // NOTE(review): cjkCount is computed but never used below.
      if (code >= 0x4e00 && code <= 0x9fff) cjkCount++;
    }
    // If most characters look like shifted-ASCII garbage, reject.
    if (shiftedAsciiCount > 3 && shiftedAsciiCount / b.text.length > 0.5) return false;
    return true;
  });
}
150
+
151
+ function groupIntoPages(blocks: { offset: number; text: string }[]): { title: string; body: string }[] {
152
+ const pages: { title: string; body: string }[] = [];
153
+ let group: typeof blocks = [];
154
+
155
+ for (const block of blocks) {
156
+ if (group.length > 0) {
157
+ const prevEnd = group[group.length - 1].offset + group[group.length - 1].text.length * 2;
158
+ const gap = block.offset - prevEnd;
159
+ if (gap > 500) {
160
+ const body = group.map((b) => b.text).join("\n");
161
+ if (body.length > 10) {
162
+ const lines = body.split("\n").filter((l) => l.trim().length > 0);
163
+ pages.push({ title: lines[0]?.slice(0, 200) || "(untitled)", body });
164
+ }
165
+ group = [];
166
+ }
167
+ }
168
+ group.push(block);
169
+ }
170
+ if (group.length > 0) {
171
+ const body = group.map((b) => b.text).join("\n");
172
+ if (body.length > 10) {
173
+ const lines = body.split("\n").filter((l) => l.trim().length > 0);
174
+ pages.push({ title: lines[0]?.slice(0, 200) || "(untitled)", body });
175
+ }
176
+ }
177
+
178
+ return pages;
179
+ }
180
+
181
+ function bufToGuid(b: Buffer, off: number): string {
182
+ return [
183
+ b.readUInt32LE(off).toString(16).padStart(8, "0"),
184
+ b.readUInt16LE(off + 4).toString(16).padStart(4, "0"),
185
+ b.readUInt16LE(off + 6).toString(16).padStart(4, "0"),
186
+ b.slice(off + 8, off + 10).toString("hex"),
187
+ b.slice(off + 10, off + 16).toString("hex"),
188
+ ].join("-");
189
+ }
190
+
191
/**
 * Extract (pageGuid, title, offset) tuples from a .one binary.
 * Scans for the byte pattern: [UTF-16LE title] 00 00 [10 00 00 00] [16-byte GUID],
 * i.e. a null-terminated UTF-16LE title immediately followed by a uint32
 * size marker (16) and a UUIDv4. Each GUID is reported once, at its first
 * occurrence, together with the title that precedes it.
 */
export function extractPageGuids(
  buf: Buffer
): { guid: string; title: string; offset: number }[] {
  const results: { guid: string; title: string; offset: number }[] = [];
  const seen = new Set<string>();

  for (let i = 0; i < buf.length - 20; i++) {
    // Size marker 10 00 00 00 (uint32 LE = 16) that precedes the GUID.
    if (buf[i] !== 0x10 || buf[i + 1] !== 0 || buf[i + 2] !== 0 || buf[i + 3] !== 0) continue;

    // Check GUID validity (UUIDv4: version nibble = 4, variant nibble = 8..B).
    // Byte 7 of the GUID holds the version nibble, byte 8 the variant.
    const v = (buf[i + 4 + 7] >> 4) & 0xf;
    const vr = (buf[i + 4 + 8] >> 4) & 0xf;
    if (v !== 4 || vr < 8 || vr > 0xb) continue;

    const guid = bufToGuid(buf, i + 4);
    if (seen.has(guid)) continue; // only the first occurrence per GUID

    // Walk backwards from i, two bytes at a time, collecting the UTF-16LE title.
    let j = i - 2;
    if (j >= 0 && buf[j] === 0 && buf[j + 1] === 0) j -= 2; // skip null terminator
    let chars = "";
    while (j >= 0 && chars.length < 200) {
      const code = buf[j] | (buf[j + 1] << 8);
      if ((code >= 0x20 && code <= 0x7e) || (code >= 0xa0 && code <= 0xffef)) {
        chars = String.fromCharCode(code) + chars; // prepend: walking backwards
        j -= 2;
      } else {
        break; // hit a non-text code unit — title starts here
      }
    }

    if (chars.length >= 3 && chars.length < 200) {
      // Filter out garbage titles: must contain a meaningful ratio of "real"
      // characters (ASCII letters/digits, common CJK, kana, punctuation).
      const meaningful = chars.replace(
        /[^a-zA-Z0-9\u3040-\u30ff\u4e00-\u9fff\u3000-\u303f\uff00-\uffef\s.,;:!?@#\-_()[\]{}'"\/\\]/g,
        ""
      );
      if (meaningful.length / chars.length < 0.7) continue;

      // Reject titles where many chars look like misaligned UTF-16LE reads of
      // ASCII (low byte 0x00, code >= 0x2000 — "shifted ASCII" garbage).
      let shiftedCount = 0;
      for (const ch of chars) {
        const code = ch.charCodeAt(0);
        if ((code & 0xff) === 0 && code >= 0x2000) shiftedCount++;
      }
      if (shiftedCount > 2 && shiftedCount / chars.length > 0.3) continue;

      // Reject embedded object names like ".jpg", ".png", ".pdf" — those are
      // attachment GUIDs, not page GUIDs.
      const trimmed = chars.trim();
      if (/^\.[a-z0-9]{2,5}$/i.test(trimmed)) continue;

      seen.add(guid);
      results.push({ guid, title: trimmed, offset: i });
    }
  }

  return results;
}
256
+
257
/**
 * Reconstruct pages (title + body + GUID) from a .one binary.
 * Combines the readable-text blocks with the GUID anchors: text between two
 * GUID anchors is attributed to the earlier anchor's page. Text blocks that
 * *are* a known page title act as explicit page boundaries and are excluded
 * from bodies. When no GUIDs are found, falls back to gap-based grouping.
 */
export function extractPages(
  buf: Buffer
): { title: string; body: string; pageGuid?: string }[] {
  const blocks = extractTextBlocks(buf);
  const guidEntries = extractPageGuids(buf);

  // No GUID anchors at all — fall back to byte-gap page grouping.
  if (guidEntries.length === 0) {
    return groupIntoPages(blocks).map((p) => ({ ...p, pageGuid: undefined }));
  }

  // Sort guid entries by offset and dedupe to get unique pages with their FIRST offset.
  guidEntries.sort((a, b) => a.offset - b.offset);
  const firstOffsetByGuid = new Map<string, { title: string; offset: number }>();
  const titleByGuid = new Map<string, string>();
  for (const e of guidEntries) {
    if (!firstOffsetByGuid.has(e.guid)) {
      firstOffsetByGuid.set(e.guid, { title: e.title.trim(), offset: e.offset });
      titleByGuid.set(e.guid, e.title.trim());
    }
  }

  // Build sorted list of (offset, guid) anchors using ALL occurrences,
  // so text after a repeated anchor is still attributed correctly.
  const anchors = guidEntries
    .map((e) => ({ offset: e.offset, guid: e.guid }))
    .sort((a, b) => a.offset - b.offset);

  // Build a title -> guid map for boundary detection (skip very short titles,
  // which would match too many unrelated blocks).
  const titleToGuidMap = new Map<string, string>();
  for (const [guid, title] of titleByGuid) {
    if (title.length >= 4) titleToGuidMap.set(title, guid);
  }
  // Sort known titles by length desc so the most specific title wins a prefix match.
  const knownTitlesSorted = [...titleToGuidMap.keys()].sort((a, b) => b.length - a.length);

  // For each text block:
  // - If it matches a known page title, switch to that page's GUID
  // - Otherwise, append it to the current page's body
  const bodiesByGuid = new Map<string, string[]>();
  let anchorIdx = 0; // monotone cursor into `anchors`; both are offset-ordered
  let currentGuid: string | undefined;
  for (const block of blocks) {
    // Check if this block IS a known page title (boundary).
    let titleMatch: string | undefined;
    for (const t of knownTitlesSorted) {
      if (block.text === t || block.text.startsWith(t)) {
        titleMatch = t;
        break;
      }
    }
    if (titleMatch) {
      currentGuid = titleToGuidMap.get(titleMatch);
      // Skip pushing the title text into the body to avoid noise.
      continue;
    }

    // Otherwise advance to the last anchor at or before this block's offset.
    while (anchorIdx < anchors.length && anchors[anchorIdx].offset <= block.offset) {
      currentGuid = anchors[anchorIdx].guid;
      anchorIdx++;
    }
    if (!currentGuid) continue; // text before the first anchor has no owner
    const arr = bodiesByGuid.get(currentGuid) || [];
    arr.push(block.text);
    bodiesByGuid.set(currentGuid, arr);
  }

  // Build final pages; drop entries that have neither a real body nor a real title.
  const pages: { title: string; body: string; pageGuid?: string }[] = [];
  for (const [guid, info] of firstOffsetByGuid) {
    const body = (bodiesByGuid.get(guid) || []).join("\n");
    if (body.length < 5 && info.title.length < 3) continue;
    pages.push({ title: info.title || "(untitled)", body: body || info.title, pageGuid: guid });
  }
  return pages;
}
332
+
333
+ async function ensureDir(dir: string) {
334
+ await mkdir(dir, { recursive: true });
335
+ }
336
+
337
+ async function downloadSection(
338
+ drivePath: string
339
+ ): Promise<Buffer | null> {
340
+ const encoded = drivePath
341
+ .split("/")
342
+ .map((s) => encodeURIComponent(s))
343
+ .join("/");
344
+ const res = await graphFetchRaw(`/me/drive/root:/${encoded}:/content`);
345
+ if (!res.ok) return null;
346
+ return Buffer.from(await res.arrayBuffer());
347
+ }
348
+
349
+ async function getSectionWebUrl(drivePath: string): Promise<string> {
350
+ const encoded = drivePath
351
+ .split("/")
352
+ .map((s) => encodeURIComponent(s))
353
+ .join("/");
354
+ try {
355
+ const res = await graphFetchRaw(
356
+ `/me/drive/root:/${encoded}?$select=webUrl`
357
+ );
358
+ if (res.ok) {
359
+ const item = (await res.json()) as any;
360
+ return item.webUrl?.split("&mobileredirect")[0] ?? "";
361
+ }
362
+ } catch {}
363
+ return "";
364
+ }
365
+
366
+ /**
367
+ * Get OneNote pages for a section via Graph API.
368
+ * Uses the `0-{guid}` ID prefix which works even when the 5,000-item limit
369
+ * blocks listing endpoints.
370
+ */
371
+ async function getOneNotePagesForSection(
372
+ sectionGuid: string
373
+ ): Promise<{ id: string; title: string; webUrl: string }[]> {
374
+ try {
375
+ const res = await graphFetchRaw(
376
+ `/me/onenote/sections/0-${sectionGuid}/pages?$select=id,title,links&$top=100`
377
+ );
378
+ if (!res.ok) return [];
379
+ const data = (await res.json()) as any;
380
+ return (data.value ?? []).map((p: any) => ({
381
+ id: p.id,
382
+ title: p.title ?? "",
383
+ webUrl: p.links?.oneNoteWebUrl?.href ?? "",
384
+ }));
385
+ } catch {
386
+ return [];
387
+ }
388
+ }
389
+
390
+ /**
391
+ * Extract the page navigation GUID from the OneNote oneNoteWebUrl.
392
+ * The webUrl contains `wd=target(...|{lastGuid}/)` where lastGuid is the page GUID
393
+ * used for navigation (matches what we extract from the binary).
394
+ */
395
+ function pageGuidFromWebUrl(webUrl: string): string | null {
396
+ if (!webUrl) return null;
397
+ // Find the LAST GUID in the URL (the page-level one, after the section group)
398
+ // Pattern: ...{guid1}/{title}|{pageGuid}/)
399
+ const decoded = decodeURIComponent(webUrl);
400
+ const matches = decoded.match(/[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}/gi);
401
+ if (!matches || matches.length === 0) return null;
402
+ return matches[matches.length - 1].toLowerCase();
403
+ }
404
+
405
+ async function listSectionFiles(
406
+ notebook: any
407
+ ): Promise<{ name: string; drivePath: string }[]> {
408
+ const nbPath = getNotebookDrivePath(notebook);
409
+ if (!nbPath) return [];
410
+ const encoded = nbPath
411
+ .split("/")
412
+ .map((s) => encodeURIComponent(s))
413
+ .join("/");
414
+ try {
415
+ const res = await graphFetchRaw(
416
+ `/me/drive/root:/${encoded}:/children?$select=name,id,file,size&$top=200`
417
+ );
418
+ if (!res.ok) return [];
419
+ const data = (await res.json()) as any;
420
+ return (data.value ?? [])
421
+ .filter((f: any) => f.name?.endsWith(".one"))
422
+ .map((f: any) => ({
423
+ name: f.name.replace(/\.one$/, ""),
424
+ drivePath: `${nbPath}/${f.name}`,
425
+ }));
426
+ } catch {
427
+ return [];
428
+ }
429
+ }
430
+
431
/**
 * Sync all notebooks into the local cache directory.
 * For each section .one file: skip if cached less than an hour ago, otherwise
 * download it, extract pages and GUID anchors, resolve the official OneNote
 * page list, and write `<notebook>/<section>.json` (plus the raw `.one`
 * binary for sections under 50MB, used later for position-based search).
 *
 * @param onProgress optional sink for progress messages (defaults to console.log)
 */
export async function syncCache(
  onProgress?: (msg: string) => void
): Promise<void> {
  await ensureDir(CACHE_DIR);
  const log = onProgress ?? console.log;

  const notebooks = await listNotebooks();
  log(`Found ${notebooks.length} notebooks`);

  for (const nb of notebooks) {
    // NOTE(review): displayName is used verbatim as a directory name — a name
    // containing path separators would escape nbDir; confirm inputs are safe.
    const nbDir = join(CACHE_DIR, nb.displayName);
    await ensureDir(nbDir);

    const sections = await listSectionFiles(nb);
    log(` ${nb.displayName}: ${sections.length} sections`);

    for (const sec of sections) {
      const cachePath = join(nbDir, `${sec.name}.json`);

      // Check if cache exists and is recent (< 1 hour)
      try {
        const s = await stat(cachePath);
        const age = Date.now() - s.mtimeMs;
        if (age < 3600_000) continue;
      } catch {}

      log(` [downloading] ${sec.name}...`);
      const buf = await downloadSection(sec.drivePath);
      if (!buf) {
        log(` [failed] ${sec.name}`);
        continue;
      }

      const pages = extractPages(buf);
      const guidEntries = extractPageGuids(buf).sort((a, b) => a.offset - b.offset);
      const webUrl = await getSectionWebUrl(sec.drivePath);

      // Extract section's sourcedoc GUID from webUrl, then fetch the official
      // page list via the OneNote API (authoritative titles + URLs).
      const sourcedocMatch = webUrl.match(/sourcedoc=%7B([0-9a-f-]+)%7D/i);
      const sectionGuid = sourcedocMatch?.[1]?.toLowerCase();
      const officialPages = sectionGuid
        ? await getOneNotePagesForSection(sectionGuid)
        : [];
      // Map: pageGuid -> official webUrl (extract GUID from webUrl, not from id).
      const officialUrlByGuid = new Map<string, { url: string; title: string }>();
      for (const op of officialPages) {
        const guid = pageGuidFromWebUrl(op.webUrl);
        if (guid && op.webUrl) officialUrlByGuid.set(guid, { url: op.webUrl, title: op.title });
      }

      // Keep the raw binary only for <50MB sections to control disk usage.
      const includeRaw = buf.length < 50 * 1024 * 1024;

      const cacheData = {
        section: sec.name,
        notebook: nb.displayName,
        webUrl,
        pages: pages.map((p: any) => ({
          title: p.title,
          body: p.body,
          pageGuid: p.pageGuid,
          officialUrl: p.pageGuid ? officialUrlByGuid.get(p.pageGuid)?.url : undefined,
        })),
        // Use OneNote API page list as authoritative anchor list (with official URLs).
        officialPages: officialPages.map((p) => ({
          guid: pageGuidFromWebUrl(p.webUrl),
          title: p.title,
          webUrl: p.webUrl,
        })),
        // Binary GUID anchors, offset-ordered, for position-based search.
        anchors: guidEntries.map((e) => ({ offset: e.offset, guid: e.guid, title: e.title })),
        rawSize: buf.length,
        cachedAt: new Date().toISOString(),
      };

      // Save raw .one file alongside JSON for binary search.
      if (includeRaw) {
        const binPath = cachePath.replace(/\.json$/, ".one");
        await writeFile(binPath, buf);
      }

      await writeFile(cachePath, JSON.stringify(cacheData));
      log(` [ok] ${sec.name} (${pages.length} pages)`);
    }
  }
  log("Sync complete.");
}
518
+
519
+ /**
520
+ * Build a page-level OneNote Online URL.
521
+ * Format: {sectionUrl}&wd=target({escapedTitle}|{pageGuid}/)
522
+ *
523
+ * Note: OneNote Online caches the user's last-viewed page within a section.
524
+ * When opened in a session that has previously viewed the section, OneNote may
525
+ * redirect to the cached page instead of honoring the wd=target parameter.
526
+ * The URL is still a correct page-level permalink.
527
+ */
528
+ function buildPageUrl(
529
+ sectionUrl: string,
530
+ pageTitle: string,
531
+ pageGuid?: string
532
+ ): string {
533
+ if (!pageGuid || !sectionUrl) return sectionUrl;
534
+ // OneNote escapes only `)` and `|` in titles with `\`
535
+ const escapedTitle = pageTitle
536
+ .replace(/\\/g, "\\\\")
537
+ .replace(/\)/g, "\\)")
538
+ .replace(/\|/g, "\\|");
539
+ const wd = `target(${escapedTitle}|${pageGuid}/)`;
540
+ // Strict encoding (also encode parens) to match OneNote's own URL format
541
+ const encoded = encodeURIComponent(wd).replace(
542
+ /[!'()*]/g,
543
+ (c) => "%" + c.charCodeAt(0).toString(16).toUpperCase()
544
+ );
545
+ const separator = sectionUrl.includes("?") ? "&" : "?";
546
+ return `${sectionUrl}${separator}wd=${encoded}`;
547
+ }
548
+
549
+ export async function isCacheEmpty(): Promise<boolean> {
550
+ try {
551
+ const nbDirs = await readdir(CACHE_DIR);
552
+ for (const nb of nbDirs) {
553
+ const nbDir = join(CACHE_DIR, nb);
554
+ const s = await stat(nbDir);
555
+ if (!s.isDirectory()) continue;
556
+ const files = await readdir(nbDir);
557
+ if (files.some((f) => f.endsWith(".json"))) return false;
558
+ }
559
+ } catch {}
560
+ return true;
561
+ }
562
+
563
+ /**
564
+ * Find all binary positions where `needle` appears in `buf`.
565
+ * Searches for both UTF-8 and UTF-16LE encodings.
566
+ */
567
+ function findAllInBinary(buf: Buffer, needle: string): number[] {
568
+ const results: number[] = [];
569
+ const utf8 = Buffer.from(needle, "utf-8");
570
+ for (let i = 0; i < buf.length - utf8.length; i++) {
571
+ let m = true;
572
+ for (let j = 0; j < utf8.length; j++) if (buf[i + j] !== utf8[j]) { m = false; break; }
573
+ if (m) results.push(i);
574
+ }
575
+ const utf16 = Buffer.from(needle, "utf16le");
576
+ for (let i = 0; i < buf.length - utf16.length; i++) {
577
+ let m = true;
578
+ for (let j = 0; j < utf16.length; j++) if (buf[i + j] !== utf16[j]) { m = false; break; }
579
+ if (m) results.push(i);
580
+ }
581
+ return results;
582
+ }
583
+
584
+ function getNearestPrecedingAnchor(
585
+ anchors: { offset: number; guid: string; title: string }[],
586
+ pos: number
587
+ ): { guid: string; title: string } | null {
588
+ let best: { guid: string; title: string } | null = null;
589
+ for (const a of anchors) {
590
+ if (a.offset <= pos) best = { guid: a.guid, title: a.title };
591
+ else break;
592
+ }
593
+ return best;
594
+ }
595
+
596
+ /**
597
+ * Find the page that owns this binary position by checking which page title
598
+ * appears in the surrounding context. Falls back to nearest preceding anchor.
599
+ */
600
+ function findOwnerPage(
601
+ buf: Buffer,
602
+ anchors: { offset: number; guid: string; title: string }[],
603
+ pos: number
604
+ ): { guid: string; title: string } | null {
605
+ // Get a 20KB context around the match
606
+ const ctxStart = Math.max(0, pos - 10000);
607
+ const ctxEnd = Math.min(buf.length, pos + 10000);
608
+ const ctx = buf.slice(ctxStart, ctxEnd);
609
+ // Decode as both UTF-8 and UTF-16LE to catch all titles
610
+ const ctxUtf8 = ctx.toString("utf-8");
611
+ const ctxUtf16 = ctx.toString("utf16le");
612
+
613
+ // For each known anchor, check if its title appears in context.
614
+ // Prefer the longest matching title (more specific = more likely correct).
615
+ const candidates: { anchor: typeof anchors[number]; matchLen: number }[] = [];
616
+ for (const a of anchors) {
617
+ const t = a.title.trim();
618
+ if (t.length < 4) continue;
619
+ // Use a substring of the title (first 30 chars) for matching
620
+ const probe = t.slice(0, 30);
621
+ if (ctxUtf8.includes(probe) || ctxUtf16.includes(probe)) {
622
+ candidates.push({ anchor: a, matchLen: probe.length });
623
+ }
624
+ }
625
+
626
+ if (candidates.length > 0) {
627
+ // Return the candidate with the longest matched title
628
+ candidates.sort((a, b) => b.matchLen - a.matchLen);
629
+ return { guid: candidates[0].anchor.guid, title: candidates[0].anchor.title };
630
+ }
631
+
632
+ // Fallback: nearest preceding anchor
633
+ return getNearestPrecedingAnchor(anchors, pos);
634
+ }
635
+
636
+ function extractContextFromBinary(
637
+ buf: Buffer,
638
+ pos: number,
639
+ query: string
640
+ ): string {
641
+ // Try both UTF-8 and UTF-16LE; pick the one that contains the query
642
+ // Use a wider context to be sure
643
+ const queryByteLen = Math.max(
644
+ Buffer.byteLength(query, "utf-8"),
645
+ Buffer.byteLength(query, "utf16le")
646
+ );
647
+ const start = Math.max(0, pos - 200);
648
+ const end = Math.min(buf.length, pos + queryByteLen + 200);
649
+ const segment = buf.slice(start, end);
650
+
651
+ const cleanText = (s: string): string =>
652
+ s.replace(/[\x00-\x1F\x7F\uFFFD]/g, " ").replace(/\s+/g, " ").trim();
653
+
654
+ // Try UTF-8
655
+ const utf8Text = cleanText(segment.toString("utf-8"));
656
+ if (utf8Text.includes(query)) return utf8Text;
657
+
658
+ // Try UTF-16LE at this offset and offset+1
659
+ const utf16Text = cleanText(segment.toString("utf16le"));
660
+ if (utf16Text.includes(query)) return utf16Text;
661
+
662
+ // Try UTF-16LE with shifted alignment
663
+ if (segment.length > 1) {
664
+ const shifted = cleanText(segment.slice(1).toString("utf16le"));
665
+ if (shifted.includes(query)) return shifted;
666
+ }
667
+
668
+ // Fallback: return UTF-8 even if it doesn't contain the query
669
+ return utf8Text || utf16Text;
670
+ }
671
+
672
/**
 * Search the local cache for `query` and return matching pages.
 * For each cached section: if the raw .one binary and GUID anchors are
 * available, does a byte-level search (both encodings) and attributes each
 * hit to its owning page; otherwise falls back to a case-insensitive
 * substring search over the extracted page bodies. I/O errors on individual
 * entries are swallowed so one bad cache file cannot break the search.
 */
export async function searchLocal(query: string): Promise<CachedPage[]> {
  const results: CachedPage[] = [];

  let nbDirs: string[];
  try {
    nbDirs = await readdir(CACHE_DIR);
  } catch {
    return results; // cache dir missing — nothing to search
  }

  for (const nbName of nbDirs) {
    const nbDir = join(CACHE_DIR, nbName);
    let files: string[];
    try {
      const s = await stat(nbDir);
      if (!s.isDirectory()) continue;
      files = await readdir(nbDir);
    } catch {
      continue;
    }

    for (const file of files) {
      if (!file.endsWith(".json")) continue;
      try {
        const jsonPath = join(nbDir, file);
        const raw = await readFile(jsonPath, "utf-8");
        const data = JSON.parse(raw);

        // Try binary-based search first if the raw .one file is cached.
        const binPath = jsonPath.replace(/\.json$/, ".one");
        let binBuf: Buffer | null = null;
        try {
          binBuf = await readFile(binPath);
        } catch {}

        if (binBuf && data.anchors) {
          const positions = findAllInBinary(binBuf, query);
          // Group match positions by owner page (context-based lookup).
          const byGuid = new Map<string, { title: string; positions: number[] }>();
          for (const pos of positions) {
            const anchor = findOwnerPage(binBuf, data.anchors, pos);
            if (!anchor) continue;
            const existing = byGuid.get(anchor.guid);
            if (existing) existing.positions.push(pos);
            else byGuid.set(anchor.guid, { title: anchor.title, positions: [pos] });
          }
          // Build map of official URLs by GUID (from the OneNote API page list).
          const officialByGuid = new Map<string, { url: string; title: string }>();
          for (const op of data.officialPages ?? []) {
            if (op.guid && op.webUrl) officialByGuid.set(op.guid, { url: op.webUrl, title: op.title });
          }

          // One result per page, with context around the first hit; prefer the
          // official title/URL over the binary-extracted ones when available.
          for (const [guid, info] of byGuid) {
            const firstPos = info.positions[0];
            const context = extractContextFromBinary(binBuf, firstPos, query);
            const official = officialByGuid.get(guid);
            const pageUrl = official?.url ?? buildPageUrl(data.webUrl, info.title, guid);
            const displayTitle = official?.title ?? info.title;
            results.push({
              title: displayTitle,
              body: context,
              section: data.section,
              notebook: data.notebook,
              webUrl: pageUrl,
              pageGuid: guid,
            });
          }
        } else {
          // Fallback: case-insensitive search in extracted page bodies.
          const lowerQuery = query.toLowerCase();
          for (const page of data.pages ?? []) {
            if (page.body?.toLowerCase().includes(lowerQuery)) {
              const pageUrl = buildPageUrl(data.webUrl, page.title, page.pageGuid);
              results.push({
                title: page.title,
                body: page.body,
                section: data.section,
                notebook: data.notebook,
                webUrl: pageUrl,
                pageGuid: page.pageGuid,
              });
            }
          }
        }
      } catch {
        continue; // unreadable/corrupt cache entry — skip it
      }
    }
  }

  return results;
}