sanook-cli 0.4.0 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (235) hide show
  1. package/.env.example +19 -0
  2. package/CHANGELOG.md +144 -0
  3. package/README.md +153 -20
  4. package/README.th.md +136 -0
  5. package/dist/agentContext.js +4 -0
  6. package/dist/approval.js +6 -0
  7. package/dist/bin.js +394 -51
  8. package/dist/brain.js +92 -59
  9. package/dist/brand.js +47 -0
  10. package/dist/checkpoint.js +37 -0
  11. package/dist/commands.js +86 -6
  12. package/dist/compaction.js +76 -5
  13. package/dist/config.js +100 -12
  14. package/dist/cost.js +60 -3
  15. package/dist/doctor.js +92 -0
  16. package/dist/gateway/auth.js +2 -2
  17. package/dist/gateway/ledger.js +2 -2
  18. package/dist/gateway/scheduler.js +1 -0
  19. package/dist/gateway/serve.js +6 -4
  20. package/dist/gateway/server.js +10 -2
  21. package/dist/git.js +11 -2
  22. package/dist/hooks.js +43 -17
  23. package/dist/knowledge.js +48 -49
  24. package/dist/loop.js +182 -66
  25. package/dist/lsp/client.js +173 -0
  26. package/dist/lsp/framing.js +56 -0
  27. package/dist/lsp/index.js +138 -0
  28. package/dist/lsp/servers.js +82 -0
  29. package/dist/mcp-server.js +244 -0
  30. package/dist/mcp.js +184 -29
  31. package/dist/memory-store.js +559 -0
  32. package/dist/memory.js +143 -29
  33. package/dist/orchestrate.js +150 -0
  34. package/dist/providers/codex.js +2 -2
  35. package/dist/providers/keys.js +3 -2
  36. package/dist/providers/registry.js +133 -1
  37. package/dist/repomap.js +93 -0
  38. package/dist/search/chunk.js +158 -0
  39. package/dist/search/embed-store.js +187 -0
  40. package/dist/search/engine.js +203 -0
  41. package/dist/search/fuse.js +35 -0
  42. package/dist/search/index-core.js +187 -0
  43. package/dist/search/indexer.js +241 -0
  44. package/dist/search/store.js +77 -0
  45. package/dist/session.js +42 -8
  46. package/dist/skill-install.js +10 -10
  47. package/dist/skills.js +12 -9
  48. package/dist/summarize.js +31 -0
  49. package/dist/tools/bash.js +21 -2
  50. package/dist/tools/diagnostics.js +41 -0
  51. package/dist/tools/edit.js +29 -7
  52. package/dist/tools/index.js +8 -1
  53. package/dist/tools/list.js +7 -2
  54. package/dist/tools/permission.js +90 -9
  55. package/dist/tools/read.js +23 -4
  56. package/dist/tools/remember.js +1 -1
  57. package/dist/tools/sandbox.js +61 -0
  58. package/dist/tools/search.js +105 -4
  59. package/dist/tools/task.js +195 -29
  60. package/dist/tools/timeout.js +35 -0
  61. package/dist/tools/util.js +10 -0
  62. package/dist/tools/write.js +6 -4
  63. package/dist/trust.js +89 -0
  64. package/dist/ui/app.js +218 -27
  65. package/dist/ui/banner.js +4 -9
  66. package/dist/ui/history.js +30 -0
  67. package/dist/ui/mentions.js +44 -0
  68. package/dist/ui/setup.js +6 -5
  69. package/dist/ui/useEditor.js +83 -0
  70. package/dist/update.js +114 -0
  71. package/dist/worktree.js +173 -0
  72. package/package.json +11 -5
  73. package/scripts/postinstall.mjs +33 -0
  74. package/second-brain/.agents/_Index.md +30 -0
  75. package/second-brain/.agents/skills/_Index.md +30 -0
  76. package/second-brain/.agents/workflows/_Index.md +30 -0
  77. package/second-brain/AGENTS.md +4 -4
  78. package/second-brain/Acceptance/_Index.md +30 -0
  79. package/second-brain/Acceptance/golden-case-template.md +39 -0
  80. package/second-brain/Areas/_Index.md +30 -0
  81. package/second-brain/Bugs/System-OS/_Index.md +30 -0
  82. package/second-brain/Bugs/_Index.md +30 -0
  83. package/second-brain/CLAUDE.md +4 -1
  84. package/second-brain/Checklists/_Index.md +30 -0
  85. package/second-brain/Checklists/preflight-postflight-template.md +29 -0
  86. package/second-brain/Distillations/_Index.md +30 -0
  87. package/second-brain/Entities/_Index.md +30 -0
  88. package/second-brain/Entities/entity-template.md +33 -0
  89. package/second-brain/Evals/_Index.md +30 -0
  90. package/second-brain/Evals/correction-pairs.md +24 -0
  91. package/second-brain/Evals/failure-taxonomy.md +24 -0
  92. package/second-brain/Evals/golden-set.md +25 -0
  93. package/second-brain/Evals/quality-ledger.md +23 -0
  94. package/second-brain/Evals/self-eval-rubric.md +23 -0
  95. package/second-brain/GEMINI.md +4 -4
  96. package/second-brain/Goals/_Index.md +30 -0
  97. package/second-brain/Handoffs/_Index.md +30 -0
  98. package/second-brain/Home.md +7 -0
  99. package/second-brain/Intake/Raw Sources/_Index.md +30 -0
  100. package/second-brain/Intake/_Index.md +30 -0
  101. package/second-brain/Intake/_Quarantine/_Index.md +30 -0
  102. package/second-brain/Learning/_Index.md +30 -0
  103. package/second-brain/Playbooks/_Index.md +30 -0
  104. package/second-brain/Playbooks/playbook-template.md +23 -0
  105. package/second-brain/Projects/_Index.md +30 -0
  106. package/second-brain/Prompts/_Index.md +30 -0
  107. package/second-brain/README.md +2 -1
  108. package/second-brain/Research/_Index.md +30 -0
  109. package/second-brain/Retrospectives/_Index.md +30 -0
  110. package/second-brain/Reviews/_Index.md +30 -0
  111. package/second-brain/Runbooks/_Index.md +30 -0
  112. package/second-brain/Runbooks/eval-loop.md +24 -0
  113. package/second-brain/Sessions/_Index.md +30 -0
  114. package/second-brain/Shared/AI-Context-Index.md +20 -0
  115. package/second-brain/Shared/AI-Threads/_Index.md +30 -0
  116. package/second-brain/Shared/Archive/_Index.md +30 -0
  117. package/second-brain/Shared/Assets/_Index.md +30 -0
  118. package/second-brain/Shared/Context-Packs/_Index.md +30 -0
  119. package/second-brain/Shared/Context7-Docs/_Index.md +30 -0
  120. package/second-brain/Shared/Coordination/NOW.md +28 -0
  121. package/second-brain/Shared/Coordination/_Index.md +30 -0
  122. package/second-brain/Shared/Coordination/agent-registry.md +24 -0
  123. package/second-brain/Shared/Coordination/task-board/_Index.md +30 -0
  124. package/second-brain/Shared/Coordination/task-board/task-template.md +43 -0
  125. package/second-brain/Shared/Coordination/task-board.md +32 -0
  126. package/second-brain/Shared/Core-Facts/_Index.md +30 -0
  127. package/second-brain/Shared/Decision-Memory/_Index.md +30 -0
  128. package/second-brain/Shared/Glossary/_Index.md +30 -0
  129. package/second-brain/Shared/Memory-Inbox/_Index.md +30 -0
  130. package/second-brain/Shared/Operating-State/_Index.md +30 -0
  131. package/second-brain/Shared/Prompting/_Index.md +30 -0
  132. package/second-brain/Shared/Provenance/_Index.md +30 -0
  133. package/second-brain/Shared/Rules/_Index.md +30 -0
  134. package/second-brain/Shared/Rules/contextual-note-rule.md +30 -0
  135. package/second-brain/Shared/Rules/frontmatter-standard.md +10 -0
  136. package/second-brain/Shared/Rules/memory-write-protocol.md +28 -0
  137. package/second-brain/Shared/Rules/procedural-runbook-header.md +40 -0
  138. package/second-brain/Shared/Rules/review-and-staleness-policy.md +22 -0
  139. package/second-brain/Shared/Rules/rules-formatting.md +34 -0
  140. package/second-brain/Shared/Scripts/_Index.md +30 -0
  141. package/second-brain/Shared/Scripts-Archive/_Index.md +30 -0
  142. package/second-brain/Shared/Tech-Standards/_Index.md +30 -0
  143. package/second-brain/Shared/Tech-Standards/verification-standard.md +40 -0
  144. package/second-brain/Shared/User-Memory/_Index.md +30 -0
  145. package/second-brain/Shared/User-Persona/_Index.md +30 -0
  146. package/second-brain/Shared/User-Persona/owner-profile.md +25 -0
  147. package/second-brain/Shared/Working-Memory/_Index.md +30 -0
  148. package/second-brain/Shared/_Index.md +30 -0
  149. package/second-brain/Shared/mcp-servers/_Index.md +30 -0
  150. package/second-brain/Skills/_Index.md +30 -0
  151. package/second-brain/Templates/_Index.md +30 -0
  152. package/second-brain/Templates/bug.md +2 -0
  153. package/second-brain/Templates/handoff.md +2 -0
  154. package/second-brain/Templates/session.md +2 -0
  155. package/second-brain/Tools/_Index.md +30 -0
  156. package/second-brain/Traces/_Index.md +30 -0
  157. package/second-brain/Vault Structure Map.md +33 -1
  158. package/second-brain/copilot/_Index.md +30 -0
  159. package/skills/audit-license-compliance/SKILL.md +117 -0
  160. package/skills/author-codemod/SKILL.md +110 -0
  161. package/skills/build-audit-logging/SKILL.md +112 -0
  162. package/skills/build-cdc-streaming-pipeline/SKILL.md +123 -0
  163. package/skills/build-cli-tool/SKILL.md +108 -0
  164. package/skills/build-data-table/SKILL.md +141 -0
  165. package/skills/build-native-mobile-ui/SKILL.md +154 -0
  166. package/skills/build-offline-first-sync/SKILL.md +118 -0
  167. package/skills/build-realtime-channel/SKILL.md +122 -0
  168. package/skills/build-vector-search/SKILL.md +131 -0
  169. package/skills/compose-local-dev-stack/SKILL.md +149 -0
  170. package/skills/configure-bundler-build/SKILL.md +166 -0
  171. package/skills/configure-dns-tls/SKILL.md +142 -0
  172. package/skills/configure-reverse-proxy-lb/SKILL.md +129 -0
  173. package/skills/configure-security-headers-csp/SKILL.md +122 -0
  174. package/skills/contract-testing/SKILL.md +140 -0
  175. package/skills/datetime-timezone-correctness/SKILL.md +125 -0
  176. package/skills/debug-ci-pipeline-failure/SKILL.md +134 -0
  177. package/skills/debug-flaky-tests/SKILL.md +128 -0
  178. package/skills/defend-llm-prompt-injection/SKILL.md +110 -0
  179. package/skills/deliver-webhooks/SKILL.md +116 -0
  180. package/skills/design-api-pagination/SKILL.md +144 -0
  181. package/skills/design-authorization-model/SKILL.md +119 -0
  182. package/skills/design-backup-dr-recovery/SKILL.md +113 -0
  183. package/skills/design-event-sourcing-cqrs/SKILL.md +143 -0
  184. package/skills/design-multi-tenancy/SKILL.md +100 -0
  185. package/skills/design-protobuf-grpc-service/SKILL.md +146 -0
  186. package/skills/design-relational-schema/SKILL.md +129 -0
  187. package/skills/design-search-index-infra/SKILL.md +151 -0
  188. package/skills/design-state-machine/SKILL.md +108 -0
  189. package/skills/design-token-system/SKILL.md +109 -0
  190. package/skills/distributed-locks-leases/SKILL.md +120 -0
  191. package/skills/encrypt-sensitive-data/SKILL.md +148 -0
  192. package/skills/feature-flags-rollout/SKILL.md +130 -0
  193. package/skills/file-upload-object-storage/SKILL.md +107 -0
  194. package/skills/fuzz-dynamic-security-test/SKILL.md +111 -0
  195. package/skills/harden-llm-app-reliability/SKILL.md +126 -0
  196. package/skills/i18n-localization-setup/SKILL.md +113 -0
  197. package/skills/idempotency-keys/SKILL.md +107 -0
  198. package/skills/implement-push-notifications/SKILL.md +142 -0
  199. package/skills/ingest-webhook-secure/SKILL.md +120 -0
  200. package/skills/integrate-oauth-oidc/SKILL.md +126 -0
  201. package/skills/load-stress-test/SKILL.md +129 -0
  202. package/skills/map-privacy-data-gdpr/SKILL.md +146 -0
  203. package/skills/model-nosql-data/SKILL.md +118 -0
  204. package/skills/money-decimal-arithmetic/SKILL.md +123 -0
  205. package/skills/monitor-ml-drift/SKILL.md +109 -0
  206. package/skills/numeric-precision-units/SKILL.md +144 -0
  207. package/skills/optimize-llm-cost-latency/SKILL.md +103 -0
  208. package/skills/optimize-react-rerenders/SKILL.md +124 -0
  209. package/skills/orchestrate-agent-workflow/SKILL.md +100 -0
  210. package/skills/payments-billing-integration/SKILL.md +114 -0
  211. package/skills/pin-toolchain-versions/SKILL.md +116 -0
  212. package/skills/plan-strangler-migration/SKILL.md +95 -0
  213. package/skills/property-based-testing/SKILL.md +108 -0
  214. package/skills/publish-package-registry/SKILL.md +130 -0
  215. package/skills/recover-git-state/SKILL.md +119 -0
  216. package/skills/remediate-web-vulnerabilities/SKILL.md +125 -0
  217. package/skills/resilience-timeouts-retries/SKILL.md +104 -0
  218. package/skills/resolve-merge-rebase-conflict/SKILL.md +97 -0
  219. package/skills/rewrite-git-history/SKILL.md +109 -0
  220. package/skills/scaffold-cross-platform-app/SKILL.md +137 -0
  221. package/skills/schema-evolution-compatibility/SKILL.md +121 -0
  222. package/skills/send-transactional-email/SKILL.md +126 -0
  223. package/skills/serve-deploy-ml-model/SKILL.md +107 -0
  224. package/skills/setup-cdn-edge-waf/SKILL.md +107 -0
  225. package/skills/setup-devcontainer-env/SKILL.md +131 -0
  226. package/skills/setup-lint-format-precommit/SKILL.md +140 -0
  227. package/skills/setup-monorepo-tooling/SKILL.md +125 -0
  228. package/skills/ship-mobile-app-store-release/SKILL.md +137 -0
  229. package/skills/structured-output-llm/SKILL.md +86 -0
  230. package/skills/supply-chain-sbom-provenance/SKILL.md +120 -0
  231. package/skills/test-data-factories/SKILL.md +158 -0
  232. package/skills/threat-model-stride/SKILL.md +123 -0
  233. package/skills/train-evaluate-ml-model/SKILL.md +109 -0
  234. package/skills/unicode-text-correctness/SKILL.md +109 -0
  235. package/skills/visual-regression-testing/SKILL.md +120 -0
@@ -0,0 +1,93 @@
1
+ import { readFile } from 'node:fs/promises';
2
+ import { join, extname } from 'node:path';
3
+ import { runGit, isGitRepo } from './git.js';
4
// Repo map: a rough, zero-dependency symbol map of the repository (one regex per
// language), meant to be injected at session start. It helps the agent pick the
// right file without grepping/reading files one by one — a lightweight take on
// Aider's repo-map.

/** Upper bound on how many source files are scanned. */
const MAX_FILES = 400;
/** Only this many leading characters of each file are scanned for symbols. */
const MAX_FILE_BYTES = 32 * 1024;
/** Maximum number of symbols reported per file. */
const SYMS_PER_FILE = 12;
/** File extensions (lower-case, with the dot) that count as source code. */
const SOURCE_EXT = new Set(
    '.ts .tsx .js .jsx .mjs .cjs .py .go .rs .java .rb .c .h .cpp .hpp .cs .php .swift .kt'.split(' '),
);
/** Matches any path that has a dependency / build-output / cache directory as one of its segments. */
const IGNORE_DIR = /(^|\/)(node_modules|dist|build|coverage|\.next|\.cache|\.git|vendor|__pycache__)(\/|$)/;
// Regexes that pull out top-level / exported symbols. Several languages are
// covered; the results of all patterns are merged and de-duplicated.
const SYMBOL_PATTERNS = [
    /^export\s+(?:default\s+)?(?:async\s+)?(?:function|class|const|interface|type|enum)\s+([A-Za-z0-9_$]+)/gm, // TS/JS export
    /^(?:export\s+)?(?:async\s+)?function\s+([A-Za-z0-9_$]+)/gm, // JS function
    /^(?:export\s+)?class\s+([A-Za-z0-9_$]+)/gm, // JS class
    /^(?:def|class)\s+([A-Za-z0-9_]+)/gm, // Python
    /^func\s+(?:\([^)]*\)\s+)?([A-Za-z0-9_]+)/gm, // Go
    /^(?:pub\s+)?(?:fn|struct|enum|trait|impl)\s+([A-Za-z0-9_]+)/gm, // Rust
];
23
/**
 * Collect distinct symbol names from `content` by running every regex in
 * SYMBOL_PATTERNS over it, in pattern order.
 *
 * @param {string} content - Source text to scan.
 * @returns {string[]} At most SYMS_PER_FILE names, in first-seen order.
 */
function extractSymbols(content) {
    // Scanning with one pattern stops once 3x the reported cap has been collected.
    const scanCap = SYMS_PER_FILE * 3;
    const names = new Set();
    for (const pattern of SYMBOL_PATTERNS) {
        // The patterns are shared /g regexes, so their cursor must be rewound before reuse.
        pattern.lastIndex = 0;
        for (let hit = pattern.exec(content); hit !== null; hit = pattern.exec(content)) {
            const name = hit[1];
            if (name) {
                names.add(name);
            }
            if (names.size >= scanCap) {
                break;
            }
        }
    }
    return Array.from(names).slice(0, SYMS_PER_FILE);
}
37
/**
 * Decide whether a repo-relative path should be included in the repo map.
 *
 * @param {string} rel - Repo-relative file path.
 * @returns {boolean} True when the extension is in SOURCE_EXT (case-insensitive)
 *   and the path does not match IGNORE_DIR.
 */
function isSource(rel) {
    const ext = extname(rel).toLowerCase();
    if (!SOURCE_EXT.has(ext)) {
        return false;
    }
    return !IGNORE_DIR.test(rel);
}
40
/**
 * List the files tracked by git under `cwd`.
 *
 * The two "nothing" results are deliberately different:
 *   - `null` means `git ls-files` failed (e.g. maxBuffer / index lock); the
 *     caller should not cache this and should retry next time.
 *   - `[]`  means `cwd` is not a git repo (or git reported no files); this is
 *     safe to cache.
 *
 * @param {string} cwd - Directory to inspect.
 * @returns {Promise<string[] | null>} Non-empty output lines of `git ls-files`.
 */
async function listFiles(cwd) {
    const insideRepo = await isGitRepo(cwd);
    if (!insideRepo) {
        return [];
    }
    try {
        const stdout = await runGit(['ls-files'], cwd);
        return stdout.split('\n').filter(Boolean);
    }
    catch {
        // A failed ls-files is not the same thing as an empty repo.
        return null;
    }
}
52
// Single-entry, per-process cache of the most recently built map.
// Shape: { cwd, maxChars, map } or null when nothing is cached.
let cached = null;
/**
 * Build the symbol outline of the repo at `cwd`, wrapped in a `<repo_map>` tag.
 *
 * The result is cached per process for the (cwd, maxChars) pair that produced
 * it, on the assumption that repo structure rarely changes mid-session. The
 * budget is part of the cache key because the body is truncated to it: reusing
 * a map built for a different `maxChars` would hand back the wrong amount of text.
 *
 * @param {string} [cwd=process.cwd()] - Repository root to scan.
 * @param {number} [maxChars=4000] - Budget for the body: entries are appended
 *   until the next one would push the body past this length, then "\n…" is
 *   appended. The surrounding tag is not counted.
 * @returns {Promise<string>} The wrapped map, or '' when there is nothing to
 *   show (not a git repo, no source files — e.g. a vault holding only markdown —
 *   or a git failure; the failure case is not cached so it is retried next call).
 */
export async function loadRepoMap(cwd = process.cwd(), maxChars = 4000) {
    if (cached && cached.cwd === cwd && cached.maxChars === maxChars)
        return cached.map;
    const raw = await listFiles(cwd);
    if (raw === null)
        return ''; // git failed: return empty but do not cache, so the next call retries
    const files = raw.filter(isSource).slice(0, MAX_FILES);
    if (!files.length) {
        cached = { cwd, maxChars, map: '' };
        return '';
    }
    const entries = await Promise.all(files.map(async (rel) => {
        try {
            const content = (await readFile(join(cwd, rel), 'utf8')).slice(0, MAX_FILE_BYTES);
            const syms = extractSymbols(content);
            return syms.length ? `${rel}: ${syms.join(', ')}` : rel;
        }
        catch {
            // Unreadable file: still list the path, just without symbols.
            return rel;
        }
    }));
    let body = '';
    for (const e of entries) {
        if (body.length + e.length + 1 > maxChars) {
            body += '\n…';
            break;
        }
        body += (body ? '\n' : '') + e;
    }
    const map = `<repo_map note="symbol คร่าวๆ ของ repo (อาจไม่ครบ/ไม่เป๊ะ) — ใช้ glob/grep/read_file ยืนยันก่อนแก้">\n${body}\n</repo_map>`;
    cached = { cwd, maxChars, map };
    return map;
}
90
/**
 * Forget the cached repo map so the next `loadRepoMap` call rebuilds it.
 * Intended for tests and for when the working directory changes.
 */
export function clearRepoMapCache() {
    cached = null;
}
@@ -0,0 +1,158 @@
1
+ // ============================================================================
2
+ // src/search/chunk.ts — ONE generic, heading-aware markdown chunker.
3
+ //
4
+ // arra-oracle ships five hardcoded type parsers (resonance/learning/retro/
5
+ // distillation/security-corpus), each splitting on its own header convention.
6
+ // We replace all five with a single type-agnostic chunker: split on ATX
7
+ // headings, fold sub-MIN sections forward so we never emit a tiny chunk, and key
8
+ // each chunk by a stable hash of (path)#ordinal so re-indexing a file replaces
9
+ // exactly its chunks (no posting creep — see index-core.addDoc).
10
+ //
11
+ // Everything is pure (no fs) and DEFENSIVE: malformed frontmatter, nested YAML,
12
+ // or a stray [[ inside a code fence degrade to "no frontmatter / no links"
13
+ // rather than throwing. We must never block indexing a real, messy vault file.
14
+ // ============================================================================
15
+ const MIN_CHARS = 120; // sections shorter than this fold into the next chunk
16
/**
 * Deterministic short hash of a path: 32-bit FNV-1a over the string's UTF-16
 * code units, rendered in base 36. No crypto dependency; used for stable chunk ids.
 *
 * @param {string} path - Path (or any string) to hash.
 * @returns {string} Base-36 text of the unsigned 32-bit hash.
 */
export function pathHash(path) {
    const FNV_OFFSET_BASIS = 0x811c9dc5;
    const FNV_PRIME = 0x01000193;
    let acc = FNV_OFFSET_BASIS;
    for (let idx = 0; idx < path.length; idx += 1) {
        // xor in the code unit, then multiply with 32-bit wraparound
        acc = Math.imul(acc ^ path.charCodeAt(idx), FNV_PRIME);
    }
    // >>> 0 reinterprets the signed int32 as unsigned before formatting
    return (acc >>> 0).toString(36);
}
25
/**
 * Split a leading `---\n…\n---` frontmatter block from the markdown body.
 *
 * This is a small line-oriented reader, not a YAML parser: only the keys
 * `note_type`/`notetype`, `parent`, `up` and `tags` are recognized (keys are
 * matched case-insensitively); every other line is ignored. `tags` may be an
 * inline list (`[a, b]`), a single scalar, or a block list of `- item` lines.
 *
 * Defensive by design: no opening fence, or no closing fence, yields
 * `{ tags: [] }` plus the untouched input as the body.
 *
 * @param {string} md - Markdown text (LF line endings expected).
 * @returns {{ data: { tags: string[], noteType?: string, parent?: string, up?: string }, body: string }}
 *   Parsed fields and the body with the frontmatter (and blank lines right after it) removed.
 */
export function parseFrontmatter(md) {
    if (!md.startsWith('---'))
        return { data: { tags: [] }, body: md };
    const end = md.indexOf('\n---', 3);
    if (end === -1)
        return { data: { tags: [] }, body: md };
    const block = md.slice(3, end).trim();
    // The body starts after the newline that terminates the closing fence. When
    // the closing fence is the last line of the file there is no such newline,
    // and the body is empty (not the whole file, which `slice(-1 + 1)` would give).
    const fenceEol = md.indexOf('\n', end + 1);
    const body = fenceEol === -1 ? '' : md.slice(fenceEol + 1).replace(/^\n+/, '');
    const data = { tags: [] };
    const lines = block.split('\n');
    for (let i = 0; i < lines.length; i++) {
        const line = lines[i];
        const m = /^([A-Za-z0-9_-]+):\s*(.*)$/.exec(line);
        if (!m)
            continue;
        const key = m[1].toLowerCase();
        const val = m[2].trim();
        if (key === 'note_type' || key === 'notetype')
            data.noteType = stripQuotes(val);
        else if (key === 'parent')
            data.parent = unwrapLink(val);
        else if (key === 'up')
            data.up = unwrapLink(val);
        else if (key === 'tags') {
            if (val.startsWith('['))
                data.tags = inlineList(val);
            else if (val)
                data.tags = [stripQuotes(val)];
            else {
                // YAML block list: consume the run of "- item" lines that follows
                for (let j = i + 1; j < lines.length && /^\s*-\s+/.test(lines[j]); j++) {
                    data.tags.push(stripQuotes(lines[j].replace(/^\s*-\s+/, '').trim()));
                }
            }
        }
    }
    return { data, body };
}
65
/**
 * Remove one leading and one trailing quote character (`"` or `'`) if present,
 * then trim surrounding whitespace. The two ends are handled independently, so
 * the quotes need not match. Whitespace outside the quotes prevents stripping.
 *
 * @param {string} s - Raw scalar text.
 * @returns {string} Unquoted, trimmed text.
 */
function stripQuotes(s) {
    const withoutLeading = s.replace(/^["']/, '');
    const withoutTrailing = withoutLeading.replace(/["']$/, '');
    return withoutTrailing.trim();
}
68
/**
 * Reduce a frontmatter link value to its bare target: take the inside of the
 * first `[[…]]` if there is one (otherwise the unquoted value), drop any
 * `|alias` suffix, and trim.
 *
 * @param {string} s - Raw frontmatter value.
 * @returns {string} Link target without brackets or alias.
 */
function unwrapLink(s) {
    const wiki = /\[\[([^\]]+)\]\]/.exec(s);
    const raw = wiki === null ? stripQuotes(s) : wiki[1];
    const beforeAlias = raw.split('|')[0];
    return beforeAlias.trim();
}
72
/**
 * Parse an inline YAML-style list such as `[a, "b", 'c']` into its items.
 *
 * Each comma-separated item is trimmed BEFORE unquoting: items after the first
 * normally carry a leading space (`, "b"`), and the quote stripper only removes
 * a quote that is the very first/last character, so without the trim the
 * opening quote would survive (`"b`).
 *
 * @param {string} s - Text of the list, including the surrounding brackets.
 * @returns {string[]} Unquoted, non-empty items in order.
 */
function inlineList(s) {
    return s
        .replace(/^\[|\]$/g, '')
        .split(',')
        .map((t) => stripQuotes(t.trim()))
        .filter(Boolean);
}
79
/**
 * Extract the targets of `[[wikilink]]`s from markdown.
 *
 * Triple-backtick fenced blocks and inline code spans are blanked out first so
 * links inside code are ignored. For each link the `|alias` and `#heading`
 * parts are dropped; empty targets (e.g. `[[#heading]]`) are skipped.
 *
 * @param {string} md - Markdown text.
 * @returns {string[]} Distinct targets in first-seen order.
 */
export function extractWikilinks(md) {
    const prose = md
        .replace(/```[\s\S]*?```/g, ' ')
        .replace(/`[^`]*`/g, ' ');
    const targets = new Set();
    const linkRe = /\[\[([^\]]+)\]\]/g;
    for (let hit = linkRe.exec(prose); hit !== null; hit = linkRe.exec(prose)) {
        const beforeAlias = hit[1].split('|')[0];
        const page = beforeAlias.split('#')[0].trim();
        if (page) {
            targets.add(page);
        }
    }
    return Array.from(targets);
}
90
/**
 * Split a markdown body into sections at ATX headings (`#` … `######`).
 *
 * Lines inside a fenced code block are never treated as headings; a line that
 * starts (after optional whitespace) with ``` or ~~~ toggles the fence state.
 * A section's `heading` holds the heading text without the `#` markers; text
 * before the first heading becomes a section with an empty heading. Sections
 * with neither a heading nor non-whitespace body are dropped.
 *
 * @param {string} md - Markdown body (LF line endings).
 * @returns {{ heading: string, body: string }[]} Sections in document order;
 *   each body line keeps a trailing "\n".
 */
function splitSections(md) {
    const fenceRe = /^\s*(```|~~~)/;
    const headingRe = /^(#{1,6})\s+(.*\S)\s*$/;
    const hasContent = (sec) => Boolean(sec.heading || sec.body.trim());
    const sections = [];
    let current = { heading: '', body: '' };
    let fenced = false;
    for (const line of md.split('\n')) {
        if (fenceRe.test(line)) {
            fenced = !fenced;
        }
        const hit = fenced ? null : headingRe.exec(line);
        if (hit === null) {
            current.body += `${line}\n`;
            continue;
        }
        // A heading closes the section in progress and opens a new one.
        if (hasContent(current)) {
            sections.push(current);
        }
        current = { heading: hit[2].trim(), body: '' };
    }
    if (hasContent(current)) {
        sections.push(current);
    }
    return sections;
}
112
/**
 * Greedily merge consecutive sections into groups so that a group is only
 * emitted once its trimmed text reaches MIN_CHARS (the final group is emitted
 * regardless of size, if non-empty).
 *
 * The first section of a group supplies the group's heading; headings of
 * sections folded in afterwards are kept inline in the text as "\n<heading>\n".
 *
 * @param {{ heading: string, body: string }[]} sections - Sections in order.
 * @returns {{ heading: string, body: string }[]} Packed groups with trimmed bodies.
 */
function packSections(sections) {
    const groups = [];
    let label = null; // null = no group is open yet
    let pending = '';
    const emit = () => {
        const text = pending.trim();
        if (text) {
            groups.push({ heading: label ?? '', body: text });
        }
        pending = '';
        label = null;
    };
    for (const { heading, body } of sections) {
        if (label === null) {
            label = heading;
        }
        else if (heading) {
            pending += `\n${heading}\n`;
        }
        pending += body;
        if (pending.trim().length >= MIN_CHARS) {
            emit();
        }
    }
    emit();
    return groups;
}
139
/**
 * Parse one markdown file into frontmatter, wikilink targets, and
 * heading-aware chunks.
 *
 * Chunk ids are `<pathHash(path)>#<ordinal>`, so they depend only on the path
 * and the chunk's position. A body that packs down to nothing (e.g. only
 * whitespace) simply yields no chunks.
 *
 * @param {string} path - File path; used only to derive chunk ids.
 * @param {string} md - Raw file contents.
 * @returns {{ frontmatter: object, links: string[], chunks: { id: string, ordinal: number, heading: string, text: string }[] }}
 */
export function chunkMarkdown(path, md) {
    // Normalize CRLF to LF first. Vault files on Windows are usually CRLF; without
    // this the frontmatter fence search ('\n---') and line splitting break, and
    // results would differ across platforms.
    const normalized = md.replace(/\r\n/g, '\n');
    const { data: frontmatter, body } = parseFrontmatter(normalized);
    const links = extractWikilinks(body);
    const groups = packSections(splitSections(body));
    const prefix = pathHash(path);
    const chunks = groups.map((group, ordinal) => {
        const { heading, body: text } = group;
        return { id: `${prefix}#${ordinal}`, ordinal, heading, text };
    });
    return { frontmatter, links, chunks };
}
@@ -0,0 +1,187 @@
1
+ // ============================================================================
2
+ // src/search/embed-store.ts — OPTIONAL L1 semantic layer (BYOK embeddings).
3
+ //
4
+ // arra-oracle's semantic search needs LanceDB/sqlite-vec/Qdrant native binaries
5
+ // (~100MB, no Windows for LanceDB) plus an Ollama model download and a Python
6
+ // reranker sidecar. We need NONE of that: embeddings go through the user's
7
+ // EXISTING ai-SDK provider key (embedMany), vectors live as a compact Float32
8
+ // blob next to index.json, and cosine runs in-process over a BM25-PREFILTERED
9
+ // candidate set (so we never scan the whole corpus per query). The whole layer is
10
+ // LAZY — absent without a key, the engine degrades to BM25 with zero ceremony.
11
+ //
12
+ // Pure math (normalize, cosineTopK, (de)serialize) is unit-tested with fake
13
+ // vectors; the only networked function is embedTexts(), kept thin.
14
+ // ============================================================================
15
+ import { chmod, mkdir, readFile, rename, rm, stat, writeFile } from 'node:fs/promises';
16
+ import { randomUUID } from 'node:crypto';
17
+ import { join } from 'node:path';
18
+ import { embedMany } from 'ai';
19
+ import { appHomePath, persistenceEnabled } from '../brand.js';
20
+ import { resolveEmbedder } from '../providers/registry.js';
21
+ export const VECTORS_PATH = join(appHomePath('search'), 'vectors.json');
22
/**
 * Create an empty vector index: no ids, dimension 0, zero-length data.
 *
 * @param {string} [tag=''] - Tag to stamp on the index.
 * @returns {{ tag: string, dim: number, ids: string[], data: Float32Array }}
 */
export function emptyVectors(tag = '') {
    const data = new Float32Array(0);
    const ids = [];
    return { tag, dim: 0, ids, data };
}
25
/**
 * L2-normalize a vector IN PLACE and return the same object, so that cosine
 * similarity between normalized vectors reduces to a dot product. A vector
 * whose norm is 0 is left unchanged (the divisor falls back to 1).
 *
 * @param {number[] | Float32Array} v - Vector to normalize; mutated.
 * @returns {number[] | Float32Array} The same `v`.
 */
export function normalizeVec(v) {
    const n = v.length;
    let sumSquares = 0;
    for (let i = 0; i < n; i += 1) {
        sumSquares += v[i] * v[i];
    }
    const divisor = Math.sqrt(sumSquares) || 1;
    for (let i = 0; i < n; i += 1) {
        v[i] = v[i] / divisor;
    }
    return v;
}
35
/**
 * Build a vector index from `{ id, vec }` rows. Each vector is copied into a
 * Float32Array, L2-normalized, and packed row-major into one flat buffer.
 *
 * The dimension is taken from the first row. No rows, or a first row with an
 * empty vector, produce an empty index.
 *
 * @param {string} tag - Tag to stamp on the index.
 * @param {{ id: string, vec: ArrayLike<number> }[]} rows - Vectors to index.
 * @returns {{ tag: string, dim: number, ids: string[], data: Float32Array }}
 * @throws {Error} If any row's vector length differs from the first row's.
 */
export function buildVectorIndex(tag, rows) {
    const dim = rows.length ? rows[0].vec.length : 0;
    if (dim <= 0) {
        return emptyVectors(tag);
    }
    const data = new Float32Array(rows.length * dim);
    const ids = [];
    for (const [rowIndex, { id, vec }] of rows.entries()) {
        if (vec.length !== dim) {
            throw new Error(`vector dimension mismatch for "${id}": expected ${dim}, got ${vec.length}`);
        }
        const unit = normalizeVec(Float32Array.from(vec));
        data.set(unit, rowIndex * dim);
        ids.push(id);
    }
    return { tag, dim, ids, data };
}
55
/**
 * Rank the rows of a normalized vector index by cosine similarity to a query.
 *
 * The query is copied and normalized here, so with normalized rows the score is
 * a plain dot product. Ties are broken by ascending id so the order is
 * deterministic.
 *
 * @param {{ dim: number, ids: string[], data: Float32Array }} vi - Index whose rows are already normalized.
 * @param {ArrayLike<number>} queryVec - Query vector (not mutated).
 * @param {number} [k=50] - Maximum number of hits to return.
 * @param {{ has(id: string): boolean }} [candidates] - Optional allow-list; when
 *   given, only rows whose id it contains are scored.
 * @returns {{ id: string, score: number }[]} Best hits first. Empty when the
 *   index is empty or the query's length differs from the index dimension.
 */
export function cosineTopK(vi, queryVec, k = 50, candidates) {
    if (!vi.dim || !vi.ids.length) {
        return [];
    }
    const { dim, ids, data } = vi;
    const query = normalizeVec(Float32Array.from(queryVec));
    if (query.length !== dim) {
        return [];
    }
    const scored = [];
    for (const [row, id] of ids.entries()) {
        if (candidates && !candidates.has(id)) {
            continue;
        }
        const offset = row * dim;
        let score = 0;
        for (let d = 0; d < dim; d += 1) {
            score += query[d] * data[offset + d];
        }
        scored.push({ id, score });
    }
    const byScoreThenId = (a, b) => {
        const diff = b.score - a.score;
        if (diff) {
            return diff;
        }
        if (a.id < b.id) {
            return -1;
        }
        return a.id > b.id ? 1 : 0;
    };
    scored.sort(byScoreThenId);
    return scored.slice(0, k);
}
81
/**
 * Return the set of ids present in a vector index, for membership checks
 * during incremental updates / lookups. (This is a Set of ids, not an
 * id → row-index map.)
 *
 * @param {{ ids: string[] }} vi - Vector index.
 * @returns {Set<string>} A new Set holding every id in `vi.ids`.
 */
export function vectorIds(vi) {
    const { ids } = vi;
    return new Set(ids);
}
85
/** On-disk format version; written as `v` and checked when reading back. */
const VEC_FILE_VERSION = 1;
/**
 * Convert a vector index into a JSON-safe object. The Float32 data is emitted
 * as base64 of its raw bytes (exactly the bytes the typed-array view covers,
 * respecting its byteOffset/byteLength).
 *
 * @param {{ tag: string, dim: number, ids: string[], data: Float32Array }} vi - Index to serialize.
 * @returns {{ v: number, tag: string, dim: number, ids: string[], b64: string }}
 */
export function serializeVectors(vi) {
    const { tag, dim, ids, data } = vi;
    // Buffer.from(arrayBuffer, offset, length) creates a view, not a copy.
    const bytes = Buffer.from(data.buffer, data.byteOffset, data.byteLength);
    const b64 = bytes.toString('base64');
    return { v: VEC_FILE_VERSION, tag, dim, ids, b64 };
}
90
/**
 * Rebuild a vector index from a parsed JSON value, validating it on the way in.
 * Never throws for malformed input; anything suspicious becomes an empty index.
 *
 * @param {unknown} raw - Value produced by JSON-parsing a serialized index.
 * @returns {{ tag: string, dim: number, ids: string[], data: Float32Array }}
 *   - an empty index with tag '' when the envelope is invalid (falsy, wrong
 *     version, or wrongly typed tag/dim/ids/b64);
 *   - an empty index keeping the tag when dim is 0, when the decoded byte
 *     length is not a multiple of 4, or when the float count is not
 *     ids.length * dim;
 *   - otherwise the decoded index.
 */
export function deserializeVectors(raw) {
    const o = raw;
    const envelopeOk = Boolean(o) &&
        o.v === VEC_FILE_VERSION &&
        typeof o.tag === 'string' &&
        typeof o.dim === 'number' &&
        Number.isInteger(o.dim) &&
        o.dim >= 0 &&
        Array.isArray(o.ids) &&
        o.ids.every((id) => typeof id === 'string') &&
        typeof o.b64 === 'string';
    if (!envelopeOk) {
        return emptyVectors();
    }
    const { tag, dim, ids, b64 } = o;
    // dim=0 is only valid for an empty index; normalizing here keeps the invariant
    // (dim===0 <=> no ids <=> no data) true right at the deserializer boundary.
    if (dim === 0) {
        return emptyVectors(tag);
    }
    const bytes = Buffer.from(b64, 'base64');
    if (bytes.byteLength % 4 !== 0) {
        return emptyVectors(tag);
    }
    // Copy the bytes into a fresh ArrayBuffer before viewing them as floats, so
    // the view starts at offset 0 regardless of where the Buffer's bytes live.
    const isolated = bytes.buffer.slice(bytes.byteOffset, bytes.byteOffset + bytes.byteLength);
    const floats = new Float32Array(isolated);
    // Defensive: the float count must equal ids * dim, otherwise treat as corrupt.
    if (floats.length !== ids.length * dim) {
        return emptyVectors(tag);
    }
    return { tag, dim, ids, data: Float32Array.from(floats) };
}
117
+ // ---- fs boundary (mirrors store.ts: atomic, 0o600, persistence-gated) ----
118
/**
 * Load the persisted vector index from VECTORS_PATH.
 * Best-effort: any failure while reading, JSON-parsing, or deserializing
 * yields an empty index instead of an error.
 *
 * @returns {Promise<{ tag: string, dim: number, ids: string[], data: Float32Array }>}
 */
export async function loadVectors() {
    try {
        const json = await readFile(VECTORS_PATH, 'utf8');
        const parsed = JSON.parse(json);
        return deserializeVectors(parsed);
    }
    catch {
        return emptyVectors();
    }
}
126
/**
 * Persist a vector index to VECTORS_PATH; a no-op when persistence is disabled.
 *
 * The JSON is written to a uniquely named temp file in the same directory and
 * then renamed over the target, so readers never observe a half-written file.
 * The temp file is created with mode 0o600 and chmod'ed again (failures of the
 * chmod are ignored).
 *
 * @param {{ tag: string, dim: number, ids: string[], data: Float32Array }} vi - Index to save.
 * @returns {Promise<void>}
 * @throws Re-throws any error from serializing, writing, or renaming, after a
 *   best-effort removal of the temp file.
 */
export async function saveVectors(vi) {
    if (!persistenceEnabled()) {
        return;
    }
    const dir = appHomePath('search');
    await mkdir(dir, { recursive: true });
    const tmp = join(dir, `vectors.${randomUUID()}.tmp`);
    const discardTmp = () => rm(tmp, { force: true }).catch(() => { });
    try {
        const payload = `${JSON.stringify(serializeVectors(vi))}\n`;
        await writeFile(tmp, payload, { mode: 0o600 });
        await chmod(tmp, 0o600).catch(() => { });
        await rename(tmp, VECTORS_PATH);
    }
    catch (e) {
        await discardTmp();
        throw e;
    }
}
142
/**
 * Modification time of the persisted vectors file.
 *
 * @returns {Promise<number>} `mtimeMs` of VECTORS_PATH, or 0 when `stat` fails
 *   (for example because the file does not exist).
 */
export async function vectorsMtimeMs() {
    try {
        const info = await stat(VECTORS_PATH);
        return info.mtimeMs;
    }
    catch {
        return 0;
    }
}
150
+ // ---- networked: embedding (the only part that talks to a provider) ----
151
+ const BATCH = 64;
152
/**
 * Resolve an embedder for `spec` by delegating to `resolveEmbedder`.
 * A thin re-export so search code can import everything from this module.
 *
 * @param {*} spec - Passed through to `resolveEmbedder` unchanged.
 * @returns {*} Whatever `resolveEmbedder(spec)` returns.
 */
export function getEmbedder(spec) {
    const embedder = resolveEmbedder(spec);
    return embedder;
}
156
/**
 * Embed many texts, BATCH at a time, one batch after another.
 *
 * Retry/backoff is handled per batch by `embedBatchWithRetry`; an error from
 * any batch propagates to the caller and the remaining batches are not sent.
 *
 * @param {{ model: * }} embedder - Embedder handed to the per-batch helper.
 * @param {string[]} texts - Texts to embed.
 * @returns {Promise<number[][]>} One embedding per input text, in input order.
 */
export async function embedTexts(embedder, texts) {
    const vectors = [];
    let start = 0;
    while (start < texts.length) {
        const batch = texts.slice(start, start + BATCH);
        const embedded = await embedBatchWithRetry(embedder, batch);
        vectors.push(...embedded);
        start += BATCH;
    }
    return vectors;
}
169
/**
 * Embed a single query string by sending it as a one-element batch.
 *
 * @param {{ model: * }} embedder - Embedder handed to the per-batch helper.
 * @param {string} text - Query text.
 * @returns {Promise<number[]>} The embedding of `text`.
 */
export async function embedQuery(embedder, text) {
    const vectors = await embedBatchWithRetry(embedder, [text]);
    return vectors[0];
}
173
/**
 * Embed one batch via `embedMany`, retrying transient failures with
 * exponential backoff.
 *
 * A failure is treated as retryable when its message mentions a rate limit,
 * timeout, connection reset, 429/503, or "overloaded". Up to 4 retries are made
 * (5 attempts in total), waiting 400ms, 800ms, 1600ms, then 3200ms. Any other
 * failure, or a retryable one on the last attempt, is re-thrown unchanged.
 *
 * @param {{ model: * }} embedder - Object whose `model` is passed to `embedMany`.
 * @param {string[]} values - Texts in this batch.
 * @param {number} [attempt=0] - Zero-based attempt counter (used by the recursion).
 * @returns {Promise<number[][]>} The `embeddings` array returned by `embedMany`.
 * @throws The original rejection value once retries are exhausted or the
 *   failure is not retryable.
 */
async function embedBatchWithRetry(embedder, values, attempt = 0) {
    try {
        const { embeddings } = await embedMany({ model: embedder.model, values });
        return embeddings;
    }
    catch (e) {
        // A rejection can be any value, including null/undefined or a non-Error.
        // Read the message defensively so classifying the failure never throws a
        // TypeError that would hide the original rejection.
        const msg = String(e?.message ?? '');
        const retryable = /429|rate.?limit|timeout|ECONNRESET|503|overloaded/i.test(msg);
        if (retryable && attempt < 4) {
            await new Promise((r) => setTimeout(r, 400 * 2 ** attempt));
            return embedBatchWithRetry(embedder, values, attempt + 1);
        }
        throw e;
    }
}
@@ -0,0 +1,203 @@
1
+ // ============================================================================
2
+ // src/search/engine.ts — the search orchestrator (the one module callers use).
3
+ //
4
+ // Implements the degradation ladder as a single search() call:
5
+ // mode='fts' → pure BM25 (the always-on floor)
6
+ // mode='semantic' → cosine over BYOK vectors (full recall)
7
+ // mode='hybrid' → BM25 ⊕ cosine ⊕ memory-importance prior, fused by RRF
8
+ // mode='auto' → hybrid when vectors are usable, else fts (the smart default)
9
+ //
10
+ // rankSearch() is the PURE core (index + optional vectors + optional query vector
11
+ // in, ranked hits out) so the whole ranking pipeline unit-tests with zero disk
12
+ // and zero network. search() is the thin disk/embedding wrapper: it caches the
13
+ // index by mtime, caches query embeddings in an LRU, resolves a BYOK embedder
14
+ // lazily, and on ANY embedding error degrades to BM25 with a `degraded` flag —
15
+ // search must never throw at the floor.
16
+ // ============================================================================
17
+ import { readFile } from 'node:fs/promises';
18
+ import { appHomePath } from '../brand.js';
19
+ import { bm25Search, termList } from './index-core.js';
20
+ import { rrfFuse } from './fuse.js';
21
+ import { cosineTopK, embedQuery, getEmbedder, loadVectors, vectorsMtimeMs, } from './embed-store.js';
22
+ import { indexMtimeMs, loadIndex } from './store.js';
23
/** How many candidates each ranking leg (BM25, cosine) contributes before fusion and the final limit. */
const CAND = 60;
/** Default number of characters makeSnippet keeps on each side of the first matched term. */
const SNIPPET_WIDTH = 64;
25
/**
 * Build a short excerpt of `text` for display.
 *
 * Whitespace runs are collapsed to single spaces first. If any query term
 * occurs in the lower-cased text, the excerpt spans `width` characters on
 * either side of the earliest occurrence, with an ellipsis on each truncated
 * edge. If no term occurs (e.g. a semantic-only hit), the excerpt is the first
 * `2 * width` characters, or the whole text when it is no longer than that.
 *
 * @param text   document text.
 * @param qTerms query terms to look for (matched against lower-cased text).
 * @param width  characters kept on each side of the match.
 */
function makeSnippet(text, qTerms, width = SNIPPET_WIDTH) {
    const collapsed = text.replace(/\s+/g, ' ').trim();
    const haystack = collapsed.toLowerCase();
    const offsets = Array.from(qTerms, (term) => haystack.indexOf(term)).filter((at) => at >= 0);
    if (offsets.length === 0) {
        const headLength = width * 2;
        if (collapsed.length <= headLength) {
            return collapsed;
        }
        return `${collapsed.slice(0, headLength).trim()}…`;
    }
    const firstHit = offsets.reduce((best, at) => (at < best ? at : best));
    const from = Math.max(0, firstHit - width);
    const to = Math.min(collapsed.length, firstHit + width);
    const lead = from > 0 ? '…' : '';
    const trail = to < collapsed.length ? '…' : '';
    return `${lead}${collapsed.slice(from, to).trim()}${trail}`;
}
41
/**
 * Collect the ids of indexed docs whose `source` is in `sources`.
 *
 * @param index   object with a `docs` Map of doc metadata.
 * @param sources Set of allowed sources, or a falsy value for "no filter".
 * @returns {Set|undefined} the matching ids, or undefined when no filter is given.
 */
function sourceFilteredIds(index, sources) {
    if (!sources) {
        return undefined;
    }
    const matching = [...index.docs.values()].filter((meta) => sources.has(meta.source));
    return new Set(matching.map((meta) => meta.id));
}
51
/**
 * Ranking core: turns an index, a query and (optionally) a vector set plus a
 * query vector into ranked hits. It touches no disk or network directly; all
 * work goes through the helpers it calls.
 *
 * Mode resolution:
 *  - 'fts' (or any non-semantic mode) always runs BM25 only.
 *  - 'semantic' / 'hybrid' run as requested when vectors are usable, i.e. the
 *    vector set has a non-zero `dim` and the query vector has that length;
 *    otherwise they fall back to BM25 with degraded = 'semantic-unavailable'.
 *  - 'auto' runs hybrid when vectors are usable, else BM25 with no degraded flag.
 *
 * @param index    index object (`docs` Map of doc metadata).
 * @param query    raw query string.
 * @param opts     { mode = 'auto', limit = 8, sources?, snippets? }.
 * @param vectors  optional vector set.
 * @param queryVec optional embedding of the query.
 * @returns { hits, mode, degraded, total }: `mode` is the mode actually
 *   executed and `total` the number of distinct ranked ids before the limit.
 */
export function rankSearch(index, query, opts = {}, vectors, queryVec) {
    const requested = opts.mode ?? 'auto';
    const maxHits = opts.limit ?? 8;
    const sources = opts.sources?.length ? new Set(opts.sources) : undefined;
    const qTerms = [...new Set(termList(query))];
    const bm25Ids = bm25Search(index, query, CAND, sources).map((hit) => hit.id);
    const vectorsUsable = Boolean(vectors && vectors.dim && queryVec && queryVec.length === vectors.dim);

    // Decide what actually runs, and whether the request had to be downgraded.
    let exec = 'fts';
    let degraded;
    if (requested === 'semantic' || requested === 'hybrid' || requested === 'auto') {
        if (vectorsUsable) {
            exec = requested === 'auto' ? 'hybrid' : requested;
        }
        else if (requested !== 'auto') {
            degraded = 'semantic-unavailable';
        }
    }

    let orderedIds = bm25Ids;
    if (exec !== 'fts') {
        const allowed = sourceFilteredIds(index, sources);
        // drop vector hits whose doc is no longer in the index
        const cosineIds = cosineTopK(vectors, queryVec, CAND, allowed)
            .filter((hit) => index.docs.has(hit.id))
            .map((hit) => hit.id);
        if (exec === 'semantic') {
            orderedIds = cosineIds;
        }
        else {
            // hybrid: BM25 + cosine + a down-weighted importance ranking of the
            // memory docs among the candidates, fused on rank.
            const memoryDocs = [];
            for (const id of new Set([...bm25Ids, ...cosineIds])) {
                const doc = index.docs.get(id);
                if (doc && doc.source === 'memory' && doc.importance != null) {
                    memoryDocs.push(doc);
                }
            }
            memoryDocs.sort((a, b) => (b.importance ?? 0) - (a.importance ?? 0));
            const priorIds = memoryDocs.map((doc) => doc.id);
            orderedIds = rrfFuse([
                { ids: bm25Ids },
                { ids: cosineIds },
                { ids: priorIds, weight: 0.4 },
            ]);
        }
    }

    const withSnippets = opts.snippets !== false;
    const hits = orderedIds
        .slice(0, maxHits)
        .map((id) => index.docs.get(id))
        .filter(Boolean)
        .map((doc) => ({
            id: doc.id,
            source: doc.source,
            title: doc.title,
            path: doc.path,
            noteType: doc.noteType,
            tags: doc.tags,
            score: 0, // ordering is rank-based; position in `hits` is the signal
            snippet: withSnippets ? makeSnippet(doc.text, qTerms) : '',
            importance: doc.importance,
        }));
    return { hits, mode: exec, degraded, total: new Set(orderedIds).size };
}
121
// ---- disk/embedding wrapper (the only impure part) -------------------------
/** Last loaded index as { index, mtime }, or null before the first load / after a reset. */
let indexCache = null;
/** Last loaded vector set as { vectors, mtime }, or null before the first load / after a reset. */
let vectorCache = null;
/** In-process cache of query embeddings; key = `${tag}\n${query}`. */
const queryVecLRU = new Map();
/** Maximum number of entries kept in queryVecLRU. */
const LRU_CAP = 100;
126
/**
 * Return the search index, reloading it only when indexMtimeMs() reports a
 * different value than the one stored with the cached copy.
 */
async function cachedIndex() {
    const mtime = await indexMtimeMs();
    if (indexCache && indexCache.mtime === mtime) {
        return indexCache.index;
    }
    const { index } = await loadIndex();
    indexCache = { index, mtime };
    return index;
}
134
/**
 * Return the vector set, reloading it via loadVectors() only when
 * vectorsMtimeMs() differs from the value stored with the cached copy.
 */
async function cachedVectors() {
    const mtime = await vectorsMtimeMs();
    if (vectorCache && vectorCache.mtime === mtime) {
        return vectorCache.vectors;
    }
    const vectors = await loadVectors();
    vectorCache = { vectors, mtime };
    return vectors;
}
141
/**
 * Read the optional `embeddingModel` setting from the config.json located by
 * appHomePath('config.json').
 *
 * @returns {Promise<*>} the parsed `embeddingModel` value, or undefined when
 *   the file cannot be read or parsed, or has no such key.
 */
async function configEmbeddingModel() {
    try {
        const raw = await readFile(appHomePath('config.json'), 'utf8');
        const { embeddingModel } = JSON.parse(raw);
        return embeddingModel;
    }
    catch {
        return undefined;
    }
}
151
/**
 * Forget everything cached in this process: the loaded index, the loaded
 * vector set, and all cached query embeddings. Intended for tests and for use
 * after a reindex within the same process.
 */
export function resetSearchCaches() {
    queryVecLRU.clear();
    vectorCache = null;
    indexCache = null;
}
157
/**
 * Public search entrypoint: the disk/embedding wrapper around rankSearch.
 *
 * Steps:
 *  1. Load the (cached) index. For mode 'fts' rank immediately.
 *  2. Resolve an embedder from opts.embeddingModel, then the
 *     SANOOK_EMBEDDING_MODEL environment variable, then the config file. With
 *     no embedder, rank with BM25; explicit 'semantic'/'hybrid' requests are
 *     flagged degraded = 'no-embedder'.
 *  3. Load the (cached) vectors. If there are none, or they were produced
 *     under a different embedder tag, rank with BM25 and flag 'no-vectors' /
 *     'embedding-model-changed' (the flag is cleared for mode 'auto').
 *  4. Embed the query, reusing a cached vector when one exists, and rank with
 *     vectors. If embedding throws, rank with BM25 and flag
 *     'semantic-unavailable'.
 *
 * The query-vector cache is least-recently-used: a hit moves the entry to the
 * most-recent position, and inserting beyond LRU_CAP evicts the oldest entry.
 *
 * @param query raw query string.
 * @param opts  { mode = 'auto', embeddingModel?, ...options read by rankSearch }.
 * @returns the rankSearch result, with `degraded` adjusted as described.
 */
export async function search(query, opts = {}) {
    const index = await cachedIndex();
    const mode = opts.mode ?? 'auto';
    if (mode === 'fts') {
        return rankSearch(index, query, opts);
    }
    const spec = opts.embeddingModel ?? process.env.SANOOK_EMBEDDING_MODEL ?? (await configEmbeddingModel());
    const embedder = getEmbedder(spec);
    if (!embedder) {
        const res = rankSearch(index, query, opts);
        if (mode === 'semantic' || mode === 'hybrid') {
            res.degraded = 'no-embedder';
        }
        return res;
    }
    const vectors = await cachedVectors();
    // a model change (different tag) invalidates the stored vectors → behave as no-vectors until reindex
    if (!vectors.dim || vectors.tag !== embedder.tag) {
        const res = rankSearch(index, query, opts);
        res.degraded = vectors.dim ? 'embedding-model-changed' : 'no-vectors';
        return mode === 'auto' ? { ...res, degraded: undefined } : res;
    }
    let queryVec;
    try {
        const key = `${embedder.tag}\n${query}`;
        queryVec = queryVecLRU.get(key);
        if (queryVec) {
            // Map iterates in insertion order, so delete + set moves this key to
            // the newest position; without it eviction would be FIFO and could
            // drop the most frequently used query first.
            queryVecLRU.delete(key);
            queryVecLRU.set(key, queryVec);
        }
        else {
            queryVec = await embedQuery(embedder, query);
            queryVecLRU.set(key, queryVec);
            if (queryVecLRU.size > LRU_CAP) {
                // first key in iteration order is the least recently used
                queryVecLRU.delete(queryVecLRU.keys().next().value);
            }
        }
    }
    catch {
        const res = rankSearch(index, query, opts); // embedding failed mid-query → BM25 floor
        res.degraded = 'semantic-unavailable';
        return res;
    }
    return rankSearch(index, query, opts, vectors, queryVec);
}
@@ -0,0 +1,35 @@
1
+ // ============================================================================
2
+ // src/search/fuse.ts — Reciprocal Rank Fusion (RRF).
3
+ //
4
+ // arra-oracle blends results with a hand-tuned linear formula
5
+ // (fts*0.7 + vec*0.65 + 0.12*overlap) that mixes BM25 magnitudes with cosine
6
+ // distances — two scales that are not comparable, so the weights are fragile and
7
+ // corpus-dependent. RRF sidesteps the whole problem: it fuses on RANK, not score,
8
+ // so a document's contribution depends only on where it placed in each list, not
9
+ // on the (incomparable) raw numbers. A doc that ranks well in two lists naturally
10
+ // sums two reciprocals and outranks a doc strong in only one. k=60 is the
11
+ // standard Cormack et al. constant. Pure, deterministic, parameter-light.
12
+ // ============================================================================
13
/** Default smoothing constant k added to each rank in the RRF denominator. */
const RRF_K = 60;
14
/**
 * Reciprocal Rank Fusion scores for a set of ranked id lists.
 *
 * Every occurrence of an id at 0-based position `rank` in a list adds
 * `weight / (k + rank)` to that id's score; a list without a weight counts as
 * weight 1.
 *
 * @param lists array of { ids, weight? }, each `ids` ordered best-first.
 * @param k     smoothing constant in the denominator.
 * @returns {Map} id → fused score (higher is better), in first-seen order.
 */
export function rrf(lists, k = RRF_K) {
    const scores = new Map();
    for (const { ids, weight } of lists) {
        const w = weight ?? 1;
        for (const [rank, id] of ids.entries()) {
            const soFar = scores.get(id) ?? 0;
            scores.set(id, soFar + w / (k + rank));
        }
    }
    return scores;
}
29
/**
 * Fuse ranked id lists with rrf() and return the ids best-first.
 *
 * Ordering is by descending fused score; equal scores are ordered by
 * ascending id so the result is deterministic.
 *
 * @param lists ranked lists, passed straight to rrf().
 * @param limit optional cap on the number of ids returned; null/undefined means all.
 * @param k     smoothing constant, passed straight to rrf().
 * @returns {Array} fused ids, best first.
 */
export function rrfFuse(lists, limit, k = RRF_K) {
    const byScoreThenId = ([idA, scoreA], [idB, scoreB]) => {
        const delta = scoreB - scoreA;
        if (delta) {
            return delta;
        }
        if (idA < idB) {
            return -1;
        }
        return idA > idB ? 1 : 0;
    };
    const ranked = Array.from(rrf(lists, k))
        .sort(byScoreThenId)
        .map(([id]) => id);
    if (limit == null) {
        return ranked;
    }
    return ranked.slice(0, limit);
}