zubo 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (222) hide show
  1. package/.github/workflows/ci.yml +35 -0
  2. package/README.md +149 -0
  3. package/bun.lock +216 -0
  4. package/desktop/README.md +57 -0
  5. package/desktop/package.json +12 -0
  6. package/desktop/src-tauri/Cargo.toml +25 -0
  7. package/desktop/src-tauri/build.rs +3 -0
  8. package/desktop/src-tauri/icons/README.md +17 -0
  9. package/desktop/src-tauri/icons/icon.png +0 -0
  10. package/desktop/src-tauri/src/main.rs +189 -0
  11. package/desktop/src-tauri/tauri.conf.json +68 -0
  12. package/docs/ROADMAP.md +490 -0
  13. package/migrations/001_init.sql +9 -0
  14. package/migrations/002_memory.sql +33 -0
  15. package/migrations/003_cron.sql +24 -0
  16. package/migrations/004_usage.sql +12 -0
  17. package/migrations/005_secrets.sql +8 -0
  18. package/migrations/006_agents.sql +1 -0
  19. package/migrations/007_workflows.sql +22 -0
  20. package/migrations/008_proactive.sql +24 -0
  21. package/migrations/009_uploads.sql +9 -0
  22. package/migrations/010_observability.sql +22 -0
  23. package/migrations/011_api_keys.sql +7 -0
  24. package/migrations/012_indexes.sql +5 -0
  25. package/migrations/013_budget.sql +11 -0
  26. package/migrations/014_usage_session_idx.sql +2 -0
  27. package/package.json +39 -0
  28. package/site/404.html +156 -0
  29. package/site/CNAME +1 -0
  30. package/site/docs/agents.html +294 -0
  31. package/site/docs/api.html +446 -0
  32. package/site/docs/channels.html +345 -0
  33. package/site/docs/cli.html +238 -0
  34. package/site/docs/config.html +1034 -0
  35. package/site/docs/index.html +433 -0
  36. package/site/docs/integrations.html +381 -0
  37. package/site/docs/memory.html +254 -0
  38. package/site/docs/security.html +375 -0
  39. package/site/docs/skills.html +322 -0
  40. package/site/docs.css +412 -0
  41. package/site/index.html +638 -0
  42. package/site/install.sh +98 -0
  43. package/site/logo.svg +1 -0
  44. package/site/og-image.png +0 -0
  45. package/site/robots.txt +4 -0
  46. package/site/script.js +361 -0
  47. package/site/sitemap.xml +63 -0
  48. package/site/skills.html +532 -0
  49. package/site/style.css +1686 -0
  50. package/src/agent/agents.ts +159 -0
  51. package/src/agent/compaction.ts +53 -0
  52. package/src/agent/context.ts +18 -0
  53. package/src/agent/delegate.ts +118 -0
  54. package/src/agent/loop.ts +318 -0
  55. package/src/agent/prompts.ts +111 -0
  56. package/src/agent/session.ts +87 -0
  57. package/src/agent/teams.ts +116 -0
  58. package/src/agent/workflow-executor.ts +192 -0
  59. package/src/agent/workflow.ts +175 -0
  60. package/src/channels/adapter.ts +21 -0
  61. package/src/channels/dashboard.html.ts +2969 -0
  62. package/src/channels/discord.ts +137 -0
  63. package/src/channels/optional-deps.d.ts +17 -0
  64. package/src/channels/router.ts +199 -0
  65. package/src/channels/signal.ts +133 -0
  66. package/src/channels/slack.ts +101 -0
  67. package/src/channels/telegram.ts +102 -0
  68. package/src/channels/utils.ts +18 -0
  69. package/src/channels/webchat.ts +1797 -0
  70. package/src/channels/whatsapp.ts +119 -0
  71. package/src/config/loader.ts +22 -0
  72. package/src/config/paths.ts +43 -0
  73. package/src/config/schema.ts +121 -0
  74. package/src/db/connection.ts +20 -0
  75. package/src/db/export.ts +148 -0
  76. package/src/db/migrations.ts +42 -0
  77. package/src/index.ts +261 -0
  78. package/src/llm/claude.ts +193 -0
  79. package/src/llm/factory.ts +115 -0
  80. package/src/llm/failover.ts +101 -0
  81. package/src/llm/openai-compat.ts +409 -0
  82. package/src/llm/provider.ts +83 -0
  83. package/src/llm/smart-router.ts +241 -0
  84. package/src/logs.ts +53 -0
  85. package/src/memory/chunker.ts +58 -0
  86. package/src/memory/document-parser.ts +115 -0
  87. package/src/memory/embedder.ts +235 -0
  88. package/src/memory/engine.ts +170 -0
  89. package/src/memory/fts-index.ts +55 -0
  90. package/src/memory/hybrid-search.ts +72 -0
  91. package/src/memory/store.ts +56 -0
  92. package/src/memory/vector-index.ts +72 -0
  93. package/src/model.ts +118 -0
  94. package/src/registry/cli.ts +43 -0
  95. package/src/registry/client.ts +54 -0
  96. package/src/registry/installer.ts +67 -0
  97. package/src/scheduler/briefing.ts +71 -0
  98. package/src/scheduler/cron.ts +258 -0
  99. package/src/scheduler/heartbeat.ts +58 -0
  100. package/src/scheduler/memory-triggers.ts +100 -0
  101. package/src/scheduler/natural-cron.ts +163 -0
  102. package/src/scheduler/proactive.ts +25 -0
  103. package/src/scheduler/recipes.ts +110 -0
  104. package/src/secrets/store.ts +64 -0
  105. package/src/setup.ts +413 -0
  106. package/src/skills.ts +293 -0
  107. package/src/start.ts +373 -0
  108. package/src/status.ts +165 -0
  109. package/src/tools/builtin/connect-service.ts +205 -0
  110. package/src/tools/builtin/cron.ts +126 -0
  111. package/src/tools/builtin/datetime.ts +36 -0
  112. package/src/tools/builtin/delegate-task.ts +81 -0
  113. package/src/tools/builtin/delegate.ts +42 -0
  114. package/src/tools/builtin/diagnose.ts +41 -0
  115. package/src/tools/builtin/google-oauth.ts +379 -0
  116. package/src/tools/builtin/manage-agents.ts +149 -0
  117. package/src/tools/builtin/manage-skills.ts +294 -0
  118. package/src/tools/builtin/manage-teams.ts +89 -0
  119. package/src/tools/builtin/manage-triggers.ts +94 -0
  120. package/src/tools/builtin/manage-workflows.ts +119 -0
  121. package/src/tools/builtin/memory-search.ts +38 -0
  122. package/src/tools/builtin/memory-write.ts +30 -0
  123. package/src/tools/builtin/run-workflow.ts +36 -0
  124. package/src/tools/builtin/secrets.ts +122 -0
  125. package/src/tools/builtin/skill-registry.ts +75 -0
  126. package/src/tools/builtin-integrations/api-helpers.ts +26 -0
  127. package/src/tools/builtin-integrations/github/github_issues/SKILL.md +56 -0
  128. package/src/tools/builtin-integrations/github/github_issues/handler.ts +108 -0
  129. package/src/tools/builtin-integrations/github/github_prs/SKILL.md +57 -0
  130. package/src/tools/builtin-integrations/github/github_prs/handler.ts +113 -0
  131. package/src/tools/builtin-integrations/github/github_repos/SKILL.md +37 -0
  132. package/src/tools/builtin-integrations/github/github_repos/handler.ts +88 -0
  133. package/src/tools/builtin-integrations/google/gmail/SKILL.md +51 -0
  134. package/src/tools/builtin-integrations/google/gmail/handler.ts +125 -0
  135. package/src/tools/builtin-integrations/google/google_calendar/SKILL.md +35 -0
  136. package/src/tools/builtin-integrations/google/google_calendar/handler.ts +105 -0
  137. package/src/tools/builtin-integrations/google/google_docs/SKILL.md +35 -0
  138. package/src/tools/builtin-integrations/google/google_docs/handler.ts +108 -0
  139. package/src/tools/builtin-integrations/google/google_drive/SKILL.md +39 -0
  140. package/src/tools/builtin-integrations/google/google_drive/handler.ts +106 -0
  141. package/src/tools/builtin-integrations/google/google_sheets/SKILL.md +36 -0
  142. package/src/tools/builtin-integrations/google/google_sheets/handler.ts +116 -0
  143. package/src/tools/builtin-integrations/jira/jira_boards/SKILL.md +21 -0
  144. package/src/tools/builtin-integrations/jira/jira_boards/handler.ts +74 -0
  145. package/src/tools/builtin-integrations/jira/jira_issues/SKILL.md +28 -0
  146. package/src/tools/builtin-integrations/jira/jira_issues/handler.ts +140 -0
  147. package/src/tools/builtin-integrations/linear/linear_issues/SKILL.md +30 -0
  148. package/src/tools/builtin-integrations/linear/linear_issues/handler.ts +75 -0
  149. package/src/tools/builtin-integrations/linear/linear_projects/SKILL.md +21 -0
  150. package/src/tools/builtin-integrations/linear/linear_projects/handler.ts +43 -0
  151. package/src/tools/builtin-integrations/notion/notion_databases/SKILL.md +39 -0
  152. package/src/tools/builtin-integrations/notion/notion_databases/handler.ts +83 -0
  153. package/src/tools/builtin-integrations/notion/notion_pages/SKILL.md +43 -0
  154. package/src/tools/builtin-integrations/notion/notion_pages/handler.ts +130 -0
  155. package/src/tools/builtin-integrations/notion/notion_search/SKILL.md +27 -0
  156. package/src/tools/builtin-integrations/notion/notion_search/handler.ts +69 -0
  157. package/src/tools/builtin-integrations/slack/slack_messages/SKILL.md +42 -0
  158. package/src/tools/builtin-integrations/slack/slack_messages/handler.ts +72 -0
  159. package/src/tools/builtin-integrations/twitter/twitter_posts/SKILL.md +24 -0
  160. package/src/tools/builtin-integrations/twitter/twitter_posts/handler.ts +133 -0
  161. package/src/tools/builtin-skills/file-read/SKILL.md +26 -0
  162. package/src/tools/builtin-skills/file-read/handler.ts +66 -0
  163. package/src/tools/builtin-skills/file-write/SKILL.md +30 -0
  164. package/src/tools/builtin-skills/file-write/handler.ts +64 -0
  165. package/src/tools/builtin-skills/http-request/SKILL.md +34 -0
  166. package/src/tools/builtin-skills/http-request/handler.ts +87 -0
  167. package/src/tools/builtin-skills/shell/SKILL.md +26 -0
  168. package/src/tools/builtin-skills/shell/handler.ts +96 -0
  169. package/src/tools/builtin-skills/url-fetch/SKILL.md +26 -0
  170. package/src/tools/builtin-skills/url-fetch/handler.ts +37 -0
  171. package/src/tools/builtin-skills/web-search/SKILL.md +26 -0
  172. package/src/tools/builtin-skills/web-search/handler.ts +50 -0
  173. package/src/tools/executor.ts +205 -0
  174. package/src/tools/integration-installer.ts +106 -0
  175. package/src/tools/permissions.ts +45 -0
  176. package/src/tools/registry.ts +39 -0
  177. package/src/tools/sandbox-runner.ts +56 -0
  178. package/src/tools/sandbox.ts +82 -0
  179. package/src/tools/skill-installer.ts +52 -0
  180. package/src/tools/skill-loader.ts +259 -0
  181. package/src/types/optional-deps.d.ts +23 -0
  182. package/src/util/auth.ts +121 -0
  183. package/src/util/costs.ts +59 -0
  184. package/src/util/error-buffer.ts +32 -0
  185. package/src/util/google-tokens.ts +180 -0
  186. package/src/util/logger.ts +73 -0
  187. package/src/util/perf-collector.ts +35 -0
  188. package/src/util/rate-limiter.ts +70 -0
  189. package/src/util/tokens.ts +17 -0
  190. package/src/voice/stt.ts +57 -0
  191. package/src/voice/tts.ts +103 -0
  192. package/tests/agent/session.test.ts +109 -0
  193. package/tests/agent-loop.test.ts +54 -0
  194. package/tests/auth.test.ts +89 -0
  195. package/tests/channels.test.ts +67 -0
  196. package/tests/compaction.test.ts +44 -0
  197. package/tests/config.test.ts +51 -0
  198. package/tests/costs.test.ts +19 -0
  199. package/tests/cron.test.ts +55 -0
  200. package/tests/db/export.test.ts +219 -0
  201. package/tests/executor.test.ts +144 -0
  202. package/tests/export.test.ts +137 -0
  203. package/tests/helpers/mock-llm.ts +34 -0
  204. package/tests/helpers/test-db.ts +74 -0
  205. package/tests/integration/chat-flow.test.ts +48 -0
  206. package/tests/integrations.test.ts +97 -0
  207. package/tests/memory/engine.test.ts +114 -0
  208. package/tests/memory-engine.test.ts +57 -0
  209. package/tests/permissions.test.ts +21 -0
  210. package/tests/rate-limiter.test.ts +70 -0
  211. package/tests/registry.test.ts +67 -0
  212. package/tests/router.test.ts +36 -0
  213. package/tests/session.test.ts +58 -0
  214. package/tests/skill-loader.test.ts +44 -0
  215. package/tests/tokens.test.ts +30 -0
  216. package/tests/tools/executor.test.ts +130 -0
  217. package/tests/util/auth.test.ts +75 -0
  218. package/tests/util/rate-limiter.test.ts +73 -0
  219. package/tests/voice.test.ts +60 -0
  220. package/tests/webchat.test.ts +88 -0
  221. package/tests/workflow.test.ts +38 -0
  222. package/tsconfig.json +16 -0
@@ -0,0 +1,241 @@
1
+ import type {
2
+ LlmProvider,
3
+ LlmRequest,
4
+ LlmResponse,
5
+ LlmStreamEvent,
6
+ } from "./provider";
7
+ import { logger } from "../util/logger";
8
+
9
/** Substrings that suggest the message contains or refers to source code. */
const CODE_MARKERS = [
  "```",
  "function ",
  "const ",
  "let ",
  "var ",
  "import ",
  "export ",
  "class ",
  "interface ",
  "=>",
  "async ",
  "await ",
  "return ",
  ".ts",
  ".js",
  ".py",
  ".tsx",
  ".jsx",
  "/src/",
  "/lib/",
  "/bin/",
  "node_modules",
];

/** Phrases that suggest the user is asking for multi-step work. */
const MULTI_STEP_INDICATORS = [
  "step by step",
  "analyze",
  "compare",
  "implement",
  "refactor",
  "debug",
  "build",
  "create a",
  "write a",
  "design",
  "architect",
  "optimize",
  "migrate",
  "convert",
  "transform",
];

/** Phrases that suggest the user wants an explanation or in-depth reasoning. */
const REASONING_INDICATORS = [
  "why",
  "how does",
  "explain in detail",
  "trade-offs",
  "tradeoffs",
  "pros and cons",
  "what are the differences",
  "elaborate",
  "break down",
  "walk me through",
  "reasoning",
  "implications",
];

/**
 * Decide whether a user message can go to the fast/cheap model ("simple")
 * or should be handled by the primary model ("complex").
 *
 * A message is "complex" when it has 50 or more whitespace-separated words,
 * or when its lower-cased text contains any entry of the marker tables above
 * as a plain substring. Everything else is "simple".
 *
 * @param text Raw user message.
 * @returns The complexity bucket for the message.
 */
export function classifyComplexity(text: string): "simple" | "complex" {
  const normalized = text.toLowerCase().trim();

  // Length check first: long messages are complex regardless of content.
  const wordCount = normalized.split(/\s+/).filter(Boolean).length;
  if (wordCount >= 50) return "complex";

  const mentionsAny = (needles: readonly string[]): boolean =>
    needles.some((needle) => normalized.includes(needle));

  const hasComplexitySignal =
    mentionsAny(CODE_MARKERS) ||
    mentionsAny(MULTI_STEP_INDICATORS) ||
    mentionsAny(REASONING_INDICATORS);

  return hasComplexitySignal ? "complex" : "simple";
}
105
+
106
/**
 * LLM provider that forwards each request to either a fast model or the
 * primary model, chosen by `classifyComplexity` on the latest user message.
 *
 * `providerName` and `model` are updated on every routed call to reflect the
 * provider that was selected (or fallen back to), so they describe the most
 * recent request rather than a fixed configuration.
 */
export class SmartRouterProvider implements LlmProvider {
  providerName: string;
  model: string;
  contextWindow: number;

  /**
   * @param primary Provider used for complex requests and as the fallback.
   * @param fast Provider tried first for requests classified as simple.
   * @param enabled When false, every request goes straight to `primary`.
   */
  constructor(
    private primary: LlmProvider,
    private fast: LlmProvider,
    private enabled: boolean,
  ) {
    this.providerName = primary.providerName;
    this.model = primary.model;
    this.contextWindow = primary.contextWindow;
  }

  /**
   * Turn an arbitrary thrown value into a log-safe string.
   * Reading `.message` directly would itself throw for `null`/`undefined`
   * rejections and abort the fallback path.
   */
  private static describeError(err: unknown): string {
    return err instanceof Error ? err.message : String(err);
  }

  /** Record which underlying provider is serving the current request. */
  private markActive(provider: LlmProvider): void {
    this.providerName = provider.providerName;
    this.model = provider.model;
  }

  /** Pick the provider for a request and record it as the active one. */
  private selectProvider(request: LlmRequest): LlmProvider {
    if (!this.enabled) {
      return this.primary;
    }

    // Classification only looks at the most recent user message.
    const lastUserMsg = [...request.messages]
      .reverse()
      .find((m) => m.role === "user");

    if (!lastUserMsg) {
      return this.primary;
    }

    // Structured content: join the text blocks, ignore everything else.
    const text = typeof lastUserMsg.content === "string"
      ? lastUserMsg.content
      : lastUserMsg.content
          .filter((b) => b.type === "text")
          .map((b) => b.text ?? "")
          .join(" ");

    if (classifyComplexity(text) === "simple") {
      logger.info("Smart router: using fast model", {
        provider: this.fast.providerName,
        model: this.fast.model,
        reason: "simple query",
      });
      this.markActive(this.fast);
      return this.fast;
    }

    logger.info("Smart router: using primary model", {
      provider: this.primary.providerName,
      model: this.primary.model,
      reason: "complex query",
    });
    this.markActive(this.primary);
    return this.primary;
  }

  /**
   * Non-streaming completion. If the fast provider was selected and fails,
   * the request is retried once on the primary provider.
   */
  async chat(request: LlmRequest): Promise<LlmResponse> {
    const provider = this.selectProvider(request);

    if (provider !== this.fast) {
      return provider.chat(request);
    }

    try {
      return await provider.chat(request);
    } catch (err: unknown) {
      logger.warn("Fast model failed, falling back to primary", {
        error: SmartRouterProvider.describeError(err),
      });
      this.markActive(this.primary);
      return this.primary.chat(request);
    }
  }

  /**
   * Streaming completion.
   *
   * When the fast provider is selected, its whole stream is buffered first and
   * only replayed if it completes without error; otherwise nothing from the
   * fast model is emitted and the primary provider handles the request. If the
   * primary provider has no `chatStream`, its `chat` response is converted
   * into an equivalent sequence of stream events.
   */
  async *chatStream(request: LlmRequest): AsyncIterable<LlmStreamEvent> {
    const MAX_STREAM_EVENTS = 50_000;
    const provider = this.selectProvider(request);

    if (provider === this.fast) {
      if (provider.chatStream) {
        const events: LlmStreamEvent[] = [];
        let succeeded = false;
        try {
          for await (const event of provider.chatStream(request)) {
            // Bound the buffer so a runaway stream cannot exhaust memory.
            if (events.length >= MAX_STREAM_EVENTS) {
              throw new Error(`Stream exceeded maximum event limit (${MAX_STREAM_EVENTS})`);
            }
            events.push(event);
          }
          succeeded = true;
        } catch (err: unknown) {
          logger.warn("Fast model stream failed, falling back to primary", {
            error: SmartRouterProvider.describeError(err),
          });
        }

        if (succeeded) {
          for (const event of events) {
            yield event;
          }
          return;
        }
      } else {
        logger.info("Fast model has no streaming support, using primary");
      }

      // Either the fast stream failed or the fast model cannot stream.
      this.markActive(this.primary);
    }

    // Use primary model (streaming or non-streaming fallback)
    if (this.primary.chatStream) {
      yield* this.primary.chatStream(request);
    } else {
      const response = await this.primary.chat(request);
      for (const block of response.content) {
        if (block.type === "text" && block.text) {
          yield { type: "text_delta", text: block.text };
        } else if (block.type === "tool_use") {
          yield { type: "tool_use_start", id: block.id!, name: block.name! };
          yield { type: "tool_use_end", id: block.id! };
        }
      }
      yield { type: "message_done", response };
    }
  }
}
package/src/logs.ts ADDED
@@ -0,0 +1,53 @@
1
+ import { existsSync, readFileSync, watchFile } from "fs";
2
+ import { paths } from "./config/paths";
3
+ import { logger } from "./util/logger";
4
+
5
/**
 * Print the Zubo log file to the console.
 *
 * @param follow When true, keep running and print new log output as it is
 *   appended; when false, print the last 50 lines and return.
 */
export async function showLogs(follow = false) {
  const logExists = existsSync(paths.logFile);
  if (!logExists) {
    console.log("No log file found. Start Zubo first.");
    return;
  }

  if (!follow) {
    tailLast(50);
    return;
  }

  await tailFollow();
}
17
+
18
/**
 * Print the last `n` lines of the log file.
 * The whole file is read into memory and trailing whitespace is ignored.
 */
function tailLast(n: number) {
  const lines = readFileSync(paths.logFile, "utf-8").trimEnd().split("\n");
  console.log(lines.slice(-n).join("\n"));
}
24
+
25
/**
 * Print the last 10 log lines, then poll the log file every 500 ms and echo
 * whatever was appended since the previous poll. Never resolves.
 */
async function tailFollow() {
  tailLast(10);
  console.log("--- following logs (Ctrl+C to stop) ---\n");

  // Byte offset of the data that has already been printed.
  let pos = readFileSync(paths.logFile).byteLength;

  const onPoll = () => {
    try {
      const buf = readFileSync(paths.logFile);
      const size = buf.byteLength;

      // A shorter file means it was truncated/rotated: start over from the top.
      if (size < pos) {
        pos = 0;
      }

      if (size > pos) {
        process.stdout.write(buf.subarray(pos).toString());
        pos = size;
      }
    } catch (err: any) {
      logger.warn("Failed to read log file update", { error: (err as Error).message });
    }
  };

  watchFile(paths.logFile, { interval: 500 }, onPoll);

  // Keep process alive
  await new Promise(() => {});
}
@@ -0,0 +1,58 @@
1
+ import { estimateTokens } from "../util/tokens";
2
+
3
const CHUNK_SIZE = 400; // tokens
const CHUNK_OVERLAP = 80; // tokens
const CHARS_PER_TOKEN = 4; // rough characters-per-token estimate used for sizing

/** One slice of a source document. */
export interface Chunk {
  /** Trimmed text of the chunk. */
  content: string;
  /** Zero-based position of the chunk within its source. */
  index: number;
  /** Identifier of the document the chunk was cut from. */
  sourceFile: string;
}

/**
 * Split text into overlapping chunks of roughly 400 tokens
 * (1600 characters), with about 80 tokens (320 characters) of overlap.
 *
 * A chunk that does not reach the end of the text is cut at the last
 * paragraph break, newline, or sentence end found in its second half, when
 * there is one. The next chunk always starts `overlap` characters before the
 * point where the previous chunk actually ended, so no text is skipped when a
 * chunk is cut short at such a boundary.
 *
 * @param text Full document text.
 * @param sourceFile Value copied into every chunk's `sourceFile`.
 * @returns Chunks in document order; text that fits in a single chunk is
 *   returned as exactly one (trimmed) chunk.
 */
export function chunkText(text: string, sourceFile: string): Chunk[] {
  const chunkChars = CHUNK_SIZE * CHARS_PER_TOKEN;
  const overlapChars = CHUNK_OVERLAP * CHARS_PER_TOKEN;

  if (text.length <= chunkChars) {
    return [{ content: text.trim(), index: 0, sourceFile }];
  }

  const chunks: Chunk[] = [];
  // A boundary is only used when it keeps at least half a chunk of text.
  const minBreak = chunkChars * 0.5;
  let offset = 0;

  while (offset < text.length) {
    let end = Math.min(offset + chunkChars, text.length);

    // Try to break at a paragraph, line, or sentence boundary.
    if (end < text.length) {
      const window = text.slice(offset, end);
      const lastPara = window.lastIndexOf("\n\n");
      const lastNewline = window.lastIndexOf("\n");
      const lastPeriod = window.lastIndexOf(". ");

      if (lastPara > minBreak) {
        end = offset + lastPara + 2;
      } else if (lastNewline > minBreak) {
        end = offset + lastNewline + 1;
      } else if (lastPeriod > minBreak) {
        end = offset + lastPeriod + 2;
      }
    }

    const content = text.slice(offset, end).trim();
    if (content.length > 0) {
      chunks.push({ content, index: chunks.length, sourceFile });
    }

    // The tail has been emitted; anything further would only repeat it.
    if (end >= text.length) break;

    // Advance relative to where this chunk really ended (not by a fixed
    // step), and always move forward by at least one character.
    offset = Math.max(end - overlapChars, offset + 1);
  }

  return chunks;
}
@@ -0,0 +1,115 @@
1
+ import { readFileSync } from "fs";
2
+ import { extname } from "path";
3
+ import { logger } from "../util/logger";
4
+
5
/** Text extracted from a document together with basic metadata. */
export interface ParsedDocument {
  /** Extracted text, or a bracketed placeholder when extraction was not possible. */
  text: string;
  metadata: {
    /** Last path segment of the parsed file. */
    filename: string;
    /** MIME type the caller supplied. */
    mimeType: string;
    /** Page count; only set for successfully parsed PDFs. */
    pages?: number;
    /** Whitespace-separated word count of `text` (0 for placeholders). */
    wordCount: number;
  };
}

/**
 * Extract plain text from a file on disk.
 *
 * - `text/plain`, `text/markdown`, `text/csv`: read as UTF-8.
 * - `application/pdf`: parsed with the optional `pdf-parse` package, limited
 *   to 30 seconds and 5,000,000 characters of output.
 * - DOCX: parsed with the optional `mammoth` package.
 * - Anything else: read as UTF-8 when the extension is a known text format,
 *   otherwise a placeholder is returned.
 *
 * PDF/DOCX failures (missing package, parse error, timeout) are logged and
 * reported as a placeholder document instead of being thrown.
 *
 * @param filePath Path of the file to read.
 * @param mimeType MIME type used to select the parser.
 * @returns The extracted text and metadata.
 * @throws If reading a plain-text file from disk fails.
 */
export async function parseDocument(
  filePath: string,
  mimeType: string
): Promise<ParsedDocument> {
  const filename = filePath.split("/").pop() ?? "unknown";

  switch (mimeType) {
    case "text/plain":
    case "text/markdown":
    case "text/csv": {
      const text = readFileSync(filePath, "utf-8");
      return {
        text,
        metadata: { filename, mimeType, wordCount: countWords(text) },
      };
    }

    case "application/pdf": {
      try {
        const pdfParse = (await import("pdf-parse")).default;
        const buffer = readFileSync(filePath);
        const MAX_TEXT_LENGTH = 5_000_000; // characters of extracted text
        const TIMEOUT_MS = 30_000;

        let timer: ReturnType<typeof setTimeout> | undefined;
        let data: { text: string; numpages: number };
        try {
          data = await Promise.race([
            pdfParse(buffer) as Promise<{ text: string; numpages: number }>,
            new Promise<never>((_, reject) => {
              timer = setTimeout(() => reject(new Error("PDF parsing timeout")), TIMEOUT_MS);
            }),
          ]);
        } finally {
          // Without this the pending timer keeps the event loop alive for the
          // full timeout even after parsing has already finished.
          clearTimeout(timer);
        }

        const text = data.text.length > MAX_TEXT_LENGTH
          ? data.text.slice(0, MAX_TEXT_LENGTH) + "\n[Truncated]"
          : data.text;
        return {
          text,
          metadata: {
            filename,
            mimeType,
            pages: data.numpages,
            wordCount: countWords(text),
          },
        };
      } catch (err: unknown) {
        const message = err instanceof Error ? err.message : String(err);
        logger.warn("PDF parsing failed — install pdf-parse for PDF support", { error: message });
        return {
          text: `[PDF file: ${filename} — install pdf-parse for content extraction]`,
          metadata: { filename, mimeType, wordCount: 0 },
        };
      }
    }

    case "application/vnd.openxmlformats-officedocument.wordprocessingml.document": {
      try {
        const mammoth = await import("mammoth");
        const result = await mammoth.extractRawText({ buffer: readFileSync(filePath) });
        return {
          text: result.value,
          metadata: { filename, mimeType, wordCount: countWords(result.value) },
        };
      } catch (err: unknown) {
        const message = err instanceof Error ? err.message : String(err);
        logger.warn("DOCX parsing failed — install mammoth for DOCX support", { error: message });
        return {
          text: `[DOCX file: ${filename} — install mammoth for content extraction]`,
          metadata: { filename, mimeType, wordCount: 0 },
        };
      }
    }

    default: {
      // Unknown MIME type: fall back to the extension to decide whether the
      // file can be read as text.
      const ext = extname(filePath).toLowerCase();
      if ([".txt", ".md", ".csv", ".json", ".xml", ".yaml", ".yml", ".ts", ".js", ".py", ".sh"].includes(ext)) {
        const text = readFileSync(filePath, "utf-8");
        return {
          text,
          metadata: { filename, mimeType, wordCount: countWords(text) },
        };
      }
      return {
        text: `[Unsupported file type: ${mimeType}]`,
        metadata: { filename, mimeType, wordCount: 0 },
      };
    }
  }
}
95
+
96
/** Count the non-empty, whitespace-separated tokens in `text`. */
function countWords(text: string): number {
  let total = 0;
  for (const piece of text.split(/\s+/)) {
    // Leading/trailing whitespace produces empty pieces; skip them.
    if (piece) total += 1;
  }
  return total;
}
99
+
100
/** Lower-case file extension (with leading dot) to MIME type. */
const MIME_MAP: Record<string, string> = {
  ".txt": "text/plain",
  ".md": "text/markdown",
  ".csv": "text/csv",
  ".pdf": "application/pdf",
  ".docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
  ".json": "application/json",
  ".xml": "application/xml",
  ".yaml": "text/yaml",
  ".yml": "text/yaml",
};

/**
 * Guess a MIME type from a file name's extension (case-insensitive).
 *
 * @param filename File name or path.
 * @returns The mapped MIME type, or "application/octet-stream" when the
 *   extension is missing or not in the table.
 */
export function guessMimeType(filename: string): string {
  const extension = extname(filename).toLowerCase();
  const known = MIME_MAP[extension];
  return known ?? "application/octet-stream";
}
@@ -0,0 +1,235 @@
1
+ import { InferenceSession, Tensor } from "onnxruntime-node";
2
+ import { paths } from "../config/paths";
3
+ import { join } from "path";
4
+ import { existsSync } from "fs";
5
+ import { logger } from "../util/logger";
6
+
7
/** sentence-transformers model used to compute embeddings. */
const MODEL_NAME = "all-MiniLM-L6-v2";
/** Download location of the ONNX model weights. */
const MODEL_URL =
  "https://huggingface.co/sentence-transformers/" + MODEL_NAME + "/resolve/main/onnx/model.onnx";
/** Download location of the tokenizer definition (includes the vocabulary). */
const TOKENIZER_URL =
  "https://huggingface.co/sentence-transformers/" + MODEL_NAME + "/resolve/main/tokenizer.json";
/** Length of the vectors produced by `embed`. */
const EMBEDDING_DIM = 384;

// Module-level state, populated by initEmbedder(); both stay null until then.
let session: InferenceSession | null = null;
let vocab: Map<string, number> | null = null;
14
+
15
/**
 * Download `url` to `dest` unless `dest` already exists.
 *
 * The response body is buffered fully in memory before being written.
 *
 * @throws If the HTTP response status is not OK.
 */
async function downloadFile(url: string, dest: string) {
  if (existsSync(dest)) return;

  const toMegabytes = (bytes: number): string => (bytes / 1024 / 1024).toFixed(1);

  logger.info(`Downloading ${url}...`);
  const resp = await fetch(url, { redirect: "follow" });
  if (!resp.ok) {
    throw new Error(`Failed to download ${url}: ${resp.status}`);
  }

  // The size header is optional; only report it when the server sends one.
  const contentLength = resp.headers.get("content-length");
  if (contentLength) {
    logger.info(`Download size: ${toMegabytes(Number(contentLength))} MB`);
  }

  const buffer = await resp.arrayBuffer();
  await Bun.write(dest, buffer);
  logger.info(`Saved to ${dest} (${toMegabytes(buffer.byteLength)} MB)`);
}
28
+
29
/**
 * Make sure the model directory exists and contains the ONNX model and the
 * tokenizer file, downloading whichever of the two is missing.
 *
 * @returns Path of the model directory.
 * @throws If the directory cannot be created or a download fails.
 */
async function ensureModel(): Promise<string> {
  const modelDir = join(paths.models, MODEL_NAME);

  // Create the directory through the fs API instead of spawning `mkdir -p`:
  // this works on platforms without a `mkdir` binary and reports failures
  // instead of silently ignoring the exit status.
  const { mkdir } = await import("fs/promises");
  await mkdir(modelDir, { recursive: true });

  const modelPath = join(modelDir, "model.onnx");
  const tokenizerPath = join(modelDir, "tokenizer.json");

  await Promise.all([
    downloadFile(MODEL_URL, modelPath),
    downloadFile(TOKENIZER_URL, tokenizerPath),
  ]);

  return modelDir;
}
43
+
44
/**
 * WordPiece tokenization matching BERT/MiniLM expectations.
 * 1. Lowercase + strip accents
 * 2. Split on whitespace and punctuation: runs of `[a-z0-9]` form words, and
 *    every other non-whitespace character (including `_` and characters
 *    outside the Basic Multilingual Plane) becomes its own one-character word
 * 3. For each word, greedily match longest vocab prefix, then continue with ## prefixed subwords
 *
 * A word that cannot be fully covered by vocabulary entries becomes a single
 * `[UNK]` token. The sequence is wrapped in `[CLS]` … `[SEP]`, truncated so
 * that it fits in `maxLen`, and padded with `[PAD]` up to `maxLen`.
 *
 * @param text Input text.
 * @param vocabMap Token string to token id.
 * @param maxLen Length of the returned arrays.
 * @returns Token ids and a parallel mask that is 1 for real tokens and 0 for padding.
 */
function wordPieceTokenize(
  text: string,
  vocabMap: Map<string, number>,
  maxLen: number = 128
): { inputIds: number[]; attentionMask: number[] } {
  // Fall back to the conventional BERT ids when a special token is missing.
  const CLS = vocabMap.get("[CLS]") ?? 101;
  const SEP = vocabMap.get("[SEP]") ?? 102;
  const UNK = vocabMap.get("[UNK]") ?? 100;
  const PAD = vocabMap.get("[PAD]") ?? 0;

  // Basic pre-tokenization: lowercase and strip combining accents.
  const normalized = text
    .toLowerCase()
    .normalize("NFD")
    .replace(/[\u0300-\u036f]/g, "");

  // Alphanumeric runs are words; any other non-space character stands alone.
  // The class is spelled out (instead of \w) so "_" is kept as punctuation,
  // and the `u` flag keeps surrogate pairs together as one character.
  const words = normalized.match(/[a-z0-9]+|[^\sa-z0-9]/gu) ?? [];

  // One slot is reserved for the trailing [SEP].
  const budget = maxLen - 1;
  const tokens: number[] = [CLS];

  for (const word of words) {
    if (tokens.length >= budget) break;

    // WordPiece: greedily match longest subword from vocab
    const subTokens: number[] = [];
    let start = 0;
    let isBad = false;

    while (start < word.length) {
      let end = word.length;
      let found = false;

      while (start < end) {
        const piece = word.slice(start, end);
        // Continuation pieces are stored with a "##" prefix.
        const id = vocabMap.get(start > 0 ? "##" + piece : piece);
        if (id !== undefined) {
          subTokens.push(id);
          found = true;
          break;
        }
        end--;
      }

      if (!found) {
        isBad = true;
        break;
      }
      start = end;
    }

    if (isBad) {
      tokens.push(UNK);
    } else {
      for (const st of subTokens) {
        if (tokens.length >= budget) break;
        tokens.push(st);
      }
    }
  }

  tokens.push(SEP);

  const attentionMask = new Array<number>(maxLen).fill(0);
  for (let i = 0; i < tokens.length; i++) attentionMask[i] = 1;

  while (tokens.length < maxLen) tokens.push(PAD);

  return { inputIds: tokens, attentionMask };
}
122
+
123
/**
 * Load the token → id vocabulary from `tokenizer.json` in `modelDir`.
 *
 * The table is read from the `model.vocab` object of the JSON document.
 *
 * @param modelDir Directory containing `tokenizer.json`.
 * @returns Vocabulary map.
 * @throws If the file has no usable `model.vocab` table. An empty vocabulary
 *   would make every word tokenize to `[UNK]`, so it is reported as an error
 *   rather than returned.
 */
async function loadVocab(modelDir: string): Promise<Map<string, number>> {
  const tokenizerPath = join(modelDir, "tokenizer.json");
  const raw = await Bun.file(tokenizerPath).json();

  const table: unknown = raw?.model?.vocab;
  if (typeof table !== "object" || table === null) {
    throw new Error(`Tokenizer file ${tokenizerPath} has no model.vocab table`);
  }

  const map = new Map<string, number>();
  for (const [token, id] of Object.entries(table)) {
    // Skip malformed entries instead of storing non-numeric ids.
    if (typeof id === "number") {
      map.set(token, id);
    }
  }

  if (map.size === 0) {
    throw new Error(`Tokenizer file ${tokenizerPath} contains an empty vocabulary`);
  }

  return map;
}
136
+
137
/**
 * Download (if needed) and load the embedding model and its vocabulary.
 *
 * The new vocabulary and session are fully created before the module state is
 * replaced, so a failed re-initialization leaves a previously working
 * embedder untouched. The replaced session is released afterwards.
 *
 * @returns true when the embedder is ready; false when initialization failed
 *   (the failure is logged and callers are expected to fall back to FTS-only).
 */
export async function initEmbedder(): Promise<boolean> {
  try {
    const modelDir = await ensureModel();
    const modelPath = join(modelDir, "model.onnx");

    // Load the vocabulary first: if it is unusable, no session is created
    // that would then have to be cleaned up.
    const nextVocab = await loadVocab(modelDir);
    const nextSession = await InferenceSession.create(modelPath, {
      executionProviders: ["cpu"],
    });

    const previous = session;
    session = nextSession;
    vocab = nextVocab;

    // Free the replaced session to prevent a memory leak on re-initialization.
    if (previous) {
      try {
        await previous.release();
      } catch {
        /* best effort: the new session is already in place */
      }
    }

    logger.info("Embedder initialized", {
      model: MODEL_NAME,
      dim: EMBEDDING_DIM,
      vocabSize: nextVocab.size,
    });
    return true;
  } catch (err: unknown) {
    logger.warn("Failed to initialize embedder, falling back to FTS-only", {
      error: err instanceof Error ? err.message : String(err),
    });
    return false;
  }
}
165
+
166
/**
 * Compute an L2-normalized sentence embedding for `text`.
 *
 * The text is tokenized, run through the ONNX session, and the per-token
 * vectors of the `last_hidden_state` output are mean-pooled over the
 * positions whose attention mask is 1.
 *
 * @returns A vector of `EMBEDDING_DIM` floats, or null when the embedder has
 *   not been initialized or the model produced no `last_hidden_state` output.
 */
export async function embed(text: string): Promise<Float32Array | null> {
  if (!session || !vocab) return null;

  const { inputIds, attentionMask } = wordPieceTokenize(text, vocab);
  const seqLen = inputIds.length;

  // All model inputs are int64 tensors of shape [batch = 1, seqLen].
  const int64Row = (values: number[]): Tensor =>
    new Tensor("int64", BigInt64Array.from(values.map(BigInt)), [1, seqLen]);

  const feeds: Record<string, Tensor> = {
    input_ids: int64Row(inputIds),
    attention_mask: int64Row(attentionMask),
    // Single-segment input: every token type id is zero.
    token_type_ids: new Tensor("int64", new BigInt64Array(seqLen), [1, seqLen]),
  };

  const output = await session.run(feeds);

  const lastHidden = output["last_hidden_state"];
  if (!lastHidden) return null;

  const data = lastHidden.data as Float32Array;
  const embedding = new Float32Array(EMBEDDING_DIM);

  // Mean pooling over non-padding tokens
  let count = 0;
  for (let i = 0; i < seqLen; i++) {
    if (attentionMask[i] !== 1) continue;
    const rowStart = i * EMBEDDING_DIM;
    for (let j = 0; j < EMBEDDING_DIM; j++) {
      embedding[j] += data[rowStart + j];
    }
    count++;
  }
  for (let j = 0; j < EMBEDDING_DIM; j++) {
    embedding[j] /= count;
  }

  // L2 normalize (skipped for an all-zero vector to avoid dividing by zero)
  let sumSquares = 0;
  for (let j = 0; j < EMBEDDING_DIM; j++) {
    sumSquares += embedding[j] * embedding[j];
  }
  const norm = Math.sqrt(sumSquares);
  if (norm > 0) {
    for (let j = 0; j < EMBEDDING_DIM; j++) {
      embedding[j] /= norm;
    }
  }

  return embedding;
}
226
+
227
/**
 * Dot product of two vectors, iterating over the length of `a`.
 *
 * This equals the cosine similarity only when both inputs are already
 * L2-normalized; no normalization is done here.
 */
export function cosineSimilarity(a: Float32Array, b: Float32Array): number {
  let dot = 0;
  a.forEach((value, i) => {
    dot += value * b[i];
  });
  return dot;
}
232
+
233
/** True once both the inference session and the vocabulary have been loaded. */
export function isEmbedderReady(): boolean {
  return !(session === null || vocab === null);
}