@aman_asmuei/aman-agent 0.7.6 → 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -392,7 +392,33 @@ Default budget: 8,000 tokens. Override with `--budget`.
392
392
  |:---|:---|:---|:---|
393
393
  | **Anthropic** | Claude Sonnet 4.6, Opus 4.6, Haiku 4.5 | Full | Full (with tools) |
394
394
  | **OpenAI** | GPT-4o, GPT-4o Mini, o3 | Full | Full (with tools) |
395
- | **Ollama** | Llama, Mistral, Gemma, any local model | Text only | Full |
395
+ | **Ollama** | Llama, Mistral, Gemma, any local model | Model-dependent | Full (with tools) |
396
+
397
+ ### Image Support (Vision)
398
+
399
+ Reference image files or URLs in your message and they'll be sent as vision content to the LLM:
400
+
401
+ ```
402
+ You > What's in this screenshot? ~/Desktop/screenshot.png
403
+ [attached image: screenshot.png (245.3KB)]
404
+ ```
405
+
406
+ **Supported formats:** `.png`, `.jpg`, `.jpeg`, `.gif`, `.webp`, `.bmp`
407
+
408
+ **Image URLs** are also supported — paste any `https://...png` URL and it will be fetched and attached.
409
+
410
+ **Multiple files** can be referenced in a single message (images, text files, and documents together).
411
+
412
+ **Size limit:** 20MB per image.
413
+
414
+ **Vision model requirements:**
415
+ | Provider | Vision Models |
416
+ |:---|:---|
417
+ | **Anthropic** | All Claude models (Sonnet, Opus, Haiku) |
418
+ | **OpenAI** | GPT-4o, GPT-4o Mini |
419
+ | **Ollama** | LLaVA, Llama 3.2 Vision, Moondream, BakLLaVA |
420
+
421
+ Non-vision models will receive the image but may not be able to interpret it.
396
422
 
397
423
  ---
398
424
 
package/dist/index.js CHANGED
@@ -147,6 +147,16 @@ function toAnthropicMessages(messages) {
147
147
  if (block.type === "text") {
148
148
  return { type: "text", text: block.text };
149
149
  }
150
+ if (block.type === "image") {
151
+ return {
152
+ type: "image",
153
+ source: {
154
+ type: "base64",
155
+ media_type: block.source.media_type,
156
+ data: block.source.data
157
+ }
158
+ };
159
+ }
150
160
  if (block.type === "tool_use") {
151
161
  return {
152
162
  type: "tool_use",
@@ -311,8 +321,26 @@ function toOpenAIMessages(systemPrompt, messages) {
311
321
  }
312
322
  }
313
323
  } else {
314
- const text2 = m.content.map((b) => "text" in b ? b.text : "").join("");
315
- result.push({ role: "user", content: text2 });
324
+ const hasImages = m.content.some((b) => b.type === "image");
325
+ if (hasImages) {
326
+ const parts = [];
327
+ for (const b of m.content) {
328
+ if (b.type === "text") {
329
+ parts.push({ type: "text", text: b.text });
330
+ } else if (b.type === "image") {
331
+ parts.push({
332
+ type: "image_url",
333
+ image_url: {
334
+ url: `data:${b.source.media_type};base64,${b.source.data}`
335
+ }
336
+ });
337
+ }
338
+ }
339
+ result.push({ role: "user", content: parts });
340
+ } else {
341
+ const text2 = m.content.map((b) => "text" in b ? b.text : "").join("");
342
+ result.push({ role: "user", content: text2 });
343
+ }
316
344
  }
317
345
  }
318
346
  }
@@ -415,6 +443,74 @@ function createOpenAIClient(apiKey, model) {
415
443
 
416
444
  // src/llm/ollama.ts
417
445
  import OpenAI2 from "openai";
446
+ function toOllamaMessages(systemPrompt, messages) {
447
+ const result = [
448
+ { role: "system", content: systemPrompt }
449
+ ];
450
+ for (const m of messages) {
451
+ if (typeof m.content === "string") {
452
+ result.push({
453
+ role: m.role,
454
+ content: m.content
455
+ });
456
+ } else if (m.role === "assistant") {
457
+ const textParts = m.content.filter((b) => b.type === "text");
458
+ const toolUseParts = m.content.filter((b) => b.type === "tool_use");
459
+ const text2 = textParts.map((b) => "text" in b ? b.text : "").join("");
460
+ if (toolUseParts.length > 0) {
461
+ result.push({
462
+ role: "assistant",
463
+ content: text2 || null,
464
+ tool_calls: toolUseParts.map((b) => ({
465
+ id: "id" in b ? b.id : "",
466
+ type: "function",
467
+ function: {
468
+ name: "name" in b ? b.name : "",
469
+ arguments: JSON.stringify("input" in b ? b.input : {})
470
+ }
471
+ }))
472
+ });
473
+ } else {
474
+ result.push({ role: "assistant", content: text2 });
475
+ }
476
+ } else if (m.role === "user") {
477
+ const toolResults = m.content.filter((b) => b.type === "tool_result");
478
+ if (toolResults.length > 0) {
479
+ for (const tr of toolResults) {
480
+ if (tr.type === "tool_result") {
481
+ result.push({
482
+ role: "tool",
483
+ tool_call_id: tr.tool_use_id,
484
+ content: tr.content
485
+ });
486
+ }
487
+ }
488
+ } else {
489
+ const hasImages = m.content.some((b) => b.type === "image");
490
+ if (hasImages) {
491
+ const parts = [];
492
+ for (const b of m.content) {
493
+ if (b.type === "text") {
494
+ parts.push({ type: "text", text: b.text });
495
+ } else if (b.type === "image") {
496
+ parts.push({
497
+ type: "image_url",
498
+ image_url: {
499
+ url: `data:${b.source.media_type};base64,${b.source.data}`
500
+ }
501
+ });
502
+ }
503
+ }
504
+ result.push({ role: "user", content: parts });
505
+ } else {
506
+ const text2 = m.content.map((b) => "text" in b ? b.text : "").join("");
507
+ result.push({ role: "user", content: text2 });
508
+ }
509
+ }
510
+ }
511
+ }
512
+ return result;
513
+ }
418
514
  function createOllamaClient(model, baseURL) {
419
515
  const client = new OpenAI2({
420
516
  baseURL: baseURL || "http://localhost:11434/v1",
@@ -422,28 +518,83 @@ function createOllamaClient(model, baseURL) {
422
518
  // Ollama doesn't require a real key
423
519
  });
424
520
  return {
425
- async chat(systemPrompt, messages, onChunk, _tools) {
426
- let fullText = "";
521
+ async chat(systemPrompt, messages, onChunk, tools) {
522
+ const ollamaMessages = toOllamaMessages(systemPrompt, messages);
523
+ const hasTools = tools && tools.length > 0;
427
524
  try {
428
- const stream = await client.chat.completions.create({
525
+ let fullText = "";
526
+ const toolCallAccumulators = /* @__PURE__ */ new Map();
527
+ const createParams = {
429
528
  model,
430
529
  max_tokens: 8192,
431
- messages: [
432
- { role: "system", content: systemPrompt },
433
- ...messages.map((m) => ({
434
- role: m.role,
435
- content: typeof m.content === "string" ? m.content : m.content.filter((b) => b.type === "text").map((b) => "text" in b ? b.text : "").join("")
436
- }))
437
- ],
530
+ messages: ollamaMessages,
438
531
  stream: true
439
- });
532
+ };
533
+ if (hasTools) {
534
+ createParams.tools = tools.map((t) => ({
535
+ type: "function",
536
+ function: {
537
+ name: t.name,
538
+ description: t.description,
539
+ parameters: t.input_schema
540
+ }
541
+ }));
542
+ }
543
+ const stream = await client.chat.completions.create(
544
+ createParams
545
+ );
440
546
  for await (const chunk of stream) {
441
- const text2 = chunk.choices[0]?.delta?.content || "";
442
- if (text2) {
443
- fullText += text2;
444
- onChunk({ type: "text", text: text2 });
547
+ const delta = chunk.choices[0]?.delta;
548
+ if (!delta) continue;
549
+ if (delta.content) {
550
+ fullText += delta.content;
551
+ onChunk({ type: "text", text: delta.content });
445
552
  }
553
+ if (delta.tool_calls) {
554
+ for (const tc of delta.tool_calls) {
555
+ const idx = tc.index;
556
+ let acc = toolCallAccumulators.get(idx);
557
+ if (!acc) {
558
+ acc = { id: "", name: "", arguments: "" };
559
+ toolCallAccumulators.set(idx, acc);
560
+ }
561
+ if (tc.id) {
562
+ acc.id = tc.id;
563
+ }
564
+ if (tc.function?.name) {
565
+ acc.name = tc.function.name;
566
+ }
567
+ if (tc.function?.arguments) {
568
+ acc.arguments += tc.function.arguments;
569
+ }
570
+ }
571
+ }
572
+ }
573
+ const toolUses = Array.from(toolCallAccumulators.entries()).sort(([a], [b]) => a - b).map(([, acc]) => ({
574
+ id: acc.id,
575
+ name: acc.name,
576
+ input: JSON.parse(acc.arguments || "{}")
577
+ }));
578
+ onChunk({ type: "done" });
579
+ if (toolUses.length > 0) {
580
+ const contentBlocks = [
581
+ ...fullText ? [{ type: "text", text: fullText }] : [],
582
+ ...toolUses.map((tu) => ({
583
+ type: "tool_use",
584
+ id: tu.id,
585
+ name: tu.name,
586
+ input: tu.input
587
+ }))
588
+ ];
589
+ return {
590
+ message: { role: "assistant", content: contentBlocks },
591
+ toolUses
592
+ };
446
593
  }
594
+ return {
595
+ message: { role: "assistant", content: fullText },
596
+ toolUses: []
597
+ };
447
598
  } catch (error) {
448
599
  if (error instanceof Error && error.message.includes("ECONNREFUSED")) {
449
600
  throw new Error(
@@ -452,11 +603,6 @@ function createOllamaClient(model, baseURL) {
452
603
  }
453
604
  throw error;
454
605
  }
455
- onChunk({ type: "done" });
456
- return {
457
- message: { role: "assistant", content: fullText },
458
- toolUses: []
459
- };
460
606
  }
461
607
  };
462
608
  }
@@ -1020,7 +1166,18 @@ async function handleMemoryCommand(action, args, ctx) {
1020
1166
  output: pc.red("Memory not available: aman-mcp not connected. Start it with: npx @aman_asmuei/aman-mcp")
1021
1167
  };
1022
1168
  }
1023
- const result = await ctx.mcpManager.callTool("memory_context", { topic: "general overview" });
1169
+ const result = await ctx.mcpManager.callTool("memory_context", { topic: "recent context" });
1170
+ if (result.startsWith("Error")) {
1171
+ return { handled: true, output: pc.red(result) };
1172
+ }
1173
+ return { handled: true, output: result };
1174
+ }
1175
+ if (action && !["search", "clear", "timeline"].includes(action)) {
1176
+ if (!ctx.mcpManager) {
1177
+ return { handled: true, output: pc.red("Memory not available: MCP not connected.") };
1178
+ }
1179
+ const topic = [action, ...args].join(" ");
1180
+ const result = await ctx.mcpManager.callTool("memory_context", { topic });
1024
1181
  if (result.startsWith("Error")) {
1025
1182
  return { handled: true, output: pc.red(result) };
1026
1183
  }
@@ -2070,106 +2227,180 @@ ${wfMatch.steps}
2070
2227
  }
2071
2228
  }
2072
2229
  await trimConversation(messages, client);
2073
- let enrichedInput = input;
2074
- const filePathMatch = input.match(/(\/[\w./-]+|~\/[\w./-]+)/);
2075
- if (filePathMatch) {
2076
- let filePath = filePathMatch[1];
2230
+ const textExts = /* @__PURE__ */ new Set([
2231
+ ".txt",
2232
+ ".md",
2233
+ ".json",
2234
+ ".js",
2235
+ ".ts",
2236
+ ".jsx",
2237
+ ".tsx",
2238
+ ".py",
2239
+ ".html",
2240
+ ".css",
2241
+ ".yml",
2242
+ ".yaml",
2243
+ ".toml",
2244
+ ".xml",
2245
+ ".csv",
2246
+ ".sh",
2247
+ ".bash",
2248
+ ".zsh",
2249
+ ".env",
2250
+ ".cfg",
2251
+ ".ini",
2252
+ ".log",
2253
+ ".sql",
2254
+ ".graphql",
2255
+ ".rs",
2256
+ ".go",
2257
+ ".java",
2258
+ ".rb",
2259
+ ".php",
2260
+ ".c",
2261
+ ".cpp",
2262
+ ".h",
2263
+ ".swift",
2264
+ ".kt",
2265
+ ".r",
2266
+ ".lua"
2267
+ ]);
2268
+ const imageExts = /* @__PURE__ */ new Set([".png", ".jpg", ".jpeg", ".gif", ".webp", ".bmp"]);
2269
+ const docExts = /* @__PURE__ */ new Set([".docx", ".doc", ".pdf", ".pptx", ".ppt", ".xlsx", ".xls", ".odt", ".rtf", ".epub"]);
2270
+ const mimeMap = {
2271
+ ".png": "image/png",
2272
+ ".jpg": "image/jpeg",
2273
+ ".jpeg": "image/jpeg",
2274
+ ".gif": "image/gif",
2275
+ ".webp": "image/webp",
2276
+ ".bmp": "image/png"
2277
+ };
2278
+ const maxImageBytes = 20 * 1024 * 1024;
2279
+ let textContent = input;
2280
+ const imageBlocks = [];
2281
+ const filePathMatches = [...input.matchAll(/(\/[\w./-]+|~\/[\w./-]+)/g)];
2282
+ for (const match of filePathMatches) {
2283
+ let filePath = match[1];
2077
2284
  if (filePath.startsWith("~/")) {
2078
2285
  filePath = path7.join(os7.homedir(), filePath.slice(2));
2079
2286
  }
2080
- if (fs7.existsSync(filePath) && fs7.statSync(filePath).isFile()) {
2081
- const ext = path7.extname(filePath).toLowerCase();
2082
- const textExts = /* @__PURE__ */ new Set([
2083
- ".txt",
2084
- ".md",
2085
- ".json",
2086
- ".js",
2087
- ".ts",
2088
- ".jsx",
2089
- ".tsx",
2090
- ".py",
2091
- ".html",
2092
- ".css",
2093
- ".yml",
2094
- ".yaml",
2095
- ".toml",
2096
- ".xml",
2097
- ".csv",
2098
- ".sh",
2099
- ".bash",
2100
- ".zsh",
2101
- ".env",
2102
- ".cfg",
2103
- ".ini",
2104
- ".log",
2105
- ".sql",
2106
- ".graphql",
2107
- ".rs",
2108
- ".go",
2109
- ".java",
2110
- ".rb",
2111
- ".php",
2112
- ".c",
2113
- ".cpp",
2114
- ".h",
2115
- ".swift",
2116
- ".kt",
2117
- ".r",
2118
- ".lua"
2119
- ]);
2120
- if (textExts.has(ext) || ext === "") {
2121
- try {
2122
- const content = fs7.readFileSync(filePath, "utf-8");
2123
- const maxChars = 5e4;
2124
- const trimmed = content.length > maxChars ? content.slice(0, maxChars) + `
2287
+ if (!fs7.existsSync(filePath) || !fs7.statSync(filePath).isFile()) continue;
2288
+ const ext = path7.extname(filePath).toLowerCase();
2289
+ if (imageExts.has(ext)) {
2290
+ try {
2291
+ const stat = fs7.statSync(filePath);
2292
+ if (stat.size > maxImageBytes) {
2293
+ process.stdout.write(pc3.yellow(` [skipped: ${path7.basename(filePath)} \u2014 exceeds 20MB limit]
2294
+ `));
2295
+ continue;
2296
+ }
2297
+ const data = fs7.readFileSync(filePath).toString("base64");
2298
+ const mediaType = mimeMap[ext] || "image/png";
2299
+ imageBlocks.push({
2300
+ type: "image",
2301
+ source: { type: "base64", media_type: mediaType, data }
2302
+ });
2303
+ process.stdout.write(pc3.dim(` [attached image: ${path7.basename(filePath)} (${(stat.size / 1024).toFixed(1)}KB)]
2304
+ `));
2305
+ } catch {
2306
+ process.stdout.write(pc3.dim(` [could not read image: ${filePath}]
2307
+ `));
2308
+ }
2309
+ } else if (textExts.has(ext) || ext === "") {
2310
+ try {
2311
+ const content = fs7.readFileSync(filePath, "utf-8");
2312
+ const maxChars = 5e4;
2313
+ const trimmed = content.length > maxChars ? content.slice(0, maxChars) + `
2125
2314
 
2126
2315
  [... truncated, ${content.length - maxChars} chars remaining]` : content;
2127
- enrichedInput = `${input}
2316
+ textContent += `
2128
2317
 
2129
2318
  <file path="${filePath}" size="${content.length} chars">
2130
2319
  ${trimmed}
2131
2320
  </file>`;
2132
- process.stdout.write(pc3.dim(` [attached: ${path7.basename(filePath)} (${(content.length / 1024).toFixed(1)}KB)]
2321
+ process.stdout.write(pc3.dim(` [attached: ${path7.basename(filePath)} (${(content.length / 1024).toFixed(1)}KB)]
2133
2322
  `));
2134
- } catch {
2135
- process.stdout.write(pc3.dim(` [could not read: ${filePath}]
2323
+ } catch {
2324
+ process.stdout.write(pc3.dim(` [could not read: ${filePath}]
2136
2325
  `));
2137
- }
2138
- } else if ([".docx", ".doc", ".pdf", ".pptx", ".ppt", ".xlsx", ".xls", ".odt", ".rtf", ".epub"].includes(ext)) {
2139
- if (mcpManager) {
2140
- try {
2141
- process.stdout.write(pc3.dim(` [converting: ${path7.basename(filePath)}...]
2326
+ }
2327
+ } else if (docExts.has(ext)) {
2328
+ if (mcpManager) {
2329
+ try {
2330
+ process.stdout.write(pc3.dim(` [converting: ${path7.basename(filePath)}...]
2142
2331
  `));
2143
- const converted = await mcpManager.callTool("doc_convert", { path: filePath });
2144
- if (converted && !converted.startsWith("Error") && !converted.includes("Could not convert")) {
2145
- enrichedInput = `${input}
2332
+ const converted = await mcpManager.callTool("doc_convert", { path: filePath });
2333
+ if (converted && !converted.startsWith("Error") && !converted.includes("Could not convert")) {
2334
+ textContent += `
2146
2335
 
2147
2336
  <file path="${filePath}" format="${ext}">
2148
2337
  ${converted.slice(0, 5e4)}
2149
2338
  </file>`;
2150
- process.stdout.write(pc3.dim(` [attached: ${path7.basename(filePath)} (converted from ${ext})]
2339
+ process.stdout.write(pc3.dim(` [attached: ${path7.basename(filePath)} (converted from ${ext})]
2151
2340
  `));
2152
- } else {
2153
- enrichedInput = `${input}
2341
+ } else {
2342
+ textContent += `
2154
2343
 
2155
2344
  <file-error path="${filePath}">
2156
2345
  ${converted}
2157
2346
  </file-error>`;
2158
- process.stdout.write(pc3.yellow(` [conversion note: ${converted.split("\n")[0]}]
2159
- `));
2160
- }
2161
- } catch {
2162
- process.stdout.write(pc3.dim(` [could not convert: ${path7.basename(filePath)}]
2347
+ process.stdout.write(pc3.yellow(` [conversion note: ${converted.split("\n")[0]}]
2163
2348
  `));
2164
2349
  }
2165
- } else {
2166
- process.stdout.write(pc3.yellow(` Binary file (${ext}) \u2014 install Docling for document support: pip install docling
2350
+ } catch {
2351
+ process.stdout.write(pc3.dim(` [could not convert: ${path7.basename(filePath)}]
2167
2352
  `));
2168
2353
  }
2354
+ } else {
2355
+ process.stdout.write(pc3.yellow(` Binary file (${ext}) \u2014 install Docling for document support: pip install docling
2356
+ `));
2357
+ }
2358
+ }
2359
+ }
2360
+ const urlImageMatches = [...input.matchAll(/https?:\/\/\S+\.(?:png|jpg|jpeg|gif|webp)(?:\?\S*)?/gi)];
2361
+ for (const match of urlImageMatches) {
2362
+ const url = match[0];
2363
+ try {
2364
+ process.stdout.write(pc3.dim(` [fetching image: ${url.slice(0, 60)}...]
2365
+ `));
2366
+ const response = await fetch(url);
2367
+ if (!response.ok) {
2368
+ process.stdout.write(pc3.yellow(` [could not fetch: HTTP ${response.status}]
2369
+ `));
2370
+ continue;
2169
2371
  }
2372
+ const buffer = Buffer.from(await response.arrayBuffer());
2373
+ if (buffer.length > maxImageBytes) {
2374
+ process.stdout.write(pc3.yellow(` [skipped: image URL exceeds 20MB limit]
2375
+ `));
2376
+ continue;
2377
+ }
2378
+ const contentType = response.headers.get("content-type") || "";
2379
+ let mediaType = "image/png";
2380
+ if (contentType.includes("jpeg") || contentType.includes("jpg")) mediaType = "image/jpeg";
2381
+ else if (contentType.includes("gif")) mediaType = "image/gif";
2382
+ else if (contentType.includes("webp")) mediaType = "image/webp";
2383
+ else if (contentType.includes("png")) mediaType = "image/png";
2384
+ imageBlocks.push({
2385
+ type: "image",
2386
+ source: { type: "base64", media_type: mediaType, data: buffer.toString("base64") }
2387
+ });
2388
+ process.stdout.write(pc3.dim(` [attached image URL: (${(buffer.length / 1024).toFixed(1)}KB)]
2389
+ `));
2390
+ } catch {
2391
+ process.stdout.write(pc3.dim(` [could not fetch image: ${url}]
2392
+ `));
2170
2393
  }
2171
2394
  }
2172
- messages.push({ role: "user", content: enrichedInput });
2395
+ if (imageBlocks.length > 0) {
2396
+ const blocks = [
2397
+ { type: "text", text: textContent },
2398
+ ...imageBlocks
2399
+ ];
2400
+ messages.push({ role: "user", content: blocks });
2401
+ } else {
2402
+ messages.push({ role: "user", content: textContent });
2403
+ }
2173
2404
  let augmentedSystemPrompt = activeSystemPrompt;
2174
2405
  let memoryTokens = 0;
2175
2406
  if (mcpManager) {