@aman_asmuei/aman-agent 0.7.7 → 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -392,7 +392,33 @@ Default budget: 8,000 tokens. Override with `--budget`.
392
392
  |:---|:---|:---|:---|
393
393
  | **Anthropic** | Claude Sonnet 4.6, Opus 4.6, Haiku 4.5 | Full | Full (with tools) |
394
394
  | **OpenAI** | GPT-4o, GPT-4o Mini, o3 | Full | Full (with tools) |
395
- | **Ollama** | Llama, Mistral, Gemma, any local model | Text only | Full |
395
+ | **Ollama** | Llama, Mistral, Gemma, any local model | Model-dependent | Full (with tools) |
396
+
397
+ ### Image Support (Vision)
398
+
399
+ Reference image files or URLs in your message and they'll be sent as vision content to the LLM:
400
+
401
+ ```
402
+ You > What's in this screenshot? ~/Desktop/screenshot.png
403
+ [attached image: screenshot.png (245.3KB)]
404
+ ```
405
+
406
+ **Supported formats:** `.png`, `.jpg`, `.jpeg`, `.gif`, `.webp`, `.bmp`
407
+
408
+ **Image URLs** are also supported — paste any `https://` URL ending in `.png`, `.jpg`, `.jpeg`, `.gif`, or `.webp` and it will be fetched and attached.
409
+
410
+ **Multiple files** can be referenced in a single message (images, text files, and documents together).
411
+
412
+ **Size limit:** 20MB per image; larger images are skipped with a warning.
413
+
414
+ **Vision model requirements:**
415
+ | Provider | Vision Models |
416
+ |:---|:---|
417
+ | **Anthropic** | All Claude models (Sonnet, Opus, Haiku) |
418
+ | **OpenAI** | GPT-4o, GPT-4o Mini |
419
+ | **Ollama** | LLaVA, Llama 3.2 Vision, Moondream, BakLLaVA |
420
+
421
+ Non-vision models will receive the image but may not be able to interpret it.
396
422
 
397
423
  ---
398
424
 
package/dist/index.js CHANGED
@@ -147,6 +147,16 @@ function toAnthropicMessages(messages) {
147
147
  if (block.type === "text") {
148
148
  return { type: "text", text: block.text };
149
149
  }
150
+ if (block.type === "image") {
151
+ return {
152
+ type: "image",
153
+ source: {
154
+ type: "base64",
155
+ media_type: block.source.media_type,
156
+ data: block.source.data
157
+ }
158
+ };
159
+ }
150
160
  if (block.type === "tool_use") {
151
161
  return {
152
162
  type: "tool_use",
@@ -311,8 +321,26 @@ function toOpenAIMessages(systemPrompt, messages) {
311
321
  }
312
322
  }
313
323
  } else {
314
- const text2 = m.content.map((b) => "text" in b ? b.text : "").join("");
315
- result.push({ role: "user", content: text2 });
324
+ const hasImages = m.content.some((b) => b.type === "image");
325
+ if (hasImages) {
326
+ const parts = [];
327
+ for (const b of m.content) {
328
+ if (b.type === "text") {
329
+ parts.push({ type: "text", text: b.text });
330
+ } else if (b.type === "image") {
331
+ parts.push({
332
+ type: "image_url",
333
+ image_url: {
334
+ url: `data:${b.source.media_type};base64,${b.source.data}`
335
+ }
336
+ });
337
+ }
338
+ }
339
+ result.push({ role: "user", content: parts });
340
+ } else {
341
+ const text2 = m.content.map((b) => "text" in b ? b.text : "").join("");
342
+ result.push({ role: "user", content: text2 });
343
+ }
316
344
  }
317
345
  }
318
346
  }
@@ -415,6 +443,74 @@ function createOpenAIClient(apiKey, model) {
415
443
 
416
444
  // src/llm/ollama.ts
417
445
  import OpenAI2 from "openai";
446
+ function toOllamaMessages(systemPrompt, messages) {
447
+ const result = [
448
+ { role: "system", content: systemPrompt }
449
+ ];
450
+ for (const m of messages) {
451
+ if (typeof m.content === "string") {
452
+ result.push({
453
+ role: m.role,
454
+ content: m.content
455
+ });
456
+ } else if (m.role === "assistant") {
457
+ const textParts = m.content.filter((b) => b.type === "text");
458
+ const toolUseParts = m.content.filter((b) => b.type === "tool_use");
459
+ const text2 = textParts.map((b) => "text" in b ? b.text : "").join("");
460
+ if (toolUseParts.length > 0) {
461
+ result.push({
462
+ role: "assistant",
463
+ content: text2 || null,
464
+ tool_calls: toolUseParts.map((b) => ({
465
+ id: "id" in b ? b.id : "",
466
+ type: "function",
467
+ function: {
468
+ name: "name" in b ? b.name : "",
469
+ arguments: JSON.stringify("input" in b ? b.input : {})
470
+ }
471
+ }))
472
+ });
473
+ } else {
474
+ result.push({ role: "assistant", content: text2 });
475
+ }
476
+ } else if (m.role === "user") {
477
+ const toolResults = m.content.filter((b) => b.type === "tool_result");
478
+ if (toolResults.length > 0) {
479
+ for (const tr of toolResults) {
480
+ if (tr.type === "tool_result") {
481
+ result.push({
482
+ role: "tool",
483
+ tool_call_id: tr.tool_use_id,
484
+ content: tr.content
485
+ });
486
+ }
487
+ }
488
+ } else {
489
+ const hasImages = m.content.some((b) => b.type === "image");
490
+ if (hasImages) {
491
+ const parts = [];
492
+ for (const b of m.content) {
493
+ if (b.type === "text") {
494
+ parts.push({ type: "text", text: b.text });
495
+ } else if (b.type === "image") {
496
+ parts.push({
497
+ type: "image_url",
498
+ image_url: {
499
+ url: `data:${b.source.media_type};base64,${b.source.data}`
500
+ }
501
+ });
502
+ }
503
+ }
504
+ result.push({ role: "user", content: parts });
505
+ } else {
506
+ const text2 = m.content.map((b) => "text" in b ? b.text : "").join("");
507
+ result.push({ role: "user", content: text2 });
508
+ }
509
+ }
510
+ }
511
+ }
512
+ return result;
513
+ }
418
514
  function createOllamaClient(model, baseURL) {
419
515
  const client = new OpenAI2({
420
516
  baseURL: baseURL || "http://localhost:11434/v1",
@@ -422,28 +518,83 @@ function createOllamaClient(model, baseURL) {
422
518
  // Ollama doesn't require a real key
423
519
  });
424
520
  return {
425
- async chat(systemPrompt, messages, onChunk, _tools) {
426
- let fullText = "";
521
+ async chat(systemPrompt, messages, onChunk, tools) {
522
+ const ollamaMessages = toOllamaMessages(systemPrompt, messages);
523
+ const hasTools = tools && tools.length > 0;
427
524
  try {
428
- const stream = await client.chat.completions.create({
525
+ let fullText = "";
526
+ const toolCallAccumulators = /* @__PURE__ */ new Map();
527
+ const createParams = {
429
528
  model,
430
529
  max_tokens: 8192,
431
- messages: [
432
- { role: "system", content: systemPrompt },
433
- ...messages.map((m) => ({
434
- role: m.role,
435
- content: typeof m.content === "string" ? m.content : m.content.filter((b) => b.type === "text").map((b) => "text" in b ? b.text : "").join("")
436
- }))
437
- ],
530
+ messages: ollamaMessages,
438
531
  stream: true
439
- });
532
+ };
533
+ if (hasTools) {
534
+ createParams.tools = tools.map((t) => ({
535
+ type: "function",
536
+ function: {
537
+ name: t.name,
538
+ description: t.description,
539
+ parameters: t.input_schema
540
+ }
541
+ }));
542
+ }
543
+ const stream = await client.chat.completions.create(
544
+ createParams
545
+ );
440
546
  for await (const chunk of stream) {
441
- const text2 = chunk.choices[0]?.delta?.content || "";
442
- if (text2) {
443
- fullText += text2;
444
- onChunk({ type: "text", text: text2 });
547
+ const delta = chunk.choices[0]?.delta;
548
+ if (!delta) continue;
549
+ if (delta.content) {
550
+ fullText += delta.content;
551
+ onChunk({ type: "text", text: delta.content });
552
+ }
553
+ if (delta.tool_calls) {
554
+ for (const tc of delta.tool_calls) {
555
+ const idx = tc.index;
556
+ let acc = toolCallAccumulators.get(idx);
557
+ if (!acc) {
558
+ acc = { id: "", name: "", arguments: "" };
559
+ toolCallAccumulators.set(idx, acc);
560
+ }
561
+ if (tc.id) {
562
+ acc.id = tc.id;
563
+ }
564
+ if (tc.function?.name) {
565
+ acc.name = tc.function.name;
566
+ }
567
+ if (tc.function?.arguments) {
568
+ acc.arguments += tc.function.arguments;
569
+ }
570
+ }
445
571
  }
446
572
  }
573
+ const toolUses = Array.from(toolCallAccumulators.entries()).sort(([a], [b]) => a - b).map(([, acc]) => ({
574
+ id: acc.id,
575
+ name: acc.name,
576
+ input: JSON.parse(acc.arguments || "{}")
577
+ }));
578
+ onChunk({ type: "done" });
579
+ if (toolUses.length > 0) {
580
+ const contentBlocks = [
581
+ ...fullText ? [{ type: "text", text: fullText }] : [],
582
+ ...toolUses.map((tu) => ({
583
+ type: "tool_use",
584
+ id: tu.id,
585
+ name: tu.name,
586
+ input: tu.input
587
+ }))
588
+ ];
589
+ return {
590
+ message: { role: "assistant", content: contentBlocks },
591
+ toolUses
592
+ };
593
+ }
594
+ return {
595
+ message: { role: "assistant", content: fullText },
596
+ toolUses: []
597
+ };
447
598
  } catch (error) {
448
599
  if (error instanceof Error && error.message.includes("ECONNREFUSED")) {
449
600
  throw new Error(
@@ -452,11 +603,6 @@ function createOllamaClient(model, baseURL) {
452
603
  }
453
604
  throw error;
454
605
  }
455
- onChunk({ type: "done" });
456
- return {
457
- message: { role: "assistant", content: fullText },
458
- toolUses: []
459
- };
460
606
  }
461
607
  };
462
608
  }
@@ -2081,106 +2227,180 @@ ${wfMatch.steps}
2081
2227
  }
2082
2228
  }
2083
2229
  await trimConversation(messages, client);
2084
- let enrichedInput = input;
2085
- const filePathMatch = input.match(/(\/[\w./-]+|~\/[\w./-]+)/);
2086
- if (filePathMatch) {
2087
- let filePath = filePathMatch[1];
2230
+ const textExts = /* @__PURE__ */ new Set([
2231
+ ".txt",
2232
+ ".md",
2233
+ ".json",
2234
+ ".js",
2235
+ ".ts",
2236
+ ".jsx",
2237
+ ".tsx",
2238
+ ".py",
2239
+ ".html",
2240
+ ".css",
2241
+ ".yml",
2242
+ ".yaml",
2243
+ ".toml",
2244
+ ".xml",
2245
+ ".csv",
2246
+ ".sh",
2247
+ ".bash",
2248
+ ".zsh",
2249
+ ".env",
2250
+ ".cfg",
2251
+ ".ini",
2252
+ ".log",
2253
+ ".sql",
2254
+ ".graphql",
2255
+ ".rs",
2256
+ ".go",
2257
+ ".java",
2258
+ ".rb",
2259
+ ".php",
2260
+ ".c",
2261
+ ".cpp",
2262
+ ".h",
2263
+ ".swift",
2264
+ ".kt",
2265
+ ".r",
2266
+ ".lua"
2267
+ ]);
2268
+ const imageExts = /* @__PURE__ */ new Set([".png", ".jpg", ".jpeg", ".gif", ".webp", ".bmp"]);
2269
+ const docExts = /* @__PURE__ */ new Set([".docx", ".doc", ".pdf", ".pptx", ".ppt", ".xlsx", ".xls", ".odt", ".rtf", ".epub"]);
2270
+ const mimeMap = {
2271
+ ".png": "image/png",
2272
+ ".jpg": "image/jpeg",
2273
+ ".jpeg": "image/jpeg",
2274
+ ".gif": "image/gif",
2275
+ ".webp": "image/webp",
2276
+ ".bmp": "image/png"
2277
+ };
2278
+ const maxImageBytes = 20 * 1024 * 1024;
2279
+ let textContent = input;
2280
+ const imageBlocks = [];
2281
+ const filePathMatches = [...input.matchAll(/(\/[\w./-]+|~\/[\w./-]+)/g)];
2282
+ for (const match of filePathMatches) {
2283
+ let filePath = match[1];
2088
2284
  if (filePath.startsWith("~/")) {
2089
2285
  filePath = path7.join(os7.homedir(), filePath.slice(2));
2090
2286
  }
2091
- if (fs7.existsSync(filePath) && fs7.statSync(filePath).isFile()) {
2092
- const ext = path7.extname(filePath).toLowerCase();
2093
- const textExts = /* @__PURE__ */ new Set([
2094
- ".txt",
2095
- ".md",
2096
- ".json",
2097
- ".js",
2098
- ".ts",
2099
- ".jsx",
2100
- ".tsx",
2101
- ".py",
2102
- ".html",
2103
- ".css",
2104
- ".yml",
2105
- ".yaml",
2106
- ".toml",
2107
- ".xml",
2108
- ".csv",
2109
- ".sh",
2110
- ".bash",
2111
- ".zsh",
2112
- ".env",
2113
- ".cfg",
2114
- ".ini",
2115
- ".log",
2116
- ".sql",
2117
- ".graphql",
2118
- ".rs",
2119
- ".go",
2120
- ".java",
2121
- ".rb",
2122
- ".php",
2123
- ".c",
2124
- ".cpp",
2125
- ".h",
2126
- ".swift",
2127
- ".kt",
2128
- ".r",
2129
- ".lua"
2130
- ]);
2131
- if (textExts.has(ext) || ext === "") {
2132
- try {
2133
- const content = fs7.readFileSync(filePath, "utf-8");
2134
- const maxChars = 5e4;
2135
- const trimmed = content.length > maxChars ? content.slice(0, maxChars) + `
2287
+ if (!fs7.existsSync(filePath) || !fs7.statSync(filePath).isFile()) continue;
2288
+ const ext = path7.extname(filePath).toLowerCase();
2289
+ if (imageExts.has(ext)) {
2290
+ try {
2291
+ const stat = fs7.statSync(filePath);
2292
+ if (stat.size > maxImageBytes) {
2293
+ process.stdout.write(pc3.yellow(` [skipped: ${path7.basename(filePath)} \u2014 exceeds 20MB limit]
2294
+ `));
2295
+ continue;
2296
+ }
2297
+ const data = fs7.readFileSync(filePath).toString("base64");
2298
+ const mediaType = mimeMap[ext] || "image/png";
2299
+ imageBlocks.push({
2300
+ type: "image",
2301
+ source: { type: "base64", media_type: mediaType, data }
2302
+ });
2303
+ process.stdout.write(pc3.dim(` [attached image: ${path7.basename(filePath)} (${(stat.size / 1024).toFixed(1)}KB)]
2304
+ `));
2305
+ } catch {
2306
+ process.stdout.write(pc3.dim(` [could not read image: ${filePath}]
2307
+ `));
2308
+ }
2309
+ } else if (textExts.has(ext) || ext === "") {
2310
+ try {
2311
+ const content = fs7.readFileSync(filePath, "utf-8");
2312
+ const maxChars = 5e4;
2313
+ const trimmed = content.length > maxChars ? content.slice(0, maxChars) + `
2136
2314
 
2137
2315
  [... truncated, ${content.length - maxChars} chars remaining]` : content;
2138
- enrichedInput = `${input}
2316
+ textContent += `
2139
2317
 
2140
2318
  <file path="${filePath}" size="${content.length} chars">
2141
2319
  ${trimmed}
2142
2320
  </file>`;
2143
- process.stdout.write(pc3.dim(` [attached: ${path7.basename(filePath)} (${(content.length / 1024).toFixed(1)}KB)]
2321
+ process.stdout.write(pc3.dim(` [attached: ${path7.basename(filePath)} (${(content.length / 1024).toFixed(1)}KB)]
2144
2322
  `));
2145
- } catch {
2146
- process.stdout.write(pc3.dim(` [could not read: ${filePath}]
2323
+ } catch {
2324
+ process.stdout.write(pc3.dim(` [could not read: ${filePath}]
2147
2325
  `));
2148
- }
2149
- } else if ([".docx", ".doc", ".pdf", ".pptx", ".ppt", ".xlsx", ".xls", ".odt", ".rtf", ".epub"].includes(ext)) {
2150
- if (mcpManager) {
2151
- try {
2152
- process.stdout.write(pc3.dim(` [converting: ${path7.basename(filePath)}...]
2326
+ }
2327
+ } else if (docExts.has(ext)) {
2328
+ if (mcpManager) {
2329
+ try {
2330
+ process.stdout.write(pc3.dim(` [converting: ${path7.basename(filePath)}...]
2153
2331
  `));
2154
- const converted = await mcpManager.callTool("doc_convert", { path: filePath });
2155
- if (converted && !converted.startsWith("Error") && !converted.includes("Could not convert")) {
2156
- enrichedInput = `${input}
2332
+ const converted = await mcpManager.callTool("doc_convert", { path: filePath });
2333
+ if (converted && !converted.startsWith("Error") && !converted.includes("Could not convert")) {
2334
+ textContent += `
2157
2335
 
2158
2336
  <file path="${filePath}" format="${ext}">
2159
2337
  ${converted.slice(0, 5e4)}
2160
2338
  </file>`;
2161
- process.stdout.write(pc3.dim(` [attached: ${path7.basename(filePath)} (converted from ${ext})]
2339
+ process.stdout.write(pc3.dim(` [attached: ${path7.basename(filePath)} (converted from ${ext})]
2162
2340
  `));
2163
- } else {
2164
- enrichedInput = `${input}
2341
+ } else {
2342
+ textContent += `
2165
2343
 
2166
2344
  <file-error path="${filePath}">
2167
2345
  ${converted}
2168
2346
  </file-error>`;
2169
- process.stdout.write(pc3.yellow(` [conversion note: ${converted.split("\n")[0]}]
2170
- `));
2171
- }
2172
- } catch {
2173
- process.stdout.write(pc3.dim(` [could not convert: ${path7.basename(filePath)}]
2347
+ process.stdout.write(pc3.yellow(` [conversion note: ${converted.split("\n")[0]}]
2174
2348
  `));
2175
2349
  }
2176
- } else {
2177
- process.stdout.write(pc3.yellow(` Binary file (${ext}) \u2014 install Docling for document support: pip install docling
2350
+ } catch {
2351
+ process.stdout.write(pc3.dim(` [could not convert: ${path7.basename(filePath)}]
2178
2352
  `));
2179
2353
  }
2354
+ } else {
2355
+ process.stdout.write(pc3.yellow(` Binary file (${ext}) \u2014 install Docling for document support: pip install docling
2356
+ `));
2180
2357
  }
2181
2358
  }
2182
2359
  }
2183
- messages.push({ role: "user", content: enrichedInput });
2360
+ const urlImageMatches = [...input.matchAll(/https?:\/\/\S+\.(?:png|jpg|jpeg|gif|webp)(?:\?\S*)?/gi)];
2361
+ for (const match of urlImageMatches) {
2362
+ const url = match[0];
2363
+ try {
2364
+ process.stdout.write(pc3.dim(` [fetching image: ${url.slice(0, 60)}...]
2365
+ `));
2366
+ const response = await fetch(url);
2367
+ if (!response.ok) {
2368
+ process.stdout.write(pc3.yellow(` [could not fetch: HTTP ${response.status}]
2369
+ `));
2370
+ continue;
2371
+ }
2372
+ const buffer = Buffer.from(await response.arrayBuffer());
2373
+ if (buffer.length > maxImageBytes) {
2374
+ process.stdout.write(pc3.yellow(` [skipped: image URL exceeds 20MB limit]
2375
+ `));
2376
+ continue;
2377
+ }
2378
+ const contentType = response.headers.get("content-type") || "";
2379
+ let mediaType = "image/png";
2380
+ if (contentType.includes("jpeg") || contentType.includes("jpg")) mediaType = "image/jpeg";
2381
+ else if (contentType.includes("gif")) mediaType = "image/gif";
2382
+ else if (contentType.includes("webp")) mediaType = "image/webp";
2383
+ else if (contentType.includes("png")) mediaType = "image/png";
2384
+ imageBlocks.push({
2385
+ type: "image",
2386
+ source: { type: "base64", media_type: mediaType, data: buffer.toString("base64") }
2387
+ });
2388
+ process.stdout.write(pc3.dim(` [attached image URL: (${(buffer.length / 1024).toFixed(1)}KB)]
2389
+ `));
2390
+ } catch {
2391
+ process.stdout.write(pc3.dim(` [could not fetch image: ${url}]
2392
+ `));
2393
+ }
2394
+ }
2395
+ if (imageBlocks.length > 0) {
2396
+ const blocks = [
2397
+ { type: "text", text: textContent },
2398
+ ...imageBlocks
2399
+ ];
2400
+ messages.push({ role: "user", content: blocks });
2401
+ } else {
2402
+ messages.push({ role: "user", content: textContent });
2403
+ }
2184
2404
  let augmentedSystemPrompt = activeSystemPrompt;
2185
2405
  let memoryTokens = 0;
2186
2406
  if (mcpManager) {