@pentatonic-ai/ai-agent-sdk 0.5.5 → 0.5.6

package/package.json CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "@pentatonic-ai/ai-agent-sdk",
-  "version": "0.5.5",
+  "version": "0.5.6",
   "description": "TES SDK — LLM observability and lifecycle tracking via Pentatonic Thing Event System. Track token usage, tool calls, and conversations. Manage things through event-sourced lifecycle stages with AI enrichment and vector search.",
   "type": "module",
   "main": "./dist/index.cjs",
@@ -263,6 +263,62 @@ describe("createAIClient", () => {
     await client.chat([{ role: "user", content: "q" }]);
     expect(hitUrl).toBe("http://localhost:11434/v1/chat/completions");
   });
+
+  it("embedBatch sends all inputs in one HTTP call", async () => {
+    let callCount = 0;
+    let lastBody;
+    globalThis.fetch = async (_url, opts) => {
+      callCount++;
+      lastBody = JSON.parse(opts.body);
+      return {
+        ok: true,
+        json: async () => ({
+          data: lastBody.input.map((_, i) => ({
+            embedding: [0.1, 0.2, 0.3],
+            index: i,
+          })),
+        }),
+      };
+    };
+    const client = createAIClient({
+      url: "http://localhost:11434/v1",
+      model: "m",
+    });
+    const out = await client.embedBatch(["a", "b", "c"], "passage");
+    expect(callCount).toBe(1);
+    expect(lastBody.input).toEqual(["a", "b", "c"]);
+    expect(out.length).toBe(3);
+    expect(out.every((r) => r.embedding.length === 3)).toBe(true);
+  });
+
+  it("embedBatch returns nulls on non-2xx without throwing", async () => {
+    globalThis.fetch = async () => ({ ok: false, json: async () => ({}) });
+    const client = createAIClient({
+      url: "http://localhost:11434/v1",
+      model: "m",
+    });
+    const out = await client.embedBatch(["a", "b"]);
+    expect(out).toEqual([null, null]);
+  });
+
+  it("embedBatch parses Ollama/Pentatonic-style {embeddings: [[...]]} response", async () => {
+    globalThis.fetch = async () => ({
+      ok: true,
+      json: async () => ({
+        embeddings: [
+          [0.1, 0.2],
+          [0.3, 0.4],
+        ],
+      }),
+    });
+    const client = createAIClient({
+      url: "http://localhost:11434/v1",
+      model: "m",
+    });
+    const out = await client.embedBatch(["x", "y"]);
+    expect(out[0].embedding).toEqual([0.1, 0.2]);
+    expect(out[1].embedding).toEqual([0.3, 0.4]);
+  });
 });
 
 // --- Search options contract ---
@@ -45,6 +45,40 @@ export function createAIClient(config) {
   const chatPath = stripLeading(config.chatPath || "chat/completions");
   const baseUrl = stripTrailing(config.url);
 
+  /**
+   * Send an embedding request with N inputs. Shared by embed() and
+   * embedBatch(). Returns an array of { embedding, dimensions, model } or
+   * nulls (one per input, preserving order).
+   */
+  async function rawEmbed(texts, inputType) {
+    if (!texts.length) return [];
+    try {
+      const res = await fetch(`${baseUrl}/${embeddingPath}`, {
+        method: "POST",
+        headers,
+        body: JSON.stringify({
+          input: texts.map((t) => (t ?? "").substring(0, 8192)),
+          model: config.model,
+          input_type: inputType,
+        }),
+        signal: AbortSignal.timeout(30000),
+      });
+      if (!res.ok) return texts.map(() => null);
+      const data = await res.json();
+      // OpenAI-compat: data.data = [{embedding, index}, ...]
+      // Pentatonic gateway / Ollama: data.embeddings = [[...], [...], ...]
+      const vectors =
+        data.data?.map((d) => d.embedding) || data.embeddings || [];
+      return texts.map((_, i) => {
+        const embedding = vectors[i];
+        if (!embedding) return null;
+        return { embedding, dimensions: embedding.length, model: config.model };
+      });
+    } catch {
+      return texts.map(() => null);
+    }
+  }
+
   return {
     /**
      * Generate an embedding vector for text.
@@ -54,32 +88,25 @@ export function createAIClient(config) {
      * @returns {Promise<{embedding: number[], dimensions: number, model: string} | null>}
      */
     async embed(text, inputType = "passage") {
-      try {
-        const res = await fetch(`${baseUrl}/${embeddingPath}`, {
-          method: "POST",
-          headers,
-          body: JSON.stringify({
-            input: [text.substring(0, 8192)],
-            model: config.model,
-            input_type: inputType,
-          }),
-          signal: AbortSignal.timeout(30000),
-        });
-
-        if (!res.ok) return null;
-
-        const data = await res.json();
-        const embedding = data.data?.[0]?.embedding || data.embeddings?.[0];
-        if (!embedding) return null;
+      const results = await rawEmbed([text], inputType);
+      return results[0];
+    },
 
-        return {
-          embedding,
-          dimensions: embedding.length,
-          model: config.model,
-        };
-      } catch {
-        return null;
-      }
+    /**
+     * Generate embeddings for N texts in a single HTTP round-trip. Returns
+     * an array the same length as the input; each entry is either the
+     * embedding object or null on failure.
+     *
+     * Batching matters under load — one call instead of N cuts GPU overhead
+     * and downstream queueing. Used by distill() to embed all atoms from a
+     * raw memory in one shot rather than N serial calls.
+     *
+     * @param {string[]} texts
+     * @param {string} [inputType="passage"]
+     * @returns {Promise<Array<{embedding: number[], dimensions: number, model: string} | null>>}
+     */
+    async embedBatch(texts, inputType = "passage") {
+      return rawEmbed(texts, inputType);
     },
 
     /**
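
A quick usage sketch of the new batch surface (the import specifier and the model id are assumptions for illustration, not taken from this diff):

  import { createAIClient } from "@pentatonic-ai/ai-agent-sdk";

  const ai = createAIClient({
    url: "http://localhost:11434/v1",
    model: "m", // any embedding-capable model id
  });

  // One HTTP round-trip for all inputs; failed entries come back as null
  // rather than throwing, mirroring embed()'s null-on-failure contract.
  const results = await ai.embedBatch(["alpha", "beta", "gamma"], "passage");
  for (const r of results) {
    if (r) console.log(r.dimensions, r.model);
  }
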
@@ -104,8 +104,27 @@ export async function distill(db, ai, llm, sourceId, content, opts = {}) {
   }
   const layerId = layerResult.rows[0].id;
 
+  // Batch-embed all atoms in one HTTP call. Under load this is a big
+  // win over N serial embed calls — one GPU forward pass instead of N,
+  // less downstream queueing.
+  let embeddings;
+  if (ai.embedBatch) {
+    try {
+      embeddings = await ai.embedBatch(facts, "passage");
+    } catch (err) {
+      log(`distill: batch embed failed: ${err.message}`);
+      embeddings = facts.map(() => null);
+    }
+  } else {
+    // Older AI clients without embedBatch — fall through to per-atom embed
+    // inside the loop below. Kept for backwards compat with any custom
+    // client passed into createMemorySystem.
+    embeddings = null;
+  }
+
   const stored = [];
-  for (const fact of facts) {
+  for (let i = 0; i < facts.length; i++) {
+    const fact = facts[i];
     try {
       const atomId = `mem_${crypto.randomUUID()}`;
 
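
The capability check in that hunk is duck-typed: distill() only requires an async embed(), and takes the single-round-trip path when embedBatch() is also present. A minimal sketch of a compatible custom client (myEmbedder is a hypothetical stand-in, not part of the SDK):

  const customAI = {
    // Required: single-text embed, resolving to null on failure.
    async embed(text, inputType = "passage") {
      const vec = await myEmbedder(text, inputType); // hypothetical embedder
      return vec
        ? { embedding: vec, dimensions: vec.length, model: "custom" }
        : null;
    },
    // Optional: when present, distill() uses this instead of N embed() calls.
    async embedBatch(texts, inputType = "passage") {
      return Promise.all(texts.map((t) => this.embed(t, inputType)));
    },
  };
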
@@ -124,9 +143,13 @@ export async function distill(db, ai, llm, sourceId, content, opts = {}) {
         ]
       );
 
-      // Embed the atom (non-fatal)
+      // Attach embedding — from the batch when available, else fall back
+      // to a per-atom call.
       try {
-        const embResult = await ai.embed(fact, "passage");
+        let embResult = embeddings ? embeddings[i] : null;
+        if (!embResult && !embeddings) {
+          embResult = await ai.embed(fact, "passage");
+        }
         if (embResult?.embedding) {
           await db(
             `UPDATE memory_nodes SET embedding = $1, updated_at = NOW() WHERE id = $2`,
@@ -137,7 +160,9 @@ export async function distill(db, ai, llm, sourceId, content, opts = {}) {
         log(`distill: embedding failed for ${atomId}: ${err.message}`);
       }
 
-      // HyDE (2 queries for atoms — they're already focused)
+      // HyDE (2 queries for atoms — they're already focused).
+      // Still per-atom — chat completions don't share a batch surface
+      // across providers the way embeddings do.
       try {
         const queries = await generateHypotheticalQueries(llm, fact);
         const trimmed = queries.slice(0, 2);
@@ -347,7 +347,7 @@ async function main() {
   const health = {
     status: "ok",
     client: CLIENT_ID,
-    version: "0.5.5",
+    version: "0.5.6",
     search: "text",
     db: false,
     ollama: false,