@pentatonic-ai/ai-agent-sdk 0.10.2 → 0.10.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -878,7 +878,7 @@ function fireAndForgetEmit(clientConfig, sessionOpts, messages, result, model) {
878
878
  }
879
879
 
880
880
  // src/telemetry.js
881
- var VERSION = "0.10.2";
881
+ var VERSION = "0.10.3";
882
882
  var TELEMETRY_URL = "https://sdk-telemetry.philip-134.workers.dev";
883
883
  function machineId() {
884
884
  const raw = typeof process !== "undefined" ? `${process.env?.USER || process.env?.USERNAME || "u"}:${process.platform || "x"}:${process.arch || "x"}` : "browser";
package/dist/index.js CHANGED
@@ -847,7 +847,7 @@ function fireAndForgetEmit(clientConfig, sessionOpts, messages, result, model) {
847
847
  }
848
848
 
849
849
  // src/telemetry.js
850
- var VERSION = "0.10.2";
850
+ var VERSION = "0.10.3";
851
851
  var TELEMETRY_URL = "https://sdk-telemetry.philip-134.workers.dev";
852
852
  function machineId() {
853
853
  const raw = typeof process !== "undefined" ? `${process.env?.USER || process.env?.USERNAME || "u"}:${process.platform || "x"}:${process.arch || "x"}` : "browser";
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@pentatonic-ai/ai-agent-sdk",
3
- "version": "0.10.2",
3
+ "version": "0.10.3",
4
4
  "description": "TES SDK — LLM observability and lifecycle tracking via Pentatonic Thing Event System. Track token usage, tool calls, and conversations. Manage things through event-sourced lifecycle stages with AI enrichment and vector search.",
5
5
  "type": "module",
6
6
  "main": "./dist/index.cjs",
@@ -256,3 +256,83 @@ def test_build_event_block_format_contract_truncation() -> None:
256
256
  })
257
257
  content_part = block.split("---\n", 1)[1]
258
258
  assert len(content_part) == worker.MAX_CONTENT_CHARS
259
+
260
+
261
+ # ----------------------------------------------------------------------
262
+ # clean_content() — HTML/CSS strip so email + doc events don't distil
263
+ # into junk `concept` entities (font-face, mso-font-alt, panose-1, etc.).
264
+ # ----------------------------------------------------------------------
265
+
266
def test_clean_content_passthrough_on_plain_text() -> None:
    """Text containing neither `<` nor `{` must come back untouched."""
    message = "Hi Phil, can we ship this Thursday? — Carly"
    result = worker.clean_content(message)
    assert result == message
270
+
271
+
272
def test_clean_content_strips_style_block() -> None:
    """Raw CSS inside `<style>...</style>` must vanish while the paragraph
    text that follows it survives."""
    style_block = "<style>body { font-family: Arial; mso-ascii-font-family: Times; }</style>"
    paragraph = "<p>Meeting confirmed for Thursday</p>"
    result = worker.clean_content(style_block + paragraph)
    # None of the CSS property names may leak into the cleaned text.
    for css_token in ("font-family", "mso-"):
        assert css_token not in result
    assert "Meeting confirmed for Thursday" in result
283
+
284
+
285
def test_clean_content_strips_inline_tags() -> None:
    """Tags are dropped; the human-readable text between them is kept."""
    markup = "<div><b>Acme Corp</b> joined the call</div>"
    result = worker.clean_content(markup)
    # No angle bracket of either kind may remain.
    assert not any(bracket in result for bracket in "<>")
    assert "Acme Corp joined the call" in result
291
+
292
+
293
def test_clean_content_decodes_entities_in_markup() -> None:
    """Entities are decoded once the tags have been stripped.

    This only applies to content that contained markup to begin with:
    pure plain text takes the fast-path return, so any entities in it
    pass through undecoded."""
    markup = "<p>Phil &amp; Carly agreed: ship &lt;next week&gt;</p>"
    expected = "Phil & Carly agreed: ship <next week>"
    assert worker.clean_content(markup) == expected
301
+
302
+
303
def test_clean_content_strips_mso_when_with_markup() -> None:
    """`mso-*` / `panose-1` declarations sitting next to real markup are
    removed without swallowing the message body.

    The sample terminates each declaration with `;` and puts the body on
    its own line, so the property-run pattern stops before the body."""
    leaked = "<div>mso-font-alt: Arial; panose-1: 2 11 6 4;\nMeeting on Thursday</div>"
    result = worker.clean_content(leaked)
    for junk in ("mso-", "panose-1"):
        assert junk not in result
    assert "Meeting on Thursday" in result
314
+
315
+
316
def test_clean_content_fast_path_returns_plain_text_unchanged() -> None:
    """Pins the fast-path contract: with no `<` and no `{` in the input,
    the text is returned verbatim, even when it mentions a token
    (`mso-font-alt`) that the cleaner would otherwise strip."""
    untouched = "mso-font-alt should pass through unchanged here"
    result = worker.clean_content(untouched)
    assert result == untouched
323
+
324
+
325
def test_clean_content_preserves_extractable_signal() -> None:
    """End-to-end check on an email-shaped document: styling goes, the
    human-readable body stays."""
    email = "".join(
        [
            "<html><head><style>",
            "@font-face { font-family: 'Calibri'; panose-1: 2 15 5 2; }",
            "</style></head><body>",
            "<p>Phil — I&#39;ve confirmed the SAFE amendments for Thursday.</p>",
            "<p>— Timothy</p></body></html>",
        ]
    )
    result = worker.clean_content(email)
    # Styling residue must be gone...
    for junk in ("Calibri", "panose-1"):
        assert junk not in result
    # ...and the message text (with its entity decoded) must remain.
    for kept in ("I've confirmed the SAFE amendments", "Timothy"):
        assert kept in result
@@ -143,12 +143,16 @@ nothing to extract, emit ONLY the header.
143
143
  ENT|person|Alex Wong|alex@example.com
144
144
  ENT|person|Acme Corp (org, no email)
145
145
  ENT|person|Sam Patel (person, email not visible)
146
- - FCT lines have exactly 6 fields: `FCT`, category, subject, \
147
- predicate, object, statement.
146
+ - FCT lines have EXACTLY 6 pipe-separated fields: `FCT`, category, subject, \
147
+ predicate, object, statement. COUNT THE PIPES: there must be 6 `|` segments. \
148
+ predicate and object are SEPARATE fields — NEVER merge them into the statement, \
149
+ and NEVER drop a field.
148
150
  category ∈ {decision, commitment, state, mention, observation, preference}
149
151
  subject MUST be an entity name declared in THIS event's ENT lines.
152
+ predicate is a short verb phrase (e.g. "agreed to", "owns", "works at").
150
153
  object MAY be an entity name OR a literal string OR `-` if absent.
151
- statement ≤ 140 characters.
154
+ statement ≤ 140 characters, a self-contained sentence.
155
+ WORKED EXAMPLE: `FCT|commitment|Timothy Bradley|agreed to|SAFE amendments|Timothy confirmed the SAFE amendments are set (14 May 2026)`
152
156
  - REL lines have exactly 4 fields: `REL`, from, to, rel_type.
153
157
  from and to MUST be entity names declared in THIS event's ENT lines.
154
158
  rel_type is a short verb / preposition phrase.
@@ -167,10 +171,59 @@ A whole file is one entity, not twenty.
167
171
  SYSTEM_PROMPT_HASH = hashlib.sha256(BATCH_SYSTEM_PROMPT.encode()).hexdigest()[:16]
168
172
 
169
173
 
174
+ # --------------------------------------------------------------------
175
+ # Content cleaner — strip HTML/CSS so email + doc styling never reaches
176
+ # the LLM as text to extract. Without this, events containing Outlook /
177
+ # Gmail / docx-export markup get distilled into junk concept entities
178
+ # (`font-face`, `mso-font-alt`, `panose-1`, `src`) that pollute the
179
+ # graph. clean_content() is a no-op fast path on plain text — only
180
+ # events whose body contains `<` or `{` pay the regex cost.
181
+ # --------------------------------------------------------------------
182
+
183
# Whole <style>/<script> elements, including their contents.
_CC_STYLE = re.compile(r"<(style|script)\b[^>]*>.*?</\1>", re.IGNORECASE | re.DOTALL)
# Freestanding `selector { ... }` CSS rules outside a <style> element.
# NOTE(review): the selector class allows letters and spaces, so ordinary
# words directly preceding a `{...}` group are consumed along with it
# (e.g. prose followed by a JSON-ish snippet) — confirm that is acceptable.
_CC_CSSRULE = re.compile(r"[.#@]?[A-Za-z0-9_.:#> -]+\s*\{[^{}]*\}")
# MS-Office / font property runs, up to the next `;` or end of line.
_CC_MSO = re.compile(r"\b(mso-[\w-]+|panose-1|font-family|font-face)\b[^;\n]*;?", re.IGNORECASE)
# Any remaining tag.
_CC_TAG = re.compile(r"<[^>]+>")
# Runs of horizontal whitespace (newlines are handled separately).
_CC_WS = re.compile(r"[ \t\r\f]+")
# A newline together with the single space that may pad it on either side
# once horizontal whitespace has been collapsed.
_CC_NL_PAD = re.compile(r" ?\n ?")
# Three or more consecutive newlines.
_CC_NL = re.compile(r"\n{3,}")
# Entity -> replacement table.
_CC_ENT = (
    ("&nbsp;", " "), ("&amp;", "&"), ("&lt;", "<"),
    ("&gt;", ">"), ("&quot;", '"'), ("&#39;", "'"), ("&apos;", "'"),
)
# Single-pass decoder built from the table above. Decoding every entity in
# one scan means the output of one replacement is never re-read as the
# input of another, so `&amp;lt;` decodes to the text `&lt;`, not to `<`.
_CC_ENT_MAP = dict(_CC_ENT)
_CC_ENT_RE = re.compile("|".join(re.escape(entity) for entity, _ in _CC_ENT))


def clean_content(text: str) -> str:
    """Strip HTML/CSS so email + doc styling doesn't distil into junk
    `concept` entities (font-face, mso-font-alt, etc.).

    Fast early return on plain text: falsy input, or input containing
    neither `<` nor `{`, is returned unchanged (entities included).

    On marked-up content the passes run in this order:

    1. remove `<style>` / `<script>` elements with their contents;
    2. remove standalone CSS rules;
    3. remove all remaining tags;
    4. remove MS-Office / panose / font property runs that are left
       behind as freestanding tokens;
    5. decode the entities listed in `_CC_ENT`, in a single pass and
       after the tag pass, so that a decoded `&lt;` is neither stripped
       as a tag nor produced by double-decoding `&amp;lt;`;
    6. collapse horizontal whitespace, trim the padding that tag removal
       leaves around newlines, then squeeze runs of three or more
       newlines down to one blank line.

    Returns the cleaned text with leading/trailing whitespace removed.
    """
    if not text or ("<" not in text and "{" not in text):
        return text
    t = _CC_STYLE.sub(" ", text)
    t = _CC_CSSRULE.sub(" ", t)
    t = _CC_TAG.sub(" ", t)
    t = _CC_MSO.sub(" ", t)
    t = _CC_ENT_RE.sub(lambda match: _CC_ENT_MAP[match.group(0)], t)
    t = _CC_WS.sub(" ", t)
    # Each stripped tag becomes a space, so consecutive lines of markup end
    # up as "\n \n \n". Without trimming that padding the blank-line
    # collapse below never sees adjacent newlines.
    t = _CC_NL_PAD.sub("\n", t)
    t = _CC_NL.sub("\n\n", t)
    return t.strip()
217
+
218
+
170
219
  def build_event_block(idx: int, event: dict[str, Any]) -> str:
171
- """Render one event as `[event K]\nheader\n---\ncontent` block."""
220
+ """Render one event as `[event K]\nheader\n---\ncontent` block.
221
+
222
+ Content is passed through `clean_content()` before truncation so
223
+ that the MAX_CONTENT_CHARS slice doesn't end up containing pure
224
+ HTML markup with no extractable signal."""
172
225
  src = event.get("source_kind", "unknown")
173
- content = (event.get("content") or "")[:MAX_CONTENT_CHARS]
226
+ content = clean_content(event.get("content") or "")[:MAX_CONTENT_CHARS]
174
227
  attrs = event.get("attributes") or {}
175
228
  when = attrs.get("emitted_at") or attrs.get("timestamp")
176
229
  author = attrs.get("author") or attrs.get("user_id")