npm - @pentatonic-ai/ai-agent-sdk - Versions diffs - 0.10.2 → 0.10.3 - Mend

@pentatonic-ai/ai-agent-sdk 0.10.2 → 0.10.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (5)

package/dist/index.cjs +1 -1
package/dist/index.js +1 -1
package/package.json +1 -1
package/packages/memory-engine-v2/extractor-async/test_async_ent_parser.py +80 -0
package/packages/memory-engine-v2/extractor-async/worker.py +58 -5

package/dist/index.cjs CHANGED Viewed

@@ -878,7 +878,7 @@ function fireAndForgetEmit(clientConfig, sessionOpts, messages, result, model) {
 }
 // src/telemetry.js
-var VERSION = "0.10.2";
+var VERSION = "0.10.3";
 var TELEMETRY_URL = "https://sdk-telemetry.philip-134.workers.dev";
 function machineId() {
   const raw = typeof process !== "undefined" ? `${process.env?.USER || process.env?.USERNAME || "u"}:${process.platform || "x"}:${process.arch || "x"}` : "browser";

package/dist/index.js CHANGED Viewed

@@ -847,7 +847,7 @@ function fireAndForgetEmit(clientConfig, sessionOpts, messages, result, model) {
 }
 // src/telemetry.js
-var VERSION = "0.10.2";
+var VERSION = "0.10.3";
 var TELEMETRY_URL = "https://sdk-telemetry.philip-134.workers.dev";
 function machineId() {
   const raw = typeof process !== "undefined" ? `${process.env?.USER || process.env?.USERNAME || "u"}:${process.platform || "x"}:${process.arch || "x"}` : "browser";

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@pentatonic-ai/ai-agent-sdk",
-  "version": "0.10.2",
+  "version": "0.10.3",
   "description": "TES SDK — LLM observability and lifecycle tracking via Pentatonic Thing Event System. Track token usage, tool calls, and conversations. Manage things through event-sourced lifecycle stages with AI enrichment and vector search.",
   "type": "module",
   "main": "./dist/index.cjs",

package/packages/memory-engine-v2/extractor-async/test_async_ent_parser.py CHANGED Viewed

@@ -256,3 +256,83 @@ def test_build_event_block_format_contract_truncation() -> None:
     })
     content_part = block.split("---\n", 1)[1]
     assert len(content_part) == worker.MAX_CONTENT_CHARS
+# ----------------------------------------------------------------------
+# clean_content() — HTML/CSS strip so email + doc events don't distil
+# into junk `concept` entities (font-face, mso-font-alt, panose-1, etc.).
+# ----------------------------------------------------------------------
+def test_clean_content_passthrough_on_plain_text() -> None:
+    """Hot path: events without `<` or `{` skip all regex work."""
+    plain = "Hi Phil, can we ship this Thursday? — Carly"
+    assert worker.clean_content(plain) == plain
+def test_clean_content_strips_style_block() -> None:
+    """`<style>...</style>` blocks contain raw CSS that would otherwise
+    pollute the entity graph with `font-family`, `mso-*` etc."""
+    html = (
+        "<style>body { font-family: Arial; mso-ascii-font-family: Times; }</style>"
+        "<p>Meeting confirmed for Thursday</p>"
+    )
+    cleaned = worker.clean_content(html)
+    assert "font-family" not in cleaned
+    assert "mso-" not in cleaned
+    assert "Meeting confirmed for Thursday" in cleaned
+def test_clean_content_strips_inline_tags() -> None:
+    """Inline tags removed but the human text between them is kept."""
+    html = "<div><b>Acme Corp</b> joined the call</div>"
+    cleaned = worker.clean_content(html)
+    assert "<" not in cleaned and ">" not in cleaned
+    assert "Acme Corp joined the call" in cleaned
+def test_clean_content_decodes_entities_in_markup() -> None:
+    """HTML entities decode after tag-strip so we keep human meaning.
+    Only fires when the content was tagged in the first place — pure
+    plain text takes the fast-path skip and entities pass through as-is
+    (which is fine; plain-text entities are rare and harmless)."""
+    html = "<p>Phil &amp; Carly agreed: ship &lt;next week&gt;</p>"
+    cleaned = worker.clean_content(html)
+    assert cleaned == "Phil & Carly agreed: ship <next week>"
+def test_clean_content_strips_mso_when_with_markup() -> None:
+    """`mso-*` / `panose-1` declarations leak from Outlook exports. The
+    fast-path only fires on text without `<` or `{`, so we test the
+    common case: mso-tokens alongside HTML that triggers the cleaner.
+    Realistic shape — each declaration terminated by `;` like in real
+    Outlook CSS leak — so the strip doesn't greedy-match into the body."""
+    weird = "<div>mso-font-alt: Arial; panose-1: 2 11 6 4;\nMeeting on Thursday</div>"
+    cleaned = worker.clean_content(weird)
+    assert "mso-" not in cleaned
+    assert "panose-1" not in cleaned
+    assert "Meeting on Thursday" in cleaned
+def test_clean_content_fast_path_returns_plain_text_unchanged() -> None:
+    """Documented contract of the fast-path: input with neither `<`
+    nor `{` passes through verbatim. Locks in the perf-vs-correctness
+    trade-off (most events are plain text; running 7 regexes on each
+    one is wasted)."""
+    plain = "mso-font-alt should pass through unchanged here"
+    assert worker.clean_content(plain) == plain
+def test_clean_content_preserves_extractable_signal() -> None:
+    """End-to-end: a representative email-shaped event should clean down
+    to just the human-readable body."""
+    email = (
+        "<html><head><style>"
+        "@font-face { font-family: 'Calibri'; panose-1: 2 15 5 2; }"
+        "</style></head><body>"
+        "<p>Phil — I&#39;ve confirmed the SAFE amendments for Thursday.</p>"
+        "<p>— Timothy</p></body></html>"
+    )
+    cleaned = worker.clean_content(email)
+    assert "Calibri" not in cleaned and "panose-1" not in cleaned
+    assert "I've confirmed the SAFE amendments" in cleaned
+    assert "Timothy" in cleaned

package/packages/memory-engine-v2/extractor-async/worker.py CHANGED Viewed

@@ -143,12 +143,16 @@ nothing to extract, emit ONLY the header.
     ENT|person|Alex Wong|alex@example.com
     ENT|person|Acme Corp           (org, no email)
     ENT|person|Sam Patel           (person, email not visible)
-- FCT lines have exactly 6 fields: `FCT`, category, subject, \
-predicate, object, statement.
+- FCT lines have EXACTLY 6 pipe-separated fields: `FCT`, category, subject, \
+predicate, object, statement. COUNT THE PIPES: there must be 6 `|` segments. \
+predicate and object are SEPARATE fields — NEVER merge them into the statement, \
+and NEVER drop a field.
   category ∈ {decision, commitment, state, mention, observation, preference}
   subject MUST be an entity name declared in THIS event's ENT lines.
+  predicate is a short verb phrase (e.g. "agreed to", "owns", "works at").
   object MAY be an entity name OR a literal string OR `-` if absent.
-  statement ≤ 140 characters.
+  statement ≤ 140 characters, a self-contained sentence.
+  WORKED EXAMPLE: `FCT|commitment|Timothy Bradley|agreed to|SAFE amendments|Timothy confirmed the SAFE amendments are set (14 May 2026)`
 - REL lines have exactly 4 fields: `REL`, from, to, rel_type.
   from and to MUST be entity names declared in THIS event's ENT lines.
   rel_type is a short verb / preposition phrase.
@@ -167,10 +171,59 @@ A whole file is one entity, not twenty.
 SYSTEM_PROMPT_HASH = hashlib.sha256(BATCH_SYSTEM_PROMPT.encode()).hexdigest()[:16]
+# --------------------------------------------------------------------
+# Content cleaner — strip HTML/CSS so email + doc styling never reaches
+# the LLM as text to extract. Without this, events containing Outlook /
+# Gmail / docx-export markup get distilled into junk concept entities
+# (`font-face`, `mso-font-alt`, `panose-1`, `src`) that pollute the
+# graph. clean_content() is a no-op fast path on plain text — only
+# events whose body contains `<` or `{` pay the regex cost.
+# --------------------------------------------------------------------
+_CC_STYLE = re.compile(r"<(style|script)\b[^>]*>.*?</\1>", re.IGNORECASE | re.DOTALL)
+_CC_CSSRULE = re.compile(r"[.#@]?[A-Za-z0-9_.:#> -]+\s*\{[^{}]*\}")
+_CC_MSO = re.compile(r"\b(mso-[\w-]+|panose-1|font-family|font-face)\b[^;\n]*;?", re.IGNORECASE)
+_CC_TAG = re.compile(r"<[^>]+>")
+_CC_WS = re.compile(r"[ \t\r\f]+")
+_CC_NL = re.compile(r"\n{3,}")
+_CC_ENT = (
+    ("&nbsp;", " "), ("&amp;", "&"), ("&lt;", "<"),
+    ("&gt;", ">"), ("&quot;", '"'), ("&#39;", "'"), ("&apos;", "'"),
+)
+def clean_content(text: str) -> str:
+    """Strip HTML/CSS so email + doc styling doesn't distil into junk
+    `concept` entities (font-face, mso-font-alt, etc.).
+    Fast early return on plain text (no `<` or `{`). On marked-up
+    content, removes `<style>` / `<script>` blocks first, then
+    standalone CSS rules, then all remaining tags, then MS-Office /
+    panose / font-face property runs that leak as freestanding tokens
+    in some Outlook exports. HTML entities are decoded last so we
+    don't accidentally introduce `<` tags from `&lt;` after the tag
+    pass."""
+    if not text or ("<" not in text and "{" not in text):
+        return text
+    t = _CC_STYLE.sub(" ", text)
+    t = _CC_CSSRULE.sub(" ", t)
+    t = _CC_TAG.sub(" ", t)
+    t = _CC_MSO.sub(" ", t)
+    for a, b in _CC_ENT:
+        t = t.replace(a, b)
+    t = _CC_WS.sub(" ", t)
+    t = _CC_NL.sub("\n\n", t)
+    return t.strip()
 def build_event_block(idx: int, event: dict[str, Any]) -> str:
-    """Render one event as `[event K]\nheader\n---\ncontent` block."""
+    """Render one event as `[event K]\nheader\n---\ncontent` block.
+    Content is passed through `clean_content()` before truncation so
+    that the MAX_CONTENT_CHARS slice doesn't end up containing pure
+    HTML markup with no extractable signal."""
     src = event.get("source_kind", "unknown")
-    content = (event.get("content") or "")[:MAX_CONTENT_CHARS]
+    content = clean_content(event.get("content") or "")[:MAX_CONTENT_CHARS]
     attrs = event.get("attributes") or {}
     when = attrs.get("emitted_at") or attrs.get("timestamp")
     author = attrs.get("author") or attrs.get("user_id")