npm - @pentatonic-ai/ai-agent-sdk - Versions diffs - 0.9.6 → 0.10.0 - Mend

@pentatonic-ai/ai-agent-sdk 0.9.6 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (127) hide show

package/packages/memory-engine-v2/tests/fixtures/generate_synthetic_corpus.py ADDED Viewed

@@ -0,0 +1,758 @@
+#!/usr/bin/env python3
+"""Generate synthetic memory-engine-v2 events from real-corpus distributions.
+Outputs JSONL matching the events table schema. PII-free; deterministic
+with --seed. Drop-in for tests/fixtures + extractor pipeline validation.
+The distributions baked into DISTRIBUTIONS below were extracted from the
+prod v2 org-model on 2026-05-29 via aggregate queries — counts and
+percentiles only, no record content sampled. See the ENGINEERING_FIX
+register for the methodology.
+The synthetic content is deliberately seeded with named-entity fragments
+(people, companies, projects, dates, decisions) so the v2 extractor
+pipeline has meaningful targets — pure lorem-ipsum would leave the
+extractor with nothing to extract.
+Stdlib only; no third-party packages are needed to run the script.
+NOTE: an optional faker fallback (--use-faker, for richer names) is not
+exposed by the argument parser defined in this file.
+Usage
+-----
+    python generate_synthetic_corpus.py \\
+        --chat 700 --note 700 --event 700 \\
+        --seed 42 \\
+        --output ~/dumps/synthetic-2026-05-28/
+Output structure
+----------------
+    ~/dumps/synthetic-2026-05-28/
+      chat.jsonl          # 700 records
+      note.jsonl          # 700 records
+      event.jsonl         # 700 records
+      manifest.json       # generation parameters + checksums
+"""
+from __future__ import annotations
+import argparse
+import hashlib
+import json
+import os
+import random
+import sys
+import uuid
+from datetime import datetime, timedelta, timezone
+from pathlib import Path
+from typing import Any
+# ---------------------------------------------------------------------------
+# Distributions extracted from prod v2 org-model 2026-05-29.
+# Numbers only — no content was sampled. Content-length percentiles are
+# character counts of the `content` field; thread depth means messages per
+# thread.
+# ---------------------------------------------------------------------------
+# Hour-of-day weighting shared across kinds. The prod data shows a heavy
+# ingest-batch spike at 17 UTC because emitted_at = ingest time, not send
+# time. For synthetic data we want REAL send times distributed across the
+# working day, so we use a 9am-6pm weighted bell.
+_HOUR_WEIGHTS = [  # 24 relative weights, index = hour of day (0-23); fed to random.choices, so they need not sum to 1
+    0.01, 0.01, 0.01, 0.01, 0.01, 0.02,  # 0-5
+    0.03, 0.05, 0.07, 0.09, 0.10, 0.10,  # 6-11
+    0.08, 0.09, 0.09, 0.08, 0.06, 0.04,  # 12-17
+    0.03, 0.01, 0.01, 0.01, 0.01, 0.01,  # 18-23
+]
+DISTRIBUTIONS: dict[str, dict[str, Any]] = {  # one stats dict per source kind ("chat", "note", "event")
+    "chat": {
+        # Percentiles for length(content) in chars (p10, p25, p50, p75, p90, p99, max); consumed by piecewise_sample
+        "content_len_percentiles": [22, 37, 78, 190, 603, 1960, 4727],
+        "content_len_mean": 227,  # informational; not read by the code in this file
+        # Threading: 60% of chats live in a thread; avg 4.5 msgs/thread
+        "pct_threaded": 0.60,
+        "thread_depth_percentiles": [3, 10, 25, 70],  # p50, p90, p99, max
+        # Inbound vs outbound (real: 90/10); a record is outbound when rng.random() >= p_inbound
+        "p_inbound": 0.90,
+        "hour_weights": _HOUR_WEIGHTS,
+        "distinct_teams": 83,  # informational; not read by the code in this file
+        "distinct_channels": 2,  # informational; not read by the code in this file
+        "distinct_authors": 128,  # informational; not read by the code in this file
+    },
+    "note": {
+        # Email body — note the 2000-char ingest cap on max
+        "content_len_percentiles": [152, 195, 863, 1823, 1961, 1996, 2000],
+        "content_len_mean": 996,  # informational; not read by the code in this file
+        "pct_threaded": 0.45,
+        "thread_depth_percentiles": [1, 1, 6, 57],  # p50, p90, p99, max
+        "p_inbound": 0.75,
+        "hour_weights": _HOUR_WEIGHTS,
+        "distinct_authors": 1109,  # informational; not read by the code in this file
+    },
+    "event": {
+        # Calendar event — title + description
+        "content_len_percentiles": [108, 133, 308, 604, 1122, 1988, 1998],
+        "content_len_mean": 465,  # informational; not read by the code in this file
+        "pct_threaded": 0.0,  # main() never plans threads for events
+        "thread_depth_percentiles": None,
+        "p_inbound": None,  # build_event_record assigns no direction
+        "hour_weights": _HOUR_WEIGHTS,
+        "distinct_authors": 110,  # informational; not read by the code in this file
+    },
+}
+# ---------------------------------------------------------------------------
+# Name + word pools — embedded so the script has zero runtime deps.
+# Picked to be obviously-synthetic so generated content can't be confused
+# with real corpus content.
+# ---------------------------------------------------------------------------
+FIRST_NAMES = [  # fills the {p1}/{p2}/{p3} placeholders and author first names
+    "Avery", "Bailey", "Carson", "Drew", "Emerson", "Finley", "Greer",
+    "Harper", "Indie", "Jordan", "Kai", "Logan", "Morgan", "Noa", "Oakley",
+    "Parker", "Quinn", "Reese", "Sage", "Tatum", "Uma", "Vesper", "Wren",
+    "Xen", "Yara", "Zane", "Arden", "Blake", "Cassidy", "Devon", "Ellis",
+]
+LAST_NAMES = [  # only used for author names and the e-mail addresses derived from them
+    "Sterling", "Holloway", "Whitfield", "Carrington", "Vance", "Ashford",
+    "Beaumont", "Caldwell", "Donovan", "Everhart", "Fairfax", "Granger",
+    "Hathaway", "Ingram", "Jasper", "Kingsley", "Lockwood", "Merritt",
+    "Northrop", "Ormsby", "Pemberton", "Quinton", "Radcliffe", "Sinclair",
+    "Thornton", "Underhill", "Valencia", "Westbrook", "Yardley", "Zamora",
+]
+COMPANIES = [  # fills {company}; lowercased with spaces removed it also fills {our_company}
+    "Aldera", "Brevix", "Calibrant", "Demarcation", "Evermere", "Fjordline",
+    "Glasshouse", "Hexalite", "Iridos", "Junctura", "Kestrel Works",
+    "Lumenfold", "Mistrell", "Norden", "Obsidiana", "Parallax Labs",
+    "Quillet", "Reflectory", "Silica Tide", "Tessera", "Umberhold",
+    "Verdant Loop", "Wayfound", "Xerelt", "Yarrow Forge", "Zincara",
+]
+PROJECTS = [  # fills {project}; two entries joined with "-" form the event {slug}
+    "Beacon", "Cinder", "Ditto", "Eclipse", "Fathom", "Glide", "Hatch",
+    "Inkwell", "Juniper", "Keelhaul", "Loom", "Mosaic", "Notch", "Onyx",
+    "Polaris", "Quill", "Rover", "Slate", "Tundra", "Unify", "Voyage",
+    "Whisper", "Xenith", "Yonder", "Zephyr",
+]
+TEAMS = [  # fills {team}; also the "team" attribute of chat records
+    "platform-core", "growth-experiments", "design-systems", "infra-edge",
+    "data-pipeline", "billing-and-ledger", "ml-foundations", "frontend-app",
+    "mobile-launch", "search-quality", "observability", "trust-and-safety",
+    "partner-integrations", "customer-success", "qa-automation",
+    "docs-and-developer-experience", "embedded-devices", "hardware-eng",
+]
+EMOJI_REACTIONS = [  # fills {emoji}; gen_chat_content also appends one as short padding
+    "👍", "🔥", "🚀", "👀", "🎉", "✅", "🙏", "💯", "👋", "🤔",
+    "💭", "💪", "🧠", "📌", "🎯", "🛠️", "📊", "📝", "🐛", "❤️",
+]
+CHAT_TEMPLATES = [  # str.format templates; gen_chat_content supplies p1, project, company, team, day, emoji
+    "@{p1} did you see the {project} update?",
+    "yeah I'll pick up {project} by {day}",
+    "should we move the {project} review to {day}?",
+    "{p1} just merged the {project} fix",
+    "approving the {company} contract this week",
+    "any blockers on {project}?",
+    "shipped {project} to staging just now",
+    "can someone take a look at {project} when you have a sec",
+    "{p1} {emoji} that's exactly what I was thinking",
+    "let's chat about {project} after standup",
+    "I disagree — going with {company} would lock us in",
+    "decision: we'll use {project} as the canonical path",
+    "{p1} you free for 15 in the {team} room?",
+    "logged the {project} bug, ticket is in the channel",
+    "rollback completed for {project}, prod is stable",
+]
+EMAIL_SUBJECT_TEMPLATES = [  # build_note_record supplies project, company, day, our_company
+    "Re: {project} — {day} update",
+    "{company} renewal: action required",
+    "Follow-up: {project} review meeting",
+    "Quick note on {project}",
+    "{company} <> {our_company} partnership next steps",
+    "FYI: {project} deployment Friday",
+    "Decision needed: {project} approach",
+    "Heads up — {project} timeline update",
+]
+EMAIL_BODY_FRAGMENTS = [  # opening paragraph; gen_email_content supplies p1, project, company, team, day
+    "Hi {p1},\n\nWanted to flag a few things on {project}:",
+    "Thanks for the call earlier. To recap what we agreed:",
+    "Quick note ahead of {day}'s {project} sync —",
+    "Following up on the {company} discussion.",
+    "Sharing notes from the {team} review.",
+]
+EMAIL_BODY_MIDDLES = [  # repeated until ~80% of the target length; supplied: p1, p2, p3, project, company, team, day
+    "1. The {project} scope is now locked for {day}.\n2. {p1} owns the implementation.\n3. {p2} will review the design doc by EOW.\n",
+    "We decided to go with {company} for the integration. The contract is in legal review and should close by {day}.\n",
+    "Open question: do we want {project} to ship under the {team} brand or its own surface?\n",
+    "Action items from the call:\n- {p1}: update the {project} migration plan\n- {p2}: draft the {company} announcement\n- {p3}: schedule the {team} sync\n",
+    "Risks I want to call out:\n- {project} depends on {company}'s release; if they slip we slip\n- {team} bandwidth is tight through {day}\n- The {project} rollback path needs a dry run\n",
+]
+EMAIL_CLOSINGS = [  # sign-off; gen_email_content supplies author, team, day
+    "\nLet me know if anything looks off.\n\nThanks,\n{author}",
+    "\nHappy to hop on a call to walk through this.\n\nBest,\n{author}",
+    "\nWill follow up after the {team} review.\n\n{author}",
+    "\nLet's regroup on {day}.\n\n{author}",
+]
+EVENT_TITLE_TEMPLATES = [  # first line of event content; gen_event_content supplies p1, p2, project, company, team, ver
+    "{project} weekly sync",
+    "{p1} <> {p2} 1:1",
+    "{company} contract review",
+    "{team} planning",
+    "Office hours: {project}",
+    "{project} retro",
+    "{p1} interview ({project} role)",
+    "Demo: {project} v{ver}",
+    "{team} standup",
+    "{company} kickoff call",
+]
+EVENT_DESC_TEMPLATES = [  # supplied: p1, p2, p3, project, company, team, place, slug, our_company
+    "Agenda:\n- {project} status\n- Open risks\n- Decision: {project} launch date\n\nAttendees: {p1}, {p2}, {p3}\nLocation: {place}",
+    "Walkthrough of the {project} design doc.\n\nDial-in: meet.{our_company}.com/{slug}\nNotes will be shared after.",
+    "Quarterly {team} planning. Bring your top 3 priorities for the next quarter.\n\nFacilitator: {p1}\nNote-taker: {p2}",
+    "Demo + Q&A for the {project} release.\n\nRecording will be posted in #{team}.\nLocation: {place}",
+    "Discussion with {company} on integration scope.\n\nOur side: {p1}, {p2}\nTheir side: TBD",
+]
+# ---------------------------------------------------------------------------
+# Sampling helpers
+# ---------------------------------------------------------------------------
+def piecewise_sample(rng: random.Random, percentiles: list[int]) -> int:
+    """Draw an integer from a distribution described by percentile knots.
+    ``percentiles`` holds the values at p10, p25, p50, p75, p90, p99 and max,
+    in that order. A single uniform draw selects a segment and the result is
+    linearly interpolated inside it, truncated to int and floored at 1. The
+    segment below p10 starts at one third of the p10 value (at least 1).
+    """
+    cut_points = (0.10, 0.25, 0.50, 0.75, 0.90, 0.99, 1.00)
+    draw = rng.random()
+    lo_q = 0.0
+    lo_val = max(1, percentiles[0] // 3)  # synthetic lower edge below p10
+    for hi_q, hi_val in zip(cut_points, percentiles):
+        if draw > hi_q:
+            # Not in this segment: its upper knot becomes the next lower knot.
+            lo_q, lo_val = hi_q, hi_val
+            continue
+        width = hi_q - lo_q
+        frac = (draw - lo_q) / width if width > 0 else 0
+        return max(1, int(lo_val + frac * (hi_val - lo_val)))
+    # Only reached if the draw exceeds every cut point.
+    return percentiles[-1]
+def weighted_hour(rng: random.Random, weights: list[float]) -> int:
+    """Return an hour (0-23) drawn using the given per-hour relative weights."""
+    hours = range(24)
+    (chosen,) = rng.choices(hours, weights=weights, k=1)
+    return chosen
+def sample_timestamp(rng: random.Random, start: datetime, span_days: int,
+                     hour_weights: list[float]) -> datetime:
+    """Pick a random datetime within [start, start+span_days), with hour-of-day
+    weighting (so the synthetic corpus skews to working hours rather than
+    being uniform across the day).
+    A day is chosen uniformly inside the window, then hour, minute, second
+    and millisecond are replaced. Replacing the time of day can push a stamp
+    on the first or last day outside the window; such stamps are shifted by
+    one whole day back into it, which keeps the sampled hour intact. The
+    correction only applies when ``span_days >= 1``; shorter windows cannot
+    hold every hour of the day.
+    """
+    day_offset = rng.uniform(0, span_days)
+    base = start + timedelta(days=day_offset)
+    # Same draw as weighted_hour(rng, hour_weights).
+    hour = rng.choices(range(24), weights=hour_weights, k=1)[0]
+    minute = rng.randint(0, 59)
+    second = rng.randint(0, 59)
+    millisecond = rng.randint(0, 999)
+    stamp = base.replace(hour=hour, minute=minute, second=second,
+                         microsecond=millisecond * 1000)
+    if span_days >= 1:
+        window_end = start + timedelta(days=span_days)
+        if stamp < start:
+            # Landed on the first day, earlier than the window opens.
+            stamp += timedelta(days=1)
+        elif stamp >= window_end:
+            # Landed on the last day, at or after the window closes.
+            stamp -= timedelta(days=1)
+    return stamp
+def synth_email(rng: random.Random, first: str, last: str,
+                domain: str = "example-corp.com") -> str:
+    """Build ``first.last@domain`` with both name parts lowercased.
+    ``rng`` is not used."""
+    local_part = ".".join((first.lower(), last.lower()))
+    return local_part + "@" + domain
+def synth_slack_id(rng: random.Random) -> str:
+    """Return ``"U"`` followed by ten random uppercase hex digits."""
+    alphabet = "0123456789ABCDEF"
+    digits = rng.choices(alphabet, k=10)
+    return "U" + "".join(digits)
+# ---------------------------------------------------------------------------
+# Content generators per source_kind. All produce content that:
+#   - contains named entities (people, companies, projects) for extractor
+#   - matches a target length approximately
+#   - is obviously-synthetic (no real-world specifics)
+# ---------------------------------------------------------------------------
+def gen_chat_content(rng: random.Random, target_len: int) -> str:
+    """Produce a slack-style chat message of at most ``target_len`` chars.
+    One template is rendered first. While the text is still shorter than the
+    target, further rendered templates are appended on new lines; when fewer
+    than 30 characters are missing, a single emoji is appended instead and
+    the padding stops. The result is cut to ``target_len``."""
+    opener_fields = {
+        "p1": rng.choice(FIRST_NAMES),
+        "project": rng.choice(PROJECTS),
+        "company": rng.choice(COMPANIES),
+        "team": rng.choice(TEAMS),
+        "day": rng.choice(["Monday", "Tuesday", "Wednesday", "Thursday",
+                           "Friday", "next week", "EOD", "tomorrow"]),
+        "emoji": rng.choice(EMOJI_REACTIONS),
+    }
+    text = rng.choice(CHAT_TEMPLATES).format(**opener_fields)
+    while len(text) < target_len:
+        if target_len - len(text) < 30:
+            # Too little room for another sentence: pad with one emoji.
+            text += " " + rng.choice(EMOJI_REACTIONS)
+            break
+        template = rng.choice(CHAT_TEMPLATES)
+        follow_up = template.format(
+            p1=rng.choice(FIRST_NAMES),
+            project=rng.choice(PROJECTS),
+            company=rng.choice(COMPANIES),
+            team=rng.choice(TEAMS),
+            day=rng.choice(["Monday", "Tuesday", "Friday", "next sprint"]),
+            emoji=rng.choice(EMOJI_REACTIONS),
+        )
+        text = f"{text}\n{follow_up}"
+    return text[:target_len]
+def gen_email_content(rng: random.Random, target_len: int, author_name: str) -> str:
+    """Produce an email body (no subject) of at most ``target_len`` chars.
+    Layout: one opening fragment, then middle paragraphs until the pieces add
+    up to at least 80% of the target, then one closing signed with
+    ``author_name``. The pieces are joined with newlines and truncated."""
+    p1, p2, p3 = [rng.choice(FIRST_NAMES) for _ in range(3)]
+    fields = {
+        "p1": p1, "p2": p2, "p3": p3,
+        "project": rng.choice(PROJECTS),
+        "company": rng.choice(COMPANIES),
+        "team": rng.choice(TEAMS),
+        "day": rng.choice(["Monday", "Wednesday", "Friday", "next week", "end of month"]),
+    }
+    # str.format ignores unused keyword arguments, so every template can be
+    # rendered from the same field dict.
+    opening = rng.choice(EMAIL_BODY_FRAGMENTS).format(**fields)
+    pieces = [opening]
+    written = len(opening)
+    threshold = target_len * 0.8
+    while written < threshold:
+        middle = rng.choice(EMAIL_BODY_MIDDLES).format(**fields)
+        pieces.append(middle)
+        written += len(middle)
+    closing = rng.choice(EMAIL_CLOSINGS).format(
+        author=author_name, team=fields["team"], day=fields["day"])
+    pieces.append(closing)
+    return "\n".join(pieces)[:target_len]
+def _event_section(rng: random.Random, kind: str, project: str, company: str,
+                   team: str, p1: str, p2: str, p3: str) -> str:
+    """Render one optional section of an event description.
+    ``kind`` is one of "pre_read", "attendees", "agenda", "context" or
+    "logistics"; any other value yields an empty string. Every section starts
+    with a blank line so it can be appended directly to existing text."""
+    section = ""
+    if kind == "pre_read":
+        section = (
+            f"\n\nPre-read:\n"
+            f"- Latest {project} status doc (linked in calendar)\n"
+            f"- {company} contract draft (shared earlier this week)\n"
+            f"- {team} OKRs deck"
+        )
+    elif kind == "attendees":
+        extra_count = rng.randint(4, 8)
+        extras = rng.sample(FIRST_NAMES, k=extra_count)
+        more = ", ".join(extras)
+        section = f"\n\nExpected attendees: {p1}, {p2}, {p3}, {more}"
+    elif kind == "agenda":
+        candidates = [
+            f"{project} status update",
+            f"Risks and dependencies on {company}",
+            f"{team} headcount + hiring",
+            f"Demo of latest {project} build",
+            "Q&A and open discussion",
+            "Action items + owners",
+            f"Decision: {project} launch date",
+            "Review of last week's action items",
+            f"Customer feedback ({company} pilot)",
+            f"Roadmap alignment with {team}",
+        ]
+        item_count = rng.randint(3, 6)
+        bullets = ["- " + item for item in rng.sample(candidates, k=item_count)]
+        section = "\n\nAgenda:\n" + "\n".join(bullets)
+    elif kind == "context":
+        section = (
+            f"\n\nContext: We're aligning on {project} ahead of the {team} "
+            f"review next week. {p1} will drive the discussion; {p2} is "
+            f"taking notes; {p3} will follow up on action items. Please "
+            f"come prepared with your top concerns + suggested mitigations."
+        )
+    elif kind == "logistics":
+        room = f"{project.lower()}-{team.split('-')[0]}"
+        pin = rng.randint(100000, 999999)
+        section = (
+            f"\n\nLogistics:\n"
+            f"- Dial-in: meet.example-corp.com/{room}\n"
+            f"- Backup line: +1-555-0100 PIN {pin}\n"
+            f"- Recording: enabled, will be shared in #{team}"
+        )
+    return section
+def gen_event_content(rng: random.Random, target_len: int, ver: str) -> str:
+    """Produce calendar-event text: a title line, a blank line, a description.
+    Optional sections (agenda, attendees, pre-read, context, logistics) are
+    then appended in random order, each at most once, until the text reaches
+    95% of ``target_len``. The result is truncated to ``target_len``."""
+    p1 = rng.choice(FIRST_NAMES)
+    p2 = rng.choice(FIRST_NAMES)
+    p3 = rng.choice(FIRST_NAMES)
+    project = rng.choice(PROJECTS)
+    company = rng.choice(COMPANIES)
+    team = rng.choice(TEAMS)
+    place = rng.choice(["Room Apollo", "Room Beacon", "Zoom", "Google Meet",
+                        "Office HQ", "Lab 2", "Conf Rm 4"])
+    our_company = rng.choice(COMPANIES).lower().replace(" ", "")
+    slug = "-".join(rng.choices(PROJECTS, k=2)).lower()
+    title_template = rng.choice(EVENT_TITLE_TEMPLATES)
+    title = title_template.format(
+        p1=p1, p2=p2, project=project, company=company, team=team, ver=ver
+    )
+    desc_template = rng.choice(EVENT_DESC_TEMPLATES)
+    desc = desc_template.format(
+        p1=p1, p2=p2, p3=p3, project=project, company=company, team=team,
+        place=place, slug=slug, our_company=our_company,
+    )
+    text = title + "\n\n" + desc
+    # Each kind appears exactly once in the shuffled list, so no section can
+    # be repeated.
+    kinds = ["agenda", "attendees", "pre_read", "context", "logistics"]
+    rng.shuffle(kinds)
+    stop_at = target_len * 0.95
+    for kind in kinds:
+        if len(text) >= stop_at:
+            break
+        text += _event_section(rng, kind, project, company, team, p1, p2, p3)
+    return text[:target_len]
+# ---------------------------------------------------------------------------
+# Record builders. Output shape matches the events table columns:
+#   event_id, arena, source_kind, source_id, emitted_at, content,
+#   content_hash, participant_set (array), participant_kind, attributes (jsonb)
+# ---------------------------------------------------------------------------
+def content_hash(arena: str, content: str) -> str:
+    """Return the hex SHA-256 of ``arena``, a colon and ``content`` as UTF-8.
+    Per the schema note in this file, the value fills the `content_hash`
+    column used for idempotent upsert and dedup."""
+    payload = "{}:{}".format(arena, content).encode("utf-8")
+    digest = hashlib.sha256(payload)
+    return digest.hexdigest()
+def build_chat_record(rng: random.Random, arena: str, start: datetime,
+                      span_days: int, thread_id: str | None = None,
+                      author_pool: list[tuple[str, str, str]] | None = None) -> dict:
+    """Build one synthetic chat row shaped like the events table.
+    Content length, direction and timestamp are drawn using
+    DISTRIBUTIONS["chat"]. The author is taken from ``author_pool`` when one
+    is given (and non-empty); otherwise a fresh (first, last, slack-id)
+    triple is drawn. ``thread_id`` is stored as-is in the attributes.
+    ``event_id`` is derived from ``rng`` instead of ``uuid.uuid4()`` so that
+    the same seed reproduces the same ids.
+    """
+    dist = DISTRIBUTIONS["chat"]
+    length = piecewise_sample(rng, dist["content_len_percentiles"])
+    content = gen_chat_content(rng, length)
+    is_outbound = rng.random() >= dist["p_inbound"]
+    direction = "outbound" if is_outbound else "inbound"
+    author = rng.choice(author_pool) if author_pool else (
+        rng.choice(FIRST_NAMES), rng.choice(LAST_NAMES), synth_slack_id(rng)
+    )
+    author_first, author_last, author_id = author
+    team = rng.choice(TEAMS)
+    timestamp = sample_timestamp(rng, start, span_days, dist["hour_weights"])
+    # "<epoch seconds with millis>.<6 random digits>"
+    src_id = f"{int(timestamp.timestamp() * 1000) / 1000:.3f}.{rng.randint(100000, 999999)}"
+    emitted_at = timestamp.isoformat().replace("+00:00", "Z")
+    attrs = {
+        "kind": "chat",
+        "team": team,
+        "source": "synthetic-slack",
+        "channel": "slack",
+        "author": author_id,
+        "doc_type": "chat",
+        "direction": direction,
+        "source_id": src_id,
+        "thread_id": thread_id,
+        "timestamp": emitted_at,
+        "event_type": "STORE_MEMORY",
+        "layer_type": "episodic",
+        "entity_type": "conversation",
+        "memory_kind": "chat",
+        "contact_name": f"{author_first} {author_last}",
+        "contact_email": synth_email(rng, author_first, author_last),
+        "source_system": "synthetic-slack-ingest",
+    }
+    # Seed-reproducible version-4 UUID built from 128 bits of the shared RNG.
+    event_id = str(uuid.UUID(int=rng.getrandbits(128), version=4))
+    return {
+        "event_id": event_id,
+        "arena": arena,
+        "source_kind": "chat",
+        "source_id": src_id,
+        "emitted_at": emitted_at,
+        "content": content,
+        "content_hash": content_hash(arena, content),
+        "participant_set": [arena],
+        "participant_kind": "unknown",
+        "attributes": attrs,
+    }
+def build_note_record(rng: random.Random, arena: str, start: datetime,
+                      span_days: int, thread_id: str | None = None,
+                      author_pool: list[tuple[str, str, str]] | None = None,
+                      arena_owner_email: str = "user@example-corp.com") -> dict:
+    """Build one synthetic note (email) row shaped like the events table.
+    Body length, direction and timestamp are drawn using
+    DISTRIBUTIONS["note"]. The author is taken from ``author_pool`` when one
+    is given (and non-empty), otherwise drawn fresh. The subject is generated
+    independently of the body and stored only in the attributes.
+    ``source_id`` and ``event_id`` are derived from ``rng`` instead of
+    ``uuid.uuid4()`` so that the same seed reproduces the same ids.
+    """
+    dist = DISTRIBUTIONS["note"]
+    length = piecewise_sample(rng, dist["content_len_percentiles"])
+    author = rng.choice(author_pool) if author_pool else (
+        rng.choice(FIRST_NAMES), rng.choice(LAST_NAMES), synth_slack_id(rng)
+    )
+    author_first, author_last, _ = author
+    author_email = synth_email(rng, author_first, author_last)
+    author_name = f"{author_first} {author_last}"
+    content = gen_email_content(rng, length, author_name)
+    is_outbound = rng.random() >= dist["p_inbound"]
+    direction = "outbound" if is_outbound else "inbound"
+    timestamp = sample_timestamp(rng, start, span_days, dist["hour_weights"])
+    emitted_at = timestamp.isoformat().replace("+00:00", "Z")
+    # Message-ID style identifier, reproducible under the seed.
+    src_id = f"<{uuid.UUID(int=rng.getrandbits(128), version=4).hex}@example-corp.com>"
+    # Subject is part of email metadata, generated separately from body
+    subject = rng.choice(EMAIL_SUBJECT_TEMPLATES).format(
+        project=rng.choice(PROJECTS),
+        company=rng.choice(COMPANIES),
+        day=rng.choice(["Monday", "Friday", "this week"]),
+        our_company=rng.choice(COMPANIES).lower().replace(" ", ""),
+    )
+    attrs = {
+        "kind": "note",
+        "source": "synthetic-gmail",
+        "channel": "email",
+        "doc_type": "email",
+        "subject": subject,
+        "from_email": author_email,
+        "from_name": author_name,
+        # NOTE(review): outbound mail ends up with to_email == from_email;
+        # confirm whether the arena owner should be the sender in that case.
+        "to_email": arena_owner_email if direction == "inbound" else author_email,
+        "direction": direction,
+        "source_id": src_id,
+        "thread_id": thread_id,
+        "timestamp": emitted_at,
+        "event_type": "STORE_MEMORY",
+        "memory_kind": "note",
+        "source_system": "synthetic-gmail-ingest",
+    }
+    return {
+        "event_id": str(uuid.UUID(int=rng.getrandbits(128), version=4)),
+        "arena": arena,
+        "source_kind": "note",
+        "source_id": src_id,
+        "emitted_at": emitted_at,
+        "content": content,
+        "content_hash": content_hash(arena, content),
+        "participant_set": [arena],
+        "participant_kind": "unknown",
+        "attributes": attrs,
+    }
+def build_event_record(rng: random.Random, arena: str, start: datetime,
+                       span_days: int) -> dict:
+    """Build one synthetic calendar-event row shaped like the events table.
+    Content length and timestamp are drawn using DISTRIBUTIONS["event"]. The
+    "title" attribute is the first line of the content, capped at 200
+    characters. ``source_id`` and ``event_id`` are derived from ``rng``
+    instead of ``uuid.uuid4()`` so that the same seed reproduces the same ids.
+    """
+    dist = DISTRIBUTIONS["event"]
+    length = piecewise_sample(rng, dist["content_len_percentiles"])
+    ver = f"{rng.randint(0,3)}.{rng.randint(0,12)}.{rng.randint(0,9)}"
+    content = gen_event_content(rng, length, ver=ver)
+    timestamp = sample_timestamp(rng, start, span_days, dist["hour_weights"])
+    emitted_at = timestamp.isoformat().replace("+00:00", "Z")
+    # 32 hex chars, mimics google calendar event id; reproducible under the seed
+    src_id = uuid.UUID(int=rng.getrandbits(128), version=4).hex
+    attrs = {
+        "kind": "event",
+        "source": "synthetic-gcal",
+        "doc_type": "calendar",
+        "calendar": "primary",
+        "title": content.split("\n", 1)[0][:200],
+        "duration_minutes": rng.choice([15, 30, 45, 60, 90]),
+        "source_id": src_id,
+        "timestamp": emitted_at,
+        "event_type": "STORE_MEMORY",
+        "memory_kind": "event",
+        "source_system": "synthetic-gcal-ingest",
+    }
+    return {
+        "event_id": str(uuid.UUID(int=rng.getrandbits(128), version=4)),
+        "arena": arena,
+        "source_kind": "event",
+        "source_id": src_id,
+        "emitted_at": emitted_at,
+        "content": content,
+        "content_hash": content_hash(arena, content),
+        "participant_set": [arena],
+        "participant_kind": "unknown",
+        "attributes": attrs,
+    }
+# ---------------------------------------------------------------------------
+# Thread orchestration. The real corpus has 60% of chats in threads with
+# avg depth 4.5, so we plan thread groups up front then assign messages.
+# ---------------------------------------------------------------------------
+def sample_thread_depth(rng: random.Random, percentiles: list[int]) -> int:
+    """Draw a thread depth (messages per thread) from four percentile knots.
+    ``percentiles`` is ``[p50, p90, p99, max]``. One uniform draw picks the
+    segment (below p50, p50-p90, p90-p99, above p99) and the depth is
+    interpolated linearly inside it, truncated to int and floored at 1. The
+    lowest segment runs from 1 up to p50."""
+    p50, p90, p99, pmax = percentiles
+    u = rng.random()
+    if u <= 0.50:
+        depth = 1 + u * 2 * (p50 - 1)
+        return max(1, int(depth))
+    if u <= 0.90:
+        depth = p50 + (u - 0.50) / 0.40 * (p90 - p50)
+        return max(1, int(depth))
+    if u <= 0.99:
+        depth = p90 + (u - 0.90) / 0.09 * (p99 - p90)
+        return max(1, int(depth))
+    depth = p99 + (u - 0.99) / 0.01 * (pmax - p99)
+    return max(1, int(depth))
+def plan_threads(rng: random.Random, total: int, pct_threaded: float,
+                 depth_percentiles: list[int]) -> tuple[list[str | None], int]:
+    """Returns (thread_id_per_record, n_threads_created).
+    Each of the ``total`` slots gets either a thread id (the record belongs
+    to that thread) or None (standalone record). About
+    ``int(total * pct_threaded)`` slots are threaded; thread sizes come from
+    `sample_thread_depth` and are capped by the remaining budget and by the
+    max entry of ``depth_percentiles``. The slots are shuffled before being
+    returned.
+    Thread ids are derived from ``rng`` rather than ``uuid.uuid4()`` so that
+    the same seed reproduces the same ids.
+    """
+    if pct_threaded == 0.0 or depth_percentiles is None:
+        return [None] * total, 0
+    assignments: list[str | None] = []
+    threads_created = 0
+    remaining = int(total * pct_threaded)
+    max_depth = depth_percentiles[-1]
+    while remaining > 0:
+        depth = min(sample_thread_depth(rng, depth_percentiles), remaining, max_depth)
+        # 48 random bits -> 12 hex digits, same shape as the former uuid4 prefix.
+        tid = f"thread-{rng.getrandbits(48):012x}"
+        assignments.extend([tid] * depth)
+        threads_created += 1
+        remaining -= depth
+    # Pad with standalone (None) records
+    assignments.extend([None] * (total - len(assignments)))
+    rng.shuffle(assignments)
+    return assignments[:total], threads_created
+# ---------------------------------------------------------------------------
+# Main entry point
+# ---------------------------------------------------------------------------
+def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
+    """Parse the command line (``argv``, or ``sys.argv[1:]`` when None).
+    The module docstring is shown verbatim as the --help description. Only
+    --output is required."""
+    parser = argparse.ArgumentParser(
+        description=__doc__,
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+    )
+    # The three per-kind record counts share type and default.
+    count_options = (
+        ("--chat", "number of chat records"),
+        ("--note", "number of note (email) records"),
+        ("--event", "number of event (cal) records"),
+    )
+    for flag, help_text in count_options:
+        parser.add_argument(flag, type=int, default=700, help=help_text)
+    parser.add_argument(
+        "--arena",
+        default=None,
+        help="synthetic tenant arena id (default: synthetic-tenant:<deterministic-uuid>)",
+    )
+    parser.add_argument(
+        "--arena-owner-email",
+        default="user@example-corp.com",
+        help="arena owner email (for inbound/outbound to_email)",
+    )
+    parser.add_argument(
+        "--span-days",
+        type=int,
+        default=10,
+        help="time range over which to spread synthetic events (real data spans ~10d)",
+    )
+    parser.add_argument("--seed", type=int, default=42,
+                        help="RNG seed for reproducibility")
+    parser.add_argument(
+        "--output",
+        type=Path,
+        required=True,
+        help="output directory (will be created if missing)",
+    )
+    return parser.parse_args(argv)
+def _write_jsonl(path: Path, records) -> None:
+    """Write each record from the iterable *records* to *path*, one JSON
+    object per line. The file is always UTF-8 with LF line endings, so
+    non-ASCII content (emoji, dashes) can be written on any locale and the
+    bytes, and therefore the checksums, do not depend on the platform."""
+    with path.open("w", encoding="utf-8", newline="\n") as f:
+        for rec in records:
+            f.write(json.dumps(rec, ensure_ascii=False) + "\n")
+def main(argv: list[str] | None = None) -> int:
+    """Generate chat.jsonl, note.jsonl, event.jsonl and manifest.json.
+    All random draws come from one ``random.Random(seed)`` in the order
+    chat, note, event. The time window is anchored at the current UTC time,
+    so timestamps differ between runs even with the same seed. Returns 0.
+    """
+    args = parse_args(argv)
+    rng = random.Random(args.seed)
+    output_dir = args.output.expanduser().resolve()
+    output_dir.mkdir(parents=True, exist_ok=True)
+    arena = args.arena or f"synthetic-tenant:{uuid.uuid5(uuid.NAMESPACE_DNS, f'synthetic-{args.seed}').hex}"
+    # Time window: real corpus spans 10 days; we anchor at "now - span" so
+    # timestamps look recent without being today-only.
+    end = datetime.now(timezone.utc)
+    start = end - timedelta(days=args.span_days)
+    # Build a small pool of plausible synthetic authors so the corpus has
+    # repeat senders (matching the real attribute_cardinality numbers).
+    # With fewer than 5 chat/note records the pool is empty and the record
+    # builders draw a fresh author per record instead.
+    author_pool = [
+        (rng.choice(FIRST_NAMES), rng.choice(LAST_NAMES), synth_slack_id(rng))
+        for _ in range(min(150, max(args.chat, args.note) // 5))
+    ]
+    manifest = {
+        "generated_at_utc": end.isoformat(),
+        "seed": args.seed,
+        "arena": arena,
+        "arena_owner_email": args.arena_owner_email,
+        "span_days": args.span_days,
+        "counts": {"chat": args.chat, "note": args.note, "event": args.event},
+        "distributions_source": "prod v2 org-model 2026-05-29 (aggregate only)",
+        "files": {},
+    }
+    # --- chat ---
+    chat_tids, chat_threads = plan_threads(
+        rng, args.chat,
+        DISTRIBUTIONS["chat"]["pct_threaded"],
+        DISTRIBUTIONS["chat"]["thread_depth_percentiles"],
+    )
+    chat_path = output_dir / "chat.jsonl"
+    # The generator is consumed lazily inside _write_jsonl, so records are
+    # built (and the RNG advanced) one at a time in file order.
+    _write_jsonl(chat_path, (
+        build_chat_record(rng, arena, start, args.span_days, tid, author_pool)
+        for tid in chat_tids
+    ))
+    manifest["files"]["chat.jsonl"] = {
+        "records": args.chat, "threads": chat_threads,
+        "sha256": _file_sha256(chat_path),
+    }
+    # --- note ---
+    note_tids, note_threads = plan_threads(
+        rng, args.note,
+        DISTRIBUTIONS["note"]["pct_threaded"],
+        DISTRIBUTIONS["note"]["thread_depth_percentiles"],
+    )
+    note_path = output_dir / "note.jsonl"
+    _write_jsonl(note_path, (
+        build_note_record(rng, arena, start, args.span_days, tid,
+                          author_pool, args.arena_owner_email)
+        for tid in note_tids
+    ))
+    manifest["files"]["note.jsonl"] = {
+        "records": args.note, "threads": note_threads,
+        "sha256": _file_sha256(note_path),
+    }
+    # --- event ---
+    event_path = output_dir / "event.jsonl"
+    _write_jsonl(event_path, (
+        build_event_record(rng, arena, start, args.span_days)
+        for _ in range(args.event)
+    ))
+    manifest["files"]["event.jsonl"] = {
+        "records": args.event, "threads": 0,
+        "sha256": _file_sha256(event_path),
+    }
+    manifest_path = output_dir / "manifest.json"
+    manifest_path.write_text(json.dumps(manifest, indent=2), encoding="utf-8")
+    total = args.chat + args.note + args.event
+    print(f"wrote {total} records to {output_dir}/")
+    for kind, info in manifest["files"].items():
+        print(f"  {kind}: {info['records']} records, {info['threads']} threads")
+    print(f"  manifest.json: arena={arena}")
+    return 0
+def _file_sha256(path: Path) -> str:
+    """Return the hex SHA-256 digest of the file at *path*, read in 64 KiB
+    chunks so large files are never loaded whole."""
+    digest = hashlib.sha256()
+    with path.open("rb") as stream:
+        while True:
+            block = stream.read(65536)
+            if not block:
+                break
+            digest.update(block)
+    return digest.hexdigest()
+if __name__ == "__main__":
+    # Run the CLI and use main()'s return value as the process exit status.
+    raise SystemExit(main())