@pentatonic-ai/ai-agent-sdk 0.9.6 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (127) hide show
  1. package/README.md +3 -3
  2. package/bin/cli.js +1 -1
  3. package/bin/commands/config.js +1 -1
  4. package/dist/index.cjs +1 -1
  5. package/dist/index.js +1 -1
  6. package/package.json +2 -2
  7. package/packages/doctor/src/checks/local-memory.js +2 -2
  8. package/packages/memory/README.md +2 -2
  9. package/packages/memory/openclaw-plugin/README.md +2 -2
  10. package/packages/memory/openclaw-plugin/openclaw.plugin.json +1 -1
  11. package/packages/memory/src/server.js +2 -2
  12. package/packages/memory-engine-v2/.env.example +30 -0
  13. package/packages/memory-engine-v2/README.md +125 -0
  14. package/packages/memory-engine-v2/compat/Dockerfile +11 -0
  15. package/packages/memory-engine-v2/compat/requirements.txt +6 -0
  16. package/packages/memory-engine-v2/compat/server.py +1047 -0
  17. package/packages/memory-engine-v2/docker-compose.aws.yml +78 -0
  18. package/packages/memory-engine-v2/docker-compose.yml +206 -0
  19. package/packages/memory-engine-v2/extractor-async/Dockerfile +14 -0
  20. package/packages/memory-engine-v2/extractor-async/confidence.py +62 -0
  21. package/packages/memory-engine-v2/extractor-async/noise_filter.py +144 -0
  22. package/packages/memory-engine-v2/extractor-async/requirements.txt +2 -0
  23. package/packages/memory-engine-v2/extractor-async/test_confidence.py +76 -0
  24. package/packages/memory-engine-v2/extractor-async/test_noise_filter.py +177 -0
  25. package/packages/memory-engine-v2/extractor-async/worker.py +797 -0
  26. package/packages/memory-engine-v2/extractor-sync/Dockerfile +11 -0
  27. package/packages/memory-engine-v2/extractor-sync/requirements.txt +4 -0
  28. package/packages/memory-engine-v2/extractor-sync/server.py +424 -0
  29. package/packages/memory-engine-v2/org-model/migrations/001_init.sql +390 -0
  30. package/packages/memory-engine-v2/tests/e2e_smoke.py +356 -0
  31. package/packages/memory-engine-v2/tests/fixtures/generate_synthetic_corpus.py +758 -0
  32. package/packages/memory-engine/.env.example +0 -13
  33. package/packages/memory-engine/MIGRATION.md +0 -219
  34. package/packages/memory-engine/README.md +0 -145
  35. package/packages/memory-engine/bench/README.md +0 -99
  36. package/packages/memory-engine/bench/scorecards-engine/agent-coding__pentatonic-baseline__20260427-142523.json +0 -1115
  37. package/packages/memory-engine/bench/scorecards-engine/chat-recall__pentatonic-baseline__20260427-142648.json +0 -819
  38. package/packages/memory-engine/bench/scorecards-engine/circular-economy__pentatonic-baseline__20260427-142757.json +0 -1278
  39. package/packages/memory-engine/bench/scorecards-engine/customer-support__pentatonic-baseline__20260427-142900.json +0 -1018
  40. package/packages/memory-engine/bench/scorecards-engine/marketplace-ops__pentatonic-baseline__20260427-142957.json +0 -1038
  41. package/packages/memory-engine/bench/scorecards-engine/product-catalogue__pentatonic-baseline__20260427-143122.json +0 -961
  42. package/packages/memory-engine/bench/scorecards-engine-via-docker/agent-coding__pentatonic-memory__20260427-161812.json +0 -1115
  43. package/packages/memory-engine/bench/scorecards-engine-via-docker/chat-recall__pentatonic-memory__20260427-161701.json +0 -819
  44. package/packages/memory-engine/bench/scorecards-engine-via-docker/circular-economy__pentatonic-memory__20260427-161713.json +0 -1278
  45. package/packages/memory-engine/bench/scorecards-engine-via-docker/customer-support__pentatonic-memory__20260427-161723.json +0 -1018
  46. package/packages/memory-engine/bench/scorecards-engine-via-docker/marketplace-ops__pentatonic-memory__20260427-161732.json +0 -1038
  47. package/packages/memory-engine/bench/scorecards-engine-via-docker/product-catalogue__pentatonic-memory__20260427-161741.json +0 -937
  48. package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/agent-coding__pentatonic-memory__20260427-184718.json +0 -1115
  49. package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/chat-recall__pentatonic-memory__20260427-184614.json +0 -819
  50. package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/circular-economy__pentatonic-memory__20260427-184809.json +0 -1278
  51. package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/customer-support__pentatonic-memory__20260427-184854.json +0 -1018
  52. package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/marketplace-ops__pentatonic-memory__20260427-184929.json +0 -1038
  53. package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/product-catalogue__pentatonic-memory__20260427-185015.json +0 -961
  54. package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/agent-coding__pentatonic-memory__20260427-175252.json +0 -1115
  55. package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/chat-recall__pentatonic-memory__20260427-175312.json +0 -819
  56. package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/circular-economy__pentatonic-memory__20260427-175335.json +0 -1278
  57. package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/customer-support__pentatonic-memory__20260427-175355.json +0 -1018
  58. package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/marketplace-ops__pentatonic-memory__20260427-175413.json +0 -1038
  59. package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/product-catalogue__pentatonic-memory__20260427-175430.json +0 -883
  60. package/packages/memory-engine/bench/scorecards-engine-via-shim/agent-coding__pentatonic-memory__20260427-155409.json +0 -1115
  61. package/packages/memory-engine/bench/scorecards-engine-via-shim/chat-recall__pentatonic-memory__20260427-155421.json +0 -819
  62. package/packages/memory-engine/bench/scorecards-engine-via-shim/circular-economy__pentatonic-memory__20260427-155433.json +0 -1278
  63. package/packages/memory-engine/bench/scorecards-engine-via-shim/customer-support__pentatonic-memory__20260427-155443.json +0 -1018
  64. package/packages/memory-engine/bench/scorecards-engine-via-shim/marketplace-ops__pentatonic-memory__20260427-155453.json +0 -1038
  65. package/packages/memory-engine/bench/scorecards-engine-via-shim/product-catalogue__pentatonic-memory__20260427-155503.json +0 -937
  66. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/agent-coding__pentatonic-memory-latest__20260427-145103.json +0 -1115
  67. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/agent-coding__pentatonic-memory__20260427-144909.json +0 -1115
  68. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/chat-recall__pentatonic-memory-latest__20260427-145153.json +0 -819
  69. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/chat-recall__pentatonic-memory__20260427-145120.json +0 -542
  70. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/circular-economy__pentatonic-memory-latest__20260427-145313.json +0 -1278
  71. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/circular-economy__pentatonic-memory__20260427-145207.json +0 -894
  72. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/customer-support__pentatonic-memory-latest__20260427-145412.json +0 -1018
  73. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/customer-support__pentatonic-memory__20260427-145327.json +0 -680
  74. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/marketplace-ops__pentatonic-memory-latest__20260427-145517.json +0 -1038
  75. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/marketplace-ops__pentatonic-memory__20260427-145422.json +0 -693
  76. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/product-catalogue__pentatonic-memory-latest__20260427-145616.json +0 -961
  77. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/product-catalogue__pentatonic-memory__20260427-145528.json +0 -727
  78. package/packages/memory-engine/compat/Dockerfile +0 -22
  79. package/packages/memory-engine/compat/server.py +0 -1255
  80. package/packages/memory-engine/docker-compose.test.yml +0 -59
  81. package/packages/memory-engine/docker-compose.yml +0 -255
  82. package/packages/memory-engine/engine/README.md +0 -52
  83. package/packages/memory-engine/engine/l2-hybridrag-proxy.py +0 -1543
  84. package/packages/memory-engine/engine/l5-comms-layer.py +0 -663
  85. package/packages/memory-engine/engine/l6-document-store.py +0 -1018
  86. package/packages/memory-engine/engine/services/_shared/__init__.py +0 -1
  87. package/packages/memory-engine/engine/services/_shared/embed_provider.py +0 -562
  88. package/packages/memory-engine/engine/services/l2/Dockerfile +0 -50
  89. package/packages/memory-engine/engine/services/l2/init_databases.py +0 -81
  90. package/packages/memory-engine/engine/services/l2/l2-hybridrag-proxy.py +0 -2721
  91. package/packages/memory-engine/engine/services/l5/Dockerfile +0 -11
  92. package/packages/memory-engine/engine/services/l5/l5-comms-layer.py +0 -808
  93. package/packages/memory-engine/engine/services/l6/Dockerfile +0 -30
  94. package/packages/memory-engine/engine/services/l6/l6-document-store.py +0 -1221
  95. package/packages/memory-engine/engine/services/nv-embed/Dockerfile +0 -28
  96. package/packages/memory-engine/engine/services/nv-embed/server.py +0 -152
  97. package/packages/memory-engine/pme_memory/__init__.py +0 -0
  98. package/packages/memory-engine/pme_memory/__main__.py +0 -129
  99. package/packages/memory-engine/pme_memory/artifacts.py +0 -95
  100. package/packages/memory-engine/pme_memory/embed.py +0 -74
  101. package/packages/memory-engine/pme_memory/health.py +0 -36
  102. package/packages/memory-engine/pme_memory/hygiene.py +0 -159
  103. package/packages/memory-engine/pme_memory/indexer.py +0 -200
  104. package/packages/memory-engine/pme_memory/needs.py +0 -55
  105. package/packages/memory-engine/pme_memory/provenance.py +0 -80
  106. package/packages/memory-engine/pme_memory/scoring.py +0 -168
  107. package/packages/memory-engine/pme_memory/search.py +0 -52
  108. package/packages/memory-engine/pme_memory/store.py +0 -86
  109. package/packages/memory-engine/pme_memory/synthesis.py +0 -114
  110. package/packages/memory-engine/pyproject.toml +0 -65
  111. package/packages/memory-engine/scripts/kg-extractor.py +0 -557
  112. package/packages/memory-engine/scripts/kg-preflexor-v2.py +0 -738
  113. package/packages/memory-engine/scripts/wipe-legacy-l3-entities.py +0 -128
  114. package/packages/memory-engine/tests/e2e_arena.sh +0 -259
  115. package/packages/memory-engine/tests/embed_stub/Dockerfile +0 -13
  116. package/packages/memory-engine/tests/embed_stub/server.py +0 -80
  117. package/packages/memory-engine/tests/test_aggregate.py +0 -333
  118. package/packages/memory-engine/tests/test_api_contract.sh +0 -57
  119. package/packages/memory-engine/tests/test_arena_safety.py +0 -232
  120. package/packages/memory-engine/tests/test_channel_stat_reader.py +0 -437
  121. package/packages/memory-engine/tests/test_channel_stat_rollups.py +0 -308
  122. package/packages/memory-engine/tests/test_compat_nv_embed_probe.py +0 -48
  123. package/packages/memory-engine/tests/test_embed_provider.py +0 -693
  124. package/packages/memory-engine/tests/test_l2_qmd_vec_search.py +0 -280
  125. package/packages/memory-engine/tests/test_l3_arena_isolation.py +0 -412
  126. package/packages/memory-engine/tests/test_l6_module_load.py +0 -84
  127. package/packages/memory-engine/tests/test_people_list_reader.py +0 -432
@@ -0,0 +1,758 @@
1
+ #!/usr/bin/env python3
2
+ """Generate synthetic memory-engine-v2 events from real-corpus distributions.
3
+
4
+ Outputs JSONL matching the events table schema. PII-free; deterministic
5
+ with --seed. Drop-in for tests/fixtures + extractor pipeline validation.
6
+
7
+ The distributions baked into DISTRIBUTIONS below were extracted from the
8
+ prod v2 org-model on 2026-05-29 via aggregate queries — counts and
9
+ percentiles only, no record content sampled. See the ENGINEERING_FIX
10
+ register for the methodology.
11
+
12
+ The synthetic content is deliberately seeded with named-entity fragments
13
+ (people, companies, projects, dates, decisions) so the v2 extractor
14
+ pipeline has meaningful targets — pure lorem-ipsum would leave the
15
+ extractor with nothing to extract.
16
+
17
+ Stdlib only. Set --use-faker to fall back to faker if installed (richer
18
+ names, but optional — not required for the script to run).
19
+
20
+ Usage
21
+ -----
22
+ python generate_synthetic_corpus.py \\
23
+ --chat 700 --note 700 --event 700 \\
24
+ --seed 42 \\
25
+ --output ~/dumps/synthetic-2026-05-28/
26
+
27
+ Output structure
28
+ ----------------
29
+ ~/dumps/synthetic-2026-05-28/
30
+ chat.jsonl # 700 records
31
+ note.jsonl # 700 records
32
+ event.jsonl # 700 records
33
+ manifest.json # generation parameters + checksums
34
+ """
35
+ from __future__ import annotations
36
+
37
+ import argparse
38
+ import hashlib
39
+ import json
40
+ import os
41
+ import random
42
+ import sys
43
+ import uuid
44
+ from datetime import datetime, timedelta, timezone
45
+ from pathlib import Path
46
+ from typing import Any
47
+
48
+
49
+ # ---------------------------------------------------------------------------
50
+ # Distributions extracted from prod v2 org-model 2026-05-29.
51
+ # Numbers only — no content was sampled. Each percentile is a character count
52
+ # of the `content` field; thread depth is measured in msgs/thread; etc.
53
+ # ---------------------------------------------------------------------------
54
+
55
+ # Hour-of-day weighting shared across kinds. The prod data shows a heavy
56
+ # ingest-batch spike at 17 UTC because emitted_at = ingest time, not send
57
+ # time. For synthetic data we want REAL send times distributed across the
58
+ # working day, so we use a 9am-6pm weighted bell.
59
+ _HOUR_WEIGHTS = [
60
+ 0.01, 0.01, 0.01, 0.01, 0.01, 0.02, # 0-5
61
+ 0.03, 0.05, 0.07, 0.09, 0.10, 0.10, # 6-11
62
+ 0.08, 0.09, 0.09, 0.08, 0.06, 0.04, # 12-17
63
+ 0.03, 0.01, 0.01, 0.01, 0.01, 0.01, # 18-23
64
+ ]
65
+
66
+ DISTRIBUTIONS: dict[str, dict[str, Any]] = {
67
+ "chat": {
68
+ # Percentiles for length(content) in chars (p10, p25, p50, p75, p90, p99, max)
69
+ "content_len_percentiles": [22, 37, 78, 190, 603, 1960, 4727],
70
+ "content_len_mean": 227,
71
+ # Threading: 60% of chats live in a thread; avg 4.5 msgs/thread
72
+ "pct_threaded": 0.60,
73
+ "thread_depth_percentiles": [3, 10, 25, 70], # p50, p90, p99, max
74
+ # Inbound vs outbound (real: 90/10)
75
+ "p_inbound": 0.90,
76
+ "hour_weights": _HOUR_WEIGHTS,
77
+ "distinct_teams": 83,
78
+ "distinct_channels": 2,
79
+ "distinct_authors": 128,
80
+ },
81
+ "note": {
82
+ # Email body — note the 2000-char ingest cap on max
83
+ "content_len_percentiles": [152, 195, 863, 1823, 1961, 1996, 2000],
84
+ "content_len_mean": 996,
85
+ "pct_threaded": 0.45,
86
+ "thread_depth_percentiles": [1, 1, 6, 57],
87
+ "p_inbound": 0.75,
88
+ "hour_weights": _HOUR_WEIGHTS,
89
+ "distinct_authors": 1109,
90
+ },
91
+ "event": {
92
+ # Calendar event — title + description
93
+ "content_len_percentiles": [108, 133, 308, 604, 1122, 1988, 1998],
94
+ "content_len_mean": 465,
95
+ "pct_threaded": 0.0,
96
+ "thread_depth_percentiles": None,
97
+ "p_inbound": None,
98
+ "hour_weights": _HOUR_WEIGHTS,
99
+ "distinct_authors": 110,
100
+ },
101
+ }
102
+
103
+
104
+ # ---------------------------------------------------------------------------
105
+ # Name + word pools — embedded so the script has zero runtime deps.
106
+ # Picked to be obviously-synthetic so generated content can't be confused
107
+ # with real corpus content.
108
+ # ---------------------------------------------------------------------------
109
+
110
+ FIRST_NAMES = [
111
+ "Avery", "Bailey", "Carson", "Drew", "Emerson", "Finley", "Greer",
112
+ "Harper", "Indie", "Jordan", "Kai", "Logan", "Morgan", "Noa", "Oakley",
113
+ "Parker", "Quinn", "Reese", "Sage", "Tatum", "Uma", "Vesper", "Wren",
114
+ "Xen", "Yara", "Zane", "Arden", "Blake", "Cassidy", "Devon", "Ellis",
115
+ ]
116
+ LAST_NAMES = [
117
+ "Sterling", "Holloway", "Whitfield", "Carrington", "Vance", "Ashford",
118
+ "Beaumont", "Caldwell", "Donovan", "Everhart", "Fairfax", "Granger",
119
+ "Hathaway", "Ingram", "Jasper", "Kingsley", "Lockwood", "Merritt",
120
+ "Northrop", "Ormsby", "Pemberton", "Quinton", "Radcliffe", "Sinclair",
121
+ "Thornton", "Underhill", "Valencia", "Westbrook", "Yardley", "Zamora",
122
+ ]
123
+ COMPANIES = [
124
+ "Aldera", "Brevix", "Calibrant", "Demarcation", "Evermere", "Fjordline",
125
+ "Glasshouse", "Hexalite", "Iridos", "Junctura", "Kestrel Works",
126
+ "Lumenfold", "Mistrell", "Norden", "Obsidiana", "Parallax Labs",
127
+ "Quillet", "Reflectory", "Silica Tide", "Tessera", "Umberhold",
128
+ "Verdant Loop", "Wayfound", "Xerelt", "Yarrow Forge", "Zincara",
129
+ ]
130
+ PROJECTS = [
131
+ "Beacon", "Cinder", "Ditto", "Eclipse", "Fathom", "Glide", "Hatch",
132
+ "Inkwell", "Juniper", "Keelhaul", "Loom", "Mosaic", "Notch", "Onyx",
133
+ "Polaris", "Quill", "Rover", "Slate", "Tundra", "Unify", "Voyage",
134
+ "Whisper", "Xenith", "Yonder", "Zephyr",
135
+ ]
136
+ TEAMS = [
137
+ "platform-core", "growth-experiments", "design-systems", "infra-edge",
138
+ "data-pipeline", "billing-and-ledger", "ml-foundations", "frontend-app",
139
+ "mobile-launch", "search-quality", "observability", "trust-and-safety",
140
+ "partner-integrations", "customer-success", "qa-automation",
141
+ "docs-and-developer-experience", "embedded-devices", "hardware-eng",
142
+ ]
143
+ EMOJI_REACTIONS = [
144
+ "👍", "🔥", "🚀", "👀", "🎉", "✅", "🙏", "💯", "👋", "🤔",
145
+ "💭", "💪", "🧠", "📌", "🎯", "🛠️", "📊", "📝", "🐛", "❤️",
146
+ ]
147
+
148
+ CHAT_TEMPLATES = [
149
+ "@{p1} did you see the {project} update?",
150
+ "yeah I'll pick up {project} by {day}",
151
+ "should we move the {project} review to {day}?",
152
+ "{p1} just merged the {project} fix",
153
+ "approving the {company} contract this week",
154
+ "any blockers on {project}?",
155
+ "shipped {project} to staging just now",
156
+ "can someone take a look at {project} when you have a sec",
157
+ "{p1} {emoji} that's exactly what I was thinking",
158
+ "let's chat about {project} after standup",
159
+ "I disagree — going with {company} would lock us in",
160
+ "decision: we'll use {project} as the canonical path",
161
+ "{p1} you free for 15 in the {team} room?",
162
+ "logged the {project} bug, ticket is in the channel",
163
+ "rollback completed for {project}, prod is stable",
164
+ ]
165
+
166
+ EMAIL_SUBJECT_TEMPLATES = [
167
+ "Re: {project} — {day} update",
168
+ "{company} renewal: action required",
169
+ "Follow-up: {project} review meeting",
170
+ "Quick note on {project}",
171
+ "{company} <> {our_company} partnership next steps",
172
+ "FYI: {project} deployment Friday",
173
+ "Decision needed: {project} approach",
174
+ "Heads up — {project} timeline update",
175
+ ]
176
+
177
+ EMAIL_BODY_FRAGMENTS = [
178
+ "Hi {p1},\n\nWanted to flag a few things on {project}:",
179
+ "Thanks for the call earlier. To recap what we agreed:",
180
+ "Quick note ahead of {day}'s {project} sync —",
181
+ "Following up on the {company} discussion.",
182
+ "Sharing notes from the {team} review.",
183
+ ]
184
+ EMAIL_BODY_MIDDLES = [
185
+ "1. The {project} scope is now locked for {day}.\n2. {p1} owns the implementation.\n3. {p2} will review the design doc by EOW.\n",
186
+ "We decided to go with {company} for the integration. The contract is in legal review and should close by {day}.\n",
187
+ "Open question: do we want {project} to ship under the {team} brand or its own surface?\n",
188
+ "Action items from the call:\n- {p1}: update the {project} migration plan\n- {p2}: draft the {company} announcement\n- {p3}: schedule the {team} sync\n",
189
+ "Risks I want to call out:\n- {project} depends on {company}'s release; if they slip we slip\n- {team} bandwidth is tight through {day}\n- The {project} rollback path needs a dry run\n",
190
+ ]
191
+ EMAIL_CLOSINGS = [
192
+ "\nLet me know if anything looks off.\n\nThanks,\n{author}",
193
+ "\nHappy to hop on a call to walk through this.\n\nBest,\n{author}",
194
+ "\nWill follow up after the {team} review.\n\n{author}",
195
+ "\nLet's regroup on {day}.\n\n{author}",
196
+ ]
197
+
198
+ EVENT_TITLE_TEMPLATES = [
199
+ "{project} weekly sync",
200
+ "{p1} <> {p2} 1:1",
201
+ "{company} contract review",
202
+ "{team} planning",
203
+ "Office hours: {project}",
204
+ "{project} retro",
205
+ "{p1} interview ({project} role)",
206
+ "Demo: {project} v{ver}",
207
+ "{team} standup",
208
+ "{company} kickoff call",
209
+ ]
210
+ EVENT_DESC_TEMPLATES = [
211
+ "Agenda:\n- {project} status\n- Open risks\n- Decision: {project} launch date\n\nAttendees: {p1}, {p2}, {p3}\nLocation: {place}",
212
+ "Walkthrough of the {project} design doc.\n\nDial-in: meet.{our_company}.com/{slug}\nNotes will be shared after.",
213
+ "Quarterly {team} planning. Bring your top 3 priorities for the next quarter.\n\nFacilitator: {p1}\nNote-taker: {p2}",
214
+ "Demo + Q&A for the {project} release.\n\nRecording will be posted in #{team}.\nLocation: {place}",
215
+ "Discussion with {company} on integration scope.\n\nOur side: {p1}, {p2}\nTheir side: TBD",
216
+ ]
217
+
218
+
219
+ # ---------------------------------------------------------------------------
220
+ # Sampling helpers
221
+ # ---------------------------------------------------------------------------
222
+
223
def piecewise_sample(rng: random.Random, percentiles: list[int],
                     boundaries: list[float] | None = None) -> int:
    """Sample an integer from a distribution described by percentile points.

    ``percentiles[i]`` is the value at cumulative probability
    ``boundaries[i]``. One uniform [0, 1) draw is taken from ``rng`` and the
    result is interpolated linearly between the two percentile points that
    bracket it, which yields a heavy-tailed shape without needing scipy.

    Args:
        rng: Random source; exactly one ``rng.random()`` draw is consumed.
        percentiles: Values at each boundary, in ascending boundary order.
        boundaries: Cumulative probabilities matching ``percentiles``
            one-to-one. Defaults to the seven-point grid
            ``[p10, p25, p50, p75, p90, p99, max]``. Pass an explicit grid
            for lists that use different points (for example a four-point
            ``[p50, p90, p99, max]`` list) so values are not paired with the
            wrong probabilities.

    Returns:
        The sampled value, never less than 1.

    Raises:
        ValueError: If ``percentiles`` is empty, or if an explicit
            ``boundaries`` has a different length than ``percentiles``.
    """
    if not percentiles:
        raise ValueError("percentiles must be non-empty")
    if boundaries is None:
        # Historical default grid; kept so existing callers see no change.
        boundaries = [0.10, 0.25, 0.50, 0.75, 0.90, 0.99, 1.00]
    elif len(boundaries) != len(percentiles):
        raise ValueError(
            "boundaries and percentiles must have the same length "
            f"({len(boundaries)} != {len(percentiles)})"
        )

    u = rng.random()
    # Synthetic lower edge at probability 0: a third of the first percentile
    # value, floored at 1 so the result is never zero.
    prev_b, prev_v = 0.0, max(1, percentiles[0] // 3)
    for b, v in zip(boundaries, percentiles):
        if u <= b:
            # Linear interpolation between (prev_b, prev_v) and (b, v).
            t = (u - prev_b) / (b - prev_b) if b > prev_b else 0
            return max(1, int(prev_v + t * (v - prev_v)))
        prev_b, prev_v = b, v
    # Only reached when the draw exceeds every boundary that was paired with
    # a value (e.g. the default grid used with a shorter percentile list).
    return percentiles[-1]
242
+
243
+
244
def weighted_hour(rng: random.Random, weights: list[float]) -> int:
    """Draw an hour of the day (0-23) with probability proportional to
    ``weights``, which supplies one relative weight per hour."""
    hours = range(24)
    (picked,) = rng.choices(hours, weights=weights, k=1)
    return picked
246
+
247
+
248
def sample_timestamp(rng: random.Random, start: datetime, span_days: int,
                     hour_weights: list[float]) -> datetime:
    """Pick a pseudo-random datetime roughly within
    [start, start + span_days), skewed to the hours favoured by
    ``hour_weights``.

    A uniformly drawn instant in the window fixes the calendar day; its
    time-of-day is then overwritten with a weighted hour and uniformly
    drawn minute, second and millisecond. Because the time-of-day is
    replaced, the result can land slightly outside the window when
    ``start`` is not at midnight.
    """
    # Draw order (day offset, hour, minute, second, millisecond) is part of
    # the seeded output and must not be rearranged.
    anchor = start + timedelta(days=rng.uniform(0, span_days))
    clock = {
        "hour": weighted_hour(rng, hour_weights),
        "minute": rng.randint(0, 59),
        "second": rng.randint(0, 59),
        # Millisecond resolution, stored in the microsecond field.
        "microsecond": rng.randint(0, 999) * 1000,
    }
    return anchor.replace(**clock)
261
+
262
+
263
def synth_email(rng: random.Random, first: str, last: str,
                domain: str = "example-corp.com") -> str:
    """Build a ``first.last@domain`` address from lower-cased name parts.

    ``rng`` is accepted for signature parity with the other synth helpers
    but is not used.
    """
    local_part = ".".join((first.lower(), last.lower()))
    return "@".join((local_part, domain))
266
+
267
+
268
def synth_slack_id(rng: random.Random) -> str:
    """Return a synthetic user id: the letter ``U`` followed by ten
    upper-case hexadecimal characters drawn from ``rng``."""
    alphabet = "0123456789ABCDEF"
    suffix = rng.choices(alphabet, k=10)
    return "U" + "".join(suffix)
270
+
271
+
272
+ # ---------------------------------------------------------------------------
273
+ # Content generators per source_kind. All produce content that:
274
+ # - contains named entities (people, companies, projects) for extractor
275
+ # - matches a target length approximately
276
+ # - is obviously-synthetic (no real-world specifics)
277
+ # ---------------------------------------------------------------------------
278
+
279
def gen_chat_content(rng: random.Random, target_len: int) -> str:
    """Produce a chat-style message of at most ``target_len`` characters.

    One template from CHAT_TEMPLATES is filled with randomly chosen named
    entities. While the text is shorter than the target, further filled
    templates are appended on new lines; when fewer than 30 characters
    remain, a single emoji is appended instead and extension stops. The
    result is finally cut to ``target_len``.
    """
    opening_days = ["Monday", "Tuesday", "Wednesday", "Thursday",
                    "Friday", "next week", "EOD", "tomorrow"]
    follow_up_days = ["Monday", "Tuesday", "Friday", "next sprint"]

    # Entity draws come before the template draw; this order is part of the
    # seeded output.
    fields = {
        "p1": rng.choice(FIRST_NAMES),
        "project": rng.choice(PROJECTS),
        "company": rng.choice(COMPANIES),
        "team": rng.choice(TEAMS),
        "day": rng.choice(opening_days),
        "emoji": rng.choice(EMOJI_REACTIONS),
    }
    text = rng.choice(CHAT_TEMPLATES).format(**fields)

    while len(text) < target_len:
        remaining = target_len - len(text)
        if remaining < 30:
            # Too little room for another sentence: pad with one emoji.
            text += " " + rng.choice(EMOJI_REACTIONS)
            break
        # For follow-ups the template is drawn first, then its entities.
        follow_up_template = rng.choice(CHAT_TEMPLATES)
        follow_up = follow_up_template.format(
            p1=rng.choice(FIRST_NAMES),
            project=rng.choice(PROJECTS),
            company=rng.choice(COMPANIES),
            team=rng.choice(TEAMS),
            day=rng.choice(follow_up_days),
            emoji=rng.choice(EMOJI_REACTIONS),
        )
        text = f"{text}\n{follow_up}"
    return text[:target_len]
307
+
308
+
309
def gen_email_content(rng: random.Random, target_len: int, author_name: str) -> str:
    """Produce an email body of at most ``target_len`` characters.

    The body is an opening fragment, then as many middle paragraphs as
    needed to reach 80% of the target length, then a closing signed with
    ``author_name``. Parts are joined with newlines and the result is cut to
    ``target_len``. The subject is not part of the body.
    """
    # Entity draws happen once, up front, and are reused by every part.
    p1, p2, p3 = (rng.choice(FIRST_NAMES) for _ in range(3))
    project = rng.choice(PROJECTS)
    company = rng.choice(COMPANIES)
    team = rng.choice(TEAMS)
    day = rng.choice(["Monday", "Wednesday", "Friday", "next week", "end of month"])

    opener = rng.choice(EMAIL_BODY_FRAGMENTS).format(
        p1=p1, project=project, company=company, team=team, day=day,
    )
    sections = [opener]
    written = len(opener)
    threshold = target_len * 0.8
    while written < threshold:
        middle = rng.choice(EMAIL_BODY_MIDDLES).format(
            p1=p1, p2=p2, p3=p3, project=project, company=company,
            team=team, day=day,
        )
        sections.append(middle)
        written += len(middle)

    closing = rng.choice(EMAIL_CLOSINGS).format(author=author_name, team=team, day=day)
    sections.append(closing)
    return "\n".join(sections)[:target_len]
335
+
336
+
337
+ def _event_section(rng: random.Random, kind: str, project: str, company: str,
338
+ team: str, p1: str, p2: str, p3: str) -> str:
339
+ """Build one additional section of an event description. Used to extend
340
+ a base event template to match a target length — real calendar events
341
+ often have long bodies (agendas, pre-reads, attendee notes)."""
342
+ if kind == "pre_read":
343
+ return (
344
+ f"\n\nPre-read:\n"
345
+ f"- Latest {project} status doc (linked in calendar)\n"
346
+ f"- {company} contract draft (shared earlier this week)\n"
347
+ f"- {team} OKRs deck"
348
+ )
349
+ if kind == "attendees":
350
+ more = ", ".join(rng.sample(FIRST_NAMES, k=rng.randint(4, 8)))
351
+ return f"\n\nExpected attendees: {p1}, {p2}, {p3}, {more}"
352
+ if kind == "agenda":
353
+ items = rng.sample([
354
+ f"{project} status update",
355
+ f"Risks and dependencies on {company}",
356
+ f"{team} headcount + hiring",
357
+ f"Demo of latest {project} build",
358
+ f"Q&A and open discussion",
359
+ f"Action items + owners",
360
+ f"Decision: {project} launch date",
361
+ f"Review of last week's action items",
362
+ f"Customer feedback ({company} pilot)",
363
+ f"Roadmap alignment with {team}",
364
+ ], k=rng.randint(3, 6))
365
+ return "\n\nAgenda:\n" + "\n".join(f"- {x}" for x in items)
366
+ if kind == "context":
367
+ return (
368
+ f"\n\nContext: We're aligning on {project} ahead of the {team} "
369
+ f"review next week. {p1} will drive the discussion; {p2} is "
370
+ f"taking notes; {p3} will follow up on action items. Please "
371
+ f"come prepared with your top concerns + suggested mitigations."
372
+ )
373
+ if kind == "logistics":
374
+ return (
375
+ f"\n\nLogistics:\n"
376
+ f"- Dial-in: meet.example-corp.com/{project.lower()}-{team.split('-')[0]}\n"
377
+ f"- Backup line: +1-555-0100 PIN {rng.randint(100000, 999999)}\n"
378
+ f"- Recording: enabled, will be shared in #{team}"
379
+ )
380
+ return ""
381
+
382
+
383
def gen_event_content(rng: random.Random, target_len: int, ver: str) -> str:
    """Produce calendar-event text (title, blank line, description) of at
    most ``target_len`` characters.

    A title and a description template are filled with randomly chosen
    named entities. Extra sections (agenda / attendees / pre-read / context
    / logistics) are then appended in a shuffled order, each at most once,
    until the text reaches 95% of the target length or the sections run
    out. The result is finally cut to ``target_len``.

    Args:
        rng: Random source for every draw.
        target_len: Maximum length of the returned text, in characters.
        ver: Version string substituted into templates that use ``{ver}``.
    """
    # The order of the draws below is part of the seeded output.
    p1, p2, p3 = (rng.choice(FIRST_NAMES) for _ in range(3))
    project = rng.choice(PROJECTS)
    company = rng.choice(COMPANIES)
    team = rng.choice(TEAMS)
    place = rng.choice(["Room Apollo", "Room Beacon", "Zoom", "Google Meet",
                        "Office HQ", "Lab 2", "Conf Rm 4"])
    # Hostname-safe form of a company name, for the dial-in URL templates.
    our_company = rng.choice(COMPANIES).lower().replace(" ", "")
    slug = "-".join(rng.choices(PROJECTS, k=2)).lower()
    title = rng.choice(EVENT_TITLE_TEMPLATES).format(
        p1=p1, p2=p2, project=project, company=company, team=team, ver=ver
    )
    desc = rng.choice(EVENT_DESC_TEMPLATES).format(
        p1=p1, p2=p2, p3=p3, project=project, company=company, team=team,
        place=place, slug=slug, our_company=our_company,
    )
    full = f"{title}\n\n{desc}"

    # Each kind appears exactly once in the shuffled list, so iterating it
    # already guarantees a section is never repeated.
    section_order = ["agenda", "attendees", "pre_read", "context", "logistics"]
    rng.shuffle(section_order)
    for kind in section_order:
        if len(full) >= target_len * 0.95:
            break
        full += _event_section(rng, kind, project, company, team, p1, p2, p3)
    return full[:target_len]
418
+
419
+
420
+ # ---------------------------------------------------------------------------
421
+ # Record builders. Output shape matches the events table columns:
422
+ # event_id, arena, source_kind, source_id, emitted_at, content,
423
+ # content_hash, participant_set (array), participant_kind, attributes (jsonb)
424
+ # ---------------------------------------------------------------------------
425
+
426
def content_hash(arena: str, content: str) -> str:
    """Return the hex SHA-256 digest of ``arena``, a colon, and ``content``
    encoded as UTF-8. Intended as the record's `content_hash` value for
    idempotent upsert and dedup."""
    payload = "{}:{}".format(arena, content).encode("utf-8")
    digest = hashlib.sha256(payload)
    return digest.hexdigest()
431
+
432
+
433
def build_chat_record(rng: random.Random, arena: str, start: datetime,
                      span_days: int, thread_id: str | None = None,
                      author_pool: list[tuple[str, str, str]] | None = None) -> dict:
    """Build one synthetic chat record.

    Args:
        rng: Random source for every draw, including the event id, so a
            seeded run reproduces identical records.
        arena: Arena identifier; also used as the sole participant and as
            the content-hash prefix.
        start: Start of the sampling window for the timestamp.
        span_days: Length of the sampling window in days.
        thread_id: Thread identifier stored in the attributes, or None.
        author_pool: Optional ``(first, last, id)`` tuples to pick the
            author from; when empty or None a fresh author is synthesised.

    Returns:
        A dict with the keys event_id, arena, source_kind, source_id,
        emitted_at, content, content_hash, participant_set,
        participant_kind and attributes.
    """
    dist = DISTRIBUTIONS["chat"]
    length = piecewise_sample(rng, dist["content_len_percentiles"])
    content = gen_chat_content(rng, length)

    is_outbound = rng.random() >= dist["p_inbound"]
    direction = "outbound" if is_outbound else "inbound"

    author = rng.choice(author_pool) if author_pool else (
        rng.choice(FIRST_NAMES), rng.choice(LAST_NAMES), synth_slack_id(rng)
    )
    author_first, author_last, author_id = author
    team = rng.choice(TEAMS)
    timestamp = sample_timestamp(rng, start, span_days, dist["hour_weights"])
    # Epoch seconds with millisecond precision, then a six-digit suffix.
    src_id = f"{int(timestamp.timestamp() * 1000) / 1000:.3f}.{rng.randint(100000, 999999)}"
    # Formatted once and shared by `emitted_at` and attributes["timestamp"].
    emitted_at = timestamp.isoformat().replace("+00:00", "Z")

    attrs = {
        "kind": "chat",
        "team": team,
        "source": "synthetic-slack",
        "channel": "slack",
        "author": author_id,
        "doc_type": "chat",
        "direction": direction,
        "source_id": src_id,
        "thread_id": thread_id,
        "timestamp": emitted_at,
        "event_type": "STORE_MEMORY",
        "layer_type": "episodic",
        "entity_type": "conversation",
        "memory_kind": "chat",
        "contact_name": f"{author_first} {author_last}",
        "contact_email": synth_email(rng, author_first, author_last),
        "source_system": "synthetic-slack-ingest",
    }
    # A version-4 UUID built from the seeded generator: uuid.uuid4() reads
    # OS entropy and would make event ids differ between runs with the same
    # seed. Drawn last so every earlier draw keeps its position.
    event_id = str(uuid.UUID(int=rng.getrandbits(128), version=4))
    return {
        "event_id": event_id,
        "arena": arena,
        "source_kind": "chat",
        "source_id": src_id,
        "emitted_at": emitted_at,
        "content": content,
        "content_hash": content_hash(arena, content),
        "participant_set": [arena],
        "participant_kind": "unknown",
        "attributes": attrs,
    }
483
+
484
+
485
def build_note_record(rng: random.Random, arena: str, start: datetime,
                      span_days: int, thread_id: str | None = None,
                      author_pool: list[tuple[str, str, str]] | None = None,
                      arena_owner_email: str = "user@example-corp.com") -> dict:
    """Build one synthetic note (email) record as an events-row dict.

    Args:
        rng: Random source for every sampled field except the two UUIDs.
        arena: Arena id; stored on the record and used as the sole participant.
        start: Passed to `sample_timestamp` together with `span_days`.
        span_days: Passed to `sample_timestamp` together with `start`.
        thread_id: Thread this email belongs to, or None if standalone.
        author_pool: Optional `(first, last, slack_id)` tuples to draw the
            author from; when empty or None a fresh author is synthesised.
        arena_owner_email: Address used as `to_email` on inbound records.

    Returns:
        Dict with event_id, arena, source_kind, source_id, emitted_at,
        content, content_hash, participant_set, participant_kind, attributes.
    """
    profile = DISTRIBUTIONS["note"]
    length = piecewise_sample(rng, profile["content_len_percentiles"])

    if author_pool:
        picked = rng.choice(author_pool)
    else:
        picked = (rng.choice(FIRST_NAMES), rng.choice(LAST_NAMES), synth_slack_id(rng))
    first, last, _ = picked
    author_email = synth_email(rng, first, last)
    author_name = f"{first} {last}"

    body = gen_email_content(rng, length, author_name)

    direction = "outbound" if rng.random() >= profile["p_inbound"] else "inbound"
    when = sample_timestamp(rng, start, span_days, profile["hour_weights"])
    stamp = when.isoformat().replace("+00:00", "Z")
    src_id = f"<{uuid.uuid4().hex}@example-corp.com>"

    # The subject is metadata, generated independently of the body. The
    # template is drawn first, then each placeholder value in this order.
    template = rng.choice(EMAIL_SUBJECT_TEMPLATES)
    subject_fields = {
        "project": rng.choice(PROJECTS),
        "company": rng.choice(COMPANIES),
        "day": rng.choice(["Monday", "Friday", "this week"]),
        "our_company": rng.choice(COMPANIES).lower().replace(" ", ""),
    }
    subject = template.format(**subject_fields)

    # NOTE(review): on outbound records the recipient is the author's own
    # address, so from_email == to_email. Confirm whether the sender should be
    # arena_owner_email in that case.
    recipient = arena_owner_email if direction == "inbound" else author_email

    attrs = {
        "kind": "note",
        "source": "synthetic-gmail",
        "channel": "email",
        "doc_type": "email",
        "subject": subject,
        "from_email": author_email,
        "from_name": author_name,
        "to_email": recipient,
        "direction": direction,
        "source_id": src_id,
        "thread_id": thread_id,
        "timestamp": stamp,
        "event_type": "STORE_MEMORY",
        "memory_kind": "note",
        "source_system": "synthetic-gmail-ingest",
    }
    return {
        "event_id": str(uuid.uuid4()),
        "arena": arena,
        "source_kind": "note",
        "source_id": src_id,
        "emitted_at": stamp,
        "content": body,
        "content_hash": content_hash(arena, body),
        "participant_set": [arena],
        "participant_kind": "unknown",
        "attributes": attrs,
    }
543
+
544
+
545
def build_event_record(rng: random.Random, arena: str, start: datetime,
                       span_days: int) -> dict:
    """Build one synthetic calendar-event record as an events-row dict.

    Args:
        rng: Random source for every sampled field except the two UUIDs.
        arena: Arena id; stored on the record and used as the sole participant.
        start: Passed to `sample_timestamp` together with `span_days`.
        span_days: Passed to `sample_timestamp` together with `start`.

    Returns:
        Dict with event_id, arena, source_kind, source_id, emitted_at,
        content, content_hash, participant_set, participant_kind, attributes.
    """
    profile = DISTRIBUTIONS["event"]
    length = piecewise_sample(rng, profile["content_len_percentiles"])

    # Random "major.minor.patch" string handed to the content generator.
    ver = ".".join(str(rng.randint(0, upper)) for upper in (3, 12, 9))
    body = gen_event_content(rng, length, ver=ver)

    when = sample_timestamp(rng, start, span_days, profile["hour_weights"])
    stamp = when.isoformat().replace("+00:00", "Z")
    src_id = uuid.uuid4().hex  # mimics google calendar event id

    # Title is the first line of the generated content, capped at 200 chars.
    title = body.partition("\n")[0][:200]

    attrs = {
        "kind": "event",
        "source": "synthetic-gcal",
        "doc_type": "calendar",
        "calendar": "primary",
        "title": title,
        "duration_minutes": rng.choice([15, 30, 45, 60, 90]),
        "source_id": src_id,
        "timestamp": stamp,
        "event_type": "STORE_MEMORY",
        "memory_kind": "event",
        "source_system": "synthetic-gcal-ingest",
    }
    return {
        "event_id": str(uuid.uuid4()),
        "arena": arena,
        "source_kind": "event",
        "source_id": src_id,
        "emitted_at": stamp,
        "content": body,
        "content_hash": content_hash(arena, body),
        "participant_set": [arena],
        "participant_kind": "unknown",
        "attributes": attrs,
    }
580
+
581
+
582
+ # ---------------------------------------------------------------------------
583
+ # Thread orchestration. The real corpus has 60% of chats in threads with
584
+ # avg depth 4.5, so we plan thread groups up front then assign messages.
585
+ # ---------------------------------------------------------------------------
586
+
587
def sample_thread_depth(rng: random.Random, percentiles: list[int]) -> int:
    """Draw a thread depth (messages per thread) from `[p50, p90, p99, max]`.

    One uniform draw `u` picks a quantile segment — [0, .50], (.50, .90],
    (.90, .99] or (.99, 1) — and the depth is linearly interpolated between
    that segment's endpoints, starting from a depth of 1 at u = 0. The result
    is truncated to an int and never falls below 1.

    This is separate from the piecewise sampler used for content lengths
    because thread depths cluster at the low end: p50 is often 1 or 3 even
    when the maximum is in the tens.
    """
    p50, p90, p99, pmax = percentiles
    draw = rng.random()

    # (inclusive upper quantile, quantile at segment start, segment width,
    #  depth at segment start, depth at segment end)
    segments = (
        (0.50, 0.0, 0.50, 1, p50),
        (0.90, 0.50, 0.40, p50, p90),
        (0.99, 0.90, 0.09, p90, p99),
    )
    for upper, q_start, width, d_start, d_end in segments:
        if draw <= upper:
            break
    else:
        # Top 1% tail: interpolate from p99 up to the maximum.
        q_start, width, d_start, d_end = 0.99, 0.01, p99, pmax

    return max(1, int(d_start + (draw - q_start) / width * (d_end - d_start)))
604
+
605
+
606
+ def plan_threads(rng: random.Random, total: int, pct_threaded: float,
607
+ depth_percentiles: list[int]) -> tuple[list[str | None], int]:
608
+ """Returns (thread_id_per_record, n_threads_created).
609
+
610
+ Each record gets either a thread_id (if part of a thread) or None.
611
+ Threads are sized by sampling the depth percentile distribution via
612
+ `sample_thread_depth`, which respects the heavy left-skew of real
613
+ thread-depth distributions.
614
+ """
615
+ if pct_threaded == 0.0 or depth_percentiles is None:
616
+ return [None] * total, 0
617
+
618
+ target_threaded = int(total * pct_threaded)
619
+ thread_assignments: list[str | None] = []
620
+ threads_created = 0
621
+ remaining_threaded = target_threaded
622
+
623
+ while remaining_threaded > 0:
624
+ depth = sample_thread_depth(rng, depth_percentiles)
625
+ depth = min(depth, remaining_threaded, depth_percentiles[-1])
626
+ tid = f"thread-{uuid.uuid4().hex[:12]}"
627
+ for _ in range(depth):
628
+ thread_assignments.append(tid)
629
+ threads_created += 1
630
+ remaining_threaded -= depth
631
+
632
+ # Pad with standalone (None) records
633
+ while len(thread_assignments) < total:
634
+ thread_assignments.append(None)
635
+
636
+ rng.shuffle(thread_assignments)
637
+ return thread_assignments[:total], threads_created
638
+
639
+
640
+ # ---------------------------------------------------------------------------
641
+ # Main entry point
642
+ # ---------------------------------------------------------------------------
643
+
644
+ def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
645
+ p = argparse.ArgumentParser(description=__doc__,
646
+ formatter_class=argparse.RawDescriptionHelpFormatter)
647
+ p.add_argument("--chat", type=int, default=700, help="number of chat records")
648
+ p.add_argument("--note", type=int, default=700, help="number of note (email) records")
649
+ p.add_argument("--event", type=int, default=700, help="number of event (cal) records")
650
+ p.add_argument("--arena", default=None,
651
+ help="synthetic tenant arena id (default: synthetic-tenant:<deterministic-uuid>)")
652
+ p.add_argument("--arena-owner-email", default="user@example-corp.com",
653
+ help="arena owner email (for inbound/outbound to_email)")
654
+ p.add_argument("--span-days", type=int, default=10,
655
+ help="time range over which to spread synthetic events (real data spans ~10d)")
656
+ p.add_argument("--seed", type=int, default=42, help="RNG seed for reproducibility")
657
+ p.add_argument("--output", type=Path, required=True,
658
+ help="output directory (will be created if missing)")
659
+ return p.parse_args(argv)
660
+
661
+
662
def _write_jsonl(path: Path, records) -> None:
    """Write each record in `records` to `path`, one JSON object per line.

    The file is opened as UTF-8 explicitly: records are serialised with
    ``ensure_ascii=False``, so non-ASCII text is written raw and the platform
    default encoding could otherwise corrupt it or raise UnicodeEncodeError.
    """
    with path.open("w", encoding="utf-8") as f:
        for rec in records:
            f.write(json.dumps(rec, ensure_ascii=False) + "\n")


def main(argv: list[str] | None = None) -> int:
    """Generate the synthetic corpus and write it to the output directory.

    Writes `chat.jsonl`, `note.jsonl` and `event.jsonl` plus a `manifest.json`
    recording the seed, arena, requested counts and a SHA-256 per data file,
    then prints a short summary.

    NOTE(review): the time window is anchored at the current wall-clock time
    and the record builders use `uuid.uuid4()`, so output is not byte-for-byte
    reproducible from `--seed` alone — confirm whether that is intended.

    Args:
        argv: Argument list forwarded to `parse_args`; None reads `sys.argv`.

    Returns:
        Process exit code; always 0 on completion.
    """
    args = parse_args(argv)
    rng = random.Random(args.seed)

    output_dir = args.output.expanduser().resolve()
    output_dir.mkdir(parents=True, exist_ok=True)

    arena = args.arena or f"synthetic-tenant:{uuid.uuid5(uuid.NAMESPACE_DNS, f'synthetic-{args.seed}').hex}"

    # Time window: real corpus spans 10 days; we anchor at "now - span" so
    # timestamps look recent without being today-only.
    end = datetime.now(timezone.utc)
    start = end - timedelta(days=args.span_days)

    # Build a small pool of plausible synthetic authors so the corpus has
    # repeat senders (matching the real attribute_cardinality numbers).
    # With fewer than five chat/note records the pool is empty and the
    # builders synthesise a fresh author per record instead.
    author_pool = [
        (rng.choice(FIRST_NAMES), rng.choice(LAST_NAMES), synth_slack_id(rng))
        for _ in range(min(150, max(args.chat, args.note) // 5))
    ]

    manifest = {
        "generated_at_utc": end.isoformat(),
        "seed": args.seed,
        "arena": arena,
        "arena_owner_email": args.arena_owner_email,
        "span_days": args.span_days,
        "counts": {"chat": args.chat, "note": args.note, "event": args.event},
        "distributions_source": "prod v2 org-model 2026-05-29 (aggregate only)",
        "files": {},
    }

    # Each section draws from the shared RNG in a fixed order (plan threads,
    # then build records lazily while writing), so sections must stay in this
    # sequence to keep the per-seed content stable.

    # --- chat ---
    chat_tids, chat_threads = plan_threads(
        rng, args.chat,
        DISTRIBUTIONS["chat"]["pct_threaded"],
        DISTRIBUTIONS["chat"]["thread_depth_percentiles"],
    )
    chat_path = output_dir / "chat.jsonl"
    _write_jsonl(
        chat_path,
        (build_chat_record(rng, arena, start, args.span_days, tid, author_pool)
         for tid in chat_tids),
    )
    manifest["files"]["chat.jsonl"] = {
        "records": args.chat, "threads": chat_threads,
        "sha256": _file_sha256(chat_path),
    }

    # --- note ---
    note_tids, note_threads = plan_threads(
        rng, args.note,
        DISTRIBUTIONS["note"]["pct_threaded"],
        DISTRIBUTIONS["note"]["thread_depth_percentiles"],
    )
    note_path = output_dir / "note.jsonl"
    _write_jsonl(
        note_path,
        (build_note_record(rng, arena, start, args.span_days, tid,
                           author_pool, args.arena_owner_email)
         for tid in note_tids),
    )
    manifest["files"]["note.jsonl"] = {
        "records": args.note, "threads": note_threads,
        "sha256": _file_sha256(note_path),
    }

    # --- event ---
    event_path = output_dir / "event.jsonl"
    _write_jsonl(
        event_path,
        (build_event_record(rng, arena, start, args.span_days)
         for _ in range(args.event)),
    )
    manifest["files"]["event.jsonl"] = {
        "records": args.event, "threads": 0,
        "sha256": _file_sha256(event_path),
    }

    manifest_path = output_dir / "manifest.json"
    manifest_path.write_text(json.dumps(manifest, indent=2), encoding="utf-8")

    total = args.chat + args.note + args.event
    print(f"wrote {total} records to {output_dir}/")
    for kind, info in manifest["files"].items():
        print(f"  {kind}: {info['records']} records, {info['threads']} threads")
    print(f"  manifest.json: arena={arena}")
    return 0
747
+
748
+
749
def _file_sha256(path: Path) -> str:
    """Return the hex SHA-256 digest of the file at `path`.

    The file is read in 64 KiB blocks, so memory use does not grow with
    file size.
    """
    digest = hashlib.sha256()
    with path.open("rb") as stream:
        while True:
            block = stream.read(65536)
            if not block:  # empty read means end of file
                break
            digest.update(block)
    return digest.hexdigest()
755
+
756
+
757
# Script entry point: run the generator and use main()'s return value as the
# process exit status.
if __name__ == "__main__":
    raise SystemExit(main())