@rubytech/create-realagent 1.0.828 → 1.0.830

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (144) hide show
  1. package/package.json +1 -1
  2. package/payload/platform/config/brand.json +1 -1
  3. package/payload/platform/lib/oauth-llm/dist/index.d.ts +1 -1
  4. package/payload/platform/lib/oauth-llm/dist/index.d.ts.map +1 -1
  5. package/payload/platform/lib/oauth-llm/dist/index.js +21 -0
  6. package/payload/platform/lib/oauth-llm/dist/index.js.map +1 -1
  7. package/payload/platform/lib/oauth-llm/src/index.ts +24 -0
  8. package/payload/platform/neo4j/migrations/007-conversation-archive-source.ts +116 -0
  9. package/payload/platform/neo4j/schema.cypher +12 -2
  10. package/payload/platform/package.json +2 -2
  11. package/payload/platform/plugins/admin/hooks/__tests__/archive-ingest-surface-gate.test.sh +6 -6
  12. package/payload/platform/plugins/admin/hooks/archive-ingest-surface-gate.sh +14 -8
  13. package/payload/platform/plugins/admin/skills/onboarding/SKILL.md +2 -2
  14. package/payload/platform/plugins/contacts/mcp/dist/index.js +5 -5
  15. package/payload/platform/plugins/contacts/mcp/dist/index.js.map +1 -1
  16. package/payload/platform/plugins/contacts/mcp/dist/tools/contact-create.d.ts +1 -1
  17. package/payload/platform/plugins/contacts/mcp/dist/tools/contact-create.d.ts.map +1 -1
  18. package/payload/platform/plugins/contacts/mcp/dist/tools/contact-create.js +29 -23
  19. package/payload/platform/plugins/contacts/mcp/dist/tools/contact-create.js.map +1 -1
  20. package/payload/platform/plugins/docs/references/plugins-guide.md +1 -1
  21. package/payload/platform/plugins/memory/PLUGIN.md +6 -5
  22. package/payload/platform/plugins/{whatsapp-import/bin/ingest.mjs → memory/bin/conversation-archive-ingest.mjs} +136 -212
  23. package/payload/platform/plugins/{whatsapp-import/bin/whatsapp-ingest.sh → memory/bin/conversation-archive-ingest.sh} +27 -19
  24. package/payload/platform/plugins/memory/mcp/dist/index.js +26 -212
  25. package/payload/platform/plugins/memory/mcp/dist/index.js.map +1 -1
  26. package/payload/platform/plugins/memory/mcp/dist/lib/__tests__/llm-classifier.test.js +4 -3
  27. package/payload/platform/plugins/memory/mcp/dist/lib/__tests__/llm-classifier.test.js.map +1 -1
  28. package/payload/platform/plugins/memory/mcp/dist/lib/__tests__/schema-loader.test.js +11 -6
  29. package/payload/platform/plugins/memory/mcp/dist/lib/__tests__/schema-loader.test.js.map +1 -1
  30. package/payload/platform/plugins/memory/mcp/dist/lib/__tests__/schema-validator.test.js +103 -0
  31. package/payload/platform/plugins/memory/mcp/dist/lib/__tests__/schema-validator.test.js.map +1 -1
  32. package/payload/platform/plugins/memory/mcp/dist/lib/conversation-normalisers/index.d.ts +5 -0
  33. package/payload/platform/plugins/memory/mcp/dist/lib/conversation-normalisers/index.d.ts.map +1 -0
  34. package/payload/platform/plugins/memory/mcp/dist/lib/conversation-normalisers/index.js +30 -0
  35. package/payload/platform/plugins/memory/mcp/dist/lib/conversation-normalisers/index.js.map +1 -0
  36. package/payload/platform/plugins/memory/mcp/dist/lib/conversation-normalisers/types.d.ts +48 -0
  37. package/payload/platform/plugins/memory/mcp/dist/lib/conversation-normalisers/types.d.ts.map +1 -0
  38. package/payload/platform/plugins/memory/mcp/dist/lib/conversation-normalisers/types.js +23 -0
  39. package/payload/platform/plugins/memory/mcp/dist/lib/conversation-normalisers/types.js.map +1 -0
  40. package/payload/platform/plugins/memory/mcp/dist/lib/conversation-normalisers/whatsapp-text.d.ts +3 -0
  41. package/payload/platform/plugins/memory/mcp/dist/lib/conversation-normalisers/whatsapp-text.d.ts.map +1 -0
  42. package/payload/platform/plugins/memory/mcp/dist/lib/conversation-normalisers/whatsapp-text.js +237 -0
  43. package/payload/platform/plugins/memory/mcp/dist/lib/conversation-normalisers/whatsapp-text.js.map +1 -0
  44. package/payload/platform/plugins/memory/mcp/dist/lib/conversation-pipeline/delta-cursor.d.ts +11 -0
  45. package/payload/platform/plugins/memory/mcp/dist/lib/conversation-pipeline/delta-cursor.d.ts.map +1 -0
  46. package/payload/platform/plugins/memory/mcp/dist/lib/conversation-pipeline/delta-cursor.js +21 -0
  47. package/payload/platform/plugins/memory/mcp/dist/lib/conversation-pipeline/delta-cursor.js.map +1 -0
  48. package/payload/platform/plugins/memory/mcp/dist/lib/conversation-pipeline/derive-keys.d.ts +16 -0
  49. package/payload/platform/plugins/memory/mcp/dist/lib/conversation-pipeline/derive-keys.d.ts.map +1 -0
  50. package/payload/platform/plugins/memory/mcp/dist/lib/conversation-pipeline/derive-keys.js +39 -0
  51. package/payload/platform/plugins/memory/mcp/dist/lib/conversation-pipeline/derive-keys.js.map +1 -0
  52. package/payload/platform/plugins/memory/mcp/dist/lib/conversation-pipeline/sender-bind.d.ts +17 -0
  53. package/payload/platform/plugins/memory/mcp/dist/lib/conversation-pipeline/sender-bind.d.ts.map +1 -0
  54. package/payload/platform/plugins/memory/mcp/dist/lib/conversation-pipeline/sender-bind.js +90 -0
  55. package/payload/platform/plugins/memory/mcp/dist/lib/conversation-pipeline/sender-bind.js.map +1 -0
  56. package/payload/platform/plugins/memory/mcp/dist/lib/conversation-pipeline/sessionize.d.ts +9 -0
  57. package/payload/platform/plugins/memory/mcp/dist/lib/conversation-pipeline/sessionize.d.ts.map +1 -0
  58. package/payload/platform/plugins/memory/mcp/dist/lib/conversation-pipeline/sessionize.js +32 -0
  59. package/payload/platform/plugins/memory/mcp/dist/lib/conversation-pipeline/sessionize.js.map +1 -0
  60. package/payload/platform/plugins/memory/mcp/dist/lib/conversation-pipeline/to-turn-text.d.ts +3 -0
  61. package/payload/platform/plugins/memory/mcp/dist/lib/conversation-pipeline/to-turn-text.d.ts.map +1 -0
  62. package/payload/platform/plugins/memory/mcp/dist/lib/conversation-pipeline/to-turn-text.js +27 -0
  63. package/payload/platform/plugins/memory/mcp/dist/lib/conversation-pipeline/to-turn-text.js.map +1 -0
  64. package/payload/platform/plugins/memory/mcp/dist/lib/document-chunker.d.ts +45 -0
  65. package/payload/platform/plugins/memory/mcp/dist/lib/document-chunker.d.ts.map +1 -0
  66. package/payload/platform/plugins/memory/mcp/dist/lib/document-chunker.js +125 -0
  67. package/payload/platform/plugins/memory/mcp/dist/lib/document-chunker.js.map +1 -0
  68. package/payload/platform/plugins/memory/mcp/dist/lib/llm-classifier.d.ts +24 -1
  69. package/payload/platform/plugins/memory/mcp/dist/lib/llm-classifier.d.ts.map +1 -1
  70. package/payload/platform/plugins/memory/mcp/dist/lib/llm-classifier.js +293 -33
  71. package/payload/platform/plugins/memory/mcp/dist/lib/llm-classifier.js.map +1 -1
  72. package/payload/platform/plugins/memory/mcp/dist/lib/llm-ranker.d.ts.map +1 -1
  73. package/payload/platform/plugins/memory/mcp/dist/lib/llm-ranker.js +9 -2
  74. package/payload/platform/plugins/memory/mcp/dist/lib/llm-ranker.js.map +1 -1
  75. package/payload/platform/plugins/memory/mcp/dist/lib/schema-validator.d.ts +16 -1
  76. package/payload/platform/plugins/memory/mcp/dist/lib/schema-validator.d.ts.map +1 -1
  77. package/payload/platform/plugins/memory/mcp/dist/lib/schema-validator.js +12 -3
  78. package/payload/platform/plugins/memory/mcp/dist/lib/schema-validator.js.map +1 -1
  79. package/payload/platform/plugins/memory/mcp/dist/tools/__tests__/conversation-normalisers-source-agnosticism.test.d.ts +2 -0
  80. package/payload/platform/plugins/memory/mcp/dist/tools/__tests__/conversation-normalisers-source-agnosticism.test.d.ts.map +1 -0
  81. package/payload/platform/plugins/memory/mcp/dist/tools/__tests__/conversation-normalisers-source-agnosticism.test.js +75 -0
  82. package/payload/platform/plugins/memory/mcp/dist/tools/__tests__/conversation-normalisers-source-agnosticism.test.js.map +1 -0
  83. package/payload/platform/plugins/memory/mcp/dist/tools/__tests__/conversation-normalisers-whatsapp-text.test.d.ts +2 -0
  84. package/payload/platform/plugins/memory/mcp/dist/tools/__tests__/conversation-normalisers-whatsapp-text.test.d.ts.map +1 -0
  85. package/payload/platform/plugins/memory/mcp/dist/tools/__tests__/conversation-normalisers-whatsapp-text.test.js +67 -0
  86. package/payload/platform/plugins/memory/mcp/dist/tools/__tests__/conversation-normalisers-whatsapp-text.test.js.map +1 -0
  87. package/payload/platform/plugins/memory/mcp/dist/tools/__tests__/memory-archive-write.test.js +2 -138
  88. package/payload/platform/plugins/memory/mcp/dist/tools/__tests__/memory-archive-write.test.js.map +1 -1
  89. package/payload/platform/plugins/memory/mcp/dist/tools/__tests__/memory-ingest.test.js +39 -3
  90. package/payload/platform/plugins/memory/mcp/dist/tools/__tests__/memory-ingest.test.js.map +1 -1
  91. package/payload/platform/plugins/memory/mcp/dist/tools/__tests__/profile-update-personfields-open.test.d.ts +2 -0
  92. package/payload/platform/plugins/memory/mcp/dist/tools/__tests__/profile-update-personfields-open.test.d.ts.map +1 -0
  93. package/payload/platform/plugins/memory/mcp/dist/tools/__tests__/profile-update-personfields-open.test.js +148 -0
  94. package/payload/platform/plugins/memory/mcp/dist/tools/__tests__/profile-update-personfields-open.test.js.map +1 -0
  95. package/payload/platform/plugins/memory/mcp/dist/tools/memory-archive-write.d.ts +1 -47
  96. package/payload/platform/plugins/memory/mcp/dist/tools/memory-archive-write.d.ts.map +1 -1
  97. package/payload/platform/plugins/memory/mcp/dist/tools/memory-archive-write.js +9 -318
  98. package/payload/platform/plugins/memory/mcp/dist/tools/memory-archive-write.js.map +1 -1
  99. package/payload/platform/plugins/memory/mcp/dist/tools/memory-ingest.d.ts +7 -0
  100. package/payload/platform/plugins/memory/mcp/dist/tools/memory-ingest.d.ts.map +1 -1
  101. package/payload/platform/plugins/memory/mcp/dist/tools/memory-ingest.js +14 -8
  102. package/payload/platform/plugins/memory/mcp/dist/tools/memory-ingest.js.map +1 -1
  103. package/payload/platform/plugins/memory/mcp/dist/tools/profile-update.d.ts +21 -17
  104. package/payload/platform/plugins/memory/mcp/dist/tools/profile-update.d.ts.map +1 -1
  105. package/payload/platform/plugins/memory/mcp/dist/tools/profile-update.js +77 -37
  106. package/payload/platform/plugins/memory/mcp/dist/tools/profile-update.js.map +1 -1
  107. package/payload/platform/plugins/memory/references/schema-base.md +3 -1
  108. package/payload/platform/plugins/{whatsapp-import/skills/whatsapp-import → memory/skills/conversation-archive}/SKILL.md +45 -36
  109. package/payload/platform/plugins/memory/skills/document-ingest/SKILL.md +59 -6
  110. package/payload/platform/plugins/whatsapp/PLUGIN.md +1 -1
  111. package/payload/platform/scripts/seed-neo4j.sh +9 -8
  112. package/payload/platform/templates/specialists/agents/database-operator.md +7 -14
  113. package/payload/server/chunk-7BO5HDJC.js +10093 -0
  114. package/payload/server/chunk-CUSH3UXP.js +2305 -0
  115. package/payload/server/chunk-EL4DZ56X.js +1116 -0
  116. package/payload/server/chunk-IWNDVGKT.js +10077 -0
  117. package/payload/server/chunk-KC7NUABI.js +654 -0
  118. package/payload/server/chunk-QOJ2D26Z.js +654 -0
  119. package/payload/server/chunk-RC46ZYGT.js +2305 -0
  120. package/payload/server/chunk-WUVXPZIV.js +1116 -0
  121. package/payload/server/client-pool-3TM3SRIA.js +32 -0
  122. package/payload/server/client-pool-7NTEFNVQ.js +32 -0
  123. package/payload/server/cloudflare-task-tracker-4NIODMGL.js +19 -0
  124. package/payload/server/cloudflare-task-tracker-WE77WXSI.js +19 -0
  125. package/payload/server/maxy-edge.js +3 -3
  126. package/payload/server/neo4j-migrations-4XPNJNM6.js +490 -0
  127. package/payload/server/neo4j-migrations-XTQ4WEV6.js +428 -0
  128. package/payload/server/server.js +6 -6
  129. package/payload/platform/plugins/whatsapp-import/PLUGIN.md +0 -48
  130. package/payload/platform/plugins/whatsapp-import/lib/src/__tests__/delta-append.test.ts +0 -163
  131. package/payload/platform/plugins/whatsapp-import/lib/src/__tests__/parse-export-lrm.test.ts +0 -83
  132. package/payload/platform/plugins/whatsapp-import/lib/src/__tests__/parse-export.test.ts +0 -678
  133. package/payload/platform/plugins/whatsapp-import/lib/src/__tests__/sessionize.test.ts +0 -91
  134. package/payload/platform/plugins/whatsapp-import/lib/src/__tests__/to-classifier-input.test.ts +0 -59
  135. package/payload/platform/plugins/whatsapp-import/lib/src/delta-cursor.ts +0 -54
  136. package/payload/platform/plugins/whatsapp-import/lib/src/derive-keys.ts +0 -82
  137. package/payload/platform/plugins/whatsapp-import/lib/src/index.ts +0 -22
  138. package/payload/platform/plugins/whatsapp-import/lib/src/parse-export.ts +0 -471
  139. package/payload/platform/plugins/whatsapp-import/lib/src/sessionize.ts +0 -81
  140. package/payload/platform/plugins/whatsapp-import/lib/src/to-classifier-input.ts +0 -48
  141. package/payload/platform/plugins/whatsapp-import/lib/tsconfig.json +0 -9
  142. package/payload/platform/plugins/whatsapp-import/lib/vitest.config.ts +0 -9
  143. package/payload/platform/plugins/whatsapp-import/skills/whatsapp-import/references/conversation-archive-shape.md +0 -143
  144. package/payload/platform/plugins/whatsapp-import/skills/whatsapp-import/references/export-parse.md +0 -109
@@ -1,471 +0,0 @@
1
- import { createHash } from "node:crypto";
2
- import { readFileSync } from "node:fs";
3
-
4
- // ---------------------------------------------------------------------------
5
- // parse-export — deterministic WhatsApp `_chat.txt` parser (Task 805).
6
- //
7
- // Pure function. No LLM in the per-line decision path. Replaces the prose
8
- // grammar that lived in references/export-parse.md when the database-operator
9
- // specialist's Sonnet was the line tokeniser. Every grammar branch here is
10
- // exercised by the vitest grid in `__tests__/parse-export.test.ts`; that
11
- // grid IS the contract — extending the grammar means a new test first.
12
- //
13
- // Doctrine alignment:
14
- // - feedback_deterministic_means_remove_llm.md — the LLM is no longer in
15
- // the per-line decision path.
16
- // - feedback_deterministic_is_a_shell_script.md — TypeScript is the right
17
- // deliverable shape here (UTF-8 decode + multi-line body assembly + sha256
18
- // would be cumbersome in shell); the LITERAL-MAPPING rule yields to
19
- // "Node module" because the per-line decision path is the deliverable, not
20
- // a one-shot orchestrator.
21
- // - feedback_loud_failures.md — encoding errors, empty files, and lines
22
- // that match a timestamp prefix but cannot be tokenised throw with named
23
- // reasons rather than degrading silently.
24
- // ---------------------------------------------------------------------------
25
-
26
- export interface ParseExportInput {
27
- /** Absolute path to the `_chat.txt` file. */
28
- filePath: string;
29
- /** Account scope used to compose `conversationId`. */
30
- accountId: string;
31
- /** IANA timezone the operator confirmed (e.g. `Europe/London`). */
32
- timezone: string;
33
- /**
34
- * Date ordering and year shape. Omit for auto-detect (Task 845): the parser
35
- * probes the first matched line as DD/MM and locks that ordering if range-valid;
36
- * otherwise locks MM/DD. Year shape is independent — `\d{4}|\d{2}` accepts 2-digit
37
- * (mapped `2000+yy`) and 4-digit (passed through) years per-line, including
38
- * mixed-year files.
39
- */
40
- dateFormat?: "DD/MM/YY" | "MM/DD/YY" | "DD/MM/YYYY" | "MM/DD/YYYY";
41
- }
42
-
43
- export interface ParsedLine {
44
- senderName: string;
45
- /** ISO 8601 with timezone offset for the supplied IANA zone. */
46
- dateSent: string;
47
- body: string;
48
- /** Position within emitted (post-skip) messages, 0-based. */
49
- sequenceIndex: number;
50
- }
51
-
52
- export interface ParseExportCounters {
53
- parsed: number;
54
- systemSkipped: number;
55
- mediaSkipped: number;
56
- parseErrors: number;
57
- }
58
-
59
- export interface ParseExportResult {
60
- conversationId: string;
61
- /** `whatsapp-export:<sha256-hex>` of the raw file bytes. */
62
- archiveSourceFile: string;
63
- parsedLines: ParsedLine[];
64
- counters: ParseExportCounters;
65
- }
66
-
67
- // Year capture is `\d{2}|\d{4}` so a single regex covers both 2-digit (legacy)
68
- // and 4-digit (modern WhatsApp default) prefixes — Task 845. Exactly 2 or 4
69
- // chars; 3-digit years (truncation typos, hand-edited files) are rejected as
70
- // not-a-prefix and surface via parse-grammar-miss, not silently coerced into
71
- // year-202-AD timestamps. Year semantics are resolved per-match in
72
- // `matchTimestampPrefix` from the captured length, not from the regex shape,
73
- // so mixed-year files parse natively.
74
- const TIMESTAMP_PREFIX_DDMMYY =
75
- /^\[(\d{2})\/(\d{2})\/(\d{4}|\d{2}),\s+(\d{1,2}):(\d{2})(?::(\d{2}))?\]\s*(.*)$/;
76
-
77
- const TIMESTAMP_PREFIX_MMDDYY = TIMESTAMP_PREFIX_DDMMYY; // shape is identical; ordering differs in interpretation only
78
-
79
- // System-message patterns that appear WITHOUT a `: ` sender/body separator.
80
- // WhatsApp emits group-event and security-code lines as `<Sender> <verb> ...`
81
- // (no colon). Lines that match the timestamp prefix but lack `: ` and do not
82
- // match one of these patterns are LOUD-FAIL parse errors — never silently
83
- // dropped.
84
- const LINE_LEVEL_SYSTEM_PATTERNS: RegExp[] = [
85
- /^Messages and calls are end-to-end encrypted/i,
86
- /'s security code changed\.?$/i,
87
- / created group ["“”]/,
88
- / added /,
89
- / removed /,
90
- / left$/,
91
- / changed the subject from /,
92
- / changed this group's icon/,
93
- / joined using this group's invite link/,
94
- /^You're now an admin$/i,
95
- /^You created group/i,
96
- ];
97
-
98
- // Body-level patterns evaluated after `Sender: body` split. These are real
99
- // messages syntactically but carry no graph value (deletions, media-only).
100
- const BODY_LEVEL_SYSTEM_PATTERNS: RegExp[] = [
101
- /^You deleted this message\.?$/,
102
- /^This message was deleted\.?$/,
103
- ];
104
-
105
- const MEDIA_ONLY_PATTERNS: RegExp[] = [
106
- /^<Media omitted>$/,
107
- /^IMG-\d+-\w+\.(jpg|jpeg|png|heic|gif)\s*\(file attached\)$/i,
108
- /^VID-\d+-\w+\.mp4\s*\(file attached\)$/i,
109
- /^PTT-\d+-\w+\.opus\s*\(file attached\)$/i,
110
- /^AUD-\d+-\w+\.opus\s*\(file attached\)$/i,
111
- /^STK-\d+-\w+\.webp\s*\(file attached\)$/i,
112
- /^.+\.(pdf|docx|doc|xlsx|xls|pptx|ppt|zip|csv|txt)\s*\(file attached\)$/i,
113
- /^‎.+attached:\s*.+$/, // alternative LRM-prefixed format on some platforms
114
- ];
115
-
116
- export function parseExport(input: ParseExportInput): ParseExportResult {
117
- const { filePath, accountId, timezone, dateFormat: explicitDateFormat } = input;
118
-
119
- if (!accountId || !accountId.trim()) {
120
- throw new Error("parse-export: accountId is required.");
121
- }
122
- if (!timezone || !timezone.trim()) {
123
- throw new Error("parse-export: timezone is required (e.g. 'Europe/London').");
124
- }
125
-
126
- const rawBytes = readFileSync(filePath);
127
- const sha256Hex = createHash("sha256").update(rawBytes).digest("hex");
128
- const archiveSourceFile = `whatsapp-export:${sha256Hex}`;
129
- const conversationId = `whatsapp-export:${sha256Hex}:${accountId}`;
130
-
131
- const text = decodeAndNormalise(rawBytes);
132
- if (text.length === 0) {
133
- throw new Error(
134
- `parse-export: file is empty — not a _chat.txt. file=${filePath}`,
135
- );
136
- }
137
-
138
- const lines = text.split("\n");
139
- // Auto-detect when `dateFormat` is omitted (Task 845): probe the first line
140
- // that contains a timestamp prefix as DD/MM; lock DD/MM if range-valid,
141
- // otherwise lock MM/DD. WhatsApp's locale is set per device, so a single
142
- // file never mixes DD/MM and MM/DD — locking once from the first timestamped line is correct.
143
- // Concatenated multi-locale exports require an explicit `dateFormat`.
144
- const ordering = resolveOrdering(explicitDateFormat, lines);
145
- const counters: ParseExportCounters = {
146
- parsed: 0,
147
- systemSkipped: 0,
148
- mediaSkipped: 0,
149
- parseErrors: 0,
150
- };
151
-
152
- // Stage 1 — tokenise into raw messages (timestamp + remainder), accumulating
153
- // continuation lines into the previous remainder. Stage 2 then categorises
154
- // each tokenised message (system / media / real) so the counter increments
155
- // happen exactly once per tokenised message.
156
- interface RawMessage {
157
- rawLineIndex: number; // 1-based file line number for LOUD-FAIL diagnostics
158
- year: number;
159
- month: number;
160
- day: number;
161
- hour: number;
162
- minute: number;
163
- second: number;
164
- remainder: string; // everything after `]` on the prefix line, plus continuation lines
165
- }
166
- const raw: RawMessage[] = [];
167
-
168
- for (let i = 0; i < lines.length; i++) {
169
- const line = lines[i];
170
- if (line.length === 0 && i === lines.length - 1) continue; // trailing newline
171
- const prefixMatch = matchTimestampPrefix(line, ordering);
172
- if (prefixMatch) {
173
- raw.push({
174
- rawLineIndex: i + 1,
175
- ...prefixMatch.dateParts,
176
- remainder: prefixMatch.remainder,
177
- });
178
- } else {
179
- // Continuation of the previous message body. If there is no previous
180
- // message, this line is leading garbage — ignore it (matches the
181
- // export-parse.md edge case where a leading BOM or blank line precedes
182
- // the first timestamp).
183
- const last = raw[raw.length - 1];
184
- if (last) {
185
- last.remainder += "\n" + line;
186
- }
187
- }
188
- }
189
-
190
- // Stage 2 — categorise each raw message. Do NOT trim trailing whitespace
191
- // from the remainder before splitting — `Joel: ` (sender + colon + trailing
192
- // space + newline) collapses to `Joel:` after a `\s+$` trim and the `: `
193
- // separator disappears, turning an empty-body system skip into a LOUD-FAIL.
194
- const parsedLines: ParsedLine[] = [];
195
- for (const r of raw) {
196
- const remainder = r.remainder;
197
- const colonIdx = findFirstColonSeparator(remainder);
198
-
199
- if (colonIdx === -1) {
200
- // No `: ` separator. Must match a known system pattern or LOUD-FAIL.
201
- const trimmed = remainder.replace(/\s+$/, "");
202
- if (matchesAny(trimmed, LINE_LEVEL_SYSTEM_PATTERNS)) {
203
- counters.systemSkipped++;
204
- continue;
205
- }
206
- counters.parseErrors++;
207
- throw new Error(
208
- `parse-export: parse-error file=${filePath} line=${r.rawLineIndex} reason=no-sender-body-separator content="${trimmed.slice(0, 80)}"`,
209
- );
210
- }
211
-
212
- const senderName = remainder.slice(0, colonIdx).trim();
213
- const body = remainder.slice(colonIdx + 2).replace(/\s+$/, "");
214
-
215
- if (body.length === 0) {
216
- counters.systemSkipped++;
217
- continue;
218
- }
219
- if (matchesAny(body, BODY_LEVEL_SYSTEM_PATTERNS)) {
220
- counters.systemSkipped++;
221
- continue;
222
- }
223
- if (matchesAny(body, MEDIA_ONLY_PATTERNS)) {
224
- counters.mediaSkipped++;
225
- continue;
226
- }
227
-
228
- const dateSent = isoWithOffset(
229
- r.year,
230
- r.month,
231
- r.day,
232
- r.hour,
233
- r.minute,
234
- r.second,
235
- timezone,
236
- );
237
-
238
- parsedLines.push({
239
- senderName,
240
- dateSent,
241
- body,
242
- sequenceIndex: parsedLines.length,
243
- });
244
- counters.parsed++;
245
- }
246
-
247
- if (parsedLines.length === 0 && counters.systemSkipped === 0 && counters.mediaSkipped === 0) {
248
- // Task 845: include a sanitised first-line sample so the operator knows
249
- // WHY the file rejected — closes the diagnostic gap that left conversation
250
- // 47c6a590-0c2c-4006-9aca-6ee9ec93c95f guessing. Echoed to stderr too so
251
- // server.log has a grep-able adjunct to the existing parse-failed line.
252
- const sample = sampleFirstNonBlankLine(lines, 100);
253
- process.stderr.write(
254
- `[whatsapp-import] parse-grammar-miss first-line="${sample}"\n`,
255
- );
256
- throw new Error(
257
- `parse-export: zero parsed lines after walking ${filePath} — not a _chat.txt or all lines failed grammar. parse-grammar-miss first-line="${sample}"`,
258
- );
259
- }
260
-
261
- return {
262
- conversationId,
263
- archiveSourceFile,
264
- parsedLines,
265
- counters,
266
- };
267
- }
268
-
269
- // ---------------------------------------------------------------------------
270
- // Internals
271
- // ---------------------------------------------------------------------------
272
-
273
- function decodeAndNormalise(bytes: Buffer): string {
274
- // Strict UTF-8 decode. Node's TextDecoder with `fatal: true` throws on
275
- // invalid bytes — that's the LOUD-FAIL the brief mandates for encoding
276
- // errors. The default `Buffer.toString('utf8')` silently substitutes
277
- // U+FFFD, which would let bad bytes propagate into the graph.
278
- let text: string;
279
- try {
280
- text = new TextDecoder("utf-8", { fatal: true }).decode(bytes);
281
- } catch (err) {
282
- throw new Error(
283
- `parse-export: UTF-8 decode failed — ${err instanceof Error ? err.message : String(err)}. The file is not valid UTF-8; re-export from WhatsApp.`,
284
- );
285
- }
286
-
287
- // Strip leading BOM (U+FEFF).
288
- if (text.charCodeAt(0) === 0xfeff) {
289
- text = text.slice(1);
290
- }
291
-
292
- // Normalise mixed line endings to LF.
293
- text = text.replace(/\r\n/g, "\n").replace(/\r/g, "\n");
294
-
295
- // Task 887 — strip Unicode bidi marks (U+200E LRM, U+200F RLM) only at
296
- // line-start, where some WhatsApp builds prefix the timestamp header.
297
- // Without stripping, `^\[(\d{2})\/...` fails on the prefixed line, the
298
- // line is appended as a continuation of the previous body, and the next
299
- // clean header parses its senderName off the polluted body — leaking 23
300
- // bogus :Person nodes per import in the Adam Mackay archive. Body-internal
301
- // bidi marks (e.g. the LRM in `: ‎Forwarded`) are preserved — they carry
302
- // semantic information about message origin and are exercised by
303
- // parse-export.test.ts. Counts emitted to stderr for the operator's tail.
304
- const leadingBidiMatches = text.match(/(?:^|\n)[‎‏]+/g) || [];
305
- let lrmStripped = 0;
306
- let rlmStripped = 0;
307
- for (const m of leadingBidiMatches) {
308
- for (const ch of m) {
309
- if (ch === "‎") lrmStripped++;
310
- else if (ch === "‏") rlmStripped++;
311
- }
312
- }
313
- if (leadingBidiMatches.length > 0) {
314
- text = text.replace(/(^|\n)[‎‏]+/g, "$1");
315
- process.stderr.write(
316
- `[whatsapp-ingest] decoded normalised lrm-stripped=${lrmStripped} rlm-stripped=${rlmStripped}\n`,
317
- );
318
- }
319
-
320
- return text;
321
- }
322
-
323
- interface TimestampMatch {
324
- dateParts: {
325
- year: number;
326
- month: number;
327
- day: number;
328
- hour: number;
329
- minute: number;
330
- second: number;
331
- };
332
- remainder: string;
333
- }
334
-
335
- type Ordering = "DDMM" | "MMDD";
336
-
337
- function matchTimestampPrefix(
338
- line: string,
339
- ordering: Ordering,
340
- ): TimestampMatch | null {
341
- const re = ordering === "MMDD" ? TIMESTAMP_PREFIX_MMDDYY : TIMESTAMP_PREFIX_DDMMYY;
342
- const m = line.match(re);
343
- if (!m) return null;
344
- const a = parseInt(m[1], 10); // dd or mm depending on ordering
345
- const b = parseInt(m[2], 10); // mm or dd
346
- const yearRaw = m[3];
347
- const hour = parseInt(m[4], 10);
348
- const minute = parseInt(m[5], 10);
349
- const second = m[6] !== undefined ? parseInt(m[6], 10) : 0;
350
- const remainder = m[7] ?? "";
351
- const day = ordering === "MMDD" ? b : a;
352
- const month = ordering === "MMDD" ? a : b;
353
- // Range-check before passing to Date.UTC — that function silently rolls
354
- // over invalid components (Date.UTC(2026, 13, 1) → 2027-02-01), which
355
- // would corrupt timestamps when the operator passes the wrong ordering
356
- // for a US-locale export. Reject as not-a-prefix; the caller retries the
357
- // file with the correct ordering or LOUD-FAILs when the file isn't a chat.
358
- if (month < 1 || month > 12 || day < 1 || day > 31) return null;
359
- if (hour > 23 || minute > 59 || second > 59) return null;
360
- // Task 845: branch year semantics on captured length. WhatsApp's modern
361
- // exports emit 4-digit years; legacy exports emit 2-digit. Both are
362
- // accepted by the same regex and disambiguated here so a single file may
363
- // hold both shapes (mixed-year imports parse natively).
364
- const year = yearRaw.length === 2 ? 2000 + parseInt(yearRaw, 10) : parseInt(yearRaw, 10);
365
- return {
366
- dateParts: { year, month, day, hour, minute, second },
367
- remainder,
368
- };
369
- }
370
-
371
- function resolveOrdering(
372
- explicit: ParseExportInput["dateFormat"],
373
- lines: readonly string[],
374
- ): Ordering {
375
- if (explicit === "MM/DD/YY" || explicit === "MM/DD/YYYY") return "MMDD";
376
- if (explicit === "DD/MM/YY" || explicit === "DD/MM/YYYY") return "DDMM";
377
- // Auto-detect: probe the first prefix-matching line as DD/MM. If range-valid,
378
- // lock DD/MM (WhatsApp's global default). Otherwise lock MM/DD (US-locale
379
- // exports, which are the only meaningful exception). Locked once.
380
- for (const line of lines) {
381
- if (matchTimestampPrefix(line, "DDMM")) return "DDMM";
382
- if (matchTimestampPrefix(line, "MMDD")) return "MMDD";
383
- }
384
- return "DDMM"; // No matching line — caller will throw zero-parsed-lines anyway.
385
- }
386
-
387
- function sampleFirstNonBlankLine(lines: readonly string[], maxScan: number): string {
388
- const scanLimit = Math.min(maxScan, lines.length);
389
- for (let i = 0; i < scanLimit; i++) {
390
- const trimmed = lines[i].trim();
391
- if (trimmed.length === 0) continue;
392
- // Strip control characters (including tab, BEL, etc.) so the diagnostic
393
- // line stays single-line and grep-friendly. Truncate to 80 chars per
394
- // Task 845 brief — enough to recognise the offending header shape.
395
- const sanitised = trimmed.replace(/[\x00-\x1F\x7F]/g, "");
396
- return sanitised.slice(0, 80);
397
- }
398
- return "";
399
- }
400
-
401
- function findFirstColonSeparator(remainder: string): number {
402
- // Split on the FIRST `: ` (colon-space). A sender display name may itself
403
- // contain a bare `:` (e.g. "Joel:Work"), so we anchor on the first colon
404
- // followed by a space — that's the WhatsApp export's stable separator.
405
- const idx = remainder.indexOf(": ");
406
- return idx;
407
- }
408
-
409
- function matchesAny(text: string, patterns: RegExp[]): boolean {
410
- for (const p of patterns) {
411
- if (p.test(text)) return true;
412
- }
413
- return false;
414
- }
415
-
416
- function isoWithOffset(
417
- year: number,
418
- month: number,
419
- day: number,
420
- hour: number,
421
- minute: number,
422
- second: number,
423
- timezone: string,
424
- ): string {
425
- // Produce ISO 8601 with the offset that the supplied IANA zone holds for
426
- // this wall-clock instant. Two-step refinement is needed to handle DST:
427
- // the wall-clock components describe a local time, and we need the offset
428
- // for the corresponding UTC instant in `timezone`.
429
- const guessUtcMs = Date.UTC(year, month - 1, day, hour, minute, second);
430
- let offMin = offsetMinutesAt(new Date(guessUtcMs), timezone);
431
- const refinedUtcMs = guessUtcMs - offMin * 60_000;
432
- offMin = offsetMinutesAt(new Date(refinedUtcMs), timezone);
433
-
434
- const sign = offMin >= 0 ? "+" : "-";
435
- const absOff = Math.abs(offMin);
436
- const offHH = String(Math.floor(absOff / 60)).padStart(2, "0");
437
- const offMM = String(absOff % 60).padStart(2, "0");
438
- const Y = String(year).padStart(4, "0");
439
- const M = String(month).padStart(2, "0");
440
- const D = String(day).padStart(2, "0");
441
- const H = String(hour).padStart(2, "0");
442
- const Mi = String(minute).padStart(2, "0");
443
- const S = String(second).padStart(2, "0");
444
- return `${Y}-${M}-${D}T${H}:${Mi}:${S}${sign}${offHH}:${offMM}`;
445
- }
446
-
447
- function offsetMinutesAt(date: Date, timezone: string): number {
448
- // Use Intl.DateTimeFormat with longOffset to read the IANA-zone offset for
449
- // the given UTC instant. Output format: "GMT+01:00", "GMT-05:00", or "GMT".
450
- const formatter = new Intl.DateTimeFormat("en-US", {
451
- timeZone: timezone,
452
- timeZoneName: "longOffset",
453
- });
454
- const parts = formatter.formatToParts(date);
455
- const tzPart = parts.find((p) => p.type === "timeZoneName");
456
- if (!tzPart) {
457
- throw new Error(`parse-export: unable to read offset for timezone "${timezone}".`);
458
- }
459
- const value = tzPart.value;
460
- if (value === "GMT" || value === "UTC") return 0;
461
- const m = value.match(/^(?:GMT|UTC)([+-])(\d{1,2}):?(\d{2})?$/);
462
- if (!m) {
463
- throw new Error(
464
- `parse-export: cannot parse timezone offset "${value}" for IANA zone "${timezone}".`,
465
- );
466
- }
467
- const sign = m[1] === "+" ? 1 : -1;
468
- const hh = parseInt(m[2], 10);
469
- const mm = m[3] ? parseInt(m[3], 10) : 0;
470
- return sign * (hh * 60 + mm);
471
- }
@@ -1,81 +0,0 @@
1
- import type { ParsedLine } from "./parse-export.js";
2
-
3
- // ---------------------------------------------------------------------------
4
- // sessionize — Pass 1 of the chunked-archive pipeline (Task 891).
5
- //
6
- // Pure function. Splits a chronologically-ordered sequence of parsed messages
7
- // into "sessions" wherever the gap between consecutive `dateSent` values
8
- // exceeds `gapHours`. Each session feeds memory-classify (mode='chat') as a
9
- // turn-attributed block; the LLM then chunks each session into one or more
10
- // `:Section:Conversation` rows with summary+keywords (Pass 2).
11
- //
12
- // Why deterministic gap-cut, not LLM topic detection:
13
- // - The natural cadence of human chat (sleep, working hours, weekend gaps)
14
- // produces clean session boundaries that the operator can intuit.
15
- // - LLM-only chunking against a 10K-message archive sends 10K messages into
16
- // one prompt and pays attention only to the last 1K — gap-cut bounds the
17
- // window before the LLM ever sees it.
18
- // - The default 12h gap matches one sleep cycle: messages on the same day
19
- // belong together; a 14-hour gap (last evening message → next morning) is
20
- // a fresh session even when the topic is identical.
21
- // ---------------------------------------------------------------------------
22
-
23
- export interface Session {
24
- /** 0-based index across the archive's sessions. */
25
- index: number;
26
- /** ISO 8601 timestamp of the first message in the session. */
27
- firstMessageAt: string;
28
- /** ISO 8601 timestamp of the last message in the session. */
29
- lastMessageAt: string;
30
- /** Messages in the session, chronological. */
31
- messages: ParsedLine[];
32
- }
33
-
34
- /**
35
- * Split parsed messages into sessions on gaps ≥ `gapHours`. Input must be
36
- * pre-sorted by `dateSent` (parse-export emits in file order, which IS
37
- * chronological for any well-formed `_chat.txt`).
38
- *
39
- * Boundary semantics (exact-at-threshold):
40
- * gap == gapHours → cut here (start a new session)
41
- * gap < gapHours → same session
42
- * gap > gapHours → cut here
43
- *
44
- * Empty input returns []; single-message input returns one one-message session.
45
- */
46
- export function sessionize(
47
- messages: readonly ParsedLine[],
48
- gapHours: number,
49
- ): Session[] {
50
- if (gapHours <= 0) {
51
- throw new Error(`sessionize: gapHours must be positive, got ${gapHours}`);
52
- }
53
- if (messages.length === 0) return [];
54
-
55
- const gapMs = gapHours * 60 * 60 * 1000;
56
- const sessions: Session[] = [];
57
- let currentMessages: ParsedLine[] = [messages[0]];
58
-
59
- const flush = () => {
60
- sessions.push({
61
- index: sessions.length,
62
- firstMessageAt: currentMessages[0].dateSent,
63
- lastMessageAt: currentMessages[currentMessages.length - 1].dateSent,
64
- messages: currentMessages,
65
- });
66
- };
67
-
68
- for (let i = 1; i < messages.length; i++) {
69
- const prevMs = Date.parse(messages[i - 1].dateSent);
70
- const currMs = Date.parse(messages[i].dateSent);
71
- const gap = currMs - prevMs;
72
- if (gap >= gapMs) {
73
- flush();
74
- currentMessages = [messages[i]];
75
- } else {
76
- currentMessages.push(messages[i]);
77
- }
78
- }
79
- flush();
80
- return sessions;
81
- }
@@ -1,48 +0,0 @@
1
- import type { Session } from "./sessionize.js";
2
-
3
- // ---------------------------------------------------------------------------
4
- // to-classifier-input — Pass 2 input formatter (Task 891).
5
- //
6
- // Pure function. Renders one Session as a turn-attributed text block ready
7
- // to hand to memory-classify (mode='chat'). Format:
8
- //
9
- // [YYYY-MM-DD HH:MM:SS TZ] <Sender>: <body>
10
- // [YYYY-MM-DD HH:MM:SS TZ] <Sender>: <body>
11
- // ...
12
- //
13
- // Multi-line message bodies are kept verbatim (with their internal newlines).
14
- // The leading `[ts] <Sender>: ` prefix is the only structural addition; the
15
- // classifier prompt instructs Haiku to preserve it in the chunk `body` so
16
- // downstream Phase 2 work can recover per-message provenance via snippet
17
- // matching against the conversation tail.
18
- //
19
- // Timezone: each `dateSent` ISO already carries an offset (set by parseExport
20
- // from the operator's confirmed IANA zone). The renderer prints the
21
- // human-readable wall-clock for that offset; the trailing "TZ" suffix is the
22
- // offset itself, not a zone name.
23
- // ---------------------------------------------------------------------------
24
-
25
- export function toClassifierInput(session: Session): string {
26
- const lines: string[] = [];
27
- for (const m of session.messages) {
28
- lines.push(`[${formatWallClock(m.dateSent)}] ${m.senderName}: ${m.body}`);
29
- }
30
- return lines.join("\n");
31
- }
32
-
33
- /**
34
- * Format an ISO 8601 instant with offset as `YYYY-MM-DD HH:MM:SS ±HH:MM`,
35
- * preserving the offset that the parser set from the operator's IANA zone.
36
- * The wall-clock components are read directly from the ISO string — no
37
- * Date construction (which would re-interpret in the local zone).
38
- */
39
- function formatWallClock(iso: string): string {
40
- // ISO from parse-export is shaped: "YYYY-MM-DDTHH:MM:SS±HH:MM" (or "Z").
41
- const m = iso.match(
42
- /^(\d{4})-(\d{2})-(\d{2})T(\d{2}):(\d{2}):(\d{2})(?:\.\d+)?(Z|[+-]\d{2}:?\d{2})$/,
43
- );
44
- if (!m) return iso; // surface the raw value if the shape drifted; pure function never throws on caller-supplied data
45
- const [, y, mo, d, h, mi, s, off] = m;
46
- const offsetLabel = off === "Z" ? "+00:00" : off;
47
- return `${y}-${mo}-${d} ${h}:${mi}:${s} ${offsetLabel}`;
48
- }
@@ -1,9 +0,0 @@
1
- {
2
- "extends": "../../../tsconfig.base.json",
3
- "compilerOptions": {
4
- "outDir": "dist",
5
- "rootDir": "src"
6
- },
7
- "include": ["src"],
8
- "exclude": ["src/__tests__"]
9
- }
@@ -1,9 +0,0 @@
1
- import { defineConfig } from "vitest/config";
2
-
3
- export default defineConfig({
4
- test: {
5
- environment: "node",
6
- globals: false,
7
- include: ["src/__tests__/**/*.test.ts"],
8
- },
9
- });