@rubytech/create-realagent 1.0.826 → 1.0.828

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71) hide show
  1. package/package.json +1 -1
  2. package/payload/platform/neo4j/schema.cypher +34 -2
  3. package/payload/platform/plugins/admin/hooks/archive-ingest-surface-gate.sh +19 -13
  4. package/payload/platform/plugins/admin/skills/onboarding/SKILL.md +5 -5
  5. package/payload/platform/plugins/docs/references/cloudflare.md +1 -1
  6. package/payload/platform/plugins/docs/references/plugins-guide.md +1 -1
  7. package/payload/platform/plugins/docs/references/troubleshooting.md +1 -0
  8. package/payload/platform/plugins/memory/PLUGIN.md +1 -1
  9. package/payload/platform/plugins/memory/mcp/dist/index.js +6 -41
  10. package/payload/platform/plugins/memory/mcp/dist/index.js.map +1 -1
  11. package/payload/platform/plugins/memory/mcp/dist/lib/__tests__/llm-classifier.test.js +51 -0
  12. package/payload/platform/plugins/memory/mcp/dist/lib/__tests__/llm-classifier.test.js.map +1 -1
  13. package/payload/platform/plugins/memory/mcp/dist/lib/llm-classifier.d.ts +19 -4
  14. package/payload/platform/plugins/memory/mcp/dist/lib/llm-classifier.d.ts.map +1 -1
  15. package/payload/platform/plugins/memory/mcp/dist/lib/llm-classifier.js +139 -56
  16. package/payload/platform/plugins/memory/mcp/dist/lib/llm-classifier.js.map +1 -1
  17. package/payload/platform/plugins/memory/mcp/dist/tools/__tests__/memory-ingest.test.d.ts +2 -0
  18. package/payload/platform/plugins/memory/mcp/dist/tools/__tests__/memory-ingest.test.d.ts.map +1 -0
  19. package/payload/platform/plugins/memory/mcp/dist/tools/__tests__/memory-ingest.test.js +61 -0
  20. package/payload/platform/plugins/memory/mcp/dist/tools/__tests__/memory-ingest.test.js.map +1 -0
  21. package/payload/platform/plugins/memory/mcp/dist/tools/memory-ingest.d.ts +34 -0
  22. package/payload/platform/plugins/memory/mcp/dist/tools/memory-ingest.d.ts.map +1 -1
  23. package/payload/platform/plugins/memory/mcp/dist/tools/memory-ingest.js +241 -0
  24. package/payload/platform/plugins/memory/mcp/dist/tools/memory-ingest.js.map +1 -1
  25. package/payload/platform/plugins/memory/references/schema-base.md +5 -2
  26. package/payload/platform/plugins/whatsapp-import/PLUGIN.md +17 -15
  27. package/payload/platform/plugins/whatsapp-import/bin/ingest.mjs +313 -366
  28. package/payload/platform/plugins/whatsapp-import/bin/whatsapp-ingest.sh +27 -60
  29. package/payload/platform/plugins/whatsapp-import/lib/dist/delta-cursor.d.ts +18 -0
  30. package/payload/platform/plugins/whatsapp-import/lib/dist/delta-cursor.d.ts.map +1 -0
  31. package/payload/platform/plugins/whatsapp-import/lib/dist/delta-cursor.js +31 -0
  32. package/payload/platform/plugins/whatsapp-import/lib/dist/delta-cursor.js.map +1 -0
  33. package/payload/platform/plugins/whatsapp-import/lib/dist/derive-keys.d.ts +27 -12
  34. package/payload/platform/plugins/whatsapp-import/lib/dist/derive-keys.d.ts.map +1 -1
  35. package/payload/platform/plugins/whatsapp-import/lib/dist/derive-keys.js +40 -20
  36. package/payload/platform/plugins/whatsapp-import/lib/dist/derive-keys.js.map +1 -1
  37. package/payload/platform/plugins/whatsapp-import/lib/dist/index.d.ts +7 -4
  38. package/payload/platform/plugins/whatsapp-import/lib/dist/index.d.ts.map +1 -1
  39. package/payload/platform/plugins/whatsapp-import/lib/dist/index.js +9 -6
  40. package/payload/platform/plugins/whatsapp-import/lib/dist/index.js.map +1 -1
  41. package/payload/platform/plugins/whatsapp-import/lib/dist/sessionize.d.ts +25 -0
  42. package/payload/platform/plugins/whatsapp-import/lib/dist/sessionize.d.ts.map +1 -0
  43. package/payload/platform/plugins/whatsapp-import/lib/dist/sessionize.js +48 -0
  44. package/payload/platform/plugins/whatsapp-import/lib/dist/sessionize.js.map +1 -0
  45. package/payload/platform/plugins/whatsapp-import/lib/dist/to-classifier-input.d.ts +3 -0
  46. package/payload/platform/plugins/whatsapp-import/lib/dist/to-classifier-input.d.ts.map +1 -0
  47. package/payload/platform/plugins/whatsapp-import/lib/dist/to-classifier-input.js +47 -0
  48. package/payload/platform/plugins/whatsapp-import/lib/dist/to-classifier-input.js.map +1 -0
  49. package/payload/platform/plugins/whatsapp-import/lib/src/__tests__/delta-append.test.ts +163 -0
  50. package/payload/platform/plugins/whatsapp-import/lib/src/__tests__/sessionize.test.ts +91 -0
  51. package/payload/platform/plugins/whatsapp-import/lib/src/__tests__/to-classifier-input.test.ts +59 -0
  52. package/payload/platform/plugins/whatsapp-import/lib/src/delta-cursor.ts +54 -0
  53. package/payload/platform/plugins/whatsapp-import/lib/src/derive-keys.ts +55 -32
  54. package/payload/platform/plugins/whatsapp-import/lib/src/index.ts +9 -6
  55. package/payload/platform/plugins/whatsapp-import/lib/src/sessionize.ts +81 -0
  56. package/payload/platform/plugins/whatsapp-import/lib/src/to-classifier-input.ts +48 -0
  57. package/payload/platform/plugins/whatsapp-import/skills/whatsapp-import/SKILL.md +66 -73
  58. package/payload/platform/plugins/whatsapp-import/skills/whatsapp-import/references/conversation-archive-shape.md +143 -0
  59. package/payload/platform/templates/specialists/agents/database-operator.md +10 -11
  60. package/payload/server/chunk-T2OPNP3L.js +654 -0
  61. package/payload/server/cloudflare-task-tracker-CR6TL4VL.js +19 -0
  62. package/payload/server/public/assets/{admin-DOkUspG1.js → admin-BNwPsMhJ.js} +2 -2
  63. package/payload/server/public/assets/{graph-LLMJa4Ch.js → graph-N_Bw-8oT.js} +1 -1
  64. package/payload/server/public/assets/{page-DoaF3DB0.js → page-BKLGP-th.js} +1 -1
  65. package/payload/server/public/graph.html +2 -2
  66. package/payload/server/public/index.html +2 -2
  67. package/payload/server/server.js +277 -164
  68. package/payload/platform/plugins/whatsapp-import/lib/src/__tests__/filter-gate.test.ts +0 -172
  69. package/payload/platform/plugins/whatsapp-import/lib/src/__tests__/ingest-idempotence.test.ts +0 -141
  70. package/payload/platform/plugins/whatsapp-import/lib/src/filter.ts +0 -136
  71. package/payload/platform/plugins/whatsapp-import/skills/whatsapp-import-enrich/SKILL.md +0 -333
@@ -0,0 +1,163 @@
1
// Unit tests for the Task 891 identity + delta-append contracts:
// - deriveConversationIdentity: order-insensitive, account-scoped identity
// - deriveMessageContentHash: content-only cursor hash (no archive bytes)
// - findDeltaCursor: locating the re-import slice point from that hash
import { describe, it, expect } from "vitest";
import { findDeltaCursor } from "../delta-cursor.js";
import {
  deriveConversationIdentity,
  deriveMessageContentHash,
  normaliseSenderName,
} from "../derive-keys.js";
import type { ParsedLine } from "../parse-export.js";

// Minimal ParsedLine factory; sequenceIndex is irrelevant to these contracts.
function mk(dateSent: string, senderName: string, body: string): ParsedLine {
  return { dateSent, senderName, body, sequenceIndex: 0 };
}

describe("deriveConversationIdentity", () => {
  it("is stable across participant order (sorted internally)", () => {
    const a = deriveConversationIdentity({
      accountId: "acct-1",
      participantElementIds: ["1:abc:1", "1:abc:2"],
    });
    const b = deriveConversationIdentity({
      accountId: "acct-1",
      participantElementIds: ["1:abc:2", "1:abc:1"],
    });
    expect(a).toBe(b);
  });

  it("changes when accountId differs", () => {
    const a = deriveConversationIdentity({
      accountId: "acct-1",
      participantElementIds: ["1:abc:1", "1:abc:2"],
    });
    const b = deriveConversationIdentity({
      accountId: "acct-2",
      participantElementIds: ["1:abc:1", "1:abc:2"],
    });
    expect(a).not.toBe(b);
  });

  it("rejects empty participant array", () => {
    expect(() =>
      deriveConversationIdentity({
        accountId: "acct-1",
        participantElementIds: [],
      }),
    ).toThrow(/non-empty/);
  });

  it("rejects empty accountId", () => {
    expect(() =>
      deriveConversationIdentity({
        accountId: "",
        participantElementIds: ["1:abc:1"],
      }),
    ).toThrow(/accountId/);
  });

  it("treats DM and group identically (same formula, different array length)", () => {
    const dm = deriveConversationIdentity({
      accountId: "acct-1",
      participantElementIds: ["1:a:1", "1:a:2"],
    });
    const group = deriveConversationIdentity({
      accountId: "acct-1",
      participantElementIds: ["1:a:1", "1:a:2", "1:a:3"],
    });
    expect(dm).not.toBe(group);
    // Both must hex-decode to a sha256 (64 hex chars).
    expect(dm).toMatch(/^[a-f0-9]{64}$/);
    expect(group).toMatch(/^[a-f0-9]{64}$/);
  });
});

describe("deriveMessageContentHash", () => {
  it("is content-only — no archive bytes contribute", () => {
    const a = deriveMessageContentHash({
      dateSent: "2026-03-14T10:00:00+00:00",
      senderName: "Joel",
      body: "hi",
    });
    const b = deriveMessageContentHash({
      dateSent: "2026-03-14T10:00:00+00:00",
      senderName: "Joel",
      body: "hi",
    });
    expect(a).toBe(b);
  });

  it("changes when any field changes", () => {
    const base = {
      dateSent: "2026-03-14T10:00:00+00:00",
      senderName: "Joel",
      body: "hi",
    };
    expect(deriveMessageContentHash(base)).not.toBe(
      deriveMessageContentHash({ ...base, body: "hello" }),
    );
    expect(deriveMessageContentHash(base)).not.toBe(
      deriveMessageContentHash({ ...base, senderName: "Adam" }),
    );
    expect(deriveMessageContentHash(base)).not.toBe(
      deriveMessageContentHash({ ...base, dateSent: "2026-03-14T10:00:01+00:00" }),
    );
  });

  it("collapses NFKC-equivalent sender names to one hash", () => {
    const a = deriveMessageContentHash({
      dateSent: "2026-03-14T10:00:00+00:00",
      senderName: " ADAM Mackay ",
      body: "hi",
    });
    const b = deriveMessageContentHash({
      dateSent: "2026-03-14T10:00:00+00:00",
      senderName: "adam mackay",
      body: "hi",
    });
    expect(a).toBe(b);
  });
});

describe("normaliseSenderName", () => {
  it("returns NFKC-trim-lower form", () => {
    expect(normaliseSenderName(" Adam Mackay ")).toBe("adam mackay");
  });
});

describe("findDeltaCursor", () => {
  // Three chronological lines; the cursor hash is derived the same way the
  // orchestrator derives it from :ConversationArchive.lastIngestedMessageHash.
  const lines: ParsedLine[] = [
    mk("2026-03-14T10:00:00+00:00", "Joel", "first"),
    mk("2026-03-14T10:01:00+00:00", "Adam", "second"),
    mk("2026-03-14T10:02:00+00:00", "Joel", "third"),
  ];
  const hashFor = (l: ParsedLine) =>
    deriveMessageContentHash({
      dateSent: l.dateSent,
      senderName: l.senderName,
      body: l.body,
    });

  it("returns 'found' with deltaStart = cursor + 1 when the hash matches a prior line", () => {
    const result = findDeltaCursor(lines, hashFor(lines[0]));
    expect(result).toEqual({ kind: "found", deltaStart: 1 });
  });

  it("returns 'empty' when the cursor is the last line (no new messages)", () => {
    const result = findDeltaCursor(lines, hashFor(lines[2]));
    expect(result).toEqual({ kind: "empty" });
  });

  it("returns 'missing' when no parsed line matches (LOUD-FAIL upstream)", () => {
    const result = findDeltaCursor(lines, "deadbeef".repeat(8));
    expect(result).toEqual({ kind: "missing" });
  });

  it("returns 'missing' for an empty parsed sequence (cursor cannot exist)", () => {
    const result = findDeltaCursor([], hashFor(lines[0]));
    expect(result).toEqual({ kind: "missing" });
  });

  it("rejects empty cursor hash", () => {
    expect(() => findDeltaCursor(lines, "")).toThrow(/non-empty/);
    expect(() => findDeltaCursor(lines, " ")).toThrow(/non-empty/);
  });
});
@@ -0,0 +1,91 @@
1
// Unit tests for sessionize (Task 891 Pass 1): deterministic gap-cut
// sessioning, including the exact-at-threshold boundary (gap == gapHours
// cuts) and session index/boundary-timestamp emission.
import { describe, it, expect } from "vitest";
import { sessionize } from "../sessionize.js";
import type { ParsedLine } from "../parse-export.js";

// Single-sender factory; only dateSent matters for gap-cut behaviour.
function mk(dateSent: string, body = "x"): ParsedLine {
  return { senderName: "Joel", dateSent, body, sequenceIndex: 0 };
}

describe("sessionize", () => {
  it("returns [] for empty input", () => {
    expect(sessionize([], 12)).toEqual([]);
  });

  it("returns one one-message session for a single message", () => {
    const out = sessionize([mk("2026-03-14T10:00:00+00:00")], 12);
    expect(out).toHaveLength(1);
    expect(out[0].messages).toHaveLength(1);
    expect(out[0].index).toBe(0);
    expect(out[0].firstMessageAt).toBe("2026-03-14T10:00:00+00:00");
    expect(out[0].lastMessageAt).toBe("2026-03-14T10:00:00+00:00");
  });

  it("groups messages within the gap into one session", () => {
    const messages = [
      mk("2026-03-14T10:00:00+00:00"),
      mk("2026-03-14T11:00:00+00:00"),
      mk("2026-03-14T12:00:00+00:00"),
    ];
    const out = sessionize(messages, 12);
    expect(out).toHaveLength(1);
    expect(out[0].messages).toHaveLength(3);
  });

  it("cuts at gap > gapHours", () => {
    const messages = [
      mk("2026-03-14T10:00:00+00:00"),
      mk("2026-03-14T11:00:00+00:00"),
      mk("2026-03-14T23:30:00+00:00"), // 12.5h gap from previous
      mk("2026-03-15T00:00:00+00:00"),
    ];
    const out = sessionize(messages, 12);
    expect(out).toHaveLength(2);
    expect(out[0].messages).toHaveLength(2);
    expect(out[1].messages).toHaveLength(2);
    expect(out[1].index).toBe(1);
  });

  it("cuts at exact-at-threshold (gap == gapHours triggers a cut)", () => {
    const messages = [
      mk("2026-03-14T10:00:00+00:00"),
      mk("2026-03-14T22:00:00+00:00"), // exactly 12h gap
    ];
    const out = sessionize(messages, 12);
    expect(out).toHaveLength(2);
  });

  it("does not cut at gap just under threshold", () => {
    const messages = [
      mk("2026-03-14T10:00:00+00:00"),
      mk("2026-03-14T21:59:59+00:00"), // 11h59m59s gap
    ];
    const out = sessionize(messages, 12);
    expect(out).toHaveLength(1);
  });

  it("emits firstMessageAt / lastMessageAt from the session boundary messages", () => {
    const messages = [
      mk("2026-03-14T10:00:00+00:00"),
      mk("2026-03-14T11:00:00+00:00"),
      mk("2026-03-14T12:00:00+00:00"),
    ];
    const out = sessionize(messages, 12);
    expect(out[0].firstMessageAt).toBe("2026-03-14T10:00:00+00:00");
    expect(out[0].lastMessageAt).toBe("2026-03-14T12:00:00+00:00");
  });

  it("rejects non-positive gapHours", () => {
    expect(() => sessionize([mk("2026-03-14T10:00:00+00:00")], 0)).toThrow(/positive/);
    expect(() => sessionize([mk("2026-03-14T10:00:00+00:00")], -1)).toThrow(/positive/);
  });

  it("indexes sessions starting at 0 in chronological order", () => {
    const messages = [
      mk("2026-03-14T10:00:00+00:00"),
      mk("2026-03-15T10:00:00+00:00"), // 24h gap
      mk("2026-03-16T10:00:00+00:00"), // 24h gap
    ];
    const out = sessionize(messages, 12);
    expect(out.map((s) => s.index)).toEqual([0, 1, 2]);
  });
});
@@ -0,0 +1,59 @@
1
// Unit tests for toClassifierInput (Task 891 Pass 2 formatter): the
// `[YYYY-MM-DD HH:MM:SS ±HH:MM] Sender: body` rendering, offset handling,
// verbatim multi-line bodies, and the defensive raw-ISO fallback.
import { describe, it, expect } from "vitest";
import { toClassifierInput } from "../to-classifier-input.js";
import type { Session } from "../sessionize.js";

// Build a one-session fixture whose boundary timestamps mirror its messages.
function mkSession(messages: Session["messages"]): Session {
  return {
    index: 0,
    firstMessageAt: messages[0].dateSent,
    lastMessageAt: messages[messages.length - 1].dateSent,
    messages,
  };
}

describe("toClassifierInput", () => {
  it("renders one line per message in `[ts] Sender: body` form", () => {
    const session = mkSession([
      { senderName: "Joel", dateSent: "2026-03-14T10:00:00+00:00", body: "hi", sequenceIndex: 0 },
      { senderName: "Adam", dateSent: "2026-03-14T10:01:00+00:00", body: "hey", sequenceIndex: 1 },
    ]);
    expect(toClassifierInput(session)).toBe(
      "[2026-03-14 10:00:00 +00:00] Joel: hi\n[2026-03-14 10:01:00 +00:00] Adam: hey",
    );
  });

  it("preserves the operator's offset in the formatted timestamp", () => {
    const session = mkSession([
      { senderName: "Joel", dateSent: "2026-03-14T10:00:00+01:00", body: "hi", sequenceIndex: 0 },
    ]);
    expect(toClassifierInput(session)).toBe("[2026-03-14 10:00:00 +01:00] Joel: hi");
  });

  it("normalises a 'Z' offset to +00:00 in the rendered form", () => {
    const session = mkSession([
      { senderName: "Joel", dateSent: "2026-03-14T10:00:00Z", body: "hi", sequenceIndex: 0 },
    ]);
    expect(toClassifierInput(session)).toBe("[2026-03-14 10:00:00 +00:00] Joel: hi");
  });

  it("preserves multi-line message bodies verbatim (no escape, internal newlines retained)", () => {
    const session = mkSession([
      {
        senderName: "Joel",
        dateSent: "2026-03-14T10:00:00+00:00",
        body: "line one\nline two",
        sequenceIndex: 0,
      },
    ]);
    expect(toClassifierInput(session)).toBe(
      "[2026-03-14 10:00:00 +00:00] Joel: line one\nline two",
    );
  });

  it("falls back to the raw ISO when the timestamp shape does not match (defensive)", () => {
    const session = mkSession([
      { senderName: "Joel", dateSent: "not-an-iso", body: "hi", sequenceIndex: 0 },
    ]);
    expect(toClassifierInput(session)).toBe("[not-an-iso] Joel: hi");
  });
});
@@ -0,0 +1,54 @@
1
+ import type { ParsedLine } from "./parse-export.js";
2
+ import { deriveMessageContentHash } from "./derive-keys.js";
3
+
4
+ // ---------------------------------------------------------------------------
5
+ // delta-cursor — locate the cursor for delta-append re-imports (Task 891).
6
+ //
7
+ // Pure function. Given a parsed re-export and the `lastIngestedMessageHash`
8
+ // recorded on the prior `:ConversationArchive`, find the position of the
9
+ // matching message in the parsed sequence and return `index + 1` (the slice
10
+ // start for the delta). Three outcomes:
11
+ //
12
+ // { kind: "found", deltaStart } → slice parsedLines from deltaStart
13
+ // { kind: "empty" } → cursor is the last line; no delta
14
+ // { kind: "missing" } → cursor not found (LOUD-FAIL upstream)
15
+ //
16
+ // `missing` covers both the "operator deleted prior messages from the
17
+ // re-export" case and the "operator imported a different chat archive"
18
+ // case. The orchestrator emits `[whatsapp-import] FAIL delta-cursor-missing`
19
+ // and exits non-zero when this surfaces.
20
+ // ---------------------------------------------------------------------------
21
+
22
+ export type CursorResult =
23
+ | { kind: "found"; deltaStart: number }
24
+ | { kind: "empty" }
25
+ | { kind: "missing" };
26
+
27
+ /**
28
+ * Walk parsed lines forward and return the first index whose content hash
29
+ * matches `lastIngestedMessageHash`. The first match is correct because
30
+ * messages with identical (dateSent, normalisedSenderName, body) tuples
31
+ * are genuine duplicates — there is no way to disambiguate them and slicing
32
+ * after the first occurrence is the chronologically safe choice.
33
+ */
34
+ export function findDeltaCursor(
35
+ parsedLines: readonly ParsedLine[],
36
+ lastIngestedMessageHash: string,
37
+ ): CursorResult {
38
+ if (!lastIngestedMessageHash || !lastIngestedMessageHash.trim()) {
39
+ throw new Error("findDeltaCursor: lastIngestedMessageHash must be non-empty");
40
+ }
41
+ for (let i = 0; i < parsedLines.length; i++) {
42
+ const line = parsedLines[i];
43
+ const hash = deriveMessageContentHash({
44
+ dateSent: line.dateSent,
45
+ senderName: line.senderName,
46
+ body: line.body,
47
+ });
48
+ if (hash === lastIngestedMessageHash) {
49
+ if (i === parsedLines.length - 1) return { kind: "empty" };
50
+ return { kind: "found", deltaStart: i + 1 };
51
+ }
52
+ }
53
+ return { kind: "missing" };
54
+ }
@@ -1,22 +1,23 @@
1
1
  import { createHash } from "node:crypto";
2
2
 
3
3
  // ---------------------------------------------------------------------------
4
- // derive-keys — natural-key derivation for whatsapp-import (Task 870).
4
+ // derive-keys — natural-key derivation for whatsapp-import (Task 891,
5
+ // supersedes Task 870's per-message contract).
5
6
  //
6
7
  // Pure functions. No I/O. The whole point is that re-imports of the same
7
- // archive collapse to the same Message identity regardless of release-level
8
- // drift in array indices, hash widths, or arbitrary tiebreakers.
8
+ // archive collapse to the same identity regardless of release-level drift in
9
+ // chunk indices, hash widths, or arbitrary tiebreakers.
9
10
  //
10
- // Key shape (Task 870 brief):
11
+ // Identity contracts (Task 891 brief):
11
12
  //
12
- // messageId = whatsapp-export:msg:<conversationSha256>:<dateSentISO>
13
- // :<NFKC-trim-lower(senderName)>
14
- // :<sha256-hex(body)>
13
+ // conversationIdentity = sha256(accountId + ":" + sortedParticipantElementIds.join(","))
14
+ // messageContentHash = sha256(dateSent + "|" + NFKC-trim-lower(senderName) + "|" + body)
15
15
  //
16
- // Operator constraint: the same archive must be re-imported with the same
17
- // `--timezone` flag. Different timezones reinterpret wall-clock instants and
18
- // will produce drifted messageIds that is correct semantics, not a bug.
19
- // Documented in .docs/whatsapp.md natural-key contract section.
16
+ // `conversationIdentity` is stable across re-exports: same operator + same
17
+ // participant set ⇒ same identity, regardless of file bytes. DM and group
18
+ // follow the same formula; the difference is the participant array length.
19
+ // `messageContentHash` is content-only (no archive sha256, no chunk index)
20
+ // so cursor lookup survives a fresh re-export of the same chat.
20
21
  // ---------------------------------------------------------------------------
21
22
 
22
23
  export function normaliseSenderName(name: string): string {
@@ -27,33 +28,55 @@ export function sha256Hex(input: string): string {
27
28
  return createHash("sha256").update(input).digest("hex");
28
29
  }
29
30
 
30
- export interface DeriveMessageIdInput {
31
- /** SHA-256 of the source `_chat.txt` bytes — stable across re-imports. */
32
- conversationSha256: string;
31
+ export interface DeriveConversationIdentityInput {
32
+ accountId: string;
33
+ /**
34
+ * Element IDs of every confirmed participant (owner + others). Order is
35
+ * not significant; the function sorts internally so the same set always
36
+ * produces the same identity.
37
+ */
38
+ participantElementIds: readonly string[];
39
+ }
40
+
41
+ /**
42
+ * Compute the stable identity for a conversation. Same accountId + same
43
+ * participant set ⇒ same identity, regardless of message content or export
44
+ * file bytes. DM and group chats use this identical formula.
45
+ */
46
+ export function deriveConversationIdentity(
47
+ input: DeriveConversationIdentityInput,
48
+ ): string {
49
+ if (!input.accountId || !input.accountId.trim()) {
50
+ throw new Error("deriveConversationIdentity: accountId is required");
51
+ }
52
+ if (input.participantElementIds.length === 0) {
53
+ throw new Error("deriveConversationIdentity: participantElementIds must be non-empty");
54
+ }
55
+ const sorted = [...input.participantElementIds].sort();
56
+ return sha256Hex(`${input.accountId}:${sorted.join(",")}`);
57
+ }
58
+
59
+ export interface DeriveMessageContentHashInput {
33
60
  /** ISO 8601 with timezone offset, as emitted by parseExport. */
34
61
  dateSent: string;
35
62
  /** Raw senderName from the export line. Normalised internally. */
36
63
  senderName: string;
37
- /** Raw message body. Hashed internally. */
64
+ /** Raw message body. */
38
65
  body: string;
39
66
  }
40
67
 
41
- export function deriveMessageId(input: DeriveMessageIdInput): string {
68
+ /**
69
+ * Compute a content-only hash for a single message. Used as the delta-append
70
+ * cursor: `:ConversationArchive.lastIngestedMessageHash` records the hash of
71
+ * the last ingested message; on re-import, the orchestrator finds the line
72
+ * with the matching hash and slices everything after it.
73
+ *
74
+ * Excludes archive sha256 deliberately — the cursor must survive a fresh
75
+ * re-export of the same chat (different file bytes, same message tuples).
76
+ */
77
+ export function deriveMessageContentHash(
78
+ input: DeriveMessageContentHashInput,
79
+ ): string {
42
80
  const norm = normaliseSenderName(input.senderName);
43
- const bodyHash = sha256Hex(input.body);
44
- return `whatsapp-export:msg:${input.conversationSha256}:${input.dateSent}:${norm}:${bodyHash}`;
45
- }
46
-
47
- export interface ObservationContentFields {
48
- summary?: string | null;
49
- from?: string | null;
50
- to?: string | null;
51
- subject?: string | null;
52
- }
53
-
54
- export function observationContentHash(fields: ObservationContentFields): string {
55
- const parts = [fields.summary, fields.from, fields.to, fields.subject].map(
56
- (p) => (p ?? "").normalize("NFKC").trim().toLowerCase(),
57
- );
58
- return sha256Hex(parts.join("|"));
81
+ return sha256Hex(`${input.dateSent}|${norm}|${input.body}`);
59
82
  }
@@ -5,15 +5,18 @@ export type {
5
5
  ParseExportCounters,
6
6
  ParsedLine,
7
7
  } from "./parse-export.js";
8
- export { parseFilterArg, applyFilter } from "./filter.js";
9
- export type { Filter } from "./filter.js";
10
8
  export {
11
9
  normaliseSenderName,
12
10
  sha256Hex,
13
- deriveMessageId,
14
- observationContentHash,
11
+ deriveConversationIdentity,
12
+ deriveMessageContentHash,
15
13
  } from "./derive-keys.js";
16
14
  export type {
17
- DeriveMessageIdInput,
18
- ObservationContentFields,
15
+ DeriveConversationIdentityInput,
16
+ DeriveMessageContentHashInput,
19
17
  } from "./derive-keys.js";
18
+ export { sessionize } from "./sessionize.js";
19
+ export type { Session } from "./sessionize.js";
20
+ export { toClassifierInput } from "./to-classifier-input.js";
21
+ export { findDeltaCursor } from "./delta-cursor.js";
22
+ export type { CursorResult } from "./delta-cursor.js";
@@ -0,0 +1,81 @@
1
+ import type { ParsedLine } from "./parse-export.js";
2
+
3
+ // ---------------------------------------------------------------------------
4
+ // sessionize — Pass 1 of the chunked-archive pipeline (Task 891).
5
+ //
6
+ // Pure function. Splits a chronologically-ordered sequence of parsed messages
7
+ // into "sessions" wherever the gap between consecutive `dateSent` values
8
+ // exceeds `gapHours`. Each session feeds memory-classify (mode='chat') as a
9
+ // turn-attributed block; the LLM then chunks each session into one or more
10
+ // `:Section:Conversation` rows with summary+keywords (Pass 2).
11
+ //
12
+ // Why deterministic gap-cut, not LLM topic detection:
13
+ // - The natural cadence of human chat (sleep, working hours, weekend gaps)
14
+ // produces clean session boundaries that the operator can intuit.
15
+ // - LLM-only chunking against a 10K-message archive sends 10K messages into
16
+ // one prompt and pays attention only to the last 1K — gap-cut bounds the
17
+ // window before the LLM ever sees it.
18
+ // - The default 12h gap matches one sleep cycle: messages on the same day
19
+ // belong together; a 14-hour gap (last evening message → next morning) is
20
+ // a fresh session even when the topic is identical.
21
+ // ---------------------------------------------------------------------------
22
+
23
+ export interface Session {
24
+ /** 0-based index across the archive's sessions. */
25
+ index: number;
26
+ /** ISO 8601 timestamp of the first message in the session. */
27
+ firstMessageAt: string;
28
+ /** ISO 8601 timestamp of the last message in the session. */
29
+ lastMessageAt: string;
30
+ /** Messages in the session, chronological. */
31
+ messages: ParsedLine[];
32
+ }
33
+
34
+ /**
35
+ * Split parsed messages into sessions on gaps ≥ `gapHours`. Input must be
36
+ * pre-sorted by `dateSent` (parse-export emits in file order, which IS
37
+ * chronological for any well-formed `_chat.txt`).
38
+ *
39
+ * Boundary semantics (exact-at-threshold):
40
+ * gap == gapHours → cut here (start a new session)
41
+ * gap < gapHours → same session
42
+ * gap > gapHours → cut here
43
+ *
44
+ * Empty input returns []; single-message input returns one one-message session.
45
+ */
46
+ export function sessionize(
47
+ messages: readonly ParsedLine[],
48
+ gapHours: number,
49
+ ): Session[] {
50
+ if (gapHours <= 0) {
51
+ throw new Error(`sessionize: gapHours must be positive, got ${gapHours}`);
52
+ }
53
+ if (messages.length === 0) return [];
54
+
55
+ const gapMs = gapHours * 60 * 60 * 1000;
56
+ const sessions: Session[] = [];
57
+ let currentMessages: ParsedLine[] = [messages[0]];
58
+
59
+ const flush = () => {
60
+ sessions.push({
61
+ index: sessions.length,
62
+ firstMessageAt: currentMessages[0].dateSent,
63
+ lastMessageAt: currentMessages[currentMessages.length - 1].dateSent,
64
+ messages: currentMessages,
65
+ });
66
+ };
67
+
68
+ for (let i = 1; i < messages.length; i++) {
69
+ const prevMs = Date.parse(messages[i - 1].dateSent);
70
+ const currMs = Date.parse(messages[i].dateSent);
71
+ const gap = currMs - prevMs;
72
+ if (gap >= gapMs) {
73
+ flush();
74
+ currentMessages = [messages[i]];
75
+ } else {
76
+ currentMessages.push(messages[i]);
77
+ }
78
+ }
79
+ flush();
80
+ return sessions;
81
+ }
@@ -0,0 +1,48 @@
1
+ import type { Session } from "./sessionize.js";
2
+
3
+ // ---------------------------------------------------------------------------
4
+ // to-classifier-input — Pass 2 input formatter (Task 891).
5
+ //
6
+ // Pure function. Renders one Session as a turn-attributed text block ready
7
+ // to hand to memory-classify (mode='chat'). Format:
8
+ //
9
+ // [YYYY-MM-DD HH:MM:SS TZ] <Sender>: <body>
10
+ // [YYYY-MM-DD HH:MM:SS TZ] <Sender>: <body>
11
+ // ...
12
+ //
13
+ // Multi-line message bodies are kept verbatim (with their internal newlines).
14
+ // The leading `[ts] <Sender>: ` prefix is the only structural addition; the
15
+ // classifier prompt instructs Haiku to preserve it in the chunk `body` so
16
+ // downstream Phase 2 work can recover per-message provenance via snippet
17
+ // matching against the conversation tail.
18
+ //
19
+ // Timezone: each `dateSent` ISO already carries an offset (set by parseExport
20
+ // from the operator's confirmed IANA zone). The renderer prints the
21
+ // human-readable wall-clock for that offset; the trailing "TZ" suffix is the
22
+ // offset itself, not a zone name.
23
+ // ---------------------------------------------------------------------------
24
+
25
+ export function toClassifierInput(session: Session): string {
26
+ const lines: string[] = [];
27
+ for (const m of session.messages) {
28
+ lines.push(`[${formatWallClock(m.dateSent)}] ${m.senderName}: ${m.body}`);
29
+ }
30
+ return lines.join("\n");
31
+ }
32
+
33
+ /**
34
+ * Format an ISO 8601 instant with offset as `YYYY-MM-DD HH:MM:SS ±HH:MM`,
35
+ * preserving the offset that the parser set from the operator's IANA zone.
36
+ * The wall-clock components are read directly from the ISO string — no
37
+ * Date construction (which would re-interpret in the local zone).
38
+ */
39
+ function formatWallClock(iso: string): string {
40
+ // ISO from parse-export is shaped: "YYYY-MM-DDTHH:MM:SS±HH:MM" (or "Z").
41
+ const m = iso.match(
42
+ /^(\d{4})-(\d{2})-(\d{2})T(\d{2}):(\d{2}):(\d{2})(?:\.\d+)?(Z|[+-]\d{2}:?\d{2})$/,
43
+ );
44
+ if (!m) return iso; // surface the raw value if the shape drifted; pure function never throws on caller-supplied data
45
+ const [, y, mo, d, h, mi, s, off] = m;
46
+ const offsetLabel = off === "Z" ? "+00:00" : off;
47
+ return `${y}-${mo}-${d} ${h}:${mi}:${s} ${offsetLabel}`;
48
+ }