@rubytech/create-realagent 1.0.826 → 1.0.828
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/payload/platform/neo4j/schema.cypher +34 -2
- package/payload/platform/plugins/admin/hooks/archive-ingest-surface-gate.sh +19 -13
- package/payload/platform/plugins/admin/skills/onboarding/SKILL.md +5 -5
- package/payload/platform/plugins/docs/references/cloudflare.md +1 -1
- package/payload/platform/plugins/docs/references/plugins-guide.md +1 -1
- package/payload/platform/plugins/docs/references/troubleshooting.md +1 -0
- package/payload/platform/plugins/memory/PLUGIN.md +1 -1
- package/payload/platform/plugins/memory/mcp/dist/index.js +6 -41
- package/payload/platform/plugins/memory/mcp/dist/index.js.map +1 -1
- package/payload/platform/plugins/memory/mcp/dist/lib/__tests__/llm-classifier.test.js +51 -0
- package/payload/platform/plugins/memory/mcp/dist/lib/__tests__/llm-classifier.test.js.map +1 -1
- package/payload/platform/plugins/memory/mcp/dist/lib/llm-classifier.d.ts +19 -4
- package/payload/platform/plugins/memory/mcp/dist/lib/llm-classifier.d.ts.map +1 -1
- package/payload/platform/plugins/memory/mcp/dist/lib/llm-classifier.js +139 -56
- package/payload/platform/plugins/memory/mcp/dist/lib/llm-classifier.js.map +1 -1
- package/payload/platform/plugins/memory/mcp/dist/tools/__tests__/memory-ingest.test.d.ts +2 -0
- package/payload/platform/plugins/memory/mcp/dist/tools/__tests__/memory-ingest.test.d.ts.map +1 -0
- package/payload/platform/plugins/memory/mcp/dist/tools/__tests__/memory-ingest.test.js +61 -0
- package/payload/platform/plugins/memory/mcp/dist/tools/__tests__/memory-ingest.test.js.map +1 -0
- package/payload/platform/plugins/memory/mcp/dist/tools/memory-ingest.d.ts +34 -0
- package/payload/platform/plugins/memory/mcp/dist/tools/memory-ingest.d.ts.map +1 -1
- package/payload/platform/plugins/memory/mcp/dist/tools/memory-ingest.js +241 -0
- package/payload/platform/plugins/memory/mcp/dist/tools/memory-ingest.js.map +1 -1
- package/payload/platform/plugins/memory/references/schema-base.md +5 -2
- package/payload/platform/plugins/whatsapp-import/PLUGIN.md +17 -15
- package/payload/platform/plugins/whatsapp-import/bin/ingest.mjs +313 -366
- package/payload/platform/plugins/whatsapp-import/bin/whatsapp-ingest.sh +27 -60
- package/payload/platform/plugins/whatsapp-import/lib/dist/delta-cursor.d.ts +18 -0
- package/payload/platform/plugins/whatsapp-import/lib/dist/delta-cursor.d.ts.map +1 -0
- package/payload/platform/plugins/whatsapp-import/lib/dist/delta-cursor.js +31 -0
- package/payload/platform/plugins/whatsapp-import/lib/dist/delta-cursor.js.map +1 -0
- package/payload/platform/plugins/whatsapp-import/lib/dist/derive-keys.d.ts +27 -12
- package/payload/platform/plugins/whatsapp-import/lib/dist/derive-keys.d.ts.map +1 -1
- package/payload/platform/plugins/whatsapp-import/lib/dist/derive-keys.js +40 -20
- package/payload/platform/plugins/whatsapp-import/lib/dist/derive-keys.js.map +1 -1
- package/payload/platform/plugins/whatsapp-import/lib/dist/index.d.ts +7 -4
- package/payload/platform/plugins/whatsapp-import/lib/dist/index.d.ts.map +1 -1
- package/payload/platform/plugins/whatsapp-import/lib/dist/index.js +9 -6
- package/payload/platform/plugins/whatsapp-import/lib/dist/index.js.map +1 -1
- package/payload/platform/plugins/whatsapp-import/lib/dist/sessionize.d.ts +25 -0
- package/payload/platform/plugins/whatsapp-import/lib/dist/sessionize.d.ts.map +1 -0
- package/payload/platform/plugins/whatsapp-import/lib/dist/sessionize.js +48 -0
- package/payload/platform/plugins/whatsapp-import/lib/dist/sessionize.js.map +1 -0
- package/payload/platform/plugins/whatsapp-import/lib/dist/to-classifier-input.d.ts +3 -0
- package/payload/platform/plugins/whatsapp-import/lib/dist/to-classifier-input.d.ts.map +1 -0
- package/payload/platform/plugins/whatsapp-import/lib/dist/to-classifier-input.js +47 -0
- package/payload/platform/plugins/whatsapp-import/lib/dist/to-classifier-input.js.map +1 -0
- package/payload/platform/plugins/whatsapp-import/lib/src/__tests__/delta-append.test.ts +163 -0
- package/payload/platform/plugins/whatsapp-import/lib/src/__tests__/sessionize.test.ts +91 -0
- package/payload/platform/plugins/whatsapp-import/lib/src/__tests__/to-classifier-input.test.ts +59 -0
- package/payload/platform/plugins/whatsapp-import/lib/src/delta-cursor.ts +54 -0
- package/payload/platform/plugins/whatsapp-import/lib/src/derive-keys.ts +55 -32
- package/payload/platform/plugins/whatsapp-import/lib/src/index.ts +9 -6
- package/payload/platform/plugins/whatsapp-import/lib/src/sessionize.ts +81 -0
- package/payload/platform/plugins/whatsapp-import/lib/src/to-classifier-input.ts +48 -0
- package/payload/platform/plugins/whatsapp-import/skills/whatsapp-import/SKILL.md +66 -73
- package/payload/platform/plugins/whatsapp-import/skills/whatsapp-import/references/conversation-archive-shape.md +143 -0
- package/payload/platform/templates/specialists/agents/database-operator.md +10 -11
- package/payload/server/chunk-T2OPNP3L.js +654 -0
- package/payload/server/cloudflare-task-tracker-CR6TL4VL.js +19 -0
- package/payload/server/public/assets/{admin-DOkUspG1.js → admin-BNwPsMhJ.js} +2 -2
- package/payload/server/public/assets/{graph-LLMJa4Ch.js → graph-N_Bw-8oT.js} +1 -1
- package/payload/server/public/assets/{page-DoaF3DB0.js → page-BKLGP-th.js} +1 -1
- package/payload/server/public/graph.html +2 -2
- package/payload/server/public/index.html +2 -2
- package/payload/server/server.js +277 -164
- package/payload/platform/plugins/whatsapp-import/lib/src/__tests__/filter-gate.test.ts +0 -172
- package/payload/platform/plugins/whatsapp-import/lib/src/__tests__/ingest-idempotence.test.ts +0 -141
- package/payload/platform/plugins/whatsapp-import/lib/src/filter.ts +0 -136
- package/payload/platform/plugins/whatsapp-import/skills/whatsapp-import-enrich/SKILL.md +0 -333
|
@@ -0,0 +1,163 @@
|
|
|
1
|
+
// Tests for the Task-891 delta-append identity contracts:
// - deriveConversationIdentity: stable, order-insensitive conversation key
// - deriveMessageContentHash: content-only per-message hash (the cursor)
// - findDeltaCursor: locating the re-import slice point from that hash
import { describe, it, expect } from "vitest";
import { findDeltaCursor } from "../delta-cursor.js";
import {
  deriveConversationIdentity,
  deriveMessageContentHash,
  normaliseSenderName,
} from "../derive-keys.js";
import type { ParsedLine } from "../parse-export.js";

// Minimal ParsedLine factory; sequenceIndex is irrelevant to these contracts.
function mk(dateSent: string, senderName: string, body: string): ParsedLine {
  return { dateSent, senderName, body, sequenceIndex: 0 };
}

describe("deriveConversationIdentity", () => {
  it("is stable across participant order (sorted internally)", () => {
    const a = deriveConversationIdentity({
      accountId: "acct-1",
      participantElementIds: ["1:abc:1", "1:abc:2"],
    });
    const b = deriveConversationIdentity({
      accountId: "acct-1",
      participantElementIds: ["1:abc:2", "1:abc:1"],
    });
    expect(a).toBe(b);
  });

  it("changes when accountId differs", () => {
    const a = deriveConversationIdentity({
      accountId: "acct-1",
      participantElementIds: ["1:abc:1", "1:abc:2"],
    });
    const b = deriveConversationIdentity({
      accountId: "acct-2",
      participantElementIds: ["1:abc:1", "1:abc:2"],
    });
    expect(a).not.toBe(b);
  });

  it("rejects empty participant array", () => {
    expect(() =>
      deriveConversationIdentity({
        accountId: "acct-1",
        participantElementIds: [],
      }),
    ).toThrow(/non-empty/);
  });

  it("rejects empty accountId", () => {
    expect(() =>
      deriveConversationIdentity({
        accountId: "",
        participantElementIds: ["1:abc:1"],
      }),
    ).toThrow(/accountId/);
  });

  it("treats DM and group identically (same formula, different array length)", () => {
    const dm = deriveConversationIdentity({
      accountId: "acct-1",
      participantElementIds: ["1:a:1", "1:a:2"],
    });
    const group = deriveConversationIdentity({
      accountId: "acct-1",
      participantElementIds: ["1:a:1", "1:a:2", "1:a:3"],
    });
    expect(dm).not.toBe(group);
    // Both must hex-decode to a sha256 (64 hex chars).
    expect(dm).toMatch(/^[a-f0-9]{64}$/);
    expect(group).toMatch(/^[a-f0-9]{64}$/);
  });
});

describe("deriveMessageContentHash", () => {
  it("is content-only — no archive bytes contribute", () => {
    const a = deriveMessageContentHash({
      dateSent: "2026-03-14T10:00:00+00:00",
      senderName: "Joel",
      body: "hi",
    });
    const b = deriveMessageContentHash({
      dateSent: "2026-03-14T10:00:00+00:00",
      senderName: "Joel",
      body: "hi",
    });
    expect(a).toBe(b);
  });

  it("changes when any field changes", () => {
    const base = {
      dateSent: "2026-03-14T10:00:00+00:00",
      senderName: "Joel",
      body: "hi",
    };
    expect(deriveMessageContentHash(base)).not.toBe(
      deriveMessageContentHash({ ...base, body: "hello" }),
    );
    expect(deriveMessageContentHash(base)).not.toBe(
      deriveMessageContentHash({ ...base, senderName: "Adam" }),
    );
    expect(deriveMessageContentHash(base)).not.toBe(
      deriveMessageContentHash({ ...base, dateSent: "2026-03-14T10:00:01+00:00" }),
    );
  });

  it("collapses NFKC-equivalent sender names to one hash", () => {
    const a = deriveMessageContentHash({
      dateSent: "2026-03-14T10:00:00+00:00",
      senderName: " ADAM Mackay ",
      body: "hi",
    });
    const b = deriveMessageContentHash({
      dateSent: "2026-03-14T10:00:00+00:00",
      senderName: "adam mackay",
      body: "hi",
    });
    expect(a).toBe(b);
  });
});

describe("normaliseSenderName", () => {
  it("returns NFKC-trim-lower form", () => {
    expect(normaliseSenderName(" Adam Mackay ")).toBe("adam mackay");
  });
});

describe("findDeltaCursor", () => {
  // Three-message fixture: cursor tests slice before/at/after the end.
  const lines: ParsedLine[] = [
    mk("2026-03-14T10:00:00+00:00", "Joel", "first"),
    mk("2026-03-14T10:01:00+00:00", "Adam", "second"),
    mk("2026-03-14T10:02:00+00:00", "Joel", "third"),
  ];
  const hashFor = (l: ParsedLine) =>
    deriveMessageContentHash({
      dateSent: l.dateSent,
      senderName: l.senderName,
      body: l.body,
    });

  it("returns 'found' with deltaStart = cursor + 1 when the hash matches a prior line", () => {
    const result = findDeltaCursor(lines, hashFor(lines[0]));
    expect(result).toEqual({ kind: "found", deltaStart: 1 });
  });

  it("returns 'empty' when the cursor is the last line (no new messages)", () => {
    const result = findDeltaCursor(lines, hashFor(lines[2]));
    expect(result).toEqual({ kind: "empty" });
  });

  it("returns 'missing' when no parsed line matches (LOUD-FAIL upstream)", () => {
    const result = findDeltaCursor(lines, "deadbeef".repeat(8));
    expect(result).toEqual({ kind: "missing" });
  });

  it("returns 'missing' for an empty parsed sequence (cursor cannot exist)", () => {
    const result = findDeltaCursor([], hashFor(lines[0]));
    expect(result).toEqual({ kind: "missing" });
  });

  it("rejects empty cursor hash", () => {
    expect(() => findDeltaCursor(lines, "")).toThrow(/non-empty/);
    expect(() => findDeltaCursor(lines, " ")).toThrow(/non-empty/);
  });
});
|
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
// Tests for sessionize (Pass 1 gap-cut): grouping, threshold boundary
// semantics (gap == gapHours cuts), boundary timestamps, and validation.
import { describe, it, expect } from "vitest";
import { sessionize } from "../sessionize.js";
import type { ParsedLine } from "../parse-export.js";

// Single-sender factory; only dateSent matters to the gap-cut logic.
function mk(dateSent: string, body = "x"): ParsedLine {
  return { senderName: "Joel", dateSent, body, sequenceIndex: 0 };
}

describe("sessionize", () => {
  it("returns [] for empty input", () => {
    expect(sessionize([], 12)).toEqual([]);
  });

  it("returns one one-message session for a single message", () => {
    const out = sessionize([mk("2026-03-14T10:00:00+00:00")], 12);
    expect(out).toHaveLength(1);
    expect(out[0].messages).toHaveLength(1);
    expect(out[0].index).toBe(0);
    expect(out[0].firstMessageAt).toBe("2026-03-14T10:00:00+00:00");
    expect(out[0].lastMessageAt).toBe("2026-03-14T10:00:00+00:00");
  });

  it("groups messages within the gap into one session", () => {
    const messages = [
      mk("2026-03-14T10:00:00+00:00"),
      mk("2026-03-14T11:00:00+00:00"),
      mk("2026-03-14T12:00:00+00:00"),
    ];
    const out = sessionize(messages, 12);
    expect(out).toHaveLength(1);
    expect(out[0].messages).toHaveLength(3);
  });

  it("cuts at gap > gapHours", () => {
    const messages = [
      mk("2026-03-14T10:00:00+00:00"),
      mk("2026-03-14T11:00:00+00:00"),
      mk("2026-03-14T23:30:00+00:00"), // 12.5h gap from previous
      mk("2026-03-15T00:00:00+00:00"),
    ];
    const out = sessionize(messages, 12);
    expect(out).toHaveLength(2);
    expect(out[0].messages).toHaveLength(2);
    expect(out[1].messages).toHaveLength(2);
    expect(out[1].index).toBe(1);
  });

  it("cuts at exact-at-threshold (gap == gapHours triggers a cut)", () => {
    const messages = [
      mk("2026-03-14T10:00:00+00:00"),
      mk("2026-03-14T22:00:00+00:00"), // exactly 12h gap
    ];
    const out = sessionize(messages, 12);
    expect(out).toHaveLength(2);
  });

  it("does not cut at gap just under threshold", () => {
    const messages = [
      mk("2026-03-14T10:00:00+00:00"),
      mk("2026-03-14T21:59:59+00:00"), // 11h59m59s gap
    ];
    const out = sessionize(messages, 12);
    expect(out).toHaveLength(1);
  });

  it("emits firstMessageAt / lastMessageAt from the session boundary messages", () => {
    const messages = [
      mk("2026-03-14T10:00:00+00:00"),
      mk("2026-03-14T11:00:00+00:00"),
      mk("2026-03-14T12:00:00+00:00"),
    ];
    const out = sessionize(messages, 12);
    expect(out[0].firstMessageAt).toBe("2026-03-14T10:00:00+00:00");
    expect(out[0].lastMessageAt).toBe("2026-03-14T12:00:00+00:00");
  });

  it("rejects non-positive gapHours", () => {
    expect(() => sessionize([mk("2026-03-14T10:00:00+00:00")], 0)).toThrow(/positive/);
    expect(() => sessionize([mk("2026-03-14T10:00:00+00:00")], -1)).toThrow(/positive/);
  });

  it("indexes sessions starting at 0 in chronological order", () => {
    const messages = [
      mk("2026-03-14T10:00:00+00:00"),
      mk("2026-03-15T10:00:00+00:00"), // 24h gap
      mk("2026-03-16T10:00:00+00:00"), // 24h gap
    ];
    const out = sessionize(messages, 12);
    expect(out.map((s) => s.index)).toEqual([0, 1, 2]);
  });
});
|
package/payload/platform/plugins/whatsapp-import/lib/src/__tests__/to-classifier-input.test.ts
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
// Tests for toClassifierInput (Pass 2 formatter): `[ts] Sender: body` line
// rendering, offset preservation/normalisation, verbatim multi-line bodies,
// and the defensive raw-ISO fallback.
import { describe, it, expect } from "vitest";
import { toClassifierInput } from "../to-classifier-input.js";
import type { Session } from "../sessionize.js";

// Build a single Session whose boundary timestamps are derived from the
// supplied messages, mirroring what sessionize emits.
function mkSession(messages: Session["messages"]): Session {
  return {
    index: 0,
    firstMessageAt: messages[0].dateSent,
    lastMessageAt: messages[messages.length - 1].dateSent,
    messages,
  };
}

describe("toClassifierInput", () => {
  it("renders one line per message in `[ts] Sender: body` form", () => {
    const session = mkSession([
      { senderName: "Joel", dateSent: "2026-03-14T10:00:00+00:00", body: "hi", sequenceIndex: 0 },
      { senderName: "Adam", dateSent: "2026-03-14T10:01:00+00:00", body: "hey", sequenceIndex: 1 },
    ]);
    expect(toClassifierInput(session)).toBe(
      "[2026-03-14 10:00:00 +00:00] Joel: hi\n[2026-03-14 10:01:00 +00:00] Adam: hey",
    );
  });

  it("preserves the operator's offset in the formatted timestamp", () => {
    const session = mkSession([
      { senderName: "Joel", dateSent: "2026-03-14T10:00:00+01:00", body: "hi", sequenceIndex: 0 },
    ]);
    expect(toClassifierInput(session)).toBe("[2026-03-14 10:00:00 +01:00] Joel: hi");
  });

  it("normalises a 'Z' offset to +00:00 in the rendered form", () => {
    const session = mkSession([
      { senderName: "Joel", dateSent: "2026-03-14T10:00:00Z", body: "hi", sequenceIndex: 0 },
    ]);
    expect(toClassifierInput(session)).toBe("[2026-03-14 10:00:00 +00:00] Joel: hi");
  });

  it("preserves multi-line message bodies verbatim (no escape, internal newlines retained)", () => {
    const session = mkSession([
      {
        senderName: "Joel",
        dateSent: "2026-03-14T10:00:00+00:00",
        body: "line one\nline two",
        sequenceIndex: 0,
      },
    ]);
    expect(toClassifierInput(session)).toBe(
      "[2026-03-14 10:00:00 +00:00] Joel: line one\nline two",
    );
  });

  it("falls back to the raw ISO when the timestamp shape does not match (defensive)", () => {
    const session = mkSession([
      { senderName: "Joel", dateSent: "not-an-iso", body: "hi", sequenceIndex: 0 },
    ]);
    expect(toClassifierInput(session)).toBe("[not-an-iso] Joel: hi");
  });
});
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
import type { ParsedLine } from "./parse-export.js";
|
|
2
|
+
import { deriveMessageContentHash } from "./derive-keys.js";
|
|
3
|
+
|
|
4
|
+
// ---------------------------------------------------------------------------
|
|
5
|
+
// delta-cursor — locate the cursor for delta-append re-imports (Task 891).
|
|
6
|
+
//
|
|
7
|
+
// Pure function. Given a parsed re-export and the `lastIngestedMessageHash`
|
|
8
|
+
// recorded on the prior `:ConversationArchive`, find the position of the
|
|
9
|
+
// matching message in the parsed sequence and return `index + 1` (the slice
|
|
10
|
+
// start for the delta). Three outcomes:
|
|
11
|
+
//
|
|
12
|
+
// { kind: "found", deltaStart } → slice parsedLines from deltaStart
|
|
13
|
+
// { kind: "empty" } → cursor is the last line; no delta
|
|
14
|
+
// { kind: "missing" } → cursor not found (LOUD-FAIL upstream)
|
|
15
|
+
//
|
|
16
|
+
// `missing` covers both the "operator deleted prior messages from the
|
|
17
|
+
// re-export" case and the "operator imported a different chat archive"
|
|
18
|
+
// case. The orchestrator emits `[whatsapp-import] FAIL delta-cursor-missing`
|
|
19
|
+
// and exits non-zero when this surfaces.
|
|
20
|
+
// ---------------------------------------------------------------------------
|
|
21
|
+
|
|
22
|
+
export type CursorResult =
|
|
23
|
+
| { kind: "found"; deltaStart: number }
|
|
24
|
+
| { kind: "empty" }
|
|
25
|
+
| { kind: "missing" };
|
|
26
|
+
|
|
27
|
+
/**
|
|
28
|
+
* Walk parsed lines forward and return the first index whose content hash
|
|
29
|
+
* matches `lastIngestedMessageHash`. The first match is correct because
|
|
30
|
+
* messages with identical (dateSent, normalisedSenderName, body) tuples
|
|
31
|
+
* are genuine duplicates — there is no way to disambiguate them and slicing
|
|
32
|
+
* after the first occurrence is the chronologically safe choice.
|
|
33
|
+
*/
|
|
34
|
+
export function findDeltaCursor(
|
|
35
|
+
parsedLines: readonly ParsedLine[],
|
|
36
|
+
lastIngestedMessageHash: string,
|
|
37
|
+
): CursorResult {
|
|
38
|
+
if (!lastIngestedMessageHash || !lastIngestedMessageHash.trim()) {
|
|
39
|
+
throw new Error("findDeltaCursor: lastIngestedMessageHash must be non-empty");
|
|
40
|
+
}
|
|
41
|
+
for (let i = 0; i < parsedLines.length; i++) {
|
|
42
|
+
const line = parsedLines[i];
|
|
43
|
+
const hash = deriveMessageContentHash({
|
|
44
|
+
dateSent: line.dateSent,
|
|
45
|
+
senderName: line.senderName,
|
|
46
|
+
body: line.body,
|
|
47
|
+
});
|
|
48
|
+
if (hash === lastIngestedMessageHash) {
|
|
49
|
+
if (i === parsedLines.length - 1) return { kind: "empty" };
|
|
50
|
+
return { kind: "found", deltaStart: i + 1 };
|
|
51
|
+
}
|
|
52
|
+
}
|
|
53
|
+
return { kind: "missing" };
|
|
54
|
+
}
|
|
@@ -1,22 +1,23 @@
|
|
|
1
1
|
import { createHash } from "node:crypto";
|
|
2
2
|
|
|
3
3
|
// ---------------------------------------------------------------------------
|
|
4
|
-
// derive-keys — natural-key derivation for whatsapp-import (Task
|
|
4
|
+
// derive-keys — natural-key derivation for whatsapp-import (Task 891,
|
|
5
|
+
// supersedes Task 870's per-message contract).
|
|
5
6
|
//
|
|
6
7
|
// Pure functions. No I/O. The whole point is that re-imports of the same
|
|
7
|
-
// archive collapse to the same
|
|
8
|
-
//
|
|
8
|
+
// archive collapse to the same identity regardless of release-level drift in
|
|
9
|
+
// chunk indices, hash widths, or arbitrary tiebreakers.
|
|
9
10
|
//
|
|
10
|
-
//
|
|
11
|
+
// Identity contracts (Task 891 brief):
|
|
11
12
|
//
|
|
12
|
-
//
|
|
13
|
-
//
|
|
14
|
-
// :<sha256-hex(body)>
|
|
13
|
+
// conversationIdentity = sha256(accountId + ":" + sortedParticipantElementIds.join(","))
|
|
14
|
+
// messageContentHash = sha256(dateSent + "|" + NFKC-trim-lower(senderName) + "|" + body)
|
|
15
15
|
//
|
|
16
|
-
//
|
|
17
|
-
//
|
|
18
|
-
//
|
|
19
|
-
//
|
|
16
|
+
// `conversationIdentity` is stable across re-exports — same operator + same
|
|
17
|
+
// participant set → same identity, regardless of file bytes. DM and group
|
|
18
|
+
// follow the same formula; the difference is the participant array length.
|
|
19
|
+
// `messageContentHash` is content-only (no archive sha256, no chunk index)
|
|
20
|
+
// so cursor lookup survives a fresh re-export of the same chat.
|
|
20
21
|
// ---------------------------------------------------------------------------
|
|
21
22
|
|
|
22
23
|
export function normaliseSenderName(name: string): string {
|
|
@@ -27,33 +28,55 @@ export function sha256Hex(input: string): string {
|
|
|
27
28
|
return createHash("sha256").update(input).digest("hex");
|
|
28
29
|
}
|
|
29
30
|
|
|
30
|
-
export interface
|
|
31
|
-
|
|
32
|
-
|
|
31
|
+
export interface DeriveConversationIdentityInput {
|
|
32
|
+
accountId: string;
|
|
33
|
+
/**
|
|
34
|
+
* Element IDs of every confirmed participant (owner + others). Order is
|
|
35
|
+
* not significant; the function sorts internally so the same set always
|
|
36
|
+
* produces the same identity.
|
|
37
|
+
*/
|
|
38
|
+
participantElementIds: readonly string[];
|
|
39
|
+
}
|
|
40
|
+
|
|
41
|
+
/**
|
|
42
|
+
* Compute the stable identity for a conversation. Same accountId + same
|
|
43
|
+
* participant set ⇒ same identity, regardless of message content or export
|
|
44
|
+
* file bytes. DM and group chats use this identical formula.
|
|
45
|
+
*/
|
|
46
|
+
export function deriveConversationIdentity(
|
|
47
|
+
input: DeriveConversationIdentityInput,
|
|
48
|
+
): string {
|
|
49
|
+
if (!input.accountId || !input.accountId.trim()) {
|
|
50
|
+
throw new Error("deriveConversationIdentity: accountId is required");
|
|
51
|
+
}
|
|
52
|
+
if (input.participantElementIds.length === 0) {
|
|
53
|
+
throw new Error("deriveConversationIdentity: participantElementIds must be non-empty");
|
|
54
|
+
}
|
|
55
|
+
const sorted = [...input.participantElementIds].sort();
|
|
56
|
+
return sha256Hex(`${input.accountId}:${sorted.join(",")}`);
|
|
57
|
+
}
|
|
58
|
+
|
|
59
|
+
export interface DeriveMessageContentHashInput {
|
|
33
60
|
/** ISO 8601 with timezone offset, as emitted by parseExport. */
|
|
34
61
|
dateSent: string;
|
|
35
62
|
/** Raw senderName from the export line. Normalised internally. */
|
|
36
63
|
senderName: string;
|
|
37
|
-
/** Raw message body.
|
|
64
|
+
/** Raw message body. */
|
|
38
65
|
body: string;
|
|
39
66
|
}
|
|
40
67
|
|
|
41
|
-
|
|
68
|
+
/**
|
|
69
|
+
* Compute a content-only hash for a single message. Used as the delta-append
|
|
70
|
+
* cursor: `:ConversationArchive.lastIngestedMessageHash` records the hash of
|
|
71
|
+
* the last ingested message; on re-import, the orchestrator finds the line
|
|
72
|
+
* with the matching hash and slices everything after it.
|
|
73
|
+
*
|
|
74
|
+
* Excludes archive sha256 deliberately — the cursor must survive a fresh
|
|
75
|
+
* re-export of the same chat (different file bytes, same message tuples).
|
|
76
|
+
*/
|
|
77
|
+
export function deriveMessageContentHash(
|
|
78
|
+
input: DeriveMessageContentHashInput,
|
|
79
|
+
): string {
|
|
42
80
|
const norm = normaliseSenderName(input.senderName);
|
|
43
|
-
|
|
44
|
-
return `whatsapp-export:msg:${input.conversationSha256}:${input.dateSent}:${norm}:${bodyHash}`;
|
|
45
|
-
}
|
|
46
|
-
|
|
47
|
-
export interface ObservationContentFields {
|
|
48
|
-
summary?: string | null;
|
|
49
|
-
from?: string | null;
|
|
50
|
-
to?: string | null;
|
|
51
|
-
subject?: string | null;
|
|
52
|
-
}
|
|
53
|
-
|
|
54
|
-
export function observationContentHash(fields: ObservationContentFields): string {
|
|
55
|
-
const parts = [fields.summary, fields.from, fields.to, fields.subject].map(
|
|
56
|
-
(p) => (p ?? "").normalize("NFKC").trim().toLowerCase(),
|
|
57
|
-
);
|
|
58
|
-
return sha256Hex(parts.join("|"));
|
|
81
|
+
return sha256Hex(`${input.dateSent}|${norm}|${input.body}`);
|
|
59
82
|
}
|
|
@@ -5,15 +5,18 @@ export type {
|
|
|
5
5
|
ParseExportCounters,
|
|
6
6
|
ParsedLine,
|
|
7
7
|
} from "./parse-export.js";
|
|
8
|
-
export { parseFilterArg, applyFilter } from "./filter.js";
|
|
9
|
-
export type { Filter } from "./filter.js";
|
|
10
8
|
export {
|
|
11
9
|
normaliseSenderName,
|
|
12
10
|
sha256Hex,
|
|
13
|
-
|
|
14
|
-
|
|
11
|
+
deriveConversationIdentity,
|
|
12
|
+
deriveMessageContentHash,
|
|
15
13
|
} from "./derive-keys.js";
|
|
16
14
|
export type {
|
|
17
|
-
|
|
18
|
-
|
|
15
|
+
DeriveConversationIdentityInput,
|
|
16
|
+
DeriveMessageContentHashInput,
|
|
19
17
|
} from "./derive-keys.js";
|
|
18
|
+
export { sessionize } from "./sessionize.js";
|
|
19
|
+
export type { Session } from "./sessionize.js";
|
|
20
|
+
export { toClassifierInput } from "./to-classifier-input.js";
|
|
21
|
+
export { findDeltaCursor } from "./delta-cursor.js";
|
|
22
|
+
export type { CursorResult } from "./delta-cursor.js";
|
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
import type { ParsedLine } from "./parse-export.js";
|
|
2
|
+
|
|
3
|
+
// ---------------------------------------------------------------------------
|
|
4
|
+
// sessionize — Pass 1 of the chunked-archive pipeline (Task 891).
|
|
5
|
+
//
|
|
6
|
+
// Pure function. Splits a chronologically-ordered sequence of parsed messages
|
|
7
|
+
// into "sessions" wherever the gap between consecutive `dateSent` values
|
|
8
|
+
// exceeds `gapHours`. Each session feeds memory-classify (mode='chat') as a
|
|
9
|
+
// turn-attributed block; the LLM then chunks each session into one or more
|
|
10
|
+
// `:Section:Conversation` rows with summary+keywords (Pass 2).
|
|
11
|
+
//
|
|
12
|
+
// Why deterministic gap-cut, not LLM topic detection:
|
|
13
|
+
// - The natural cadence of human chat (sleep, working hours, weekend gaps)
|
|
14
|
+
// produces clean session boundaries that the operator can intuit.
|
|
15
|
+
// - LLM-only chunking against a 10K-message archive sends 10K messages into
|
|
16
|
+
// one prompt and pays attention only to the last 1K — gap-cut bounds the
|
|
17
|
+
// window before the LLM ever sees it.
|
|
18
|
+
// - The default 12h gap matches one sleep cycle: messages on the same day
|
|
19
|
+
// belong together; a 14-hour gap (last evening message → next morning) is
|
|
20
|
+
// a fresh session even when the topic is identical.
|
|
21
|
+
// ---------------------------------------------------------------------------
|
|
22
|
+
|
|
23
|
+
export interface Session {
|
|
24
|
+
/** 0-based index across the archive's sessions. */
|
|
25
|
+
index: number;
|
|
26
|
+
/** ISO 8601 timestamp of the first message in the session. */
|
|
27
|
+
firstMessageAt: string;
|
|
28
|
+
/** ISO 8601 timestamp of the last message in the session. */
|
|
29
|
+
lastMessageAt: string;
|
|
30
|
+
/** Messages in the session, chronological. */
|
|
31
|
+
messages: ParsedLine[];
|
|
32
|
+
}
|
|
33
|
+
|
|
34
|
+
/**
|
|
35
|
+
* Split parsed messages into sessions on gaps ≥ `gapHours`. Input must be
|
|
36
|
+
* pre-sorted by `dateSent` (parse-export emits in file order, which IS
|
|
37
|
+
* chronological for any well-formed `_chat.txt`).
|
|
38
|
+
*
|
|
39
|
+
* Boundary semantics (exact-at-threshold):
|
|
40
|
+
* gap == gapHours → cut here (start a new session)
|
|
41
|
+
* gap < gapHours → same session
|
|
42
|
+
* gap > gapHours → cut here
|
|
43
|
+
*
|
|
44
|
+
* Empty input returns []; single-message input returns one one-message session.
|
|
45
|
+
*/
|
|
46
|
+
export function sessionize(
|
|
47
|
+
messages: readonly ParsedLine[],
|
|
48
|
+
gapHours: number,
|
|
49
|
+
): Session[] {
|
|
50
|
+
if (gapHours <= 0) {
|
|
51
|
+
throw new Error(`sessionize: gapHours must be positive, got ${gapHours}`);
|
|
52
|
+
}
|
|
53
|
+
if (messages.length === 0) return [];
|
|
54
|
+
|
|
55
|
+
const gapMs = gapHours * 60 * 60 * 1000;
|
|
56
|
+
const sessions: Session[] = [];
|
|
57
|
+
let currentMessages: ParsedLine[] = [messages[0]];
|
|
58
|
+
|
|
59
|
+
const flush = () => {
|
|
60
|
+
sessions.push({
|
|
61
|
+
index: sessions.length,
|
|
62
|
+
firstMessageAt: currentMessages[0].dateSent,
|
|
63
|
+
lastMessageAt: currentMessages[currentMessages.length - 1].dateSent,
|
|
64
|
+
messages: currentMessages,
|
|
65
|
+
});
|
|
66
|
+
};
|
|
67
|
+
|
|
68
|
+
for (let i = 1; i < messages.length; i++) {
|
|
69
|
+
const prevMs = Date.parse(messages[i - 1].dateSent);
|
|
70
|
+
const currMs = Date.parse(messages[i].dateSent);
|
|
71
|
+
const gap = currMs - prevMs;
|
|
72
|
+
if (gap >= gapMs) {
|
|
73
|
+
flush();
|
|
74
|
+
currentMessages = [messages[i]];
|
|
75
|
+
} else {
|
|
76
|
+
currentMessages.push(messages[i]);
|
|
77
|
+
}
|
|
78
|
+
}
|
|
79
|
+
flush();
|
|
80
|
+
return sessions;
|
|
81
|
+
}
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
import type { Session } from "./sessionize.js";
|
|
2
|
+
|
|
3
|
+
// ---------------------------------------------------------------------------
|
|
4
|
+
// to-classifier-input — Pass 2 input formatter (Task 891).
|
|
5
|
+
//
|
|
6
|
+
// Pure function. Renders one Session as a turn-attributed text block ready
|
|
7
|
+
// to hand to memory-classify (mode='chat'). Format:
|
|
8
|
+
//
|
|
9
|
+
// [YYYY-MM-DD HH:MM:SS TZ] <Sender>: <body>
|
|
10
|
+
// [YYYY-MM-DD HH:MM:SS TZ] <Sender>: <body>
|
|
11
|
+
// ...
|
|
12
|
+
//
|
|
13
|
+
// Multi-line message bodies are kept verbatim (with their internal newlines).
|
|
14
|
+
// The leading `[ts] <Sender>: ` prefix is the only structural addition; the
|
|
15
|
+
// classifier prompt instructs Haiku to preserve it in the chunk `body` so
|
|
16
|
+
// downstream Phase 2 work can recover per-message provenance via snippet
|
|
17
|
+
// matching against the conversation tail.
|
|
18
|
+
//
|
|
19
|
+
// Timezone: each `dateSent` ISO already carries an offset (set by parseExport
|
|
20
|
+
// from the operator's confirmed IANA zone). The renderer prints the
|
|
21
|
+
// human-readable wall-clock for that offset; the trailing "TZ" suffix is the
|
|
22
|
+
// offset itself, not a zone name.
|
|
23
|
+
// ---------------------------------------------------------------------------
|
|
24
|
+
|
|
25
|
+
export function toClassifierInput(session: Session): string {
|
|
26
|
+
const lines: string[] = [];
|
|
27
|
+
for (const m of session.messages) {
|
|
28
|
+
lines.push(`[${formatWallClock(m.dateSent)}] ${m.senderName}: ${m.body}`);
|
|
29
|
+
}
|
|
30
|
+
return lines.join("\n");
|
|
31
|
+
}
|
|
32
|
+
|
|
33
|
+
/**
|
|
34
|
+
* Format an ISO 8601 instant with offset as `YYYY-MM-DD HH:MM:SS ±HH:MM`,
|
|
35
|
+
* preserving the offset that the parser set from the operator's IANA zone.
|
|
36
|
+
* The wall-clock components are read directly from the ISO string — no
|
|
37
|
+
* Date construction (which would re-interpret in the local zone).
|
|
38
|
+
*/
|
|
39
|
+
function formatWallClock(iso: string): string {
|
|
40
|
+
// ISO from parse-export is shaped: "YYYY-MM-DDTHH:MM:SS±HH:MM" (or "Z").
|
|
41
|
+
const m = iso.match(
|
|
42
|
+
/^(\d{4})-(\d{2})-(\d{2})T(\d{2}):(\d{2}):(\d{2})(?:\.\d+)?(Z|[+-]\d{2}:?\d{2})$/,
|
|
43
|
+
);
|
|
44
|
+
if (!m) return iso; // surface the raw value if the shape drifted; pure function never throws on caller-supplied data
|
|
45
|
+
const [, y, mo, d, h, mi, s, off] = m;
|
|
46
|
+
const offsetLabel = off === "Z" ? "+00:00" : off;
|
|
47
|
+
return `${y}-${mo}-${d} ${h}:${mi}:${s} ${offsetLabel}`;
|
|
48
|
+
}
|