@rubytech/create-realagent 1.0.828 → 1.0.830
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/payload/platform/config/brand.json +1 -1
- package/payload/platform/lib/oauth-llm/dist/index.d.ts +1 -1
- package/payload/platform/lib/oauth-llm/dist/index.d.ts.map +1 -1
- package/payload/platform/lib/oauth-llm/dist/index.js +21 -0
- package/payload/platform/lib/oauth-llm/dist/index.js.map +1 -1
- package/payload/platform/lib/oauth-llm/src/index.ts +24 -0
- package/payload/platform/neo4j/migrations/007-conversation-archive-source.ts +116 -0
- package/payload/platform/neo4j/schema.cypher +12 -2
- package/payload/platform/package.json +2 -2
- package/payload/platform/plugins/admin/hooks/__tests__/archive-ingest-surface-gate.test.sh +6 -6
- package/payload/platform/plugins/admin/hooks/archive-ingest-surface-gate.sh +14 -8
- package/payload/platform/plugins/admin/skills/onboarding/SKILL.md +2 -2
- package/payload/platform/plugins/contacts/mcp/dist/index.js +5 -5
- package/payload/platform/plugins/contacts/mcp/dist/index.js.map +1 -1
- package/payload/platform/plugins/contacts/mcp/dist/tools/contact-create.d.ts +1 -1
- package/payload/platform/plugins/contacts/mcp/dist/tools/contact-create.d.ts.map +1 -1
- package/payload/platform/plugins/contacts/mcp/dist/tools/contact-create.js +29 -23
- package/payload/platform/plugins/contacts/mcp/dist/tools/contact-create.js.map +1 -1
- package/payload/platform/plugins/docs/references/plugins-guide.md +1 -1
- package/payload/platform/plugins/memory/PLUGIN.md +6 -5
- package/payload/platform/plugins/{whatsapp-import/bin/ingest.mjs → memory/bin/conversation-archive-ingest.mjs} +136 -212
- package/payload/platform/plugins/{whatsapp-import/bin/whatsapp-ingest.sh → memory/bin/conversation-archive-ingest.sh} +27 -19
- package/payload/platform/plugins/memory/mcp/dist/index.js +26 -212
- package/payload/platform/plugins/memory/mcp/dist/index.js.map +1 -1
- package/payload/platform/plugins/memory/mcp/dist/lib/__tests__/llm-classifier.test.js +4 -3
- package/payload/platform/plugins/memory/mcp/dist/lib/__tests__/llm-classifier.test.js.map +1 -1
- package/payload/platform/plugins/memory/mcp/dist/lib/__tests__/schema-loader.test.js +11 -6
- package/payload/platform/plugins/memory/mcp/dist/lib/__tests__/schema-loader.test.js.map +1 -1
- package/payload/platform/plugins/memory/mcp/dist/lib/__tests__/schema-validator.test.js +103 -0
- package/payload/platform/plugins/memory/mcp/dist/lib/__tests__/schema-validator.test.js.map +1 -1
- package/payload/platform/plugins/memory/mcp/dist/lib/conversation-normalisers/index.d.ts +5 -0
- package/payload/platform/plugins/memory/mcp/dist/lib/conversation-normalisers/index.d.ts.map +1 -0
- package/payload/platform/plugins/memory/mcp/dist/lib/conversation-normalisers/index.js +30 -0
- package/payload/platform/plugins/memory/mcp/dist/lib/conversation-normalisers/index.js.map +1 -0
- package/payload/platform/plugins/memory/mcp/dist/lib/conversation-normalisers/types.d.ts +48 -0
- package/payload/platform/plugins/memory/mcp/dist/lib/conversation-normalisers/types.d.ts.map +1 -0
- package/payload/platform/plugins/memory/mcp/dist/lib/conversation-normalisers/types.js +23 -0
- package/payload/platform/plugins/memory/mcp/dist/lib/conversation-normalisers/types.js.map +1 -0
- package/payload/platform/plugins/memory/mcp/dist/lib/conversation-normalisers/whatsapp-text.d.ts +3 -0
- package/payload/platform/plugins/memory/mcp/dist/lib/conversation-normalisers/whatsapp-text.d.ts.map +1 -0
- package/payload/platform/plugins/memory/mcp/dist/lib/conversation-normalisers/whatsapp-text.js +237 -0
- package/payload/platform/plugins/memory/mcp/dist/lib/conversation-normalisers/whatsapp-text.js.map +1 -0
- package/payload/platform/plugins/memory/mcp/dist/lib/conversation-pipeline/delta-cursor.d.ts +11 -0
- package/payload/platform/plugins/memory/mcp/dist/lib/conversation-pipeline/delta-cursor.d.ts.map +1 -0
- package/payload/platform/plugins/memory/mcp/dist/lib/conversation-pipeline/delta-cursor.js +21 -0
- package/payload/platform/plugins/memory/mcp/dist/lib/conversation-pipeline/delta-cursor.js.map +1 -0
- package/payload/platform/plugins/memory/mcp/dist/lib/conversation-pipeline/derive-keys.d.ts +16 -0
- package/payload/platform/plugins/memory/mcp/dist/lib/conversation-pipeline/derive-keys.d.ts.map +1 -0
- package/payload/platform/plugins/memory/mcp/dist/lib/conversation-pipeline/derive-keys.js +39 -0
- package/payload/platform/plugins/memory/mcp/dist/lib/conversation-pipeline/derive-keys.js.map +1 -0
- package/payload/platform/plugins/memory/mcp/dist/lib/conversation-pipeline/sender-bind.d.ts +17 -0
- package/payload/platform/plugins/memory/mcp/dist/lib/conversation-pipeline/sender-bind.d.ts.map +1 -0
- package/payload/platform/plugins/memory/mcp/dist/lib/conversation-pipeline/sender-bind.js +90 -0
- package/payload/platform/plugins/memory/mcp/dist/lib/conversation-pipeline/sender-bind.js.map +1 -0
- package/payload/platform/plugins/memory/mcp/dist/lib/conversation-pipeline/sessionize.d.ts +9 -0
- package/payload/platform/plugins/memory/mcp/dist/lib/conversation-pipeline/sessionize.d.ts.map +1 -0
- package/payload/platform/plugins/memory/mcp/dist/lib/conversation-pipeline/sessionize.js +32 -0
- package/payload/platform/plugins/memory/mcp/dist/lib/conversation-pipeline/sessionize.js.map +1 -0
- package/payload/platform/plugins/memory/mcp/dist/lib/conversation-pipeline/to-turn-text.d.ts +3 -0
- package/payload/platform/plugins/memory/mcp/dist/lib/conversation-pipeline/to-turn-text.d.ts.map +1 -0
- package/payload/platform/plugins/memory/mcp/dist/lib/conversation-pipeline/to-turn-text.js +27 -0
- package/payload/platform/plugins/memory/mcp/dist/lib/conversation-pipeline/to-turn-text.js.map +1 -0
- package/payload/platform/plugins/memory/mcp/dist/lib/document-chunker.d.ts +45 -0
- package/payload/platform/plugins/memory/mcp/dist/lib/document-chunker.d.ts.map +1 -0
- package/payload/platform/plugins/memory/mcp/dist/lib/document-chunker.js +125 -0
- package/payload/platform/plugins/memory/mcp/dist/lib/document-chunker.js.map +1 -0
- package/payload/platform/plugins/memory/mcp/dist/lib/llm-classifier.d.ts +24 -1
- package/payload/platform/plugins/memory/mcp/dist/lib/llm-classifier.d.ts.map +1 -1
- package/payload/platform/plugins/memory/mcp/dist/lib/llm-classifier.js +293 -33
- package/payload/platform/plugins/memory/mcp/dist/lib/llm-classifier.js.map +1 -1
- package/payload/platform/plugins/memory/mcp/dist/lib/llm-ranker.d.ts.map +1 -1
- package/payload/platform/plugins/memory/mcp/dist/lib/llm-ranker.js +9 -2
- package/payload/platform/plugins/memory/mcp/dist/lib/llm-ranker.js.map +1 -1
- package/payload/platform/plugins/memory/mcp/dist/lib/schema-validator.d.ts +16 -1
- package/payload/platform/plugins/memory/mcp/dist/lib/schema-validator.d.ts.map +1 -1
- package/payload/platform/plugins/memory/mcp/dist/lib/schema-validator.js +12 -3
- package/payload/platform/plugins/memory/mcp/dist/lib/schema-validator.js.map +1 -1
- package/payload/platform/plugins/memory/mcp/dist/tools/__tests__/conversation-normalisers-source-agnosticism.test.d.ts +2 -0
- package/payload/platform/plugins/memory/mcp/dist/tools/__tests__/conversation-normalisers-source-agnosticism.test.d.ts.map +1 -0
- package/payload/platform/plugins/memory/mcp/dist/tools/__tests__/conversation-normalisers-source-agnosticism.test.js +75 -0
- package/payload/platform/plugins/memory/mcp/dist/tools/__tests__/conversation-normalisers-source-agnosticism.test.js.map +1 -0
- package/payload/platform/plugins/memory/mcp/dist/tools/__tests__/conversation-normalisers-whatsapp-text.test.d.ts +2 -0
- package/payload/platform/plugins/memory/mcp/dist/tools/__tests__/conversation-normalisers-whatsapp-text.test.d.ts.map +1 -0
- package/payload/platform/plugins/memory/mcp/dist/tools/__tests__/conversation-normalisers-whatsapp-text.test.js +67 -0
- package/payload/platform/plugins/memory/mcp/dist/tools/__tests__/conversation-normalisers-whatsapp-text.test.js.map +1 -0
- package/payload/platform/plugins/memory/mcp/dist/tools/__tests__/memory-archive-write.test.js +2 -138
- package/payload/platform/plugins/memory/mcp/dist/tools/__tests__/memory-archive-write.test.js.map +1 -1
- package/payload/platform/plugins/memory/mcp/dist/tools/__tests__/memory-ingest.test.js +39 -3
- package/payload/platform/plugins/memory/mcp/dist/tools/__tests__/memory-ingest.test.js.map +1 -1
- package/payload/platform/plugins/memory/mcp/dist/tools/__tests__/profile-update-personfields-open.test.d.ts +2 -0
- package/payload/platform/plugins/memory/mcp/dist/tools/__tests__/profile-update-personfields-open.test.d.ts.map +1 -0
- package/payload/platform/plugins/memory/mcp/dist/tools/__tests__/profile-update-personfields-open.test.js +148 -0
- package/payload/platform/plugins/memory/mcp/dist/tools/__tests__/profile-update-personfields-open.test.js.map +1 -0
- package/payload/platform/plugins/memory/mcp/dist/tools/memory-archive-write.d.ts +1 -47
- package/payload/platform/plugins/memory/mcp/dist/tools/memory-archive-write.d.ts.map +1 -1
- package/payload/platform/plugins/memory/mcp/dist/tools/memory-archive-write.js +9 -318
- package/payload/platform/plugins/memory/mcp/dist/tools/memory-archive-write.js.map +1 -1
- package/payload/platform/plugins/memory/mcp/dist/tools/memory-ingest.d.ts +7 -0
- package/payload/platform/plugins/memory/mcp/dist/tools/memory-ingest.d.ts.map +1 -1
- package/payload/platform/plugins/memory/mcp/dist/tools/memory-ingest.js +14 -8
- package/payload/platform/plugins/memory/mcp/dist/tools/memory-ingest.js.map +1 -1
- package/payload/platform/plugins/memory/mcp/dist/tools/profile-update.d.ts +21 -17
- package/payload/platform/plugins/memory/mcp/dist/tools/profile-update.d.ts.map +1 -1
- package/payload/platform/plugins/memory/mcp/dist/tools/profile-update.js +77 -37
- package/payload/platform/plugins/memory/mcp/dist/tools/profile-update.js.map +1 -1
- package/payload/platform/plugins/memory/references/schema-base.md +3 -1
- package/payload/platform/plugins/{whatsapp-import/skills/whatsapp-import → memory/skills/conversation-archive}/SKILL.md +45 -36
- package/payload/platform/plugins/memory/skills/document-ingest/SKILL.md +59 -6
- package/payload/platform/plugins/whatsapp/PLUGIN.md +1 -1
- package/payload/platform/scripts/seed-neo4j.sh +9 -8
- package/payload/platform/templates/specialists/agents/database-operator.md +7 -14
- package/payload/server/chunk-7BO5HDJC.js +10093 -0
- package/payload/server/chunk-CUSH3UXP.js +2305 -0
- package/payload/server/chunk-EL4DZ56X.js +1116 -0
- package/payload/server/chunk-IWNDVGKT.js +10077 -0
- package/payload/server/chunk-KC7NUABI.js +654 -0
- package/payload/server/chunk-QOJ2D26Z.js +654 -0
- package/payload/server/chunk-RC46ZYGT.js +2305 -0
- package/payload/server/chunk-WUVXPZIV.js +1116 -0
- package/payload/server/client-pool-3TM3SRIA.js +32 -0
- package/payload/server/client-pool-7NTEFNVQ.js +32 -0
- package/payload/server/cloudflare-task-tracker-4NIODMGL.js +19 -0
- package/payload/server/cloudflare-task-tracker-WE77WXSI.js +19 -0
- package/payload/server/maxy-edge.js +3 -3
- package/payload/server/neo4j-migrations-4XPNJNM6.js +490 -0
- package/payload/server/neo4j-migrations-XTQ4WEV6.js +428 -0
- package/payload/server/server.js +6 -6
- package/payload/platform/plugins/whatsapp-import/PLUGIN.md +0 -48
- package/payload/platform/plugins/whatsapp-import/lib/src/__tests__/delta-append.test.ts +0 -163
- package/payload/platform/plugins/whatsapp-import/lib/src/__tests__/parse-export-lrm.test.ts +0 -83
- package/payload/platform/plugins/whatsapp-import/lib/src/__tests__/parse-export.test.ts +0 -678
- package/payload/platform/plugins/whatsapp-import/lib/src/__tests__/sessionize.test.ts +0 -91
- package/payload/platform/plugins/whatsapp-import/lib/src/__tests__/to-classifier-input.test.ts +0 -59
- package/payload/platform/plugins/whatsapp-import/lib/src/delta-cursor.ts +0 -54
- package/payload/platform/plugins/whatsapp-import/lib/src/derive-keys.ts +0 -82
- package/payload/platform/plugins/whatsapp-import/lib/src/index.ts +0 -22
- package/payload/platform/plugins/whatsapp-import/lib/src/parse-export.ts +0 -471
- package/payload/platform/plugins/whatsapp-import/lib/src/sessionize.ts +0 -81
- package/payload/platform/plugins/whatsapp-import/lib/src/to-classifier-input.ts +0 -48
- package/payload/platform/plugins/whatsapp-import/lib/tsconfig.json +0 -9
- package/payload/platform/plugins/whatsapp-import/lib/vitest.config.ts +0 -9
- package/payload/platform/plugins/whatsapp-import/skills/whatsapp-import/references/conversation-archive-shape.md +0 -143
- package/payload/platform/plugins/whatsapp-import/skills/whatsapp-import/references/export-parse.md +0 -109
|
@@ -1,471 +0,0 @@
|
|
|
1
|
-
import { createHash } from "node:crypto";
|
|
2
|
-
import { readFileSync } from "node:fs";
|
|
3
|
-
|
|
4
|
-
// ---------------------------------------------------------------------------
|
|
5
|
-
// parse-export — deterministic WhatsApp `_chat.txt` parser (Task 805).
|
|
6
|
-
//
|
|
7
|
-
// Pure function. No LLM in the per-line decision path. Replaces the prose
|
|
8
|
-
// grammar that lived in references/export-parse.md when the database-operator
|
|
9
|
-
// specialist's Sonnet was the line tokeniser. Every grammar branch here is
|
|
10
|
-
// exercised by the vitest grid in `__tests__/parse-export.test.ts`; that
|
|
11
|
-
// grid IS the contract — extending the grammar means a new test first.
|
|
12
|
-
//
|
|
13
|
-
// Doctrine alignment:
|
|
14
|
-
// - feedback_deterministic_means_remove_llm.md — the LLM is no longer in
|
|
15
|
-
// the per-line decision path.
|
|
16
|
-
// - feedback_deterministic_is_a_shell_script.md — TypeScript is the right
|
|
17
|
-
// deliverable shape here (UTF-8 decode + multi-line body assembly + sha256
|
|
18
|
-
// would be cumbersome in shell); the LITERAL-MAPPING rule yields to
|
|
19
|
-
// "Node module" because the per-line decision path is the deliverable, not
|
|
20
|
-
// a one-shot orchestrator.
|
|
21
|
-
// - feedback_loud_failures.md — encoding errors, empty files, and lines
|
|
22
|
-
// that match a timestamp prefix but cannot be tokenised throw with named
|
|
23
|
-
// reasons rather than degrading silently.
|
|
24
|
-
// ---------------------------------------------------------------------------
|
|
25
|
-
|
|
26
|
-
/**
 * Arguments accepted by `parseExport`.
 *
 * `accountId` and `timezone` must be non-blank; `parseExport` throws otherwise.
 */
export interface ParseExportInput {
  /** Absolute path to the `_chat.txt` file. */
  filePath: string;
  /** Account scope used to compose `conversationId`. */
  accountId: string;
  /** IANA timezone the operator confirmed (e.g. `Europe/London`). */
  timezone: string;
  /**
   * Day/month ordering of the timestamp prefixes. Omit for auto-detect
   * (Task 845): the parser infers the ordering from the timestamp prefixes
   * found in the file, preferring DD/MM when a prefix is range-valid under
   * both readings.
   *
   * Only the ordering half of the value is consulted — the `YY` and `YYYY`
   * variants are treated alike. Year shape is resolved per line: the prefix
   * grammar accepts exactly 2-digit years (mapped to `2000+yy`) and 4-digit
   * years (passed through), so mixed-year files parse natively.
   */
  dateFormat?: "DD/MM/YY" | "MM/DD/YY" | "DD/MM/YYYY" | "MM/DD/YYYY";
}
|
|
42
|
-
|
|
43
|
-
/** One emitted chat message (system and media-only messages are never emitted). */
export interface ParsedLine {
  /** Text before the first `: ` separator of the message, trimmed. */
  senderName: string;
  /** ISO 8601 with timezone offset for the supplied IANA zone. */
  dateSent: string;
  /**
   * Text after the first `: ` separator with trailing whitespace removed.
   * Continuation lines of a multi-line message are joined with `\n`.
   */
  body: string;
  /** Position within emitted (post-skip) messages, 0-based. */
  sequenceIndex: number;
}
|
|
51
|
-
|
|
52
|
-
/** Per-category tallies produced while walking the export. */
export interface ParseExportCounters {
  /** Messages emitted into `parsedLines`. */
  parsed: number;
  /**
   * Messages dropped as system noise: colon-less lines matching a line-level
   * system pattern, messages with an empty body, and deleted-message bodies.
   */
  systemSkipped: number;
  /** Messages dropped because the body is a media/attachment placeholder. */
  mediaSkipped: number;
  /**
   * Incremented immediately before `parseExport` throws on a message with no
   * sender/body separator, so a successfully returned result carries 0 here.
   */
  parseErrors: number;
}
|
|
58
|
-
|
|
59
|
-
/** Value returned by `parseExport`. */
export interface ParseExportResult {
  /** `whatsapp-export:<sha256-hex>:<accountId>` — file digest scoped to the account. */
  conversationId: string;
  /** `whatsapp-export:<sha256-hex>` of the raw file bytes. */
  archiveSourceFile: string;
  /** Emitted messages in file order. */
  parsedLines: ParsedLine[];
  /** Tallies of emitted and skipped messages. */
  counters: ParseExportCounters;
}
|
|
66
|
-
|
|
67
|
-
// Year capture is `\d{4}|\d{2}` so a single regex covers both 2-digit (legacy)
// and 4-digit (modern WhatsApp default) prefixes — Task 845. Exactly 2 or 4
// chars; 3-digit years (truncation typos, hand-edited files) are rejected as
// not-a-prefix and surface via parse-grammar-miss, not silently coerced into
// year-202-AD timestamps. Year semantics are resolved per-match in
// `matchTimestampPrefix` from the captured length, not from the regex shape,
// so mixed-year files parse natively.
//
// Capture groups:
//   1 = first date component (2 digits)   2 = second date component (2 digits)
//   3 = year (4 or 2 digits)              4 = hour (1–2 digits)
//   5 = minute (2 digits)                 6 = seconds (2 digits, optional)
//   7 = everything after the closing `]` and any following whitespace
const TIMESTAMP_PREFIX_DDMMYY =
  /^\[(\d{2})\/(\d{2})\/(\d{4}|\d{2}),\s+(\d{1,2}):(\d{2})(?::(\d{2}))?\]\s*(.*)$/;

const TIMESTAMP_PREFIX_MMDDYY = TIMESTAMP_PREFIX_DDMMYY; // shape is identical; ordering differs in interpretation only
|
|
78
|
-
|
|
79
|
-
// System-message patterns that appear WITHOUT a `: ` sender/body separator.
// WhatsApp emits group-event and security-code lines as `<Sender> <verb> ...`
// (no colon). Lines that match the timestamp prefix but lack `: ` and do not
// match one of these patterns are LOUD-FAIL parse errors — never silently
// dropped.
//
// These are tested against the whole post-timestamp remainder (trailing
// whitespace removed). Patterns without `^`/`$` anchors match anywhere in
// that text.
const LINE_LEVEL_SYSTEM_PATTERNS: RegExp[] = [
  /^Messages and calls are end-to-end encrypted/i,
  /'s security code changed\.?$/i,
  / created group ["“”]/, // straight or curly opening quote before the group name
  / added /,
  / removed /,
  / left$/,
  / changed the subject from /,
  / changed this group's icon/,
  / joined using this group's invite link/,
  /^You're now an admin$/i,
  /^You created group/i,
];
|
|
97
|
-
|
|
98
|
-
// Body-level patterns evaluated after `Sender: body` split. These are real
// messages syntactically but carry no graph value (deletions, media-only).
// Both patterns are fully anchored and case-sensitive, so the body must be
// exactly the placeholder text (with an optional trailing full stop).
const BODY_LEVEL_SYSTEM_PATTERNS: RegExp[] = [
  /^You deleted this message\.?$/,
  /^This message was deleted\.?$/,
];
|
|
104
|
-
|
|
105
|
-
// Bodies that consist solely of a media/attachment placeholder. A message whose
// body matches any of these is counted as media-skipped and not emitted.
// All patterns are anchored at both ends and `.` does not cross newlines, so
// only single-line bodies can match.
const MEDIA_ONLY_PATTERNS: RegExp[] = [
  /^<Media omitted>$/,
  /^IMG-\d+-\w+\.(jpg|jpeg|png|heic|gif)\s*\(file attached\)$/i, // images
  /^VID-\d+-\w+\.mp4\s*\(file attached\)$/i, // video
  /^PTT-\d+-\w+\.opus\s*\(file attached\)$/i, // PTT-prefixed .opus — presumably push-to-talk voice notes
  /^AUD-\d+-\w+\.opus\s*\(file attached\)$/i, // audio
  /^STK-\d+-\w+\.webp\s*\(file attached\)$/i, // STK-prefixed .webp — presumably stickers
  /^.+\.(pdf|docx|doc|xlsx|xls|pptx|ppt|zip|csv|txt)\s*\(file attached\)$/i, // any document filename
  // NOTE(review): this matches ANY single-line body containing `attached:`
  // after at least one character (e.g. "See attached: invoice"), so genuine
  // text messages can be dropped as media. Presumably intended for the
  // `<attached: file>` placeholder — confirm against the test grid before
  // tightening.
  /^.+attached:\s*.+$/, // alternative LRM-prefixed format on some platforms
];
|
|
115
|
-
|
|
116
|
-
/**
 * Parse a WhatsApp `_chat.txt` export into ordered messages plus skip counters.
 *
 * The file is read synchronously, hashed (sha256 over the raw bytes) to derive
 * `archiveSourceFile` / `conversationId`, decoded and normalised, then walked in
 * two passes: pass one groups physical lines into timestamped messages
 * (non-prefix lines are continuations of the message above), pass two classifies
 * each message as system, media-only, or a real message to emit.
 *
 * @param input - File path, account scope, timezone and optional date ordering.
 * @returns The derived ids, the emitted messages in file order, and the counters.
 * @throws Error when `accountId` or `timezone` is blank, when the decoded file is
 *   empty, when a timestamped message has no `: ` separator and is not a known
 *   system line, or when nothing at all was recognised. Errors from reading or
 *   decoding the file propagate unchanged.
 */
export function parseExport(input: ParseExportInput): ParseExportResult {
  const { filePath, accountId, timezone } = input;
  const isBlank = (value: string): boolean => !value || !value.trim();

  if (isBlank(accountId)) {
    throw new Error("parse-export: accountId is required.");
  }
  if (isBlank(timezone)) {
    throw new Error("parse-export: timezone is required (e.g. 'Europe/London').");
  }

  // Identity of the archive is the digest of the untouched bytes on disk.
  const fileBytes = readFileSync(filePath);
  const digest = createHash("sha256").update(fileBytes).digest("hex");
  const archiveSourceFile = "whatsapp-export:" + digest;
  const conversationId = archiveSourceFile + ":" + accountId;

  const decoded = decodeAndNormalise(fileBytes);
  if (decoded === "") {
    throw new Error(
      `parse-export: file is empty — not a _chat.txt. file=${filePath}`,
    );
  }

  const lines = decoded.split("\n");
  // Explicit `dateFormat` wins; otherwise the ordering is inferred from the file.
  const ordering = resolveOrdering(input.dateFormat, lines);

  // Pass 1 — group physical lines into timestamped messages.
  interface RawMessage {
    rawLineIndex: number; // 1-based file line number, used in diagnostics
    year: number;
    month: number;
    day: number;
    hour: number;
    minute: number;
    second: number;
    remainder: string; // text after `]` plus any continuation lines
  }
  const pending: RawMessage[] = [];
  const finalIndex = lines.length - 1;

  lines.forEach((current, index) => {
    // The empty string after the file's trailing newline is not a line.
    if (index === finalIndex && current.length === 0) return;

    const hit = matchTimestampPrefix(current, ordering);
    if (hit) {
      pending.push({
        rawLineIndex: index + 1,
        ...hit.dateParts,
        remainder: hit.remainder,
      });
      return;
    }

    // No prefix: extend the message above. Lines ahead of the first
    // timestamped message have nothing to attach to and are dropped.
    const open = pending[pending.length - 1];
    if (open) {
      open.remainder = `${open.remainder}\n${current}`;
    }
  });

  // Pass 2 — classify. The remainder is deliberately NOT right-trimmed before
  // looking for the separator: `Joel: ` would lose its `: ` and an empty-body
  // skip would turn into a hard failure.
  const counters: ParseExportCounters = {
    parsed: 0,
    systemSkipped: 0,
    mediaSkipped: 0,
    parseErrors: 0,
  };
  const parsedLines: ParsedLine[] = [];

  for (const message of pending) {
    const separatorAt = findFirstColonSeparator(message.remainder);

    if (separatorAt === -1) {
      // Colon-less line: only known system lines may be skipped quietly.
      const rightTrimmed = message.remainder.replace(/\s+$/, "");
      if (!matchesAny(rightTrimmed, LINE_LEVEL_SYSTEM_PATTERNS)) {
        counters.parseErrors++;
        throw new Error(
          `parse-export: parse-error file=${filePath} line=${message.rawLineIndex} reason=no-sender-body-separator content="${rightTrimmed.slice(0, 80)}"`,
        );
      }
      counters.systemSkipped++;
      continue;
    }

    const body = message.remainder.slice(separatorAt + 2).replace(/\s+$/, "");

    const isSystemBody = body.length === 0 || matchesAny(body, BODY_LEVEL_SYSTEM_PATTERNS);
    if (isSystemBody) {
      counters.systemSkipped++;
      continue;
    }
    if (matchesAny(body, MEDIA_ONLY_PATTERNS)) {
      counters.mediaSkipped++;
      continue;
    }

    parsedLines.push({
      senderName: message.remainder.slice(0, separatorAt).trim(),
      dateSent: isoWithOffset(
        message.year,
        message.month,
        message.day,
        message.hour,
        message.minute,
        message.second,
        timezone,
      ),
      body,
      sequenceIndex: parsedLines.length,
    });
    counters.parsed++;
  }

  const recognised = parsedLines.length + counters.systemSkipped + counters.mediaSkipped;
  if (recognised === 0) {
    // Task 845: surface a sanitised sample of the first non-blank line, both on
    // stderr (grep-able in server.log) and in the thrown message, so the
    // operator can see which header shape the grammar rejected.
    const sample = sampleFirstNonBlankLine(lines, 100);
    process.stderr.write(
      `[whatsapp-import] parse-grammar-miss first-line="${sample}"\n`,
    );
    throw new Error(
      `parse-export: zero parsed lines after walking ${filePath} — not a _chat.txt or all lines failed grammar. parse-grammar-miss first-line="${sample}"`,
    );
  }

  return { conversationId, archiveSourceFile, parsedLines, counters };
}
|
|
268
|
-
|
|
269
|
-
// ---------------------------------------------------------------------------
|
|
270
|
-
// Internals
|
|
271
|
-
// ---------------------------------------------------------------------------
|
|
272
|
-
|
|
273
|
-
/**
 * Decode the raw export bytes as strict UTF-8 and normalise the text for the
 * line tokeniser.
 *
 * Steps, in order: strict decode, strip a leading BOM, convert CRLF / lone CR
 * to LF, then remove bidi marks (U+200E LRM, U+200F RLM) that sit at the very
 * start of a line. Bidi marks elsewhere in a line are left untouched.
 *
 * @param bytes - Raw file contents.
 * @returns The normalised text (may be the empty string).
 * @throws Error when the bytes are not valid UTF-8.
 */
function decodeAndNormalise(bytes: Buffer): string {
  // Strict decode: `fatal: true` throws on invalid bytes instead of silently
  // substituting U+FFFD the way `Buffer.toString('utf8')` does, so bad bytes
  // cannot propagate into the graph.
  let text: string;
  try {
    text = new TextDecoder("utf-8", { fatal: true }).decode(bytes);
  } catch (err) {
    throw new Error(
      `parse-export: UTF-8 decode failed — ${err instanceof Error ? err.message : String(err)}. The file is not valid UTF-8; re-export from WhatsApp.`,
    );
  }

  // Strip a leading BOM (U+FEFF) if one is still present after decoding.
  if (text.charCodeAt(0) === 0xfeff) {
    text = text.slice(1);
  }

  // Normalise mixed line endings (CRLF and lone CR) to LF.
  text = text.replace(/\r\n?/g, "\n");

  // Task 887 — strip bidi marks only at line-start, where some WhatsApp builds
  // prefix the timestamp header. Left in place, the header regex misses the
  // line, it is glued onto the previous body, and sender names are later
  // parsed out of polluted text. Body-internal marks (e.g. the LRM in
  // `: Forwarded`) are preserved.
  //
  // The marks are spelled as \u escapes on purpose: as literal characters they
  // are invisible and easily lost by editors, formatters and copy/paste, which
  // silently turns the character class into an empty one that never matches.
  const LRM = "\u200E";
  let lrmStripped = 0;
  let rlmStripped = 0;
  text = text.replace(
    /(^|\n)([\u200E\u200F]+)/g,
    (_match: string, lineStart: string, marks: string): string => {
      for (const ch of marks) {
        if (ch === LRM) lrmStripped++;
        else rlmStripped++; // the class admits only LRM and RLM
      }
      return lineStart;
    },
  );

  // Report counts on stderr for the operator's tail, only when something was removed.
  if (lrmStripped + rlmStripped > 0) {
    process.stderr.write(
      `[whatsapp-ingest] decoded normalised lrm-stripped=${lrmStripped} rlm-stripped=${rlmStripped}\n`,
    );
  }

  return text;
}
|
|
322
|
-
|
|
323
|
-
/** Result of successfully matching a timestamp prefix at the start of a line. */
interface TimestampMatch {
  /** Wall-clock components read from the prefix; `second` is 0 when the prefix omits seconds. */
  dateParts: {
    year: number;
    month: number;
    day: number;
    hour: number;
    minute: number;
    second: number;
  };
  /** Text following the closing `]` of the prefix on the same line. */
  remainder: string;
}
|
|
334
|
-
|
|
335
|
-
/** How the first two date components of a prefix are read: day-first or month-first. */
type Ordering = "DDMM" | "MMDD";
|
|
336
|
-
|
|
337
|
-
/**
 * Try to read a `[dd/mm/yy(yy), h:mm(:ss)]` prefix from the start of `line`.
 *
 * @param line - One physical line of the export.
 * @param ordering - Whether the first two date components are day/month or month/day.
 * @returns The date parts and the text after the prefix, or `null` when the line
 *   does not start with a prefix or any component is out of range.
 */
function matchTimestampPrefix(
  line: string,
  ordering: Ordering,
): TimestampMatch | null {
  const monthFirst = ordering === "MMDD";
  const pattern = monthFirst ? TIMESTAMP_PREFIX_MMDDYY : TIMESTAMP_PREFIX_DDMMYY;

  const groups = pattern.exec(line);
  if (groups === null) return null;

  const toInt = (digits: string): number => parseInt(digits, 10);

  const first = toInt(groups[1]);
  const secondComponent = toInt(groups[2]);
  const [day, month] = monthFirst ? [secondComponent, first] : [first, secondComponent];

  const hour = toInt(groups[4]);
  const minute = toInt(groups[5]);
  // Seconds are optional in the prefix; absent means :00.
  const second = groups[6] === undefined ? 0 : toInt(groups[6]);

  // Reject out-of-range components here rather than letting Date.UTC roll them
  // over silently (Date.UTC(2026, 13, 1) → 2027-02-01), which would corrupt
  // timestamps when a file is read with the wrong ordering. A rejected line is
  // simply "not a prefix" to the caller.
  const badDate = month < 1 || month > 12 || day < 1 || day > 31;
  const badTime = hour > 23 || minute > 59 || second > 59;
  if (badDate || badTime) return null;

  // Task 845: a 2-character year is 20yy; a 4-character year is taken as-is.
  // Decided per line, so one file may contain both shapes.
  const yearDigits = groups[3];
  const yearValue = toInt(yearDigits);
  const year = yearDigits.length === 2 ? 2000 + yearValue : yearValue;

  return {
    dateParts: { year, month, day, hour, minute, second },
    remainder: groups[7] ?? "",
  };
}
|
|
370
|
-
|
|
371
|
-
/**
 * Decide whether timestamp prefixes are read day-first or month-first.
 *
 * An explicit `dateFormat` always wins (only its DD/MM vs MM/DD half matters).
 * Otherwise the ordering is inferred from the file: every line is tried under
 * both readings and only lines that are range-valid under exactly ONE reading
 * cast a vote. A prefix such as `01/02/2024` is valid either way and proves
 * nothing — locking the ordering from such a line mis-reads month-first exports
 * that happen to begin on a day ≤ 12, after which later lines with day > 12
 * stop matching the prefix and are folded into the previous message body.
 *
 * @param explicit - Operator-supplied date format, or `undefined` to auto-detect.
 * @param lines - The normalised lines of the export.
 * @returns `"MMDD"` when month-first evidence outweighs day-first evidence;
 *   `"DDMM"` otherwise, including when no line is decisive or no line matches.
 */
function resolveOrdering(
  explicit: ParseExportInput["dateFormat"],
  lines: readonly string[],
): Ordering {
  if (explicit === "MM/DD/YY" || explicit === "MM/DD/YYYY") return "MMDD";
  if (explicit === "DD/MM/YY" || explicit === "DD/MM/YYYY") return "DDMM";

  let dayFirstOnly = 0;
  let monthFirstOnly = 0;
  for (const line of lines) {
    const validDayFirst = matchTimestampPrefix(line, "DDMM") !== null;
    const validMonthFirst = matchTimestampPrefix(line, "MMDD") !== null;
    // Valid under both readings (ambiguous) or neither (not a prefix): no evidence.
    if (validDayFirst === validMonthFirst) continue;
    if (validDayFirst) dayFirstOnly++;
    else monthFirstOnly++;
  }

  // Majority rather than first-hit so a single stray line (e.g. a pasted quote
  // from another chat) cannot flip the whole file. Ties and "no evidence" fall
  // back to DD/MM; with no matching line at all the caller fails on zero
  // parsed lines regardless.
  return monthFirstOnly > dayFirstOnly ? "MMDD" : "DDMM";
}
|
|
386
|
-
|
|
387
|
-
/**
 * Produce a short, log-safe sample of the first non-blank line.
 *
 * @param lines - Lines to inspect, in order.
 * @param maxScan - Upper bound on how many leading lines are examined.
 * @returns The first line that is non-empty after trimming, with ASCII control
 *   characters removed and cut to at most 80 characters; `""` when none of the
 *   examined lines qualifies.
 */
function sampleFirstNonBlankLine(lines: readonly string[], maxScan: number): string {
  const limit = Math.min(maxScan, lines.length);
  let cursor = 0;
  while (cursor < limit) {
    const candidate = lines[cursor].trim();
    cursor += 1;
    if (candidate.length > 0) {
      // Dropping control characters keeps the diagnostic on a single,
      // grep-friendly line; 80 characters is enough to recognise the header shape.
      return candidate.replace(/[\x00-\x1F\x7F]/g, "").slice(0, 80);
    }
  }
  return "";
}
|
|
400
|
-
|
|
401
|
-
/**
 * Locate the sender/body separator in the text that follows a timestamp.
 *
 * The separator is the first occurrence of a colon immediately followed by a
 * space. A colon that is not followed by a space is not treated as a
 * separator.
 *
 * @param remainder Line content after the timestamp prefix.
 * @returns Index of the colon of the first `": "`, or -1 when there is none.
 */
function findFirstColonSeparator(remainder: string): number {
  return remainder.indexOf(": ");
}
|
|
408
|
-
|
|
409
|
-
/**
 * Report whether `text` is matched by at least one of `patterns`.
 *
 * Patterns are tested in array order and testing stops at the first match.
 *
 * @param text     String to test.
 * @param patterns Regular expressions to try.
 * @returns `true` on the first matching pattern, `false` if none match
 *          (including when `patterns` is empty).
 */
function matchesAny(text: string, patterns: RegExp[]): boolean {
  return patterns.some((pattern) => pattern.test(text));
}
|
|
415
|
-
|
|
416
|
-
/**
 * Render wall-clock components as an ISO 8601 string carrying the UTC offset
 * that `timezone` holds for that local time.
 *
 * The offset is found in two passes so DST is handled: the components are
 * first treated as if they were UTC to obtain a provisional offset, that
 * offset is subtracted to approximate the true UTC instant, and the offset is
 * then read again at the approximated instant.
 *
 * @param year     Full year.
 * @param month    Month, 1-based.
 * @param day      Day of month.
 * @param hour     Hour, 0–23.
 * @param minute   Minute.
 * @param second   Second.
 * @param timezone IANA zone name.
 * @returns `YYYY-MM-DDTHH:MM:SS±HH:MM`; the date and time fields are the
 *          inputs zero-padded, unchanged by the offset lookup.
 */
function isoWithOffset(
  year: number,
  month: number,
  day: number,
  hour: number,
  minute: number,
  second: number,
  timezone: string,
): string {
  const wallClockAsUtcMs = Date.UTC(year, month - 1, day, hour, minute, second);
  const provisionalOffset = offsetMinutesAt(new Date(wallClockAsUtcMs), timezone);
  const offsetMinutes = offsetMinutesAt(
    new Date(wallClockAsUtcMs - provisionalOffset * 60_000),
    timezone,
  );

  const pad = (value: number, width: number): string => String(value).padStart(width, "0");

  const magnitude = Math.abs(offsetMinutes);
  const datePart = [pad(year, 4), pad(month, 2), pad(day, 2)].join("-");
  const timePart = [pad(hour, 2), pad(minute, 2), pad(second, 2)].join(":");
  const offsetPart =
    (offsetMinutes >= 0 ? "+" : "-") +
    pad(Math.floor(magnitude / 60), 2) +
    ":" +
    pad(magnitude % 60, 2);

  return `${datePart}T${timePart}${offsetPart}`;
}
|
|
446
|
-
|
|
447
|
-
/**
 * Read the UTC offset, in minutes, that an IANA zone has at a given instant.
 *
 * The offset is taken from the `timeZoneName` part produced by
 * `Intl.DateTimeFormat` with `timeZoneName: "longOffset"`, whose expected
 * shapes are "GMT+01:00", "GMT-05:00" or plain "GMT".
 *
 * @param date     The instant to inspect.
 * @param timezone IANA zone name.
 * @returns Signed offset in minutes (east of UTC is positive).
 * @throws Error when the formatter yields no zone-name part or the part's
 *         text is not a recognised offset.
 */
function offsetMinutesAt(date: Date, timezone: string): number {
  const zonePart = new Intl.DateTimeFormat("en-US", {
    timeZone: timezone,
    timeZoneName: "longOffset",
  })
    .formatToParts(date)
    .find((part) => part.type === "timeZoneName");

  if (zonePart === undefined) {
    throw new Error(`parse-export: unable to read offset for timezone "${timezone}".`);
  }

  const label = zonePart.value;
  // A bare zone label with no numeric suffix means zero offset.
  if (label === "GMT" || label === "UTC") return 0;

  const parsed = /^(?:GMT|UTC)([+-])(\d{1,2}):?(\d{2})?$/.exec(label);
  if (parsed === null) {
    throw new Error(
      `parse-export: cannot parse timezone offset "${label}" for IANA zone "${timezone}".`,
    );
  }

  const [, signChar, hoursText, minutesText] = parsed;
  const hours = parseInt(hoursText, 10);
  const minutes = minutesText ? parseInt(minutesText, 10) : 0;
  const direction = signChar === "+" ? 1 : -1;
  return direction * (hours * 60 + minutes);
}
|
|
@@ -1,81 +0,0 @@
|
|
|
1
|
-
import type { ParsedLine } from "./parse-export.js";
|
|
2
|
-
|
|
3
|
-
// ---------------------------------------------------------------------------
|
|
4
|
-
// sessionize — Pass 1 of the chunked-archive pipeline (Task 891).
|
|
5
|
-
//
|
|
6
|
-
// Pure function. Splits a chronologically-ordered sequence of parsed messages
|
|
7
|
-
// into "sessions" wherever the gap between consecutive `dateSent` values
|
|
8
|
-
// reaches or exceeds `gapHours`. Each session feeds memory-classify (mode='chat') as a
|
|
9
|
-
// turn-attributed block; the LLM then chunks each session into one or more
|
|
10
|
-
// `:Section:Conversation` rows with summary+keywords (Pass 2).
|
|
11
|
-
//
|
|
12
|
-
// Why deterministic gap-cut, not LLM topic detection:
|
|
13
|
-
// - The natural cadence of human chat (sleep, working hours, weekend gaps)
|
|
14
|
-
// produces clean session boundaries that the operator can intuit.
|
|
15
|
-
// - LLM-only chunking against a 10K-message archive sends 10K messages into
|
|
16
|
-
// one prompt and pays attention only to the last 1K — gap-cut bounds the
|
|
17
|
-
// window before the LLM ever sees it.
|
|
18
|
-
// - The default 12h gap matches one sleep cycle: messages on the same day
|
|
19
|
-
// belong together; a 14-hour gap (last evening message → next morning) is
|
|
20
|
-
// a fresh session even when the topic is identical.
|
|
21
|
-
// ---------------------------------------------------------------------------
|
|
22
|
-
|
|
23
|
-
/** One contiguous run of messages produced by `sessionize`. */
export interface Session {
  /** Position of this session within the archive, counting from 0. */
  index: number;
  /** `dateSent` of the session's first message (ISO 8601). */
  firstMessageAt: string;
  /** `dateSent` of the session's last message (ISO 8601). */
  lastMessageAt: string;
  /** The session's messages, in chronological order. */
  messages: ParsedLine[];
}
|
|
33
|
-
|
|
34
|
-
/**
 * Split parsed messages into sessions wherever the gap between two
 * consecutive messages is at least `gapHours`.
 *
 * Input must already be sorted by `dateSent`; the function compares each
 * message only with its immediate predecessor and does not sort.
 *
 * Boundary semantics (exact-at-threshold):
 *   gap == gapHours → cut here (start a new session)
 *   gap <  gapHours → same session
 *   gap >  gapHours → cut here
 *
 * Empty input returns []; single-message input returns one one-message session.
 *
 * @param messages Chronologically ordered messages.
 * @param gapHours Minimum gap, in hours, that starts a new session. Must be
 *                 greater than zero.
 * @returns Sessions in order, each with a 0-based `index`.
 * @throws Error when `gapHours` is zero, negative, or NaN.
 */
export function sessionize(
  messages: readonly ParsedLine[],
  gapHours: number,
): Session[] {
  // Written as `!(x > 0)` rather than `x <= 0` so that NaN is rejected as
  // well: a NaN threshold would make every `gap >= gapMs` comparison false
  // and silently collapse the whole archive into a single session.
  if (!(gapHours > 0)) {
    throw new Error(`sessionize: gapHours must be positive, got ${gapHours}`);
  }
  if (messages.length === 0) return [];

  const gapMs = gapHours * 60 * 60 * 1000;
  const sessions: Session[] = [];
  let currentMessages: ParsedLine[] = [messages[0]];

  // Close the session being accumulated and append it to the result.
  const flush = () => {
    sessions.push({
      index: sessions.length,
      firstMessageAt: currentMessages[0].dateSent,
      lastMessageAt: currentMessages[currentMessages.length - 1].dateSent,
      messages: currentMessages,
    });
  };

  for (let i = 1; i < messages.length; i++) {
    const prevMs = Date.parse(messages[i - 1].dateSent);
    const currMs = Date.parse(messages[i].dateSent);
    // If either timestamp is unparseable the gap is NaN, the comparison
    // below is false, and the message stays in the current session.
    const gap = currMs - prevMs;
    if (gap >= gapMs) {
      flush();
      currentMessages = [messages[i]];
    } else {
      currentMessages.push(messages[i]);
    }
  }
  flush();
  return sessions;
}
|
|
@@ -1,48 +0,0 @@
|
|
|
1
|
-
import type { Session } from "./sessionize.js";
|
|
2
|
-
|
|
3
|
-
// ---------------------------------------------------------------------------
|
|
4
|
-
// to-classifier-input — Pass 2 input formatter (Task 891).
|
|
5
|
-
//
|
|
6
|
-
// Pure function. Renders one Session as a turn-attributed text block ready
|
|
7
|
-
// to hand to memory-classify (mode='chat'). Format:
|
|
8
|
-
//
|
|
9
|
-
// [YYYY-MM-DD HH:MM:SS TZ] <Sender>: <body>
|
|
10
|
-
// [YYYY-MM-DD HH:MM:SS TZ] <Sender>: <body>
|
|
11
|
-
// ...
|
|
12
|
-
//
|
|
13
|
-
// Multi-line message bodies are kept verbatim (with their internal newlines).
|
|
14
|
-
// The leading `[ts] <Sender>: ` prefix is the only structural addition; the
|
|
15
|
-
// classifier prompt instructs Haiku to preserve it in the chunk `body` so
|
|
16
|
-
// downstream Phase 2 work can recover per-message provenance via snippet
|
|
17
|
-
// matching against the conversation tail.
|
|
18
|
-
//
|
|
19
|
-
// Timezone: each `dateSent` ISO already carries an offset (set by parseExport
|
|
20
|
-
// from the operator's confirmed IANA zone). The renderer prints the
|
|
21
|
-
// human-readable wall-clock for that offset; the trailing "TZ" suffix is the
|
|
22
|
-
// offset itself, not a zone name.
|
|
23
|
-
// ---------------------------------------------------------------------------
|
|
24
|
-
|
|
25
|
-
/**
 * Render a session as a turn-attributed text block.
 *
 * Each message becomes `[<wall-clock>] <senderName>: <body>`, where the
 * wall-clock text comes from `formatWallClock(dateSent)`. Messages are joined
 * with a single newline; message bodies are inserted as-is.
 *
 * @param session The session to render.
 * @returns The rendered block; an empty string when the session has no messages.
 */
export function toClassifierInput(session: Session): string {
  return session.messages
    .map((message) => `[${formatWallClock(message.dateSent)}] ${message.senderName}: ${message.body}`)
    .join("\n");
}
|
|
32
|
-
|
|
33
|
-
/**
 * Format an ISO 8601 instant with offset as `YYYY-MM-DD HH:MM:SS ±HH:MM`.
 *
 * The date, time and offset are read straight from the string; no `Date` is
 * constructed, so nothing is re-interpreted in the local zone. Fractional
 * seconds are dropped. The offset is always emitted in the colon form: `Z`
 * becomes `+00:00` and a compact offset such as `+0100` becomes `+01:00`.
 *
 * @param iso ISO 8601 timestamp, e.g. "2024-03-05T09:07:03+01:00".
 * @returns The formatted wall-clock text, or `iso` unchanged when it does not
 *          have the expected shape (this function never throws on its input).
 */
function formatWallClock(iso: string): string {
  const m = iso.match(
    /^(\d{4})-(\d{2})-(\d{2})T(\d{2}):(\d{2}):(\d{2})(?:\.\d+)?(?:Z|([+-]\d{2}):?(\d{2}))$/,
  );
  if (!m) return iso; // surface the raw value if the shape drifted
  const [, y, mo, d, h, mi, s, offHours, offMinutes] = m;
  // The offset groups are unset only when the `Z` alternative matched.
  const offsetLabel = offHours === undefined ? "+00:00" : `${offHours}:${offMinutes}`;
  return `${y}-${mo}-${d} ${h}:${mi}:${s} ${offsetLabel}`;
}
|