@rubytech/create-realagent 1.0.826 → 1.0.829

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (103)
  1. package/package.json +1 -1
  2. package/payload/platform/neo4j/schema.cypher +35 -2
  3. package/payload/platform/package.json +2 -2
  4. package/payload/platform/plugins/admin/hooks/__tests__/archive-ingest-surface-gate.test.sh +39 -54
  5. package/payload/platform/plugins/admin/hooks/archive-ingest-surface-gate.sh +26 -52
  6. package/payload/platform/plugins/admin/skills/onboarding/SKILL.md +7 -7
  7. package/payload/platform/plugins/docs/references/cloudflare.md +1 -1
  8. package/payload/platform/plugins/docs/references/plugins-guide.md +1 -1
  9. package/payload/platform/plugins/docs/references/troubleshooting.md +1 -0
  10. package/payload/platform/plugins/memory/PLUGIN.md +5 -5
  11. package/payload/platform/plugins/memory/mcp/dist/index.js +18 -253
  12. package/payload/platform/plugins/memory/mcp/dist/index.js.map +1 -1
  13. package/payload/platform/plugins/memory/mcp/dist/lib/__tests__/llm-classifier.test.js +51 -0
  14. package/payload/platform/plugins/memory/mcp/dist/lib/__tests__/llm-classifier.test.js.map +1 -1
  15. package/payload/platform/plugins/memory/mcp/dist/lib/__tests__/schema-validator.test.js +103 -0
  16. package/payload/platform/plugins/memory/mcp/dist/lib/__tests__/schema-validator.test.js.map +1 -1
  17. package/payload/platform/plugins/memory/mcp/dist/lib/llm-classifier.d.ts +19 -4
  18. package/payload/platform/plugins/memory/mcp/dist/lib/llm-classifier.d.ts.map +1 -1
  19. package/payload/platform/plugins/memory/mcp/dist/lib/llm-classifier.js +149 -56
  20. package/payload/platform/plugins/memory/mcp/dist/lib/llm-classifier.js.map +1 -1
  21. package/payload/platform/plugins/memory/mcp/dist/lib/schema-validator.d.ts +16 -1
  22. package/payload/platform/plugins/memory/mcp/dist/lib/schema-validator.d.ts.map +1 -1
  23. package/payload/platform/plugins/memory/mcp/dist/lib/schema-validator.js +12 -3
  24. package/payload/platform/plugins/memory/mcp/dist/lib/schema-validator.js.map +1 -1
  25. package/payload/platform/plugins/memory/mcp/dist/tools/__tests__/memory-archive-write.test.js +2 -138
  26. package/payload/platform/plugins/memory/mcp/dist/tools/__tests__/memory-archive-write.test.js.map +1 -1
  27. package/payload/platform/plugins/memory/mcp/dist/tools/__tests__/memory-ingest.test.d.ts +2 -0
  28. package/payload/platform/plugins/memory/mcp/dist/tools/__tests__/memory-ingest.test.d.ts.map +1 -0
  29. package/payload/platform/plugins/memory/mcp/dist/tools/__tests__/memory-ingest.test.js +66 -0
  30. package/payload/platform/plugins/memory/mcp/dist/tools/__tests__/memory-ingest.test.js.map +1 -0
  31. package/payload/platform/plugins/memory/mcp/dist/tools/__tests__/profile-update-personfields-open.test.d.ts +2 -0
  32. package/payload/platform/plugins/memory/mcp/dist/tools/__tests__/profile-update-personfields-open.test.d.ts.map +1 -0
  33. package/payload/platform/plugins/memory/mcp/dist/tools/__tests__/profile-update-personfields-open.test.js +148 -0
  34. package/payload/platform/plugins/memory/mcp/dist/tools/__tests__/profile-update-personfields-open.test.js.map +1 -0
  35. package/payload/platform/plugins/memory/mcp/dist/tools/memory-archive-write.d.ts +1 -64
  36. package/payload/platform/plugins/memory/mcp/dist/tools/memory-archive-write.d.ts.map +1 -1
  37. package/payload/platform/plugins/memory/mcp/dist/tools/memory-archive-write.js +6 -336
  38. package/payload/platform/plugins/memory/mcp/dist/tools/memory-archive-write.js.map +1 -1
  39. package/payload/platform/plugins/memory/mcp/dist/tools/memory-ingest.d.ts +30 -0
  40. package/payload/platform/plugins/memory/mcp/dist/tools/memory-ingest.d.ts.map +1 -1
  41. package/payload/platform/plugins/memory/mcp/dist/tools/memory-ingest.js +231 -0
  42. package/payload/platform/plugins/memory/mcp/dist/tools/memory-ingest.js.map +1 -1
  43. package/payload/platform/plugins/memory/mcp/dist/tools/profile-update.d.ts +21 -17
  44. package/payload/platform/plugins/memory/mcp/dist/tools/profile-update.d.ts.map +1 -1
  45. package/payload/platform/plugins/memory/mcp/dist/tools/profile-update.js +77 -37
  46. package/payload/platform/plugins/memory/mcp/dist/tools/profile-update.js.map +1 -1
  47. package/payload/platform/plugins/memory/references/schema-base.md +7 -2
  48. package/payload/platform/plugins/memory/skills/document-ingest/SKILL.md +54 -4
  49. package/payload/platform/plugins/whatsapp/PLUGIN.md +1 -1
  50. package/payload/platform/plugins/whatsapp-import/lib/dist/delta-cursor.d.ts +18 -0
  51. package/payload/platform/plugins/whatsapp-import/lib/dist/delta-cursor.d.ts.map +1 -0
  52. package/payload/platform/plugins/whatsapp-import/lib/dist/delta-cursor.js +31 -0
  53. package/payload/platform/plugins/whatsapp-import/lib/dist/delta-cursor.js.map +1 -0
  54. package/payload/platform/plugins/whatsapp-import/lib/dist/derive-keys.d.ts +27 -12
  55. package/payload/platform/plugins/whatsapp-import/lib/dist/derive-keys.d.ts.map +1 -1
  56. package/payload/platform/plugins/whatsapp-import/lib/dist/derive-keys.js +40 -20
  57. package/payload/platform/plugins/whatsapp-import/lib/dist/derive-keys.js.map +1 -1
  58. package/payload/platform/plugins/whatsapp-import/lib/dist/index.d.ts +7 -4
  59. package/payload/platform/plugins/whatsapp-import/lib/dist/index.d.ts.map +1 -1
  60. package/payload/platform/plugins/whatsapp-import/lib/dist/index.js +9 -6
  61. package/payload/platform/plugins/whatsapp-import/lib/dist/index.js.map +1 -1
  62. package/payload/platform/plugins/whatsapp-import/lib/dist/sessionize.d.ts +25 -0
  63. package/payload/platform/plugins/whatsapp-import/lib/dist/sessionize.d.ts.map +1 -0
  64. package/payload/platform/plugins/whatsapp-import/lib/dist/sessionize.js +48 -0
  65. package/payload/platform/plugins/whatsapp-import/lib/dist/sessionize.js.map +1 -0
  66. package/payload/platform/plugins/whatsapp-import/lib/dist/to-classifier-input.d.ts +3 -0
  67. package/payload/platform/plugins/whatsapp-import/lib/dist/to-classifier-input.d.ts.map +1 -0
  68. package/payload/platform/plugins/whatsapp-import/lib/dist/to-classifier-input.js +47 -0
  69. package/payload/platform/plugins/whatsapp-import/lib/dist/to-classifier-input.js.map +1 -0
  70. package/payload/platform/scripts/seed-neo4j.sh +15 -14
  71. package/payload/platform/templates/specialists/agents/database-operator.md +10 -17
  72. package/payload/server/chunk-CUSH3UXP.js +2305 -0
  73. package/payload/server/chunk-IWNDVGKT.js +10077 -0
  74. package/payload/server/chunk-KC7NUABI.js +654 -0
  75. package/payload/server/chunk-T2OPNP3L.js +654 -0
  76. package/payload/server/chunk-WUVXPZIV.js +1116 -0
  77. package/payload/server/client-pool-3TM3SRIA.js +32 -0
  78. package/payload/server/cloudflare-task-tracker-4NIODMGL.js +19 -0
  79. package/payload/server/cloudflare-task-tracker-CR6TL4VL.js +19 -0
  80. package/payload/server/maxy-edge.js +3 -3
  81. package/payload/server/neo4j-migrations-XTQ4WEV6.js +428 -0
  82. package/payload/server/public/assets/{admin-DOkUspG1.js → admin-BNwPsMhJ.js} +2 -2
  83. package/payload/server/public/assets/{graph-LLMJa4Ch.js → graph-N_Bw-8oT.js} +1 -1
  84. package/payload/server/public/assets/{page-DoaF3DB0.js → page-BKLGP-th.js} +1 -1
  85. package/payload/server/public/graph.html +2 -2
  86. package/payload/server/public/index.html +2 -2
  87. package/payload/server/server.js +281 -168
  88. package/payload/platform/plugins/whatsapp-import/PLUGIN.md +0 -46
  89. package/payload/platform/plugins/whatsapp-import/bin/ingest.mjs +0 -670
  90. package/payload/platform/plugins/whatsapp-import/bin/whatsapp-ingest.sh +0 -131
  91. package/payload/platform/plugins/whatsapp-import/lib/src/__tests__/filter-gate.test.ts +0 -172
  92. package/payload/platform/plugins/whatsapp-import/lib/src/__tests__/ingest-idempotence.test.ts +0 -141
  93. package/payload/platform/plugins/whatsapp-import/lib/src/__tests__/parse-export-lrm.test.ts +0 -83
  94. package/payload/platform/plugins/whatsapp-import/lib/src/__tests__/parse-export.test.ts +0 -678
  95. package/payload/platform/plugins/whatsapp-import/lib/src/derive-keys.ts +0 -59
  96. package/payload/platform/plugins/whatsapp-import/lib/src/filter.ts +0 -136
  97. package/payload/platform/plugins/whatsapp-import/lib/src/index.ts +0 -19
  98. package/payload/platform/plugins/whatsapp-import/lib/src/parse-export.ts +0 -471
  99. package/payload/platform/plugins/whatsapp-import/lib/tsconfig.json +0 -9
  100. package/payload/platform/plugins/whatsapp-import/lib/vitest.config.ts +0 -9
  101. package/payload/platform/plugins/whatsapp-import/skills/whatsapp-import/SKILL.md +0 -131
  102. package/payload/platform/plugins/whatsapp-import/skills/whatsapp-import/references/export-parse.md +0 -109
  103. package/payload/platform/plugins/whatsapp-import/skills/whatsapp-import-enrich/SKILL.md +0 -333
@@ -1,59 +0,0 @@
1
- import { createHash } from "node:crypto";
2
-
3
- // ---------------------------------------------------------------------------
4
- // derive-keys — natural-key derivation for whatsapp-import (Task 870).
5
- //
6
- // Pure functions. No I/O. The whole point is that re-imports of the same
7
- // archive collapse to the same Message identity regardless of release-level
8
- // drift in array indices, hash widths, or arbitrary tiebreakers.
9
- //
10
- // Key shape (Task 870 brief):
11
- //
12
- // messageId = whatsapp-export:msg:<conversationSha256>:<dateSentISO>
13
- // :<NFKC-trim-lower(senderName)>
14
- // :<sha256-hex(body)>
15
- //
16
- // Operator constraint: the same archive must be re-imported with the same
17
- // `--timezone` flag. Different timezones reinterpret wall-clock instants and
18
- // will produce drifted messageIds — that is correct semantics, not a bug.
19
- // Documented in .docs/whatsapp.md natural-key contract section.
20
- // ---------------------------------------------------------------------------
21
-
22
- export function normaliseSenderName(name: string): string {
23
- return name.normalize("NFKC").trim().toLowerCase();
24
- }
25
-
26
- export function sha256Hex(input: string): string {
27
- return createHash("sha256").update(input).digest("hex");
28
- }
29
-
30
- export interface DeriveMessageIdInput {
31
- /** SHA-256 of the source `_chat.txt` bytes — stable across re-imports. */
32
- conversationSha256: string;
33
- /** ISO 8601 with timezone offset, as emitted by parseExport. */
34
- dateSent: string;
35
- /** Raw senderName from the export line. Normalised internally. */
36
- senderName: string;
37
- /** Raw message body. Hashed internally. */
38
- body: string;
39
- }
40
-
41
- export function deriveMessageId(input: DeriveMessageIdInput): string {
42
- const norm = normaliseSenderName(input.senderName);
43
- const bodyHash = sha256Hex(input.body);
44
- return `whatsapp-export:msg:${input.conversationSha256}:${input.dateSent}:${norm}:${bodyHash}`;
45
- }
46
-
47
- export interface ObservationContentFields {
48
- summary?: string | null;
49
- from?: string | null;
50
- to?: string | null;
51
- subject?: string | null;
52
- }
53
-
54
- export function observationContentHash(fields: ObservationContentFields): string {
55
- const parts = [fields.summary, fields.from, fields.to, fields.subject].map(
56
- (p) => (p ?? "").normalize("NFKC").trim().toLowerCase(),
57
- );
58
- return sha256Hex(parts.join("|"));
59
- }
@@ -1,136 +0,0 @@
1
- // ---------------------------------------------------------------------------
2
- // filter — operator-supplied gate over ParsedLine[] (Task 871).
3
- //
4
- // Phase 1 ingest is now mandatory-filter: the deterministic Bash entry refuses
5
- // to write a bulk archive without `--filter`. Three forms cover the operator
6
- // patterns named in the brief:
7
- //
8
- // --filter all → no row drop
9
- // --filter senders=Alice,Bob Carter → keep rows whose
10
- // senderName matches
11
- // any csv entry exactly
12
- // --filter date-range=2024-01-01..2024-06-30 → keep rows whose
13
- // dateSent ISO falls
14
- // inside the inclusive
15
- // range (date or full
16
- // ISO 8601)
17
- //
18
- // Doctrine alignment:
19
- // - feedback_compress_at_ingest_for_bulk_archives.md — the gate is
20
- // mandatory at write-time, not after.
21
- // - feedback_deterministic_means_remove_llm.md — the filter parser is a
22
- // pure function, no LLM in the per-row decision path.
23
- // - feedback_loud_failures.md — malformed `--filter` raises a structured
24
- // error with a named reason rather than silently coercing to `all`.
25
- // ---------------------------------------------------------------------------
26
-
27
- import type { ParsedLine } from "./parse-export.js";
28
-
29
- export type Filter =
30
- | { kind: "all" }
31
- | { kind: "senders"; senders: string[] }
32
- | { kind: "date-range"; fromIso: string; toIso: string };
33
-
34
- /**
35
- * Parse a CLI `--filter` argument into a structured Filter.
36
- *
37
- * Throws Error with message starting "filter: …" on malformed input. The
38
- * caller (ingest.mjs / vitest) surfaces the reason verbatim — the brief
39
- * mandates `[whatsapp-ingest] FAIL filter-required reason="…"` so the
40
- * operator can grep one line.
41
- */
42
- export function parseFilterArg(raw: string | undefined | null): Filter {
43
- if (raw == null || raw.trim() === "") {
44
- throw new Error(
45
- 'filter: --filter is required (one of "all", "senders=<csv>", "date-range=<isoFrom>..<isoTo>")',
46
- );
47
- }
48
- const value = raw.trim();
49
- if (value === "all") return { kind: "all" };
50
- if (value.startsWith("senders=")) {
51
- const csv = value.slice("senders=".length);
52
- const senders = csv
53
- .split(",")
54
- .map((s) => s.trim())
55
- .filter((s) => s.length > 0);
56
- if (senders.length === 0) {
57
- throw new Error('filter: senders= requires at least one comma-separated name');
58
- }
59
- return { kind: "senders", senders };
60
- }
61
- if (value.startsWith("date-range=")) {
62
- const range = value.slice("date-range=".length);
63
- const parts = range.split("..");
64
- if (parts.length !== 2) {
65
- throw new Error(
66
- `filter: date-range must be "<isoFrom>..<isoTo>" — got "${range}"`,
67
- );
68
- }
69
- const [fromIso, toIso] = parts.map((p) => p.trim());
70
- if (!fromIso || !toIso) {
71
- throw new Error(
72
- `filter: date-range requires both endpoints — got "${range}"`,
73
- );
74
- }
75
- if (Number.isNaN(Date.parse(fromIso))) {
76
- throw new Error(`filter: date-range fromIso="${fromIso}" is not parseable as ISO 8601`);
77
- }
78
- if (Number.isNaN(Date.parse(toIso))) {
79
- throw new Error(`filter: date-range toIso="${toIso}" is not parseable as ISO 8601`);
80
- }
81
- if (Date.parse(fromIso) > Date.parse(toIso)) {
82
- throw new Error(`filter: date-range fromIso="${fromIso}" is later than toIso="${toIso}"`);
83
- }
84
- return { kind: "date-range", fromIso, toIso };
85
- }
86
- throw new Error(
87
- `filter: unrecognised form "${value}" — must be "all", "senders=<csv>", or "date-range=<isoFrom>..<isoTo>"`,
88
- );
89
- }
90
-
91
- /**
92
- * Apply a parsed Filter to ParsedLine[]. Returns a new array of kept lines
93
- * with the parser's original `sequenceIndex` preserved (the filter never
94
- * reorders). ingest.mjs re-stamps `sequenceIndex` to its post-filter position
95
- * during row construction for archive-write — re-stamping here too would be
96
- * redundant.
97
- */
98
- export function applyFilter(
99
- parsedLines: readonly ParsedLine[],
100
- filter: Filter,
101
- ): ParsedLine[] {
102
- const predicate = makePredicate(filter);
103
- const kept: ParsedLine[] = [];
104
- for (const line of parsedLines) {
105
- if (!predicate(line)) continue;
106
- kept.push(line);
107
- }
108
- return kept;
109
- }
110
-
111
- function makePredicate(filter: Filter): (line: ParsedLine) => boolean {
112
- if (filter.kind === "all") return () => true;
113
- if (filter.kind === "senders") {
114
- const allow = new Set(filter.senders);
115
- return (line) => allow.has(line.senderName);
116
- }
117
- // date-range: inclusive on both ends. Date-only endpoints widen to whole-
118
- // day semantics: `from=YYYY-MM-DD` → `T00:00:00Z`, `to=YYYY-MM-DD` →
119
- // `T23:59:59.999Z`. Full ISO 8601 endpoints with `T` are passed through.
120
- // Without this widening, `--filter date-range=2024-01-01..2024-06-30`
121
- // would silently drop every message later than 2024-06-30T00:00:00Z on the
122
- // last day — a UX trap that contradicts the operator's reading.
123
- const fromMs = parseRangeEndpoint(filter.fromIso, "start");
124
- const toMs = parseRangeEndpoint(filter.toIso, "end");
125
- return (line) => {
126
- const ms = Date.parse(line.dateSent);
127
- return ms >= fromMs && ms <= toMs;
128
- };
129
- }
130
-
131
- function parseRangeEndpoint(iso: string, edge: "start" | "end"): number {
132
- if (/T/.test(iso)) return Date.parse(iso);
133
- // Date-only — widen to whole-day inclusive on the requested edge.
134
- const suffix = edge === "start" ? "T00:00:00.000Z" : "T23:59:59.999Z";
135
- return Date.parse(iso + suffix);
136
- }
@@ -1,19 +0,0 @@
1
- export { parseExport } from "./parse-export.js";
2
- export type {
3
- ParseExportInput,
4
- ParseExportResult,
5
- ParseExportCounters,
6
- ParsedLine,
7
- } from "./parse-export.js";
8
- export { parseFilterArg, applyFilter } from "./filter.js";
9
- export type { Filter } from "./filter.js";
10
- export {
11
- normaliseSenderName,
12
- sha256Hex,
13
- deriveMessageId,
14
- observationContentHash,
15
- } from "./derive-keys.js";
16
- export type {
17
- DeriveMessageIdInput,
18
- ObservationContentFields,
19
- } from "./derive-keys.js";
@@ -1,471 +0,0 @@
1
- import { createHash } from "node:crypto";
2
- import { readFileSync } from "node:fs";
3
-
4
- // ---------------------------------------------------------------------------
5
- // parse-export — deterministic WhatsApp `_chat.txt` parser (Task 805).
6
- //
7
- // Pure function. No LLM in the per-line decision path. Replaces the prose
8
- // grammar that lived in references/export-parse.md when the database-operator
9
- // specialist's Sonnet was the line tokeniser. Every grammar branch here is
10
- // exercised by the vitest grid in `__tests__/parse-export.test.ts`; that
11
- // grid IS the contract — extending the grammar means a new test first.
12
- //
13
- // Doctrine alignment:
14
- // - feedback_deterministic_means_remove_llm.md — the LLM is no longer in
15
- // the per-line decision path.
16
- // - feedback_deterministic_is_a_shell_script.md — TypeScript is the right
17
- // deliverable shape here (UTF-8 decode + multi-line body assembly + sha256
18
- // would be cumbersome in shell); the LITERAL-MAPPING rule yields to
19
- // "Node module" because the per-line decision path is the deliverable, not
20
- // a one-shot orchestrator.
21
- // - feedback_loud_failures.md — encoding errors, empty files, and lines
22
- // that match a timestamp prefix but cannot be tokenised throw with named
23
- // reasons rather than degrading silently.
24
- // ---------------------------------------------------------------------------
25
-
26
- export interface ParseExportInput {
27
- /** Absolute path to the `_chat.txt` file. */
28
- filePath: string;
29
- /** Account scope used to compose `conversationId`. */
30
- accountId: string;
31
- /** IANA timezone the operator confirmed (e.g. `Europe/London`). */
32
- timezone: string;
33
- /**
34
- * Date ordering and year shape. Omit for auto-detect (Task 845): the parser
35
- * probes the first matched line as DD/MM and locks that ordering if range-valid;
36
- * otherwise locks MM/DD. Year shape is independent — `\d{2,4}` accepts 2-digit
37
- * (mapped `2000+yy`) and 4-digit (passed through) years per-line, including
38
- * mixed-year files.
39
- */
40
- dateFormat?: "DD/MM/YY" | "MM/DD/YY" | "DD/MM/YYYY" | "MM/DD/YYYY";
41
- }
42
-
43
- export interface ParsedLine {
44
- senderName: string;
45
- /** ISO 8601 with timezone offset for the supplied IANA zone. */
46
- dateSent: string;
47
- body: string;
48
- /** Position within emitted (post-skip) messages, 0-based. */
49
- sequenceIndex: number;
50
- }
51
-
52
- export interface ParseExportCounters {
53
- parsed: number;
54
- systemSkipped: number;
55
- mediaSkipped: number;
56
- parseErrors: number;
57
- }
58
-
59
- export interface ParseExportResult {
60
- conversationId: string;
61
- /** `whatsapp-export:<sha256-hex>` of the raw file bytes. */
62
- archiveSourceFile: string;
63
- parsedLines: ParsedLine[];
64
- counters: ParseExportCounters;
65
- }
66
-
67
- // Year capture is `\d{2}|\d{4}` so a single regex covers both 2-digit (legacy)
68
- // and 4-digit (modern WhatsApp default) prefixes — Task 845. Exactly 2 or 4
69
- // chars; 3-digit years (truncation typos, hand-edited files) are rejected as
70
- // not-a-prefix and surface via parse-grammar-miss, not silently coerced into
71
- // year-202-AD timestamps. Year semantics are resolved per-match in
72
- // `matchTimestampPrefix` from the captured length, not from the regex shape,
73
- // so mixed-year files parse natively.
74
- const TIMESTAMP_PREFIX_DDMMYY =
75
- /^\[(\d{2})\/(\d{2})\/(\d{4}|\d{2}),\s+(\d{1,2}):(\d{2})(?::(\d{2}))?\]\s*(.*)$/;
76
-
77
- const TIMESTAMP_PREFIX_MMDDYY = TIMESTAMP_PREFIX_DDMMYY; // shape is identical; ordering differs in interpretation only
78
-
79
- // System-message patterns that appear WITHOUT a `: ` sender/body separator.
80
- // WhatsApp emits group-event and security-code lines as `<Sender> <verb> ...`
81
- // (no colon). Lines that match the timestamp prefix but lack `: ` and do not
82
- // match one of these patterns are LOUD-FAIL parse errors — never silently
83
- // dropped.
84
- const LINE_LEVEL_SYSTEM_PATTERNS: RegExp[] = [
85
- /^Messages and calls are end-to-end encrypted/i,
86
- /'s security code changed\.?$/i,
87
- / created group ["“”]/,
88
- / added /,
89
- / removed /,
90
- / left$/,
91
- / changed the subject from /,
92
- / changed this group's icon/,
93
- / joined using this group's invite link/,
94
- /^You're now an admin$/i,
95
- /^You created group/i,
96
- ];
97
-
98
- // Body-level patterns evaluated after `Sender: body` split. These are real
99
- // messages syntactically but carry no graph value (deletions, media-only).
100
- const BODY_LEVEL_SYSTEM_PATTERNS: RegExp[] = [
101
- /^You deleted this message\.?$/,
102
- /^This message was deleted\.?$/,
103
- ];
104
-
105
- const MEDIA_ONLY_PATTERNS: RegExp[] = [
106
- /^<Media omitted>$/,
107
- /^IMG-\d+-\w+\.(jpg|jpeg|png|heic|gif)\s*\(file attached\)$/i,
108
- /^VID-\d+-\w+\.mp4\s*\(file attached\)$/i,
109
- /^PTT-\d+-\w+\.opus\s*\(file attached\)$/i,
110
- /^AUD-\d+-\w+\.opus\s*\(file attached\)$/i,
111
- /^STK-\d+-\w+\.webp\s*\(file attached\)$/i,
112
- /^.+\.(pdf|docx|doc|xlsx|xls|pptx|ppt|zip|csv|txt)\s*\(file attached\)$/i,
113
- /^‎.+attached:\s*.+$/, // alternative LRM-prefixed format on some platforms
114
- ];
115
-
116
- export function parseExport(input: ParseExportInput): ParseExportResult {
117
- const { filePath, accountId, timezone, dateFormat: explicitDateFormat } = input;
118
-
119
- if (!accountId || !accountId.trim()) {
120
- throw new Error("parse-export: accountId is required.");
121
- }
122
- if (!timezone || !timezone.trim()) {
123
- throw new Error("parse-export: timezone is required (e.g. 'Europe/London').");
124
- }
125
-
126
- const rawBytes = readFileSync(filePath);
127
- const sha256Hex = createHash("sha256").update(rawBytes).digest("hex");
128
- const archiveSourceFile = `whatsapp-export:${sha256Hex}`;
129
- const conversationId = `whatsapp-export:${sha256Hex}:${accountId}`;
130
-
131
- const text = decodeAndNormalise(rawBytes);
132
- if (text.length === 0) {
133
- throw new Error(
134
- `parse-export: file is empty — not a _chat.txt. file=${filePath}`,
135
- );
136
- }
137
-
138
- const lines = text.split("\n");
139
- // Auto-detect when `dateFormat` is omitted (Task 845): probe the first line
140
- // that contains a timestamp prefix as DD/MM; lock DD/MM if range-valid,
141
- // otherwise lock MM/DD. WhatsApp's locale is set per device, so a single
142
- // file never mixes DD/MM and MM/DD — locking once from line 1 is correct.
143
- // Concatenated multi-locale exports require an explicit `dateFormat`.
144
- const ordering = resolveOrdering(explicitDateFormat, lines);
145
- const counters: ParseExportCounters = {
146
- parsed: 0,
147
- systemSkipped: 0,
148
- mediaSkipped: 0,
149
- parseErrors: 0,
150
- };
151
-
152
- // Stage 1 — tokenise into raw messages (timestamp + remainder), accumulating
153
- // continuation lines into the previous remainder. Stage 2 then categorises
154
- // each tokenised message (system / media / real) so the counter increments
155
- // happen exactly once per tokenised message.
156
- interface RawMessage {
157
- rawLineIndex: number; // 1-based file line number for LOUD-FAIL diagnostics
158
- year: number;
159
- month: number;
160
- day: number;
161
- hour: number;
162
- minute: number;
163
- second: number;
164
- remainder: string; // everything after `]` on the prefix line, plus continuation lines
165
- }
166
- const raw: RawMessage[] = [];
167
-
168
- for (let i = 0; i < lines.length; i++) {
169
- const line = lines[i];
170
- if (line.length === 0 && i === lines.length - 1) continue; // trailing newline
171
- const prefixMatch = matchTimestampPrefix(line, ordering);
172
- if (prefixMatch) {
173
- raw.push({
174
- rawLineIndex: i + 1,
175
- ...prefixMatch.dateParts,
176
- remainder: prefixMatch.remainder,
177
- });
178
- } else {
179
- // Continuation of the previous message body. If there is no previous
180
- // message, this line is leading garbage — ignore it (matches the
181
- // export-parse.md edge case where a leading BOM or blank line precedes
182
- // the first timestamp).
183
- const last = raw[raw.length - 1];
184
- if (last) {
185
- last.remainder += "\n" + line;
186
- }
187
- }
188
- }
189
-
190
- // Stage 2 — categorise each raw message. Do NOT trim trailing whitespace
191
- // from the remainder before splitting — `Joel: ` (sender + colon + trailing
192
- // space + newline) collapses to `Joel:` after a `\s+$` trim and the `: `
193
- // separator disappears, turning an empty-body system skip into a LOUD-FAIL.
194
- const parsedLines: ParsedLine[] = [];
195
- for (const r of raw) {
196
- const remainder = r.remainder;
197
- const colonIdx = findFirstColonSeparator(remainder);
198
-
199
- if (colonIdx === -1) {
200
- // No `: ` separator. Must match a known system pattern or LOUD-FAIL.
201
- const trimmed = remainder.replace(/\s+$/, "");
202
- if (matchesAny(trimmed, LINE_LEVEL_SYSTEM_PATTERNS)) {
203
- counters.systemSkipped++;
204
- continue;
205
- }
206
- counters.parseErrors++;
207
- throw new Error(
208
- `parse-export: parse-error file=${filePath} line=${r.rawLineIndex} reason=no-sender-body-separator content="${trimmed.slice(0, 80)}"`,
209
- );
210
- }
211
-
212
- const senderName = remainder.slice(0, colonIdx).trim();
213
- const body = remainder.slice(colonIdx + 2).replace(/\s+$/, "");
214
-
215
- if (body.length === 0) {
216
- counters.systemSkipped++;
217
- continue;
218
- }
219
- if (matchesAny(body, BODY_LEVEL_SYSTEM_PATTERNS)) {
220
- counters.systemSkipped++;
221
- continue;
222
- }
223
- if (matchesAny(body, MEDIA_ONLY_PATTERNS)) {
224
- counters.mediaSkipped++;
225
- continue;
226
- }
227
-
228
- const dateSent = isoWithOffset(
229
- r.year,
230
- r.month,
231
- r.day,
232
- r.hour,
233
- r.minute,
234
- r.second,
235
- timezone,
236
- );
237
-
238
- parsedLines.push({
239
- senderName,
240
- dateSent,
241
- body,
242
- sequenceIndex: parsedLines.length,
243
- });
244
- counters.parsed++;
245
- }
246
-
247
- if (parsedLines.length === 0 && counters.systemSkipped === 0 && counters.mediaSkipped === 0) {
248
- // Task 845: include a sanitised first-line sample so the operator knows
249
- // WHY the file rejected — closes the diagnostic gap that left conversation
250
- // 47c6a590-0c2c-4006-9aca-6ee9ec93c95f guessing. Echoed to stderr too so
251
- // server.log has a grep-able adjunct to the existing parse-failed line.
252
- const sample = sampleFirstNonBlankLine(lines, 100);
253
- process.stderr.write(
254
- `[whatsapp-import] parse-grammar-miss first-line="${sample}"\n`,
255
- );
256
- throw new Error(
257
- `parse-export: zero parsed lines after walking ${filePath} — not a _chat.txt or all lines failed grammar. parse-grammar-miss first-line="${sample}"`,
258
- );
259
- }
260
-
261
- return {
262
- conversationId,
263
- archiveSourceFile,
264
- parsedLines,
265
- counters,
266
- };
267
- }
268
-
269
- // ---------------------------------------------------------------------------
270
- // Internals
271
- // ---------------------------------------------------------------------------
272
-
273
- function decodeAndNormalise(bytes: Buffer): string {
274
- // Strict UTF-8 decode. Node's TextDecoder with `fatal: true` throws on
275
- // invalid bytes — that's the LOUD-FAIL the brief mandates for encoding
276
- // errors. The default `Buffer.toString('utf8')` silently substitutes
277
- // U+FFFD, which would let bad bytes propagate into the graph.
278
- let text: string;
279
- try {
280
- text = new TextDecoder("utf-8", { fatal: true }).decode(bytes);
281
- } catch (err) {
282
- throw new Error(
283
- `parse-export: UTF-8 decode failed — ${err instanceof Error ? err.message : String(err)}. The file is not valid UTF-8; re-export from WhatsApp.`,
284
- );
285
- }
286
-
287
- // Strip leading BOM (U+FEFF).
288
- if (text.charCodeAt(0) === 0xfeff) {
289
- text = text.slice(1);
290
- }
291
-
292
- // Normalise mixed line endings to LF.
293
- text = text.replace(/\r\n/g, "\n").replace(/\r/g, "\n");
294
-
295
- // Task 887 — strip Unicode bidi marks (U+200E LRM, U+200F RLM) only at
296
- // line-start, where some WhatsApp builds prefix the timestamp header.
297
- // Without stripping, `^\[(\d{2})\/...` fails on the prefixed line, the
298
- // line is appended as a continuation of the previous body, and the next
299
- // clean header parses its senderName off the polluted body — leaking 23
300
- // bogus :Person nodes per import in the Adam Mackay archive. Body-internal
301
- // bidi marks (e.g. the LRM in `: ‎Forwarded`) are preserved — they carry
302
- // semantic information about message origin and are exercised by
303
- // parse-export.test.ts. Counts emitted to stderr for the operator's tail.
304
- const leadingBidiMatches = text.match(/(?:^|\n)[‎‏]+/g) || [];
305
- let lrmStripped = 0;
306
- let rlmStripped = 0;
307
- for (const m of leadingBidiMatches) {
308
- for (const ch of m) {
309
- if (ch === "‎") lrmStripped++;
310
- else if (ch === "‏") rlmStripped++;
311
- }
312
- }
313
- if (leadingBidiMatches.length > 0) {
314
- text = text.replace(/(^|\n)[‎‏]+/g, "$1");
315
- process.stderr.write(
316
- `[whatsapp-ingest] decoded normalised lrm-stripped=${lrmStripped} rlm-stripped=${rlmStripped}\n`,
317
- );
318
- }
319
-
320
- return text;
321
- }
322
-
323
- interface TimestampMatch {
324
- dateParts: {
325
- year: number;
326
- month: number;
327
- day: number;
328
- hour: number;
329
- minute: number;
330
- second: number;
331
- };
332
- remainder: string;
333
- }
334
-
335
- type Ordering = "DDMM" | "MMDD";
336
-
337
/**
 * Try to read a timestamp header from the start of `line`.
 *
 * The regex is selected by `ordering`. Its capture groups are consumed as:
 * 1 and 2 the two leading date fields, 3 the year (two or four digits),
 * 4 hour, 5 minute, 6 optional second, 7 the text after the header.
 *
 * @param line - One line of the export.
 * @param ordering - "MMDD" when the first date field is the month, "DDMM" when it is the day.
 * @returns The parsed components plus the remainder, or `null` when the line
 *   does not match or a component is outside its valid range.
 */
function matchTimestampPrefix(
  line: string,
  ordering: Ordering,
): TimestampMatch | null {
  const monthFirst = ordering === "MMDD";
  const captures = line.match(monthFirst ? TIMESTAMP_PREFIX_MMDDYY : TIMESTAMP_PREFIX_DDMMYY);
  if (!captures) return null;

  const [, firstText, secondText, yearText, hourText, minuteText, secondsText, rest] = captures;
  const firstField = parseInt(firstText, 10);
  const secondField = parseInt(secondText, 10);
  const hour = parseInt(hourText, 10);
  const minute = parseInt(minuteText, 10);
  // Seconds are optional in the header; absent means zero.
  const second = secondsText !== undefined ? parseInt(secondsText, 10) : 0;
  const remainder = rest ?? "";

  const month = monthFirst ? firstField : secondField;
  const day = monthFirst ? secondField : firstField;

  // Date.UTC silently rolls over invalid components (month 13 becomes
  // February of the next year), so out-of-range values are rejected here
  // rather than turned into a wrong timestamp downstream.
  const dateOutOfRange = month < 1 || month > 12 || day < 1 || day > 31;
  const timeOutOfRange = hour > 23 || minute > 59 || second > 59;
  if (dateOutOfRange || timeOutOfRange) return null;

  // A two-digit year is taken as 20YY; any other length is used verbatim,
  // so both shapes can coexist in a single file.
  const parsedYear = parseInt(yearText, 10);
  const year = yearText.length === 2 ? 2000 + parsedYear : parsedYear;

  return { dateParts: { year, month, day, hour, minute, second }, remainder };
}
370
-
371
/**
 * Decide whether timestamp headers are read day-first or month-first.
 *
 * An explicit `MM/DD/...` or `DD/MM/...` format wins outright. Otherwise the
 * lines are scanned in order and the first line that parses under either
 * ordering decides the result, with "DDMM" tried before "MMDD" on each line.
 *
 * @param explicit - Caller-supplied date format, if any.
 * @param lines - Lines of the export to probe when no format was supplied.
 * @returns The ordering to use; "DDMM" when no line matches at all.
 */
function resolveOrdering(
  explicit: ParseExportInput["dateFormat"],
  lines: readonly string[],
): Ordering {
  switch (explicit) {
    case "MM/DD/YY":
    case "MM/DD/YYYY":
      return "MMDD";
    case "DD/MM/YY":
    case "DD/MM/YYYY":
      return "DDMM";
  }

  // Auto-detect: day-first is probed before month-first on every line, and
  // the first successful probe locks the answer for the whole file.
  const probeOrder: readonly Ordering[] = ["DDMM", "MMDD"];
  for (const candidateLine of lines) {
    const detected = probeOrder.find(
      (candidate) => matchTimestampPrefix(candidateLine, candidate) !== null,
    );
    if (detected !== undefined) return detected;
  }

  // Nothing looked like a header; fall back to day-first.
  return "DDMM";
}
386
-
387
/**
 * Return a short, single-line sample of the first line that has visible
 * content, for use in diagnostics.
 *
 * Each scanned line has ASCII control characters (0x00-0x1F, 0x7F) removed
 * and is then trimmed. Lines that end up empty are skipped, so a line made
 * only of control characters no longer yields an empty sample while later
 * lines still have content.
 *
 * @param lines - Lines to inspect, in order.
 * @param maxScan - Maximum number of leading lines to look at.
 * @returns At most 80 UTF-16 code units of the first non-empty sanitised
 *   line, or "" when none of the scanned lines has content.
 */
function sampleFirstNonBlankLine(lines: readonly string[], maxScan: number): string {
  const controlChars = /[\x00-\x1F\x7F]/g;
  const maxSampleLength = 80;
  const scanLimit = Math.min(maxScan, lines.length);

  for (let i = 0; i < scanLimit; i++) {
    // Strip first, trim second: removing a leading or trailing control
    // character can expose whitespace that should not survive in the sample.
    const sanitised = lines[i].replace(controlChars, "").trim();
    if (sanitised.length === 0) continue;
    if (sanitised.length <= maxSampleLength) return sanitised;

    let sample = sanitised.slice(0, maxSampleLength);
    // Do not cut a surrogate pair in half: a dangling high surrogate would
    // put a malformed character into the diagnostic output.
    const lastUnit = sample.charCodeAt(sample.length - 1);
    if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
      sample = sample.slice(0, -1);
    }
    return sample;
  }
  return "";
}
400
-
401
/**
 * Locate the sender/body separator in the text that follows a timestamp.
 *
 * The separator is the first occurrence of a colon immediately followed by
 * a space. A colon that is not followed by a space is not treated as a
 * separator.
 *
 * @param remainder - Text after the timestamp header.
 * @returns Index of the first ": ", or -1 when there is none.
 */
function findFirstColonSeparator(remainder: string): number {
  return remainder.indexOf(": ");
}
408
-
409
/**
 * Report whether `text` matches at least one of `patterns`.
 *
 * `RegExp.prototype.test` is stateful for regexes carrying the `g` or `y`
 * flag: it starts at `lastIndex` and advances it on success. A shared global
 * pattern would therefore give a false negative on a later call. `lastIndex`
 * is reset before each test so every check starts at the beginning of the
 * text, and reset afterwards so the pattern is left in a clean state.
 *
 * @param text - String to test.
 * @param patterns - Candidate patterns, tried in order.
 * @returns `true` as soon as one pattern matches, otherwise `false`.
 */
function matchesAny(text: string, patterns: RegExp[]): boolean {
  return patterns.some((pattern) => {
    pattern.lastIndex = 0;
    const matched = pattern.test(text);
    pattern.lastIndex = 0;
    return matched;
  });
}
415
-
416
/**
 * Format wall-clock components as an ISO 8601 string carrying the UTC offset
 * that `timezone` reports for that moment.
 *
 * The offset is resolved in two passes: it is first read at the instant
 * obtained by treating the wall-clock components as UTC, then read again at
 * that instant shifted back by the first offset. The second reading is the
 * one written into the result.
 *
 * @param year - Calendar year.
 * @param month - Month, 1-based.
 * @param day - Day of month.
 * @param hour - Hour.
 * @param minute - Minute.
 * @param second - Second.
 * @param timezone - Zone identifier handed to `offsetMinutesAt`.
 * @returns A string of the form `YYYY-MM-DDTHH:MM:SS±HH:MM`.
 */
function isoWithOffset(
  year: number,
  month: number,
  day: number,
  hour: number,
  minute: number,
  second: number,
  timezone: string,
): string {
  const pad = (value: number, width: number): string => String(value).padStart(width, "0");

  // Pass 1: pretend the wall-clock time is UTC and ask for the offset there.
  const wallClockAsUtcMs = Date.UTC(year, month - 1, day, hour, minute, second);
  const firstPassOffset = offsetMinutesAt(new Date(wallClockAsUtcMs), timezone);
  // Pass 2: move to the instant implied by that offset and ask again.
  const correctedUtcMs = wallClockAsUtcMs - firstPassOffset * 60_000;
  const offsetMinutes = offsetMinutesAt(new Date(correctedUtcMs), timezone);

  const magnitude = Math.abs(offsetMinutes);
  const offsetSign = offsetMinutes >= 0 ? "+" : "-";
  const offsetText = `${offsetSign}${pad(Math.floor(magnitude / 60), 2)}:${pad(magnitude % 60, 2)}`;

  const datePart = [pad(year, 4), pad(month, 2), pad(day, 2)].join("-");
  const timePart = [pad(hour, 2), pad(minute, 2), pad(second, 2)].join(":");
  return `${datePart}T${timePart}${offsetText}`;
}
446
-
447
// Formatters keyed by timezone string. Constructing an Intl.DateTimeFormat is
// costly and depends only on the zone, so each one is built once and reused.
const offsetFormatterCache = new Map<string, Intl.DateTimeFormat>();

/**
 * Read the UTC offset, in minutes, that `timezone` holds at the instant `date`.
 *
 * The offset is taken from the `timeZoneName` part produced by an "en-US"
 * `Intl.DateTimeFormat` with `timeZoneName: "longOffset"`, whose value looks
 * like "GMT+01:00", "GMT-05:00", or plain "GMT".
 *
 * @param date - The instant to evaluate.
 * @param timezone - Zone identifier passed to `Intl.DateTimeFormat` as `timeZone`.
 * @returns Offset in minutes; positive for zones ahead of UTC, 0 for "GMT"/"UTC".
 * @throws Error when no `timeZoneName` part is produced or its value cannot be
 *   parsed. Errors raised by the `Intl.DateTimeFormat` constructor itself
 *   (for example for an unknown zone) propagate unchanged.
 */
function offsetMinutesAt(date: Date, timezone: string): number {
  let formatter = offsetFormatterCache.get(timezone);
  if (formatter === undefined) {
    // Only cached after successful construction, so a bad zone throws on
    // every call rather than leaving a broken entry behind.
    formatter = new Intl.DateTimeFormat("en-US", {
      timeZone: timezone,
      timeZoneName: "longOffset",
    });
    offsetFormatterCache.set(timezone, formatter);
  }

  const tzPart = formatter.formatToParts(date).find((p) => p.type === "timeZoneName");
  if (!tzPart) {
    throw new Error(`parse-export: unable to read offset for timezone "${timezone}".`);
  }

  const value = tzPart.value;
  if (value === "GMT" || value === "UTC") return 0;

  const m = value.match(/^(?:GMT|UTC)([+-])(\d{1,2}):?(\d{2})?$/);
  if (!m) {
    throw new Error(
      `parse-export: cannot parse timezone offset "${value}" for IANA zone "${timezone}".`,
    );
  }
  const sign = m[1] === "+" ? 1 : -1;
  const hh = parseInt(m[2], 10);
  const mm = m[3] ? parseInt(m[3], 10) : 0;
  return sign * (hh * 60 + mm);
}