prism-mcp-server 4.6.1 → 5.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -37,7 +37,9 @@ isKnowledgeSetRetentionArgs, // v3.1: TTL retention policy type guard
37
37
  // v4.0: Active Behavioral Memory type guards
38
38
  isSessionSaveExperienceArgs, isKnowledgeVoteArgs,
39
39
  // v4.2: Sync Rules type guard
40
- isKnowledgeSyncRulesArgs, } from "./sessionMemoryDefinitions.js";
40
+ isKnowledgeSyncRulesArgs,
41
+ // v5.1: Deep Storage Mode type guard
42
+ isDeepStoragePurgeArgs, } from "./sessionMemoryDefinitions.js";
41
43
  // v4.2: File system access for knowledge_sync_rules
42
44
  import { readFile, writeFile, mkdir } from "node:fs/promises";
43
45
  import { existsSync } from "node:fs";
@@ -105,9 +107,26 @@ export async function sessionSaveLedgerHandler(args) {
105
107
  if (entryId) {
106
108
  getLLMProvider().generateEmbedding(embeddingText)
107
109
  .then(async (embedding) => {
108
- await storage.patchLedger(entryId, {
110
+ // Build atomic patch — float32 + TurboQuant in ONE DB update
111
+ const patchData = {
109
112
  embedding: JSON.stringify(embedding),
110
- });
113
+ };
114
+ // TurboQuant: compress alongside float32 (non-fatal)
115
+ try {
116
+ const { getDefaultCompressor, serialize } = await import("../utils/turboquant.js");
117
+ const compressor = getDefaultCompressor();
118
+ const compressed = compressor.compress(embedding);
119
+ const buf = serialize(compressed);
120
+ patchData.embedding_compressed = buf.toString("base64");
121
+ patchData.embedding_format = `turbo${compressor.bits}`;
122
+ patchData.embedding_turbo_radius = compressed.radius;
123
+ debugLog(`[session_save_ledger] TurboQuant compressed: ${buf.length} bytes (${(3072 / buf.length).toFixed(1)}× ratio)`);
124
+ }
125
+ catch (turboErr) {
126
+ console.error(`[session_save_ledger] TurboQuant compression failed (non-fatal): ${turboErr.message}`);
127
+ }
128
+ // Single atomic DB update for all embedding data
129
+ await storage.patchLedger(entryId, patchData);
111
130
  debugLog(`[session_save_ledger] Embedding saved for entry ${entryId}`);
112
131
  })
113
132
  .catch((err) => {
@@ -810,9 +829,12 @@ export async function sessionSearchMemoryHandler(args) {
810
829
  const { query, project, limit = 5, similarity_threshold = 0.7,
811
830
  // Phase 1: enable_trace defaults to false for full backward compatibility.
812
831
  // When true, a MemoryTrace JSON block is appended as content[1].
813
- enable_trace = false, } = args;
832
+ enable_trace = false,
833
+ // v5.2: Context-Weighted Retrieval — biases search toward active work context
834
+ context_boost = false, } = args;
814
835
  debugLog(`[session_search_memory] Semantic search: query="${query}", ` +
815
- `project=${project || "all"}, limit=${limit}, threshold=${similarity_threshold}`);
836
+ `project=${project || "all"}, limit=${limit}, threshold=${similarity_threshold}` +
837
+ `${context_boost ? ", context_boost=ON" : ""}`);
816
838
  // Phase 1: Start total latency timer BEFORE any work (embedding + storage)
817
839
  const totalStart = performance.now();
818
840
  // Step 1: Generate embedding for the search query
@@ -831,8 +853,43 @@ export async function sessionSearchMemoryHandler(args) {
831
853
  // Phase 1: Start embedding latency timer — isolates Gemini API call time.
832
854
  // This is the most variable component: 50ms on a good day, 2000ms under load.
833
855
  const embeddingStart = performance.now();
856
+ // ── v5.2: Context-Weighted Retrieval ───────────────────────────
857
+ // When context_boost is enabled, prepend active project context to the
858
+ // search query before embedding generation. This naturally biases the
859
+ // embedding vector toward memories from the same project/branch/context.
860
+ // Elegant: no scoring heuristics needed — semantics do the work.
861
+ let effectiveQuery = query;
862
+ if (context_boost && project) {
863
+ try {
864
+ const storage = await getStorage();
865
+ const ctx = await storage.loadContext(project, "quick", PRISM_USER_ID);
866
+ const contextParts = [];
867
+ if (ctx && typeof ctx === "object") {
868
+ const ctxObj = ctx;
869
+ if (ctxObj.active_branch)
870
+ contextParts.push(`branch: ${ctxObj.active_branch}`);
871
+ if (ctxObj.key_context)
872
+ contextParts.push(`context: ${String(ctxObj.key_context).substring(0, 200)}`);
873
+ const keywords = ctxObj.keywords;
874
+ if (keywords?.length)
875
+ contextParts.push(`keywords: ${keywords.slice(0, 5).join(", ")}`);
876
+ }
877
+ if (contextParts.length > 0) {
878
+ effectiveQuery = `[${contextParts.join("; ")}] ${query}`;
879
+ debugLog(`[session_search_memory] Context boost applied: "${effectiveQuery.substring(0, 100)}..."`);
880
+ }
881
+ }
882
+ catch {
883
+ // Context load failed — proceed with unmodified query (graceful degradation)
884
+ debugLog("[session_search_memory] Context boost failed (non-fatal) — using original query");
885
+ }
886
+ }
887
+ else if (context_boost && !project) {
888
+ // User enabled context_boost but didn't specify a project — can't boost without context
889
+ debugLog("[session_search_memory] context_boost ignored — requires a project parameter to load context");
890
+ }
834
891
  try {
835
- queryEmbedding = await getLLMProvider().generateEmbedding(query);
892
+ queryEmbedding = await getLLMProvider().generateEmbedding(effectiveQuery);
836
893
  }
837
894
  catch (err) {
838
895
  return {
@@ -893,14 +950,40 @@ export async function sessionSearchMemoryHandler(args) {
893
950
  }
894
951
  return { content: contentBlocks, isError: false };
895
952
  }
896
- // Format results with similarity scores
953
+ // ── v5.2: Dynamic Importance Decay (Ebbinghaus Curve) ──────
954
+ // Compute effective_importance at retrieval time:
955
+ // effective = base_importance * 0.95^days_since_accessed
956
+ // This avoids background workers — decay is a pure function of time.
957
+ // Also fire-and-forget update last_accessed_at on all returned results.
958
+ const now = new Date();
959
+ const resultIds = results.map((r) => r.id).filter(Boolean);
960
+ // Fire-and-forget: update last_accessed_at for all returned results
961
+ if (resultIds.length > 0) {
962
+ const nowISO = now.toISOString();
963
+ for (const id of resultIds) {
964
+ storage.patchLedger(id, { last_accessed_at: nowISO }).catch(() => { });
965
+ }
966
+ }
967
+ // Format results with similarity scores + effective importance
897
968
  const formatted = results.map((r, i) => {
898
969
  const score = typeof r.similarity === "number"
899
970
  ? `${(r.similarity * 100).toFixed(1)}%`
900
971
  : "N/A";
972
+ // Dynamic importance decay: effective = base * 0.95^days
973
+ const baseImportance = r.importance ?? 0;
974
+ let effectiveImportance = baseImportance;
975
+ if (baseImportance > 0) {
976
+ const lastAccess = r.last_accessed_at || r.created_at || now.toISOString();
977
+ const daysSince = Math.max(0, (now.getTime() - new Date(lastAccess).getTime()) / 86400000);
978
+ effectiveImportance = Math.round(baseImportance * Math.pow(0.95, daysSince) * 100) / 100;
979
+ }
980
+ const importanceStr = baseImportance > 0
981
+ ? ` Importance: ${effectiveImportance}${effectiveImportance !== baseImportance ? ` (base: ${baseImportance}, decayed)` : ""}\n`
982
+ : "";
901
983
  return `[${i + 1}] ${score} similar — ${r.session_date || "unknown date"}\n` +
902
984
  ` Project: ${r.project}\n` +
903
985
  ` Summary: ${r.summary}\n` +
986
+ importanceStr +
904
987
  (r.decisions?.length ? ` Decisions: ${r.decisions.join("; ")}\n` : "") +
905
988
  (r.files_changed?.length ? ` Files: ${r.files_changed.join(", ")}\n` : "");
906
989
  }).join("\n");
@@ -1022,9 +1105,24 @@ export async function backfillEmbeddingsHandler(args) {
1022
1105
  continue;
1023
1106
  }
1024
1107
  const embedding = await getLLMProvider().generateEmbedding(textToEmbed);
1025
- await storage.patchLedger(e.id, {
1108
+ // Build atomic patch — float32 + TurboQuant in ONE DB update
1109
+ const patchData = {
1026
1110
  embedding: JSON.stringify(embedding),
1027
- });
1111
+ };
1112
+ // TurboQuant: compress alongside repair (non-fatal)
1113
+ try {
1114
+ const { getDefaultCompressor, serialize } = await import("../utils/turboquant.js");
1115
+ const compressor = getDefaultCompressor();
1116
+ const compressed = compressor.compress(embedding);
1117
+ const buf = serialize(compressed);
1118
+ patchData.embedding_compressed = buf.toString("base64");
1119
+ patchData.embedding_format = `turbo${compressor.bits}`;
1120
+ patchData.embedding_turbo_radius = compressed.radius;
1121
+ }
1122
+ catch (turboErr) {
1123
+ debugLog(`[backfill] TurboQuant compression failed for ${e.id} (non-fatal): ${turboErr.message}`);
1124
+ }
1125
+ await storage.patchLedger(e.id, patchData);
1028
1126
  repaired++;
1029
1127
  debugLog(`[backfill] ✅ Repaired ${e.id} (${e.project})`);
1030
1128
  }
@@ -2066,3 +2164,63 @@ export async function sessionExportMemoryHandler(args) {
2066
2164
  };
2067
2165
  }
2068
2166
  }
2167
+ // ─── v5.1: Deep Storage Mode (The Purge) ──────────────────────
2168
+ //
2169
+ // REVIEWER NOTE: This handler is the storage optimization payoff of v5.0.
2170
+ // After TurboQuant backfill, old entries have BOTH float32 (3KB) and
2171
+ // compressed (400B) representations. This tool NULLs out the float32
2172
+ // column for entries old enough that Tier-1 native search value is minimal.
2173
+ //
2174
+ // HANDLER PATTERN:
2175
+ // 1. Validate args via isDeepStoragePurgeArgs (imported from definitions)
2176
+ // 2. Apply defaults (older_than_days=30, dry_run=false)
2177
+ // 3. Delegate to storage.purgeHighPrecisionEmbeddings()
2178
+ // 4. Format response with human-readable byte counts
2179
+ //
2180
+ // NO SERVER REF NEEDED: Unlike sessionSaveHandoffHandler, this tool
2181
+ // doesn't modify any subscribed resource — no notification needed.
2182
/**
 * Handler for the `deep_storage_purge` tool.
 *
 * Validates the arguments, applies defaults (`older_than_days` = 30,
 * `dry_run` = false), delegates to `storage.purgeHighPrecisionEmbeddings()`
 * and renders the outcome as a single markdown text block.
 *
 * @param {object} args - Raw tool arguments; must satisfy `isDeepStoragePurgeArgs`.
 * @returns {Promise<{content: Array<{type: string, text: string}>, isError: boolean}>}
 *   A dry-run report (uses `result.eligible`) or a completion report
 *   (uses `result.purged`); both include `result.reclaimedBytes`.
 * @throws {Error} When `args` fails the type guard.
 */
export async function deepStoragePurgeHandler(args) {
  if (!isDeepStoragePurgeArgs(args)) {
    throw new Error("Invalid arguments for deep_storage_purge");
  }

  const thresholdDays = args.older_than_days ?? 30;
  const isDryRun = args.dry_run ?? false;
  const modeLabel = isDryRun ? "DRY RUN" : "EXECUTING";
  debugLog(`[deep_storage_purge] ${modeLabel}: olderThanDays=${thresholdDays}, project=${args.project || "all"}`);

  const store = await getStorage();
  const outcome = await store.purgeHighPrecisionEmbeddings({
    project: args.project,
    olderThanDays: thresholdDays,
    dryRun: isDryRun,
    userId: PRISM_USER_ID,
  });

  // Pieces shared by both reports are built once.
  const megabytes = (outcome.reclaimedBytes / (1024 * 1024)).toFixed(2);
  const reclaimed = `**${outcome.reclaimedBytes.toLocaleString()} bytes** (~${megabytes} MB)\n\n`;
  const scopeLine = args.project ? `Project: \`${args.project}\`\n` : `Scope: all projects\n`;
  const ageLine = `Age threshold: entries older than ${thresholdDays} days\n\n`;

  const segments = isDryRun
    ? [
      `🔍 **Deep Storage Purge — DRY RUN**\n\n`,
      `Eligible entries: **${outcome.eligible}**\n`,
      `Estimated space to reclaim: ${reclaimed}`,
      scopeLine,
      ageLine,
      `To execute the purge, call again with \`dry_run: false\`.`,
    ]
    : [
      `✅ **Deep Storage Purge Complete**\n\n`,
      `Purged entries: **${outcome.purged}**\n`,
      `Reclaimed space: ${reclaimed}`,
      scopeLine,
      ageLine,
      `💡 Tier-2 (TurboQuant) and Tier-3 (FTS5) search remain fully functional.\n`,
      `Tier-1 (native sqlite-vec) search will skip these entries — this is expected.`,
    ];

  return {
    content: [{ type: "text", text: segments.join("") }],
    isError: false,
  };
}
@@ -0,0 +1,131 @@
1
+ /**
2
+ * ═══════════════════════════════════════════════════════════════════
3
+ * Claude Code JSONL Adapter
4
+ * ═══════════════════════════════════════════════════════════════════
5
+ *
6
+ * REVIEWER NOTE — Claude Code Streaming Deduplication:
7
+ * Claude Code does NOT write one clean JSON line per turn. It writes
8
+ * to the JSONL file DURING streaming. This means you see multiple
9
+ * JSON lines for the exact same `message.id` as the response streams in.
10
+ *
11
+ * If we processed every `type: assistant` line blindly, we'd ingest
12
+ * highly fragmented or duplicate entries. The solution is to aggregate
13
+ * by `message.id` and only flush the LATEST version of each assistant
14
+ * message when a user message arrives (or at end-of-file).
15
+ *
16
+ * STREAMING STRATEGY:
17
+ * Uses Node's readline interface for true line-by-line processing.
18
+ * Memory usage is O(pending_assistant_chunks), not O(file_size).
19
+ * For a typical session, pending chunks rarely exceed 2-3 entries.
20
+ *
21
+ * SOURCE FORMAT (simplified):
22
+ * { type: "assistant", message: { id: "msg_xxx", content: [...] }, timestamp: "..." }
23
+ * { type: "user", content: "...", timestamp: "..." }
24
+ * ═══════════════════════════════════════════════════════════════════
25
+ */
26
+ import fs from 'node:fs';
27
+ import readline from 'node:readline';
28
+ import { normalizeContent } from './utils.js';
29
/**
 * Migration adapter for Claude Code `.jsonl` session logs.
 *
 * Assistant entries are buffered per message id: a later line carrying the
 * same id overwrites the earlier one, so only the last-seen content for each
 * id is emitted. The buffer is flushed when a user entry arrives and once
 * more at end of file.
 * NOTE(review): this assumes the last line written for a message id holds the
 * complete message — confirm against real Claude Code logs.
 */
export const claudeAdapter = {
  id: 'claude',
  /**
   * Filename-based detection.
   *
   * @param {string} filePath - Path of the candidate file.
   * @returns {boolean} `true` when the path ends with `.jsonl`.
   */
  canHandle(filePath) {
    return filePath.endsWith('.jsonl');
  },
  /**
   * Streams the file line by line and reports every turn through `onTurn`.
   *
   * Lines that cannot be decoded (invalid JSON, or a value whose fields
   * cannot be read) are skipped. Errors thrown by `onTurn` are NOT swallowed:
   * they reject the returned promise, matching the end-of-file flush.
   *
   * @param {string} filePath - Path of the `.jsonl` file to read.
   * @param {(turn: object) => (void|Promise<void>)} onTurn - Receives each
   *   normalized turn in file order; awaited before the next line is handled.
   * @returns {Promise<void>}
   */
  async parse(filePath, onTurn) {
    const fileStream = fs.createReadStream(filePath);
    const rl = readline.createInterface({
      input: fileStream,
      crlfDelay: Infinity, // Treat \r\n as a single line break
    });
    // Buffered assistant turns in first-seen order (Map keeps insertion order;
    // re-setting an existing key keeps its original position).
    const pendingAssistantTurns = new Map();
    // Emits every buffered assistant turn, then empties the buffer.
    const flushPending = async () => {
      for (const pending of pendingAssistantTurns.values()) {
        await onTurn({
          role: 'assistant',
          content: pending.content,
          timestamp: pending.timestamp,
          sessionId: 'claude-migration',
          project: 'default',
          todos: [],
          files_changed: [],
          messageId: pending.messageId,
          tools: pending.tools,
        });
      }
      pendingAssistantTurns.clear();
    };
    try {
      for await (const line of rl) {
        if (!line.trim()) {
          continue; // Skip blank lines
        }
        let role;
        let content;
        let timestamp;
        let messageId;
        // Only decoding is guarded: a malformed line must not abort the
        // import, but a failing `onTurn` must not be mistaken for one.
        try {
          const entry = JSON.parse(line);
          // Role may be given at the top level (`type`) or nested (`message.role`).
          role = entry.type === 'assistant' || entry.message?.role === 'assistant' ? 'assistant' : 'user';
          // Content may sit at `entry.content` or `entry.message.content`.
          content = normalizeContent(entry.content || entry.message?.content || "");
          timestamp = entry.timestamp || new Date().toISOString();
          // Any of these fields can serve as the deduplication key.
          messageId = entry.id || entry.message?.id || entry.requestId;
        }
        catch {
          continue;
        }
        if (role === 'assistant') {
          // Entries without an id cannot be deduplicated; a unique Symbol key
          // buffers them in order instead of silently dropping them.
          const bufferKey = messageId || Symbol('assistant-without-id');
          pendingAssistantTurns.set(bufferKey, { content, tools: [], timestamp, messageId });
          continue;
        }
        // A user entry ends the preceding assistant turn(s).
        await flushPending();
        await onTurn({
          role: 'user',
          content,
          timestamp,
          sessionId: 'claude-migration',
          project: 'default',
          todos: [],
          files_changed: [],
          messageId,
        });
      }
      // The file may end on assistant entries with no trailing user entry.
      await flushPending();
    }
    finally {
      // Release the reader and the file handle on both success and failure.
      rl.close();
      fileStream.destroy();
    }
  },
};
@@ -0,0 +1,87 @@
1
+ /**
2
+ * ═══════════════════════════════════════════════════════════════════
3
+ * Gemini History JSON Adapter
4
+ * ═══════════════════════════════════════════════════════════════════
5
+ *
6
+ * REVIEWER NOTE — Streaming Large JSON Arrays:
7
+ * Gemini exports history as a single JSON array (not JSONL).
8
+ * A naive `JSON.parse(fs.readFileSync(...))` would load the entire
9
+ * file into memory — OOM for 100MB+ exports.
10
+ *
11
+ * We use `stream-json/StreamArray` to parse array elements one at a
12
+ * time in streaming fashion. Memory usage is O(1) per entry.
13
+ *
14
+ * ROLE MAPPING:
15
+ * Gemini uses "model" for assistant responses (not "assistant").
16
+ * We normalize this to "assistant" for Prism's unified schema.
17
+ *
18
+ * TIMESTAMP FALLBACK:
19
+ * Gemini SDK history arrays often lack per-turn timestamps.
20
+ * We fall back to `createTime` (if present) or current time.
21
+ * The orchestrator may override timestamps via ensureChronology.
22
+ *
23
+ * SOURCE FORMAT (simplified):
24
+ * [
25
+ * { role: "user", parts: [{ text: "..." }] },
26
+ * { role: "model", parts: [{ text: "..." }] }
27
+ * ]
28
+ * ═══════════════════════════════════════════════════════════════════
29
+ */
30
+ import fs from 'node:fs';
31
+ import { chain } from 'stream-chain';
32
+ import StreamArray from 'stream-json/streamers/stream-array.js';
33
+ import { normalizeContent } from './utils.js';
34
/**
 * Migration adapter for Gemini history exports (a single JSON array of
 * `{ role, parts }` entries), parsed element by element through
 * `stream-json`'s `StreamArray`.
 */
export const geminiAdapter = {
  id: 'gemini',
  /**
   * Filename-based detection. `.json` files are claimed unless the path
   * mentions "openai" or "chatgpt" (case-insensitive), which are left to the
   * OpenAI adapter.
   *
   * @param {string} filePath - Path of the candidate file.
   * @returns {boolean}
   */
  canHandle(filePath) {
    const lower = filePath.toLowerCase();
    return lower.endsWith('.json') && !lower.includes('openai') && !lower.includes('chatgpt');
  },
  /**
   * Streams the JSON array and reports each element through `onTurn`.
   *
   * Text parts become the turn content. `functionCall` parts, which carry no
   * text, are rendered as `[Tool Use: <name>]` markers appended to the content
   * and listed in `tools`, mirroring the OpenAI adapter. Turns without
   * function calls are emitted exactly as before, with no `tools` key.
   *
   * @param {string} filePath - Path of the JSON export.
   * @param {(turn: object) => (void|Promise<void>)} onTurn - Receives each
   *   normalized turn in array order; awaited before the next element.
   * @returns {Promise<void>}
   */
  async parse(filePath, onTurn) {
    // Each item yielded by the pipeline is { key, value }; `value` is the
    // parsed array element.
    const pipeline = chain([
      fs.createReadStream(filePath),
      StreamArray.withParser(),
    ]);
    for await (const { value: entry } of pipeline) {
      // Skip array elements that are not objects (e.g. a stray `null`).
      if (entry === null || typeof entry !== 'object') {
        continue;
      }
      // Gemini labels model output 'model'; 'assistant' is accepted as well.
      const role = entry.role === 'model' || entry.role === 'assistant' ? 'assistant' : 'user';
      // Text lives in `parts`; `entry.content` is the fallback.
      let content = normalizeContent(entry.parts || entry.content || "");
      // Collect the names of function calls so they are not lost.
      const toolNames = (Array.isArray(entry.parts) ? entry.parts : [])
        .map((part) => part?.functionCall?.name)
        .filter((name) => typeof name === 'string' && name.length > 0);
      if (toolNames.length > 0) {
        const markers = toolNames.map((name) => `[Tool Use: ${name}]`).join("\n");
        content = `${content}\n${markers}`.trim();
      }
      // Priority: entry.timestamp > entry.createTime > current time.
      // Entries lacking both all receive (nearly) the same fallback time.
      const timestamp = entry.timestamp || entry.createTime || new Date().toISOString();
      const turn = {
        role,
        content,
        timestamp,
        sessionId: 'gemini-migration',
        project: 'default',
        todos: [],
        files_changed: [],
      };
      if (toolNames.length > 0) {
        turn.tools = toolNames;
      }
      await onTurn(turn);
    }
  },
};
@@ -0,0 +1,88 @@
1
+ /**
2
+ * ═══════════════════════════════════════════════════════════════════
3
+ * OpenAI / ChatGPT History JSON Adapter
4
+ * ═══════════════════════════════════════════════════════════════════
5
+ *
6
+ * REVIEWER NOTE — Tool Call Normalization:
7
+ * OpenAI's chat completion format includes structured `tool_calls`
8
+ * arrays on assistant messages. These contain function names, arguments,
9
+ * and call IDs. Since Prism's ledger stores content as plain text,
10
+ * we inline tool calls as readable markers: `[Tool Use: function_name]`.
11
+ *
12
+ * The original tool names are also preserved in `NormalizedTurn.tools[]`
13
+ * for keyword indexing in the Mind Palace.
14
+ *
15
+ * TIMESTAMP HANDLING:
16
+ * OpenAI uses Unix epoch seconds in `created_at` (not milliseconds).
17
+ * We convert: `new Date(created_at * 1000).toISOString()`.
18
+ * Standard ISO timestamps in `entry.timestamp` take priority.
19
+ *
20
+ * SOURCE FORMAT (simplified):
21
+ * [
22
+ * { role: "user", content: "..." },
23
+ * { role: "assistant", content: "...", tool_calls: [{ function: { name: "..." } }] }
24
+ * ]
25
+ * ═══════════════════════════════════════════════════════════════════
26
+ */
27
+ import fs from 'node:fs';
28
+ import { chain } from 'stream-chain';
29
+ import StreamArray from 'stream-json/streamers/stream-array.js';
30
+ import { normalizeContent } from './utils.js';
31
/**
 * Migration adapter for OpenAI / ChatGPT history exports (a JSON array of
 * chat messages), parsed element by element through `stream-json`'s
 * `StreamArray`.
 */
export const openaiAdapter = {
  id: 'openai',
  /**
   * Filename-based detection: claims any path containing "openai" or
   * "chatgpt" (case-insensitive), regardless of extension.
   *
   * @param {string} filePath - Path of the candidate file.
   * @returns {boolean}
   */
  canHandle(filePath) {
    const lower = filePath.toLowerCase();
    return lower.includes('openai') || lower.includes('chatgpt');
  },
  /**
   * Streams the JSON array and reports user/assistant messages via `onTurn`.
   *
   * - `system` and `tool` messages are skipped; they are not conversation
   *   turns and must not be stored as user input.
   * - `tool_calls` are appended to the content as `[Tool Use: <name>]`
   *   markers (falling back to the call id), and the function names that are
   *   present are passed on in `tools`.
   * - Timestamp priority: `entry.timestamp`, then `entry.created_at`
   *   (Unix epoch SECONDS, hence the ×1000), then the current time.
   *
   * @param {string} filePath - Path of the JSON export.
   * @param {(turn: object) => (void|Promise<void>)} onTurn - Receives each
   *   normalized turn in array order; awaited before the next element.
   * @returns {Promise<void>}
   */
  async parse(filePath, onTurn) {
    const pipeline = chain([
      fs.createReadStream(filePath),
      StreamArray.withParser(),
    ]);
    // Largest absolute millisecond value a Date can represent.
    const MAX_DATE_MS = 8.64e15;
    for await (const { value: entry } of pipeline) {
      // Skip array elements that are not objects (e.g. a stray `null`).
      if (entry === null || typeof entry !== 'object') {
        continue;
      }
      if (entry.role === 'system' || entry.role === 'tool') {
        continue;
      }
      const role = entry.role === 'assistant' ? 'assistant' : 'user';
      let content = normalizeContent(entry.content || "");
      // Only an actual array is treated as tool calls; anything else is ignored.
      const toolCalls = Array.isArray(entry.tool_calls) ? entry.tool_calls : undefined;
      if (toolCalls) {
        const markers = toolCalls
          .map((tc) => `[Tool Use: ${tc?.function?.name || tc?.id}]`)
          .join("\n");
        content = `${content}\n${markers}`.trim();
      }
      // An unusable `created_at` falls through to "now" instead of making
      // `toISOString()` throw on an invalid Date.
      const createdAtMs = Number(entry.created_at) * 1000;
      const hasUsableCreatedAt = Boolean(entry.created_at)
        && Number.isFinite(createdAtMs)
        && Math.abs(createdAtMs) <= MAX_DATE_MS;
      const timestamp = entry.timestamp
        || (hasUsableCreatedAt ? new Date(createdAtMs).toISOString() : new Date().toISOString());
      await onTurn({
        role,
        content,
        timestamp,
        sessionId: 'openai-migration',
        project: 'default',
        todos: [],
        files_changed: [],
        // Function names for keyword indexing; calls without a name are
        // dropped. Stays `undefined` when the message has no tool calls.
        tools: toolCalls
          ?.map((tc) => tc?.function?.name)
          .filter((name) => typeof name === 'string' && name.length > 0),
      });
    }
  },
};
@@ -0,0 +1,18 @@
1
+ /**
2
+ * ═══════════════════════════════════════════════════════════════════
3
+ * Migration Types — Strategy Pattern Interfaces
4
+ * ═══════════════════════════════════════════════════════════════════
5
+ *
6
+ * REVIEWER NOTE:
7
+ * This file defines the core contracts for the Universal Migration
8
+ * Utility. Each LLM format (Claude, Gemini, OpenAI) implements the
9
+ * MigrationAdapter interface. All turns are normalized into the
10
+ * NormalizedTurn schema before being mapped to Prism's LedgerEntry.
11
+ *
12
+ * DESIGN DECISION:
13
+ * NormalizedTurn is intentionally NOT a subset of LedgerEntry.
14
+ * The orchestrator (universalImporter.ts) performs the final mapping.
15
+ * This keeps adapters decoupled from storage internals.
16
+ * ═══════════════════════════════════════════════════════════════════
17
+ */
18
+ export {};
@@ -0,0 +1,99 @@
1
+ /**
2
+ * ═══════════════════════════════════════════════════════════════════
3
+ * Migration Utilities — Shared Normalization Helpers
4
+ * ═══════════════════════════════════════════════════════════════════
5
+ *
6
+ * REVIEWER NOTE:
7
+ * These utilities handle the messiest part of cross-format migration:
8
+ * normalizing wildly different content representations into plain strings.
9
+ *
10
+ * Claude uses: `content: [{ type: 'text', text: '...' }]` (array of blocks)
11
+ * Gemini uses: `parts: [{ text: '...' }]` (array of parts)
12
+ * OpenAI uses: `content: '...'` (plain string, usually)
13
+ *
14
+ * The `normalizeContent` function handles all three shapes.
15
+ * ═══════════════════════════════════════════════════════════════════
16
+ */
17
+ /**
18
+ * Normalizes content from various LLM formats into a plain string.
19
+ *
20
+ * Handles four cases:
21
+ * 1. Plain string → returned as-is
22
+ * 2. Array of objects with `.text` → concatenated
23
+ * 3. Array of strings → joined
24
+ * 4. Anything else → empty string (safe fallback)
25
+ *
26
+ * REVIEWER NOTE:
27
+ * Gemini's `functionCall` parts (which have `.functionCall` but no `.text`)
28
+ * are intentionally dropped here. They are handled separately by the
29
+ * Gemini adapter via tool-call extraction. Returning "" for unknown part
30
+ * types is the correct behavior — it avoids injecting [object Object] strings.
31
+ */
32
+ /**
33
+ * Content-sniffs the first ~4KB of a file to detect its LLM format.
34
+ *
35
+ * REVIEWER NOTE:
36
+ * This is a best-effort heuristic that supplements filename-based detection.
37
+ * It reads only the first 4KB to stay fast and memory-safe on large files.
38
+ * Returns the adapter ID ('claude', 'gemini', 'openai') or null if ambiguous.
39
+ *
40
+ * Detection markers:
41
+ * Claude → JSONL format (newline-delimited), or `"message":{"id":` / `"type":"assistant"`
42
+ * Gemini → `"parts":` array or `"role":"model"`
43
+ * OpenAI → `"tool_calls":` or `"created_at":` (Unix epoch) or `"role":"system"`
44
+ */
45
+ export function sniffFormat(filePath) {
46
+ const fs = require('node:fs');
47
+ const fd = fs.openSync(filePath, 'r');
48
+ const buf = Buffer.alloc(4096);
49
+ const bytesRead = fs.readSync(fd, buf, 0, 4096, 0);
50
+ fs.closeSync(fd);
51
+ if (bytesRead === 0)
52
+ return null;
53
+ const head = buf.toString('utf8', 0, bytesRead);
54
+ // ── JSONL detection (Claude) ────────────────────────────────────
55
+ // If the file starts with `{` and contains newlines followed by `{`,
56
+ // it's JSONL (not a JSON array). Claude Code is the only major LLM
57
+ // that uses JSONL for exports.
58
+ const trimmed = head.trimStart();
59
+ if (trimmed.startsWith('{') && !trimmed.startsWith('[')) {
60
+ return 'claude';
61
+ }
62
+ // ── JSON array content inspection ──────────────────────────────
63
+ // For JSON arrays, inspect the content for format-specific markers.
64
+ // Gemini markers: "parts" array or "role":"model"
65
+ if (head.includes('"parts"') || head.includes('"role":"model"') || head.includes('"role": "model"')) {
66
+ return 'gemini';
67
+ }
68
+ // OpenAI markers: "tool_calls", "created_at" (Unix epoch), or "role":"system"
69
+ if (head.includes('"tool_calls"') || head.includes('"created_at"') ||
70
+ head.includes('"role":"system"') || head.includes('"role": "system"')) {
71
+ return 'openai';
72
+ }
73
+ // Claude markers in JSON form (less common but possible)
74
+ if (head.includes('"message":{') || head.includes('"message": {') ||
75
+ head.includes('"type":"assistant"') || head.includes('"type": "assistant"')) {
76
+ return 'claude';
77
+ }
78
+ return null;
79
+ }
80
/**
 * Normalizes message content from various LLM export formats into a plain string.
 *
 *   - string                     → returned unchanged
 *   - array                      → each element contributes its text, joined
 *                                  with no separator:
 *       - string element              → the string itself
 *       - object with a string `.text` → that text (Claude
 *         `{ type: 'text', text }` blocks and Gemini `{ text }` parts)
 *       - anything else (null, parts without `.text`, non-string `.text`)
 *                                      → "" so no "[object Object]" leaks in
 *   - any other value            → ""
 *
 * @param {unknown} content - Raw `content` / `parts` value from an export.
 * @returns {string} The concatenated plain text.
 */
export function normalizeContent(content) {
  if (typeof content === 'string') {
    return content;
  }
  if (!Array.isArray(content)) {
    return "";
  }
  return content
    .map((part) => {
      if (typeof part === 'string') {
        return part;
      }
      // `?.` guards null/undefined elements, which would otherwise throw.
      return typeof part?.text === 'string' ? part.text : "";
    })
    .join("");
}