@getlore/cli 0.5.2 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +12 -4
- package/README.md +66 -5
- package/dist/cli/commands/sync.js +4 -1
- package/dist/core/git.js +36 -4
- package/dist/core/vector-store.d.ts +13 -0
- package/dist/core/vector-store.js +28 -3
- package/dist/mcp/handlers/research-agent.d.ts +2 -1
- package/dist/mcp/handlers/research-agent.js +37 -7
- package/dist/mcp/handlers/research.d.ts +19 -0
- package/dist/mcp/handlers/research.js +144 -3
- package/dist/mcp/handlers/sync.d.ts +2 -0
- package/dist/mcp/handlers/sync.js +70 -3
- package/dist/mcp/server.js +28 -5
- package/dist/mcp/tools.js +16 -2
- package/dist/sync/process.d.ts +8 -0
- package/dist/sync/process.js +77 -17
- package/dist/sync/processors.d.ts +7 -0
- package/dist/sync/processors.js +95 -1
- package/dist/tui/browse-handlers.js +71 -32
- package/dist/tui/browse-render.js +28 -12
- package/dist/tui/browse-types.d.ts +1 -0
- package/package.json +3 -2
package/dist/mcp/handlers/sync.js
CHANGED
@@ -11,10 +11,10 @@
  * - Generate embeddings
  * - Store in Supabase + local data dir
  */
-import { readdir, readFile } from 'fs/promises';
+import { readdir, readFile, mkdir, writeFile } from 'fs/promises';
 import { existsSync } from 'fs';
 import path from 'path';
-import { getAllSources, addSource, resetDatabaseConnection, } from '../../core/vector-store.js';
+import { getAllSources, addSource, getSourcesWithPaths, resetDatabaseConnection, } from '../../core/vector-store.js';
 import { generateEmbedding, createSearchableText } from '../../core/embedder.js';
 import { gitPull, gitCommitAndPush } from '../../core/git.js';
 import { loadSyncConfig, getEnabledSources } from '../../sync/config.js';
@@ -106,6 +106,65 @@ async function legacyDiskSync(dbPath, dataDir) {
     return result;
 }
 // ============================================================================
+// Local Content Reconciliation
+// ============================================================================
+/**
+ * Ensures every source in Supabase with a source_path has a local
+ * ~/.lore/sources/{id}/content.md file. This handles:
+ * - Sources indexed before storeSourceToDisk was implemented
+ * - Sources from other machines (in shared Supabase but no local content)
+ * - Any edge case where Supabase write succeeded but disk write failed
+ *
+ * Cost: One Supabase query + local filesystem checks. No LLM calls.
+ */
+async function reconcileLocalContent(dataDir) {
+    const sourcesDir = path.join(dataDir, 'sources');
+    const textExts = ['.md', '.txt', '.json', '.jsonl', '.csv', '.xml', '.yaml', '.yml', '.html', '.log'];
+    // Get all sources that have a source_path in Supabase
+    const sourcesWithPaths = await getSourcesWithPaths('');
+    if (sourcesWithPaths.length === 0)
+        return 0;
+    let reconciled = 0;
+    for (const source of sourcesWithPaths) {
+        const sourceDir = path.join(sourcesDir, source.id);
+        const contentPath = path.join(sourceDir, 'content.md');
+        // Skip if content.md already exists
+        if (existsSync(contentPath))
+            continue;
+        // Try to create content.md from the original source_path
+        let content = null;
+        if (existsSync(source.source_path)) {
+            const ext = path.extname(source.source_path).toLowerCase();
+            if (textExts.includes(ext)) {
+                try {
+                    content = await readFile(source.source_path, 'utf-8');
+                }
+                catch {
+                    // File can't be read — fall through to summary
+                }
+            }
+        }
+        // If we couldn't read the original file, use the summary from Supabase
+        if (!content) {
+            content = [
+                `# ${source.title}`,
+                '',
+                source.summary,
+            ].join('\n');
+        }
+        // Create the source directory and content.md
+        try {
+            await mkdir(sourceDir, { recursive: true });
+            await writeFile(contentPath, content);
+            reconciled++;
+        }
+        catch {
+            // Skip on write failure — will retry on next sync
+        }
+    }
+    return reconciled;
+}
+// ============================================================================
 // Universal Sync (new system)
 // ============================================================================
 async function universalSync(dataDir, dryRun, hookContext) {
@@ -172,9 +231,12 @@ export async function handleSync(dbPath, dataDir, args, options = {}) {
         sources_found: 0,
         sources_indexed: 0,
         already_indexed: 0,
+        reconciled: 0,
     };
+    const { onProgress } = options;
     // 1. Git pull
     if (doPull) {
+        await onProgress?.(5, undefined, 'Pulling from git...');
        const pullResult = await gitPull(dataDir);
         result.git_pulled = pullResult.success && (pullResult.message?.includes('Pulled') || false);
         if (pullResult.error) {
@@ -188,20 +250,25 @@ export async function handleSync(dbPath, dataDir, args, options = {}) {
         const hasUniversalSources = getEnabledSources(config).length > 0;
         if (hasUniversalSources && !useLegacy) {
             // Use new universal sync
+            await onProgress?.(20, undefined, 'Discovering new files...');
             const { discovery, processing } = await universalSync(dataDir, dryRun, options.hookContext);
             result.discovery = discovery;
             result.processing = processing;
         }
         // Always run legacy disk sync for backward compatibility
         // (picks up sources added via old `lore ingest` command)
+        await onProgress?.(60, undefined, 'Running legacy sync...');
         const legacyResult = await legacyDiskSync(dbPath, dataDir);
         result.sources_found = legacyResult.sources_found;
         result.sources_indexed = legacyResult.sources_indexed;
         result.already_indexed = legacyResult.already_indexed;
+        // Reconcile: ensure every Supabase source has local content.md
+        await onProgress?.(80, undefined, 'Reconciling local content...');
+        result.reconciled = await reconcileLocalContent(dataDir);
     }
     // 3. Git push
     if (doPush && !dryRun) {
-        const totalNew = (result.processing?.processed || 0) + result.sources_indexed;
+        const totalNew = (result.processing?.processed || 0) + result.sources_indexed + result.reconciled;
         if (totalNew > 0) {
             const pushResult = await gitCommitAndPush(dataDir, `Sync: Added ${totalNew} source(s)`);
             result.git_pushed = pushResult.success && (pushResult.message?.includes('pushed') || false);
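Taken together, the new handleSync flow reports coarse progress at fixed checkpoints (5, 20, 60, 80) through an optional callback and surfaces the reconciliation count in its result. A minimal caller sketch, assuming only the option and result fields visible in this diff; the import path and config wiring are placeholders, not the package's documented API:

    // Sketch: wiring the optional onProgress callback into a sync call.
    // The env-var wiring below is a placeholder for the caller's own config.
    import { handleSync } from './dist/mcp/handlers/sync.js';

    const DB_PATH = process.env.LORE_DB_PATH;        // placeholder
    const LORE_DATA_DIR = process.env.LORE_DATA_DIR; // placeholder

    const result = await handleSync(DB_PATH, LORE_DATA_DIR, {}, {
        // Matches the (progress, total, message) shape the handler invokes above.
        onProgress: async (progress, total, message) => {
            console.error(`[sync ${progress}%] ${message ?? ''}`);
        },
    });
    console.error(`indexed=${result.sources_indexed} reconciled=${result.reconciled}`);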
package/dist/mcp/server.js
CHANGED
@@ -21,7 +21,7 @@ import { handleGetSource } from './handlers/get-source.js';
 import { handleListSources } from './handlers/list-sources.js';
 import { handleRetain } from './handlers/retain.js';
 import { handleIngest } from './handlers/ingest.js';
-import {
+import { startResearchJob, getResearchJobStatus } from './handlers/research.js';
 import { handleListProjects } from './handlers/list-projects.js';
 import { handleSync } from './handlers/sync.js';
 import { handleArchiveProject } from './handlers/archive-project.js';
@@ -136,7 +136,7 @@ async function main() {
     }
     const server = new Server({
         name: 'lore',
-        version: '0.
+        version: '0.7.0',
     }, {
         capabilities: {
             tools: {},
@@ -184,8 +184,25 @@ async function main() {
         return { tools: toolDefinitions };
     });
     // Handle tool calls (core tools only)
-    server.setRequestHandler(CallToolRequestSchema, async (request) => {
+    server.setRequestHandler(CallToolRequestSchema, async (request, extra) => {
         const { name, arguments: args } = request.params;
+        // Build a progress callback for long-running tools.
+        // If the client sent a progressToken, we send notifications/progress back;
+        // otherwise, onProgress is a no-op.
+        const progressToken = request.params._meta?.progressToken;
+        const onProgress = progressToken
+            ? async (progress, total, message) => {
+                try {
+                    await extra.sendNotification({
+                        method: 'notifications/progress',
+                        params: { progressToken, progress, ...(total != null ? { total } : {}), ...(message ? { message } : {}) },
+                    });
+                }
+                catch {
+                    // Progress notifications are best-effort
+                }
+            }
+            : undefined;
         try {
             let result;
             switch (name) {
@@ -215,16 +232,22 @@ async function main() {
                         hookContext: { mode: 'mcp' },
                     });
                     break;
-                // Agentic research tool
+                // Agentic research tool — runs async, returns job_id immediately
                 case 'research':
-                    result =
+                    result = startResearchJob(DB_PATH, LORE_DATA_DIR, args, {
                         hookContext: { mode: 'mcp' },
+                        onProgress,
                     });
                     break;
+                // Poll for research results (long-polls up to 20s)
+                case 'research_status':
+                    result = await getResearchJobStatus(args?.job_id);
+                    break;
                 // Sync tool
                 case 'sync':
                     result = await handleSync(DB_PATH, LORE_DATA_DIR, args, {
                         hookContext: { mode: 'mcp' },
+                        onProgress,
                     });
                     break;
                 // Project management
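The progressToken plumbing above follows the standard MCP progress convention: a client opts in by attaching a token to its tools/call request, and the server streams notifications/progress messages that echo that token. A sketch of the two payloads as they would appear on the wire, with illustrative values:

    // Client to server: opt in to progress with a token.
    { "method": "tools/call",
      "params": { "name": "sync", "arguments": {},
                  "_meta": { "progressToken": "sync-1" } } }

    // Server to client: what the onProgress wrapper above emits.
    { "method": "notifications/progress",
      "params": { "progressToken": "sync-1", "progress": 60,
                  "message": "Running legacy sync..." } }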
package/dist/mcp/tools.js
CHANGED
@@ -241,7 +241,7 @@ USE 'ingest' INSTEAD for full documents, meeting notes, transcripts, or any cont
         name: 'research',
         description: `Run a comprehensive research query across the knowledge base. An internal agent iteratively searches, reads sources, cross-references findings, and synthesizes a research package with full citations.
 
-
+ASYNC: This tool returns immediately with a job_id. You MUST then poll 'research_status' with that job_id to get results. Research typically takes 2-8 minutes depending on the amount of data. Poll every 15-20 seconds. Do NOT assume it is stuck — check the 'activity' array in the status response to see what the agent is doing.
 
 WHEN TO USE:
 - Questions that span multiple sources ("What do we know about authentication?")
@@ -249,9 +249,23 @@ WHEN TO USE:
 - Building a cited research package for decision-making
 - Open-ended exploration of a topic
 
-COST: This tool makes multiple LLM calls internally (typically
+COST: This tool makes multiple LLM calls internally (typically 10-30 search + read cycles). For simple lookups, use 'search' instead — it's 10x cheaper and faster.`,
         inputSchema: zodToJsonSchema(ResearchSchema),
     },
+    // Research status (polling for async results)
+    {
+        name: 'research_status',
+        description: `Check the status of a running research job. Returns the full research package when complete.
+
+Call this after 'research' returns a job_id. Research typically takes 2-8 minutes. Poll every 15-20 seconds. The response includes an 'activity' array showing exactly what the research agent is doing (searches, sources being read, reasoning). As long as 'total_steps' is increasing or 'elapsed_seconds' is under 8 minutes, the research is progressing normally — do NOT abandon it.`,
+        inputSchema: {
+            type: 'object',
+            properties: {
+                job_id: { type: 'string', description: 'The job_id returned by the research tool' },
+            },
+            required: ['job_id'],
+        },
+    },
     // Ingest tool
     {
         name: 'ingest',
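The descriptions spell out a two-call protocol: 'research' returns a job_id immediately, and the caller polls 'research_status' until the package arrives. A client-side sketch of that loop; callTool is a hypothetical stand-in for your MCP client's tool-invocation method, and the 'complete'/'failed' terminal values are an assumption, since the diff does not show the status enum:

    // Hypothetical polling loop for the async research protocol.
    const { job_id } = await callTool('research', { query: 'What do we know about authentication?' });
    let status = await callTool('research_status', { job_id });
    while (status.status !== 'complete' && status.status !== 'failed') { // assumed terminal values
        await new Promise((resolve) => setTimeout(resolve, 15_000)); // poll every 15-20s, per the description
        status = await callTool('research_status', { job_id });
        console.error(`steps=${status.total_steps} elapsed=${status.elapsed_seconds}s`); // fields named in the description
    }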
package/dist/sync/process.d.ts
CHANGED
@@ -16,6 +16,7 @@ import { type ImageMediaType } from './processors.js';
 export interface ExtractedMetadata {
     title: string;
     summary: string;
+    description?: string;
     date: string | null;
     participants: string[];
     content_type: ContentType;
@@ -38,6 +39,13 @@ export declare function extractMetadata(content: string, filePath: string, optio
         base64: string;
         mediaType: ImageMediaType;
     };
+    fileMetadata?: {
+        filename: string;
+        sizeBytes: number;
+        createdAt: string;
+        modifiedAt: string;
+        exif?: Record<string, unknown>;
+    };
 }): Promise<ExtractedMetadata>;
 export declare function processFiles(files: DiscoveredFile[], dataDir: string, options?: {
     onProgress?: (completed: number, total: number, title: string) => void;
package/dist/sync/process.js
CHANGED
@@ -55,12 +55,24 @@ Content type guidelines:
 
 Be specific in the summary. Include concrete details, names, numbers when present.`;
 export async function extractMetadata(content, filePath, options = {}) {
-    const { model = 'claude-sonnet-4-20250514', image } = options;
+    const { model = 'claude-sonnet-4-20250514', image, fileMetadata } = options;
     const client = getAnthropic();
     // Build message content based on whether we have an image or text
     let messageContent;
     if (image) {
-        // Image analysis with Claude Vision
+        // Image analysis with Claude Vision — extract metadata AND a detailed text description
+        const imagePrompt = `Analyze this image and return ONLY valid JSON with these fields:
+
+{
+  "title": "A descriptive title for this image",
+  "summary": "2-4 sentences capturing the key takeaway or purpose of this image",
+  "description": "A comprehensive text description of everything in this image. Include all text, data, labels, numbers, charts, diagrams, and visual elements. Transcribe any visible text verbatim. For charts/graphs, describe the data points and trends. For screenshots, describe the UI elements and content. Be thorough — this description replaces the image in a text-only knowledge base.",
+  "date": "ISO date string (YYYY-MM-DD) if mentioned, otherwise null",
+  "participants": ["list", "of", "names"] if people are mentioned, otherwise [],
+  "content_type": "one of: interview|meeting|conversation|document|note|analysis"
+}
+
+Be specific and thorough in the description. Include ALL visible text, numbers, and data.`;
         messageContent = [
             {
                 type: 'image',
@@ -72,7 +84,7 @@ export async function extractMetadata(content, filePath, options = {}) {
             },
             {
                 type: 'text',
-                text: `${
+                text: `${imagePrompt}\n\nFile: ${path.basename(filePath)}${fileMetadata ? `\nFile size: ${(fileMetadata.sizeBytes / 1024).toFixed(0)} KB\nFile created: ${fileMetadata.createdAt}\nFile modified: ${fileMetadata.modifiedAt}${fileMetadata.exif ? `\nEXIF data: ${JSON.stringify(fileMetadata.exif)}` : ''}` : ''}`,
             },
         ];
     }
@@ -86,7 +98,7 @@ export async function extractMetadata(content, filePath, options = {}) {
     }
     const response = await client.messages.create({
         model,
-        max_tokens: 1000,
+        max_tokens: image ? 4000 : 1000,
         messages: [
             {
                 role: 'user',
@@ -111,6 +123,7 @@ export async function extractMetadata(content, filePath, options = {}) {
     return {
         title: parsed.title || path.basename(filePath),
         summary: parsed.summary || 'No summary available',
+        description: parsed.description || undefined,
         date: parsed.date || null,
         participants: Array.isArray(parsed.participants) ? parsed.participants : [],
         content_type: validateContentType(parsed.content_type),
@@ -150,9 +163,12 @@ async function storeSourceToDisk(sourceId, file, metadata, processedContent, dat
     const sourceDir = path.join(sourcesDir, sourceId);
     // Create source directory
     await mkdir(sourceDir, { recursive: true });
-    // Copy original file
-    const originalExt = path.extname(file.absolutePath);
-
+    // Copy original file (skip binary formats — knowledge store is text-based)
+    const originalExt = path.extname(file.absolutePath).toLowerCase();
+    const binaryExts = ['.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp', '.tiff', '.ico', '.svg'];
+    if (!binaryExts.includes(originalExt)) {
+        await copyFile(file.absolutePath, path.join(sourceDir, `original${originalExt}`));
+    }
     // Save processed content
     await writeFile(path.join(sourceDir, 'content.md'), processedContent);
     // Save metadata
@@ -223,17 +239,59 @@ export async function processFiles(files, dataDir, options = {}) {
             // 1. Read and preprocess file
             const processed = await processFile(file.absolutePath);
             // 2. Extract metadata with Claude (handles both text and images)
-            const metadata = await extractMetadata(processed.text, file.absolutePath, { model, image: processed.image });
-            // For images, use the
-
-
-
+            const metadata = await extractMetadata(processed.text, file.absolutePath, { model, image: processed.image, fileMetadata: processed.fileMetadata });
+            // For images, use the detailed description as the text content
+            let contentText;
+            if (processed.image) {
+                const lines = [
+                    `# ${metadata.title}`,
+                    '',
+                    metadata.description || metadata.summary,
+                    '',
+                    '---',
+                    '',
+                    `*Original file: ${path.basename(file.absolutePath)}*`,
+                    `*Synced from: ${file.sourceName}*`,
+                    metadata.date ? `*Date: ${metadata.date}*` : '',
+                ];
+                // Append EXIF metadata if available
+                const exif = processed.fileMetadata?.exif;
+                if (exif && Object.keys(exif).length > 0) {
+                    lines.push('');
+                    lines.push('## Image Metadata');
+                    for (const [key, value] of Object.entries(exif)) {
+                        if (value != null && value !== '') {
+                            const label = key.replace(/([A-Z])/g, ' $1').replace(/^./, s => s.toUpperCase()).trim();
+                            lines.push(`- **${label}:** ${Array.isArray(value) ? value.join(', ') : String(value)}`);
+                        }
+                    }
+                }
+                contentText = lines.filter(Boolean).join('\n');
+            }
+            else {
+                contentText = processed.text;
+            }
             // 3. Use existing ID for edits, generate new ID for new files
             const sourceId = file.existingId || generateSourceId();
-            // 4.
-
-
-
+            // 4. Store to disk FIRST — ensures content.md always exists
+            // If this fails, we skip Supabase so the file stays "new" for retry.
+            try {
+                await storeSourceToDisk(sourceId, file, metadata, contentText, dataDir);
+            }
+            catch (diskError) {
+                console.error(`[process] Disk write failed for ${file.relativePath}: ${diskError}`);
+                throw new Error(`Disk write failed for ${file.relativePath}: ${diskError}`);
+            }
+            // 5. Index in Supabase — if this fails, disk content still exists
+            // and legacy sync will pick it up on the next run.
+            try {
+                await indexSource(sourceId, file, metadata, dbPath);
+            }
+            catch (supabaseError) {
+                console.error(`[process] Supabase index failed for ${file.relativePath}: ${supabaseError}`);
+                console.error(`[process] Content saved to disk — will be indexed on next sync via legacy path`);
+                // Don't re-throw: disk write succeeded, source is safe
+            }
             if (extensionRegistry && hookContext) {
                 await extensionRegistry.runHook('onSourceCreated', {
                     id: sourceId,
@@ -265,9 +323,11 @@ export async function processFiles(files, dataDir, options = {}) {
             onProgress?.(result.processed.length + result.errors.length, files.length, batchResult.value.metadata.title);
         }
         else {
+            const errorMsg = batchResult.reason?.message || String(batchResult.reason);
+            console.error(`[process] Failed to process ${file.relativePath}: ${errorMsg}`);
             result.errors.push({
                 file,
-                error:
+                error: errorMsg,
             });
             onProgress?.(result.processed.length + result.errors.length, files.length, `Error: ${file.relativePath}`);
         }
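For reference, this is how the widened extractMetadata options fit together for an image, using the fileMetadata shape declared in process.d.ts above. All values are illustrative placeholders:

    // Illustrative call; the base64 payload and file details are invented.
    const metadata = await extractMetadata('', '/sync/2024-06-01-whiteboard.jpg', {
        image: { base64: '<base64 data>', mediaType: 'image/jpeg' },
        fileMetadata: {
            filename: '2024-06-01-whiteboard.jpg',
            sizeBytes: 482133,
            createdAt: '2024-06-01T09:14:00.000Z',
            modifiedAt: '2024-06-01T09:14:05.000Z',
            exif: { dateTaken: '2024-06-01T09:13:58.000Z', width: 4032, height: 3024 },
        },
    });
    // metadata.description then carries the full text rendering of the image,
    // which processFiles() writes into content.md in place of the binary file.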
package/dist/sync/processors.d.ts
CHANGED
@@ -17,6 +17,13 @@ export interface ProcessedContent {
         base64: string;
         mediaType: ImageMediaType;
     };
+    fileMetadata?: {
+        filename: string;
+        sizeBytes: number;
+        createdAt: string;
+        modifiedAt: string;
+        exif?: Record<string, unknown>;
+    };
 }
 export declare function processFile(filePath: string): Promise<ProcessedContent>;
 export declare function preprocessFiles(filePaths: string[], options?: {
package/dist/sync/processors.js
CHANGED
@@ -4,7 +4,7 @@
  * Converts various file formats to plain text for Claude analysis.
  * All processing is IN MEMORY ONLY - original files are never modified.
  */
-import { readFile } from 'fs/promises';
+import { readFile, stat } from 'fs/promises';
 import path from 'path';
 let pdfParser = null;
 async function getPdfParser() {
@@ -196,13 +196,107 @@ async function processImage(filePath) {
     }
     const buffer = await readFile(filePath);
     const base64 = buffer.toString('base64');
+    // Extract file-level metadata
+    const fileStat = await stat(filePath);
+    const filename = path.basename(filePath);
+    // Try to parse date from common filename patterns (e.g. WhatsApp, screenshots)
+    let dateFromFilename;
+    const whatsappMatch = filename.match(/(\d{4}-\d{2}-\d{2})/);
+    if (whatsappMatch) {
+        dateFromFilename = whatsappMatch[1];
+    }
+    // Extract EXIF metadata (GPS, camera, date, etc.)
+    let exifData;
+    try {
+        const exifr = await import('exifr');
+        const raw = await exifr.default.parse(buffer, {
+            // Request all available tags
+            tiff: true,
+            exif: true,
+            gps: true,
+            icc: false, // Skip color profile (not useful for knowledge)
+            iptc: true, // Keywords, captions, copyright
+            xmp: true, // Extended metadata
+        });
+        if (raw) {
+            // Extract the most useful fields
+            exifData = {};
+            // Camera info
+            if (raw.Make)
+                exifData.cameraMake = raw.Make;
+            if (raw.Model)
+                exifData.cameraModel = raw.Model;
+            if (raw.LensModel)
+                exifData.lens = raw.LensModel;
+            // Date
+            if (raw.DateTimeOriginal)
+                exifData.dateTaken = raw.DateTimeOriginal instanceof Date ? raw.DateTimeOriginal.toISOString() : String(raw.DateTimeOriginal);
+            if (raw.CreateDate)
+                exifData.dateCreated = raw.CreateDate instanceof Date ? raw.CreateDate.toISOString() : String(raw.CreateDate);
+            // GPS
+            if (raw.latitude != null && raw.longitude != null) {
+                exifData.gpsLatitude = raw.latitude;
+                exifData.gpsLongitude = raw.longitude;
+            }
+            if (raw.GPSAltitude != null)
+                exifData.gpsAltitude = raw.GPSAltitude;
+            // Image dimensions
+            if (raw.ImageWidth)
+                exifData.width = raw.ImageWidth;
+            if (raw.ImageHeight)
+                exifData.height = raw.ImageHeight;
+            if (raw.ExifImageWidth)
+                exifData.width = raw.ExifImageWidth;
+            if (raw.ExifImageHeight)
+                exifData.height = raw.ExifImageHeight;
+            // Software / source
+            if (raw.Software)
+                exifData.software = raw.Software;
+            if (raw.Artist)
+                exifData.artist = raw.Artist;
+            if (raw.Copyright)
+                exifData.copyright = raw.Copyright;
+            // IPTC/XMP tags
+            if (raw.Keywords)
+                exifData.keywords = raw.Keywords;
+            if (raw.Description)
+                exifData.description = raw.Description;
+            if (raw.Caption)
+                exifData.caption = raw.Caption;
+            if (raw.Subject)
+                exifData.subject = raw.Subject;
+            if (raw.Title)
+                exifData.title = raw.Title;
+            // Use EXIF date if no filename date
+            if (!dateFromFilename && exifData.dateTaken) {
+                const d = new Date(exifData.dateTaken);
+                if (!isNaN(d.getTime())) {
+                    dateFromFilename = d.toISOString().split('T')[0];
+                }
+            }
+            // Drop empty objects
+            if (Object.keys(exifData).length === 0)
+                exifData = undefined;
+        }
+    }
+    catch (exifError) {
+        console.error(`[processors] EXIF extraction failed for ${path.basename(filePath)}: ${exifError}`);
+    }
     return {
         text: '', // Will be filled by Claude vision
         format: 'image',
+        metadata: dateFromFilename ? { date: dateFromFilename } : undefined,
         image: {
             base64,
             mediaType,
         },
+        fileMetadata: {
+            filename,
+            sizeBytes: fileStat.size,
+            createdAt: fileStat.birthtime.toISOString(),
+            modifiedAt: fileStat.mtime.toISOString(),
+            ...(exifData ? { exif: exifData } : {}),
+        },
     };
 }
 // ============================================================================
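One subtlety in processImage is the date precedence: a YYYY-MM-DD pattern in the filename wins, and EXIF DateTimeOriginal is only the fallback. A condensed restatement of that logic, separate from the real code:

    // Sketch of the date-resolution order implemented above.
    function resolveImageDate(filename, exif) {
        const m = filename.match(/(\d{4}-\d{2}-\d{2})/); // e.g. WhatsApp exports, screenshots
        if (m)
            return m[1];
        if (exif?.dateTaken) {
            const d = new Date(exif.dateTaken);
            if (!isNaN(d.getTime()))
                return d.toISOString().split('T')[0]; // normalize to YYYY-MM-DD
        }
        return undefined;
    }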
package/dist/tui/browse-handlers.js
CHANGED
@@ -90,47 +90,85 @@ export async function loadFullContent(state, ui, dbPath, sourcesDir) {
     const source = getSelectedSource(state);
     if (!source)
         return;
-    // Try to load from disk first
-    const
+    // Try to load from disk first (content.md, then original file)
+    const sourceDir = path.join(sourcesDir, source.id);
+    const contentPath = path.join(sourceDir, 'content.md');
     try {
         const { readFile } = await import('fs/promises');
         state.fullContent = await readFile(contentPath, 'utf-8');
     }
     catch {
-        //
-
-
-
-
-
-
-
-
-
-
-
-            '',
-        ].join('\n');
-        if (details.themes && details.themes.length > 0) {
-            state.fullContent += '## Themes\n';
-            for (const theme of details.themes) {
-                state.fullContent += `- **${theme.name}**`;
-                if (theme.summary)
-                    state.fullContent += `: ${theme.summary}`;
-                state.fullContent += '\n';
+        // content.md not found — try to find and read an original text file
+        let foundOriginal = false;
+        try {
+            const { readFile, readdir } = await import('fs/promises');
+            const files = await readdir(sourceDir);
+            const originalFile = files.find(f => f.startsWith('original.'));
+            if (originalFile) {
+                const textExts = ['.md', '.txt', '.json', '.jsonl', '.csv', '.xml', '.yaml', '.yml', '.html', '.log'];
+                const ext = path.extname(originalFile).toLowerCase();
+                if (textExts.includes(ext)) {
+                    state.fullContent = await readFile(path.join(sourceDir, originalFile), 'utf-8');
+                    foundOriginal = true;
                 }
-            state.fullContent += '\n';
             }
-
-
-
-
-
+        }
+        catch {
+            // Source directory doesn't exist locally — fall through to DB
+        }
+        if (!foundOriginal) {
+            // Try reading from source_path (original file in sync directory)
+            const details = await getSourceById(dbPath, source.id);
+            if (details?.source_path) {
+                try {
+                    const { readFile } = await import('fs/promises');
+                    const ext = path.extname(details.source_path).toLowerCase();
+                    const textExts = ['.md', '.txt', '.json', '.jsonl', '.csv', '.xml', '.yaml', '.yml', '.html', '.log'];
+                    if (textExts.includes(ext)) {
+                        state.fullContent = await readFile(details.source_path, 'utf-8');
+                        foundOriginal = true;
+                    }
+                }
+                catch {
+                    // source_path file doesn't exist or can't be read
+                }
+            }
+            if (!foundOriginal) {
+                // Final fallback: database summary view
+                if (details) {
+                    state.fullContent = [
+                        `# ${details.title}`,
+                        '',
+                        `**Type:** ${details.source_type} · ${details.content_type}`,
+                        `**Date:** ${formatDate(details.created_at)}`,
+                        `**Projects:** ${details.projects.join(', ') || '(none)'}`,
+                        '',
+                        '## Summary',
+                        details.summary,
+                        '',
+                    ].join('\n');
+                    if (details.themes && details.themes.length > 0) {
+                        state.fullContent += '## Themes\n';
+                        for (const theme of details.themes) {
+                            state.fullContent += `- **${theme.name}**`;
+                            if (theme.summary)
+                                state.fullContent += `: ${theme.summary}`;
+                            state.fullContent += '\n';
+                        }
+                        state.fullContent += '\n';
+                    }
+                    if (details.quotes && details.quotes.length > 0) {
+                        state.fullContent += '## Key Quotes\n';
+                        for (const quote of details.quotes.slice(0, 10)) {
+                            const speaker = quote.speaker === 'user' ? '[You]' : `[${quote.speaker_name || 'Participant'}]`;
+                            state.fullContent += `> ${speaker} "${quote.text}"\n\n`;
+                        }
+                    }
+                }
+                else {
+                    state.fullContent = `Could not load content for ${source.title}`;
+                }
             }
         }
-    }
-    else {
-        state.fullContent = `Could not load content for ${source.title}`;
     }
     // Store raw lines for searching
@@ -370,6 +408,7 @@ export async function applyFilter(state, ui, query, filterMode, dbPath, dataDir,
             content_type: r.content_type,
             projects: r.projects,
             created_at: r.created_at,
+            indexed_at: r.created_at,
             summary: r.summary,
             score: r.score,
         }));
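With these changes the TUI resolves full content through a four-step fallback: the local content.md, then a text original.* sibling, then the file at source_path, and finally a summary view assembled from the database. A condensed sketch of that chain; the helper names are hypothetical, not functions in the package:

    // Condensed restatement of the fallback order in loadFullContent.
    async function resolveContent(sourceDir, details) {
        const attempts = [
            () => readFile(path.join(sourceDir, 'content.md'), 'utf-8'),
            () => readOriginalTextFile(sourceDir),      // hypothetical: original.<text ext>
            () => readTextFileAt(details?.source_path), // hypothetical: text file in the sync directory
        ];
        for (const attempt of attempts) {
            try {
                return await attempt();
            }
            catch {
                // try the next fallback
            }
        }
        return renderSummaryView(details); // hypothetical: title/summary/themes/quotes
    }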