escribano 0.1.4 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -7,10 +7,12 @@
7
7
  import { execSync } from 'node:child_process';
8
8
  import { mkdir, readFile, writeFile } from 'node:fs/promises';
9
9
  import { homedir } from 'node:os';
10
- import path from 'node:path';
10
+ import path, { dirname, resolve } from 'node:path';
11
+ import { fileURLToPath } from 'node:url';
11
12
  import { log, step } from '../pipeline/context.js';
12
13
  import { normalizeAppNames } from '../services/app-normalization.js';
13
14
  import { groupTopicBlocksIntoSubjects, saveSubjectsToDatabase, } from '../services/subject-grouping.js';
15
+ const __dirname = dirname(fileURLToPath(import.meta.url));
14
16
  export async function generateArtifactV3(recordingId, repos, intelligence, options) {
15
17
  const format = options.format || 'card';
16
18
  log('info', `[Artifact V3.1] Generating ${format} artifact for recording ${recordingId}...`);
@@ -204,7 +206,7 @@ async function generateLlmArtifact(subjects, groupingResult, format, recording,
204
206
  : format === 'standup'
205
207
  ? 'standup.md'
206
208
  : 'summary-v3.md';
207
- const promptPath = path.join(process.cwd(), 'prompts', promptFileName);
209
+ const promptPath = resolve(__dirname, '..', '..', 'prompts', promptFileName);
208
210
  let promptTemplate;
209
211
  try {
210
212
  promptTemplate = await readFile(promptPath, 'utf-8');
@@ -5,8 +5,10 @@
5
5
  */
6
6
  import { mkdir, readFile, writeFile } from 'node:fs/promises';
7
7
  import { homedir } from 'node:os';
8
- import path from 'node:path';
8
+ import path, { dirname, resolve } from 'node:path';
9
+ import { fileURLToPath } from 'node:url';
9
10
  import { log } from '../pipeline/context.js';
11
+ const __dirname = dirname(fileURLToPath(import.meta.url));
10
12
  /**
11
13
  * Generate a work session summary artifact from processed TopicBlocks.
12
14
  *
@@ -81,7 +83,7 @@ export async function generateSummaryV3(recordingId, repos, intelligence, option
81
83
  */
82
84
  async function generateLlmSummary(sections, recording, intelligence) {
83
85
  // Read prompt template
84
- const promptPath = path.join(process.cwd(), 'prompts', 'summary-v3.md');
86
+ const promptPath = resolve(__dirname, '..', '..', 'prompts', 'summary-v3.md');
85
87
  let promptTemplate;
86
88
  try {
87
89
  promptTemplate = await readFile(promptPath, 'utf-8');
@@ -1,9 +1,12 @@
1
1
  import { exec, spawn } from 'node:child_process';
2
+ import { existsSync } from 'node:fs';
2
3
  import { mkdir, readFile, rm } from 'node:fs/promises';
3
4
  import os from 'node:os';
4
- import path from 'node:path';
5
+ import path, { dirname, resolve } from 'node:path';
6
+ import { fileURLToPath } from 'node:url';
5
7
  import { promisify } from 'node:util';
6
8
  const execAsync = promisify(exec);
9
+ const __dirname = dirname(fileURLToPath(import.meta.url));
7
10
  export function createSileroPreprocessor() {
8
11
  let currentProcess = null;
9
12
  return {
@@ -19,19 +22,63 @@ export function createSileroPreprocessor() {
19
22
  catch (error) {
20
23
  throw new Error(`Failed to pre-convert audio for VAD: ${error.message}`);
21
24
  }
22
- const scriptPath = path.join(process.cwd(), 'src', 'scripts', 'audio_preprocessor.py');
25
+ const scriptPath = resolve(__dirname, '..', '..', 'src', 'scripts', 'audio_preprocessor.py');
26
+ if (!existsSync(scriptPath)) {
27
+ throw new Error(`Audio preprocessor script not found at: ${scriptPath}`);
28
+ }
23
29
  const command = `uv run "${scriptPath}" --audio "${inputWavPath}" --output-dir "${tempDir}" --output-json "${manifestPath}"`;
24
30
  try {
25
31
  console.log(`Running Silero VAD on ${inputWavPath}...`);
32
+ if (process.env.ESCRIBANO_VERBOSE === 'true') {
33
+ console.log(` Script path: ${scriptPath}`);
34
+ console.log(` Script exists: ${existsSync(scriptPath)}`);
35
+ console.log(` Command: ${command}`);
36
+ console.log(` Working directory (user): ${process.cwd()}`);
37
+ try {
38
+ const { stdout: uvVersion } = await execAsync('uv --version');
39
+ console.log(` uv version: ${uvVersion.trim()}`);
40
+ }
41
+ catch {
42
+ console.log(` uv version: NOT FOUND`);
43
+ }
44
+ }
26
45
  currentProcess = spawn('sh', ['-c', command]);
46
+ let stderr = '';
47
+ let stdout = '';
48
+ if (currentProcess.stderr) {
49
+ currentProcess.stderr.on('data', (data) => {
50
+ stderr += data.toString();
51
+ });
52
+ }
53
+ if (currentProcess.stdout) {
54
+ currentProcess.stdout.on('data', (data) => {
55
+ stdout += data.toString();
56
+ });
57
+ }
27
58
  await new Promise((resolve, reject) => {
28
59
  currentProcess?.on('close', (code) => {
29
60
  currentProcess = null;
30
61
  if (code === 0) {
62
+ if (process.env.ESCRIBANO_VERBOSE === 'true' && stdout) {
63
+ console.log(` Silero VAD stdout:\n${stdout
64
+ .split('\n')
65
+ .map((l) => ' ' + l)
66
+ .join('\n')}`);
67
+ }
31
68
  resolve();
32
69
  }
33
70
  else {
34
- reject(new Error(`Silero VAD failed with code ${code}`));
71
+ console.error(` Silero VAD stderr:\n${stderr
72
+ .split('\n')
73
+ .map((l) => ' ' + l)
74
+ .join('\n')}`);
75
+ if (stdout) {
76
+ console.error(` Silero VAD stdout:\n${stdout
77
+ .split('\n')
78
+ .map((l) => ' ' + l)
79
+ .join('\n')}`);
80
+ }
81
+ reject(new Error(`Silero VAD failed with code ${code}: ${stderr || stdout || 'No output captured'}`));
35
82
  }
36
83
  });
37
84
  currentProcess?.on('error', (err) => {
@@ -4,12 +4,14 @@
4
4
  * Implements IntelligenceService using Ollama REST API
5
5
  */
6
6
  import { readFileSync } from 'node:fs';
7
- import { join } from 'node:path';
7
+ import { dirname, resolve } from 'node:path';
8
+ import { fileURLToPath } from 'node:url';
8
9
  import { Agent, fetch as undiciFetch } from 'undici';
9
10
  import { z } from 'zod';
10
11
  import { classificationSchema, intelligenceConfigSchema, transcriptMetadataSchema, } from '../0_types.js';
11
12
  // Debug logging controlled by environment variable
12
13
  const DEBUG_OLLAMA = process.env.ESCRIBANO_DEBUG_OLLAMA === 'true';
14
+ const __dirname = dirname(fileURLToPath(import.meta.url));
13
15
  // TODO: put in an util
14
16
  export function debugLog(...args) {
15
17
  if (DEBUG_OLLAMA) {
@@ -193,7 +195,7 @@ async function classifySegmentWithOllama(segment, config, transcript) {
193
195
  return raw;
194
196
  }
195
197
  function loadClassifySegmentPrompt(segment, transcript) {
196
- const promptPath = join(process.cwd(), 'prompts', 'classify-segment.md');
198
+ const promptPath = resolve(__dirname, '..', '..', 'prompts', 'classify-segment.md');
197
199
  let prompt = readFileSync(promptPath, 'utf-8');
198
200
  const timeRangeStr = `[${segment.timeRange[0]}s - ${segment.timeRange[1]}s]`;
199
201
  const ocrContext = segment.contexts.map((c) => `${c.type}: ${c.value}`).join(', ') || 'None';
@@ -208,7 +210,7 @@ function loadClassifySegmentPrompt(segment, transcript) {
208
210
  return prompt;
209
211
  }
210
212
  function loadClassifyPrompt(transcript, visualLogs) {
211
- const promptPath = join(process.cwd(), 'prompts', 'classify.md');
213
+ const promptPath = resolve(__dirname, '..', '..', 'prompts', 'classify.md');
212
214
  let prompt = readFileSync(promptPath, 'utf-8');
213
215
  const segmentsText = transcript.segments
214
216
  .map((seg) => `[seg-${seg.id}] [${seg.start}s - ${seg.end}s] ${seg.text}`)
@@ -241,7 +243,7 @@ function loadClassifyPrompt(transcript, visualLogs) {
241
243
  */
242
244
  function buildVLMSingleImagePrompt() {
243
245
  try {
244
- const promptPath = join(process.cwd(), 'prompts', 'vlm-single.md');
246
+ const promptPath = resolve(__dirname, '..', '..', 'prompts', 'vlm-single.md');
245
247
  return readFileSync(promptPath, 'utf-8');
246
248
  }
247
249
  catch {
@@ -426,7 +428,7 @@ async function extractTopicsWithOllama(observations, config) {
426
428
  return [];
427
429
  let prompt;
428
430
  try {
429
- const promptPath = join(process.cwd(), 'prompts', 'topic-extract.md');
431
+ const promptPath = resolve(__dirname, '..', '..', 'prompts', 'topic-extract.md');
430
432
  const template = readFileSync(promptPath, 'utf-8');
431
433
  prompt = template.replace('{{OBSERVATIONS}}', textSamples.join('\n---\n'));
432
434
  }
@@ -657,7 +659,7 @@ async function extractMetadata(transcript, classification, config, visualLogs) {
657
659
  return raw;
658
660
  }
659
661
  function loadMetadataPrompt(transcript, classification, visualLogs) {
660
- const promptPath = join(process.cwd(), 'prompts', 'extract-metadata.md');
662
+ const promptPath = resolve(__dirname, '..', '..', 'prompts', 'extract-metadata.md');
661
663
  let prompt = readFileSync(promptPath, 'utf-8');
662
664
  const classificationSummary = Object.entries(classification)
663
665
  .filter(([_, score]) => score >= 25)
@@ -698,7 +700,7 @@ async function generateArtifact(artifactType, context, config) {
698
700
  return response;
699
701
  }
700
702
  function loadArtifactPrompt(artifactType, context) {
701
- const promptPath = join(process.cwd(), 'prompts', `${artifactType}.md`);
703
+ const promptPath = resolve(__dirname, '..', '..', 'prompts', `${artifactType}.md`);
702
704
  let prompt = readFileSync(promptPath, 'utf-8');
703
705
  // TODO: Implement robust transcript cleaning (Milestone 4)
704
706
  prompt = prompt.replace('{{TRANSCRIPT_ALL}}', context.transcript.fullText);
@@ -5,12 +5,15 @@
5
5
  * Used for extracting screenshots and detecting scene changes.
6
6
  */
7
7
  import { exec, spawn } from 'node:child_process';
8
+ import { existsSync } from 'node:fs';
8
9
  import { mkdir, readdir, readFile, rm } from 'node:fs/promises';
9
10
  import os from 'node:os';
10
- import path from 'node:path';
11
+ import path, { dirname, resolve } from 'node:path';
12
+ import { fileURLToPath } from 'node:url';
11
13
  import { promisify } from 'node:util';
12
14
  import { debugLog } from './intelligence.ollama.adapter.js';
13
15
  const execAsync = promisify(exec);
16
+ const __dirname = dirname(fileURLToPath(import.meta.url));
14
17
  // Scene detection configuration (with env var overrides)
15
18
  // Lower threshold = more sensitive = more scene changes detected
16
19
  // Examples: 0.3 (sensitive), 0.4 (default), 0.5 (conservative)
@@ -268,15 +271,16 @@ export function createFfmpegVideoService() {
268
271
  * OCR is parallelized across all available CPU cores.
269
272
  */
270
273
  runVisualIndexing: async (framesDir, outputPath) => {
271
- const scriptPath = path.join(process.cwd(), 'src', 'scripts', 'visual_observer_base.py');
274
+ const scriptPath = resolve(__dirname, '..', '..', 'src', 'scripts', 'visual_observer_base.py');
275
+ if (!existsSync(scriptPath)) {
276
+ throw new Error(`Visual observer script not found at: ${scriptPath}`);
277
+ }
272
278
  const frameInterval = Number(process.env.ESCRIBANO_FRAME_INTERVAL) || 2;
273
279
  const workers = os.cpus().length;
274
- // Use uv run to execute the script with its environment
275
- // --workers enables parallel OCR processing
276
280
  const command = `uv run "${scriptPath}" --frames-dir "${framesDir}" --output "${outputPath}" --frame-interval ${frameInterval} --workers ${workers}`;
277
281
  try {
278
282
  await execAsync(command, {
279
- cwd: path.join(process.cwd(), 'src', 'scripts'),
283
+ cwd: dirname(scriptPath),
280
284
  });
281
285
  const content = await readFile(outputPath, 'utf-8');
282
286
  return JSON.parse(content);
package/dist/index.js CHANGED
@@ -12,6 +12,7 @@ import { createFilesystemCaptureSource } from './adapters/capture.filesystem.ada
12
12
  import { cleanupMlxBridge, initializeSystem, processVideo, } from './batch-context.js';
13
13
  import { getDbPath } from './db/index.js';
14
14
  import { checkPrerequisites, hasMissingPrerequisites, printDoctorResults, } from './prerequisites.js';
15
+ import { logEnvironmentVariables } from './utils/env-logger.js';
15
16
  const MODELS_DIR = path.join(homedir(), '.escribano', 'models');
16
17
  const MODEL_FILE = 'ggml-large-v3.bin';
17
18
  const MODEL_PATH = path.join(MODELS_DIR, MODEL_FILE);
@@ -96,6 +97,8 @@ Output: Markdown summary saved to ~/.escribano/artifacts/
96
97
  }
97
98
  async function run(args) {
98
99
  const { force, file: filePath, skipSummary, micAudio, systemAudio, format, includePersonal, copyToClipboard, printToStdout, } = args;
100
+ // Log environment variables if verbose mode is enabled
101
+ logEnvironmentVariables();
99
102
  // Initialize system (reuses batch-context for consistency)
100
103
  console.log('Initializing database...');
101
104
  const ctx = await initializeSystem();
@@ -5,7 +5,9 @@
5
5
  * This is the foundation for the new artifact architecture.
6
6
  */
7
7
  import { readFileSync } from 'node:fs';
8
- import { join } from 'node:path';
8
+ import { dirname, resolve } from 'node:path';
9
+ import { fileURLToPath } from 'node:url';
10
+ const __dirname = dirname(fileURLToPath(import.meta.url));
9
11
  const PERSONAL_APPS = new Set([
10
12
  'WhatsApp',
11
13
  'Instagram',
@@ -144,7 +146,7 @@ ID: ${b.id}`;
144
146
  : `"${blockIdList[0]}"`;
145
147
  let template;
146
148
  try {
147
- const promptPath = join(process.cwd(), 'prompts', 'subject-grouping.md');
149
+ const promptPath = resolve(__dirname, '..', '..', 'prompts', 'subject-grouping.md');
148
150
  template = readFileSync(promptPath, 'utf-8');
149
151
  }
150
152
  catch {
@@ -0,0 +1,262 @@
1
+ import { readFileSync } from 'node:fs';
2
+ import { beforeEach, describe, expect, it, vi } from 'vitest';
3
+ // Mock fs module
4
+ vi.mock('node:fs', () => ({
5
+ readFileSync: vi.fn(),
6
+ }));
7
+ // Mock process.env
8
+ const originalEnv = { ...process.env };
9
+ beforeEach(() => {
10
+ vi.clearAllMocks();
11
+ // Reset process.env
12
+ process.env = { ...originalEnv };
13
+ });
14
+ // Import after mocking
15
+ const { logEnvironmentVariables } = await import('../../utils/env-logger.js');
16
+ describe('Environment Variable Logger', () => {
17
+ describe('parseEnvExample', () => {
18
+ it('parses simple variable with description', async () => {
19
+ const mockContent = `# Enable verbose logging
20
+ ESCRIBANO_VERBOSE=false`;
21
+ vi.mocked(readFileSync).mockReturnValue(mockContent);
22
+ // Call the function indirectly via logEnvironmentVariables
23
+ process.env.ESCRIBANO_VERBOSE = 'true';
24
+ const consoleSpy = vi.spyOn(console, 'log').mockImplementation(() => { });
25
+ logEnvironmentVariables();
26
+ expect(consoleSpy).toHaveBeenCalled();
27
+ const output = consoleSpy.mock.calls
28
+ .map((call) => call.join(' '))
29
+ .join('\n');
30
+ expect(output).toContain('ESCRIBANO_VERBOSE');
31
+ expect(output).toContain('Enable verbose logging');
32
+ consoleSpy.mockRestore();
33
+ });
34
+ it('skips section headers', async () => {
35
+ const mockContent = `# === Frame Extraction ===
36
+ # Output frame width
37
+ ESCRIBANO_FRAME_WIDTH=1024`;
38
+ vi.mocked(readFileSync).mockReturnValue(mockContent);
39
+ process.env.ESCRIBANO_VERBOSE = 'true';
40
+ const consoleSpy = vi.spyOn(console, 'log').mockImplementation(() => { });
41
+ logEnvironmentVariables();
42
+ const output = consoleSpy.mock.calls
43
+ .map((call) => call.join(' '))
44
+ .join('\n');
45
+ expect(output).toContain('ESCRIBANO_FRAME_WIDTH');
46
+ expect(output).toContain('Output frame width');
47
+ expect(output).not.toContain('Frame Extraction');
48
+ consoleSpy.mockRestore();
49
+ });
50
+ it('skips commented/deprecated variables', async () => {
51
+ const mockContent = `# Active variable
52
+ ESCRIBANO_VERBOSE=false
53
+ # ESCRIBANO_DEPRECATED=value`;
54
+ vi.mocked(readFileSync).mockReturnValue(mockContent);
55
+ process.env.ESCRIBANO_VERBOSE = 'true';
56
+ const consoleSpy = vi.spyOn(console, 'log').mockImplementation(() => { });
57
+ logEnvironmentVariables();
58
+ const output = consoleSpy.mock.calls
59
+ .map((call) => call.join(' '))
60
+ .join('\n');
61
+ expect(output).toContain('ESCRIBANO_VERBOSE');
62
+ expect(output).not.toContain('ESCRIBANO_DEPRECATED');
63
+ consoleSpy.mockRestore();
64
+ });
65
+ it('skips non-ESCRIBANO variables', async () => {
66
+ const mockContent = `ESCRIBANO_VERBOSE=false
67
+ OTHER_VAR=value`;
68
+ vi.mocked(readFileSync).mockReturnValue(mockContent);
69
+ process.env.ESCRIBANO_VERBOSE = 'true';
70
+ const consoleSpy = vi.spyOn(console, 'log').mockImplementation(() => { });
71
+ logEnvironmentVariables();
72
+ const output = consoleSpy.mock.calls
73
+ .map((call) => call.join(' '))
74
+ .join('\n');
75
+ expect(output).toContain('ESCRIBANO_VERBOSE');
76
+ expect(output).not.toContain('OTHER_VAR');
77
+ consoleSpy.mockRestore();
78
+ });
79
+ it('handles empty file gracefully', async () => {
80
+ vi.mocked(readFileSync).mockReturnValue('');
81
+ process.env.ESCRIBANO_VERBOSE = 'true';
82
+ const consoleSpy = vi.spyOn(console, 'log').mockImplementation(() => { });
83
+ logEnvironmentVariables();
84
+ const output = consoleSpy.mock.calls
85
+ .map((call) => call.join(' '))
86
+ .join('\n');
87
+ expect(output).toContain('Environment Variables');
88
+ consoleSpy.mockRestore();
89
+ });
90
+ it('handles file not found gracefully', async () => {
91
+ vi.mocked(readFileSync).mockImplementation(() => {
92
+ throw new Error('ENOENT');
93
+ });
94
+ process.env.ESCRIBANO_VERBOSE = 'true';
95
+ const consoleSpy = vi.spyOn(console, 'log').mockImplementation(() => { });
96
+ logEnvironmentVariables();
97
+ const output = consoleSpy.mock.calls
98
+ .map((call) => call.join(' '))
99
+ .join('\n');
100
+ expect(output).toContain('Could not parse .env.example');
101
+ consoleSpy.mockRestore();
102
+ });
103
+ });
104
+ describe('logEnvironmentVariables', () => {
105
+ it('does not log when ESCRIBANO_VERBOSE is false', () => {
106
+ process.env.ESCRIBANO_VERBOSE = 'false';
107
+ const consoleSpy = vi.spyOn(console, 'log').mockImplementation(() => { });
108
+ logEnvironmentVariables();
109
+ expect(consoleSpy).not.toHaveBeenCalled();
110
+ consoleSpy.mockRestore();
111
+ });
112
+ it('does not log when ESCRIBANO_VERBOSE is not set', () => {
113
+ delete process.env.ESCRIBANO_VERBOSE;
114
+ const consoleSpy = vi.spyOn(console, 'log').mockImplementation(() => { });
115
+ logEnvironmentVariables();
116
+ expect(consoleSpy).not.toHaveBeenCalled();
117
+ consoleSpy.mockRestore();
118
+ });
119
+ it('logs when ESCRIBANO_VERBOSE is true', () => {
120
+ const mockContent = 'ESCRIBANO_VERBOSE=false';
121
+ vi.mocked(readFileSync).mockReturnValue(mockContent);
122
+ process.env.ESCRIBANO_VERBOSE = 'true';
123
+ const consoleSpy = vi.spyOn(console, 'log').mockImplementation(() => { });
124
+ logEnvironmentVariables();
125
+ expect(consoleSpy).toHaveBeenCalled();
126
+ consoleSpy.mockRestore();
127
+ });
128
+ it('marks custom values with [CUSTOM]', () => {
129
+ const mockContent = `# Default batch size
130
+ ESCRIBANO_VLM_BATCH_SIZE=4`;
131
+ vi.mocked(readFileSync).mockReturnValue(mockContent);
132
+ process.env.ESCRIBANO_VERBOSE = 'true';
133
+ process.env.ESCRIBANO_VLM_BATCH_SIZE = '8';
134
+ const consoleSpy = vi.spyOn(console, 'log').mockImplementation(() => { });
135
+ logEnvironmentVariables();
136
+ const output = consoleSpy.mock.calls
137
+ .map((call) => call.join(' '))
138
+ .join('\n');
139
+ expect(output).toContain('[CUSTOM]');
140
+ expect(output).toContain('Current: 8');
141
+ expect(output).toContain('Default: 4');
142
+ consoleSpy.mockRestore();
143
+ });
144
+ it('does not mark default values', () => {
145
+ const mockContent = `# Default batch size
146
+ ESCRIBANO_VLM_BATCH_SIZE=4`;
147
+ vi.mocked(readFileSync).mockReturnValue(mockContent);
148
+ process.env.ESCRIBANO_VERBOSE = 'true';
149
+ process.env.ESCRIBANO_VLM_BATCH_SIZE = '4';
150
+ const consoleSpy = vi.spyOn(console, 'log').mockImplementation(() => { });
151
+ logEnvironmentVariables();
152
+ const output = consoleSpy.mock.calls
153
+ .map((call) => call.join(' '))
154
+ .join('\n');
155
+ expect(output).not.toContain('[CUSTOM]');
156
+ consoleSpy.mockRestore();
157
+ });
158
+ it('masks secret tokens', () => {
159
+ const mockContent = `ESCRIBANO_OUTLINE_TOKEN=`;
160
+ vi.mocked(readFileSync).mockReturnValue(mockContent);
161
+ process.env.ESCRIBANO_VERBOSE = 'true';
162
+ process.env.ESCRIBANO_OUTLINE_TOKEN = 'secret-api-key-123';
163
+ const consoleSpy = vi.spyOn(console, 'log').mockImplementation(() => { });
164
+ logEnvironmentVariables();
165
+ const output = consoleSpy.mock.calls
166
+ .map((call) => call.join(' '))
167
+ .join('\n');
168
+ expect(output).toContain('***');
169
+ expect(output).not.toContain('secret-api-key-123');
170
+ consoleSpy.mockRestore();
171
+ });
172
+ it('does not mask non-secret values', () => {
173
+ const mockContent = `ESCRIBANO_VLM_BATCH_SIZE=4`;
174
+ vi.mocked(readFileSync).mockReturnValue(mockContent);
175
+ process.env.ESCRIBANO_VERBOSE = 'true';
176
+ process.env.ESCRIBANO_VLM_BATCH_SIZE = '8';
177
+ const consoleSpy = vi.spyOn(console, 'log').mockImplementation(() => { });
178
+ logEnvironmentVariables();
179
+ const output = consoleSpy.mock.calls
180
+ .map((call) => call.join(' '))
181
+ .join('\n');
182
+ expect(output).toContain('8');
183
+ expect(output).not.toContain('***');
184
+ consoleSpy.mockRestore();
185
+ });
186
+ it('shows "not set" for undefined variables', () => {
187
+ const mockContent = `ESCRIBANO_VLM_BATCH_SIZE=4`;
188
+ vi.mocked(readFileSync).mockReturnValue(mockContent);
189
+ process.env.ESCRIBANO_VERBOSE = 'true';
190
+ delete process.env.ESCRIBANO_VLM_BATCH_SIZE;
191
+ const consoleSpy = vi.spyOn(console, 'log').mockImplementation(() => { });
192
+ logEnvironmentVariables();
193
+ const output = consoleSpy.mock.calls
194
+ .map((call) => call.join(' '))
195
+ .join('\n');
196
+ expect(output).toContain('not set');
197
+ consoleSpy.mockRestore();
198
+ });
199
+ it('shows "(empty)" for empty default values', () => {
200
+ const mockContent = `ESCRIBANO_OUTLINE_TOKEN=`;
201
+ vi.mocked(readFileSync).mockReturnValue(mockContent);
202
+ process.env.ESCRIBANO_VERBOSE = 'true';
203
+ const consoleSpy = vi.spyOn(console, 'log').mockImplementation(() => { });
204
+ logEnvironmentVariables();
205
+ const output = consoleSpy.mock.calls
206
+ .map((call) => call.join(' '))
207
+ .join('\n');
208
+ expect(output).toContain('(empty)');
209
+ consoleSpy.mockRestore();
210
+ });
211
+ it('sorts variables alphabetically', () => {
212
+ const mockContent = `ESCRIBANO_ZEBRA=1
213
+ ESCRIBANO_ALPHA=2
214
+ ESCRIBANO_MIDDLE=3`;
215
+ vi.mocked(readFileSync).mockReturnValue(mockContent);
216
+ process.env.ESCRIBANO_VERBOSE = 'true';
217
+ const consoleSpy = vi.spyOn(console, 'log').mockImplementation(() => { });
218
+ logEnvironmentVariables();
219
+ const output = consoleSpy.mock.calls
220
+ .map((call) => call.join(' '))
221
+ .join('\n');
222
+ const alphaIndex = output.indexOf('ESCRIBANO_ALPHA');
223
+ const middleIndex = output.indexOf('ESCRIBANO_MIDDLE');
224
+ const zebraIndex = output.indexOf('ESCRIBANO_ZEBRA');
225
+ expect(alphaIndex).toBeLessThan(middleIndex);
226
+ expect(middleIndex).toBeLessThan(zebraIndex);
227
+ consoleSpy.mockRestore();
228
+ });
229
+ it('includes multi-line descriptions', () => {
230
+ const mockContent = `# First line of description
231
+ # Second line of description
232
+ ESCRIBANO_VERBOSE=false`;
233
+ vi.mocked(readFileSync).mockReturnValue(mockContent);
234
+ process.env.ESCRIBANO_VERBOSE = 'true';
235
+ const consoleSpy = vi.spyOn(console, 'log').mockImplementation(() => { });
236
+ logEnvironmentVariables();
237
+ const output = consoleSpy.mock.calls
238
+ .map((call) => call.join(' '))
239
+ .join('\n');
240
+ expect(output).toContain('First line of description');
241
+ expect(output).toContain('Second line of description');
242
+ consoleSpy.mockRestore();
243
+ });
244
+ });
245
+ describe('text wrapping', () => {
246
+ it('wraps long descriptions to fit width', () => {
247
+ const longDescription = 'This is a very long description that should be wrapped across multiple lines to fit within the specified width limit for better readability in the console output';
248
+ const mockContent = `# ${longDescription}
249
+ ESCRIBANO_VERBOSE=false`;
250
+ vi.mocked(readFileSync).mockReturnValue(mockContent);
251
+ process.env.ESCRIBANO_VERBOSE = 'true';
252
+ const consoleSpy = vi.spyOn(console, 'log').mockImplementation(() => { });
253
+ logEnvironmentVariables();
254
+ // The description should appear in the output
255
+ const output = consoleSpy.mock.calls
256
+ .map((call) => call.join(' '))
257
+ .join('\n');
258
+ expect(output).toContain('very long description');
259
+ consoleSpy.mockRestore();
260
+ });
261
+ });
262
+ });
@@ -0,0 +1,166 @@
1
+ /**
2
+ * Environment Variable Logger
3
+ *
4
+ * Parses .env.example to extract default values and descriptions,
5
+ * then logs all ESCRIBANO_* environment variables with comparisons
6
+ * to their defaults. Only logs when ESCRIBANO_VERBOSE=true.
7
+ */
8
+ import { readFileSync } from 'node:fs';
9
+ import { resolve } from 'node:path';
10
+ const SECRET_VARS = ['ESCRIBANO_OUTLINE_TOKEN'];
11
+ /**
12
+ * Parse .env.example file to extract variable names, defaults, and descriptions.
13
+ * Returns empty array if file not found.
14
+ */
15
+ function parseEnvExample() {
16
+ const envExamplePath = new URL('../../.env.example', import.meta.url);
17
+ let content;
18
+ try {
19
+ content = readFileSync(envExamplePath, 'utf-8');
20
+ }
21
+ catch {
22
+ // File not found or unreadable
23
+ return [];
24
+ }
25
+ const vars = [];
26
+ const lines = content.split('\n');
27
+ let currentDescription = [];
28
+ for (const line of lines) {
29
+ const trimmedLine = line.trim();
30
+ // Skip empty lines - reset description
31
+ if (trimmedLine === '') {
32
+ currentDescription = [];
33
+ continue;
34
+ }
35
+ // Comment line
36
+ if (trimmedLine.startsWith('#')) {
37
+ const commentContent = trimmedLine.slice(1).trim();
38
+ // Skip section headers (pattern: # ===)
39
+ if (commentContent.startsWith('===')) {
40
+ currentDescription = [];
41
+ continue;
42
+ }
43
+ // Skip deprecated section marker
44
+ if (commentContent.toLowerCase().includes('deprecated')) {
45
+ currentDescription = [];
46
+ continue;
47
+ }
48
+ // Accumulate description
49
+ currentDescription.push(commentContent);
50
+ continue;
51
+ }
52
+ // Variable line (contains =)
53
+ if (trimmedLine.includes('=')) {
54
+ const eqIndex = trimmedLine.indexOf('=');
55
+ const name = trimmedLine.slice(0, eqIndex).trim();
56
+ const value = trimmedLine.slice(eqIndex + 1).trim();
57
+ // Skip if name starts with # (deprecated/commented)
58
+ if (name.startsWith('#')) {
59
+ currentDescription = [];
60
+ continue;
61
+ }
62
+ // Only track ESCRIBANO_* variables
63
+ if (name.startsWith('ESCRIBANO_')) {
64
+ vars.push({
65
+ name,
66
+ defaultValue: value,
67
+ description: currentDescription.join(' '),
68
+ });
69
+ }
70
+ currentDescription = [];
71
+ }
72
+ }
73
+ return vars;
74
+ }
75
+ /**
76
+ * Check if a variable should be masked (secret).
77
+ */
78
+ function isSecretVar(name) {
79
+ return SECRET_VARS.includes(name);
80
+ }
81
+ /**
82
+ * Format value for display, masking secrets if needed.
83
+ */
84
+ function formatValue(value, isSecret) {
85
+ if (value === 'not set') {
86
+ return 'not set';
87
+ }
88
+ if (isSecret && value !== 'not set' && value !== '') {
89
+ return '***';
90
+ }
91
+ if (value === '') {
92
+ return '(empty)';
93
+ }
94
+ return value;
95
+ }
96
+ /**
97
+ * Main logging function. Only runs when ESCRIBANO_VERBOSE=true.
98
+ */
99
+ export function logEnvironmentVariables() {
100
+ if (process.env.ESCRIBANO_VERBOSE !== 'true') {
101
+ return;
102
+ }
103
+ const envVars = parseEnvExample();
104
+ if (envVars.length === 0) {
105
+ console.log('\n=== Environment Variables ===');
106
+ console.log(' (Could not parse .env.example)\n');
107
+ return;
108
+ }
109
+ // Build list of vars with their current values
110
+ const varsWithValues = envVars.map((varDef) => {
111
+ const currentValue = process.env[varDef.name] ?? 'not set';
112
+ const isCustom = currentValue !== varDef.defaultValue && currentValue !== 'not set';
113
+ return {
114
+ ...varDef,
115
+ currentValue,
116
+ isCustom,
117
+ isSecret: isSecretVar(varDef.name),
118
+ };
119
+ });
120
+ // Sort alphabetically by name
121
+ varsWithValues.sort((a, b) => a.name.localeCompare(b.name));
122
+ // Log output
123
+ console.log('\n=== Environment Variables ===\n');
124
+ for (const varDef of varsWithValues) {
125
+ const marker = varDef.isCustom ? ' [CUSTOM]' : '';
126
+ const displayCurrent = formatValue(varDef.currentValue, varDef.isSecret);
127
+ const displayDefault = formatValue(varDef.defaultValue, false);
128
+ console.log(` ${varDef.name}${marker}`);
129
+ console.log(` Current: ${displayCurrent}`);
130
+ console.log(` Default: ${displayDefault}`);
131
+ if (varDef.description) {
132
+ // Wrap description to fit nicely (max ~60 chars per line)
133
+ const wrappedDesc = wrapText(varDef.description, 58);
134
+ for (const line of wrappedDesc) {
135
+ console.log(` ${line}`);
136
+ }
137
+ }
138
+ console.log('');
139
+ }
140
+ }
141
+ /**
142
+ * Wrap text to specified width.
143
+ */
144
+ function wrapText(text, width) {
145
+ if (text.length <= width) {
146
+ return [text];
147
+ }
148
+ const words = text.split(' ');
149
+ const lines = [];
150
+ let currentLine = '';
151
+ for (const word of words) {
152
+ if (`${currentLine} ${word}`.trim().length <= width) {
153
+ currentLine = currentLine ? `${currentLine} ${word}` : word;
154
+ }
155
+ else {
156
+ if (currentLine) {
157
+ lines.push(currentLine);
158
+ }
159
+ currentLine = word;
160
+ }
161
+ }
162
+ if (currentLine) {
163
+ lines.push(currentLine);
164
+ }
165
+ return lines;
166
+ }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "escribano",
3
- "version": "0.1.4",
3
+ "version": "0.2.0",
4
4
  "description": "AI-powered session intelligence tool — turn screen recordings into structured work summaries",
5
5
  "main": "dist/index.js",
6
6
  "type": "module",
@@ -11,7 +11,9 @@
11
11
  "dist",
12
12
  "migrations",
13
13
  "prompts",
14
- "scripts"
14
+ "scripts",
15
+ "src/scripts/audio_preprocessor.py",
16
+ "src/scripts/visual_observer_base.py"
15
17
  ],
16
18
  "scripts": {
17
19
  "test": "vitest run",
@@ -30,7 +32,7 @@
30
32
  "dashboard": "node tools/dashboard/server.js",
31
33
  "db:reset": "rm -f ~/.escribano/escribano.db*",
32
34
  "ollama": "OLLAMA_NUM_PARALLEL=4 OLLAMA_MAX_LOADED_MODELS=3 OLLAMA_FLASH_ATTENTION=1 OLLAMA_KEEP_ALIVE=-1 OLLAMA_CONTEXT_LENGTH=262144 ollama serve",
33
- "ollama-2": "OLLAMA_NUM_PARALLEL=1 OLLAMA_HOST=127.0.0.1:11435 OLLAMA_MAX_LOADED_MODELS=1 OLLAMA_FLASH_ATTENTION=1 OLLAMA_KEEP_ALIVE=-1 OLLAMA_CONTEXT_LENGTH=262144 ollama serve",
35
+ "ollama-2": "OLLAMA_NUM_PARALLEL=1 OLLAMA_HOST=127.0.0.1.11435 OLLAMA_MAX_LOADED_MODELS=1 OLLAMA_FLASH_ATTENTION=1 OLLAMA_KEEP_ALIVE=-1 OLLAMA_CONTEXT_LENGTH=262144 ollama serve",
34
36
  "index:rebuild": "tsx --env-file=.env src/scripts/rebuild-index.ts"
35
37
  },
36
38
  "keywords": [
@@ -0,0 +1,109 @@
1
+ #!/usr/bin/env python3
2
+ # /// script
3
+ # requires-python = ">=3.10"
4
+ # dependencies = [
5
+ # "torch>=2.0",
6
+ # "soundfile",
7
+ # "numpy",
8
+ # "silero-vad",
9
+ # ]
10
+ # ///
11
+ """
12
+ Audio Preprocessor - Silero VAD for speech segment extraction.
13
+ Uses soundfile for I/O to avoid torchaudio/torchcodec native dependency issues.
14
+
15
+ Usage:
16
+ uv run audio_preprocessor.py --audio /path/to/audio.wav --output-dir /tmp/segments --output-json /path/to/segments.json
17
+ """
18
+
19
+ import argparse
20
+ import json
21
+ import os
22
+ from pathlib import Path
23
+ import torch
24
+ import soundfile as sf
25
+ import numpy as np
26
+
27
+ def parse_args():
28
+ parser = argparse.ArgumentParser(description="Audio Preprocessor with Silero VAD")
29
+ parser.add_argument("--audio", type=Path, required=True, help="Path to input audio file")
30
+ parser.add_argument("--output-dir", type=Path, required=True, help="Directory to save segment WAV files")
31
+ parser.add_argument("--output-json", type=Path, required=True, help="Path to save segments manifest JSON")
32
+ parser.add_argument("--threshold", type=float, default=0.5, help="VAD threshold (default: 0.5)")
33
+ parser.add_argument("--min-speech-duration-ms", type=int, default=250, help="Min speech duration in ms")
34
+ parser.add_argument("--min-silence-duration-ms", type=int, default=1000, help="Min silence duration in ms")
35
+ return parser.parse_args()
36
+
37
+ def read_audio_sf(path: str, sampling_rate: int = 16000):
38
+ wav, sr = sf.read(path)
39
+ if len(wav.shape) > 1:
40
+ wav = np.mean(wav, axis=1)
41
+ if sr != sampling_rate:
42
+ # Note: We expect the input to be pre-converted by ffmpeg to 16000
43
+ # But if not, we would need a resampler. For now, we assume sr is correct.
44
+ pass
45
+ return torch.from_numpy(wav.astype(np.float32))
46
+
47
+ def main():
48
+ args = parse_args()
49
+
50
+ if not args.audio.exists():
51
+ print(f"Error: Audio file not found: {args.audio}")
52
+ return 1
53
+
54
+ args.output_dir.mkdir(parents=True, exist_ok=True)
55
+
56
+ # Load Silero VAD model
57
+ model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',
58
+ model='silero_vad',
59
+ force_reload=False,
60
+ onnx=False)
61
+
62
+ (get_speech_timestamps, _, _, _, _) = utils
63
+
64
+ # Load audio
65
+ sampling_rate = 16000
66
+ wav = read_audio_sf(str(args.audio), sampling_rate=sampling_rate)
67
+
68
+ # Get speech timestamps
69
+ speech_timestamps = get_speech_timestamps(
70
+ wav,
71
+ model,
72
+ sampling_rate=sampling_rate,
73
+ threshold=args.threshold,
74
+ min_speech_duration_ms=args.min_speech_duration_ms,
75
+ min_silence_duration_ms=args.min_silence_duration_ms
76
+ )
77
+
78
+ segments = []
79
+
80
+ for i, ts in enumerate(speech_timestamps):
81
+ start_sec = ts['start'] / sampling_rate
82
+ end_sec = ts['end'] / sampling_rate
83
+
84
+ # Extract segment
85
+ segment_wav = wav[ts['start']:ts['end']].numpy()
86
+
87
+ # Save segment to WAV using soundfile
88
+ segment_filename = f"segment_{i:04d}.wav"
89
+ segment_path = args.output_dir / segment_filename
90
+
91
+ sf.write(str(segment_path), segment_wav, sampling_rate)
92
+
93
+ segments.append({
94
+ "start": float(start_sec),
95
+ "end": float(end_sec),
96
+ "audioPath": str(segment_path)
97
+ })
98
+
99
+ # Write manifest
100
+ with open(args.output_json, "w") as f:
101
+ json.dump(segments, f, indent=2)
102
+
103
+ print(f"Extracted {len(segments)} speech segments to {args.output_dir}")
104
+ print(f"Manifest written to {args.output_json}")
105
+
106
+ return 0
107
+
108
+ if __name__ == "__main__":
109
+ exit(main())
@@ -0,0 +1,417 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Visual Observer Base - OCR + CLIP indexing for screen recordings.
4
+
5
+ Usage:
6
+ uv run visual_observer_base.py --frames-dir /path/to/frames --output /path/to/visual-index.json
7
+ """
8
+
9
+ import argparse
10
+ import json
11
+ import os
12
+ import time
13
+ from concurrent.futures import ProcessPoolExecutor, as_completed
14
+ from pathlib import Path
15
+ from typing import TypedDict
16
+
17
+ import open_clip
18
+ import pytesseract
19
+ import torch
20
+ from PIL import Image
21
+ from sklearn.cluster import AgglomerativeClustering
22
+
23
+
24
+ # Type definitions
25
+ class FrameData(TypedDict):
26
+ index: int
27
+ timestamp: float
28
+ imagePath: str
29
+ ocrText: str
30
+ clusterId: int
31
+ changeScore: float
32
+
33
+
34
+ class ClusterData(TypedDict):
35
+ id: int
36
+ heuristicLabel: str
37
+ timeRange: tuple[float, float]
38
+ frameCount: int
39
+ representativeIdx: int
40
+ avgOcrCharacters: float
41
+ mediaIndicators: list[str]
42
+
43
+
44
+ class VisualIndex(TypedDict):
45
+ frames: list[FrameData]
46
+ clusters: list[ClusterData]
47
+ processingTime: dict[str, int]
48
+
49
+
50
+ # Constants
51
+ # Prefer MPS for Apple Silicon, fallback to CPU
52
+ DEVICE = "mps" if torch.backends.mps.is_available() else "cpu"
53
+ CLIP_MODEL = "ViT-B-32"
54
+ CLIP_PRETRAINED = "laion2b_s34b_b79k"
55
+ CLUSTER_DISTANCE_THRESHOLD = 0.15 # 1 - 0.85 similarity
56
+
57
+ UI_CATEGORIES = [
58
+ "A screenshot of a code editor showing programming code",
59
+ "A screenshot of a terminal with command line interface",
60
+ "A screenshot of a web browser showing a website",
61
+ "A screenshot of a video player with playback controls",
62
+ "A screenshot of a document or PDF viewer",
63
+ "A screenshot of an image viewer or photo application",
64
+ "A screenshot of a chat or messaging application",
65
+ "A screenshot of a file manager or finder window",
66
+ ]
67
+
68
+ CATEGORY_LABELS = [
69
+ "code-editor",
70
+ "terminal",
71
+ "browser",
72
+ "video-player",
73
+ "document",
74
+ "image-viewer",
75
+ "chat",
76
+ "file-manager",
77
+ ]
78
+
79
+
80
+ def parse_args() -> argparse.Namespace:
81
+ parser = argparse.ArgumentParser(description="Visual Observer Base")
82
+ parser.add_argument("--frames-dir", type=Path, required=True)
83
+ parser.add_argument("--output", type=Path, required=True)
84
+ parser.add_argument("--frame-interval", type=float, default=2.0,
85
+ help="Seconds between frames (default: 2)")
86
+ parser.add_argument("--workers", type=int, default=os.cpu_count(),
87
+ help="Number of parallel OCR workers (default: CPU count)")
88
+ return parser.parse_args()
89
+
90
+
91
+ def load_frames(frames_dir: Path, frame_interval: float) -> list[tuple[int, float, Path]]:
92
+ """Load frame paths and compute timestamps.
93
+
94
+ Args:
95
+ frames_dir: Directory containing frame images
96
+ frame_interval: Seconds between frames (e.g., 2.0 means frame 0 at 0s, frame 1 at 2s)
97
+ """
98
+ frames = []
99
+ # Assumes filenames like scene_0001.jpg
100
+ # Using sorted glob to ensure chronological order
101
+ all_files = sorted(list(frames_dir.glob("*.jpg")))
102
+
103
+ for i, path in enumerate(all_files):
104
+ timestamp = i * frame_interval
105
+ frames.append((i, timestamp, path))
106
+
107
+ return frames
108
+
109
+
110
+ def extract_ocr(image_path: Path) -> str:
111
+ """Extract text from image using Tesseract.
112
+
113
+ Uses PSM 11 (sparse text) which works better for UI screenshots
114
+ where text is scattered across the screen (menus, buttons, tabs, URLs).
115
+ """
116
+ try:
117
+ image = Image.open(image_path)
118
+ # PSM 11: Sparse text - finds text scattered anywhere (UI elements)
119
+ # OEM 3: Default OCR engine mode (LSTM if available)
120
+ custom_config = r'--psm 11 --oem 3'
121
+ text = pytesseract.image_to_string(image, config=custom_config)
122
+ return text.strip()
123
+ except Exception as e:
124
+ print(f" Warning: OCR failed for {image_path.name}: {e}")
125
+ return ""
126
+
127
+
128
+ def extract_ocr_parallel(
129
+ frames: list[tuple[int, float, Path]],
130
+ max_workers: int
131
+ ) -> dict[int, str]:
132
+ """Extract OCR in parallel using multiprocessing.
133
+
134
+ Args:
135
+ frames: List of (index, timestamp, path) tuples
136
+ max_workers: Number of parallel workers
137
+
138
+ Returns:
139
+ Dictionary mapping frame index to OCR text
140
+ """
141
+ results = {}
142
+ total = len(frames)
143
+ completed = 0
144
+
145
+ print(f" Using {max_workers} parallel workers...")
146
+
147
+ with ProcessPoolExecutor(max_workers=max_workers) as executor:
148
+ # Submit all tasks
149
+ future_to_idx = {
150
+ executor.submit(extract_ocr, path): idx
151
+ for idx, _, path in frames
152
+ }
153
+
154
+ # Collect results as they complete
155
+ for future in as_completed(future_to_idx):
156
+ idx = future_to_idx[future]
157
+ try:
158
+ results[idx] = future.result()
159
+ except Exception as e:
160
+ print(f" Warning: OCR failed for frame {idx}: {e}")
161
+ results[idx] = ""
162
+
163
+ completed += 1
164
+ # Progress indicator every 10%
165
+ if completed % max(1, total // 10) == 0:
166
+ pct = (completed / total) * 100
167
+ print(f" OCR progress: {completed}/{total} ({pct:.0f}%)")
168
+
169
+ return results
170
+
171
+
172
+ def compute_clip_embeddings(
173
+ frames: list[tuple[int, float, Path]],
174
+ model,
175
+ preprocess,
176
+ ) -> torch.Tensor:
177
+ """Compute CLIP embeddings for all frames."""
178
+ embeddings = []
179
+
180
+ for _, _, path in frames:
181
+ try:
182
+ image = preprocess(Image.open(path)).unsqueeze(0).to(DEVICE)
183
+
184
+ with torch.no_grad():
185
+ embedding = model.encode_image(image)
186
+ embedding = embedding / embedding.norm(dim=-1, keepdim=True)
187
+
188
+ embeddings.append(embedding.cpu())
189
+ except Exception as e:
190
+ print(f" Warning: CLIP embedding failed for {path.name}: {e}")
191
+ # Use zero vector as fallback to maintain alignment
192
+ embeddings.append(torch.zeros((1, 512)))
193
+
194
+ if not embeddings:
195
+ return torch.zeros((0, 512))
196
+
197
+ return torch.cat(embeddings, dim=0)
198
+
199
+
200
+ def cluster_frames(embeddings: torch.Tensor) -> list[int]:
201
+ """Cluster frames by CLIP embedding similarity."""
202
+ if len(embeddings) < 2:
203
+ return [0] * len(embeddings)
204
+
205
+ clustering = AgglomerativeClustering(
206
+ n_clusters=None, # type: ignore
207
+ distance_threshold=CLUSTER_DISTANCE_THRESHOLD,
208
+ metric="cosine",
209
+ linkage="average",
210
+ )
211
+
212
+ labels = clustering.fit_predict(embeddings.numpy())
213
+ return labels.tolist()
214
+
215
+
216
+ def infer_label_with_clip(
217
+ image_path: Path,
218
+ model,
219
+ preprocess,
220
+ tokenizer,
221
+ ) -> str:
222
+ """Use CLIP zero-shot to classify frame into UI category."""
223
+ try:
224
+ image = preprocess(Image.open(image_path)).unsqueeze(0).to(DEVICE)
225
+ text_tokens = tokenizer(UI_CATEGORIES).to(DEVICE)
226
+
227
+ with torch.no_grad():
228
+ image_features = model.encode_image(image)
229
+ text_features = model.encode_text(text_tokens)
230
+
231
+ image_features = image_features / image_features.norm(dim=-1, keepdim=True)
232
+ text_features = text_features / text_features.norm(dim=-1, keepdim=True)
233
+
234
+ similarity = (100.0 * image_features @ text_features.T).softmax(dim=-1)
235
+ best_idx = similarity.argmax().item()
236
+
237
+ return CATEGORY_LABELS[best_idx]
238
+ except Exception as e:
239
+ print(f" Warning: Zero-shot classification failed for {image_path.name}: {e}")
240
+ return "unknown"
241
+
242
+
243
+ def detect_media_indicators(ocr_text: str) -> list[str]:
244
+ """
245
+ Detect indicators that frame shows media content.
246
+
247
+ TODO: Expand patterns based on real-world testing:
248
+ - Video platforms: Vimeo, Twitch, Netflix, Disney+
249
+ - Image formats: .gif, .webp, .svg, .bmp
250
+ - Media players: VLC, QuickTime, IINA, mpv
251
+ - Streaming: Spotify, Apple Music, SoundCloud
252
+ - Social media: Twitter/X, Instagram, TikTok
253
+ """
254
+ indicators = []
255
+ text_lower = ocr_text.lower()
256
+
257
+ # Video platforms
258
+ if "youtube" in text_lower:
259
+ indicators.append("youtube")
260
+
261
+ if "vimeo" in text_lower:
262
+ indicators.append("vimeo")
263
+
264
+ if "netflix" in text_lower:
265
+ indicators.append("netflix")
266
+
267
+ # Image files
268
+ image_extensions = [".png", ".jpg", ".jpeg", ".gif", ".webp"]
269
+ if any(ext in text_lower for ext in image_extensions):
270
+ indicators.append("image-file")
271
+
272
+ # TODO: Add more patterns after dry-run testing
273
+
274
+ return indicators
275
+
276
+
277
+ def build_cluster_metadata(
278
+ frames_data: list[FrameData],
279
+ cluster_labels: list[int],
280
+ model,
281
+ preprocess,
282
+ tokenizer,
283
+ ) -> list[ClusterData]:
284
+ """Build metadata for each cluster."""
285
+ clusters: dict[int, list[FrameData]] = {}
286
+
287
+ for frame, label in zip(frames_data, cluster_labels):
288
+ if label not in clusters:
289
+ clusters[label] = []
290
+ clusters[label].append(frame)
291
+
292
+ result = []
293
+ for cluster_id, cluster_frames in clusters.items():
294
+ # Find representative (middle frame)
295
+ representative = cluster_frames[len(cluster_frames) // 2]
296
+
297
+ # Compute average OCR characters
298
+ avg_chars = sum(len(f["ocrText"]) for f in cluster_frames) / len(cluster_frames)
299
+
300
+ # Get time range
301
+ timestamps = [f["timestamp"] for f in cluster_frames]
302
+ time_range = (float(min(timestamps)), float(max(timestamps)))
303
+
304
+ # Aggregate media indicators
305
+ all_indicators = set()
306
+ for f in cluster_frames:
307
+ all_indicators.update(detect_media_indicators(f["ocrText"]))
308
+
309
+ # Infer label using CLIP on representative
310
+ rep_path = Path(representative["imagePath"])
311
+ label = infer_label_with_clip(rep_path, model, preprocess, tokenizer)
312
+
313
+ result.append({
314
+ "id": cluster_id,
315
+ "heuristicLabel": label,
316
+ "timeRange": time_range,
317
+ "frameCount": len(cluster_frames),
318
+ "representativeIdx": representative["index"],
319
+ "avgOcrCharacters": avg_chars,
320
+ "mediaIndicators": list(all_indicators),
321
+ })
322
+
323
+ return result
324
+
325
+
326
+ def main():
327
+ args = parse_args()
328
+
329
+ print(f"Loading frames from {args.frames_dir}...")
330
+ frames = load_frames(args.frames_dir, args.frame_interval)
331
+
332
+ if not frames:
333
+ print("Error: No frames found")
334
+ return 1
335
+
336
+ print(f"Found {len(frames)} frames")
337
+
338
+ # Initialize timing
339
+ timing = {"ocrMs": 0, "clipMs": 0, "clusterMs": 0, "totalMs": 0}
340
+ total_start = time.time()
341
+
342
+ # Phase 1: OCR (Parallel)
343
+ print(f"Phase 1: Extracting text with OCR ({args.workers} workers)...")
344
+ ocr_start = time.time()
345
+
346
+ ocr_results = extract_ocr_parallel(frames, args.workers)
347
+
348
+ frames_data: list[FrameData] = []
349
+ for idx, timestamp, path in frames:
350
+ frames_data.append({
351
+ "index": idx,
352
+ "timestamp": timestamp,
353
+ "imagePath": str(path),
354
+ "ocrText": ocr_results.get(idx, ""),
355
+ "clusterId": -1, # Set later
356
+ "changeScore": 0.0, # TODO: Implement pixel delta if needed
357
+ })
358
+
359
+ timing["ocrMs"] = int((time.time() - ocr_start) * 1000)
360
+ print(f" OCR complete: {timing['ocrMs']}ms")
361
+
362
+ # Phase 2: CLIP embeddings
363
+ print(f"Phase 2: Computing CLIP embeddings on {DEVICE}...")
364
+ clip_start = time.time()
365
+
366
+ model, _, preprocess = open_clip.create_model_and_transforms(
367
+ CLIP_MODEL, pretrained=CLIP_PRETRAINED
368
+ )
369
+ model.eval()
370
+ model.to(DEVICE)
371
+ tokenizer = open_clip.get_tokenizer(CLIP_MODEL)
372
+
373
+ embeddings = compute_clip_embeddings(frames, model, preprocess)
374
+ timing["clipMs"] = int((time.time() - clip_start) * 1000)
375
+ print(f" CLIP complete: {timing['clipMs']}ms")
376
+
377
+ # Phase 3: Clustering
378
+ print("Phase 3: Clustering frames...")
379
+ cluster_start = time.time()
380
+
381
+ cluster_labels = cluster_frames(embeddings)
382
+
383
+ # Update frames with cluster IDs
384
+ for frame, label in zip(frames_data, cluster_labels):
385
+ frame["clusterId"] = label
386
+
387
+ timing["clusterMs"] = int((time.time() - cluster_start) * 1000)
388
+ print(f" Clustering complete: {timing['clusterMs']}ms")
389
+
390
+ # Phase 4: Build cluster metadata
391
+ print("Phase 4: Building cluster metadata...")
392
+ clusters = build_cluster_metadata(
393
+ frames_data, cluster_labels, model, preprocess, tokenizer
394
+ )
395
+ print(f" Found {len(clusters)} clusters")
396
+
397
+ timing["totalMs"] = int((time.time() - total_start) * 1000)
398
+
399
+ # Output
400
+ result: VisualIndex = {
401
+ "frames": frames_data,
402
+ "clusters": clusters,
403
+ "processingTime": timing,
404
+ }
405
+
406
+ args.output.parent.mkdir(parents=True, exist_ok=True)
407
+ with open(args.output, "w") as f:
408
+ json.dump(result, f, indent=2)
409
+
410
+ print(f"\nOutput written to {args.output}")
411
+ print(f"Total processing time: {timing['totalMs']}ms")
412
+
413
+ return 0
414
+
415
+
416
+ if __name__ == "__main__":
417
+ exit(main())