escribano 0.1.4 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -7,10 +7,12 @@
7
7
  import { execSync } from 'node:child_process';
8
8
  import { mkdir, readFile, writeFile } from 'node:fs/promises';
9
9
  import { homedir } from 'node:os';
10
- import path from 'node:path';
10
+ import path, { dirname, resolve } from 'node:path';
11
+ import { fileURLToPath } from 'node:url';
11
12
  import { log, step } from '../pipeline/context.js';
12
13
  import { normalizeAppNames } from '../services/app-normalization.js';
13
14
  import { groupTopicBlocksIntoSubjects, saveSubjectsToDatabase, } from '../services/subject-grouping.js';
15
+ const __dirname = dirname(fileURLToPath(import.meta.url));
14
16
  export async function generateArtifactV3(recordingId, repos, intelligence, options) {
15
17
  const format = options.format || 'card';
16
18
  log('info', `[Artifact V3.1] Generating ${format} artifact for recording ${recordingId}...`);
@@ -204,7 +206,7 @@ async function generateLlmArtifact(subjects, groupingResult, format, recording,
204
206
  : format === 'standup'
205
207
  ? 'standup.md'
206
208
  : 'summary-v3.md';
207
- const promptPath = path.join(process.cwd(), 'prompts', promptFileName);
209
+ const promptPath = resolve(__dirname, '..', '..', 'prompts', promptFileName);
208
210
  let promptTemplate;
209
211
  try {
210
212
  promptTemplate = await readFile(promptPath, 'utf-8');
@@ -5,8 +5,10 @@
5
5
  */
6
6
  import { mkdir, readFile, writeFile } from 'node:fs/promises';
7
7
  import { homedir } from 'node:os';
8
- import path from 'node:path';
8
+ import path, { dirname, resolve } from 'node:path';
9
+ import { fileURLToPath } from 'node:url';
9
10
  import { log } from '../pipeline/context.js';
11
+ const __dirname = dirname(fileURLToPath(import.meta.url));
10
12
  /**
11
13
  * Generate a work session summary artifact from processed TopicBlocks.
12
14
  *
@@ -81,7 +83,7 @@ export async function generateSummaryV3(recordingId, repos, intelligence, option
81
83
  */
82
84
  async function generateLlmSummary(sections, recording, intelligence) {
83
85
  // Read prompt template
84
- const promptPath = path.join(process.cwd(), 'prompts', 'summary-v3.md');
86
+ const promptPath = resolve(__dirname, '..', '..', 'prompts', 'summary-v3.md');
85
87
  let promptTemplate;
86
88
  try {
87
89
  promptTemplate = await readFile(promptPath, 'utf-8');
@@ -1,9 +1,12 @@
1
1
  import { exec, spawn } from 'node:child_process';
2
+ import { existsSync } from 'node:fs';
2
3
  import { mkdir, readFile, rm } from 'node:fs/promises';
3
4
  import os from 'node:os';
4
- import path from 'node:path';
5
+ import path, { dirname, resolve } from 'node:path';
6
+ import { fileURLToPath } from 'node:url';
5
7
  import { promisify } from 'node:util';
6
8
  const execAsync = promisify(exec);
9
+ const __dirname = dirname(fileURLToPath(import.meta.url));
7
10
  export function createSileroPreprocessor() {
8
11
  let currentProcess = null;
9
12
  return {
@@ -19,19 +22,63 @@ export function createSileroPreprocessor() {
19
22
  catch (error) {
20
23
  throw new Error(`Failed to pre-convert audio for VAD: ${error.message}`);
21
24
  }
22
- const scriptPath = path.join(process.cwd(), 'src', 'scripts', 'audio_preprocessor.py');
25
+ const scriptPath = resolve(__dirname, '..', '..', 'src', 'scripts', 'audio_preprocessor.py');
26
+ if (!existsSync(scriptPath)) {
27
+ throw new Error(`Audio preprocessor script not found at: ${scriptPath}`);
28
+ }
23
29
  const command = `uv run "${scriptPath}" --audio "${inputWavPath}" --output-dir "${tempDir}" --output-json "${manifestPath}"`;
24
30
  try {
25
31
  console.log(`Running Silero VAD on ${inputWavPath}...`);
32
+ if (process.env.ESCRIBANO_VERBOSE === 'true') {
33
+ console.log(` Script path: ${scriptPath}`);
34
+ console.log(` Script exists: ${existsSync(scriptPath)}`);
35
+ console.log(` Command: ${command}`);
36
+ console.log(` Working directory (user): ${process.cwd()}`);
37
+ try {
38
+ const { stdout: uvVersion } = await execAsync('uv --version');
39
+ console.log(` uv version: ${uvVersion.trim()}`);
40
+ }
41
+ catch {
42
+ console.log(` uv version: NOT FOUND`);
43
+ }
44
+ }
26
45
  currentProcess = spawn('sh', ['-c', command]);
46
+ let stderr = '';
47
+ let stdout = '';
48
+ if (currentProcess.stderr) {
49
+ currentProcess.stderr.on('data', (data) => {
50
+ stderr += data.toString();
51
+ });
52
+ }
53
+ if (currentProcess.stdout) {
54
+ currentProcess.stdout.on('data', (data) => {
55
+ stdout += data.toString();
56
+ });
57
+ }
27
58
  await new Promise((resolve, reject) => {
28
59
  currentProcess?.on('close', (code) => {
29
60
  currentProcess = null;
30
61
  if (code === 0) {
62
+ if (process.env.ESCRIBANO_VERBOSE === 'true' && stdout) {
63
+ console.log(` Silero VAD stdout:\n${stdout
64
+ .split('\n')
65
+ .map((l) => ' ' + l)
66
+ .join('\n')}`);
67
+ }
31
68
  resolve();
32
69
  }
33
70
  else {
34
- reject(new Error(`Silero VAD failed with code ${code}`));
71
+ console.error(` Silero VAD stderr:\n${stderr
72
+ .split('\n')
73
+ .map((l) => ' ' + l)
74
+ .join('\n')}`);
75
+ if (stdout) {
76
+ console.error(` Silero VAD stdout:\n${stdout
77
+ .split('\n')
78
+ .map((l) => ' ' + l)
79
+ .join('\n')}`);
80
+ }
81
+ reject(new Error(`Silero VAD failed with code ${code}: ${stderr || stdout || 'No output captured'}`));
35
82
  }
36
83
  });
37
84
  currentProcess?.on('error', (err) => {
@@ -4,12 +4,14 @@
4
4
  * Implements IntelligenceService using Ollama REST API
5
5
  */
6
6
  import { readFileSync } from 'node:fs';
7
- import { join } from 'node:path';
7
+ import { dirname, resolve } from 'node:path';
8
+ import { fileURLToPath } from 'node:url';
8
9
  import { Agent, fetch as undiciFetch } from 'undici';
9
10
  import { z } from 'zod';
10
11
  import { classificationSchema, intelligenceConfigSchema, transcriptMetadataSchema, } from '../0_types.js';
11
12
  // Debug logging controlled by environment variable
12
13
  const DEBUG_OLLAMA = process.env.ESCRIBANO_DEBUG_OLLAMA === 'true';
14
+ const __dirname = dirname(fileURLToPath(import.meta.url));
13
15
  // TODO: put in an util
14
16
  export function debugLog(...args) {
15
17
  if (DEBUG_OLLAMA) {
@@ -193,7 +195,7 @@ async function classifySegmentWithOllama(segment, config, transcript) {
193
195
  return raw;
194
196
  }
195
197
  function loadClassifySegmentPrompt(segment, transcript) {
196
- const promptPath = join(process.cwd(), 'prompts', 'classify-segment.md');
198
+ const promptPath = resolve(__dirname, '..', '..', 'prompts', 'classify-segment.md');
197
199
  let prompt = readFileSync(promptPath, 'utf-8');
198
200
  const timeRangeStr = `[${segment.timeRange[0]}s - ${segment.timeRange[1]}s]`;
199
201
  const ocrContext = segment.contexts.map((c) => `${c.type}: ${c.value}`).join(', ') || 'None';
@@ -208,7 +210,7 @@ function loadClassifySegmentPrompt(segment, transcript) {
208
210
  return prompt;
209
211
  }
210
212
  function loadClassifyPrompt(transcript, visualLogs) {
211
- const promptPath = join(process.cwd(), 'prompts', 'classify.md');
213
+ const promptPath = resolve(__dirname, '..', '..', 'prompts', 'classify.md');
212
214
  let prompt = readFileSync(promptPath, 'utf-8');
213
215
  const segmentsText = transcript.segments
214
216
  .map((seg) => `[seg-${seg.id}] [${seg.start}s - ${seg.end}s] ${seg.text}`)
@@ -241,7 +243,7 @@ function loadClassifyPrompt(transcript, visualLogs) {
241
243
  */
242
244
  function buildVLMSingleImagePrompt() {
243
245
  try {
244
- const promptPath = join(process.cwd(), 'prompts', 'vlm-single.md');
246
+ const promptPath = resolve(__dirname, '..', '..', 'prompts', 'vlm-single.md');
245
247
  return readFileSync(promptPath, 'utf-8');
246
248
  }
247
249
  catch {
@@ -426,7 +428,7 @@ async function extractTopicsWithOllama(observations, config) {
426
428
  return [];
427
429
  let prompt;
428
430
  try {
429
- const promptPath = join(process.cwd(), 'prompts', 'topic-extract.md');
431
+ const promptPath = resolve(__dirname, '..', '..', 'prompts', 'topic-extract.md');
430
432
  const template = readFileSync(promptPath, 'utf-8');
431
433
  prompt = template.replace('{{OBSERVATIONS}}', textSamples.join('\n---\n'));
432
434
  }
@@ -657,7 +659,7 @@ async function extractMetadata(transcript, classification, config, visualLogs) {
657
659
  return raw;
658
660
  }
659
661
  function loadMetadataPrompt(transcript, classification, visualLogs) {
660
- const promptPath = join(process.cwd(), 'prompts', 'extract-metadata.md');
662
+ const promptPath = resolve(__dirname, '..', '..', 'prompts', 'extract-metadata.md');
661
663
  let prompt = readFileSync(promptPath, 'utf-8');
662
664
  const classificationSummary = Object.entries(classification)
663
665
  .filter(([_, score]) => score >= 25)
@@ -698,7 +700,7 @@ async function generateArtifact(artifactType, context, config) {
698
700
  return response;
699
701
  }
700
702
  function loadArtifactPrompt(artifactType, context) {
701
- const promptPath = join(process.cwd(), 'prompts', `${artifactType}.md`);
703
+ const promptPath = resolve(__dirname, '..', '..', 'prompts', `${artifactType}.md`);
702
704
  let prompt = readFileSync(promptPath, 'utf-8');
703
705
  // TODO: Implement robust transcript cleaning (Milestone 4)
704
706
  prompt = prompt.replace('{{TRANSCRIPT_ALL}}', context.transcript.fullText);
@@ -5,12 +5,15 @@
5
5
  * Used for extracting screenshots and detecting scene changes.
6
6
  */
7
7
  import { exec, spawn } from 'node:child_process';
8
+ import { existsSync } from 'node:fs';
8
9
  import { mkdir, readdir, readFile, rm } from 'node:fs/promises';
9
10
  import os from 'node:os';
10
- import path from 'node:path';
11
+ import path, { dirname, resolve } from 'node:path';
12
+ import { fileURLToPath } from 'node:url';
11
13
  import { promisify } from 'node:util';
12
14
  import { debugLog } from './intelligence.ollama.adapter.js';
13
15
  const execAsync = promisify(exec);
16
+ const __dirname = dirname(fileURLToPath(import.meta.url));
14
17
  // Scene detection configuration (with env var overrides)
15
18
  // Lower threshold = more sensitive = more scene changes detected
16
19
  // Examples: 0.3 (sensitive), 0.4 (default), 0.5 (conservative)
@@ -268,15 +271,16 @@ export function createFfmpegVideoService() {
268
271
  * OCR is parallelized across all available CPU cores.
269
272
  */
270
273
  runVisualIndexing: async (framesDir, outputPath) => {
271
- const scriptPath = path.join(process.cwd(), 'src', 'scripts', 'visual_observer_base.py');
274
+ const scriptPath = resolve(__dirname, '..', '..', 'src', 'scripts', 'visual_observer_base.py');
275
+ if (!existsSync(scriptPath)) {
276
+ throw new Error(`Visual observer script not found at: ${scriptPath}`);
277
+ }
272
278
  const frameInterval = Number(process.env.ESCRIBANO_FRAME_INTERVAL) || 2;
273
279
  const workers = os.cpus().length;
274
- // Use uv run to execute the script with its environment
275
- // --workers enables parallel OCR processing
276
280
  const command = `uv run "${scriptPath}" --frames-dir "${framesDir}" --output "${outputPath}" --frame-interval ${frameInterval} --workers ${workers}`;
277
281
  try {
278
282
  await execAsync(command, {
279
- cwd: path.join(process.cwd(), 'src', 'scripts'),
283
+ cwd: dirname(scriptPath),
280
284
  });
281
285
  const content = await readFile(outputPath, 'utf-8');
282
286
  return JSON.parse(content);
package/dist/index.js CHANGED
@@ -12,6 +12,7 @@ import { createFilesystemCaptureSource } from './adapters/capture.filesystem.ada
12
12
  import { cleanupMlxBridge, initializeSystem, processVideo, } from './batch-context.js';
13
13
  import { getDbPath } from './db/index.js';
14
14
  import { checkPrerequisites, hasMissingPrerequisites, printDoctorResults, } from './prerequisites.js';
15
+ import { logEnvironmentVariables } from './utils/env-logger.js';
15
16
  const MODELS_DIR = path.join(homedir(), '.escribano', 'models');
16
17
  const MODEL_FILE = 'ggml-large-v3.bin';
17
18
  const MODEL_PATH = path.join(MODELS_DIR, MODEL_FILE);
@@ -96,6 +97,8 @@ Output: Markdown summary saved to ~/.escribano/artifacts/
96
97
  }
97
98
  async function run(args) {
98
99
  const { force, file: filePath, skipSummary, micAudio, systemAudio, format, includePersonal, copyToClipboard, printToStdout, } = args;
100
+ // Log environment variables if verbose mode is enabled
101
+ logEnvironmentVariables();
99
102
  // Initialize system (reuses batch-context for consistency)
100
103
  console.log('Initializing database...');
101
104
  const ctx = await initializeSystem();
@@ -5,7 +5,9 @@
5
5
  * This is the foundation for the new artifact architecture.
6
6
  */
7
7
  import { readFileSync } from 'node:fs';
8
- import { join } from 'node:path';
8
+ import { dirname, resolve } from 'node:path';
9
+ import { fileURLToPath } from 'node:url';
10
+ const __dirname = dirname(fileURLToPath(import.meta.url));
9
11
  const PERSONAL_APPS = new Set([
10
12
  'WhatsApp',
11
13
  'Instagram',
@@ -144,7 +146,7 @@ ID: ${b.id}`;
144
146
  : `"${blockIdList[0]}"`;
145
147
  let template;
146
148
  try {
147
- const promptPath = join(process.cwd(), 'prompts', 'subject-grouping.md');
149
+ const promptPath = resolve(__dirname, '..', '..', 'prompts', 'subject-grouping.md');
148
150
  template = readFileSync(promptPath, 'utf-8');
149
151
  }
150
152
  catch {
@@ -0,0 +1,262 @@
1
+ import { readFileSync } from 'node:fs';
2
+ import { beforeEach, describe, expect, it, vi } from 'vitest';
3
+ // Mock fs module
4
+ vi.mock('node:fs', () => ({
5
+ readFileSync: vi.fn(),
6
+ }));
7
+ // Mock process.env
8
+ const originalEnv = { ...process.env };
9
+ beforeEach(() => {
10
+ vi.clearAllMocks();
11
+ // Reset process.env
12
+ process.env = { ...originalEnv };
13
+ });
14
+ // Import after mocking
15
+ const { logEnvironmentVariables } = await import('../../utils/env-logger.js');
16
+ describe('Environment Variable Logger', () => {
17
+ describe('parseEnvExample', () => {
18
+ it('parses simple variable with description', async () => {
19
+ const mockContent = `# Enable verbose logging
20
+ ESCRIBANO_VERBOSE=false`;
21
+ vi.mocked(readFileSync).mockReturnValue(mockContent);
22
+ // Call the function indirectly via logEnvironmentVariables
23
+ process.env.ESCRIBANO_VERBOSE = 'true';
24
+ const consoleSpy = vi.spyOn(console, 'log').mockImplementation(() => { });
25
+ logEnvironmentVariables();
26
+ expect(consoleSpy).toHaveBeenCalled();
27
+ const output = consoleSpy.mock.calls
28
+ .map((call) => call.join(' '))
29
+ .join('\n');
30
+ expect(output).toContain('ESCRIBANO_VERBOSE');
31
+ expect(output).toContain('Enable verbose logging');
32
+ consoleSpy.mockRestore();
33
+ });
34
+ it('skips section headers', async () => {
35
+ const mockContent = `# === Frame Extraction ===
36
+ # Output frame width
37
+ ESCRIBANO_FRAME_WIDTH=1024`;
38
+ vi.mocked(readFileSync).mockReturnValue(mockContent);
39
+ process.env.ESCRIBANO_VERBOSE = 'true';
40
+ const consoleSpy = vi.spyOn(console, 'log').mockImplementation(() => { });
41
+ logEnvironmentVariables();
42
+ const output = consoleSpy.mock.calls
43
+ .map((call) => call.join(' '))
44
+ .join('\n');
45
+ expect(output).toContain('ESCRIBANO_FRAME_WIDTH');
46
+ expect(output).toContain('Output frame width');
47
+ expect(output).not.toContain('Frame Extraction');
48
+ consoleSpy.mockRestore();
49
+ });
50
+ it('skips commented/deprecated variables', async () => {
51
+ const mockContent = `# Active variable
52
+ ESCRIBANO_VERBOSE=false
53
+ # ESCRIBANO_DEPRECATED=value`;
54
+ vi.mocked(readFileSync).mockReturnValue(mockContent);
55
+ process.env.ESCRIBANO_VERBOSE = 'true';
56
+ const consoleSpy = vi.spyOn(console, 'log').mockImplementation(() => { });
57
+ logEnvironmentVariables();
58
+ const output = consoleSpy.mock.calls
59
+ .map((call) => call.join(' '))
60
+ .join('\n');
61
+ expect(output).toContain('ESCRIBANO_VERBOSE');
62
+ expect(output).not.toContain('ESCRIBANO_DEPRECATED');
63
+ consoleSpy.mockRestore();
64
+ });
65
+ it('skips non-ESCRIBANO variables', async () => {
66
+ const mockContent = `ESCRIBANO_VERBOSE=false
67
+ OTHER_VAR=value`;
68
+ vi.mocked(readFileSync).mockReturnValue(mockContent);
69
+ process.env.ESCRIBANO_VERBOSE = 'true';
70
+ const consoleSpy = vi.spyOn(console, 'log').mockImplementation(() => { });
71
+ logEnvironmentVariables();
72
+ const output = consoleSpy.mock.calls
73
+ .map((call) => call.join(' '))
74
+ .join('\n');
75
+ expect(output).toContain('ESCRIBANO_VERBOSE');
76
+ expect(output).not.toContain('OTHER_VAR');
77
+ consoleSpy.mockRestore();
78
+ });
79
+ it('handles empty file gracefully', async () => {
80
+ vi.mocked(readFileSync).mockReturnValue('');
81
+ process.env.ESCRIBANO_VERBOSE = 'true';
82
+ const consoleSpy = vi.spyOn(console, 'log').mockImplementation(() => { });
83
+ logEnvironmentVariables();
84
+ const output = consoleSpy.mock.calls
85
+ .map((call) => call.join(' '))
86
+ .join('\n');
87
+ expect(output).toContain('Environment Variables');
88
+ consoleSpy.mockRestore();
89
+ });
90
+ it('handles file not found gracefully', async () => {
91
+ vi.mocked(readFileSync).mockImplementation(() => {
92
+ throw new Error('ENOENT');
93
+ });
94
+ process.env.ESCRIBANO_VERBOSE = 'true';
95
+ const consoleSpy = vi.spyOn(console, 'log').mockImplementation(() => { });
96
+ logEnvironmentVariables();
97
+ const output = consoleSpy.mock.calls
98
+ .map((call) => call.join(' '))
99
+ .join('\n');
100
+ expect(output).toContain('Could not parse .env.example');
101
+ consoleSpy.mockRestore();
102
+ });
103
+ });
104
+ describe('logEnvironmentVariables', () => {
105
+ it('does not log when ESCRIBANO_VERBOSE is false', () => {
106
+ process.env.ESCRIBANO_VERBOSE = 'false';
107
+ const consoleSpy = vi.spyOn(console, 'log').mockImplementation(() => { });
108
+ logEnvironmentVariables();
109
+ expect(consoleSpy).not.toHaveBeenCalled();
110
+ consoleSpy.mockRestore();
111
+ });
112
+ it('does not log when ESCRIBANO_VERBOSE is not set', () => {
113
+ delete process.env.ESCRIBANO_VERBOSE;
114
+ const consoleSpy = vi.spyOn(console, 'log').mockImplementation(() => { });
115
+ logEnvironmentVariables();
116
+ expect(consoleSpy).not.toHaveBeenCalled();
117
+ consoleSpy.mockRestore();
118
+ });
119
+ it('logs when ESCRIBANO_VERBOSE is true', () => {
120
+ const mockContent = 'ESCRIBANO_VERBOSE=false';
121
+ vi.mocked(readFileSync).mockReturnValue(mockContent);
122
+ process.env.ESCRIBANO_VERBOSE = 'true';
123
+ const consoleSpy = vi.spyOn(console, 'log').mockImplementation(() => { });
124
+ logEnvironmentVariables();
125
+ expect(consoleSpy).toHaveBeenCalled();
126
+ consoleSpy.mockRestore();
127
+ });
128
+ it('marks custom values with [CUSTOM]', () => {
129
+ const mockContent = `# Default batch size
130
+ ESCRIBANO_VLM_BATCH_SIZE=4`;
131
+ vi.mocked(readFileSync).mockReturnValue(mockContent);
132
+ process.env.ESCRIBANO_VERBOSE = 'true';
133
+ process.env.ESCRIBANO_VLM_BATCH_SIZE = '8';
134
+ const consoleSpy = vi.spyOn(console, 'log').mockImplementation(() => { });
135
+ logEnvironmentVariables();
136
+ const output = consoleSpy.mock.calls
137
+ .map((call) => call.join(' '))
138
+ .join('\n');
139
+ expect(output).toContain('[CUSTOM]');
140
+ expect(output).toContain('Current: 8');
141
+ expect(output).toContain('Default: 4');
142
+ consoleSpy.mockRestore();
143
+ });
144
+ it('does not mark default values', () => {
145
+ const mockContent = `# Default batch size
146
+ ESCRIBANO_VLM_BATCH_SIZE=4`;
147
+ vi.mocked(readFileSync).mockReturnValue(mockContent);
148
+ process.env.ESCRIBANO_VERBOSE = 'true';
149
+ process.env.ESCRIBANO_VLM_BATCH_SIZE = '4';
150
+ const consoleSpy = vi.spyOn(console, 'log').mockImplementation(() => { });
151
+ logEnvironmentVariables();
152
+ const output = consoleSpy.mock.calls
153
+ .map((call) => call.join(' '))
154
+ .join('\n');
155
+ expect(output).not.toContain('[CUSTOM]');
156
+ consoleSpy.mockRestore();
157
+ });
158
+ it('masks secret tokens', () => {
159
+ const mockContent = `ESCRIBANO_OUTLINE_TOKEN=`;
160
+ vi.mocked(readFileSync).mockReturnValue(mockContent);
161
+ process.env.ESCRIBANO_VERBOSE = 'true';
162
+ process.env.ESCRIBANO_OUTLINE_TOKEN = 'secret-api-key-123';
163
+ const consoleSpy = vi.spyOn(console, 'log').mockImplementation(() => { });
164
+ logEnvironmentVariables();
165
+ const output = consoleSpy.mock.calls
166
+ .map((call) => call.join(' '))
167
+ .join('\n');
168
+ expect(output).toContain('***');
169
+ expect(output).not.toContain('secret-api-key-123');
170
+ consoleSpy.mockRestore();
171
+ });
172
+ it('does not mask non-secret values', () => {
173
+ const mockContent = `ESCRIBANO_VLM_BATCH_SIZE=4`;
174
+ vi.mocked(readFileSync).mockReturnValue(mockContent);
175
+ process.env.ESCRIBANO_VERBOSE = 'true';
176
+ process.env.ESCRIBANO_VLM_BATCH_SIZE = '8';
177
+ const consoleSpy = vi.spyOn(console, 'log').mockImplementation(() => { });
178
+ logEnvironmentVariables();
179
+ const output = consoleSpy.mock.calls
180
+ .map((call) => call.join(' '))
181
+ .join('\n');
182
+ expect(output).toContain('8');
183
+ expect(output).not.toContain('***');
184
+ consoleSpy.mockRestore();
185
+ });
186
+ it('shows "not set" for undefined variables', () => {
187
+ const mockContent = `ESCRIBANO_VLM_BATCH_SIZE=4`;
188
+ vi.mocked(readFileSync).mockReturnValue(mockContent);
189
+ process.env.ESCRIBANO_VERBOSE = 'true';
190
+ delete process.env.ESCRIBANO_VLM_BATCH_SIZE;
191
+ const consoleSpy = vi.spyOn(console, 'log').mockImplementation(() => { });
192
+ logEnvironmentVariables();
193
+ const output = consoleSpy.mock.calls
194
+ .map((call) => call.join(' '))
195
+ .join('\n');
196
+ expect(output).toContain('not set');
197
+ consoleSpy.mockRestore();
198
+ });
199
+ it('shows "(empty)" for empty default values', () => {
200
+ const mockContent = `ESCRIBANO_OUTLINE_TOKEN=`;
201
+ vi.mocked(readFileSync).mockReturnValue(mockContent);
202
+ process.env.ESCRIBANO_VERBOSE = 'true';
203
+ const consoleSpy = vi.spyOn(console, 'log').mockImplementation(() => { });
204
+ logEnvironmentVariables();
205
+ const output = consoleSpy.mock.calls
206
+ .map((call) => call.join(' '))
207
+ .join('\n');
208
+ expect(output).toContain('(empty)');
209
+ consoleSpy.mockRestore();
210
+ });
211
+ it('sorts variables alphabetically', () => {
212
+ const mockContent = `ESCRIBANO_ZEBRA=1
213
+ ESCRIBANO_ALPHA=2
214
+ ESCRIBANO_MIDDLE=3`;
215
+ vi.mocked(readFileSync).mockReturnValue(mockContent);
216
+ process.env.ESCRIBANO_VERBOSE = 'true';
217
+ const consoleSpy = vi.spyOn(console, 'log').mockImplementation(() => { });
218
+ logEnvironmentVariables();
219
+ const output = consoleSpy.mock.calls
220
+ .map((call) => call.join(' '))
221
+ .join('\n');
222
+ const alphaIndex = output.indexOf('ESCRIBANO_ALPHA');
223
+ const middleIndex = output.indexOf('ESCRIBANO_MIDDLE');
224
+ const zebraIndex = output.indexOf('ESCRIBANO_ZEBRA');
225
+ expect(alphaIndex).toBeLessThan(middleIndex);
226
+ expect(middleIndex).toBeLessThan(zebraIndex);
227
+ consoleSpy.mockRestore();
228
+ });
229
+ it('includes multi-line descriptions', () => {
230
+ const mockContent = `# First line of description
231
+ # Second line of description
232
+ ESCRIBANO_VERBOSE=false`;
233
+ vi.mocked(readFileSync).mockReturnValue(mockContent);
234
+ process.env.ESCRIBANO_VERBOSE = 'true';
235
+ const consoleSpy = vi.spyOn(console, 'log').mockImplementation(() => { });
236
+ logEnvironmentVariables();
237
+ const output = consoleSpy.mock.calls
238
+ .map((call) => call.join(' '))
239
+ .join('\n');
240
+ expect(output).toContain('First line of description');
241
+ expect(output).toContain('Second line of description');
242
+ consoleSpy.mockRestore();
243
+ });
244
+ });
245
+ describe('text wrapping', () => {
246
+ it('wraps long descriptions to fit width', () => {
247
+ const longDescription = 'This is a very long description that should be wrapped across multiple lines to fit within the specified width limit for better readability in the console output';
248
+ const mockContent = `# ${longDescription}
249
+ ESCRIBANO_VERBOSE=false`;
250
+ vi.mocked(readFileSync).mockReturnValue(mockContent);
251
+ process.env.ESCRIBANO_VERBOSE = 'true';
252
+ const consoleSpy = vi.spyOn(console, 'log').mockImplementation(() => { });
253
+ logEnvironmentVariables();
254
+ // The description should appear in the output
255
+ const output = consoleSpy.mock.calls
256
+ .map((call) => call.join(' '))
257
+ .join('\n');
258
+ expect(output).toContain('very long description');
259
+ consoleSpy.mockRestore();
260
+ });
261
+ });
262
+ });
@@ -0,0 +1,166 @@
1
+ /**
2
+ * Environment Variable Logger
3
+ *
4
+ * Parses .env.example to extract default values and descriptions,
5
+ * then logs all ESCRIBANO_* environment variables with comparisons
6
+ * to their defaults. Only logs when ESCRIBANO_VERBOSE=true.
7
+ */
8
+ import { readFileSync } from 'node:fs';
9
+ import { resolve } from 'node:path';
10
+ const SECRET_VARS = ['ESCRIBANO_OUTLINE_TOKEN'];
11
+ /**
12
+ * Parse .env.example file to extract variable names, defaults, and descriptions.
13
+ * Returns empty array if file not found.
14
+ */
15
+ function parseEnvExample() {
16
+ const envExamplePath = new URL('../../.env.example', import.meta.url);
17
+ let content;
18
+ try {
19
+ content = readFileSync(envExamplePath, 'utf-8');
20
+ }
21
+ catch {
22
+ // File not found or unreadable
23
+ return [];
24
+ }
25
+ const vars = [];
26
+ const lines = content.split('\n');
27
+ let currentDescription = [];
28
+ for (const line of lines) {
29
+ const trimmedLine = line.trim();
30
+ // Skip empty lines - reset description
31
+ if (trimmedLine === '') {
32
+ currentDescription = [];
33
+ continue;
34
+ }
35
+ // Comment line
36
+ if (trimmedLine.startsWith('#')) {
37
+ const commentContent = trimmedLine.slice(1).trim();
38
+ // Skip section headers (pattern: # ===)
39
+ if (commentContent.startsWith('===')) {
40
+ currentDescription = [];
41
+ continue;
42
+ }
43
+ // Skip deprecated section marker
44
+ if (commentContent.toLowerCase().includes('deprecated')) {
45
+ currentDescription = [];
46
+ continue;
47
+ }
48
+ // Accumulate description
49
+ currentDescription.push(commentContent);
50
+ continue;
51
+ }
52
+ // Variable line (contains =)
53
+ if (trimmedLine.includes('=')) {
54
+ const eqIndex = trimmedLine.indexOf('=');
55
+ const name = trimmedLine.slice(0, eqIndex).trim();
56
+ const value = trimmedLine.slice(eqIndex + 1).trim();
57
+ // Skip if name starts with # (deprecated/commented)
58
+ if (name.startsWith('#')) {
59
+ currentDescription = [];
60
+ continue;
61
+ }
62
+ // Only track ESCRIBANO_* variables
63
+ if (name.startsWith('ESCRIBANO_')) {
64
+ vars.push({
65
+ name,
66
+ defaultValue: value,
67
+ description: currentDescription.join(' '),
68
+ });
69
+ }
70
+ currentDescription = [];
71
+ }
72
+ }
73
+ return vars;
74
+ }
75
+ /**
76
+ * Check if a variable should be masked (secret).
77
+ */
78
+ function isSecretVar(name) {
79
+ return SECRET_VARS.includes(name);
80
+ }
81
+ /**
82
+ * Format value for display, masking secrets if needed.
83
+ */
84
+ function formatValue(value, isSecret) {
85
+ if (value === 'not set') {
86
+ return 'not set';
87
+ }
88
+ if (isSecret && value !== 'not set' && value !== '') {
89
+ return '***';
90
+ }
91
+ if (value === '') {
92
+ return '(empty)';
93
+ }
94
+ return value;
95
+ }
96
+ /**
97
+ * Main logging function. Only runs when ESCRIBANO_VERBOSE=true.
98
+ */
99
+ export function logEnvironmentVariables() {
100
+ if (process.env.ESCRIBANO_VERBOSE !== 'true') {
101
+ return;
102
+ }
103
+ const envVars = parseEnvExample();
104
+ if (envVars.length === 0) {
105
+ console.log('\n=== Environment Variables ===');
106
+ console.log(' (Could not parse .env.example)\n');
107
+ return;
108
+ }
109
+ // Build list of vars with their current values
110
+ const varsWithValues = envVars.map((varDef) => {
111
+ const currentValue = process.env[varDef.name] ?? 'not set';
112
+ const isCustom = currentValue !== varDef.defaultValue && currentValue !== 'not set';
113
+ return {
114
+ ...varDef,
115
+ currentValue,
116
+ isCustom,
117
+ isSecret: isSecretVar(varDef.name),
118
+ };
119
+ });
120
+ // Sort alphabetically by name
121
+ varsWithValues.sort((a, b) => a.name.localeCompare(b.name));
122
+ // Log output
123
+ console.log('\n=== Environment Variables ===\n');
124
+ for (const varDef of varsWithValues) {
125
+ const marker = varDef.isCustom ? ' [CUSTOM]' : '';
126
+ const displayCurrent = formatValue(varDef.currentValue, varDef.isSecret);
127
+ const displayDefault = formatValue(varDef.defaultValue, false);
128
+ console.log(` ${varDef.name}${marker}`);
129
+ console.log(` Current: ${displayCurrent}`);
130
+ console.log(` Default: ${displayDefault}`);
131
+ if (varDef.description) {
132
+ // Wrap description to fit nicely (max ~60 chars per line)
133
+ const wrappedDesc = wrapText(varDef.description, 58);
134
+ for (const line of wrappedDesc) {
135
+ console.log(` ${line}`);
136
+ }
137
+ }
138
+ console.log('');
139
+ }
140
+ }
141
+ /**
142
+ * Wrap text to specified width.
143
+ */
144
+ function wrapText(text, width) {
145
+ if (text.length <= width) {
146
+ return [text];
147
+ }
148
+ const words = text.split(' ');
149
+ const lines = [];
150
+ let currentLine = '';
151
+ for (const word of words) {
152
+ if (`${currentLine} ${word}`.trim().length <= width) {
153
+ currentLine = currentLine ? `${currentLine} ${word}` : word;
154
+ }
155
+ else {
156
+ if (currentLine) {
157
+ lines.push(currentLine);
158
+ }
159
+ currentLine = word;
160
+ }
161
+ }
162
+ if (currentLine) {
163
+ lines.push(currentLine);
164
+ }
165
+ return lines;
166
+ }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "escribano",
3
- "version": "0.1.4",
3
+ "version": "0.2.0",
4
4
  "description": "AI-powered session intelligence tool — turn screen recordings into structured work summaries",
5
5
  "main": "dist/index.js",
6
6
  "type": "module",
@@ -11,7 +11,9 @@
11
11
  "dist",
12
12
  "migrations",
13
13
  "prompts",
14
- "scripts"
14
+ "scripts",
15
+ "src/scripts/audio_preprocessor.py",
16
+ "src/scripts/visual_observer_base.py"
15
17
  ],
16
18
  "scripts": {
17
19
  "test": "vitest run",
@@ -30,7 +32,7 @@
30
32
  "dashboard": "node tools/dashboard/server.js",
31
33
  "db:reset": "rm -f ~/.escribano/escribano.db*",
32
34
  "ollama": "OLLAMA_NUM_PARALLEL=4 OLLAMA_MAX_LOADED_MODELS=3 OLLAMA_FLASH_ATTENTION=1 OLLAMA_KEEP_ALIVE=-1 OLLAMA_CONTEXT_LENGTH=262144 ollama serve",
33
- "ollama-2": "OLLAMA_NUM_PARALLEL=1 OLLAMA_HOST=127.0.0.1:11435 OLLAMA_MAX_LOADED_MODELS=1 OLLAMA_FLASH_ATTENTION=1 OLLAMA_KEEP_ALIVE=-1 OLLAMA_CONTEXT_LENGTH=262144 ollama serve",
35
+ "ollama-2": "OLLAMA_NUM_PARALLEL=1 OLLAMA_HOST=127.0.0.1.11435 OLLAMA_MAX_LOADED_MODELS=1 OLLAMA_FLASH_ATTENTION=1 OLLAMA_KEEP_ALIVE=-1 OLLAMA_CONTEXT_LENGTH=262144 ollama serve",
34
36
  "index:rebuild": "tsx --env-file=.env src/scripts/rebuild-index.ts"
35
37
  },
36
38
  "keywords": [
@@ -0,0 +1,109 @@
1
+ #!/usr/bin/env python3
2
+ # /// script
3
+ # requires-python = ">=3.10"
4
+ # dependencies = [
5
+ # "torch>=2.0",
6
+ # "soundfile",
7
+ # "numpy",
8
+ # "silero-vad",
9
+ # ]
10
+ # ///
11
+ """
12
+ Audio Preprocessor - Silero VAD for speech segment extraction.
13
+ Uses soundfile for I/O to avoid torchaudio/torchcodec native dependency issues.
14
+
15
+ Usage:
16
+ uv run audio_preprocessor.py --audio /path/to/audio.wav --output-dir /tmp/segments --output-json /path/to/segments.json
17
+ """
18
+
19
+ import argparse
20
+ import json
21
+ import os
22
+ from pathlib import Path
23
+ import torch
24
+ import soundfile as sf
25
+ import numpy as np
26
+
27
+ def parse_args():
28
+ parser = argparse.ArgumentParser(description="Audio Preprocessor with Silero VAD")
29
+ parser.add_argument("--audio", type=Path, required=True, help="Path to input audio file")
30
+ parser.add_argument("--output-dir", type=Path, required=True, help="Directory to save segment WAV files")
31
+ parser.add_argument("--output-json", type=Path, required=True, help="Path to save segments manifest JSON")
32
+ parser.add_argument("--threshold", type=float, default=0.5, help="VAD threshold (default: 0.5)")
33
+ parser.add_argument("--min-speech-duration-ms", type=int, default=250, help="Min speech duration in ms")
34
+ parser.add_argument("--min-silence-duration-ms", type=int, default=1000, help="Min silence duration in ms")
35
+ return parser.parse_args()
36
+
37
+ def read_audio_sf(path: str, sampling_rate: int = 16000):
38
+ wav, sr = sf.read(path)
39
+ if len(wav.shape) > 1:
40
+ wav = np.mean(wav, axis=1)
41
+ if sr != sampling_rate:
42
+ # Note: We expect the input to be pre-converted by ffmpeg to 16000
43
+ # But if not, we would need a resampler. For now, we assume sr is correct.
44
+ pass
45
+ return torch.from_numpy(wav.astype(np.float32))
46
+
47
+ def main():
48
+ args = parse_args()
49
+
50
+ if not args.audio.exists():
51
+ print(f"Error: Audio file not found: {args.audio}")
52
+ return 1
53
+
54
+ args.output_dir.mkdir(parents=True, exist_ok=True)
55
+
56
+ # Load Silero VAD model
57
+ model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',
58
+ model='silero_vad',
59
+ force_reload=False,
60
+ onnx=False)
61
+
62
+ (get_speech_timestamps, _, _, _, _) = utils
63
+
64
+ # Load audio
65
+ sampling_rate = 16000
66
+ wav = read_audio_sf(str(args.audio), sampling_rate=sampling_rate)
67
+
68
+ # Get speech timestamps
69
+ speech_timestamps = get_speech_timestamps(
70
+ wav,
71
+ model,
72
+ sampling_rate=sampling_rate,
73
+ threshold=args.threshold,
74
+ min_speech_duration_ms=args.min_speech_duration_ms,
75
+ min_silence_duration_ms=args.min_silence_duration_ms
76
+ )
77
+
78
+ segments = []
79
+
80
+ for i, ts in enumerate(speech_timestamps):
81
+ start_sec = ts['start'] / sampling_rate
82
+ end_sec = ts['end'] / sampling_rate
83
+
84
+ # Extract segment
85
+ segment_wav = wav[ts['start']:ts['end']].numpy()
86
+
87
+ # Save segment to WAV using soundfile
88
+ segment_filename = f"segment_{i:04d}.wav"
89
+ segment_path = args.output_dir / segment_filename
90
+
91
+ sf.write(str(segment_path), segment_wav, sampling_rate)
92
+
93
+ segments.append({
94
+ "start": float(start_sec),
95
+ "end": float(end_sec),
96
+ "audioPath": str(segment_path)
97
+ })
98
+
99
+ # Write manifest
100
+ with open(args.output_json, "w") as f:
101
+ json.dump(segments, f, indent=2)
102
+
103
+ print(f"Extracted {len(segments)} speech segments to {args.output_dir}")
104
+ print(f"Manifest written to {args.output_json}")
105
+
106
+ return 0
107
+
108
+ if __name__ == "__main__":
109
+ exit(main())
@@ -0,0 +1,417 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Visual Observer Base - OCR + CLIP indexing for screen recordings.
4
+
5
+ Usage:
6
+ uv run visual_observer_base.py --frames-dir /path/to/frames --output /path/to/visual-index.json
7
+ """
8
+
9
+ import argparse
10
+ import json
11
+ import os
12
+ import time
13
+ from concurrent.futures import ProcessPoolExecutor, as_completed
14
+ from pathlib import Path
15
+ from typing import TypedDict
16
+
17
+ import open_clip
18
+ import pytesseract
19
+ import torch
20
+ from PIL import Image
21
+ from sklearn.cluster import AgglomerativeClustering
22
+
23
+
24
+ # Type definitions
25
+ class FrameData(TypedDict):
26
+ index: int
27
+ timestamp: float
28
+ imagePath: str
29
+ ocrText: str
30
+ clusterId: int
31
+ changeScore: float
32
+
33
+
34
+ class ClusterData(TypedDict):
35
+ id: int
36
+ heuristicLabel: str
37
+ timeRange: tuple[float, float]
38
+ frameCount: int
39
+ representativeIdx: int
40
+ avgOcrCharacters: float
41
+ mediaIndicators: list[str]
42
+
43
+
44
+ class VisualIndex(TypedDict):
45
+ frames: list[FrameData]
46
+ clusters: list[ClusterData]
47
+ processingTime: dict[str, int]
48
+
49
+
50
+ # Constants
51
+ # Prefer MPS for Apple Silicon, fallback to CPU
52
+ DEVICE = "mps" if torch.backends.mps.is_available() else "cpu"
53
+ CLIP_MODEL = "ViT-B-32"
54
+ CLIP_PRETRAINED = "laion2b_s34b_b79k"
55
+ CLUSTER_DISTANCE_THRESHOLD = 0.15 # 1 - 0.85 similarity
56
+
57
+ UI_CATEGORIES = [
58
+ "A screenshot of a code editor showing programming code",
59
+ "A screenshot of a terminal with command line interface",
60
+ "A screenshot of a web browser showing a website",
61
+ "A screenshot of a video player with playback controls",
62
+ "A screenshot of a document or PDF viewer",
63
+ "A screenshot of an image viewer or photo application",
64
+ "A screenshot of a chat or messaging application",
65
+ "A screenshot of a file manager or finder window",
66
+ ]
67
+
68
+ CATEGORY_LABELS = [
69
+ "code-editor",
70
+ "terminal",
71
+ "browser",
72
+ "video-player",
73
+ "document",
74
+ "image-viewer",
75
+ "chat",
76
+ "file-manager",
77
+ ]
78
+
79
+
80
+ def parse_args() -> argparse.Namespace:
81
+ parser = argparse.ArgumentParser(description="Visual Observer Base")
82
+ parser.add_argument("--frames-dir", type=Path, required=True)
83
+ parser.add_argument("--output", type=Path, required=True)
84
+ parser.add_argument("--frame-interval", type=float, default=2.0,
85
+ help="Seconds between frames (default: 2)")
86
+ parser.add_argument("--workers", type=int, default=os.cpu_count(),
87
+ help="Number of parallel OCR workers (default: CPU count)")
88
+ return parser.parse_args()
89
+
90
+
91
+ def load_frames(frames_dir: Path, frame_interval: float) -> list[tuple[int, float, Path]]:
92
+ """Load frame paths and compute timestamps.
93
+
94
+ Args:
95
+ frames_dir: Directory containing frame images
96
+ frame_interval: Seconds between frames (e.g., 2.0 means frame 0 at 0s, frame 1 at 2s)
97
+ """
98
+ frames = []
99
+ # Assumes filenames like scene_0001.jpg
100
+ # Using sorted glob to ensure chronological order
101
+ all_files = sorted(list(frames_dir.glob("*.jpg")))
102
+
103
+ for i, path in enumerate(all_files):
104
+ timestamp = i * frame_interval
105
+ frames.append((i, timestamp, path))
106
+
107
+ return frames
108
+
109
+
110
+ def extract_ocr(image_path: Path) -> str:
111
+ """Extract text from image using Tesseract.
112
+
113
+ Uses PSM 11 (sparse text) which works better for UI screenshots
114
+ where text is scattered across the screen (menus, buttons, tabs, URLs).
115
+ """
116
+ try:
117
+ image = Image.open(image_path)
118
+ # PSM 11: Sparse text - finds text scattered anywhere (UI elements)
119
+ # OEM 3: Default OCR engine mode (LSTM if available)
120
+ custom_config = r'--psm 11 --oem 3'
121
+ text = pytesseract.image_to_string(image, config=custom_config)
122
+ return text.strip()
123
+ except Exception as e:
124
+ print(f" Warning: OCR failed for {image_path.name}: {e}")
125
+ return ""
126
+
127
+
128
+ def extract_ocr_parallel(
129
+ frames: list[tuple[int, float, Path]],
130
+ max_workers: int
131
+ ) -> dict[int, str]:
132
+ """Extract OCR in parallel using multiprocessing.
133
+
134
+ Args:
135
+ frames: List of (index, timestamp, path) tuples
136
+ max_workers: Number of parallel workers
137
+
138
+ Returns:
139
+ Dictionary mapping frame index to OCR text
140
+ """
141
+ results = {}
142
+ total = len(frames)
143
+ completed = 0
144
+
145
+ print(f" Using {max_workers} parallel workers...")
146
+
147
+ with ProcessPoolExecutor(max_workers=max_workers) as executor:
148
+ # Submit all tasks
149
+ future_to_idx = {
150
+ executor.submit(extract_ocr, path): idx
151
+ for idx, _, path in frames
152
+ }
153
+
154
+ # Collect results as they complete
155
+ for future in as_completed(future_to_idx):
156
+ idx = future_to_idx[future]
157
+ try:
158
+ results[idx] = future.result()
159
+ except Exception as e:
160
+ print(f" Warning: OCR failed for frame {idx}: {e}")
161
+ results[idx] = ""
162
+
163
+ completed += 1
164
+ # Progress indicator every 10%
165
+ if completed % max(1, total // 10) == 0:
166
+ pct = (completed / total) * 100
167
+ print(f" OCR progress: {completed}/{total} ({pct:.0f}%)")
168
+
169
+ return results
170
+
171
+
172
+ def compute_clip_embeddings(
173
+ frames: list[tuple[int, float, Path]],
174
+ model,
175
+ preprocess,
176
+ ) -> torch.Tensor:
177
+ """Compute CLIP embeddings for all frames."""
178
+ embeddings = []
179
+
180
+ for _, _, path in frames:
181
+ try:
182
+ image = preprocess(Image.open(path)).unsqueeze(0).to(DEVICE)
183
+
184
+ with torch.no_grad():
185
+ embedding = model.encode_image(image)
186
+ embedding = embedding / embedding.norm(dim=-1, keepdim=True)
187
+
188
+ embeddings.append(embedding.cpu())
189
+ except Exception as e:
190
+ print(f" Warning: CLIP embedding failed for {path.name}: {e}")
191
+ # Use zero vector as fallback to maintain alignment
192
+ embeddings.append(torch.zeros((1, 512)))
193
+
194
+ if not embeddings:
195
+ return torch.zeros((0, 512))
196
+
197
+ return torch.cat(embeddings, dim=0)
198
+
199
+
200
+ def cluster_frames(embeddings: torch.Tensor) -> list[int]:
201
+ """Cluster frames by CLIP embedding similarity."""
202
+ if len(embeddings) < 2:
203
+ return [0] * len(embeddings)
204
+
205
+ clustering = AgglomerativeClustering(
206
+ n_clusters=None, # type: ignore
207
+ distance_threshold=CLUSTER_DISTANCE_THRESHOLD,
208
+ metric="cosine",
209
+ linkage="average",
210
+ )
211
+
212
+ labels = clustering.fit_predict(embeddings.numpy())
213
+ return labels.tolist()
214
+
215
+
216
+ def infer_label_with_clip(
217
+ image_path: Path,
218
+ model,
219
+ preprocess,
220
+ tokenizer,
221
+ ) -> str:
222
+ """Use CLIP zero-shot to classify frame into UI category."""
223
+ try:
224
+ image = preprocess(Image.open(image_path)).unsqueeze(0).to(DEVICE)
225
+ text_tokens = tokenizer(UI_CATEGORIES).to(DEVICE)
226
+
227
+ with torch.no_grad():
228
+ image_features = model.encode_image(image)
229
+ text_features = model.encode_text(text_tokens)
230
+
231
+ image_features = image_features / image_features.norm(dim=-1, keepdim=True)
232
+ text_features = text_features / text_features.norm(dim=-1, keepdim=True)
233
+
234
+ similarity = (100.0 * image_features @ text_features.T).softmax(dim=-1)
235
+ best_idx = similarity.argmax().item()
236
+
237
+ return CATEGORY_LABELS[best_idx]
238
+ except Exception as e:
239
+ print(f" Warning: Zero-shot classification failed for {image_path.name}: {e}")
240
+ return "unknown"
241
+
242
+
243
+ def detect_media_indicators(ocr_text: str) -> list[str]:
244
+ """
245
+ Detect indicators that frame shows media content.
246
+
247
+ TODO: Expand patterns based on real-world testing:
248
+ - Video platforms: Vimeo, Twitch, Netflix, Disney+
249
+ - Image formats: .gif, .webp, .svg, .bmp
250
+ - Media players: VLC, QuickTime, IINA, mpv
251
+ - Streaming: Spotify, Apple Music, SoundCloud
252
+ - Social media: Twitter/X, Instagram, TikTok
253
+ """
254
+ indicators = []
255
+ text_lower = ocr_text.lower()
256
+
257
+ # Video platforms
258
+ if "youtube" in text_lower:
259
+ indicators.append("youtube")
260
+
261
+ if "vimeo" in text_lower:
262
+ indicators.append("vimeo")
263
+
264
+ if "netflix" in text_lower:
265
+ indicators.append("netflix")
266
+
267
+ # Image files
268
+ image_extensions = [".png", ".jpg", ".jpeg", ".gif", ".webp"]
269
+ if any(ext in text_lower for ext in image_extensions):
270
+ indicators.append("image-file")
271
+
272
+ # TODO: Add more patterns after dry-run testing
273
+
274
+ return indicators
275
+
276
+
277
+ def build_cluster_metadata(
278
+ frames_data: list[FrameData],
279
+ cluster_labels: list[int],
280
+ model,
281
+ preprocess,
282
+ tokenizer,
283
+ ) -> list[ClusterData]:
284
+ """Build metadata for each cluster."""
285
+ clusters: dict[int, list[FrameData]] = {}
286
+
287
+ for frame, label in zip(frames_data, cluster_labels):
288
+ if label not in clusters:
289
+ clusters[label] = []
290
+ clusters[label].append(frame)
291
+
292
+ result = []
293
+ for cluster_id, cluster_frames in clusters.items():
294
+ # Find representative (middle frame)
295
+ representative = cluster_frames[len(cluster_frames) // 2]
296
+
297
+ # Compute average OCR characters
298
+ avg_chars = sum(len(f["ocrText"]) for f in cluster_frames) / len(cluster_frames)
299
+
300
+ # Get time range
301
+ timestamps = [f["timestamp"] for f in cluster_frames]
302
+ time_range = (float(min(timestamps)), float(max(timestamps)))
303
+
304
+ # Aggregate media indicators
305
+ all_indicators = set()
306
+ for f in cluster_frames:
307
+ all_indicators.update(detect_media_indicators(f["ocrText"]))
308
+
309
+ # Infer label using CLIP on representative
310
+ rep_path = Path(representative["imagePath"])
311
+ label = infer_label_with_clip(rep_path, model, preprocess, tokenizer)
312
+
313
+ result.append({
314
+ "id": cluster_id,
315
+ "heuristicLabel": label,
316
+ "timeRange": time_range,
317
+ "frameCount": len(cluster_frames),
318
+ "representativeIdx": representative["index"],
319
+ "avgOcrCharacters": avg_chars,
320
+ "mediaIndicators": list(all_indicators),
321
+ })
322
+
323
+ return result
324
+
325
+
326
+ def main():
327
+ args = parse_args()
328
+
329
+ print(f"Loading frames from {args.frames_dir}...")
330
+ frames = load_frames(args.frames_dir, args.frame_interval)
331
+
332
+ if not frames:
333
+ print("Error: No frames found")
334
+ return 1
335
+
336
+ print(f"Found {len(frames)} frames")
337
+
338
+ # Initialize timing
339
+ timing = {"ocrMs": 0, "clipMs": 0, "clusterMs": 0, "totalMs": 0}
340
+ total_start = time.time()
341
+
342
+ # Phase 1: OCR (Parallel)
343
+ print(f"Phase 1: Extracting text with OCR ({args.workers} workers)...")
344
+ ocr_start = time.time()
345
+
346
+ ocr_results = extract_ocr_parallel(frames, args.workers)
347
+
348
+ frames_data: list[FrameData] = []
349
+ for idx, timestamp, path in frames:
350
+ frames_data.append({
351
+ "index": idx,
352
+ "timestamp": timestamp,
353
+ "imagePath": str(path),
354
+ "ocrText": ocr_results.get(idx, ""),
355
+ "clusterId": -1, # Set later
356
+ "changeScore": 0.0, # TODO: Implement pixel delta if needed
357
+ })
358
+
359
+ timing["ocrMs"] = int((time.time() - ocr_start) * 1000)
360
+ print(f" OCR complete: {timing['ocrMs']}ms")
361
+
362
+ # Phase 2: CLIP embeddings
363
+ print(f"Phase 2: Computing CLIP embeddings on {DEVICE}...")
364
+ clip_start = time.time()
365
+
366
+ model, _, preprocess = open_clip.create_model_and_transforms(
367
+ CLIP_MODEL, pretrained=CLIP_PRETRAINED
368
+ )
369
+ model.eval()
370
+ model.to(DEVICE)
371
+ tokenizer = open_clip.get_tokenizer(CLIP_MODEL)
372
+
373
+ embeddings = compute_clip_embeddings(frames, model, preprocess)
374
+ timing["clipMs"] = int((time.time() - clip_start) * 1000)
375
+ print(f" CLIP complete: {timing['clipMs']}ms")
376
+
377
+ # Phase 3: Clustering
378
+ print("Phase 3: Clustering frames...")
379
+ cluster_start = time.time()
380
+
381
+ cluster_labels = cluster_frames(embeddings)
382
+
383
+ # Update frames with cluster IDs
384
+ for frame, label in zip(frames_data, cluster_labels):
385
+ frame["clusterId"] = label
386
+
387
+ timing["clusterMs"] = int((time.time() - cluster_start) * 1000)
388
+ print(f" Clustering complete: {timing['clusterMs']}ms")
389
+
390
+ # Phase 4: Build cluster metadata
391
+ print("Phase 4: Building cluster metadata...")
392
+ clusters = build_cluster_metadata(
393
+ frames_data, cluster_labels, model, preprocess, tokenizer
394
+ )
395
+ print(f" Found {len(clusters)} clusters")
396
+
397
+ timing["totalMs"] = int((time.time() - total_start) * 1000)
398
+
399
+ # Output
400
+ result: VisualIndex = {
401
+ "frames": frames_data,
402
+ "clusters": clusters,
403
+ "processingTime": timing,
404
+ }
405
+
406
+ args.output.parent.mkdir(parents=True, exist_ok=True)
407
+ with open(args.output, "w") as f:
408
+ json.dump(result, f, indent=2)
409
+
410
+ print(f"\nOutput written to {args.output}")
411
+ print(f"Total processing time: {timing['totalMs']}ms")
412
+
413
+ return 0
414
+
415
+
416
+ if __name__ == "__main__":
417
+ exit(main())