escribano 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (124) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +297 -0
  3. package/dist/0_types.js +279 -0
  4. package/dist/actions/classify-session.js +77 -0
  5. package/dist/actions/create-contexts.js +44 -0
  6. package/dist/actions/create-topic-blocks.js +68 -0
  7. package/dist/actions/extract-metadata.js +24 -0
  8. package/dist/actions/generate-artifact-v3.js +296 -0
  9. package/dist/actions/generate-artifact.js +61 -0
  10. package/dist/actions/generate-summary-v3.js +260 -0
  11. package/dist/actions/outline-index.js +204 -0
  12. package/dist/actions/process-recording-v2.js +494 -0
  13. package/dist/actions/process-recording-v3.js +412 -0
  14. package/dist/actions/process-session.js +183 -0
  15. package/dist/actions/publish-summary-v3.js +303 -0
  16. package/dist/actions/sync-to-outline.js +196 -0
  17. package/dist/adapters/audio.silero.adapter.js +69 -0
  18. package/dist/adapters/cap.adapter.js +94 -0
  19. package/dist/adapters/capture.cap.adapter.js +107 -0
  20. package/dist/adapters/capture.filesystem.adapter.js +124 -0
  21. package/dist/adapters/embedding.ollama.adapter.js +141 -0
  22. package/dist/adapters/intelligence.adapter.js +202 -0
  23. package/dist/adapters/intelligence.mlx.adapter.js +395 -0
  24. package/dist/adapters/intelligence.ollama.adapter.js +741 -0
  25. package/dist/adapters/publishing.outline.adapter.js +75 -0
  26. package/dist/adapters/storage.adapter.js +81 -0
  27. package/dist/adapters/storage.fs.adapter.js +83 -0
  28. package/dist/adapters/transcription.whisper.adapter.js +206 -0
  29. package/dist/adapters/video.ffmpeg.adapter.js +405 -0
  30. package/dist/adapters/whisper.adapter.js +168 -0
  31. package/dist/batch-context.js +329 -0
  32. package/dist/db/helpers.js +50 -0
  33. package/dist/db/index.js +95 -0
  34. package/dist/db/migrate.js +80 -0
  35. package/dist/db/repositories/artifact.sqlite.js +77 -0
  36. package/dist/db/repositories/cluster.sqlite.js +92 -0
  37. package/dist/db/repositories/context.sqlite.js +75 -0
  38. package/dist/db/repositories/index.js +10 -0
  39. package/dist/db/repositories/observation.sqlite.js +70 -0
  40. package/dist/db/repositories/recording.sqlite.js +56 -0
  41. package/dist/db/repositories/subject.sqlite.js +64 -0
  42. package/dist/db/repositories/topic-block.sqlite.js +45 -0
  43. package/dist/db/types.js +4 -0
  44. package/dist/domain/classification.js +60 -0
  45. package/dist/domain/context.js +97 -0
  46. package/dist/domain/index.js +2 -0
  47. package/dist/domain/observation.js +17 -0
  48. package/dist/domain/recording.js +41 -0
  49. package/dist/domain/segment.js +93 -0
  50. package/dist/domain/session.js +93 -0
  51. package/dist/domain/time-range.js +38 -0
  52. package/dist/domain/transcript.js +79 -0
  53. package/dist/index.js +173 -0
  54. package/dist/pipeline/context.js +162 -0
  55. package/dist/pipeline/events.js +2 -0
  56. package/dist/prerequisites.js +226 -0
  57. package/dist/scripts/rebuild-index.js +53 -0
  58. package/dist/scripts/seed-fixtures.js +290 -0
  59. package/dist/services/activity-segmentation.js +333 -0
  60. package/dist/services/activity-segmentation.test.js +191 -0
  61. package/dist/services/app-normalization.js +212 -0
  62. package/dist/services/cluster-merge.js +69 -0
  63. package/dist/services/clustering.js +237 -0
  64. package/dist/services/debug.js +58 -0
  65. package/dist/services/frame-sampling.js +318 -0
  66. package/dist/services/signal-extraction.js +106 -0
  67. package/dist/services/subject-grouping.js +342 -0
  68. package/dist/services/temporal-alignment.js +99 -0
  69. package/dist/services/vlm-enrichment.js +84 -0
  70. package/dist/services/vlm-service.js +130 -0
  71. package/dist/stats/index.js +3 -0
  72. package/dist/stats/observer.js +65 -0
  73. package/dist/stats/repository.js +36 -0
  74. package/dist/stats/resource-tracker.js +86 -0
  75. package/dist/stats/types.js +1 -0
  76. package/dist/test-classification-prompts.js +181 -0
  77. package/dist/tests/cap.adapter.test.js +75 -0
  78. package/dist/tests/capture.cap.adapter.test.js +69 -0
  79. package/dist/tests/classify-session.test.js +140 -0
  80. package/dist/tests/db/repositories.test.js +243 -0
  81. package/dist/tests/domain/time-range.test.js +31 -0
  82. package/dist/tests/integration.test.js +84 -0
  83. package/dist/tests/intelligence.adapter.test.js +102 -0
  84. package/dist/tests/intelligence.ollama.adapter.test.js +178 -0
  85. package/dist/tests/process-v2.test.js +90 -0
  86. package/dist/tests/services/clustering.test.js +112 -0
  87. package/dist/tests/services/frame-sampling.test.js +152 -0
  88. package/dist/tests/utils/ocr.test.js +76 -0
  89. package/dist/tests/utils/parallel.test.js +57 -0
  90. package/dist/tests/visual-observer.test.js +175 -0
  91. package/dist/utils/id-normalization.js +15 -0
  92. package/dist/utils/index.js +9 -0
  93. package/dist/utils/model-detector.js +154 -0
  94. package/dist/utils/ocr.js +80 -0
  95. package/dist/utils/parallel.js +32 -0
  96. package/migrations/001_initial.sql +109 -0
  97. package/migrations/002_clusters.sql +41 -0
  98. package/migrations/003_observations_vlm_fields.sql +14 -0
  99. package/migrations/004_observations_unique.sql +18 -0
  100. package/migrations/005_processing_stats.sql +29 -0
  101. package/migrations/006_vlm_raw_response.sql +6 -0
  102. package/migrations/007_subjects.sql +23 -0
  103. package/migrations/008_artifacts_recording.sql +6 -0
  104. package/migrations/009_artifact_subjects.sql +10 -0
  105. package/package.json +82 -0
  106. package/prompts/action-items.md +55 -0
  107. package/prompts/blog-draft.md +54 -0
  108. package/prompts/blog-research.md +87 -0
  109. package/prompts/card.md +54 -0
  110. package/prompts/classify-segment.md +38 -0
  111. package/prompts/classify.md +37 -0
  112. package/prompts/code-snippets.md +163 -0
  113. package/prompts/extract-metadata.md +149 -0
  114. package/prompts/notes.md +83 -0
  115. package/prompts/runbook.md +123 -0
  116. package/prompts/standup.md +50 -0
  117. package/prompts/step-by-step.md +125 -0
  118. package/prompts/subject-grouping.md +31 -0
  119. package/prompts/summary-v3.md +89 -0
  120. package/prompts/summary.md +77 -0
  121. package/prompts/topic-classifier.md +24 -0
  122. package/prompts/topic-extract.md +13 -0
  123. package/prompts/vlm-batch.md +21 -0
  124. package/prompts/vlm-single.md +19 -0
@@ -0,0 +1,412 @@
1
+ /**
2
+ * Escribano - V3 Recording Processor (VLM-First Pipeline)
3
+ *
4
+ * This pipeline replaces the V2 OCR→Embedding→Clustering approach with
5
+ * a VLM-first approach where visual understanding drives segmentation.
6
+ *
7
+ * See ADR-005 for architectural rationale.
8
+ */
9
+ import { readdir } from 'node:fs/promises';
10
+ import os from 'node:os';
11
+ import path from 'node:path';
12
+ import { generateId } from '../db/helpers.js';
13
+ import { advanceStep, completeProcessing, failProcessing, startProcessing, } from '../domain/recording.js';
14
+ import { log, step } from '../pipeline/context.js';
15
+ import { calculateRequiredTimestamps, } from '../services/frame-sampling.js';
16
+ import { describeFrames } from '../services/vlm-service.js';
17
// V3 step order - simplified from V2 (no OCR, embedding, or clustering steps)
const STEP_ORDER_V3 = [
    'vad',
    'transcription',
    'frame_extraction',
    'vlm_enrichment', // Repurposed: now does batch VLM on all sampled frames
    'context_creation',
    'block_formation',
    'complete',
];
/**
 * Decide whether a pipeline step can be skipped when resuming a recording.
 *
 * @param {string|null|undefined} currentStep - Last completed step stored on the recording.
 * @param {string} targetStep - Step about to be executed.
 * @returns {boolean} true only when targetStep finished strictly before currentStep.
 */
function shouldSkipStep(currentStep, targetStep) {
    if (!currentStep)
        return false;
    if (currentStep === 'complete')
        return true;
    const currentIndex = STEP_ORDER_V3.indexOf(currentStep);
    const targetIndex = STEP_ORDER_V3.indexOf(targetStep);
    // Defensive: unknown step names (indexOf === -1) must never be skipped.
    // Previously an unknown targetStep was silently skipped whenever
    // currentStep was valid, because -1 < currentIndex held; re-running
    // is the safe behavior for stale/misspelled step names.
    if (currentIndex === -1 || targetIndex === -1)
        return false;
    return targetIndex < currentIndex;
}
36
/**
 * Process a recording using the V3 VLM-First pipeline.
 *
 * Orchestrates: audio (VAD + transcription) → scene detection → targeted frame
 * extraction → batch VLM inference → activity segmentation → TopicBlock
 * formation. Progress is checkpointed to the DB via `processing_step` after
 * each phase so a crashed or interrupted run can resume where it left off.
 *
 * Key differences from V2:
 * - No OCR processing step
 * - No embedding generation step
 * - No semantic clustering step
 * - VLM processes all sampled frames (not just sparse selection)
 * - Activity-based segmentation (not embedding similarity)
 *
 * @param {string} recordingId - ID of the recording row to process.
 * @param {object} repos - Repository bundle (recordings, observations, topicBlocks, contexts).
 * @param {object} adapters - Adapter bundle (video, intelligence, preprocessor, transcription).
 * @param {object} [options] - `{ force?: boolean }`: force deletes prior observations/blocks and restarts from 'raw'.
 * @throws {Error} When the recording is missing; rethrows any pipeline failure
 *   after persisting the failed status via failProcessing.
 */
export async function processRecordingV3(recordingId, repos, adapters, options = {}) {
    const dbRecording = repos.recordings.findById(recordingId);
    if (!dbRecording) {
        throw new Error(`Recording ${recordingId} not found`);
    }
    // Handle --force: delete existing observations and reset
    if (options.force) {
        log('info', `[V3] Force flag set, deleting existing data for ${recordingId}...`);
        repos.observations.deleteByRecording(recordingId);
        repos.topicBlocks.deleteByRecording(recordingId);
    }
    // Map DB row (snake_case columns) to domain object (camelCase fields)
    let recording = {
        id: dbRecording.id,
        status: dbRecording.status,
        processingStep: dbRecording.processing_step,
        errorMessage: dbRecording.error_message,
        videoPath: dbRecording.video_path,
        audioMicPath: dbRecording.audio_mic_path,
        audioSystemPath: dbRecording.audio_system_path,
        capturedAt: dbRecording.captured_at,
        duration: dbRecording.duration,
    };
    // If forced, reset to raw state so every step below re-runs
    if (options.force) {
        recording = {
            ...recording,
            status: 'raw',
            processingStep: null,
            errorMessage: null,
        };
        updateRecordingInDb(repos, recording);
    }
    if (recording.processingStep) {
        log('info', `[V3] Resuming ${recording.id} from step: ${recording.processingStep}`);
    }
    try {
        // Start processing (only when not resuming past the first step)
        if (!shouldSkipStep(recording.processingStep, 'vad')) {
            recording = startProcessing(recording);
            updateRecordingInDb(repos, recording);
        }
        // ============================================
        // AUDIO PIPELINE (reused from V2)
        // ============================================
        if (!shouldSkipStep(recording.processingStep, 'transcription')) {
            log('info', '[V3] Running audio pipeline...');
            const audioObservations = await processAudioPipeline(recording, adapters);
            if (audioObservations.length > 0) {
                await step('save-audio-observations', async () => {
                    repos.observations.saveBatch(audioObservations);
                    log('info', `[V3] Saved ${audioObservations.length} audio observations`);
                    return { itemsProcessed: audioObservations.length };
                }, { itemsTotal: audioObservations.length });
            }
            recording = advanceStep(recording, 'transcription');
            updateRecordingInDb(repos, recording);
        }
        else {
            log('info', '[V3] Skipping audio pipeline (already completed)');
        }
        // ============================================
        // VISUAL PIPELINE (V3: Smart Extraction)
        // ============================================
        if (recording.videoPath) {
            // Step 1: Get video metadata
            const metadata = await adapters.video.getMetadata(recording.videoPath);
            log('info', `[V3] Video: ${Math.round(metadata.duration)}s, ${metadata.width}x${metadata.height}`);
            // Step 2: Scene Detection FIRST (no frame extraction needed).
            // NOTE: this `dbRecording` intentionally shadows the outer one —
            // re-fetched to pick up any source_metadata written by a prior run.
            let sceneChanges = [];
            const dbRecording = repos.recordings.findById(recording.id);
            const sourceMetadata = dbRecording?.source_metadata
                ? JSON.parse(dbRecording.source_metadata)
                : {};
            if (sourceMetadata.scene_changes) {
                // Resume path: scene detection already ran; reuse cached result
                sceneChanges = sourceMetadata.scene_changes;
                log('info', `[V3] Loaded ${sceneChanges.length} scene changes from DB`);
            }
            else {
                sceneChanges = await step('scene-detection', async () => {
                    const changes = await adapters.video.detectSceneChanges(recording.videoPath);
                    log('info', `[V3] Detected ${changes.length} scene changes`);
                    // Save to DB for resume safety
                    if (dbRecording) {
                        const updatedMetadata = {
                            ...sourceMetadata,
                            scene_changes: changes,
                        };
                        repos.recordings.updateMetadata(recording.id, JSON.stringify(updatedMetadata));
                    }
                    return changes;
                });
            }
            // Step 3: Calculate required timestamps (pure math, no I/O)
            log('info', '[V3] Calculating required frame timestamps...');
            const requiredTimestamps = calculateRequiredTimestamps(metadata.duration, sceneChanges);
            log('info', `[V3] Need ${requiredTimestamps.length} frames (from ${Math.round(metadata.duration)}s video with ${sceneChanges.length} scenes)`);
            // Step 4: Extract ONLY the needed frames
            let extractedFrames = [];
            if (!shouldSkipStep(recording.processingStep, 'frame_extraction')) {
                extractedFrames = await step('frame-extraction-batch', async () => {
                    const framesDir = path.join(os.tmpdir(), 'escribano', recording.id, 'frames');
                    const frames = await adapters.video.extractFramesAtTimestampsBatch(recording.videoPath, requiredTimestamps, framesDir);
                    log('info', `[V3] Extracted ${frames.length} frames`);
                    recording = advanceStep(recording, 'frame_extraction');
                    updateRecordingInDb(repos, recording);
                    return frames;
                });
            }
            else {
                log('info', '[V3] Skipping frame extraction (already completed)');
                // Reload frames from disk if resuming.
                // NOTE(review): timestamps are re-associated by array index, which
                // assumes readdir order matches the extraction order; readdir is
                // not guaranteed numeric — verify frame filenames are zero-padded.
                const framesDir = path.join(os.tmpdir(), 'escribano', recording.id, 'frames');
                try {
                    const files = await readdir(framesDir);
                    extractedFrames = files
                        .filter((f) => f.endsWith('.jpg'))
                        .map((f, i) => ({
                            imagePath: path.join(framesDir, f),
                            timestamp: requiredTimestamps[i] || i * 10,
                        }))
                        .sort((a, b) => a.timestamp - b.timestamp);
                    log('info', `[V3] Reloaded ${extractedFrames.length} frames from disk`);
                }
                catch {
                    // tmpdir may have been cleaned between runs; the VLM step
                    // below then relies solely on observations already in the DB.
                    log('warn', '[V3] Could not reload frames from disk');
                }
            }
            // Step 5: VLM Batch Inference
            if (!shouldSkipStep(recording.processingStep, 'vlm_enrichment')) {
                // Check for already-processed frames (resume safety): only frames
                // with a real description (not a placeholder/parse failure) count.
                const existingObs = repos.observations
                    .findByRecording(recording.id)
                    .filter((o) => o.type === 'visual' && o.vlm_description)
                    .filter((o) => !o.vlm_description?.startsWith('No description') &&
                        !o.vlm_description?.startsWith('Parse error'));
                const processedTimestamps = new Set(existingObs.map((o) => o.timestamp));
                const framesToProcess = extractedFrames.filter((f) => !processedTimestamps.has(f.timestamp));
                if (framesToProcess.length < extractedFrames.length) {
                    log('info', `[V3] Found ${existingObs.length} already-processed frames, ${framesToProcess.length} remaining`);
                }
                const vlmItemsTotal = framesToProcess.length;
                await step('vlm-batch-inference', async () => {
                    let framesProcessed = 0;
                    if (framesToProcess.length === 0) {
                        log('info', '[V3] All frames already processed, skipping VLM inference');
                    }
                    else {
                        log('info', `[V3] Frames to process (${framesToProcess.length}):`);
                        framesToProcess.slice(0, 10).forEach((f, i) => {
                            log('info', ` [${i}] ${f.imagePath.split('/').pop()} @ ${f.timestamp}s`);
                        });
                        if (framesToProcess.length > 10) {
                            log('info', ` ... and ${framesToProcess.length - 10} more`);
                        }
                        log('info', '[V3] Starting VLM inference...');
                        await describeFrames(framesToProcess, adapters.intelligence, {
                            recordingId: recording.id,
                            // Persist each observation as soon as the VLM yields it,
                            // so a crash mid-batch loses at most the in-flight frame.
                            onImageProcessed: (result, progress) => {
                                const observation = {
                                    id: generateId(),
                                    recording_id: recording.id,
                                    type: 'visual',
                                    timestamp: result.timestamp,
                                    end_timestamp: result.timestamp,
                                    image_path: result.imagePath,
                                    ocr_text: null,
                                    vlm_description: result.description,
                                    vlm_raw_response: result.raw_response ?? null,
                                    activity_type: result.activity,
                                    apps: JSON.stringify(result.apps),
                                    topics: JSON.stringify(result.topics),
                                    embedding: null,
                                    text: null,
                                    audio_source: null,
                                    audio_type: null,
                                };
                                repos.observations.save(observation);
                                framesProcessed = progress.current;
                                if (progress.current % 10 === 0) {
                                    log('info', `[V3] Processed ${progress.current}/${progress.total} frames`);
                                }
                            },
                        });
                        const allVisualObs = repos.observations
                            .findByRecording(recording.id)
                            .filter((o) => o.type === 'visual' && o.vlm_description);
                        log('info', `[V3] VLM complete: ${allVisualObs.length} total visual observations`);
                    }
                    recording = advanceStep(recording, 'vlm_enrichment');
                    updateRecordingInDb(repos, recording);
                    // `||` (not `??`) is deliberate: when nothing new was processed
                    // this run (framesProcessed === 0), report the previously-saved
                    // observation count instead.
                    return { itemsProcessed: framesProcessed || existingObs.length };
                }, { itemsTotal: vlmItemsTotal || extractedFrames.length });
            }
            else {
                log('info', '[V3] Skipping VLM inference (already completed)');
            }
            // Phase 2 - Activity Segmentation & TopicBlock Formation
            if (!shouldSkipStep(recording.processingStep, 'block_formation')) {
                await step('activity-segmentation-and-alignment', async () => {
                    // Get all observations for this recording
                    const allObservations = repos.observations.findByRecording(recording.id);
                    const visualObservations = allObservations.filter((o) => o.type === 'visual');
                    const audioObservations = allObservations.filter((o) => o.type === 'audio');
                    log('info', `[V3] Running activity segmentation on ${visualObservations.length} visual observations...`);
                    // Import and run segmentation (dynamic import defers module load)
                    const { segmentByActivity, getSegmentStats } = await import('../services/activity-segmentation.js');
                    const segments = segmentByActivity(visualObservations);
                    const stats = getSegmentStats(segments);
                    log('info', `[V3] Created ${stats.totalSegments} segments: ${Object.entries(stats.activityTypeCounts)
                        .map(([k, v]) => `${k}=${v}`)
                        .join(', ')}`);
                    // Import and run temporal alignment
                    const { alignAudioToSegments, getAlignmentStats } = await import('../services/temporal-alignment.js');
                    log('info', `[V3] Aligning ${audioObservations.length} audio transcripts to segments...`);
                    const enrichedSegments = alignAudioToSegments(segments, audioObservations);
                    const alignStats = getAlignmentStats(enrichedSegments);
                    log('info', `[V3] Aligned audio: ${alignStats.segmentsWithAudio}/${alignStats.totalSegments} segments have transcripts (${alignStats.totalTranscriptSegments} total transcript segments)`);
                    log('info', `[V3] Creating ${enrichedSegments.length} TopicBlocks...`);
                    let blockCount = 0;
                    for (const segment of enrichedSegments) {
                        // Create context from segment apps/topics
                        const contextIds = [];
                        // Simplified context creation using INSERT OR IGNORE
                        for (const app of segment.apps) {
                            const ctxId = generateId();
                            repos.contexts.saveOrIgnore({
                                id: ctxId,
                                type: 'app',
                                name: app,
                                metadata: JSON.stringify({ source: 'vlm-v3' }),
                            });
                            // Fetch the context to get its ID (existing or newly created)
                            const existingCtx = repos.contexts.findByTypeAndName('app', app);
                            if (existingCtx) {
                                contextIds.push(existingCtx.id);
                            }
                        }
                        for (const topic of segment.topics) {
                            const ctxId = generateId();
                            repos.contexts.saveOrIgnore({
                                id: ctxId,
                                type: 'topic',
                                name: topic,
                                metadata: JSON.stringify({ source: 'vlm-v3' }),
                            });
                            // Fetch the context to get its ID (existing or newly created)
                            const existingCtx = repos.contexts.findByTypeAndName('topic', topic);
                            if (existingCtx) {
                                contextIds.push(existingCtx.id);
                            }
                        }
                        // Create the TopicBlock with enriched classification
                        repos.topicBlocks.save({
                            id: generateId(),
                            recording_id: recording.id,
                            context_ids: JSON.stringify(contextIds),
                            classification: JSON.stringify({
                                activity_type: segment.activityType,
                                key_description: segment.keyDescription,
                                start_time: segment.startTime,
                                end_time: segment.endTime,
                                duration: segment.duration,
                                apps: segment.apps,
                                topics: segment.topics,
                                transcript_count: segment.transcripts.length,
                                has_transcript: segment.combinedTranscript.length > 0,
                                combined_transcript: segment.combinedTranscript,
                            }),
                            duration: segment.duration,
                        });
                        blockCount++;
                    }
                    log('info', `[V3] Created ${blockCount} TopicBlocks`);
                    // Two checkpoints, matching STEP_ORDER_V3 even though both
                    // 'context_creation' and 'block_formation' happen in this step.
                    recording = advanceStep(recording, 'context_creation');
                    updateRecordingInDb(repos, recording);
                    recording = advanceStep(recording, 'block_formation');
                    updateRecordingInDb(repos, recording);
                    return { itemsProcessed: blockCount };
                });
            }
            else {
                log('info', '[V3] Skipping segmentation and block formation (already completed)');
            }
        }
        // Complete
        recording = completeProcessing(recording);
        updateRecordingInDb(repos, recording);
        log('info', `[V3] Successfully processed recording ${recording.id}`);
    }
    catch (error) {
        // Persist the failure so the recording is not left stuck in-progress,
        // then rethrow for the caller.
        const message = error.message;
        log('error', `[V3] Processing failed for ${recordingId}: ${message}`);
        recording = failProcessing(recording, message);
        updateRecordingInDb(repos, recording);
        throw error;
    }
}
344
/**
 * Process audio sources (reused from V2 with minor modifications).
 *
 * For each available audio track (mic, then system): runs VAD to split the
 * track into speech segments, transcribes each segment, and cleans up the
 * VAD temp directory. Per-segment transcription failures are logged and
 * skipped rather than aborting the pipeline.
 *
 * @param {object} recording - Domain recording (uses id, audioMicPath, audioSystemPath).
 * @param {object} adapters - Needs `preprocessor` (extractSpeechSegments/cleanup) and `transcription` (transcribeSegment).
 * @returns {Promise<object[]>} Audio observation rows ready for saveBatch; may be empty.
 */
async function processAudioPipeline(recording, adapters) {
    const observations = [];
    const processSource = async (audioPath, source) => {
        if (!audioPath) {
            log('info', `[V3] No ${source} audio path, skipping...`);
            return;
        }
        log('info', `[V3] Processing ${source} audio: ${audioPath}`);
        // VAD: split the track into speech-only segments written to tempDir
        const { segments, tempDir } = await step(`vad-${source}`, async () => {
            return await adapters.preprocessor.extractSpeechSegments(audioPath, recording.id);
        });
        if (segments.length === 0) {
            log('info', `[V3] No speech segments found in ${source} audio`);
            await adapters.preprocessor.cleanup(tempDir);
            return;
        }
        log('info', `[V3] Found ${segments.length} segments in ${source} audio`);
        // Transcription: failures per segment are non-fatal (logged + skipped)
        await step(`transcription-${source}`, async () => {
            let successCount = 0;
            for (const segment of segments) {
                try {
                    const text = await adapters.transcription.transcribeSegment(segment.audioPath);
                    if (text.length > 0) {
                        successCount++;
                        observations.push({
                            id: generateId(),
                            recording_id: recording.id,
                            type: 'audio',
                            timestamp: segment.start,
                            end_timestamp: segment.end,
                            text,
                            audio_source: source,
                            audio_type: 'speech',
                            // Visual-only columns are explicitly nulled so the
                            // row matches the shared observations schema.
                            image_path: null,
                            ocr_text: null,
                            vlm_description: null,
                            vlm_raw_response: null,
                            activity_type: null,
                            apps: null,
                            topics: null,
                            embedding: null,
                        });
                    }
                }
                catch (error) {
                    log('warn', `[V3] Failed to transcribe segment at ${segment.start}s: ${error.message}`);
                }
            }
            log('info', `[V3] Transcribed ${successCount}/${segments.length} segments for ${source}`);
            return { itemsProcessed: successCount };
        }, { itemsTotal: segments.length });
        // Cleanup the VAD temp directory for this source
        await step(`cleanup-${source}`, async () => {
            await adapters.preprocessor.cleanup(tempDir);
        });
    };
    // Process sequentially — presumably the transcription adapter cannot
    // handle concurrent sources; TODO confirm before parallelizing.
    await processSource(recording.audioMicPath, 'mic');
    await processSource(recording.audioSystemPath, 'system');
    return observations;
}
410
/**
 * Flush the current status/step/error of a domain recording back to the DB.
 *
 * @param {object} repos - Repository bundle; only `recordings.updateStatus` is used.
 * @param {object} recording - Domain recording whose state fields are persisted.
 */
function updateRecordingInDb(repos, recording) {
    const { id, status, processingStep, errorMessage } = recording;
    repos.recordings.updateStatus(id, status, processingStep, errorMessage);
}
@@ -0,0 +1,183 @@
1
+ /**
2
+ * Process Session Action
3
+ * @deprecated V2 pipeline - use process-recording-v3.ts instead.
4
+ *
5
+ * Takes a recording and transcribes all available audio sources, creating a Session.
6
+ * Supports multiple audio sources (mic, system) with parallel transcription option.
7
+ */
8
+ import os from 'node:os';
9
+ import path from 'node:path';
10
+ import { Session } from '../domain/session.js';
11
+ import { Transcript } from '../domain/transcript.js';
12
+ /**
13
+ * Transcribe multiple audio sources, optionally in parallel.
14
+ */
15
+ async function transcribeAudioSources(sources, transcriber, parallel = false) {
16
+ const results = [];
17
+ if (parallel) {
18
+ console.log('Transcribing audio sources in parallel...');
19
+ const promises = sources.map(async ({ source, path }) => {
20
+ try {
21
+ console.log(`Transcribing ${source} audio from: ${path}`);
22
+ const transcript = await transcriber.transcribe(path);
23
+ if (Transcript.isEmpty(transcript)) {
24
+ console.log(`Warning: ${source} audio produced empty transcript`);
25
+ return null;
26
+ }
27
+ return { source, transcript };
28
+ }
29
+ catch (error) {
30
+ console.error(`Failed to transcribe ${source} audio:`, error);
31
+ return null;
32
+ }
33
+ });
34
+ const transcribed = await Promise.all(promises);
35
+ return transcribed.filter((t) => t !== null);
36
+ }
37
+ console.log('Transcribing audio sources sequentially...');
38
+ for (const { source, path } of sources) {
39
+ try {
40
+ console.log(`Transcribing ${source} audio from: ${path}`);
41
+ const transcript = await transcriber.transcribe(path);
42
+ if (Transcript.isEmpty(transcript)) {
43
+ console.log(`Warning: ${source} audio produced empty transcript`);
44
+ continue;
45
+ }
46
+ results.push({ source, transcript });
47
+ }
48
+ catch (error) {
49
+ console.error(`Failed to transcribe ${source} audio:`, error);
50
+ }
51
+ }
52
+ return results;
53
+ }
54
/**
 * Process a recording by transcribing all available audio sources and extracting visual logs.
 *
 * Saves an intermediate session after transcription so that work is not lost
 * if the (longer) visual pipeline fails, then delegates to finalizeSession.
 *
 * @param {object} recording - Recording with optional audioMicPath/audioSystemPath/videoPath.
 * @param {object} transcriber - Audio transcription adapter.
 * @param {object} videoService - Frame extraction + visual indexing adapter.
 * @param {object} storageService - Session persistence adapter.
 * @param {object} [intelligenceService] - Optional VLM adapter for visual descriptions.
 * @returns {Promise<object>} The finalized, persisted session.
 */
export async function processSession(recording, transcriber, videoService, storageService, intelligenceService) {
    console.log(`Processing recording: ${recording.id}`);
    let session = Session.create(recording);
    // Opt-in via env var; exact string 'true' required
    const parallelTranscription = process.env.ESCRIBANO_PARALLEL_TRANSCRIPTION === 'true';
    // 1. Audio Transcription: gather whichever sources exist
    const audioSources = [];
    if (recording.audioMicPath) {
        audioSources.push({ source: 'mic', path: recording.audioMicPath });
    }
    if (recording.audioSystemPath) {
        audioSources.push({ source: 'system', path: recording.audioSystemPath });
    }
    if (audioSources.length > 0) {
        const transcripts = await transcribeAudioSources(audioSources, transcriber, parallelTranscription);
        session = Session.withTranscripts(session, transcripts);
    }
    // Intermediate save to ensure we don't lose transcription work
    await storageService.saveSession(session);
    // 2. Visual Log Extraction (skipped entirely when there is no video)
    if (!recording.videoPath) {
        return finalizeSession(session, [], storageService);
    }
    const { visualLogs, updatedSession } = await extractVisualLogs(session, videoService, intelligenceService);
    return finalizeSession(updatedSession, visualLogs, storageService);
}
82
/**
 * Extract visual logs from a video recording.
 *
 * Samples frames from the video, runs OCR + CLIP indexing, attaches the
 * visual index to the session (which generates segments), optionally asks the
 * VLM to describe visual-heavy segments, and converts each cluster into a
 * visual-log entry. All failures are caught and reported as "no visual logs"
 * rather than thrown.
 *
 * @param {object} session - Current session (must carry `recording`).
 * @param {object} videoService - Frame extraction + visual indexing adapter.
 * @param {object} [intelligenceService] - Optional VLM adapter for descriptions.
 * @returns {Promise<{visualLogs: object[], updatedSession: object}>}
 */
async function extractVisualLogs(session, videoService, intelligenceService) {
    const { recording } = session;
    if (!recording.videoPath)
        return { visualLogs: [], updatedSession: session };
    console.log(`Extracting visual log from: ${recording.videoPath}`);
    const visualLogDir = path.join(os.homedir(), '.escribano', 'sessions', recording.id, 'visual-log');
    try {
        // 0.3 is presumably the sampling rate passed to the extractor —
        // TODO confirm units (fps vs. interval seconds) against the adapter.
        const sceneResults = await videoService.extractFramesAtInterval(recording.videoPath, 0.3, visualLogDir);
        if (sceneResults.length === 0)
            return { visualLogs: [], updatedSession: session };
        console.log('Running visual analysis (OCR + CLIP)...');
        const indexPath = path.join(visualLogDir, 'visual-index.json');
        const visualIndex = await videoService.runVisualIndexing(visualLogDir, indexPath);
        console.log(`✓ Indexed ${visualIndex.frames.length} frames into ${visualIndex.clusters.length} clusters`);
        // Update session with visual index (generates segments)
        const updatedSession = Session.withVisualIndex(session, visualIndex);
        const descriptions = await getVisualDescriptions(updatedSession, visualIndex, intelligenceService);
        // One log entry per cluster, anchored at its representative frame
        const entries = visualIndex.clusters.map((cluster) => {
            const repFrame = visualIndex.frames.find((f) => f.index === cluster.representativeIdx);
            const vlmDesc = descriptions.find((d) => d.clusterId === cluster.id);
            return {
                timestamp: repFrame?.timestamp || cluster.timeRange[0],
                imagePath: repFrame?.imagePath || '',
                description: vlmDesc?.description,
                // NOTE(review): assumes repFrame.ocrText is always a string when
                // repFrame exists — `?.ocrText.substring` throws if ocrText is
                // null/undefined; verify against the indexer's output.
                ocrSummary: repFrame?.ocrText.substring(0, 200).replace(/\n/g, ' '),
                heuristicLabel: cluster.heuristicLabel,
            };
        });
        return {
            visualLogs: [{ entries, source: 'screen' }],
            updatedSession,
        };
    }
    catch (error) {
        // Visual extraction is best-effort: the session proceeds audio-only.
        console.error('Failed to extract visual log:', error);
        return { visualLogs: [], updatedSession: session };
    }
}
123
/**
 * Get VLM descriptions for relevant segments.
 *
 * Picks the representative frame of each visual-heavy segment's first cluster
 * and asks the intelligence service to describe the batch.
 *
 * @param {object} session - Session with computed segments.
 * @param {object} visualIndex - Output of runVisualIndexing (frames + clusters).
 * @param {object} [intelligenceService] - Optional; when absent, returns [].
 * @returns {Promise<Array<{clusterId: *, timestamp: number, description: string}>>}
 *   One entry per described image; empty on failure.
 */
async function getVisualDescriptions(session, visualIndex, intelligenceService) {
    const segmentsNeedingVLM = Session.getSegmentsNeedingVLM(session);
    if (segmentsNeedingVLM.length === 0 || !intelligenceService) {
        if (segmentsNeedingVLM.length > 0) {
            console.log(' Skipping VLM descriptions (no intelligence service provided)');
        }
        return [];
    }
    console.log(`Describing ${segmentsNeedingVLM.length} visual-heavy segments...`);
    const imagesToDescribe = segmentsNeedingVLM
        .map((seg) => {
            // Find representative frame for the first cluster in segment
            const clusterId = seg.visualClusterIds[0];
            const cluster = visualIndex.clusters.find((c) => c.id === clusterId);
            const repFrame = visualIndex.frames.find((f) => f.index === cluster?.representativeIdx);
            return {
                imagePath: repFrame?.imagePath || '',
                clusterId,
                timestamp: repFrame?.timestamp || 0,
            };
        })
        .filter((img) => img.imagePath);
    try {
        const descResult = await intelligenceService.describeImages(imagesToDescribe);
        // BUG FIX: previously every result was emitted with `clusterId: 0`,
        // so callers matching descriptions by cluster id (e.g.
        // `descriptions.find((d) => d.clusterId === cluster.id)`) could never
        // find them except for a cluster that happened to have id 0. Map each
        // result back to the cluster of the request at the same index —
        // assumes describeImages preserves input order; TODO confirm.
        return descResult.map((d, i) => ({
            clusterId: imagesToDescribe[i]?.clusterId ?? 0,
            timestamp: d.timestamp,
            description: d.description,
        }));
    }
    catch (descError) {
        console.warn(` Warning: Visual description failed: ${descError}`);
        return [];
    }
}
162
/**
 * Finalize session processing, perform validation and save.
 *
 * Attaches the visual logs, stamps `updatedAt`, and persists the session.
 * When the session carries neither audio transcripts nor visual entries, it
 * is saved with status 'error' and the same message is thrown.
 *
 * @param {object} session - Session assembled so far.
 * @param {object[]} visualLogs - Visual logs (possibly empty) to attach.
 * @param {object} storageService - Adapter exposing `saveSession(session)`.
 * @returns {Promise<object>} The persisted final session.
 * @throws {Error} When no audio content and no visual changes were detected.
 */
async function finalizeSession(session, visualLogs, storageService) {
    const finalSession = {
        ...session,
        visualLogs,
        updatedAt: new Date(),
    };
    const audioCount = finalSession.transcripts.length;
    const hasVisual = visualLogs.length > 0 && visualLogs[0].entries.length > 0;
    if (audioCount === 0 && !hasVisual) {
        // NOTE(review): uses `finalSession.id` — verify the session object
        // actually carries an `id` field (vs. only `recording.id`).
        const message = `Session processing failed: No audio content AND no visual changes detected for recording: ${finalSession.id}`;
        finalSession.status = 'error';
        finalSession.errorMessage = message;
        // Persist the failure before throwing so the error state is visible.
        await storageService.saveSession(finalSession);
        throw new Error(message);
    }
    console.log(`Processing complete. Sources: ${audioCount} audio, ${visualLogs.length} visual. Segments: ${finalSession.segments.length}`);
    await storageService.saveSession(finalSession);
    return finalSession;
}