escribano 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +297 -0
- package/dist/0_types.js +279 -0
- package/dist/actions/classify-session.js +77 -0
- package/dist/actions/create-contexts.js +44 -0
- package/dist/actions/create-topic-blocks.js +68 -0
- package/dist/actions/extract-metadata.js +24 -0
- package/dist/actions/generate-artifact-v3.js +296 -0
- package/dist/actions/generate-artifact.js +61 -0
- package/dist/actions/generate-summary-v3.js +260 -0
- package/dist/actions/outline-index.js +204 -0
- package/dist/actions/process-recording-v2.js +494 -0
- package/dist/actions/process-recording-v3.js +412 -0
- package/dist/actions/process-session.js +183 -0
- package/dist/actions/publish-summary-v3.js +303 -0
- package/dist/actions/sync-to-outline.js +196 -0
- package/dist/adapters/audio.silero.adapter.js +69 -0
- package/dist/adapters/cap.adapter.js +94 -0
- package/dist/adapters/capture.cap.adapter.js +107 -0
- package/dist/adapters/capture.filesystem.adapter.js +124 -0
- package/dist/adapters/embedding.ollama.adapter.js +141 -0
- package/dist/adapters/intelligence.adapter.js +202 -0
- package/dist/adapters/intelligence.mlx.adapter.js +395 -0
- package/dist/adapters/intelligence.ollama.adapter.js +741 -0
- package/dist/adapters/publishing.outline.adapter.js +75 -0
- package/dist/adapters/storage.adapter.js +81 -0
- package/dist/adapters/storage.fs.adapter.js +83 -0
- package/dist/adapters/transcription.whisper.adapter.js +206 -0
- package/dist/adapters/video.ffmpeg.adapter.js +405 -0
- package/dist/adapters/whisper.adapter.js +168 -0
- package/dist/batch-context.js +329 -0
- package/dist/db/helpers.js +50 -0
- package/dist/db/index.js +95 -0
- package/dist/db/migrate.js +80 -0
- package/dist/db/repositories/artifact.sqlite.js +77 -0
- package/dist/db/repositories/cluster.sqlite.js +92 -0
- package/dist/db/repositories/context.sqlite.js +75 -0
- package/dist/db/repositories/index.js +10 -0
- package/dist/db/repositories/observation.sqlite.js +70 -0
- package/dist/db/repositories/recording.sqlite.js +56 -0
- package/dist/db/repositories/subject.sqlite.js +64 -0
- package/dist/db/repositories/topic-block.sqlite.js +45 -0
- package/dist/db/types.js +4 -0
- package/dist/domain/classification.js +60 -0
- package/dist/domain/context.js +97 -0
- package/dist/domain/index.js +2 -0
- package/dist/domain/observation.js +17 -0
- package/dist/domain/recording.js +41 -0
- package/dist/domain/segment.js +93 -0
- package/dist/domain/session.js +93 -0
- package/dist/domain/time-range.js +38 -0
- package/dist/domain/transcript.js +79 -0
- package/dist/index.js +173 -0
- package/dist/pipeline/context.js +162 -0
- package/dist/pipeline/events.js +2 -0
- package/dist/prerequisites.js +226 -0
- package/dist/scripts/rebuild-index.js +53 -0
- package/dist/scripts/seed-fixtures.js +290 -0
- package/dist/services/activity-segmentation.js +333 -0
- package/dist/services/activity-segmentation.test.js +191 -0
- package/dist/services/app-normalization.js +212 -0
- package/dist/services/cluster-merge.js +69 -0
- package/dist/services/clustering.js +237 -0
- package/dist/services/debug.js +58 -0
- package/dist/services/frame-sampling.js +318 -0
- package/dist/services/signal-extraction.js +106 -0
- package/dist/services/subject-grouping.js +342 -0
- package/dist/services/temporal-alignment.js +99 -0
- package/dist/services/vlm-enrichment.js +84 -0
- package/dist/services/vlm-service.js +130 -0
- package/dist/stats/index.js +3 -0
- package/dist/stats/observer.js +65 -0
- package/dist/stats/repository.js +36 -0
- package/dist/stats/resource-tracker.js +86 -0
- package/dist/stats/types.js +1 -0
- package/dist/test-classification-prompts.js +181 -0
- package/dist/tests/cap.adapter.test.js +75 -0
- package/dist/tests/capture.cap.adapter.test.js +69 -0
- package/dist/tests/classify-session.test.js +140 -0
- package/dist/tests/db/repositories.test.js +243 -0
- package/dist/tests/domain/time-range.test.js +31 -0
- package/dist/tests/integration.test.js +84 -0
- package/dist/tests/intelligence.adapter.test.js +102 -0
- package/dist/tests/intelligence.ollama.adapter.test.js +178 -0
- package/dist/tests/process-v2.test.js +90 -0
- package/dist/tests/services/clustering.test.js +112 -0
- package/dist/tests/services/frame-sampling.test.js +152 -0
- package/dist/tests/utils/ocr.test.js +76 -0
- package/dist/tests/utils/parallel.test.js +57 -0
- package/dist/tests/visual-observer.test.js +175 -0
- package/dist/utils/id-normalization.js +15 -0
- package/dist/utils/index.js +9 -0
- package/dist/utils/model-detector.js +154 -0
- package/dist/utils/ocr.js +80 -0
- package/dist/utils/parallel.js +32 -0
- package/migrations/001_initial.sql +109 -0
- package/migrations/002_clusters.sql +41 -0
- package/migrations/003_observations_vlm_fields.sql +14 -0
- package/migrations/004_observations_unique.sql +18 -0
- package/migrations/005_processing_stats.sql +29 -0
- package/migrations/006_vlm_raw_response.sql +6 -0
- package/migrations/007_subjects.sql +23 -0
- package/migrations/008_artifacts_recording.sql +6 -0
- package/migrations/009_artifact_subjects.sql +10 -0
- package/package.json +82 -0
- package/prompts/action-items.md +55 -0
- package/prompts/blog-draft.md +54 -0
- package/prompts/blog-research.md +87 -0
- package/prompts/card.md +54 -0
- package/prompts/classify-segment.md +38 -0
- package/prompts/classify.md +37 -0
- package/prompts/code-snippets.md +163 -0
- package/prompts/extract-metadata.md +149 -0
- package/prompts/notes.md +83 -0
- package/prompts/runbook.md +123 -0
- package/prompts/standup.md +50 -0
- package/prompts/step-by-step.md +125 -0
- package/prompts/subject-grouping.md +31 -0
- package/prompts/summary-v3.md +89 -0
- package/prompts/summary.md +77 -0
- package/prompts/topic-classifier.md +24 -0
- package/prompts/topic-extract.md +13 -0
- package/prompts/vlm-batch.md +21 -0
- package/prompts/vlm-single.md +19 -0
|
@@ -0,0 +1,412 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Escribano - V3 Recording Processor (VLM-First Pipeline)
|
|
3
|
+
*
|
|
4
|
+
* This pipeline replaces the V2 OCR→Embedding→Clustering approach with
|
|
5
|
+
* a VLM-first approach where visual understanding drives segmentation.
|
|
6
|
+
*
|
|
7
|
+
* See ADR-005 for architectural rationale.
|
|
8
|
+
*/
|
|
9
|
+
import { readdir } from 'node:fs/promises';
|
|
10
|
+
import os from 'node:os';
|
|
11
|
+
import path from 'node:path';
|
|
12
|
+
import { generateId } from '../db/helpers.js';
|
|
13
|
+
import { advanceStep, completeProcessing, failProcessing, startProcessing, } from '../domain/recording.js';
|
|
14
|
+
import { log, step } from '../pipeline/context.js';
|
|
15
|
+
import { calculateRequiredTimestamps, } from '../services/frame-sampling.js';
|
|
16
|
+
import { describeFrames } from '../services/vlm-service.js';
|
|
17
|
+
// V3 step order - simplified from V2
const STEP_ORDER_V3 = [
    'vad',
    'transcription',
    'frame_extraction',
    'vlm_enrichment', // Repurposed: now does batch VLM on all sampled frames
    'context_creation',
    'block_formation',
    'complete',
];
/**
 * Decide whether `targetStep` should be skipped when resuming a recording
 * whose last recorded step is `currentStep`.
 *
 * Returns true only for steps that come strictly BEFORE the resume point in
 * STEP_ORDER_V3 (or for everything once the recording is 'complete'); the
 * current step itself is re-run, which keeps resume idempotent.
 */
function shouldSkipStep(currentStep, targetStep) {
    // No recorded progress yet: run everything.
    if (!currentStep) {
        return false;
    }
    // A finished recording skips every step.
    if (currentStep === 'complete') {
        return true;
    }
    // Steps strictly before the resume point are already done.
    return STEP_ORDER_V3.indexOf(targetStep) < STEP_ORDER_V3.indexOf(currentStep);
}
|
|
36
|
+
/**
 * Process a recording using the V3 VLM-First pipeline.
 *
 * Key differences from V2:
 * - No OCR processing step
 * - No embedding generation step
 * - No semantic clustering step
 * - VLM processes all sampled frames (not just sparse selection)
 * - Activity-based segmentation (not embedding similarity)
 *
 * Resume model: each completed step is persisted via `advanceStep` +
 * `updateRecordingInDb`, and `shouldSkipStep` consults that marker so a
 * crashed run can be restarted without redoing finished work.
 *
 * @param {string} recordingId - ID of the recording row to process.
 * @param {object} repos - Repository bundle (recordings, observations, topicBlocks, contexts).
 * @param {object} adapters - Adapter bundle (video, intelligence, preprocessor, transcription).
 * @param {object} [options] - `force: true` deletes prior observations/topic
 *   blocks and resets the recording to 'raw' so the whole pipeline re-runs.
 * @returns {Promise<void>}
 * @throws {Error} If the recording does not exist, or when any step fails
 *   (the failure is persisted on the recording row before rethrowing).
 */
export async function processRecordingV3(recordingId, repos, adapters, options = {}) {
    const dbRecording = repos.recordings.findById(recordingId);
    if (!dbRecording) {
        throw new Error(`Recording ${recordingId} not found`);
    }
    // Handle --force: delete existing observations and reset
    if (options.force) {
        log('info', `[V3] Force flag set, deleting existing data for ${recordingId}...`);
        repos.observations.deleteByRecording(recordingId);
        repos.topicBlocks.deleteByRecording(recordingId);
    }
    // Map DB row (snake_case columns) to the in-memory domain shape (camelCase).
    let recording = {
        id: dbRecording.id,
        status: dbRecording.status,
        processingStep: dbRecording.processing_step,
        errorMessage: dbRecording.error_message,
        videoPath: dbRecording.video_path,
        audioMicPath: dbRecording.audio_mic_path,
        audioSystemPath: dbRecording.audio_system_path,
        capturedAt: dbRecording.captured_at,
        duration: dbRecording.duration,
    };
    // If forced, reset to raw state so no step is skipped on this run.
    if (options.force) {
        recording = {
            ...recording,
            status: 'raw',
            processingStep: null,
            errorMessage: null,
        };
        updateRecordingInDb(repos, recording);
    }
    if (recording.processingStep) {
        log('info', `[V3] Resuming ${recording.id} from step: ${recording.processingStep}`);
    }
    try {
        // Start processing
        if (!shouldSkipStep(recording.processingStep, 'vad')) {
            recording = startProcessing(recording);
            updateRecordingInDb(repos, recording);
        }
        // ============================================
        // AUDIO PIPELINE (reused from V2)
        // ============================================
        if (!shouldSkipStep(recording.processingStep, 'transcription')) {
            log('info', '[V3] Running audio pipeline...');
            const audioObservations = await processAudioPipeline(recording, adapters);
            if (audioObservations.length > 0) {
                await step('save-audio-observations', async () => {
                    repos.observations.saveBatch(audioObservations);
                    log('info', `[V3] Saved ${audioObservations.length} audio observations`);
                    return { itemsProcessed: audioObservations.length };
                }, { itemsTotal: audioObservations.length });
            }
            recording = advanceStep(recording, 'transcription');
            updateRecordingInDb(repos, recording);
        }
        else {
            log('info', '[V3] Skipping audio pipeline (already completed)');
        }
        // ============================================
        // VISUAL PIPELINE (V3: Smart Extraction)
        // ============================================
        if (recording.videoPath) {
            // Step 1: Get video metadata
            const metadata = await adapters.video.getMetadata(recording.videoPath);
            log('info', `[V3] Video: ${Math.round(metadata.duration)}s, ${metadata.width}x${metadata.height}`);
            // Step 2: Scene Detection FIRST (no frame extraction needed)
            let sceneChanges = [];
            // Re-read the row: source_metadata may hold cached scene changes
            // from a previous (interrupted) run. Shadows the outer dbRecording.
            const dbRecording = repos.recordings.findById(recording.id);
            const sourceMetadata = dbRecording?.source_metadata
                ? JSON.parse(dbRecording.source_metadata)
                : {};
            if (sourceMetadata.scene_changes) {
                sceneChanges = sourceMetadata.scene_changes;
                log('info', `[V3] Loaded ${sceneChanges.length} scene changes from DB`);
            }
            else {
                sceneChanges = await step('scene-detection', async () => {
                    const changes = await adapters.video.detectSceneChanges(recording.videoPath);
                    log('info', `[V3] Detected ${changes.length} scene changes`);
                    // Save to DB for resume safety
                    if (dbRecording) {
                        const updatedMetadata = {
                            ...sourceMetadata,
                            scene_changes: changes,
                        };
                        repos.recordings.updateMetadata(recording.id, JSON.stringify(updatedMetadata));
                    }
                    return changes;
                });
            }
            // Step 3: Calculate required timestamps (pure math, no I/O)
            log('info', '[V3] Calculating required frame timestamps...');
            const requiredTimestamps = calculateRequiredTimestamps(metadata.duration, sceneChanges);
            log('info', `[V3] Need ${requiredTimestamps.length} frames (from ${Math.round(metadata.duration)}s video with ${sceneChanges.length} scenes)`);
            // Step 4: Extract ONLY the needed frames
            let extractedFrames = [];
            if (!shouldSkipStep(recording.processingStep, 'frame_extraction')) {
                extractedFrames = await step('frame-extraction-batch', async () => {
                    const framesDir = path.join(os.tmpdir(), 'escribano', recording.id, 'frames');
                    const frames = await adapters.video.extractFramesAtTimestampsBatch(recording.videoPath, requiredTimestamps, framesDir);
                    log('info', `[V3] Extracted ${frames.length} frames`);
                    recording = advanceStep(recording, 'frame_extraction');
                    updateRecordingInDb(repos, recording);
                    return frames;
                });
            }
            else {
                log('info', '[V3] Skipping frame extraction (already completed)');
                // Reload frames from disk if resuming
                const framesDir = path.join(os.tmpdir(), 'escribano', recording.id, 'frames');
                try {
                    const files = await readdir(framesDir);
                    // NOTE(review): pairing files[i] with requiredTimestamps[i]
                    // assumes readdir order matches extraction order — readdir
                    // order is not guaranteed; verify frame filenames sort the
                    // same way they were written. Also `|| i * 10` falls back
                    // whenever the timestamp is 0 (falsy) — confirm intended.
                    extractedFrames = files
                        .filter((f) => f.endsWith('.jpg'))
                        .map((f, i) => ({
                            imagePath: path.join(framesDir, f),
                            timestamp: requiredTimestamps[i] || i * 10,
                        }))
                        .sort((a, b) => a.timestamp - b.timestamp);
                    log('info', `[V3] Reloaded ${extractedFrames.length} frames from disk`);
                }
                catch {
                    // Missing/unreadable frames dir: proceed with zero frames.
                    log('warn', '[V3] Could not reload frames from disk');
                }
            }
            // Step 5: VLM Batch Inference
            if (!shouldSkipStep(recording.processingStep, 'vlm_enrichment')) {
                // Check for already-processed frames (resume safety).
                // Placeholder descriptions ('No description', 'Parse error')
                // do not count as processed and will be retried.
                const existingObs = repos.observations
                    .findByRecording(recording.id)
                    .filter((o) => o.type === 'visual' && o.vlm_description)
                    .filter((o) => !o.vlm_description?.startsWith('No description') &&
                    !o.vlm_description?.startsWith('Parse error'));
                const processedTimestamps = new Set(existingObs.map((o) => o.timestamp));
                const framesToProcess = extractedFrames.filter((f) => !processedTimestamps.has(f.timestamp));
                if (framesToProcess.length < extractedFrames.length) {
                    log('info', `[V3] Found ${existingObs.length} already-processed frames, ${framesToProcess.length} remaining`);
                }
                const vlmItemsTotal = framesToProcess.length;
                await step('vlm-batch-inference', async () => {
                    let framesProcessed = 0;
                    if (framesToProcess.length === 0) {
                        log('info', '[V3] All frames already processed, skipping VLM inference');
                    }
                    else {
                        log('info', `[V3] Frames to process (${framesToProcess.length}):`);
                        framesToProcess.slice(0, 10).forEach((f, i) => {
                            log('info', ` [${i}] ${f.imagePath.split('/').pop()} @ ${f.timestamp}s`);
                        });
                        if (framesToProcess.length > 10) {
                            log('info', ` ... and ${framesToProcess.length - 10} more`);
                        }
                        log('info', '[V3] Starting VLM inference...');
                        // Each frame's result is persisted immediately in the
                        // callback so a crash mid-batch loses at most one frame.
                        await describeFrames(framesToProcess, adapters.intelligence, {
                            recordingId: recording.id,
                            onImageProcessed: (result, progress) => {
                                const observation = {
                                    id: generateId(),
                                    recording_id: recording.id,
                                    type: 'visual',
                                    timestamp: result.timestamp,
                                    end_timestamp: result.timestamp,
                                    image_path: result.imagePath,
                                    ocr_text: null,
                                    vlm_description: result.description,
                                    vlm_raw_response: result.raw_response ?? null,
                                    activity_type: result.activity,
                                    apps: JSON.stringify(result.apps),
                                    topics: JSON.stringify(result.topics),
                                    embedding: null,
                                    text: null,
                                    audio_source: null,
                                    audio_type: null,
                                };
                                repos.observations.save(observation);
                                framesProcessed = progress.current;
                                if (progress.current % 10 === 0) {
                                    log('info', `[V3] Processed ${progress.current}/${progress.total} frames`);
                                }
                            },
                        });
                        const allVisualObs = repos.observations
                            .findByRecording(recording.id)
                            .filter((o) => o.type === 'visual' && o.vlm_description);
                        log('info', `[V3] VLM complete: ${allVisualObs.length} total visual observations`);
                    }
                    recording = advanceStep(recording, 'vlm_enrichment');
                    updateRecordingInDb(repos, recording);
                    // When nothing new was processed, report the count of
                    // previously-saved observations instead of zero.
                    return { itemsProcessed: framesProcessed || existingObs.length };
                }, { itemsTotal: vlmItemsTotal || extractedFrames.length });
            }
            else {
                log('info', '[V3] Skipping VLM inference (already completed)');
            }
            // Phase 2 - Activity Segmentation & TopicBlock Formation
            if (!shouldSkipStep(recording.processingStep, 'block_formation')) {
                await step('activity-segmentation-and-alignment', async () => {
                    // Get all observations for this recording
                    const allObservations = repos.observations.findByRecording(recording.id);
                    const visualObservations = allObservations.filter((o) => o.type === 'visual');
                    const audioObservations = allObservations.filter((o) => o.type === 'audio');
                    log('info', `[V3] Running activity segmentation on ${visualObservations.length} visual observations...`);
                    // Import and run segmentation (lazy import keeps startup light)
                    const { segmentByActivity, getSegmentStats } = await import('../services/activity-segmentation.js');
                    const segments = segmentByActivity(visualObservations);
                    const stats = getSegmentStats(segments);
                    log('info', `[V3] Created ${stats.totalSegments} segments: ${Object.entries(stats.activityTypeCounts)
                        .map(([k, v]) => `${k}=${v}`)
                        .join(', ')}`);
                    // Import and run temporal alignment
                    const { alignAudioToSegments, getAlignmentStats } = await import('../services/temporal-alignment.js');
                    log('info', `[V3] Aligning ${audioObservations.length} audio transcripts to segments...`);
                    const enrichedSegments = alignAudioToSegments(segments, audioObservations);
                    const alignStats = getAlignmentStats(enrichedSegments);
                    log('info', `[V3] Aligned audio: ${alignStats.segmentsWithAudio}/${alignStats.totalSegments} segments have transcripts (${alignStats.totalTranscriptSegments} total transcript segments)`);
                    log('info', `[V3] Creating ${enrichedSegments.length} TopicBlocks...`);
                    let blockCount = 0;
                    for (const segment of enrichedSegments) {
                        // Create context from segment apps/topics
                        const contextIds = [];
                        // Simplified context creation using INSERT OR IGNORE
                        for (const app of segment.apps) {
                            const ctxId = generateId();
                            repos.contexts.saveOrIgnore({
                                id: ctxId,
                                type: 'app',
                                name: app,
                                metadata: JSON.stringify({ source: 'vlm-v3' }),
                            });
                            // Fetch the context to get its ID (existing or newly created)
                            const existingCtx = repos.contexts.findByTypeAndName('app', app);
                            if (existingCtx) {
                                contextIds.push(existingCtx.id);
                            }
                        }
                        for (const topic of segment.topics) {
                            const ctxId = generateId();
                            repos.contexts.saveOrIgnore({
                                id: ctxId,
                                type: 'topic',
                                name: topic,
                                metadata: JSON.stringify({ source: 'vlm-v3' }),
                            });
                            // Fetch the context to get its ID (existing or newly created)
                            const existingCtx = repos.contexts.findByTypeAndName('topic', topic);
                            if (existingCtx) {
                                contextIds.push(existingCtx.id);
                            }
                        }
                        // Create the TopicBlock with enriched classification
                        repos.topicBlocks.save({
                            id: generateId(),
                            recording_id: recording.id,
                            context_ids: JSON.stringify(contextIds),
                            classification: JSON.stringify({
                                activity_type: segment.activityType,
                                key_description: segment.keyDescription,
                                start_time: segment.startTime,
                                end_time: segment.endTime,
                                duration: segment.duration,
                                apps: segment.apps,
                                topics: segment.topics,
                                transcript_count: segment.transcripts.length,
                                has_transcript: segment.combinedTranscript.length > 0,
                                combined_transcript: segment.combinedTranscript,
                            }),
                            duration: segment.duration,
                        });
                        blockCount++;
                    }
                    log('info', `[V3] Created ${blockCount} TopicBlocks`);
                    // Two step markers advance together: contexts and blocks
                    // are created in this single pass.
                    recording = advanceStep(recording, 'context_creation');
                    updateRecordingInDb(repos, recording);
                    recording = advanceStep(recording, 'block_formation');
                    updateRecordingInDb(repos, recording);
                    return { itemsProcessed: blockCount };
                });
            }
            else {
                log('info', '[V3] Skipping segmentation and block formation (already completed)');
            }
        }
        // Complete
        recording = completeProcessing(recording);
        updateRecordingInDb(repos, recording);
        log('info', `[V3] Successfully processed recording ${recording.id}`);
    }
    catch (error) {
        // Persist the failure on the recording row, then rethrow for the caller.
        const message = error.message;
        log('error', `[V3] Processing failed for ${recordingId}: ${message}`);
        recording = failProcessing(recording, message);
        updateRecordingInDb(repos, recording);
        throw error;
    }
}
|
|
344
|
+
/**
 * Process audio sources (reused from V2 with minor modifications)
 *
 * Runs VAD + per-segment transcription over the mic and system audio tracks
 * (when present), sequentially, and returns the resulting audio observation
 * rows. Nothing is persisted here — the caller saves the batch.
 *
 * @param {object} recording - Domain recording (needs id, audioMicPath, audioSystemPath).
 * @param {object} adapters - Needs `preprocessor` (VAD + cleanup) and `transcription`.
 * @returns {Promise<object[]>} Audio observation rows ready for saveBatch.
 */
async function processAudioPipeline(recording, adapters) {
    const observations = [];
    const processSource = async (audioPath, source) => {
        if (!audioPath) {
            log('info', `[V3] No ${source} audio path, skipping...`);
            return;
        }
        log('info', `[V3] Processing ${source} audio: ${audioPath}`);
        // VAD
        const { segments, tempDir } = await step(`vad-${source}`, async () => {
            return await adapters.preprocessor.extractSpeechSegments(audioPath, recording.id);
        });
        if (segments.length === 0) {
            log('info', `[V3] No speech segments found in ${source} audio`);
            await adapters.preprocessor.cleanup(tempDir);
            return;
        }
        log('info', `[V3] Found ${segments.length} segments in ${source} audio`);
        try {
            // Transcription
            await step(`transcription-${source}`, async () => {
                let successCount = 0;
                for (const segment of segments) {
                    try {
                        const text = await adapters.transcription.transcribeSegment(segment.audioPath);
                        if (text.length > 0) {
                            successCount++;
                            observations.push({
                                id: generateId(),
                                recording_id: recording.id,
                                type: 'audio',
                                timestamp: segment.start,
                                end_timestamp: segment.end,
                                text,
                                audio_source: source,
                                audio_type: 'speech',
                                image_path: null,
                                ocr_text: null,
                                vlm_description: null,
                                vlm_raw_response: null,
                                activity_type: null,
                                apps: null,
                                topics: null,
                                embedding: null,
                            });
                        }
                    }
                    catch (error) {
                        // Per-segment failures are non-fatal: log and continue.
                        log('warn', `[V3] Failed to transcribe segment at ${segment.start}s: ${error.message}`);
                    }
                }
                log('info', `[V3] Transcribed ${successCount}/${segments.length} segments for ${source}`);
                return { itemsProcessed: successCount };
            }, { itemsTotal: segments.length });
        }
        finally {
            // FIX: cleanup now runs even if the transcription step throws, so
            // the VAD temp directory is never leaked on failure (previously it
            // only ran on the success path).
            await step(`cleanup-${source}`, async () => {
                await adapters.preprocessor.cleanup(tempDir);
            });
        }
    };
    // Process sequentially
    await processSource(recording.audioMicPath, 'mic');
    await processSource(recording.audioSystemPath, 'system');
    return observations;
}
|
|
410
|
+
/**
 * Persist a recording's current processing state back to the database.
 *
 * @param {object} repos - Repository bundle exposing `recordings.updateStatus`.
 * @param {object} recording - Domain recording whose id/status/step/error are written.
 */
function updateRecordingInDb(repos, recording) {
    const { id, status, processingStep, errorMessage } = recording;
    repos.recordings.updateStatus(id, status, processingStep, errorMessage);
}
|
|
@@ -0,0 +1,183 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Process Session Action
|
|
3
|
+
* @deprecated V2 pipeline - use process-recording-v3.ts instead.
|
|
4
|
+
*
|
|
5
|
+
* Takes a recording and transcribes all available audio sources, creating a Session.
|
|
6
|
+
* Supports multiple audio sources (mic, system) with parallel transcription option.
|
|
7
|
+
*/
|
|
8
|
+
import os from 'node:os';
|
|
9
|
+
import path from 'node:path';
|
|
10
|
+
import { Session } from '../domain/session.js';
|
|
11
|
+
import { Transcript } from '../domain/transcript.js';
|
|
12
|
+
/**
 * Transcribe multiple audio sources, optionally in parallel.
 *
 * Empty transcripts and per-source failures are logged and dropped so one
 * bad source never aborts the others. Result order follows `sources` in
 * parallel mode and completion order (same as input) in sequential mode.
 *
 * @param {{source: string, path: string}[]} sources - Audio sources to transcribe.
 * @param {object} transcriber - Adapter exposing `transcribe(path)`.
 * @param {boolean} [parallel=false] - Transcribe all sources concurrently.
 * @returns {Promise<{source: string, transcript: object}[]>} Non-empty transcripts only.
 */
async function transcribeAudioSources(sources, transcriber, parallel = false) {
    // Single-source worker shared by both modes (previously duplicated
    // verbatim in the parallel and sequential branches). Returns null on an
    // empty transcript or failure so callers can filter it out.
    const transcribeOne = async ({ source, path }) => {
        try {
            console.log(`Transcribing ${source} audio from: ${path}`);
            const transcript = await transcriber.transcribe(path);
            if (Transcript.isEmpty(transcript)) {
                console.log(`Warning: ${source} audio produced empty transcript`);
                return null;
            }
            return { source, transcript };
        }
        catch (error) {
            console.error(`Failed to transcribe ${source} audio:`, error);
            return null;
        }
    };
    if (parallel) {
        console.log('Transcribing audio sources in parallel...');
        const transcribed = await Promise.all(sources.map(transcribeOne));
        return transcribed.filter((t) => t !== null);
    }
    console.log('Transcribing audio sources sequentially...');
    const results = [];
    for (const entry of sources) {
        const result = await transcribeOne(entry);
        if (result !== null) {
            results.push(result);
        }
    }
    return results;
}
|
|
54
|
+
/**
 * Process a recording by transcribing all available audio sources and
 * extracting visual logs, then validating and persisting the session.
 *
 * @param {object} recording - Recording with optional audioMicPath / audioSystemPath / videoPath.
 * @param {object} transcriber - Audio transcription adapter.
 * @param {object} videoService - Frame extraction / visual indexing adapter.
 * @param {object} storageService - Session persistence adapter.
 * @param {object} [intelligenceService] - Optional VLM adapter for frame descriptions.
 * @returns {Promise<object>} The finalized, saved session.
 */
export async function processSession(recording, transcriber, videoService, storageService, intelligenceService) {
    console.log(`Processing recording: ${recording.id}`);
    let session = Session.create(recording);
    const useParallel = process.env.ESCRIBANO_PARALLEL_TRANSCRIPTION === 'true';
    // 1. Audio Transcription — collect whichever sources this recording has.
    const audioSources = [
        { source: 'mic', path: recording.audioMicPath },
        { source: 'system', path: recording.audioSystemPath },
    ].filter((entry) => Boolean(entry.path));
    if (audioSources.length > 0) {
        const transcripts = await transcribeAudioSources(audioSources, transcriber, useParallel);
        session = Session.withTranscripts(session, transcripts);
    }
    // Intermediate save so transcription work survives a later failure.
    await storageService.saveSession(session);
    // 2. Visual Log Extraction — skipped entirely when there is no video.
    if (!recording.videoPath) {
        return finalizeSession(session, [], storageService);
    }
    const { visualLogs, updatedSession } = await extractVisualLogs(session, videoService, intelligenceService);
    return finalizeSession(updatedSession, visualLogs, storageService);
}
|
|
82
|
+
/**
 * Extract visual logs from a video recording.
 *
 * Samples frames, runs OCR+CLIP indexing, folds the resulting visual index
 * into the session (which generates segments), and builds one log entry per
 * cluster. All failures are caught and reported as an empty result so the
 * audio-only path can still complete.
 *
 * @param {object} session - Session whose `recording` supplies the video path.
 * @param {object} videoService - Adapter with extractFramesAtInterval / runVisualIndexing.
 * @param {object} [intelligenceService] - Optional VLM for segment descriptions.
 * @returns {Promise<{visualLogs: object[], updatedSession: object}>}
 */
async function extractVisualLogs(session, videoService, intelligenceService) {
    const { recording } = session;
    if (!recording.videoPath)
        return { visualLogs: [], updatedSession: session };
    console.log(`Extracting visual log from: ${recording.videoPath}`);
    const visualLogDir = path.join(os.homedir(), '.escribano', 'sessions', recording.id, 'visual-log');
    try {
        const sceneResults = await videoService.extractFramesAtInterval(recording.videoPath, 0.3, visualLogDir);
        if (sceneResults.length === 0)
            return { visualLogs: [], updatedSession: session };
        console.log('Running visual analysis (OCR + CLIP)...');
        const indexPath = path.join(visualLogDir, 'visual-index.json');
        const visualIndex = await videoService.runVisualIndexing(visualLogDir, indexPath);
        console.log(`✓ Indexed ${visualIndex.frames.length} frames into ${visualIndex.clusters.length} clusters`);
        // Update session with visual index (generates segments)
        const updatedSession = Session.withVisualIndex(session, visualIndex);
        const descriptions = await getVisualDescriptions(updatedSession, visualIndex, intelligenceService);
        const entries = visualIndex.clusters.map((cluster) => {
            const repFrame = visualIndex.frames.find((f) => f.index === cluster.representativeIdx);
            const vlmDesc = descriptions.find((d) => d.clusterId === cluster.id);
            return {
                timestamp: repFrame?.timestamp || cluster.timeRange[0],
                imagePath: repFrame?.imagePath || '',
                description: vlmDesc?.description,
                // FIX: chain `?.` through ocrText as well — the previous
                // `repFrame?.ocrText.substring(...)` threw a TypeError when a
                // frame existed but its ocrText was null/undefined.
                ocrSummary: repFrame?.ocrText?.substring(0, 200).replace(/\n/g, ' '),
                heuristicLabel: cluster.heuristicLabel,
            };
        });
        return {
            visualLogs: [{ entries, source: 'screen' }],
            updatedSession,
        };
    }
    catch (error) {
        // Best-effort: visual extraction failure degrades to an audio-only session.
        console.error('Failed to extract visual log:', error);
        return { visualLogs: [], updatedSession: session };
    }
}
|
|
123
|
+
/**
 * Get VLM descriptions for relevant segments.
 *
 * Picks the representative frame of each visual-heavy segment's first
 * cluster, sends the batch to the intelligence service, and returns one
 * `{clusterId, timestamp, description}` record per described frame.
 * Returns [] when there is nothing to describe, no service, or on failure.
 *
 * @param {object} session - Session used to find segments needing VLM.
 * @param {object} visualIndex - Index providing clusters and frames.
 * @param {object} [intelligenceService] - Adapter exposing `describeImages`.
 * @returns {Promise<{clusterId: number, timestamp: number, description: string}[]>}
 */
async function getVisualDescriptions(session, visualIndex, intelligenceService) {
    const segmentsNeedingVLM = Session.getSegmentsNeedingVLM(session);
    if (segmentsNeedingVLM.length === 0 || !intelligenceService) {
        if (segmentsNeedingVLM.length > 0) {
            console.log(' Skipping VLM descriptions (no intelligence service provided)');
        }
        return [];
    }
    console.log(`Describing ${segmentsNeedingVLM.length} visual-heavy segments...`);
    const imagesToDescribe = segmentsNeedingVLM
        .map((seg) => {
            // Find representative frame for the first cluster in segment
            const clusterId = seg.visualClusterIds[0];
            const cluster = visualIndex.clusters.find((c) => c.id === clusterId);
            const repFrame = visualIndex.frames.find((f) => f.index === cluster?.representativeIdx);
            return {
                imagePath: repFrame?.imagePath || '',
                clusterId,
                timestamp: repFrame?.timestamp || 0,
            };
        })
        .filter((img) => img.imagePath);
    try {
        const descResult = await intelligenceService.describeImages(imagesToDescribe);
        // FIX: the previous code hard-coded `clusterId: 0` on every result, so
        // the caller's lookup (`d.clusterId === cluster.id`) only ever matched
        // cluster 0 and all other clusters silently lost their descriptions.
        // Recover the cluster by matching the returned timestamp back to the
        // request that produced it.
        // NOTE(review): assumes describeImages echoes each request's timestamp
        // unchanged — confirm against the intelligence adapter contract.
        return descResult.map((d) => {
            const requested = imagesToDescribe.find((img) => img.timestamp === d.timestamp);
            return {
                clusterId: requested ? requested.clusterId : 0,
                timestamp: d.timestamp,
                description: d.description,
            };
        });
    }
    catch (descError) {
        console.warn(` Warning: Visual description failed: ${descError}`);
        return [];
    }
}
|
|
162
|
+
/**
 * Finalize session processing, perform validation and save.
 *
 * Attaches the visual logs and a fresh `updatedAt`, then requires at least
 * one audio transcript OR one visual log entry. An empty session is saved
 * in an error state and the error is thrown; otherwise the session is
 * saved and returned.
 *
 * @param {object} session - Session accumulated by the pipeline.
 * @param {object[]} visualLogs - Visual logs to attach (may be empty).
 * @param {object} storageService - Adapter exposing `saveSession`.
 * @returns {Promise<object>} The saved, finalized session.
 * @throws {Error} When the session has neither audio nor visual content.
 */
async function finalizeSession(session, visualLogs, storageService) {
    const finalSession = { ...session, visualLogs, updatedAt: new Date() };
    const audioCount = finalSession.transcripts.length;
    const visualEntryCount = visualLogs.length > 0 ? visualLogs[0].entries.length : 0;
    if (audioCount === 0 && visualEntryCount === 0) {
        // Nothing usable came out of processing: persist the error state, then fail loudly.
        const message = `Session processing failed: No audio content AND no visual changes detected for recording: ${finalSession.id}`;
        finalSession.status = 'error';
        finalSession.errorMessage = message;
        await storageService.saveSession(finalSession);
        throw new Error(message);
    }
    console.log(`Processing complete. Sources: ${audioCount} audio, ${visualLogs.length} visual. Segments: ${finalSession.segments.length}`);
    await storageService.saveSession(finalSession);
    return finalSession;
}
|