escribano 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (124) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +297 -0
  3. package/dist/0_types.js +279 -0
  4. package/dist/actions/classify-session.js +77 -0
  5. package/dist/actions/create-contexts.js +44 -0
  6. package/dist/actions/create-topic-blocks.js +68 -0
  7. package/dist/actions/extract-metadata.js +24 -0
  8. package/dist/actions/generate-artifact-v3.js +296 -0
  9. package/dist/actions/generate-artifact.js +61 -0
  10. package/dist/actions/generate-summary-v3.js +260 -0
  11. package/dist/actions/outline-index.js +204 -0
  12. package/dist/actions/process-recording-v2.js +494 -0
  13. package/dist/actions/process-recording-v3.js +412 -0
  14. package/dist/actions/process-session.js +183 -0
  15. package/dist/actions/publish-summary-v3.js +303 -0
  16. package/dist/actions/sync-to-outline.js +196 -0
  17. package/dist/adapters/audio.silero.adapter.js +69 -0
  18. package/dist/adapters/cap.adapter.js +94 -0
  19. package/dist/adapters/capture.cap.adapter.js +107 -0
  20. package/dist/adapters/capture.filesystem.adapter.js +124 -0
  21. package/dist/adapters/embedding.ollama.adapter.js +141 -0
  22. package/dist/adapters/intelligence.adapter.js +202 -0
  23. package/dist/adapters/intelligence.mlx.adapter.js +395 -0
  24. package/dist/adapters/intelligence.ollama.adapter.js +741 -0
  25. package/dist/adapters/publishing.outline.adapter.js +75 -0
  26. package/dist/adapters/storage.adapter.js +81 -0
  27. package/dist/adapters/storage.fs.adapter.js +83 -0
  28. package/dist/adapters/transcription.whisper.adapter.js +206 -0
  29. package/dist/adapters/video.ffmpeg.adapter.js +405 -0
  30. package/dist/adapters/whisper.adapter.js +168 -0
  31. package/dist/batch-context.js +329 -0
  32. package/dist/db/helpers.js +50 -0
  33. package/dist/db/index.js +95 -0
  34. package/dist/db/migrate.js +80 -0
  35. package/dist/db/repositories/artifact.sqlite.js +77 -0
  36. package/dist/db/repositories/cluster.sqlite.js +92 -0
  37. package/dist/db/repositories/context.sqlite.js +75 -0
  38. package/dist/db/repositories/index.js +10 -0
  39. package/dist/db/repositories/observation.sqlite.js +70 -0
  40. package/dist/db/repositories/recording.sqlite.js +56 -0
  41. package/dist/db/repositories/subject.sqlite.js +64 -0
  42. package/dist/db/repositories/topic-block.sqlite.js +45 -0
  43. package/dist/db/types.js +4 -0
  44. package/dist/domain/classification.js +60 -0
  45. package/dist/domain/context.js +97 -0
  46. package/dist/domain/index.js +2 -0
  47. package/dist/domain/observation.js +17 -0
  48. package/dist/domain/recording.js +41 -0
  49. package/dist/domain/segment.js +93 -0
  50. package/dist/domain/session.js +93 -0
  51. package/dist/domain/time-range.js +38 -0
  52. package/dist/domain/transcript.js +79 -0
  53. package/dist/index.js +173 -0
  54. package/dist/pipeline/context.js +162 -0
  55. package/dist/pipeline/events.js +2 -0
  56. package/dist/prerequisites.js +226 -0
  57. package/dist/scripts/rebuild-index.js +53 -0
  58. package/dist/scripts/seed-fixtures.js +290 -0
  59. package/dist/services/activity-segmentation.js +333 -0
  60. package/dist/services/activity-segmentation.test.js +191 -0
  61. package/dist/services/app-normalization.js +212 -0
  62. package/dist/services/cluster-merge.js +69 -0
  63. package/dist/services/clustering.js +237 -0
  64. package/dist/services/debug.js +58 -0
  65. package/dist/services/frame-sampling.js +318 -0
  66. package/dist/services/signal-extraction.js +106 -0
  67. package/dist/services/subject-grouping.js +342 -0
  68. package/dist/services/temporal-alignment.js +99 -0
  69. package/dist/services/vlm-enrichment.js +84 -0
  70. package/dist/services/vlm-service.js +130 -0
  71. package/dist/stats/index.js +3 -0
  72. package/dist/stats/observer.js +65 -0
  73. package/dist/stats/repository.js +36 -0
  74. package/dist/stats/resource-tracker.js +86 -0
  75. package/dist/stats/types.js +1 -0
  76. package/dist/test-classification-prompts.js +181 -0
  77. package/dist/tests/cap.adapter.test.js +75 -0
  78. package/dist/tests/capture.cap.adapter.test.js +69 -0
  79. package/dist/tests/classify-session.test.js +140 -0
  80. package/dist/tests/db/repositories.test.js +243 -0
  81. package/dist/tests/domain/time-range.test.js +31 -0
  82. package/dist/tests/integration.test.js +84 -0
  83. package/dist/tests/intelligence.adapter.test.js +102 -0
  84. package/dist/tests/intelligence.ollama.adapter.test.js +178 -0
  85. package/dist/tests/process-v2.test.js +90 -0
  86. package/dist/tests/services/clustering.test.js +112 -0
  87. package/dist/tests/services/frame-sampling.test.js +152 -0
  88. package/dist/tests/utils/ocr.test.js +76 -0
  89. package/dist/tests/utils/parallel.test.js +57 -0
  90. package/dist/tests/visual-observer.test.js +175 -0
  91. package/dist/utils/id-normalization.js +15 -0
  92. package/dist/utils/index.js +9 -0
  93. package/dist/utils/model-detector.js +154 -0
  94. package/dist/utils/ocr.js +80 -0
  95. package/dist/utils/parallel.js +32 -0
  96. package/migrations/001_initial.sql +109 -0
  97. package/migrations/002_clusters.sql +41 -0
  98. package/migrations/003_observations_vlm_fields.sql +14 -0
  99. package/migrations/004_observations_unique.sql +18 -0
  100. package/migrations/005_processing_stats.sql +29 -0
  101. package/migrations/006_vlm_raw_response.sql +6 -0
  102. package/migrations/007_subjects.sql +23 -0
  103. package/migrations/008_artifacts_recording.sql +6 -0
  104. package/migrations/009_artifact_subjects.sql +10 -0
  105. package/package.json +82 -0
  106. package/prompts/action-items.md +55 -0
  107. package/prompts/blog-draft.md +54 -0
  108. package/prompts/blog-research.md +87 -0
  109. package/prompts/card.md +54 -0
  110. package/prompts/classify-segment.md +38 -0
  111. package/prompts/classify.md +37 -0
  112. package/prompts/code-snippets.md +163 -0
  113. package/prompts/extract-metadata.md +149 -0
  114. package/prompts/notes.md +83 -0
  115. package/prompts/runbook.md +123 -0
  116. package/prompts/standup.md +50 -0
  117. package/prompts/step-by-step.md +125 -0
  118. package/prompts/subject-grouping.md +31 -0
  119. package/prompts/summary-v3.md +89 -0
  120. package/prompts/summary.md +77 -0
  121. package/prompts/topic-classifier.md +24 -0
  122. package/prompts/topic-extract.md +13 -0
  123. package/prompts/vlm-batch.md +21 -0
  124. package/prompts/vlm-single.md +19 -0
@@ -0,0 +1,212 @@
1
+ /**
2
+ * Escribano - App Name Normalization Service
3
+ *
4
+ * Normalizes and deduplicates app names extracted from VLM descriptions.
5
+ * Uses fuzzy matching and known alias maps to produce consistent app names.
6
+ */
7
/**
 * Alias table: normalized spelling -> canonical app name.
 *
 * Keys are written the way `normalizeAppName` cleans its input: lowercase,
 * only ASCII letters/digits, words separated by a single space.
 *
 * The table is built on a prototype-less object so that inherited
 * `Object.prototype` members (e.g. `constructor`) can never be mistaken for an
 * alias by a plain `KNOWN_ALIASES[key]` lookup. It is frozen because it is
 * read-only module state. Declaration order is preserved for iteration.
 */
const KNOWN_ALIASES = Object.freeze(
  Object.assign(Object.create(null), {
    // Terminals
    ghosty: 'Ghostty',
    ghosttie: 'Ghostty',
    'ghostty terminal': 'Ghostty',
    iterm: 'iTerm',
    iterm2: 'iTerm',
    'iterm 2': 'iTerm',
    // Editors
    'vs code': 'VSCode',
    'visual studio code': 'VSCode',
    vscode: 'VSCode',
    'visual studio': 'VSCode',
    // Browsers
    chrome: 'Google Chrome',
    'google chrome': 'Google Chrome',
    safari: 'Safari',
    firefox: 'Firefox',
    // Communication
    slack: 'Slack',
    zoom: 'Zoom',
    'google meet': 'Google Meet',
    teams: 'Microsoft Teams',
    'microsoft teams': 'Microsoft Teams',
    discord: 'Discord',
    whatsapp: 'WhatsApp',
    telegram: 'Telegram',
    // Productivity / dev platforms
    notion: 'Notion',
    figma: 'Figma',
    github: 'GitHub',
    gitlab: 'GitLab',
    bitbucket: 'Bitbucket',
    // System and bundled apps
    terminal: 'Terminal',
    finder: 'Finder',
    mail: 'Mail',
    gmail: 'Gmail',
    calendar: 'Calendar',
    notes: 'Notes',
    spotify: 'Spotify',
    music: 'Music',
    photos: 'Photos',
    preview: 'Preview',
    quicktime: 'QuickTime Player',
    'quicktime player': 'QuickTime Player',
    'activity monitor': 'Activity Monitor',
    'system preferences': 'System Preferences',
    settings: 'System Settings',
    'system settings': 'System Settings',
    // Databases
    tableplus: 'TablePlus',
    postgres: 'PostgreSQL',
    postgresql: 'PostgreSQL',
    sqlite: 'SQLite',
    sqlitebrowser: 'SQLite',
  }),
);
57
/**
 * Lowercase strings that are never accepted as an app name: English filler
 * words, generic placeholders, and operating-system names.
 */
const NOISY_APP_NAMES = new Set(
  [
    // Filler words
    'a the and or in on at to for of with',
    // Generic placeholders
    'unknown unidentified application app program software window screen desktop',
  ]
    .flatMap((words) => words.split(' '))
    // Operating-system names (some contain spaces, so they are listed whole)
    .concat(['mac os', 'macos', 'os x', 'operating system']),
);
83
/**
 * Levenshtein edit distance between two strings, compared per UTF-16 code unit.
 *
 * Only the previous and current rows of the dynamic-programming table are kept.
 *
 * @param {string} a - First string.
 * @param {string} b - Second string.
 * @returns {number} Minimum number of insertions, deletions and substitutions
 *   that turn one string into the other.
 */
function levenshteinDistance(a, b) {
  // Row 0: turning the empty prefix of `b` into each prefix of `a`.
  let previousRow = Array.from({ length: a.length + 1 }, (_, column) => column);
  for (let row = 1; row <= b.length; row += 1) {
    const currentRow = [row];
    const rowChar = b.charAt(row - 1);
    for (let column = 1; column <= a.length; column += 1) {
      currentRow[column] =
        rowChar === a.charAt(column - 1)
          ? previousRow[column - 1]
          : 1 + Math.min(previousRow[column - 1], currentRow[column - 1], previousRow[column]);
    }
    previousRow = currentRow;
  }
  return previousRow[a.length];
}
103
/**
 * Map a raw app name to its canonical spelling.
 *
 * The name is lowercased, stripped of everything except ASCII letters, digits
 * and whitespace, and whitespace-collapsed before it is looked up in
 * `KNOWN_ALIASES`:
 *   1. an exact alias hit wins;
 *   2. otherwise the LONGEST alias contained in the cleaned name is used, so a
 *      specific alias ("gmail") is not shadowed by a shorter one that happens
 *      to be declared earlier ("mail"); ties keep declaration order;
 *   3. otherwise the input is returned trimmed, with its original casing.
 *
 * @param {string} name - Raw app name.
 * @returns {string} Canonical app name, or the trimmed input when no alias matches.
 */
function normalizeAppName(name) {
  const cleaned = name
    .trim()
    .toLowerCase()
    .replace(/[^a-z0-9\s]/g, '')
    .replace(/\s+/g, ' ')
    .trim();
  // Own-property check: inherited keys such as "constructor" are not aliases.
  if (Object.prototype.hasOwnProperty.call(KNOWN_ALIASES, cleaned)) {
    return KNOWN_ALIASES[cleaned];
  }
  let bestAlias = '';
  for (const alias of Object.keys(KNOWN_ALIASES)) {
    // Strict ">" keeps the first-declared alias when two matches have equal length.
    if (alias.length > bestAlias.length && cleaned.includes(alias)) {
      bestAlias = alias;
    }
  }
  return bestAlias === '' ? name.trim() : KNOWN_ALIASES[bestAlias];
}
120
/**
 * Decide whether a name should be discarded instead of treated as an app.
 *
 * A name is noisy when its lowercased, trimmed form is listed in
 * `NOISY_APP_NAMES`, is shorter than 2 characters, or is longer than 50.
 *
 * @param {string} name - Candidate app name.
 * @returns {boolean} `true` when the name should be dropped.
 */
function isNoisyAppName(name) {
  const candidate = name.toLowerCase().trim();
  const { length } = candidate;
  return NOISY_APP_NAMES.has(candidate) || length < 2 || length > 50;
}
130
/**
 * Find an already-known app whose name is nearly identical to `name`.
 *
 * Similarity is `1 - levenshtein / max(length)` on the lowercased names; the
 * first known app (in iteration order) reaching 0.85 is returned.
 *
 * @param {string} name - App name to look up (normalized again internally).
 * @param {Iterable<string>} knownApps - Names accepted so far.
 * @returns {string|null} The matching known name, or `null` when `name` is
 *   noisy or nothing is similar enough.
 */
function fuzzyMatchAppName(name, knownApps) {
  const candidate = normalizeAppName(name);
  if (isNoisyAppName(candidate)) {
    return null;
  }
  const needle = candidate.toLowerCase();
  for (const known of knownApps) {
    const edits = levenshteinDistance(needle, known.toLowerCase());
    const longest = Math.max(candidate.length, known.length);
    if (1 - edits / longest >= 0.85) {
      return known;
    }
  }
  return null;
}
144
/**
 * Normalize, de-noise and de-duplicate a list of app names.
 *
 * Names are processed shortest first. Each one is canonicalized, dropped if
 * noisy, and folded into an earlier accepted name when the two are a fuzzy
 * match; otherwise it becomes a newly accepted name.
 *
 * @param {string[]} apps - Raw app names.
 * @returns {string[]} Unique canonical names in default `sort()` order.
 */
export function normalizeAppNames(apps) {
  if (apps.length === 0) {
    return [];
  }
  const accepted = new Set(); // names later entries may fuzzy-match against
  const unique = new Set(); // output, in first-seen order until sorted
  const shortestFirst = [...apps].sort((left, right) => left.length - right.length);
  for (const rawName of shortestFirst) {
    const cleaned = rawName.trim();
    if (cleaned === '') {
      continue;
    }
    const canonical = normalizeAppName(cleaned);
    if (isNoisyAppName(canonical)) {
      continue;
    }
    const existing = fuzzyMatchAppName(canonical, accepted);
    if (!existing) {
      accepted.add(canonical);
    }
    unique.add(existing || canonical);
  }
  return [...unique].sort();
}
170
/**
 * Return a shallow copy of `record` whose `apps` list has been normalized.
 *
 * @param {{ apps: string[] }} record - Record carrying an `apps` array.
 * @returns {object} Copy of the record with `apps` replaced; the input is not mutated.
 */
export function normalizeAppNamesInRecord(record) {
  const copy = { ...record };
  copy.apps = normalizeAppNames(record.apps);
  return copy;
}
176
/**
 * Normalize app names across a batch of records so that every record uses the
 * same canonical spelling for the same app.
 *
 * A global canonical list is computed from all records first; each raw name is
 * then mapped to its fuzzy match in that list (or to its own normalized form).
 * Per record, noisy names are dropped, duplicates removed and the result sorted.
 *
 * @param {Array<{ apps: string[] }>} records - Records carrying `apps` arrays.
 * @returns {object[]} Shallow copies of the records with `apps` replaced.
 */
export function normalizeAppNamesInRecords(records) {
  const allApps = records.flatMap((record) => record.apps);
  // Built once: the canonical set does not change while the mapping is filled.
  const canonicalApps = new Set(normalizeAppNames(allApps));
  const appMapping = new Map();
  for (const app of allApps) {
    // The same raw name always maps to the same result; compute it only once.
    if (appMapping.has(app)) {
      continue;
    }
    const normalized = normalizeAppName(app);
    appMapping.set(app, fuzzyMatchAppName(normalized, canonicalApps) || normalized);
  }
  return records.map((record) => {
    const kept = record.apps
      .map((app) => appMapping.get(app) || normalizeAppName(app))
      .filter((app) => !isNoisyAppName(app));
    // A Set keeps first occurrences, matching an indexOf-based dedupe in linear time.
    return { ...record, apps: [...new Set(kept)].sort() };
  });
}
194
/**
 * Tell whether an app name refers to a personal messaging / social app.
 *
 * The check is a case-insensitive substring match against a fixed keyword list.
 *
 * @param {string} app - App name.
 * @returns {boolean} `true` when the name contains one of the keywords.
 */
export function isPersonalApp(app) {
  const haystack = app.toLowerCase().trim();
  const keywords = [
    'whatsapp',
    'instagram',
    'tiktok',
    'telegram',
    'facebook',
    'twitter',
    'snapchat',
    'discord',
    'messenger',
    'signal',
    'facetime',
    'imessage',
    'messages',
  ];
  for (const keyword of keywords) {
    if (haystack.includes(keyword)) {
      return true;
    }
  }
  return false;
}
@@ -0,0 +1,69 @@
1
+ /**
2
+ * Escribano - Cluster Merge Service
3
+ *
4
+ * Merges audio clusters with visual clusters based on classification similarity.
5
+ * Many-to-many: one audio cluster can merge with multiple visual clusters.
6
+ */
7
const MERGE_THRESHOLD = 0.6; // Minimum centroid similarity for a fallback merge

/**
 * Find all valid merges between visual and audio clusters.
 *
 * Every (audio, visual) pair is evaluated independently, so the result is
 * many-to-many: one audio cluster can merge with several visual clusters.
 * Results are ordered by audio cluster first, then by visual cluster.
 *
 * @param {object[]} visualClusters - Items shaped `{ cluster: { id }, signals: { topics, apps, projects }, centroid }`.
 * @param {object[]} audioClusters - Items with the same shape.
 * @param {{ similarity: Function }} embeddingService - Provides `similarity(a, b)` for two centroids.
 * @returns {object[]} Merge descriptors `{ visualClusterId, audioClusterId, similarityScore, mergeReason }`.
 */
export function findClusterMerges(visualClusters, audioClusters, embeddingService) {
  const merges = [];
  for (const audio of audioClusters) {
    for (const visual of visualClusters) {
      const merge = computeMerge(visual, audio, embeddingService);
      if (merge) {
        merges.push(merge);
      }
    }
  }
  return merges;
}

/**
 * Decide whether one visual cluster and one audio cluster should merge.
 *
 * Rules are tried in order and the first that applies wins:
 *   1. shared topic   - case-insensitive, either topic contains the other (score 1.0)
 *   2. shared app     - exact string match (score 0.9)
 *   3. shared project - exact string match (score 0.85)
 *   4. centroid similarity of at least MERGE_THRESHOLD (score = similarity)
 *
 * @returns {object|null} Merge descriptor, or `null` when no rule applies.
 */
function computeMerge(visual, audio, embeddingService) {
  const describe = (similarityScore, mergeReason) => ({
    visualClusterId: visual.cluster.id,
    audioClusterId: audio.cluster.id,
    similarityScore,
    mergeReason,
  });

  // Blank topics are dropped: `x.includes('')` is always true, so an empty
  // topic would otherwise "match" every other topic with a perfect score.
  const usableTopics = (topics) =>
    topics.filter((topic) => topic.trim() !== '').map((topic) => topic.toLowerCase());
  const visualTopics = usableTopics(visual.signals.topics);
  const audioTopics = usableTopics(audio.signals.topics);
  const sharesTopic = visualTopics.some((visualTopic) =>
    audioTopics.some(
      (audioTopic) => visualTopic.includes(audioTopic) || audioTopic.includes(visualTopic),
    ),
  );
  if (sharesTopic) {
    return describe(1.0, 'shared_topic');
  }

  if (visual.signals.apps.some((app) => audio.signals.apps.includes(app))) {
    return describe(0.9, 'shared_app');
  }

  if (visual.signals.projects.some((project) => audio.signals.projects.includes(project))) {
    return describe(0.85, 'shared_project');
  }

  // Fallback: compare centroids, but only when both clusters have one.
  if (visual.centroid.length > 0 && audio.centroid.length > 0) {
    const similarity = embeddingService.similarity(visual.centroid, audio.centroid);
    if (similarity >= MERGE_THRESHOLD) {
      return describe(similarity, 'centroid_similarity');
    }
  }
  return null;
}
@@ -0,0 +1,237 @@
1
+ /**
2
+ * Escribano - Semantic Clustering Service
3
+ *
4
+ * ═══════════════════════════════════════════════════════════════════════════════
5
+ * ALGORITHM: Agglomerative Hierarchical Clustering with Time Constraints
6
+ * ═══════════════════════════════════════════════════════════════════════════════
7
+ *
8
+ * WHAT IT DOES:
9
+ * Groups observations into semantic clusters based on embedding similarity,
10
+ * while respecting temporal constraints (observations far apart in time
11
+ * shouldn't cluster together even if semantically similar).
12
+ *
13
+ * WHY AGGLOMERATIVE:
14
+ * - No need to specify number of clusters upfront (unlike K-means)
15
+ * - Natural hierarchical structure matches how work sessions evolve
16
+ * - Can stop at any similarity threshold
17
+ *
18
+ * HOW IT WORKS:
19
+ *
20
+ * 1. INITIALIZATION
21
+ * - Start with N clusters, each containing one observation
22
+ * - Pre-compute all pairwise distances (1 - cosine_similarity)
23
+ * - Apply time constraint: if |timestamp_i - timestamp_j| > timeWindow,
24
+ * set distance to Infinity (can never merge)
25
+ *
26
+ * 2. ITERATIVE MERGING
27
+ * - Find the two closest clusters (single-linkage: min distance between any pair)
28
+ * - If closest distance > threshold → STOP (clusters are distinct enough)
29
+ * - Otherwise, merge them into one cluster
30
+ * - Repeat until no more merges possible
31
+ *
32
+ * 3. POST-PROCESSING
33
+ * - Small clusters (< minSize) are merged with their nearest neighbor
34
+ * - Prevents fragmentation from noise or outliers
35
+ *
36
+ * EXAMPLE:
37
+ *
38
+ * Input: 10 observations with embeddings
39
+ *
40
+ * Step 1: [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] (10 clusters)
41
+ * Step 2: [1,2] [3] [4] [5] [6] [7] [8] [9] [10] (obs 1 & 2 merged)
42
+ * Step 3: [1,2] [3,4] [5] [6] [7] [8] [9] [10] (obs 3 & 4 merged)
43
+ * ...
44
+ * Final: [1,2,3,4] [5,6,7] [8,9,10] (3 clusters)
45
+ *
46
+ * TIME CONSTRAINT VISUALIZATION:
47
+ *
48
+ * Time: 0min ──────────────────────────── 60min
49
+ * Obs: ●●●●● ●●●● ●●●●●●
50
+ * └─────┘ └────┘ └──────┘
51
+ * Cluster A Cluster B Cluster C
52
+ *
53
+ * Even if B is semantically similar to A, they won't merge if
54
+ * the time gap exceeds timeWindowSeconds.
55
+ *
56
+ * ═══════════════════════════════════════════════════════════════════════════════
57
+ */
58
const DEFAULT_CONFIG = {
  timeWindowSeconds: 600, // 10 minutes
  distanceThreshold: 0.4, // i.e. merge at >= 0.6 similarity
  minClusterSize: 3,
};
/**
 * Main clustering function.
 *
 * Observations without an embedding are ignored. The remaining ones start as
 * singleton clusters and the closest pair of clusters is merged repeatedly
 * until the closest pair is farther apart than `distanceThreshold` (or no
 * mergeable pair is left). Clusters smaller than `minClusterSize` are then
 * handed to `absorbSmallClusters`.
 *
 * @param observations - Observations to cluster (must have embeddings)
 * @param embeddingService - Service for computing similarity and centroids
 * @param config - Optional overrides for DEFAULT_CONFIG; `undefined` values
 *   (and a `null` config) fall back to the defaults
 * @returns Array of clusters, sorted by start timestamp
 */
export function clusterObservations(observations, embeddingService, config = {}) {
  // Ignore undefined overrides: spreading `{ distanceThreshold: undefined }`
  // would erase the default and make every threshold comparison false.
  const overrides = Object.fromEntries(
    Object.entries(config ?? {}).filter(([, value]) => value !== undefined),
  );
  const cfg = { ...DEFAULT_CONFIG, ...overrides };
  // Filter to observations with valid embeddings
  const validObs = observations.filter((obs) => obs.embedding?.length);
  if (validObs.length === 0) {
    return [];
  }
  // Parse embeddings from Buffer format
  const embeddings = validObs.map((obs) => bufferToEmbedding(obs.embedding));
  // STEP 1: Initialize - each observation is its own cluster.
  // Clusters are represented as arrays of indices into validObs.
  let clusters = validObs.map((_, index) => [index]);
  // STEP 2: Pre-compute distance matrix with time constraints
  const distances = computeDistanceMatrix(validObs, embeddings, embeddingService, cfg.timeWindowSeconds);
  // STEP 3: Agglomerative merging.
  // N singleton clusters allow at most N - 1 merges, so N is a safe upper bound.
  for (let mergeCount = 0; mergeCount < validObs.length; mergeCount += 1) {
    const closest = findClosestClusterPair(clusters, distances);
    // Stop when no pair can be merged at all (indexA stays -1 when every
    // remaining pair is unreachable) or the closest pair is too far apart.
    if (closest.indexA < 0 || closest.distance > cfg.distanceThreshold) {
      break;
    }
    clusters = mergeClusters(clusters, closest.indexA, closest.indexB);
  }
  // STEP 4: Post-process - absorb small clusters
  clusters = absorbSmallClusters(clusters, distances, cfg.minClusterSize);
  // STEP 5: Build result objects
  return clusters
    .map((indices) => buildClusterResult(indices, validObs, embeddings, embeddingService))
    .sort((a, b) => a.startTimestamp - b.startTimestamp);
}
110
+ // ═══════════════════════════════════════════════════════════════════════════════
111
+ // HELPER FUNCTIONS
112
+ // ═══════════════════════════════════════════════════════════════════════════════
113
/**
 * Convert a BLOB of packed 32-bit floats (platform byte order) to a number array.
 *
 * A `Float32Array` view may only start at a byte offset that is a multiple of
 * 4. A Buffer that is a slice of a larger allocation can start anywhere, so
 * unaligned input is copied into a fresh, aligned buffer before decoding
 * instead of throwing a RangeError. Trailing bytes that do not form a whole
 * float are ignored.
 *
 * @param {Uint8Array} buffer - Raw bytes (a Node.js Buffer is a Uint8Array).
 * @returns {number[]} Decoded float values.
 */
function bufferToEmbedding(buffer) {
  const bytesPerFloat = Float32Array.BYTES_PER_ELEMENT;
  const floatCount = Math.floor(buffer.byteLength / bytesPerFloat);
  if (buffer.byteOffset % bytesPerFloat === 0) {
    // Aligned: decode in place, no copy needed.
    return Array.from(new Float32Array(buffer.buffer, buffer.byteOffset, floatCount));
  }
  const aligned = new Uint8Array(floatCount * bytesPerFloat);
  aligned.set(new Uint8Array(buffer.buffer, buffer.byteOffset, aligned.length));
  return Array.from(new Float32Array(aligned.buffer));
}
120
/**
 * Build the symmetric N x N matrix of pairwise observation distances.
 *
 * distance = 1 - embeddingService.similarity(a, b). The diagonal is 0. Pairs
 * whose timestamps differ by more than `timeWindowSeconds` keep the initial
 * value Infinity.
 *
 * @param observations - Items with a numeric `timestamp`
 * @param embeddings - Embedding vectors, index-aligned with `observations`
 * @param embeddingService - Provides `similarity(a, b)`
 * @param timeWindowSeconds - Largest timestamp gap for which a distance is computed
 * @returns {number[][]} Distance matrix
 */
function computeDistanceMatrix(observations, embeddings, embeddingService, timeWindowSeconds) {
  const size = observations.length;
  const matrix = [];
  for (let row = 0; row < size; row += 1) {
    matrix.push(new Array(size).fill(Infinity));
  }
  for (let row = 0; row < size; row += 1) {
    matrix[row][row] = 0;
    // Upper triangle only; each value is mirrored into the lower triangle.
    for (let col = row + 1; col < size; col += 1) {
      const gap = Math.abs(observations[row].timestamp - observations[col].timestamp);
      if (gap > timeWindowSeconds) {
        continue; // outside the time window: stays Infinity
      }
      const distance = 1 - embeddingService.similarity(embeddings[row], embeddings[col]);
      matrix[row][col] = distance;
      matrix[col][row] = distance;
    }
  }
  return matrix;
}
150
/**
 * Locate the pair of clusters with the smallest cluster-to-cluster distance,
 * as measured by `computeClusterDistance`.
 *
 * @param {number[][]} clusters - Clusters as arrays of observation indices
 * @param {number[][]} distances - Pairwise observation distance matrix
 * @returns {{ indexA: number, indexB: number, distance: number }} Positions of
 *   the winning pair (indexA < indexB) and its distance. When no pair has a
 *   distance below Infinity the result is `{ indexA: -1, indexB: -1, distance: Infinity }`.
 */
function findClosestClusterPair(clusters, distances) {
  let best = { indexA: -1, indexB: -1, distance: Infinity };
  for (let first = 0; first < clusters.length; first += 1) {
    // Start after `first` so each unordered pair is examined exactly once.
    for (let second = first + 1; second < clusters.length; second += 1) {
      const candidate = computeClusterDistance(clusters[first], clusters[second], distances);
      if (candidate < best.distance) {
        best = { indexA: first, indexB: second, distance: candidate };
      }
    }
  }
  return best;
}
172
/**
 * Single-linkage distance between two clusters: the smallest entry of
 * `distances[i][j]` over every `i` in `clusterA` and `j` in `clusterB`.
 *
 * @param {number[]} clusterA - Observation indices of the first cluster
 * @param {number[]} clusterB - Observation indices of the second cluster
 * @param {number[][]} distances - Pairwise observation distance matrix
 * @returns {number} The minimum distance, or Infinity when no entry is smaller
 *   than Infinity (including when either cluster is empty).
 */
function computeClusterDistance(clusterA, clusterB, distances) {
  return clusterA.reduce(
    (bestSoFar, i) =>
      clusterB.reduce(
        (best, j) => (distances[i][j] < best ? distances[i][j] : best),
        bestSoFar,
      ),
    Infinity,
  );
}
187
/**
 * Produce a new cluster list in which the clusters at `indexA` and `indexB`
 * are replaced by their union.
 *
 * The union lists the members of the lower-positioned cluster first and is
 * appended at the end; all other clusters keep their relative order. The
 * input array is not modified.
 *
 * @param {number[][]} clusters - Current clusters
 * @param {number} indexA - Position of one cluster to merge
 * @param {number} indexB - Position of the other cluster to merge
 * @returns {number[][]} New cluster list
 */
function mergeClusters(clusters, indexA, indexB) {
  const low = Math.min(indexA, indexB);
  const high = Math.max(indexA, indexB);
  const union = [...clusters[low], ...clusters[high]];
  const remaining = [];
  clusters.forEach((cluster, position) => {
    if (position !== low && position !== high) {
      remaining.push(cluster);
    }
  });
  remaining.push(union);
  return remaining;
}
199
/**
 * Absorb clusters smaller than `minSize` into their nearest large cluster.
 *
 * - When no cluster reaches `minSize`, the input is returned unchanged.
 * - A small cluster joins the large cluster with the smallest
 *   `computeClusterDistance`; clusters absorbed earlier count as part of that
 *   large cluster for later comparisons.
 * - A small cluster that is unreachable from every large cluster (all
 *   distances are Infinity, i.e. outside the time window) is kept as its own
 *   cluster rather than being attached to an arbitrary one.
 *
 * @param {number[][]} clusters - Clusters as arrays of observation indices
 * @param {number[][]} distances - Pairwise observation distance matrix
 * @param {number} minSize - Minimum size for a cluster to count as large
 * @returns {number[][]} Large clusters (with absorbed members) followed by any
 *   unreachable small clusters.
 */
function absorbSmallClusters(clusters, distances, minSize) {
  const large = clusters.filter((cluster) => cluster.length >= minSize);
  if (large.length === 0) {
    // All clusters are small - nothing to absorb into.
    return clusters;
  }
  const unreachable = [];
  for (const smallCluster of clusters) {
    if (smallCluster.length >= minSize) {
      continue;
    }
    let nearestIndex = -1;
    let nearestDistance = Infinity;
    for (const [index, largeCluster] of large.entries()) {
      const dist = computeClusterDistance(smallCluster, largeCluster, distances);
      if (dist < nearestDistance) {
        nearestDistance = dist;
        nearestIndex = index;
      }
    }
    if (nearestIndex === -1) {
      // No large cluster is within reach; merging anyway would join unrelated
      // observations.
      unreachable.push(smallCluster);
      continue;
    }
    large[nearestIndex] = [...large[nearestIndex], ...smallCluster];
  }
  return [...large, ...unreachable];
}
224
/**
 * Assemble the result object for one cluster.
 *
 * @param {number[]} indices - Positions of the cluster's members in `observations` / `embeddings`
 * @param {object[]} observations - All observations; members need `timestamp` and may carry `end_timestamp`
 * @param {number[][]} embeddings - Embedding vectors, index-aligned with `observations`
 * @param {{ centroid: Function }} embeddingService - Provides `centroid(vectors)`
 * @returns {object} `{ clusterId, observations, centroid, startTimestamp, endTimestamp }`.
 *   `startTimestamp` is the smallest member `timestamp`; `endTimestamp` is the
 *   largest member `end_timestamp`, using `timestamp` where that is null/undefined.
 */
function buildClusterResult(indices, observations, embeddings, embeddingService) {
  const members = [];
  const memberEmbeddings = [];
  for (const position of indices) {
    members.push(observations[position]);
    memberEmbeddings.push(embeddings[position]);
  }
  const startOf = (member) => member.timestamp;
  const endOf = (member) => member.end_timestamp ?? member.timestamp;
  return {
    // Placeholder, replaced with UUIDv7 later
    clusterId: `cluster-${Date.now()}`,
    observations: members,
    centroid: embeddingService.centroid(memberEmbeddings),
    startTimestamp: Math.min(...members.map(startOf)),
    endTimestamp: Math.max(...members.map(endOf)),
  };
}
@@ -0,0 +1,58 @@
1
+ /**
2
+ * Escribano - Debug Utilities
3
+ * @deprecated No longer needed - all data is stored in the database.
4
+ * Kept for backward compatibility only.
5
+ *
6
+ * Utilities for saving debug artifacts (VLM responses, frame copies) during processing.
7
+ */
8
+ import { copyFile, mkdir, writeFile } from 'node:fs/promises';
9
+ import { homedir } from 'node:os';
10
+ import path, { dirname } from 'node:path';
11
// Debug output is opt-in: it is only written when ESCRIBANO_DEBUG_VLM is exactly "true".
const DEBUG_ENABLED = process.env.ESCRIBANO_DEBUG_VLM === 'true';
// Root folder for all debug artifacts: <home>/.escribano/debug
const DEBUG_DIR = path.join(homedir(), '.escribano', 'debug');
/**
 * Create the debug folders for one recording (`vlm-responses` and `frames`
 * under `<DEBUG_DIR>/<recordingId>`).
 *
 * @param {string} recordingId - Recording identifier, used as the folder name.
 * @returns {Promise<string>} The recording's debug folder, or `''` when debug
 *   output is disabled (nothing is created in that case).
 * @deprecated
 */
export async function initDebugDir(recordingId) {
  if (!DEBUG_ENABLED) {
    return '';
  }
  const debugPath = path.join(DEBUG_DIR, recordingId);
  for (const subfolder of ['vlm-responses', 'frames']) {
    await mkdir(path.join(debugPath, subfolder), { recursive: true });
  }
  return debugPath;
}
27
/**
 * Write one VLM response as pretty-printed JSON to
 * `<DEBUG_DIR>/<recordingId>/vlm-responses/batch-NNN-response.json`, where NNN
 * is `batchIndex` left-padded with zeros to three characters.
 * Does nothing when debug output is disabled.
 *
 * @param {string} recordingId - Recording identifier.
 * @param {number} batchIndex - Batch number used in the file name.
 * @param {*} response - Value serialized with `JSON.stringify`.
 * @returns {Promise<void>}
 * @deprecated
 */
export async function saveVlmResponse(recordingId, batchIndex, response) {
  if (!DEBUG_ENABLED) {
    return;
  }
  const batchLabel = String(batchIndex).padStart(3, '0');
  const fileName = `batch-${batchLabel}-response.json`;
  const filePath = path.join(DEBUG_DIR, recordingId, 'vlm-responses', fileName);
  // The folder may not exist yet if initDebugDir was never called.
  await mkdir(dirname(filePath), { recursive: true });
  const payload = JSON.stringify(response, null, 2);
  await writeFile(filePath, payload, 'utf-8');
}
39
/**
 * Copy sampled frames into `<DEBUG_DIR>/<recordingId>/frames/batch-NNN/`,
 * naming each copy `frame-III-tT.T.jpg` (zero-padded frame index, timestamp
 * with one decimal). Frames are copied one at a time; a failed copy is logged
 * with `console.warn` and the remaining frames are still processed.
 * Does nothing when debug output is disabled.
 *
 * @param {string} recordingId - Recording identifier.
 * @param {number} batchIndex - Batch number used in the folder name.
 * @param {Array<{ index: number, timestamp: number, imagePath: string }>} frames - Frames to copy.
 * @returns {Promise<void>}
 * @deprecated
 */
export async function copyFramesForDebug(recordingId, batchIndex, frames) {
  if (!DEBUG_ENABLED) {
    return;
  }
  const pad3 = (value) => String(value).padStart(3, '0');
  const batchFramesDir = path.join(DEBUG_DIR, recordingId, 'frames', `batch-${pad3(batchIndex)}`);
  await mkdir(batchFramesDir, { recursive: true });
  for (const { index, timestamp, imagePath } of frames) {
    const destPath = path.join(
      batchFramesDir,
      `frame-${pad3(index)}-t${timestamp.toFixed(1)}.jpg`,
    );
    try {
      await copyFile(imagePath, destPath);
    } catch (error) {
      // Best effort: debug copies must never abort processing.
      console.warn(`[Debug] Failed to copy frame ${imagePath}:`, error.message);
    }
  }
}