escribano 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +297 -0
- package/dist/0_types.js +279 -0
- package/dist/actions/classify-session.js +77 -0
- package/dist/actions/create-contexts.js +44 -0
- package/dist/actions/create-topic-blocks.js +68 -0
- package/dist/actions/extract-metadata.js +24 -0
- package/dist/actions/generate-artifact-v3.js +296 -0
- package/dist/actions/generate-artifact.js +61 -0
- package/dist/actions/generate-summary-v3.js +260 -0
- package/dist/actions/outline-index.js +204 -0
- package/dist/actions/process-recording-v2.js +494 -0
- package/dist/actions/process-recording-v3.js +412 -0
- package/dist/actions/process-session.js +183 -0
- package/dist/actions/publish-summary-v3.js +303 -0
- package/dist/actions/sync-to-outline.js +196 -0
- package/dist/adapters/audio.silero.adapter.js +69 -0
- package/dist/adapters/cap.adapter.js +94 -0
- package/dist/adapters/capture.cap.adapter.js +107 -0
- package/dist/adapters/capture.filesystem.adapter.js +124 -0
- package/dist/adapters/embedding.ollama.adapter.js +141 -0
- package/dist/adapters/intelligence.adapter.js +202 -0
- package/dist/adapters/intelligence.mlx.adapter.js +395 -0
- package/dist/adapters/intelligence.ollama.adapter.js +741 -0
- package/dist/adapters/publishing.outline.adapter.js +75 -0
- package/dist/adapters/storage.adapter.js +81 -0
- package/dist/adapters/storage.fs.adapter.js +83 -0
- package/dist/adapters/transcription.whisper.adapter.js +206 -0
- package/dist/adapters/video.ffmpeg.adapter.js +405 -0
- package/dist/adapters/whisper.adapter.js +168 -0
- package/dist/batch-context.js +329 -0
- package/dist/db/helpers.js +50 -0
- package/dist/db/index.js +95 -0
- package/dist/db/migrate.js +80 -0
- package/dist/db/repositories/artifact.sqlite.js +77 -0
- package/dist/db/repositories/cluster.sqlite.js +92 -0
- package/dist/db/repositories/context.sqlite.js +75 -0
- package/dist/db/repositories/index.js +10 -0
- package/dist/db/repositories/observation.sqlite.js +70 -0
- package/dist/db/repositories/recording.sqlite.js +56 -0
- package/dist/db/repositories/subject.sqlite.js +64 -0
- package/dist/db/repositories/topic-block.sqlite.js +45 -0
- package/dist/db/types.js +4 -0
- package/dist/domain/classification.js +60 -0
- package/dist/domain/context.js +97 -0
- package/dist/domain/index.js +2 -0
- package/dist/domain/observation.js +17 -0
- package/dist/domain/recording.js +41 -0
- package/dist/domain/segment.js +93 -0
- package/dist/domain/session.js +93 -0
- package/dist/domain/time-range.js +38 -0
- package/dist/domain/transcript.js +79 -0
- package/dist/index.js +173 -0
- package/dist/pipeline/context.js +162 -0
- package/dist/pipeline/events.js +2 -0
- package/dist/prerequisites.js +226 -0
- package/dist/scripts/rebuild-index.js +53 -0
- package/dist/scripts/seed-fixtures.js +290 -0
- package/dist/services/activity-segmentation.js +333 -0
- package/dist/services/activity-segmentation.test.js +191 -0
- package/dist/services/app-normalization.js +212 -0
- package/dist/services/cluster-merge.js +69 -0
- package/dist/services/clustering.js +237 -0
- package/dist/services/debug.js +58 -0
- package/dist/services/frame-sampling.js +318 -0
- package/dist/services/signal-extraction.js +106 -0
- package/dist/services/subject-grouping.js +342 -0
- package/dist/services/temporal-alignment.js +99 -0
- package/dist/services/vlm-enrichment.js +84 -0
- package/dist/services/vlm-service.js +130 -0
- package/dist/stats/index.js +3 -0
- package/dist/stats/observer.js +65 -0
- package/dist/stats/repository.js +36 -0
- package/dist/stats/resource-tracker.js +86 -0
- package/dist/stats/types.js +1 -0
- package/dist/test-classification-prompts.js +181 -0
- package/dist/tests/cap.adapter.test.js +75 -0
- package/dist/tests/capture.cap.adapter.test.js +69 -0
- package/dist/tests/classify-session.test.js +140 -0
- package/dist/tests/db/repositories.test.js +243 -0
- package/dist/tests/domain/time-range.test.js +31 -0
- package/dist/tests/integration.test.js +84 -0
- package/dist/tests/intelligence.adapter.test.js +102 -0
- package/dist/tests/intelligence.ollama.adapter.test.js +178 -0
- package/dist/tests/process-v2.test.js +90 -0
- package/dist/tests/services/clustering.test.js +112 -0
- package/dist/tests/services/frame-sampling.test.js +152 -0
- package/dist/tests/utils/ocr.test.js +76 -0
- package/dist/tests/utils/parallel.test.js +57 -0
- package/dist/tests/visual-observer.test.js +175 -0
- package/dist/utils/id-normalization.js +15 -0
- package/dist/utils/index.js +9 -0
- package/dist/utils/model-detector.js +154 -0
- package/dist/utils/ocr.js +80 -0
- package/dist/utils/parallel.js +32 -0
- package/migrations/001_initial.sql +109 -0
- package/migrations/002_clusters.sql +41 -0
- package/migrations/003_observations_vlm_fields.sql +14 -0
- package/migrations/004_observations_unique.sql +18 -0
- package/migrations/005_processing_stats.sql +29 -0
- package/migrations/006_vlm_raw_response.sql +6 -0
- package/migrations/007_subjects.sql +23 -0
- package/migrations/008_artifacts_recording.sql +6 -0
- package/migrations/009_artifact_subjects.sql +10 -0
- package/package.json +82 -0
- package/prompts/action-items.md +55 -0
- package/prompts/blog-draft.md +54 -0
- package/prompts/blog-research.md +87 -0
- package/prompts/card.md +54 -0
- package/prompts/classify-segment.md +38 -0
- package/prompts/classify.md +37 -0
- package/prompts/code-snippets.md +163 -0
- package/prompts/extract-metadata.md +149 -0
- package/prompts/notes.md +83 -0
- package/prompts/runbook.md +123 -0
- package/prompts/standup.md +50 -0
- package/prompts/step-by-step.md +125 -0
- package/prompts/subject-grouping.md +31 -0
- package/prompts/summary-v3.md +89 -0
- package/prompts/summary.md +77 -0
- package/prompts/topic-classifier.md +24 -0
- package/prompts/topic-extract.md +13 -0
- package/prompts/vlm-batch.md +21 -0
- package/prompts/vlm-single.md +19 -0
|
@@ -0,0 +1,212 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Escribano - App Name Normalization Service
|
|
3
|
+
*
|
|
4
|
+
* Normalizes and deduplicates app names extracted from VLM descriptions.
|
|
5
|
+
* Uses fuzzy matching and known alias maps to produce consistent app names.
|
|
6
|
+
*/
|
|
7
|
+
// Lowercase alias -> canonical display name.
// Lookups happen after the candidate has been trimmed, lowercased, and stripped
// of punctuation (see normalizeAppName): first an exact key match, then a
// substring scan in insertion order — so earlier entries win partial matches.
const KNOWN_ALIASES = {
    // Terminals
    ghosty: 'Ghostty',
    ghosttie: 'Ghostty',
    'ghostty terminal': 'Ghostty',
    iterm: 'iTerm',
    iterm2: 'iTerm',
    'iterm 2': 'iTerm',
    // Editors / IDEs
    'vs code': 'VSCode',
    'visual studio code': 'VSCode',
    vscode: 'VSCode',
    'visual studio': 'VSCode',
    // Browsers
    chrome: 'Google Chrome',
    'google chrome': 'Google Chrome',
    safari: 'Safari',
    firefox: 'Firefox',
    // Communication
    slack: 'Slack',
    zoom: 'Zoom',
    'google meet': 'Google Meet',
    teams: 'Microsoft Teams',
    'microsoft teams': 'Microsoft Teams',
    discord: 'Discord',
    whatsapp: 'WhatsApp',
    telegram: 'Telegram',
    // Productivity / dev services
    notion: 'Notion',
    figma: 'Figma',
    github: 'GitHub',
    gitlab: 'GitLab',
    bitbucket: 'Bitbucket',
    // macOS built-ins
    terminal: 'Terminal',
    finder: 'Finder',
    mail: 'Mail',
    gmail: 'Gmail',
    calendar: 'Calendar',
    notes: 'Notes',
    spotify: 'Spotify',
    music: 'Music',
    photos: 'Photos',
    preview: 'Preview',
    quicktime: 'QuickTime Player',
    'quicktime player': 'QuickTime Player',
    'activity monitor': 'Activity Monitor',
    'system preferences': 'System Preferences',
    settings: 'System Settings',
    'system settings': 'System Settings',
    // Databases
    tableplus: 'TablePlus',
    postgres: 'PostgreSQL',
    postgresql: 'PostgreSQL',
    sqlite: 'SQLite',
    sqlitebrowser: 'SQLite',
};
|
|
57
|
+
// Names that should never be kept as applications: stop words, generic UI
// nouns, and OS references that the VLM sometimes emits instead of an app.
// Compared against the lowercased, trimmed candidate (see isNoisyAppName).
const NOISY_APP_NAMES = new Set([
    'a',
    'the',
    'and',
    'or',
    'in',
    'on',
    'at',
    'to',
    'for',
    'of',
    'with',
    'unknown',
    'unidentified',
    'application',
    'app',
    'program',
    'software',
    'window',
    'screen',
    'desktop',
    'mac os',
    'macos',
    'os x',
    'operating system',
]);
|
|
83
|
+
/**
 * Compute the Levenshtein edit distance between two strings.
 * Uses a rolling two-row dynamic program instead of a full matrix.
 */
function levenshteinDistance(a, b) {
    // previous[j] = distance between b[0..i-1] and a[0..j-1]
    let previous = Array.from({ length: a.length + 1 }, (_, j) => j);
    for (let i = 1; i <= b.length; i++) {
        const current = [i];
        for (let j = 1; j <= a.length; j++) {
            const substitutionCost = b.charAt(i - 1) === a.charAt(j - 1) ? 0 : 1;
            current[j] = Math.min(
                previous[j - 1] + substitutionCost, // substitute (or match)
                current[j - 1] + 1,                 // insert
                previous[j] + 1,                    // delete
            );
        }
        previous = current;
    }
    return previous[a.length];
}
|
|
103
|
+
/**
 * Map a raw app name onto its canonical form.
 * Cleans the name (lowercase, strip punctuation, collapse whitespace),
 * then tries an exact alias lookup followed by a substring scan.
 * Falls back to the trimmed original when no alias applies.
 */
function normalizeAppName(name) {
    const cleaned = name
        .trim()
        .toLowerCase()
        .replace(/[^a-z0-9\s]/g, '')
        .replace(/\s+/g, ' ')
        .trim();
    const exact = KNOWN_ALIASES[cleaned];
    if (exact) {
        return exact;
    }
    // First alias contained in the cleaned name wins (insertion order).
    const partial = Object.entries(KNOWN_ALIASES).find(([alias]) => cleaned.includes(alias));
    return partial ? partial[1] : name.trim();
}
|
|
120
|
+
/**
 * Decide whether a candidate name is noise rather than a real application.
 * Noise = known stop word / generic term, or implausibly short (< 2 chars)
 * or long (> 50 chars).
 */
function isNoisyAppName(name) {
    const candidate = name.toLowerCase().trim();
    return (
        NOISY_APP_NAMES.has(candidate) ||
        candidate.length < 2 ||
        candidate.length > 50
    );
}
|
|
130
|
+
/**
 * Fuzzy-match a name against an iterable of already-known app names.
 * Returns the first known name with >= 0.85 normalized similarity
 * (1 - editDistance / longerLength), or null if none qualifies or the
 * normalized name is noise.
 */
function fuzzyMatchAppName(name, knownApps) {
    const candidate = normalizeAppName(name);
    if (isNoisyAppName(candidate)) {
        return null;
    }
    for (const known of knownApps) {
        const editDistance = levenshteinDistance(candidate.toLowerCase(), known.toLowerCase());
        const longerLength = Math.max(candidate.length, known.length);
        if (1 - editDistance / longerLength >= 0.85) {
            return known;
        }
    }
    return null;
}
|
|
144
|
+
/**
 * Normalize and deduplicate a list of raw app names.
 * Shorter names are processed first so they become the canonical targets
 * that longer variants fuzzy-match against. Noise entries are dropped.
 * Returns the unique canonical names, sorted alphabetically.
 */
export function normalizeAppNames(apps) {
    if (apps.length === 0) {
        return [];
    }
    const canonical = new Set();
    const result = [];
    for (const raw of [...apps].sort((a, b) => a.length - b.length)) {
        const trimmed = raw.trim();
        if (!trimmed) {
            continue;
        }
        let appName = normalizeAppName(trimmed);
        if (isNoisyAppName(appName)) {
            continue;
        }
        const match = fuzzyMatchAppName(appName, canonical);
        if (match) {
            appName = match; // collapse onto the previously seen variant
        }
        else {
            canonical.add(appName);
        }
        if (!result.includes(appName)) {
            result.push(appName);
        }
    }
    return result.sort();
}
|
|
170
|
+
/**
 * Return a copy of `record` with its `apps` list normalized.
 * The input record is not mutated.
 */
export function normalizeAppNamesInRecord(record) {
    const apps = normalizeAppNames(record.apps);
    return { ...record, apps };
}
|
|
176
|
+
/**
 * Normalize app names consistently across a batch of records.
 * A global canonical list is computed once from every record's apps, then
 * each raw name is mapped onto that list so the same raw name resolves to
 * the same canonical name in every record. Noise entries are dropped and
 * each record's list is deduplicated and sorted. Records are not mutated.
 */
export function normalizeAppNamesInRecords(records) {
    const allApps = records.flatMap((r) => r.apps);
    // Build the fuzzy-match candidate set ONCE instead of reconstructing
    // `new Set(globalNormalized)` on every loop iteration (was O(n*m)).
    const globalNormalized = new Set(normalizeAppNames(allApps));
    const appMapping = new Map();
    for (const app of allApps) {
        if (appMapping.has(app)) {
            continue; // duplicate raw name: mapping already computed
        }
        const normalized = normalizeAppName(app);
        const fuzzyMatch = fuzzyMatchAppName(normalized, globalNormalized);
        appMapping.set(app, fuzzyMatch || normalized);
    }
    return records.map((record) => ({
        ...record,
        apps: record.apps
            .map((app) => appMapping.get(app) || normalizeAppName(app))
            .filter((app) => !isNoisyAppName(app))
            .filter((app, index, arr) => arr.indexOf(app) === index)
            .sort(),
    }));
}
|
|
194
|
+
/**
 * Heuristically classify an app as personal/social (vs. work-related).
 * Matches by substring against a fixed keyword list, case-insensitively.
 */
export function isPersonalApp(app) {
    const PERSONAL_APP_KEYWORDS = [
        'whatsapp',
        'instagram',
        'tiktok',
        'telegram',
        'facebook',
        'twitter',
        'snapchat',
        'discord',
        'messenger',
        'signal',
        'facetime',
        'imessage',
        'messages',
    ];
    const needle = app.toLowerCase().trim();
    return PERSONAL_APP_KEYWORDS.some((keyword) => needle.includes(keyword));
}
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Escribano - Cluster Merge Service
|
|
3
|
+
*
|
|
4
|
+
* Merges audio clusters with visual clusters based on classification similarity.
|
|
5
|
+
* Many-to-many: one audio cluster can merge with multiple visual clusters.
|
|
6
|
+
*/
|
|
7
|
+
const MERGE_THRESHOLD = 0.6; // Minimum similarity for merge
/**
 * Find every valid visual/audio cluster pairing.
 * Many-to-many: a single audio cluster may pair with several visual clusters.
 * Pair order: audio-major, visual-minor (same as iterating audio outer,
 * visual inner).
 */
export function findClusterMerges(visualClusters, audioClusters, embeddingService) {
    return audioClusters.flatMap((audio) => visualClusters
        .map((visual) => computeMerge(visual, audio, embeddingService))
        .filter((merge) => merge !== null));
}
|
|
24
|
+
/**
 * Decide whether a visual and an audio cluster should merge.
 * Checks, in priority order: shared topics (substring match either way,
 * case-insensitive, score 1.0), shared apps (0.9), shared projects (0.85),
 * and finally centroid embedding similarity against MERGE_THRESHOLD.
 * Returns null when no criterion is met.
 */
function computeMerge(visual, audio, embeddingService) {
    const merge = (similarityScore, mergeReason) => ({
        visualClusterId: visual.cluster.id,
        audioClusterId: audio.cluster.id,
        similarityScore,
        mergeReason,
    });
    // 1) Topic overlap: either topic may contain the other.
    const audioTopics = audio.signals.topics.map((t) => t.toLowerCase());
    const hasSharedTopic = visual.signals.topics.some((topic) => {
        const lower = topic.toLowerCase();
        return audioTopics.some((at) => lower.includes(at) || at.includes(lower));
    });
    if (hasSharedTopic) {
        return merge(1.0, 'shared_topic');
    }
    // 2) Exact app overlap.
    if (visual.signals.apps.some((app) => audio.signals.apps.includes(app))) {
        return merge(0.9, 'shared_app');
    }
    // 3) Exact project overlap.
    if (visual.signals.projects.some((project) => audio.signals.projects.includes(project))) {
        return merge(0.85, 'shared_project');
    }
    // 4) Fallback: embedding centroid similarity (only when both have centroids).
    if (visual.centroid.length > 0 && audio.centroid.length > 0) {
        const similarity = embeddingService.similarity(visual.centroid, audio.centroid);
        if (similarity >= MERGE_THRESHOLD) {
            return merge(similarity, 'centroid_similarity');
        }
    }
    return null;
}
|
|
@@ -0,0 +1,237 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Escribano - Semantic Clustering Service
|
|
3
|
+
*
|
|
4
|
+
* ═══════════════════════════════════════════════════════════════════════════════
|
|
5
|
+
* ALGORITHM: Agglomerative Hierarchical Clustering with Time Constraints
|
|
6
|
+
* ═══════════════════════════════════════════════════════════════════════════════
|
|
7
|
+
*
|
|
8
|
+
* WHAT IT DOES:
|
|
9
|
+
* Groups observations into semantic clusters based on embedding similarity,
|
|
10
|
+
* while respecting temporal constraints (observations far apart in time
|
|
11
|
+
* shouldn't cluster together even if semantically similar).
|
|
12
|
+
*
|
|
13
|
+
* WHY AGGLOMERATIVE:
|
|
14
|
+
* - No need to specify number of clusters upfront (unlike K-means)
|
|
15
|
+
* - Natural hierarchical structure matches how work sessions evolve
|
|
16
|
+
* - Can stop at any similarity threshold
|
|
17
|
+
*
|
|
18
|
+
* HOW IT WORKS:
|
|
19
|
+
*
|
|
20
|
+
* 1. INITIALIZATION
|
|
21
|
+
* - Start with N clusters, each containing one observation
|
|
22
|
+
* - Pre-compute all pairwise distances (1 - cosine_similarity)
|
|
23
|
+
* - Apply time constraint: if |timestamp_i - timestamp_j| > timeWindow,
|
|
24
|
+
* set distance to Infinity (can never merge)
|
|
25
|
+
*
|
|
26
|
+
* 2. ITERATIVE MERGING
|
|
27
|
+
* - Find the two closest clusters (single-linkage: min distance between any pair)
|
|
28
|
+
* - If closest distance > threshold → STOP (clusters are distinct enough)
|
|
29
|
+
* - Otherwise, merge them into one cluster
|
|
30
|
+
* - Repeat until no more merges possible
|
|
31
|
+
*
|
|
32
|
+
* 3. POST-PROCESSING
|
|
33
|
+
* - Small clusters (< minSize) are merged with their nearest neighbor
|
|
34
|
+
* - Prevents fragmentation from noise or outliers
|
|
35
|
+
*
|
|
36
|
+
* EXAMPLE:
|
|
37
|
+
*
|
|
38
|
+
* Input: 10 observations with embeddings
|
|
39
|
+
*
|
|
40
|
+
* Step 1: [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] (10 clusters)
|
|
41
|
+
* Step 2: [1,2] [3] [4] [5] [6] [7] [8] [9] [10] (obs 1 & 2 merged)
|
|
42
|
+
* Step 3: [1,2] [3,4] [5] [6] [7] [8] [9] [10] (obs 3 & 4 merged)
|
|
43
|
+
* ...
|
|
44
|
+
* Final: [1,2,3,4] [5,6,7] [8,9,10] (3 clusters)
|
|
45
|
+
*
|
|
46
|
+
* TIME CONSTRAINT VISUALIZATION:
|
|
47
|
+
*
|
|
48
|
+
* Time: 0min ──────────────────────────── 60min
|
|
49
|
+
* Obs: ●●●●● ●●●● ●●●●●●
|
|
50
|
+
* └─────┘ └────┘ └──────┘
|
|
51
|
+
* Cluster A Cluster B Cluster C
|
|
52
|
+
*
|
|
53
|
+
* Even if B is semantically similar to A, they won't merge if
|
|
54
|
+
* the time gap exceeds timeWindowSeconds.
|
|
55
|
+
*
|
|
56
|
+
* ═══════════════════════════════════════════════════════════════════════════════
|
|
57
|
+
*/
|
|
58
|
+
// Default clustering parameters; callers may override any field via the
// `config` argument of clusterObservations.
const DEFAULT_CONFIG = {
    timeWindowSeconds: 600, // 10 minutes — pairs further apart in time can never merge
    distanceThreshold: 0.4, // distance = 1 - cosine similarity, so this is a 0.6 similarity floor
    minClusterSize: 3, // clusters smaller than this get absorbed into their nearest neighbor
};
|
|
63
|
+
/**
 * Agglomeratively cluster observations by embedding similarity under a
 * time-window constraint (see the algorithm overview at the top of this file).
 *
 * @param observations - Observations to cluster (must have embeddings)
 * @param embeddingService - Service for computing similarity/centroids
 * @param config - Optional overrides merged over DEFAULT_CONFIG
 * @returns Array of clusters, sorted by start timestamp
 */
export function clusterObservations(observations, embeddingService, config = {}) {
    const settings = { ...DEFAULT_CONFIG, ...config };
    // Only observations that actually carry an embedding can participate.
    const usable = observations.filter((obs) => obs.embedding?.length);
    if (usable.length === 0) {
        return [];
    }
    // Decode every BLOB embedding up front.
    const vectors = usable.map((obs) => {
        if (!obs.embedding) {
            throw new Error(`Observation ${obs.id} has no embedding`);
        }
        return bufferToEmbedding(obs.embedding);
    });
    // STEP 1: each observation starts as its own cluster (arrays of indices).
    let clusters = usable.map((_, index) => [index]);
    // STEP 2: pairwise distances, with time-violating pairs pinned at Infinity.
    const distances = computeDistanceMatrix(usable, vectors, embeddingService, settings.timeWindowSeconds);
    // STEP 3: greedily merge the closest pair until nothing is close enough.
    // The loop bound is a safety cap; at most n-1 merges are possible anyway.
    for (let merges = 0; merges < usable.length; merges++) {
        const closest = findClosestClusterPair(clusters, distances);
        if (closest.distance > settings.distanceThreshold) {
            break; // remaining clusters are distinct enough
        }
        clusters = mergeClusters(clusters, closest.indexA, closest.indexB);
    }
    // STEP 4: fold undersized clusters into their nearest neighbor.
    clusters = absorbSmallClusters(clusters, distances, settings.minClusterSize);
    // STEP 5: materialize results, ordered chronologically.
    return clusters
        .map((indices) => buildClusterResult(indices, usable, vectors, embeddingService))
        .sort((a, b) => a.startTimestamp - b.startTimestamp);
}
|
|
110
|
+
// ═══════════════════════════════════════════════════════════════════════════════
|
|
111
|
+
// HELPER FUNCTIONS
|
|
112
|
+
// ═══════════════════════════════════════════════════════════════════════════════
|
|
113
|
+
/**
 * Convert a SQLite BLOB buffer (packed little-endian float32 values) to a
 * plain number array.
 *
 * Robustness fixes over the naive Float32Array view:
 * - Node Buffers are often slices of a shared pool, so `byteOffset` may not
 *   be 4-byte aligned; constructing a Float32Array view on a misaligned
 *   offset throws a RangeError. We fall back to per-element reads.
 * - `buffer.length / 4` is floored so a trailing partial word cannot produce
 *   a non-integer view length (which also throws).
 * NOTE(review): readFloatLE matches Float32Array layout on little-endian
 * hosts, which is the norm for this package's targets — confirm if it must
 * run on big-endian platforms.
 */
function bufferToEmbedding(buffer) {
    const floatCount = Math.floor(buffer.length / 4);
    if (buffer.byteOffset % 4 === 0) {
        // Fast path: zero-copy view over the underlying ArrayBuffer.
        return Array.from(new Float32Array(buffer.buffer, buffer.byteOffset, floatCount));
    }
    // Misaligned slice: read each float individually.
    const values = new Array(floatCount);
    for (let i = 0; i < floatCount; i++) {
        values[i] = buffer.readFloatLE(i * 4);
    }
    return values;
}
|
|
120
|
+
/**
 * Compute the NxN pairwise distance matrix.
 * distance = 1 - cosine_similarity (0 = identical, 1 = orthogonal).
 * Pairs whose timestamps differ by more than timeWindowSeconds remain at
 * Infinity, so they can never be merged.
 */
function computeDistanceMatrix(observations, embeddings, embeddingService, timeWindowSeconds) {
    const n = observations.length;
    // Everything starts unreachable; diagonal and valid pairs are filled below.
    const matrix = Array.from({ length: n }, () => new Array(n).fill(Infinity));
    for (let i = 0; i < n; i++) {
        matrix[i][i] = 0; // self-distance
        for (let j = i + 1; j < n; j++) {
            const gap = Math.abs(observations[i].timestamp - observations[j].timestamp);
            if (gap > timeWindowSeconds) {
                continue; // time constraint violated — leave at Infinity
            }
            const distance = 1 - embeddingService.similarity(embeddings[i], embeddings[j]);
            matrix[i][j] = distance; // symmetric
            matrix[j][i] = distance;
        }
    }
    return matrix;
}
|
|
150
|
+
/**
 * Locate the two clusters with minimum single-linkage distance.
 * Returns { indexA: -1, indexB: -1, distance: Infinity } when fewer than
 * two clusters exist or every remaining pair is unreachable.
 */
function findClosestClusterPair(clusters, distances) {
    let best = { indexA: -1, indexB: -1, distance: Infinity };
    for (let i = 0; i < clusters.length; i++) {
        for (let j = i + 1; j < clusters.length; j++) { // each unordered pair once
            const pairDistance = computeClusterDistance(clusters[i], clusters[j], distances);
            if (pairDistance < best.distance) {
                best = { indexA: i, indexB: j, distance: pairDistance };
            }
        }
    }
    return best;
}
|
|
172
|
+
/**
 * Single-linkage distance between two clusters: the smallest pairwise
 * distance between any observation in A and any observation in B.
 */
function computeClusterDistance(clusterA, clusterB, distances) {
    let best = Infinity;
    for (const a of clusterA) {
        for (const b of clusterB) {
            best = Math.min(best, distances[a][b]);
        }
    }
    return best;
}
|
|
187
|
+
/**
 * Merge the clusters at indexA and indexB.
 * Pure: returns a NEW array in which the merged cluster (lower-index
 * members first) is appended at the end.
 */
function mergeClusters(clusters, indexA, indexB) {
    const lo = Math.min(indexA, indexB);
    const hi = Math.max(indexA, indexB);
    const combined = [...clusters[lo], ...clusters[hi]];
    const remaining = clusters.filter((_, index) => index !== lo && index !== hi);
    remaining.push(combined);
    return remaining;
}
|
|
199
|
+
/**
 * Fold clusters smaller than minSize into their nearest large cluster
 * (single-linkage nearest). If no cluster reaches minSize, the input is
 * returned unchanged.
 */
function absorbSmallClusters(clusters, distances, minSize) {
    const keepers = [];
    const strays = [];
    for (const cluster of clusters) {
        (cluster.length >= minSize ? keepers : strays).push(cluster);
    }
    if (keepers.length === 0) {
        return clusters; // everything is small — nothing to absorb into
    }
    for (const stray of strays) {
        let target = 0;
        let targetDistance = Infinity;
        for (const [index, keeper] of keepers.entries()) {
            const dist = computeClusterDistance(stray, keeper, distances);
            if (dist < targetDistance) {
                targetDistance = dist;
                target = index;
            }
        }
        keepers[target] = [...keepers[target], ...stray];
    }
    return keepers;
}
|
|
224
|
+
/**
 * Materialize a ClusterResult from a list of observation indices.
 * The time span covers the earliest start and the latest end (falling back
 * to an observation's start when it has no end_timestamp).
 */
function buildClusterResult(indices, observations, embeddings, embeddingService) {
    const members = indices.map((i) => observations[i]);
    const memberEmbeddings = indices.map((i) => embeddings[i]);
    const starts = members.map((o) => o.timestamp);
    const ends = members.map((o) => o.end_timestamp ?? o.timestamp);
    return {
        clusterId: `cluster-${Date.now()}`, // Placeholder, replaced with UUIDv7 later
        observations: members,
        centroid: embeddingService.centroid(memberEmbeddings),
        startTimestamp: Math.min(...starts),
        endTimestamp: Math.max(...ends),
    };
}
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Escribano - Debug Utilities
|
|
3
|
+
* @deprecated No longer needed - all data is stored in the database.
|
|
4
|
+
* Kept for backward compatibility only.
|
|
5
|
+
*
|
|
6
|
+
* Utilities for saving debug artifacts (VLM responses, frame copies) during processing.
|
|
7
|
+
*/
|
|
8
|
+
import { copyFile, mkdir, writeFile } from 'node:fs/promises';
|
|
9
|
+
import { homedir } from 'node:os';
|
|
10
|
+
import path, { dirname } from 'node:path';
|
|
11
|
+
// Opt-in switch: set ESCRIBANO_DEBUG_VLM=true to persist VLM debug artifacts.
const DEBUG_ENABLED = process.env.ESCRIBANO_DEBUG_VLM === 'true';
// All debug artifacts land under ~/.escribano/debug/<recordingId>/.
const DEBUG_DIR = path.join(homedir(), '.escribano', 'debug');
|
|
13
|
+
/**
 * Initialize the debug directory layout for a recording:
 * <DEBUG_DIR>/<recordingId>/{vlm-responses,frames}.
 * No-op (returns '') when debugging is disabled.
 * @deprecated No longer needed - all data is stored in the database.
 * @param recordingId - Recording the artifacts belong to
 * @returns The recording's debug root path, or '' when disabled
 */
export async function initDebugDir(recordingId) {
    if (!DEBUG_ENABLED)
        return '';
    const debugPath = path.join(DEBUG_DIR, recordingId);
    // The two subdirectories are independent — create them in parallel
    // instead of awaiting sequentially.
    await Promise.all([
        mkdir(path.join(debugPath, 'vlm-responses'), { recursive: true }),
        mkdir(path.join(debugPath, 'frames'), { recursive: true }),
    ]);
    return debugPath;
}
|
|
27
|
+
/**
 * Persist a raw VLM response to disk as pretty-printed JSON
 * (batch-NNN-response.json under the recording's vlm-responses dir).
 * No-op when debugging is disabled.
 * @deprecated
 */
export async function saveVlmResponse(recordingId, batchIndex, response) {
    if (!DEBUG_ENABLED)
        return;
    const batchLabel = String(batchIndex).padStart(3, '0');
    const filePath = path.join(DEBUG_DIR, recordingId, 'vlm-responses', `batch-${batchLabel}-response.json`);
    // Make sure the parent directory exists before writing.
    await mkdir(dirname(filePath), { recursive: true });
    await writeFile(filePath, JSON.stringify(response, null, 2), 'utf-8');
}
|
|
39
|
+
/**
 * Copy sampled frames into the debug directory using batch/frame naming
 * (frames/batch-NNN/frame-NNN-t<seconds>.jpg). Individual copy failures
 * are logged and skipped so one bad frame doesn't abort the batch.
 * No-op when debugging is disabled.
 * @deprecated
 */
export async function copyFramesForDebug(recordingId, batchIndex, frames) {
    if (!DEBUG_ENABLED)
        return;
    const batchLabel = String(batchIndex).padStart(3, '0');
    const batchFramesDir = path.join(DEBUG_DIR, recordingId, 'frames', `batch-${batchLabel}`);
    await mkdir(batchFramesDir, { recursive: true });
    for (const frame of frames) {
        const frameLabel = String(frame.index).padStart(3, '0');
        const destPath = path.join(batchFramesDir, `frame-${frameLabel}-t${frame.timestamp.toFixed(1)}.jpg`);
        try {
            await copyFile(frame.imagePath, destPath);
        }
        catch (error) {
            console.warn(`[Debug] Failed to copy frame ${frame.imagePath}:`, error.message);
        }
    }
}
|