escribano 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +297 -0
- package/dist/0_types.js +279 -0
- package/dist/actions/classify-session.js +77 -0
- package/dist/actions/create-contexts.js +44 -0
- package/dist/actions/create-topic-blocks.js +68 -0
- package/dist/actions/extract-metadata.js +24 -0
- package/dist/actions/generate-artifact-v3.js +296 -0
- package/dist/actions/generate-artifact.js +61 -0
- package/dist/actions/generate-summary-v3.js +260 -0
- package/dist/actions/outline-index.js +204 -0
- package/dist/actions/process-recording-v2.js +494 -0
- package/dist/actions/process-recording-v3.js +412 -0
- package/dist/actions/process-session.js +183 -0
- package/dist/actions/publish-summary-v3.js +303 -0
- package/dist/actions/sync-to-outline.js +196 -0
- package/dist/adapters/audio.silero.adapter.js +69 -0
- package/dist/adapters/cap.adapter.js +94 -0
- package/dist/adapters/capture.cap.adapter.js +107 -0
- package/dist/adapters/capture.filesystem.adapter.js +124 -0
- package/dist/adapters/embedding.ollama.adapter.js +141 -0
- package/dist/adapters/intelligence.adapter.js +202 -0
- package/dist/adapters/intelligence.mlx.adapter.js +395 -0
- package/dist/adapters/intelligence.ollama.adapter.js +741 -0
- package/dist/adapters/publishing.outline.adapter.js +75 -0
- package/dist/adapters/storage.adapter.js +81 -0
- package/dist/adapters/storage.fs.adapter.js +83 -0
- package/dist/adapters/transcription.whisper.adapter.js +206 -0
- package/dist/adapters/video.ffmpeg.adapter.js +405 -0
- package/dist/adapters/whisper.adapter.js +168 -0
- package/dist/batch-context.js +329 -0
- package/dist/db/helpers.js +50 -0
- package/dist/db/index.js +95 -0
- package/dist/db/migrate.js +80 -0
- package/dist/db/repositories/artifact.sqlite.js +77 -0
- package/dist/db/repositories/cluster.sqlite.js +92 -0
- package/dist/db/repositories/context.sqlite.js +75 -0
- package/dist/db/repositories/index.js +10 -0
- package/dist/db/repositories/observation.sqlite.js +70 -0
- package/dist/db/repositories/recording.sqlite.js +56 -0
- package/dist/db/repositories/subject.sqlite.js +64 -0
- package/dist/db/repositories/topic-block.sqlite.js +45 -0
- package/dist/db/types.js +4 -0
- package/dist/domain/classification.js +60 -0
- package/dist/domain/context.js +97 -0
- package/dist/domain/index.js +2 -0
- package/dist/domain/observation.js +17 -0
- package/dist/domain/recording.js +41 -0
- package/dist/domain/segment.js +93 -0
- package/dist/domain/session.js +93 -0
- package/dist/domain/time-range.js +38 -0
- package/dist/domain/transcript.js +79 -0
- package/dist/index.js +173 -0
- package/dist/pipeline/context.js +162 -0
- package/dist/pipeline/events.js +2 -0
- package/dist/prerequisites.js +226 -0
- package/dist/scripts/rebuild-index.js +53 -0
- package/dist/scripts/seed-fixtures.js +290 -0
- package/dist/services/activity-segmentation.js +333 -0
- package/dist/services/activity-segmentation.test.js +191 -0
- package/dist/services/app-normalization.js +212 -0
- package/dist/services/cluster-merge.js +69 -0
- package/dist/services/clustering.js +237 -0
- package/dist/services/debug.js +58 -0
- package/dist/services/frame-sampling.js +318 -0
- package/dist/services/signal-extraction.js +106 -0
- package/dist/services/subject-grouping.js +342 -0
- package/dist/services/temporal-alignment.js +99 -0
- package/dist/services/vlm-enrichment.js +84 -0
- package/dist/services/vlm-service.js +130 -0
- package/dist/stats/index.js +3 -0
- package/dist/stats/observer.js +65 -0
- package/dist/stats/repository.js +36 -0
- package/dist/stats/resource-tracker.js +86 -0
- package/dist/stats/types.js +1 -0
- package/dist/test-classification-prompts.js +181 -0
- package/dist/tests/cap.adapter.test.js +75 -0
- package/dist/tests/capture.cap.adapter.test.js +69 -0
- package/dist/tests/classify-session.test.js +140 -0
- package/dist/tests/db/repositories.test.js +243 -0
- package/dist/tests/domain/time-range.test.js +31 -0
- package/dist/tests/integration.test.js +84 -0
- package/dist/tests/intelligence.adapter.test.js +102 -0
- package/dist/tests/intelligence.ollama.adapter.test.js +178 -0
- package/dist/tests/process-v2.test.js +90 -0
- package/dist/tests/services/clustering.test.js +112 -0
- package/dist/tests/services/frame-sampling.test.js +152 -0
- package/dist/tests/utils/ocr.test.js +76 -0
- package/dist/tests/utils/parallel.test.js +57 -0
- package/dist/tests/visual-observer.test.js +175 -0
- package/dist/utils/id-normalization.js +15 -0
- package/dist/utils/index.js +9 -0
- package/dist/utils/model-detector.js +154 -0
- package/dist/utils/ocr.js +80 -0
- package/dist/utils/parallel.js +32 -0
- package/migrations/001_initial.sql +109 -0
- package/migrations/002_clusters.sql +41 -0
- package/migrations/003_observations_vlm_fields.sql +14 -0
- package/migrations/004_observations_unique.sql +18 -0
- package/migrations/005_processing_stats.sql +29 -0
- package/migrations/006_vlm_raw_response.sql +6 -0
- package/migrations/007_subjects.sql +23 -0
- package/migrations/008_artifacts_recording.sql +6 -0
- package/migrations/009_artifact_subjects.sql +10 -0
- package/package.json +82 -0
- package/prompts/action-items.md +55 -0
- package/prompts/blog-draft.md +54 -0
- package/prompts/blog-research.md +87 -0
- package/prompts/card.md +54 -0
- package/prompts/classify-segment.md +38 -0
- package/prompts/classify.md +37 -0
- package/prompts/code-snippets.md +163 -0
- package/prompts/extract-metadata.md +149 -0
- package/prompts/notes.md +83 -0
- package/prompts/runbook.md +123 -0
- package/prompts/standup.md +50 -0
- package/prompts/step-by-step.md +125 -0
- package/prompts/subject-grouping.md +31 -0
- package/prompts/summary-v3.md +89 -0
- package/prompts/summary.md +77 -0
- package/prompts/topic-classifier.md +24 -0
- package/prompts/topic-extract.md +13 -0
- package/prompts/vlm-batch.md +21 -0
- package/prompts/vlm-single.md +19 -0
|
@@ -0,0 +1,318 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Escribano - Adaptive Frame Sampling Service
|
|
3
|
+
*
|
|
4
|
+
* Reduces frame count while preserving important moments.
|
|
5
|
+
* Strategy: Base sampling (10s) + gap filling for large time jumps.
|
|
6
|
+
*/
|
|
7
|
+
/**
 * Parse a sampling setting expressed in seconds.
 *
 * Only values greater than zero are accepted. Anything else (unset, empty,
 * non-numeric, zero, negative) yields the fallback, because the sampling
 * loops advance by these values and a non-positive step never terminates.
 *
 * @param rawValue - Raw setting, typically a string from the environment
 * @param fallbackSeconds - Value to use when `rawValue` is not usable
 * @returns The parsed number of seconds, or `fallbackSeconds`
 */
function readPositiveSeconds(rawValue, fallbackSeconds) {
    const parsed = Number(rawValue);
    return parsed > 0 ? parsed : fallbackSeconds;
}
/**
 * Default sampling configuration. Each value can be overridden through the
 * corresponding ESCRIBANO_SAMPLE_* environment variable, read once at module
 * load time.
 */
const DEFAULT_CONFIG = {
    baseIntervalSeconds: readPositiveSeconds(process.env.ESCRIBANO_SAMPLE_INTERVAL, 10),
    gapThresholdSeconds: readPositiveSeconds(process.env.ESCRIBANO_SAMPLE_GAP_THRESHOLD, 15),
    gapFillIntervalSeconds: readPositiveSeconds(process.env.ESCRIBANO_SAMPLE_GAP_FILL, 3),
};
|
|
12
|
+
/**
 * Pick the frame whose timestamp lies closest to the requested time.
 *
 * Every frame is inspected once. When several frames are equally close, the
 * one that appears first in `frames` wins.
 *
 * @param frames - Candidate frames, each carrying a numeric `timestamp`
 * @param targetTimestamp - Time to match, in the same unit as the frame timestamps
 * @returns The closest frame, or null when `frames` is empty
 */
function findNearestFrame(frames, targetTimestamp) {
    if (frames.length === 0) {
        return null;
    }
    let bestIndex = 0;
    let bestDistance = Math.abs(frames[0].timestamp - targetTimestamp);
    for (let index = 1; index < frames.length; index++) {
        const distance = Math.abs(frames[index].timestamp - targetTimestamp);
        // Strict comparison keeps the earliest frame on ties.
        if (distance < bestDistance) {
            bestDistance = distance;
            bestIndex = index;
        }
    }
    return frames[bestIndex];
}
|
|
29
|
+
/**
 * Adaptively sample frames from a recording.
 *
 * Strategy:
 * 1. Walk the frames in timestamp order and keep one whenever at least
 *    `baseIntervalSeconds` have elapsed since the previously kept frame
 *    (the earliest frame is always kept).
 * 2. For each pair of consecutive base samples more than
 *    `gapThresholdSeconds` apart, probe the gap every
 *    `gapFillIntervalSeconds` and add the frame nearest to each probe.
 *
 * Gap filling is skipped when `gapFillIntervalSeconds` is not a positive
 * number: a zero or negative step never advances the probe, so the fill loop
 * would not terminate.
 *
 * @param allFrames - All extracted frames (typically at 2s intervals); each needs
 *   `imagePath` and a numeric `timestamp`. The input array is not modified.
 * @param config - Sampling configuration overrides merged over the defaults
 * @returns Sampled frames (`imagePath`, `timestamp`, `reason`) sorted by
 *   timestamp, where `reason` is 'base' or 'gap_fill'
 */
export function adaptiveSample(allFrames, config = {}) {
    const cfg = { ...DEFAULT_CONFIG, ...config };
    if (allFrames.length === 0)
        return [];
    // Sort a copy so the caller's array keeps its order
    const sortedFrames = [...allFrames].sort((a, b) => a.timestamp - b.timestamp);
    // Step 1: Base sampling - take frames at regular intervals
    const baseSampled = [];
    const sampledTimestamps = new Set();
    // -Infinity guarantees the earliest frame is always selected
    let lastSampledTime = -Infinity;
    for (const frame of sortedFrames) {
        if (frame.timestamp - lastSampledTime >= cfg.baseIntervalSeconds) {
            baseSampled.push({
                imagePath: frame.imagePath,
                timestamp: frame.timestamp,
                reason: 'base',
            });
            sampledTimestamps.add(frame.timestamp);
            lastSampledTime = frame.timestamp;
        }
    }
    // Step 2: Detect and fill gaps
    // A non-positive (or NaN) step cannot advance the probe; treat it as "gap filling disabled"
    const canFillGaps = cfg.gapFillIntervalSeconds > 0;
    const result = [];
    for (let i = 0; i < baseSampled.length; i++) {
        result.push(baseSampled[i]);
        // Check for gap to next sample
        if (i < baseSampled.length - 1) {
            const currentTime = baseSampled[i].timestamp;
            const nextTime = baseSampled[i + 1].timestamp;
            const gap = nextTime - currentTime;
            if (canFillGaps && gap > cfg.gapThresholdSeconds) {
                // Probe strictly inside the gap, one fill interval away from both ends
                const gapStart = currentTime + cfg.gapFillIntervalSeconds;
                const gapEnd = nextTime - cfg.gapFillIntervalSeconds;
                for (let t = gapStart; t <= gapEnd; t += cfg.gapFillIntervalSeconds) {
                    const nearestFrame = findNearestFrame(sortedFrames, t);
                    // Several probes can resolve to the same frame; emit it only once
                    if (nearestFrame && !sampledTimestamps.has(nearestFrame.timestamp)) {
                        result.push({
                            imagePath: nearestFrame.imagePath,
                            timestamp: nearestFrame.timestamp,
                            reason: 'gap_fill',
                        });
                        sampledTimestamps.add(nearestFrame.timestamp);
                    }
                }
            }
        }
    }
    // Sort final result by timestamp
    return result.sort((a, b) => a.timestamp - b.timestamp);
}
|
|
92
|
+
/**
 * Derive the base sampling interval from how many scene changes were detected.
 *
 * Dense scene changes already cover the timeline well, so the base interval
 * is raised to avoid collecting an excessive number of frames.
 *
 * Thresholds:
 * - more than 50 scenes: at least 30s
 * - more than 20 scenes (up to 50): at least 20s
 * - 20 scenes or fewer: the configured interval is returned unchanged
 *
 * @param sceneCount - Number of detected scene changes
 * @param configBaseInterval - User-configured base interval (used as minimum)
 * @returns Adjusted base interval in seconds
 */
export function calculateAdaptiveBaseInterval(sceneCount, configBaseInterval) {
    // Minimum interval imposed by scene density; 0 means "no minimum applies".
    let densityFloorSeconds = 0;
    if (sceneCount > 50) {
        densityFloorSeconds = 30;
    }
    else if (sceneCount > 20) {
        densityFloorSeconds = 20;
    }
    return densityFloorSeconds > 0
        ? Math.max(configBaseInterval, densityFloorSeconds)
        : configBaseInterval;
}
|
|
114
|
+
/**
 * Adaptively sample frames with scene change awareness.
 *
 * Strategy:
 * 1. Always include the frame nearest to each scene change timestamp
 * 2. Between scene changes, sample at the base interval (raised automatically
 *    when many scene changes were detected)
 * 3. Detect gaps larger than the threshold and fill them with denser sampling
 *
 * With more than 20 (or more than 50) scene changes, the gap threshold and the
 * gap-fill interval are raised as well, so that closely spaced scene changes
 * do not trigger additional gap filling.
 *
 * Gap filling is skipped when `gapFillIntervalSeconds` is not a positive
 * number: a zero or negative step never advances the probe, so the fill loop
 * would not terminate.
 *
 * NOTE(review): base sampling in the leading segment is measured from the
 * first frame's timestamp, so the first frame itself is only emitted when a
 * scene change maps onto it. `adaptiveSample` always emits it - confirm
 * whether this difference is intended.
 *
 * @param allFrames - All extracted frames (typically at 2s intervals); each needs
 *   `imagePath` and a numeric `timestamp`. The input array is not modified.
 * @param sceneChanges - Timestamps of detected scene changes from ffmpeg
 *   (defaults to none)
 * @param config - Sampling configuration overrides merged over the defaults
 * @returns Sampled frames (`imagePath`, `timestamp`, `reason`) sorted by
 *   timestamp, where `reason` is 'scene_change', 'base' or 'gap_fill'
 */
export function adaptiveSampleWithScenes(allFrames, sceneChanges = [], config = {}) {
    const cfg = { ...DEFAULT_CONFIG, ...config };
    if (allFrames.length === 0)
        return [];
    // Adjust base interval based on scene density
    cfg.baseIntervalSeconds = calculateAdaptiveBaseInterval(sceneChanges.length, cfg.baseIntervalSeconds);
    // When scene density is high, also increase gap threshold to prevent
    // gap filling between closely-spaced scene changes
    if (sceneChanges.length > 50) {
        cfg.gapThresholdSeconds = Math.max(cfg.gapThresholdSeconds, 60);
        cfg.gapFillIntervalSeconds = Math.max(cfg.gapFillIntervalSeconds, 10);
    }
    else if (sceneChanges.length > 20) {
        cfg.gapThresholdSeconds = Math.max(cfg.gapThresholdSeconds, 40);
        cfg.gapFillIntervalSeconds = Math.max(cfg.gapFillIntervalSeconds, 5);
    }
    // Sort a copy so the caller's array keeps its order
    const sortedFrames = [...allFrames].sort((a, b) => a.timestamp - b.timestamp);
    // Track which timestamps we've already sampled
    const sampledTimestamps = new Set();
    const result = [];
    // Step 1: Always include frames nearest to scene changes
    for (const changeTime of sceneChanges) {
        const nearest = findNearestFrame(sortedFrames, changeTime);
        // Several scene changes can resolve to the same frame; emit it only once
        if (nearest && !sampledTimestamps.has(nearest.timestamp)) {
            result.push({
                imagePath: nearest.imagePath,
                timestamp: nearest.timestamp,
                reason: 'scene_change',
            });
            sampledTimestamps.add(nearest.timestamp);
        }
    }
    // Sort scene change frames by timestamp
    result.sort((a, b) => a.timestamp - b.timestamp);
    // Step 2: Between scene changes, sample at base interval.
    // Segment boundaries are: first frame, every scene frame, last frame.
    // Without scene changes this yields a single segment spanning the recording.
    const boundaries = [
        sortedFrames[0].timestamp,
        ...result.map((f) => f.timestamp),
        sortedFrames[sortedFrames.length - 1].timestamp,
    ];
    const segments = [];
    for (let i = 0; i < boundaries.length - 1; i++) {
        segments.push({ start: boundaries[i], end: boundaries[i + 1] });
    }
    // Sample each segment at base interval, measured from the segment start
    for (const segment of segments) {
        let lastSampleTime = segment.start;
        for (const frame of sortedFrames) {
            if (frame.timestamp < segment.start || frame.timestamp > segment.end) {
                continue;
            }
            if (frame.timestamp - lastSampleTime >= cfg.baseIntervalSeconds &&
                !sampledTimestamps.has(frame.timestamp)) {
                result.push({
                    imagePath: frame.imagePath,
                    timestamp: frame.timestamp,
                    reason: 'base',
                });
                sampledTimestamps.add(frame.timestamp);
                lastSampleTime = frame.timestamp;
            }
        }
    }
    // Sort before gap filling
    result.sort((a, b) => a.timestamp - b.timestamp);
    // Step 3: Fill large gaps between any samples
    // A non-positive (or NaN) step cannot advance the probe; treat it as "gap filling disabled"
    const canFillGaps = cfg.gapFillIntervalSeconds > 0;
    const withGapsFilled = [];
    for (let i = 0; i < result.length; i++) {
        withGapsFilled.push(result[i]);
        if (i < result.length - 1) {
            const currentTime = result[i].timestamp;
            const nextTime = result[i + 1].timestamp;
            const gap = nextTime - currentTime;
            if (canFillGaps && gap > cfg.gapThresholdSeconds) {
                // Probe strictly inside the gap, one fill interval away from both ends
                const gapStart = currentTime + cfg.gapFillIntervalSeconds;
                const gapEnd = nextTime - cfg.gapFillIntervalSeconds;
                for (let t = gapStart; t <= gapEnd; t += cfg.gapFillIntervalSeconds) {
                    const nearestFrame = findNearestFrame(sortedFrames, t);
                    if (nearestFrame && !sampledTimestamps.has(nearestFrame.timestamp)) {
                        withGapsFilled.push({
                            imagePath: nearestFrame.imagePath,
                            timestamp: nearestFrame.timestamp,
                            reason: 'gap_fill',
                        });
                        sampledTimestamps.add(nearestFrame.timestamp);
                    }
                }
            }
        }
    }
    // Sort final result by timestamp
    return withGapsFilled.sort((a, b) => a.timestamp - b.timestamp);
}
|
|
240
|
+
/**
 * Summarise a sampling run for logging.
 *
 * @param original - Frames that were available before sampling
 * @param sampled - Frames that were selected, each tagged with a `reason`
 * @returns Counts of original and sampled frames, the reduction as a whole
 *   percentage of the original count, and how many sampled frames carry each
 *   of the reasons 'base', 'gap_fill' and 'scene_change'
 */
export function getSamplingStats(original, sampled) {
    let baseCount = 0;
    let gapFillCount = 0;
    let sceneChangeCount = 0;
    // Tally all three reasons in one pass; any other reason is ignored.
    for (const frame of sampled) {
        switch (frame.reason) {
            case 'base':
                baseCount += 1;
                break;
            case 'gap_fill':
                gapFillCount += 1;
                break;
            case 'scene_change':
                sceneChangeCount += 1;
                break;
            default:
                break;
        }
    }
    // Divide by 1 when there were no original frames to avoid a division by zero.
    const denominator = original.length || 1;
    const keptRatio = sampled.length / denominator;
    return {
        originalCount: original.length,
        sampledCount: sampled.length,
        reductionPercent: Math.round((1 - keptRatio) * 100),
        baseCount,
        gapFillCount,
        sceneChangeCount,
    };
}
|
|
256
|
+
/**
 * Calculate required frame timestamps WITHOUT extracting frames.
 * Used by the smart extraction pipeline to extract only needed frames.
 *
 * This is the inverse of adaptiveSampleWithScenes - instead of selecting
 * from existing frames, we calculate which timestamps we need:
 * 1. every scene change, rounded to the nearest second and limited to
 *    the range 0..durationSeconds
 * 2. one timestamp per base interval, starting at 0
 * 3. extra timestamps every gap-fill interval inside any gap larger than
 *    the gap threshold
 *
 * The base interval, gap threshold and gap-fill interval are raised for
 * dense scene changes using the same rules as adaptiveSampleWithScenes.
 *
 * Gap filling is skipped when `gapFillIntervalSeconds` is not a positive
 * number, because a zero or negative step would never leave the gap.
 *
 * @param durationSeconds - Total video duration in seconds
 * @param sceneChanges - Timestamps of detected scene changes
 * @param config - Sampling configuration overrides merged over the defaults
 * @returns Sorted array of unique timestamps that need frames; empty when
 *   `durationSeconds` is zero or negative
 * @throws {RangeError} When the effective base interval is zero or negative,
 *   since the base sampling loop could never reach the end of the video
 *
 * @example
 * // For a 60s video with 3 scene changes at 10s, 25s, 45s
 * // Default config: 10s base interval, 15s gap threshold, 3s gap fill
 * const timestamps = calculateRequiredTimestamps(60, [10, 25, 45]);
 * // Returns: [0, 10, 20, 25, 30, 40, 45, 50, 60] (9 frames)
 * // Instead of extracting 30 frames (every 2s), we only extract 9
 */
export function calculateRequiredTimestamps(durationSeconds, sceneChanges = [], config = {}) {
    const cfg = { ...DEFAULT_CONFIG, ...config };
    if (durationSeconds <= 0)
        return [];
    // Adjust for scene density (same logic as adaptiveSampleWithScenes)
    cfg.baseIntervalSeconds = calculateAdaptiveBaseInterval(sceneChanges.length, cfg.baseIntervalSeconds);
    if (cfg.baseIntervalSeconds <= 0) {
        // A non-positive step would keep the base loop below durationSeconds forever
        throw new RangeError(`baseIntervalSeconds must be greater than 0 (received ${cfg.baseIntervalSeconds})`);
    }
    // Adjust gap thresholds for high scene density
    if (sceneChanges.length > 50) {
        cfg.gapThresholdSeconds = Math.max(cfg.gapThresholdSeconds, 60);
        cfg.gapFillIntervalSeconds = Math.max(cfg.gapFillIntervalSeconds, 10);
    }
    else if (sceneChanges.length > 20) {
        cfg.gapThresholdSeconds = Math.max(cfg.gapThresholdSeconds, 40);
        cfg.gapFillIntervalSeconds = Math.max(cfg.gapFillIntervalSeconds, 5);
    }
    // A Set removes duplicates when a scene change coincides with a base sample
    const timestamps = new Set();
    // Step 1: Add scene change timestamps (rounded to nearest second)
    for (const t of sceneChanges) {
        const rounded = Math.round(t);
        if (rounded >= 0 && rounded <= durationSeconds) {
            timestamps.add(rounded);
        }
    }
    // Step 2: Add base interval samples throughout video
    for (let t = 0; t <= durationSeconds; t += cfg.baseIntervalSeconds) {
        timestamps.add(Math.round(t));
    }
    // Step 3: Fill large gaps between samples
    // A non-positive (or NaN) step cannot advance; treat it as "gap filling disabled"
    if (cfg.gapFillIntervalSeconds > 0) {
        // Iterate over a snapshot so timestamps added while filling are not re-examined
        const sorted = [...timestamps].sort((a, b) => a - b);
        for (let i = 0; i < sorted.length - 1; i++) {
            const gap = sorted[i + 1] - sorted[i];
            if (gap > cfg.gapThresholdSeconds) {
                for (let t = sorted[i] + cfg.gapFillIntervalSeconds; t < sorted[i + 1]; t += cfg.gapFillIntervalSeconds) {
                    timestamps.add(Math.round(t));
                }
            }
        }
    }
    return [...timestamps].sort((a, b) => a - b);
}
|
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Escribano - Signal Extraction Service
|
|
3
|
+
*
|
|
4
|
+
* Extracts semantic signals (apps, urls, projects, topics) from cluster observations.
|
|
5
|
+
* Uses a tiered approach: regex for structured → patterns for semi-structured → LLM for semantic.
|
|
6
|
+
*/
|
|
7
|
+
// ============================================================================
|
|
8
|
+
// TIER 1: REGEX-BASED EXTRACTION (URLs, Domains)
|
|
9
|
+
// ============================================================================
|
|
10
|
+
// Matches host-like tokens with an optional http(s) scheme, optional "www."
// prefix and optional path. Capture group 1 is the host without "www.".
const URL_REGEX = /(?:https?:\/\/)?(?:www\.)?([a-zA-Z0-9][-a-zA-Z0-9]*(?:\.[a-zA-Z0-9][-a-zA-Z0-9]*)+)(?:\/[^\s]*)?/gi;
// Hosts that are never reported as signals.
const NOISE_DOMAINS = ['localhost', '127.0.0.1', '0.0.0.0', 'example.com'];
/**
 * Collect the domains mentioned in a set of texts.
 *
 * Hosts are lower-cased before counting. Hosts listed in NOISE_DOMAINS and
 * hosts starting with "192.168." are ignored.
 *
 * @param texts - Strings to scan
 * @returns Domains seen at least twice across all texts, most frequent first
 */
export function extractUrls(texts) {
    const hitsByDomain = new Map();
    for (const text of texts) {
        for (const [, host] of text.matchAll(URL_REGEX)) {
            const domain = host.toLowerCase();
            const isNoise = NOISE_DOMAINS.includes(domain) || domain.startsWith('192.168.');
            if (isNoise) {
                continue;
            }
            hitsByDomain.set(domain, (hitsByDomain.get(domain) || 0) + 1);
        }
    }
    // Keep only repeated domains, then rank by descending hit count
    const repeated = [...hitsByDomain].filter((entry) => entry[1] >= 2);
    repeated.sort((left, right) => right[1] - left[1]);
    return repeated.map((entry) => entry[0]);
}
|
|
29
|
+
// ============================================================================
|
|
30
|
+
// TIER 2: PATTERN-BASED EXTRACTION (Apps, Projects)
|
|
31
|
+
// ============================================================================
|
|
32
|
+
// Application name -> case-insensitive pattern that signals its presence in a text.
const APP_PATTERNS = {
    'VS Code': /(?:visual\s+studio\s+code|vscode|code\s+-|\[Code\]|\.vscode)/i,
    Chrome: /(?:google\s+chrome|chrome\s+-|\s+-\s+chrome)/i,
    Firefox: /(?:mozilla\s+firefox|firefox\s+-)/i,
    Safari: /(?:safari\s+-|apple\s+safari)/i,
    Terminal: /(?:terminal|iterm|iterm2|hyper)/i,
    Ghostty: /ghostty/i,
    Neovim: /(?:neovim|nvim|nvimtree)/i,
    Vim: /(?:\bvim\b(?!tree))/i,
    Slack: /(?:slack\s+-|\[Slack\])/i,
    Discord: /(?:discord\s+-|\[Discord\])/i,
    YouTube: /(?:youtube\.com|youtube\s+-)/i,
    GitHub: /(?:github\.com|github\s+-)/i,
    Figma: /(?:figma\.com|figma\s+-)/i,
    Notion: /(?:notion\.so|notion\s+-)/i,
    Obsidian: /(?:obsidian\s+-|\.obsidian)/i,
};
/**
 * Detect which known applications are mentioned in a set of texts.
 *
 * @param texts - Strings to scan
 * @returns Names of the applications whose pattern matched at least one text,
 *   each listed once, in the order they were first detected
 */
export function extractApps(texts) {
    const appMatchers = Object.entries(APP_PATTERNS);
    const seenApps = new Set();
    for (const text of texts) {
        appMatchers
            .filter(([, pattern]) => pattern.test(text))
            .forEach(([appName]) => seenApps.add(appName));
    }
    return [...seenApps];
}
|
|
60
|
+
// Patterns whose first capture group is a candidate project name.
const PROJECT_PATTERNS = [
    /(?:repos|projects|dev|src|code)\/([a-zA-Z0-9_-]+)/i,
    /(?:github\.com|gitlab\.com)\/[^/]+\/([a-zA-Z0-9_-]+)/i,
    /package\.json.*?"name":\s*"([^"]+)"/i,
    /~\/([a-zA-Z0-9_-]+)\/(?:src|lib|packages)/i,
];
/**
 * Collect candidate project names mentioned in a set of texts.
 *
 * Each pattern contributes at most its first match per text. Names are
 * lower-cased, and common directory names are discarded.
 *
 * @param texts - Strings to scan
 * @returns Unique project names in the order they were first found
 */
export function extractProjects(texts) {
    const nonProjectNames = ['src', 'lib', 'dist', 'build', 'node_modules', 'packages'];
    // A Set keeps first-seen order, which is the order the names are returned in
    const foundNames = new Set();
    for (const text of texts) {
        for (const pattern of PROJECT_PATTERNS) {
            const captured = text.match(pattern);
            if (!captured || !captured[1]) {
                continue;
            }
            const candidate = captured[1].toLowerCase();
            // Filter out common non-project names
            if (nonProjectNames.includes(candidate)) {
                continue;
            }
            foundNames.add(candidate);
        }
    }
    return [...foundNames];
}
|
|
82
|
+
// ============================================================================
|
|
83
|
+
// TIER 3: LLM-BASED EXTRACTION (Topics)
|
|
84
|
+
// ============================================================================
|
|
85
|
+
/**
 * Extract topics from observations by delegating to the intelligence service.
 *
 * @param observations - Observations to analyse, passed through unchanged
 * @param intelligence - Service exposing an `extractTopics(observations)` method
 * @returns Whatever `intelligence.extractTopics` resolves to
 */
export async function extractTopics(observations, intelligence) {
    const topics = await intelligence.extractTopics(observations);
    return topics;
}
|
|
88
|
+
// ============================================================================
|
|
89
|
+
// COMBINED EXTRACTION
|
|
90
|
+
// ============================================================================
|
|
91
|
+
/**
 * Run every extraction tier over a set of observations.
 *
 * Visual observations contribute their OCR text and VLM description joined
 * by a space. Every other observation contributes its `text`. Missing fields
 * count as empty strings.
 *
 * @param observations - Observations to analyse
 * @param intelligence - Service handed to `extractTopics` for topic extraction
 * @returns An object with `apps`, `urls`, `projects` and `topics`
 */
export async function extractSignals(observations, intelligence) {
    // Flatten each observation into the single string the text-based tiers scan
    const toSearchableText = (observation) => {
        if (observation.type !== 'visual') {
            return observation.text || '';
        }
        const ocrText = observation.ocr_text || '';
        const description = observation.vlm_description || '';
        return `${ocrText} ${description}`;
    };
    const allTexts = observations.map((observation) => toSearchableText(observation));
    // Tier 1 & 2: synchronous text-based extraction
    const urls = extractUrls(allTexts);
    const apps = extractApps(allTexts);
    const projects = extractProjects(allTexts);
    // Tier 3: topics come from the intelligence service and work on the raw observations
    const topics = await extractTopics(observations, intelligence);
    return { apps, urls, projects, topics };
}
|