escribano 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (124) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +297 -0
  3. package/dist/0_types.js +279 -0
  4. package/dist/actions/classify-session.js +77 -0
  5. package/dist/actions/create-contexts.js +44 -0
  6. package/dist/actions/create-topic-blocks.js +68 -0
  7. package/dist/actions/extract-metadata.js +24 -0
  8. package/dist/actions/generate-artifact-v3.js +296 -0
  9. package/dist/actions/generate-artifact.js +61 -0
  10. package/dist/actions/generate-summary-v3.js +260 -0
  11. package/dist/actions/outline-index.js +204 -0
  12. package/dist/actions/process-recording-v2.js +494 -0
  13. package/dist/actions/process-recording-v3.js +412 -0
  14. package/dist/actions/process-session.js +183 -0
  15. package/dist/actions/publish-summary-v3.js +303 -0
  16. package/dist/actions/sync-to-outline.js +196 -0
  17. package/dist/adapters/audio.silero.adapter.js +69 -0
  18. package/dist/adapters/cap.adapter.js +94 -0
  19. package/dist/adapters/capture.cap.adapter.js +107 -0
  20. package/dist/adapters/capture.filesystem.adapter.js +124 -0
  21. package/dist/adapters/embedding.ollama.adapter.js +141 -0
  22. package/dist/adapters/intelligence.adapter.js +202 -0
  23. package/dist/adapters/intelligence.mlx.adapter.js +395 -0
  24. package/dist/adapters/intelligence.ollama.adapter.js +741 -0
  25. package/dist/adapters/publishing.outline.adapter.js +75 -0
  26. package/dist/adapters/storage.adapter.js +81 -0
  27. package/dist/adapters/storage.fs.adapter.js +83 -0
  28. package/dist/adapters/transcription.whisper.adapter.js +206 -0
  29. package/dist/adapters/video.ffmpeg.adapter.js +405 -0
  30. package/dist/adapters/whisper.adapter.js +168 -0
  31. package/dist/batch-context.js +329 -0
  32. package/dist/db/helpers.js +50 -0
  33. package/dist/db/index.js +95 -0
  34. package/dist/db/migrate.js +80 -0
  35. package/dist/db/repositories/artifact.sqlite.js +77 -0
  36. package/dist/db/repositories/cluster.sqlite.js +92 -0
  37. package/dist/db/repositories/context.sqlite.js +75 -0
  38. package/dist/db/repositories/index.js +10 -0
  39. package/dist/db/repositories/observation.sqlite.js +70 -0
  40. package/dist/db/repositories/recording.sqlite.js +56 -0
  41. package/dist/db/repositories/subject.sqlite.js +64 -0
  42. package/dist/db/repositories/topic-block.sqlite.js +45 -0
  43. package/dist/db/types.js +4 -0
  44. package/dist/domain/classification.js +60 -0
  45. package/dist/domain/context.js +97 -0
  46. package/dist/domain/index.js +2 -0
  47. package/dist/domain/observation.js +17 -0
  48. package/dist/domain/recording.js +41 -0
  49. package/dist/domain/segment.js +93 -0
  50. package/dist/domain/session.js +93 -0
  51. package/dist/domain/time-range.js +38 -0
  52. package/dist/domain/transcript.js +79 -0
  53. package/dist/index.js +173 -0
  54. package/dist/pipeline/context.js +162 -0
  55. package/dist/pipeline/events.js +2 -0
  56. package/dist/prerequisites.js +226 -0
  57. package/dist/scripts/rebuild-index.js +53 -0
  58. package/dist/scripts/seed-fixtures.js +290 -0
  59. package/dist/services/activity-segmentation.js +333 -0
  60. package/dist/services/activity-segmentation.test.js +191 -0
  61. package/dist/services/app-normalization.js +212 -0
  62. package/dist/services/cluster-merge.js +69 -0
  63. package/dist/services/clustering.js +237 -0
  64. package/dist/services/debug.js +58 -0
  65. package/dist/services/frame-sampling.js +318 -0
  66. package/dist/services/signal-extraction.js +106 -0
  67. package/dist/services/subject-grouping.js +342 -0
  68. package/dist/services/temporal-alignment.js +99 -0
  69. package/dist/services/vlm-enrichment.js +84 -0
  70. package/dist/services/vlm-service.js +130 -0
  71. package/dist/stats/index.js +3 -0
  72. package/dist/stats/observer.js +65 -0
  73. package/dist/stats/repository.js +36 -0
  74. package/dist/stats/resource-tracker.js +86 -0
  75. package/dist/stats/types.js +1 -0
  76. package/dist/test-classification-prompts.js +181 -0
  77. package/dist/tests/cap.adapter.test.js +75 -0
  78. package/dist/tests/capture.cap.adapter.test.js +69 -0
  79. package/dist/tests/classify-session.test.js +140 -0
  80. package/dist/tests/db/repositories.test.js +243 -0
  81. package/dist/tests/domain/time-range.test.js +31 -0
  82. package/dist/tests/integration.test.js +84 -0
  83. package/dist/tests/intelligence.adapter.test.js +102 -0
  84. package/dist/tests/intelligence.ollama.adapter.test.js +178 -0
  85. package/dist/tests/process-v2.test.js +90 -0
  86. package/dist/tests/services/clustering.test.js +112 -0
  87. package/dist/tests/services/frame-sampling.test.js +152 -0
  88. package/dist/tests/utils/ocr.test.js +76 -0
  89. package/dist/tests/utils/parallel.test.js +57 -0
  90. package/dist/tests/visual-observer.test.js +175 -0
  91. package/dist/utils/id-normalization.js +15 -0
  92. package/dist/utils/index.js +9 -0
  93. package/dist/utils/model-detector.js +154 -0
  94. package/dist/utils/ocr.js +80 -0
  95. package/dist/utils/parallel.js +32 -0
  96. package/migrations/001_initial.sql +109 -0
  97. package/migrations/002_clusters.sql +41 -0
  98. package/migrations/003_observations_vlm_fields.sql +14 -0
  99. package/migrations/004_observations_unique.sql +18 -0
  100. package/migrations/005_processing_stats.sql +29 -0
  101. package/migrations/006_vlm_raw_response.sql +6 -0
  102. package/migrations/007_subjects.sql +23 -0
  103. package/migrations/008_artifacts_recording.sql +6 -0
  104. package/migrations/009_artifact_subjects.sql +10 -0
  105. package/package.json +82 -0
  106. package/prompts/action-items.md +55 -0
  107. package/prompts/blog-draft.md +54 -0
  108. package/prompts/blog-research.md +87 -0
  109. package/prompts/card.md +54 -0
  110. package/prompts/classify-segment.md +38 -0
  111. package/prompts/classify.md +37 -0
  112. package/prompts/code-snippets.md +163 -0
  113. package/prompts/extract-metadata.md +149 -0
  114. package/prompts/notes.md +83 -0
  115. package/prompts/runbook.md +123 -0
  116. package/prompts/standup.md +50 -0
  117. package/prompts/step-by-step.md +125 -0
  118. package/prompts/subject-grouping.md +31 -0
  119. package/prompts/summary-v3.md +89 -0
  120. package/prompts/summary.md +77 -0
  121. package/prompts/topic-classifier.md +24 -0
  122. package/prompts/topic-extract.md +13 -0
  123. package/prompts/vlm-batch.md +21 -0
  124. package/prompts/vlm-single.md +19 -0
package/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Eduardo Sanchez
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
package/README.md ADDED
@@ -0,0 +1,297 @@
1
+ # Escribano
2
+
3
+ Record your screen. Get a structured summary of what you did.
4
+
5
+ > **Platform:** macOS (Apple Silicon) required. Linux/Windows on the roadmap.
6
+ > **Minimum:** 16GB unified memory (32GB recommended for best quality)
7
+
8
+ ---
9
+
10
+ ## What you put in
11
+
12
+ A screen recording. Could be 20 minutes, could be 3 hours. You didn't take notes.
13
+
14
+ ## What you get back (~9 minutes later)
15
+
16
+ ```markdown
17
+ # Session Card - Feb 25, 2026
18
+
19
+ ## Escribano Pipeline Optimization
20
+ **1h 53m** | coding 22m, debugging 30m, terminal 24m, review 58m, planning 6m
21
+
22
+ - Optimized the video processing pipeline by evaluating skip-frame strategies
23
+ and removing scene detection for 180-minute videos.
24
+ - Resolved persistent VLM parsing failures and truncation errors by implementing
25
+ raw response logging and fallback mechanisms.
26
+ - Executed database migrations to add the new observations table schema.
27
+ - Benchmarked the performance of the GLM-5 and Qwen-VL models.
28
+
29
+ ## Frame Extraction & Scene Detection
30
+ **19m** | coding 11m, debugging 4m, terminal 4m
31
+
32
+ - Developed TypeScript scripts for video frame extraction using FFmpeg.
33
+ - Debugged a critical parsing failure at Frame 3.
34
+ - Monitored terminal logs to track progress of a 792-second video file.
35
+
36
+ ## Research & System Analysis
37
+ **22m** | review 3m, research 2m, coding 7m, terminal 6m
38
+
39
+ - Reviewed GitHub Copilot pricing and Screenpipe repository architecture.
40
+ - Investigated the database schema in TablePlus.
41
+
42
+ ---
43
+ *Personal time: 2h 38m (WhatsApp, Instagram, Email)*
44
+ ```
45
+
46
+ That's the **card** format. Two others:
47
+
48
+ ### Standup format
49
+
50
+ ```markdown
51
+ ## Standup - Feb 25, 2026
52
+
53
+ **What I did:**
54
+ - Debugged VLM parsing failures by implementing raw response logging
55
+ - Optimized video frame extraction pipeline using FFmpeg
56
+ - Analyzed GLM-5 and Qwen-VL model performance
57
+ - Implemented database schema migrations
58
+
59
+ **Key outcomes:**
60
+ - Resolved truncated response issues with fallback parsing
61
+ - Identified scene detection as a latency bottleneck
62
+ - Validated new batch extraction strategy
63
+
64
+ **Next:**
65
+ - Merge scene detection optimization branch
66
+ - Benchmark qwen3_next model
67
+ - Add unit tests for fallback parsing
68
+ ```
69
+
70
+ Paste straight into Slack.
71
+
72
+ ### Narrative format
73
+
74
+ ```markdown
75
+ # Session Summary: Sunday, February 22, 2026
76
+
77
+ ## Overview
78
+ I spent nearly three hours optimizing the VLM inference pipeline. The main focus
79
+ was resolving JSON parsing errors during batch processing and benchmarking the
80
+ qwen3-vl:4b model against InternVL-14B. By the end, I'd identified the truncation
81
+ root cause, adjusted MAX_TOKENS, and validated the fix against 342 frames —
82
+ resulting in a 4x speedup with continuous batching.
83
+
84
+ ## Timeline
85
+ * **0:00** (45m): Terminal work, running benchmark scripts
86
+ * **45:00** (60m): Debugging JSON parsing in VS Code
87
+ * **1:45:00** (40m): Researching model quantization
88
+ * **2:25:00** (34m): Documenting performance metrics
89
+ ...
90
+ ```
91
+
92
+ Good for retrospectives or blog drafts.
93
+
94
+ ---
95
+
96
+ ## Benchmarks
97
+
98
+ Ran the full pipeline on 11 real screen recordings:
99
+
100
+ | Metric | Result |
101
+ |--------|--------|
102
+ | Videos processed | 11 |
103
+ | Artifacts generated | 33 (3 formats × 11 videos) |
104
+ | Success rate | 100% |
105
+ | Total time | 1h 41m |
106
+ | Avg per video | **~9 min** (pipeline + all 3 formats) |
107
+ | Hardware | MacBook Pro M4 Max, 128GB |
108
+
109
+ Everything runs locally. No API keys. Nothing leaves your machine.
110
+
111
+ ---
112
+
113
+ ## Why this exists
114
+
115
+ Most screen recording tools just give you a video file. If you want to remember what you did, you have to watch it back.
116
+
117
+ Escribano watches it for you. It extracts frames, runs them through a vision-language model, transcribes any audio, and writes up what happened — broken into topics, with timestamps and time per activity.
118
+
119
+ Built for developers: understands the difference between debugging, coding, reading docs, and scrolling Slack. Doesn't just OCR text (which produces garbage when every screen has "function" and "const" on it).
120
+
121
+ ---
122
+
123
+ ## How it works
124
+
125
+ ```
126
+ Screen recording
127
+
128
+ ├──► Audio: Silero VAD → Whisper → transcripts
129
+
130
+ └──► Video: FFmpeg frames → scene detection → adaptive sampling
131
+
132
+
133
+ VLM inference (MLX-VLM, Qwen3-VL-2B)
134
+
135
+
136
+ "Debugging in terminal"
137
+ "Reading docs in Chrome"
138
+ "Coding in VS Code"
139
+
140
+
141
+ Activity segmentation → temporal audio alignment → TopicBlocks
142
+
143
+
144
+ LLM summary (Ollama, auto-detected) → Markdown artifact
145
+ ```
146
+
147
+ Uses VLM-first visual understanding, not OCR + text clustering. OCR fails for developer work because all code screens produce similar tokens. VLMs understand the *activity*, not just the text.
148
+
149
+ ---
150
+
151
+ ## Quick Start
152
+
153
+ ### Prerequisites
154
+
155
+ ```bash
156
+ # macOS (Homebrew)
157
+ brew install ollama whisper-cpp ffmpeg
158
+
159
+ # MLX-VLM for frame analysis (Apple Silicon)
160
+ pip install mlx-vlm
161
+ ```
162
+
163
+ ### LLM Model Setup
164
+
165
+ Escribano auto-detects the best model for your hardware:
166
+
167
+ | Your RAM | Auto-selected | Install command |
168
+ |----------|---------------|-----------------|
169
+ | 16GB | `qwen3:8b` | `ollama pull qwen3:8b` |
170
+ | 32GB | `qwen3:14b` | `ollama pull qwen3:14b` |
171
+ | 64GB+ | `qwen3.5:27b` | `ollama pull qwen3.5:27b` |
172
+
173
+ ```bash
174
+ # Minimum (16GB)
175
+ ollama pull qwen3:8b
176
+
177
+ # Or best quality (64GB+)
178
+ ollama pull qwen3.5:27b
179
+ ```
180
+
181
+ ### Run
182
+
183
+ ```bash
184
+ # Check prerequisites
185
+ npx escribano doctor
186
+
187
+ # Process a recording
188
+ npx escribano --file "~/Desktop/Screen Recording.mov"
189
+ ```
190
+
191
+ ### Local Development
192
+
193
+ ```bash
194
+ git clone https://github.com/eduardosanzb/escribano.git
195
+ cd escribano
196
+ pnpm install
197
+ pnpm escribano --file "~/Desktop/Screen Recording.mov"
198
+ ```
199
+
200
+ Output: `~/.escribano/artifacts/`
201
+
202
+ ---
203
+
204
+ ## CLI
205
+
206
+ ### Flags
207
+
208
+ | Flag | What it does |
209
+ |------|--------------|
210
+ | `--file <path>` | Process a video file |
211
+ | `--mic-audio <path>` | External mic audio |
212
+ | `--system-audio <path>` | External system audio |
213
+ | `--format <format>` | `card`, `standup`, or `narrative` (default: card) |
214
+ | `--force` | Reprocess from scratch |
215
+ | `--skip-summary` | Process frames only, skip artifact |
216
+ | `--include-personal` | Include personal time (filtered by default) |
217
+ | `--copy` | Copy to clipboard |
218
+ | `--stdout` | Print to stdout |
219
+ | `--help` | Show all options |
220
+
221
+ ### Formats
222
+
223
+ | Format | Use for | Style |
224
+ |--------|---------|-------|
225
+ | `card` | Personal review, daily notes | Time breakdowns per subject, bullets |
226
+ | `standup` | Daily standup, async updates | What I did / Outcomes / Next |
227
+ | `narrative` | Retrospectives, blog drafts | Prose with timeline |
228
+
229
+ ### Examples
230
+
231
+ ```bash
232
+ # Process and copy
233
+ npx escribano --file "~/Desktop/Screen Recording.mov" --format standup --copy
234
+
235
+ # Narrative format
236
+ npx escribano --file session.mp4 --format narrative --force
237
+
238
+ # With external audio
239
+ npx escribano --file recording.mov --mic-audio mic.wav
240
+ ```
241
+
242
+ ---
243
+
244
+ ## Supported inputs
245
+
246
+ | Source | Command |
247
+ |--------|---------|
248
+ | QuickTime recording | `--file video.mov` |
249
+ | Cap recording | Auto-detected in `~/Movies/Cap/` |
250
+ | Any MP4/MOV | `--file /path/to/video.mp4` |
251
+ | External audio | `--mic-audio mic.wav --system-audio system.wav` |
252
+
253
+ ---
254
+
255
+ ## Architecture
256
+
257
+ Clean architecture: domain entities, pure services, adapter interfaces for external systems (MLX-VLM, Ollama, Whisper, FFmpeg, SQLite).
258
+
259
+ Deep dives:
260
+ - [Why OCR fails for developers](docs/adr/005-vlm-first-visual-pipeline.md)
261
+ - [MLX-VLM migration for 4x speedup](docs/adr/006-mlx-vlm-adapter.md)
262
+ - [Benchmarks and learnings](docs/learnings.md)
263
+
264
+ Full architecture: [docs/architecture.md](docs/architecture.md)
265
+
266
+ ---
267
+
268
+ ## Requirements
269
+
270
+ - **macOS** (Apple Silicon for MLX-VLM)
271
+ - **Node.js 20+**
272
+ - **16GB+ RAM** (see model tiers above)
273
+ - **~10GB disk** for models
274
+
275
+ ---
276
+
277
+ ## Roadmap
278
+
279
+ - [x] VLM-first visual pipeline
280
+ - [x] MLX-VLM migration
281
+ - [x] Activity segmentation
282
+ - [x] Multiple artifact formats
283
+ - [x] Auto-detect best LLM model
284
+ - [ ] Auto-detect ffmpeg hardware acceleration
285
+ - [ ] OCR on keyframes for code/URLs
286
+ - [ ] MCP server for AI assistants
287
+ - [ ] Cross-recording queries
288
+
289
+ ---
290
+
291
+ ## License
292
+
293
+ MIT
294
+
295
+ ---
296
+
297
+ *Escribano = "The Scribe"*
@@ -0,0 +1,279 @@
1
/**
 * Escribano - Core Types
 *
 * All types and interfaces in one place.
 *
 * Every shape that crosses a module boundary is declared here as a Zod
 * schema so it can be validated at runtime (see `.parse` on the config
 * schemas below). Schemas compose bottom-up: recording -> transcript ->
 * session aggregate, with adapter/config schemas at the end.
 */
import { z } from 'zod';
// =============================================================================
// RECORDING
// =============================================================================
// A captured recording: its source, the media file paths on disk (any of
// which may be absent -> nullable), plus duration and capture time.
export const recordingSchema = z.object({
    id: z.string(),
    source: z.object({
        type: z.enum(['cap', 'meetily', 'raw']),
        originalPath: z.string(),
        metadata: z.record(z.string(), z.any()).optional(),
    }),
    videoPath: z.string().nullable(),
    audioMicPath: z.string().nullable(),
    audioSystemPath: z.string().nullable(),
    duration: z.number(),
    capturedAt: z.date(),
});
// =============================================================================
// TRANSCRIPT
// =============================================================================
// One time-coded chunk of speech. `speaker` is optional/nullable because
// not every transcription backend provides diarization.
export const transcriptSegmentSchema = z.object({
    id: z.string(),
    start: z.number(),
    end: z.number(),
    text: z.string(),
    speaker: z.string().nullable().optional(),
});
export const transcriptSchema = z.object({
    fullText: z.string(),
    segments: z.array(transcriptSegmentSchema),
    language: z.string().default('en'),
    duration: z.number(),
});
// =============================================================================
// SESSION
// =============================================================================
// Tagged transcript to identify audio source
export const taggedTranscriptSchema = z.object({
    source: z.enum(['mic', 'system']),
    transcript: transcriptSchema,
});
// =============================================================================
// CLASSIFICATION
// =============================================================================
export const sessionTypeSchema = z.enum([
    'meeting',
    'debugging',
    'tutorial',
    'learning',
    'working',
]);
// Percentage score (0-100) per activity type; one field per
// sessionTypeSchema value above.
export const classificationSchema = z.object({
    meeting: z.number().min(0).max(100),
    debugging: z.number().min(0).max(100),
    tutorial: z.number().min(0).max(100),
    learning: z.number().min(0).max(100),
    working: z.number().min(0).max(100),
});
// =============================================================================
// TRANSCRIPT METADATA
// =============================================================================
// Internal building blocks for transcriptMetadataSchema (not exported).
const speakerSchema = z.object({
    name: z.string(),
    role: z.string().optional(),
});
const keyMomentSchema = z.object({
    timestamp: z.number(),
    description: z.string(),
    importance: z.enum(['high', 'medium', 'low']),
});
const actionItemSchema = z.object({
    description: z.string(),
    owner: z.string().nullable(),
    priority: z.enum(['high', 'medium', 'low']).optional(),
});
const technicalTermSchema = z.object({
    term: z.string(),
    context: z.string(),
    type: z.enum(['error', 'file', 'function', 'variable', 'other']),
});
const codeSnippetSchema = z.object({
    language: z.string().optional(),
    code: z.string(),
    description: z.string().optional(),
    timestamp: z.number().optional(),
});
// Structured data extracted from a transcript; every field is optional
// because extraction is best-effort.
export const transcriptMetadataSchema = z.object({
    speakers: z.array(speakerSchema).optional(),
    keyMoments: z.array(keyMomentSchema).optional(),
    actionItems: z.array(actionItemSchema).optional(),
    technicalTerms: z.array(technicalTermSchema).optional(),
    codeSnippets: z.array(codeSnippetSchema).optional(),
});
// =============================================================================
// ARTIFACTS
// =============================================================================
export const artifactTypeSchema = z.enum([
    'summary',
    'action-items',
    'runbook',
    'step-by-step',
    'notes',
    'code-snippets',
    'blog-research',
    'blog-draft',
]);
// A generated output document. Only markdown is produced today, hence the
// single-value enum with a default.
export const artifactSchema = z.object({
    id: z.string(),
    type: artifactTypeSchema,
    content: z.string(),
    format: z.enum(['markdown']).default('markdown'),
    createdAt: z.date(),
});
// =============================================================================
// VISUAL LOG
// =============================================================================
export const visualLogEntrySchema = z.object({
    timestamp: z.number(),
    imagePath: z.string(),
    description: z.string().optional(),
    ocrSummary: z.string().optional(),
    heuristicLabel: z.string().optional(),
    sceneScore: z.number().optional(),
});
export const visualLogSchema = z.object({
    entries: z.array(visualLogEntrySchema),
    source: z.enum(['screen', 'camera']).default('screen'),
});
// =============================================================================
// VISUAL ANALYSIS (External Tool Output)
// =============================================================================
export const visualIndexFrameSchema = z.object({
    index: z.number(),
    timestamp: z.number(),
    imagePath: z.string(),
    ocrText: z.string(),
    clusterId: z.number(),
    changeScore: z.number(),
});
// A group of visually-similar frames; timeRange is a [start, end] pair.
export const visualIndexClusterSchema = z.object({
    id: z.number(),
    heuristicLabel: z.string(),
    timeRange: z.tuple([z.number(), z.number()]),
    frameCount: z.number(),
    representativeIdx: z.number(),
    avgOcrCharacters: z.number(),
    mediaIndicators: z.array(z.string()),
});
export const visualIndexSchema = z.object({
    frames: z.array(visualIndexFrameSchema),
    clusters: z.array(visualIndexClusterSchema),
    processingTime: z.object({
        ocrMs: z.number(),
        clipMs: z.number(),
        totalMs: z.number(),
    }),
});
export const visualDescriptionSchema = z.object({
    clusterId: z.number(),
    timestamp: z.number(),
    description: z.string(),
});
export const visualDescriptionsSchema = z.object({
    descriptions: z.array(visualDescriptionSchema),
    processingTime: z.object({
        vlmMs: z.number(),
        framesProcessed: z.number(),
    }),
});
// =============================================================================
// INTELLIGENCE & EMBEDDINGS
// =============================================================================
export const activityContextSchema = z.object({
    type: z.enum(['url', 'file', 'app', 'topic', 'unknown']),
    value: z.string(),
    confidence: z.number().min(0).max(1),
});
// A time slice of the session; segments flagged isNoise carry no useful
// activity and are skipped by downstream classification.
export const sessionSegmentSchema = z.object({
    id: z.string(),
    timeRange: z.tuple([z.number(), z.number()]),
    visualClusterIds: z.array(z.number()),
    contexts: z.array(activityContextSchema),
    transcriptSlice: taggedTranscriptSchema.nullable(),
    classification: classificationSchema.nullable(),
    isNoise: z.boolean(),
});
export const embeddingConfigSchema = z.object({
    model: z.string().default('qwen3-embedding:8b'),
    similarityThreshold: z.number().min(0).max(1).default(0.4),
});
// =============================================================================
// SESSION AGGREGATE & OUTLINE SYNC
// =============================================================================
// State of a session's sync into an Outline wiki (one document per session
// plus one per artifact, tracked by content hash).
export const outlineSyncStateSchema = z.object({
    collectionId: z.string(),
    sessionDocumentId: z.string(),
    sessionDocumentUrl: z.string(),
    artifacts: z.array(z.object({
        type: artifactTypeSchema,
        documentId: z.string(),
        documentUrl: z.string(),
        syncedAt: z.date(),
        contentHash: z.string(),
    })),
    lastSyncedAt: z.date(),
});
// The central aggregate: a recording plus everything derived from it.
// `status` tracks pipeline progress; 'error' pairs with errorMessage.
export const sessionSchema = z.object({
    id: z.string(),
    recording: recordingSchema,
    transcripts: z.array(taggedTranscriptSchema),
    visualLogs: z.array(visualLogSchema).default([]),
    segments: z.array(sessionSegmentSchema).default([]),
    status: z.enum([
        'raw',
        'transcribed',
        'visual-logged',
        'classified',
        'metadata-extracted',
        'complete',
        'error',
    ]),
    classification: classificationSchema.nullable(),
    metadata: transcriptMetadataSchema.nullable(),
    artifacts: z.array(artifactSchema).default([]),
    outlineSyncState: outlineSyncStateSchema.nullable().optional(),
    createdAt: z.date(),
    updatedAt: z.date(),
    errorMessage: z.string().nullable().optional(),
});
// =============================================================================
// CONFIG
// =============================================================================
export const capConfigSchema = z.object({
    recordingsPath: z
        .string()
        .default('~/Library/Application Support/so.cap.desktop/recordings'),
});
export const whisperConfigSchema = z.object({
    binaryPath: z.string().default('whisper-cli'),
    model: z.string().default('large-v3'),
    cwd: z.string().optional(),
    outputFormat: z.enum(['json', 'txt', 'srt', 'vtt']).default('json'),
    language: z.string().optional(),
});
export const intelligenceConfigSchema = z.object({
    provider: z.enum(['ollama', 'mlx']).default('ollama'),
    endpoint: z.string().default('http://localhost:11434/api/chat'),
    model: z.string().default('qwen3.5:27b'),
    generationModel: z.string().default('qwen3.5:27b'),
    visionModel: z.string().default('minicpm-v:8b'),
    maxRetries: z.number().default(3),
    timeout: z.number().default(600000), // 10 minutes
    keepAlive: z.string().default('10m'),
    maxContextSize: z.number().default(131072), // 128K tokens -- qwen3-class models support up to 128K
    // NOTE: this default object overrides embeddingConfigSchema's own field
    // defaults ('qwen3-embedding:8b' / 0.4) when parsed with no input.
    embedding: embeddingConfigSchema.default({
        model: 'nomic-embed-text',
        similarityThreshold: 0.75,
    }),
    // MLX-VLM specific config
    vlmBatchSize: z.number().default(4),
    vlmMaxTokens: z.number().default(2000),
    mlxSocketPath: z.string().default('/tmp/escribano-mlx.sock'),
});
// Fully-defaulted intelligence config, materialized once at module load.
export const DEFAULT_INTELLIGENCE_CONFIG = intelligenceConfigSchema.parse({});
const artifactConfigSchema = z.object({
    parallelGeneration: z.boolean().default(false),
    maxParallel: z.number().default(3),
    maxScreenshots: z.number().default(10),
});
export const outlineConfigSchema = z.object({
    url: z.string().url(),
    token: z.string(),
    collectionName: z.string().default('Escribano Sessions'),
});
@@ -0,0 +1,77 @@
1
+ /**
2
+ * Escribano - Classify Session Action
3
+ *
4
+ * Classifies a session using IntelligenceService.
5
+ * If segments exist, it classifies each segment individually and aggregates the results.
6
+ */
7
+ import { Session } from '../domain/session.js';
8
+ import { Transcript } from '../domain/transcript.js';
9
/**
 * Classify a session's activity using the intelligence service.
 *
 * When the session has segments, each non-noise segment is classified one
 * at a time (sequential on purpose: avoids a parallel model warmup race
 * and overloading Ollama) and the session-level classification is derived
 * by aggregating the per-segment results via Session.getActivityBreakdown.
 * Noise segments receive an all-zero classification. Without segments, the
 * function falls back to legacy whole-session classification over the
 * (possibly interleaved) transcript.
 *
 * @param {object} session - Session with transcripts (required), segments, visualLogs.
 * @param {object} intelligence - Service exposing classifySegment() and classify().
 * @returns {Promise<object>} A new session object with status 'classified'.
 * @throws {Error} If the session has no transcripts.
 */
export async function classifySession(session, intelligence) {
    if (session.transcripts.length === 0) {
        throw new Error('Cannot classify session without transcripts');
    }
    if (session.segments.length > 0) {
        const nonNoiseSegments = session.segments.filter((s) => !s.isNoise);
        const noiseCount = session.segments.length - nonNoiseSegments.length;
        console.log(`Classifying ${nonNoiseSegments.length} segments (${noiseCount} noise skipped)...`);
        const classifiedSegments = [];
        let classifiedCount = 0;
        // Sequential classification to avoid parallel warmup race + Ollama overload
        for (const segment of session.segments) {
            if (segment.isNoise) {
                // Noise contributes nothing to any activity bucket.
                classifiedSegments.push({
                    ...segment,
                    classification: {
                        meeting: 0,
                        debugging: 0,
                        tutorial: 0,
                        learning: 0,
                        working: 0,
                    },
                });
                continue;
            }
            classifiedCount += 1;
            console.log(` Segment ${classifiedCount}/${nonNoiseSegments.length}: ${segment.id}`);
            try {
                const classification = await intelligence.classifySegment(segment);
                classifiedSegments.push({ ...segment, classification });
            }
            catch (error) {
                // Best effort: keep the segment unclassified rather than failing the run.
                console.error(` Failed to classify segment ${segment.id}:`, error);
                classifiedSegments.push(segment);
            }
        }
        const updatedSession = {
            ...session,
            segments: classifiedSegments,
            status: 'classified',
            updatedAt: new Date(),
        };
        // Derive session-level classification from the aggregated segments,
        // converting the breakdown Record into the Classification shape.
        const breakdown = Session.getActivityBreakdown(updatedSession);
        updatedSession.classification = {
            meeting: breakdown.meeting || 0,
            debugging: breakdown.debugging || 0,
            tutorial: breakdown.tutorial || 0,
            learning: breakdown.learning || 0,
            working: breakdown.working || 0,
        };
        return updatedSession;
    }
    // Fallback to legacy whole-session classification if no segments exist
    console.log('No segments found, falling back to session-level classification...');
    const transcriptForClassification = session.transcripts.length === 1
        ? session.transcripts[0].transcript
        : Transcript.interleave(session.transcripts);
    const classification = await intelligence.classify(transcriptForClassification, session.visualLogs);
    return {
        ...session,
        status: 'classified',
        classification,
        updatedAt: new Date(),
    };
}