openadapt-ml 0.2.0__py3-none-any.whl → 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (95)
  1. openadapt_ml/baselines/__init__.py +121 -0
  2. openadapt_ml/baselines/adapter.py +185 -0
  3. openadapt_ml/baselines/cli.py +314 -0
  4. openadapt_ml/baselines/config.py +448 -0
  5. openadapt_ml/baselines/parser.py +922 -0
  6. openadapt_ml/baselines/prompts.py +787 -0
  7. openadapt_ml/benchmarks/__init__.py +13 -115
  8. openadapt_ml/benchmarks/agent.py +265 -421
  9. openadapt_ml/benchmarks/azure.py +28 -19
  10. openadapt_ml/benchmarks/azure_ops_tracker.py +521 -0
  11. openadapt_ml/benchmarks/cli.py +1722 -4847
  12. openadapt_ml/benchmarks/trace_export.py +631 -0
  13. openadapt_ml/benchmarks/viewer.py +22 -5
  14. openadapt_ml/benchmarks/vm_monitor.py +530 -29
  15. openadapt_ml/benchmarks/waa_deploy/Dockerfile +47 -53
  16. openadapt_ml/benchmarks/waa_deploy/api_agent.py +21 -20
  17. openadapt_ml/cloud/azure_inference.py +3 -5
  18. openadapt_ml/cloud/lambda_labs.py +722 -307
  19. openadapt_ml/cloud/local.py +2038 -487
  20. openadapt_ml/cloud/ssh_tunnel.py +68 -26
  21. openadapt_ml/datasets/next_action.py +40 -30
  22. openadapt_ml/evals/grounding.py +8 -3
  23. openadapt_ml/evals/plot_eval_metrics.py +15 -13
  24. openadapt_ml/evals/trajectory_matching.py +41 -26
  25. openadapt_ml/experiments/demo_prompt/format_demo.py +16 -6
  26. openadapt_ml/experiments/demo_prompt/run_experiment.py +26 -16
  27. openadapt_ml/experiments/representation_shootout/__init__.py +70 -0
  28. openadapt_ml/experiments/representation_shootout/conditions.py +708 -0
  29. openadapt_ml/experiments/representation_shootout/config.py +390 -0
  30. openadapt_ml/experiments/representation_shootout/evaluator.py +659 -0
  31. openadapt_ml/experiments/representation_shootout/runner.py +687 -0
  32. openadapt_ml/experiments/waa_demo/runner.py +29 -14
  33. openadapt_ml/export/parquet.py +36 -24
  34. openadapt_ml/grounding/detector.py +18 -14
  35. openadapt_ml/ingest/__init__.py +8 -6
  36. openadapt_ml/ingest/capture.py +25 -22
  37. openadapt_ml/ingest/loader.py +7 -4
  38. openadapt_ml/ingest/synthetic.py +189 -100
  39. openadapt_ml/models/api_adapter.py +14 -4
  40. openadapt_ml/models/base_adapter.py +10 -2
  41. openadapt_ml/models/providers/__init__.py +288 -0
  42. openadapt_ml/models/providers/anthropic.py +266 -0
  43. openadapt_ml/models/providers/base.py +299 -0
  44. openadapt_ml/models/providers/google.py +376 -0
  45. openadapt_ml/models/providers/openai.py +342 -0
  46. openadapt_ml/models/qwen_vl.py +46 -19
  47. openadapt_ml/perception/__init__.py +35 -0
  48. openadapt_ml/perception/integration.py +399 -0
  49. openadapt_ml/retrieval/demo_retriever.py +50 -24
  50. openadapt_ml/retrieval/embeddings.py +9 -8
  51. openadapt_ml/retrieval/retriever.py +3 -1
  52. openadapt_ml/runtime/__init__.py +50 -0
  53. openadapt_ml/runtime/policy.py +18 -5
  54. openadapt_ml/runtime/safety_gate.py +471 -0
  55. openadapt_ml/schema/__init__.py +9 -0
  56. openadapt_ml/schema/converters.py +74 -27
  57. openadapt_ml/schema/episode.py +31 -18
  58. openadapt_ml/scripts/capture_screenshots.py +530 -0
  59. openadapt_ml/scripts/compare.py +85 -54
  60. openadapt_ml/scripts/demo_policy.py +4 -1
  61. openadapt_ml/scripts/eval_policy.py +15 -9
  62. openadapt_ml/scripts/make_gif.py +1 -1
  63. openadapt_ml/scripts/prepare_synthetic.py +3 -1
  64. openadapt_ml/scripts/train.py +21 -9
  65. openadapt_ml/segmentation/README.md +920 -0
  66. openadapt_ml/segmentation/__init__.py +97 -0
  67. openadapt_ml/segmentation/adapters/__init__.py +5 -0
  68. openadapt_ml/segmentation/adapters/capture_adapter.py +420 -0
  69. openadapt_ml/segmentation/annotator.py +610 -0
  70. openadapt_ml/segmentation/cache.py +290 -0
  71. openadapt_ml/segmentation/cli.py +674 -0
  72. openadapt_ml/segmentation/deduplicator.py +656 -0
  73. openadapt_ml/segmentation/frame_describer.py +788 -0
  74. openadapt_ml/segmentation/pipeline.py +340 -0
  75. openadapt_ml/segmentation/schemas.py +622 -0
  76. openadapt_ml/segmentation/segment_extractor.py +634 -0
  77. openadapt_ml/training/azure_ops_viewer.py +1097 -0
  78. openadapt_ml/training/benchmark_viewer.py +52 -41
  79. openadapt_ml/training/shared_ui.py +7 -7
  80. openadapt_ml/training/stub_provider.py +57 -35
  81. openadapt_ml/training/trainer.py +143 -86
  82. openadapt_ml/training/trl_trainer.py +70 -21
  83. openadapt_ml/training/viewer.py +323 -108
  84. openadapt_ml/training/viewer_components.py +180 -0
  85. {openadapt_ml-0.2.0.dist-info → openadapt_ml-0.2.1.dist-info}/METADATA +215 -14
  86. openadapt_ml-0.2.1.dist-info/RECORD +116 -0
  87. openadapt_ml/benchmarks/base.py +0 -366
  88. openadapt_ml/benchmarks/data_collection.py +0 -432
  89. openadapt_ml/benchmarks/live_tracker.py +0 -180
  90. openadapt_ml/benchmarks/runner.py +0 -418
  91. openadapt_ml/benchmarks/waa.py +0 -761
  92. openadapt_ml/benchmarks/waa_live.py +0 -619
  93. openadapt_ml-0.2.0.dist-info/RECORD +0 -86
  94. {openadapt_ml-0.2.0.dist-info → openadapt_ml-0.2.1.dist-info}/WHEEL +0 -0
  95. {openadapt_ml-0.2.0.dist-info → openadapt_ml-0.2.1.dist-info}/licenses/LICENSE +0 -0
openadapt_ml/segmentation/README.md (new file)
@@ -0,0 +1,920 @@
# Workflow Segmentation System

The workflow segmentation system automatically extracts, deduplicates, and annotates reusable workflow episodes from GUI recordings. This creates a library of canonical workflows that can be used for:

- **Training data curation**: Identify high-quality demonstration episodes for fine-tuning
- **Demo retrieval**: Build libraries of workflows for demo-conditioned prompting
- **Workflow documentation**: Automatically generate step-by-step workflow guides
- **Deduplication**: Find similar workflows across recordings to build canonical definitions

## Architecture

The system uses a **4-stage pipeline**:

### Stage 1: Frame Description (VLM)
Converts screenshots + actions into semantic descriptions using Vision-Language Models.

**Input**: Recording directory with screenshots and action events
**Output**: `ActionTranscript` with frame-by-frame descriptions

**Example**:
```python
from openadapt_ml.segmentation import FrameDescriber

describer = FrameDescriber(model="gemini-2.0-flash")
transcript = describer.describe_recording("/path/to/recording")

# View as plain text
print(transcript.to_transcript_text())
# [00:00.0] User opens System Preferences from Apple menu
# [00:02.5] User clicks Display settings icon
# [00:05.1] User navigates to Night Shift tab
# ...
```

**Supported VLMs**:
- Gemini 2.0 Flash / Pro (recommended for speed)
- Claude Sonnet 4 / Haiku
- GPT-4o / GPT-4o-mini

**Features**:
- Automatic caching to avoid reprocessing frames
- Batch processing for API efficiency
- Extracts: application name, visible elements, screen context, action target, user intent

---

### Stage 2: Episode Extraction (LLM)
Identifies coherent workflow boundaries and extracts episodes using Large Language Models.

**Input**: `ActionTranscript` from Stage 1
**Output**: `EpisodeExtractionResult` with identified episodes

**Example**:
```python
from openadapt_ml.segmentation import SegmentExtractor

extractor = SegmentExtractor(
    model="gpt-4o",
    use_few_shot=True,           # Include examples in prompts
    min_segment_duration=2.0,    # Minimum episode length
    max_segment_duration=300.0   # Maximum episode length
)

result = extractor.extract_segments(transcript)

for episode in result.episodes:
    print(f"{episode.name}: {episode.start_time_formatted} - {episode.end_time_formatted}")
    print(f"  Steps: {', '.join(episode.step_summaries)}")
    print(f"  Confidence: {episode.boundary_confidence:.2f}")
```

**Output**:
- Episode name and description
- Precise start/end timestamps
- Step-by-step breakdown
- Prerequisites and outcomes
- Boundary confidence scores

**Supported LLMs**:
- GPT-4o / GPT-4o-mini (recommended)
- Claude Sonnet 4 / Haiku
- Gemini 2.0 Pro / Flash

**Features**:
- Few-shot prompting for better segmentation quality
- Hierarchical extraction (nested subtasks)
- Confidence-based filtering (see the sketch below)
- Manual boundary adjustment helpers

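The confidence scores above make it easy to screen extractions before they move further down the pipeline. A minimal sketch of confidence-based filtering, using only the `result.episodes` and `boundary_confidence` fields shown in the example (the cutoff value is illustrative):

```python
# Keep only episodes whose boundary confidence clears a cutoff before
# passing them on to deduplication or annotation.
CONFIDENCE_CUTOFF = 0.7  # illustrative; tune per dataset

confident_episodes = [
    ep for ep in result.episodes if ep.boundary_confidence >= CONFIDENCE_CUTOFF
]
print(f"Kept {len(confident_episodes)} of {len(result.episodes)} episodes")
```
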
---

### Stage 3: Deduplication (Embeddings)
Finds and merges similar workflows across recordings using embedding similarity.

**Input**: List of `EpisodeExtractionResult` from multiple recordings
**Output**: `EpisodeLibrary` with canonical workflows

**Example**:
```python
from openadapt_ml.segmentation import WorkflowDeduplicator

dedup = WorkflowDeduplicator(
    threshold=0.85,              # Cosine similarity threshold (0.80-0.90 recommended)
    embedding_model="text-embedding-3-large",
    merge_strategy="centroid"    # or "longest", "first"
)

library = dedup.deduplicate(extraction_results)

print(f"Total episodes: {library.total_episodes_extracted}")
print(f"Unique workflows: {library.unique_episode_count}")
print(f"Deduplication ratio: {library.deduplication_ratio:.1%}")
```

**Features**:
- Semantic similarity using text embeddings (OpenAI API or local HuggingFace models)
- Agglomerative clustering with cosine similarity
- Multiple merge strategies (centroid, longest, first)
- Incremental library updates (add new recordings to existing libraries)

**Merge Strategies**:
- `centroid`: Use episode closest to cluster centroid (most representative)
- `longest`: Use episode with longest/most detailed description
- `first`: Use first encountered episode

---

### Stage 4: Annotation (VLM Quality Assessment)
Automatically annotates episodes for training data quality control.

**Input**: `EpisodeExtractionResult` + recording path
**Output**: `AnnotatedEpisodeLibrary` with gold/exclusion labels

**Example**:
```python
from openadapt_ml.segmentation import EpisodeAnnotator

annotator = EpisodeAnnotator(
    model="gemini-2.0-flash",
    lookahead_frames=10    # Analyze frames after episode to detect failures
)

library = annotator.annotate_extraction_result(
    extraction_result=result,
    recording_path="/path/to/recording"
)

print(f"Total episodes: {library.total_episodes}")
print(f"Recommended as gold: {library.gold_count}")
print(f"Pending human review: {library.total_episodes - library.verified_count}")

# Get gold episodes for export
gold_episodes = library.get_verified_gold_episodes()
```

**What it checks**:
- Boundary accuracy (are start/end frames correct?)
- Workflow completeness (did all steps execute successfully?)
- Failure detection:
  - Error dialogs or messages
  - Undo actions (Ctrl+Z, etc.)
  - Repeated attempts at same action
  - User navigating back or canceling
- Post-episode analysis (examines frames *after* episode ends for delayed failures)

**Output**:
- `is_gold`: Boolean recommendation for training data inclusion
- `confidence`: VLM confidence in assessment (0-1)
- `failure_signals`: List of detected issues
- `exclusion_reason`: Explanation if not gold
- `start_frame` / `end_frame`: Refined boundaries

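These fields support simple triage ahead of human review, in the spirit of the CLI's `--auto-approve-high-confidence` flag. A minimal sketch, assuming the `get_pending_review()` helper shown under Use Cases below and the annotation fields listed above (the 0.9 cutoff is illustrative):

```python
# Queue confident, failure-free gold recommendations for quick approval and
# flag everything else for closer human review.
quick_approve, needs_review = [], []
for episode, annotation in library.get_pending_review():
    if annotation.is_gold and annotation.confidence >= 0.9 and not annotation.failure_signals:
        quick_approve.append((episode, annotation))
    else:
        needs_review.append((episode, annotation))

print(f"{len(quick_approve)} quick approvals, {len(needs_review)} for closer review")
```
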
**Human-in-the-loop review**:
```python
from openadapt_ml.segmentation import verify_annotation

# After reviewing an annotation
verified = verify_annotation(
    annotation=ann,
    is_gold=True,    # Human decision
    notes="Verified - workflow completed successfully",
    verified_by="reviewer_name"
)
```

---

## Complete Pipeline

Run all 4 stages together:

```python
from openadapt_ml.segmentation import SegmentationPipeline, PipelineConfig

config = PipelineConfig(
    vlm_model="gemini-2.0-flash",    # Stage 1
    llm_model="gpt-4o",              # Stage 2
    similarity_threshold=0.85,       # Stage 3
    use_local_embeddings=False,      # Use OpenAI embeddings
    cache_enabled=True
)

pipeline = SegmentationPipeline(config)

result = pipeline.run(
    recordings=[
        "/path/to/recording1",
        "/path/to/recording2"
    ],
    output_dir="segmentation_output",
    progress_callback=lambda stage, cur, tot: print(f"[{stage}] {cur}/{tot}")
)

print(f"Recordings processed: {result.recordings_processed}")
print(f"Total episodes: {result.total_episodes_extracted}")
print(f"Unique workflows: {result.unique_episodes}")
print(f"Processing time: {result.processing_time_seconds:.1f}s")
```

The pipeline automatically saves intermediate results:
- `{recording_id}_transcript.json` - Stage 1 output
- `{recording_id}_episodes.json` - Stage 2 output
- `episode_library.json` - Stage 3 output (final deduplicated library)

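Because the intermediate files are plain JSON serializations of the Pydantic schemas, later stages can be re-run without repeating the expensive VLM/LLM calls. A minimal sketch, assuming the saved episodes file is a serialized `EpisodeExtractionResult` importable from `openadapt_ml/segmentation/schemas.py`, and reusing the `dedup` instance from Stage 3 above:

```python
import json
from pathlib import Path

from openadapt_ml.segmentation.schemas import EpisodeExtractionResult

# Reload a saved Stage 2 result and re-run deduplication only.
saved = Path("segmentation_output/recording1_episodes.json")
extraction = EpisodeExtractionResult.model_validate(json.loads(saved.read_text()))
library = dedup.deduplicate([extraction])
```
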
---

## CLI Usage

All stages have CLI commands:

### Describe (Stage 1)
```bash
# Generate frame descriptions
python -m openadapt_ml.segmentation.cli describe \
    --recording /path/to/recording \
    --model gemini-2.0-flash \
    --output transcript.json

# View as plain text
python -m openadapt_ml.segmentation.cli describe \
    --recording /path/to/recording \
    --format text
```

### Extract (Stage 2)
```bash
# Extract episodes from a recording
python -m openadapt_ml.segmentation.cli extract \
    --recording /path/to/recording \
    --model gpt-4o \
    --output episodes.json

# Or from existing transcript
python -m openadapt_ml.segmentation.cli extract \
    --transcript transcript.json \
    --model gpt-4o \
    --output episodes.json
```

### Deduplicate (Stage 3)
```bash
# Deduplicate across multiple recordings
python -m openadapt_ml.segmentation.cli deduplicate \
    recording1_episodes.json recording2_episodes.json \
    --threshold 0.85 \
    --output library.json

# Or from a directory
python -m openadapt_ml.segmentation.cli deduplicate \
    --input-dir segmentation_output/ \
    --threshold 0.85 \
    --output library.json
```

### Annotate (Stage 4)
```bash
# Auto-annotate episodes for quality control
python -m openadapt_ml.segmentation.cli annotate \
    --episodes recording1_episodes.json \
    --recording /path/to/recording1 \
    --model gemini-2.0-flash \
    --output annotated_library.json

# Review annotations interactively
python -m openadapt_ml.segmentation.cli review \
    --library annotated_library.json \
    --recording /path/to/recording1 \
    --reviewer your_name \
    --auto-approve-high-confidence  # Auto-approve confidence > 0.9

# Export gold episodes for fine-tuning
python -m openadapt_ml.segmentation.cli export-gold \
    annotated_library.json \
    --format jsonl \
    --output gold_episodes.jsonl \
    --include-screenshots
```

### Complete Pipeline (all stages)
```bash
python -m openadapt_ml.segmentation.cli pipeline \
    /path/to/recording1 /path/to/recording2 /path/to/recording3 \
    --vlm-model gemini-2.0-flash \
    --llm-model gpt-4o \
    --threshold 0.85 \
    --output segmentation_output/ \
    --save-intermediate \
    --verbose
```

### List Library Contents
```bash
python -m openadapt_ml.segmentation.cli list \
    --library library.json \
    --details
```

### Export Library
```bash
# Export as CSV, JSONL, or HTML
python -m openadapt_ml.segmentation.cli export \
    library.json \
    --format html \
    --output workflows.html
```

---

## Data Schemas

All schemas are defined using Pydantic in `openadapt_ml/segmentation/schemas.py`:

### `FrameDescription` (Stage 1 output)
```python
{
    "timestamp": 2.5,
    "formatted_time": "00:02.5",
    "visible_application": "System Preferences",
    "visible_elements": ["Night Shift toggle", "Schedule slider"],
    "screen_context": "Display settings panel with Night Shift tab active",
    "action_type": "click",
    "action_target": "Night Shift toggle",
    "action_value": None,
    "apparent_intent": "Enable Night Shift automatic scheduling",
    "confidence": 0.95,
    "frame_index": 5,
    "vlm_model": "gemini-2.0-flash"
}
```

### `Episode` (Stage 2 output)
```python
{
    "episode_id": "uuid-here",
    "name": "Configure Night Shift Schedule",
    "start_time": 0.0,
    "end_time": 12.5,
    "start_time_formatted": "00:00.0",
    "end_time_formatted": "00:12.5",
    "description": "Enable and configure Night Shift automatic scheduling...",
    "step_summaries": [
        "Open System Preferences",
        "Navigate to Display > Night Shift",
        "Enable Night Shift",
        "Set schedule 9 PM - 7 AM"
    ],
    "application": "System Preferences",
    "prerequisites": ["System Preferences must be accessible"],
    "outcomes": ["Night Shift enabled with custom schedule"],
    "boundary_confidence": 0.95,
    "coherence_score": 0.90,
    "recording_id": "recording1",
    "frame_indices": [0, 1, 2, 3, 4, 5]
}
```

### `CanonicalEpisode` (Stage 3 output)
```python
{
    "canonical_id": "uuid-here",
    "canonical_name": "Configure Night Shift Schedule",
    "canonical_description": "Enable and configure Night Shift...",
    "canonical_steps": ["Open System Preferences", "Navigate to Display > Night Shift", ...],
    "variant_names": ["Adjust Night Shift Settings", "Set up Night Shift"],
    "variant_descriptions": ["...", "..."],
    "source_recordings": ["recording1", "recording2"],
    "source_episode_ids": ["uuid1", "uuid2"],
    "occurrence_count": 3,
    "embedding": [0.123, -0.456, ...],
    "cluster_id": 0,
    "internal_similarity": 0.92
}
```

### `EpisodeAnnotation` (Stage 4 output)
```python
{
    "annotation_id": "uuid-here",
    "episode_id": "uuid-of-episode",
    "start_frame": 0,
    "end_frame": 5,
    "is_gold": True,
    "exclusion_reason": None,
    "confidence": 0.95,
    "human_verified": False,
    "notes": None,
    "failure_signals": [],
    "created_at": "2026-01-17T10:00:00",
    "verified_at": None,
    "verified_by": None
}
```

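Because these are Pydantic models, their JSON Schema can be generated directly, which is useful for documentation or for constraining structured LLM output. A minimal sketch, assuming Pydantic v2 and that the model classes are importable from `openadapt_ml/segmentation/schemas.py`:

```python
import json

from openadapt_ml.segmentation.schemas import Episode, FrameDescription

# Emit JSON Schema for the models above (truncated for display).
print(json.dumps(Episode.model_json_schema(), indent=2)[:500])
print(json.dumps(FrameDescription.model_json_schema(), indent=2)[:500])
```
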
---

## Configuration

### API Keys

Set environment variables for VLM/LLM providers:

```bash
export GOOGLE_API_KEY="your-gemini-key"
export ANTHROPIC_API_KEY="your-claude-key"
export OPENAI_API_KEY="your-openai-key"
```

### Caching

Frame descriptions are automatically cached to avoid reprocessing:

```python
# Cache location: ~/.openadapt/cache/descriptions/

# Clear cache for a specific recording
describer.clear_cache(recording_id="recording1")

# Disable caching
describer = FrameDescriber(cache_enabled=False)
```

### Local Embeddings (No API required)

Use local HuggingFace models instead of OpenAI embeddings:

```python
dedup = WorkflowDeduplicator(
    use_local_embeddings=True    # Uses intfloat/e5-large-v2
)

# Requires: pip install transformers torch
```

---

## Use Cases

### 1. Training Data Curation

Extract and filter high-quality episodes for fine-tuning:

```python
# Extract episodes from all recordings
results = []
for recording in recordings:
    transcript = describer.describe_recording(recording)
    result = extractor.extract_segments(transcript)
    results.append(result)

# Deduplicate to find unique workflows
library = dedup.deduplicate(results)

# Annotate for quality
annotator = EpisodeAnnotator()
for recording, result in zip(recordings, results):
    annotated = annotator.annotate_extraction_result(result, recording)

# Human review
for episode, annotation in annotated.get_pending_review():
    # Present to human for verification
    verified = verify_annotation(annotation, is_gold=True, verified_by="human")

# Export gold episodes
from openadapt_ml.segmentation import export_gold_episodes
export_gold_episodes(
    library=annotated,
    output_path="training_data.jsonl",
    format="jsonl"
)
```

### 2. Demo Retrieval Library

Build a searchable library of workflow demonstrations:

```python
# Build library from multiple recordings
library = pipeline.run(recordings, output_dir="demo_library").library

# Find similar workflows for retrieval
target_episode = Episode(...)    # Current task
similar = dedup.find_similar(target_episode, library, top_k=5)

for canonical, similarity in similar:
    print(f"{canonical.canonical_name}: {similarity:.2f}")
    print(f"  Found in: {canonical.source_recordings}")
    print(f"  Steps: {canonical.canonical_steps}")
```

### 3. Workflow Documentation

Generate documentation from recordings:

```python
result = pipeline.run(recordings, output_dir="docs")

# Export as HTML
from openadapt_ml.segmentation.cli import export
export(
    library=result.library,
    format="html",
    output="workflow_guide.html"
)
```

---

## Advanced Features

### Hierarchical Segmentation

Extract nested task/subtask structures:

```python
extractor = SegmentExtractor(hierarchical=True)
result = extractor.extract_segments(transcript)

for episode in result.episodes:
    if episode.child_episode_ids:
        print(f"{episode.name} contains {len(episode.child_episode_ids)} subtasks")
```

### Boundary Refinement

Manually adjust or automatically refine boundaries:

```python
# Automatic refinement
refined = extractor.refine_segment(segment, transcript)

# Manual adjustment
adjusted = extractor.adjust_boundary(
    segment,
    new_start=2.5,    # New start time
    new_end=15.0,     # New end time
    transcript=transcript
)
```

### Segment Merging

Merge adjacent segments that belong together:

```python
merged = extractor.merge_segments(
    segments=episodes,
    max_gap=2.0    # Max seconds between segments to merge
)
```

### Incremental Library Updates

Add new recordings to an existing library:

```python
import json
from pathlib import Path

from openadapt_ml.segmentation.schemas import EpisodeLibrary

# Load existing library
library_data = json.loads(Path("library.json").read_text())
existing_library = EpisodeLibrary.model_validate(library_data)

# Add new recording
new_result = pipeline.run(
    ["new_recording"],
    existing_library=existing_library
)

# Library now contains both old and new workflows
```

---

## Integration with openadapt-capture

**Status**: Integration layer needed

The segmentation system currently expects recordings in one of these formats:

1. **openadapt-capture format** (preferred):
   - Directory with `metadata.json` and `events.json`
   - `screenshots/` subdirectory with numbered PNGs

2. **JSON format**:
   - Single JSON file with base64-encoded screenshots

3. **Directory format**:
   - Directory with numbered PNG files
   - Creates synthetic event data

**Required**: Create an adapter to load from `capture.db` (the SQLite format used by openadapt-capture).

See the [Integration Requirements](#integration-requirements) section below for details.

---

## Next Steps & Recommendations

### P0 (High Priority)

1. **Create openadapt-capture adapter**
   - Read events from `capture.db` SQLite database
   - Convert to format expected by FrameDescriber
   - Location: `openadapt_ml/segmentation/adapters/capture_adapter.py`

2. **Add visualization generator**
   - Create annotated screenshots showing segment boundaries
   - Highlight key actions within segments
   - Generate comparison views (before/after deduplication)

3. **Integration tests**
   - Test full pipeline on real openadapt-capture recordings
   - Validate output quality
   - Benchmark performance (time, API costs)

### P1 (Medium Priority)

4. **Improve prompt engineering**
   - Refine few-shot examples based on real data
   - Add domain-specific examples (web, desktop, mobile)
   - Experiment with structured output formats (JSON schema)

5. **Cost optimization**
   - Implement frame sampling strategies (skip similar frames; see the sketch after this list)
   - Add batch processing limits to control API costs
   - Support vision-only models (no text description needed)

6. **Quality metrics**
   - Add inter-annotator agreement metrics
   - Track segmentation quality over time
   - Benchmark against human annotations

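One possible starting point for frame sampling (not part of the current package): drop frames that are nearly identical to the previously kept frame before sending them to the VLM. A rough sketch using Pillow and NumPy, with an arbitrary difference threshold:

```python
from pathlib import Path

import numpy as np
from PIL import Image, ImageChops

def sample_frames(paths: list[Path], threshold: float = 0.02) -> list[Path]:
    """Keep a frame only if it differs enough from the last kept frame."""
    kept: list[Path] = []
    last = None
    for path in paths:
        img = Image.open(path).convert("L").resize((64, 64))
        if last is not None:
            diff = np.asarray(ImageChops.difference(img, last), dtype=np.float32)
            if diff.mean() / 255.0 < threshold:
                continue  # nearly identical to the previous kept frame: skip
        kept.append(path)
        last = img
    return kept
```
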
### P2 (Nice to Have)

7. **Active learning**
   - Suggest most valuable recordings to annotate next
   - Identify edge cases that need human review
   - Adapt prompts based on human feedback

8. **Multi-modal features**
   - Incorporate audio transcripts (already captured)
   - Use OCR for better text extraction
   - Analyze cursor movement patterns

9. **Export formats**
   - HuggingFace datasets format
   - Parquet for large-scale storage
   - Demo-conditioning format for retrieval

---

## Integration Requirements

### openadapt-capture Adapter

The current recordings use `capture.db` (SQLite), but the segmentation system expects `events.json`. Create an adapter:

```python
# openadapt_ml/segmentation/adapters/capture_adapter.py

import sqlite3
import json
from pathlib import Path
from PIL import Image

class CaptureAdapter:
    """Adapter for openadapt-capture SQLite format."""

    def load_recording(self, capture_path: Path) -> tuple[list[Image.Image], list[dict]]:
        """Load recording from capture.db format.

        Args:
            capture_path: Path to recording directory with capture.db

        Returns:
            Tuple of (images, action_events)
        """
        db_path = capture_path / "capture.db"
        screenshots_dir = capture_path / "screenshots"

        # Connect to SQLite
        conn = sqlite3.connect(db_path)
        cursor = conn.cursor()

        # Query events
        cursor.execute("""
            SELECT timestamp, type, data
            FROM events
            WHERE type IN ('click', 'type', 'scroll', 'key', 'move')
            ORDER BY timestamp
        """)

        images = []
        events = []

        for i, (timestamp, event_type, data_json) in enumerate(cursor.fetchall()):
            data = json.loads(data_json)

            # Find corresponding screenshot; skip events without one so that
            # images and events stay aligned frame-for-frame
            screenshot_path = self._find_screenshot(screenshots_dir, i)
            if screenshot_path is None:
                continue
            images.append(Image.open(screenshot_path))

            # Convert to expected format
            event = {
                "timestamp": timestamp,
                "frame_index": i,
                "name": event_type,
                "mouse_x": data.get("x"),
                "mouse_y": data.get("y"),
                "text": data.get("text"),
            }
            events.append(event)

        conn.close()
        return images, events

    def _find_screenshot(self, screenshots_dir: Path, frame_index: int) -> Path | None:
        """Find screenshot file for frame index."""
        # openadapt-capture uses format: capture_{id}_step_{n}.png
        matches = list(screenshots_dir.glob(f"*_step_{frame_index}.png"))
        return matches[0] if matches else None
```

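For reference, a minimal usage sketch of the adapter proposed above (the recording path is illustrative):

```python
from pathlib import Path

from openadapt_ml.segmentation.adapters import CaptureAdapter

adapter = CaptureAdapter()
images, events = adapter.load_recording(Path("/path/to/recording"))
print(f"Loaded {len(images)} frames and {len(events)} events")
```
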
**Integration**:

Update `FrameDescriber._load_recording()` to use the adapter:

```python
# In frame_describer.py

def _load_recording(self, recording_path: Path):
    # Check for capture.db
    if (recording_path / "capture.db").exists():
        from openadapt_ml.segmentation.adapters import CaptureAdapter
        adapter = CaptureAdapter()
        return adapter.load_recording(recording_path)

    # ... existing code for other formats
```

---

## Cost Estimates

Approximate API costs for a 30-second recording (~20 frames):

### Stage 1 (Frame Description)
- **Gemini 2.0 Flash**: $0.01 - $0.05 per recording
- **Claude Haiku**: $0.10 - $0.30 per recording
- **GPT-4o-mini**: $0.05 - $0.15 per recording

### Stage 2 (Episode Extraction)
- **GPT-4o**: $0.01 - $0.02 per recording
- **Claude Sonnet 4**: $0.02 - $0.05 per recording

### Stage 3 (Deduplication)
- **OpenAI text-embedding-3-large**: $0.001 per recording
- **Local embeddings**: Free (requires GPU for speed)

### Stage 4 (Annotation)
- **Gemini 2.0 Flash**: $0.02 - $0.10 per episode
- **GPT-4o-mini**: $0.05 - $0.15 per episode

**Total per recording**: ~$0.05 - $0.50 depending on model choices

**Recommendation**: Use Gemini 2.0 Flash for Stages 1 & 4, GPT-4o for Stage 2, and local embeddings for Stage 3.

---

## Performance

Approximate processing times for a 30-second recording (~20 frames):

- **Stage 1 (Description)**: 10-30 seconds (with batching)
- **Stage 2 (Extraction)**: 5-15 seconds
- **Stage 3 (Deduplication)**: 1-5 seconds (per 100 episodes)
- **Stage 4 (Annotation)**: 10-20 seconds per episode

**Bottleneck**: VLM API calls (Stages 1 & 4). Use caching and batching to optimize.

---

## Troubleshooting

### "GOOGLE_API_KEY not set"
Set the API key: `export GOOGLE_API_KEY="your-key"`

### "Failed to load recording"
Check that the recording directory has the expected format (`screenshots/` plus `events.json` or `capture.db`).

### "No episodes extracted"
- Lower `min_segment_duration` if recordings are short
- Check `confidence_threshold` (try 0.5 instead of 0.7)
- Review the Stage 1 transcript to ensure VLM descriptions are accurate

### "Deduplication not working"
- Lower `threshold` (try 0.75 instead of 0.85)
- Check that episode descriptions are sufficiently detailed
- Verify embeddings are being generated correctly

### "High API costs"
- Enable caching: `cache_enabled=True`
- Use faster/cheaper models (Gemini Flash, GPT-4o-mini)
- Reduce batch size to process fewer frames per call
- Use local embeddings for Stage 3

---

## References

- **Schemas**: `openadapt_ml/segmentation/schemas.py`
- **Frame Describer**: `openadapt_ml/segmentation/frame_describer.py`
- **Segment Extractor**: `openadapt_ml/segmentation/segment_extractor.py`
- **Deduplicator**: `openadapt_ml/segmentation/deduplicator.py`
- **Annotator**: `openadapt_ml/segmentation/annotator.py`
- **Pipeline**: `openadapt_ml/segmentation/pipeline.py`
- **CLI**: `openadapt_ml/segmentation/cli.py`

---

## Example: Complete Workflow

```python
from openadapt_ml.segmentation import (
    SegmentationPipeline,
    PipelineConfig,
    EpisodeAnnotator,
    export_gold_episodes
)

# Configure pipeline
config = PipelineConfig(
    vlm_model="gemini-2.0-flash",    # Fast and cheap for Stage 1
    llm_model="gpt-4o",              # Best quality for Stage 2
    similarity_threshold=0.85,
    use_local_embeddings=True,       # No API cost for Stage 3
    cache_enabled=True
)

# Run segmentation on multiple recordings
pipeline = SegmentationPipeline(config)
result = pipeline.run(
    recordings=[
        "/Users/abrichr/oa/src/openadapt-capture/turn-off-nightshift",
        "/Users/abrichr/oa/src/openadapt-capture/demo_new"
    ],
    output_dir="workflow_library",
    progress_callback=lambda stage, cur, tot: print(f"[{stage}] {cur}/{tot}")
)

print("\nExtraction complete!")
print(f"  Unique workflows: {result.unique_episodes}")
print(f"  Deduplication: {result.library.deduplication_ratio:.1%}")

# Annotate for quality (Stage 4)
annotator = EpisodeAnnotator(model="gemini-2.0-flash")

for recording, extraction in zip(
    ["/Users/abrichr/oa/src/openadapt-capture/turn-off-nightshift"],
    [result.extractions["turn-off-nightshift"]]
):
    annotated = annotator.annotate_extraction_result(extraction, recording)
    print(f"\nAnnotation: {annotated.gold_count}/{annotated.total_episodes} gold episodes")

# Export gold episodes for training
export_gold_episodes(
    library=annotated,
    output_path="gold_episodes.jsonl",
    format="jsonl"
)

print("\nWorkflow library saved to: workflow_library/episode_library.json")
```

---

## Contributing

To add support for new VLM/LLM providers:

1. Create a new backend class in `frame_describer.py` or `segment_extractor.py`
2. Implement the required methods (`describe_frame`, `describe_batch`, etc.)
3. Update `_create_backend()` to detect and instantiate your backend
4. Add to `SUPPORTED_MODELS` list

Example:

```python
class CustomVLMBackend(VLMBackend):
    def __init__(self, model: str, api_key: str):
        self.model = model
        self.api_key = api_key

    def describe_frame(self, image, action_context, system_prompt, user_prompt):
        # Your implementation here
        pass

    def describe_batch(self, images, action_contexts, system_prompt, user_prompt):
        # Your implementation here
        pass
```