openadapt-ml 0.2.0__py3-none-any.whl → 0.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- openadapt_ml/baselines/__init__.py +121 -0
- openadapt_ml/baselines/adapter.py +185 -0
- openadapt_ml/baselines/cli.py +314 -0
- openadapt_ml/baselines/config.py +448 -0
- openadapt_ml/baselines/parser.py +922 -0
- openadapt_ml/baselines/prompts.py +787 -0
- openadapt_ml/benchmarks/__init__.py +13 -115
- openadapt_ml/benchmarks/agent.py +265 -421
- openadapt_ml/benchmarks/azure.py +28 -19
- openadapt_ml/benchmarks/azure_ops_tracker.py +521 -0
- openadapt_ml/benchmarks/cli.py +1722 -4847
- openadapt_ml/benchmarks/trace_export.py +631 -0
- openadapt_ml/benchmarks/viewer.py +22 -5
- openadapt_ml/benchmarks/vm_monitor.py +530 -29
- openadapt_ml/benchmarks/waa_deploy/Dockerfile +47 -53
- openadapt_ml/benchmarks/waa_deploy/api_agent.py +21 -20
- openadapt_ml/cloud/azure_inference.py +3 -5
- openadapt_ml/cloud/lambda_labs.py +722 -307
- openadapt_ml/cloud/local.py +2038 -487
- openadapt_ml/cloud/ssh_tunnel.py +68 -26
- openadapt_ml/datasets/next_action.py +40 -30
- openadapt_ml/evals/grounding.py +8 -3
- openadapt_ml/evals/plot_eval_metrics.py +15 -13
- openadapt_ml/evals/trajectory_matching.py +41 -26
- openadapt_ml/experiments/demo_prompt/format_demo.py +16 -6
- openadapt_ml/experiments/demo_prompt/run_experiment.py +26 -16
- openadapt_ml/experiments/representation_shootout/__init__.py +70 -0
- openadapt_ml/experiments/representation_shootout/conditions.py +708 -0
- openadapt_ml/experiments/representation_shootout/config.py +390 -0
- openadapt_ml/experiments/representation_shootout/evaluator.py +659 -0
- openadapt_ml/experiments/representation_shootout/runner.py +687 -0
- openadapt_ml/experiments/waa_demo/runner.py +29 -14
- openadapt_ml/export/parquet.py +36 -24
- openadapt_ml/grounding/detector.py +18 -14
- openadapt_ml/ingest/__init__.py +8 -6
- openadapt_ml/ingest/capture.py +25 -22
- openadapt_ml/ingest/loader.py +7 -4
- openadapt_ml/ingest/synthetic.py +189 -100
- openadapt_ml/models/api_adapter.py +14 -4
- openadapt_ml/models/base_adapter.py +10 -2
- openadapt_ml/models/providers/__init__.py +288 -0
- openadapt_ml/models/providers/anthropic.py +266 -0
- openadapt_ml/models/providers/base.py +299 -0
- openadapt_ml/models/providers/google.py +376 -0
- openadapt_ml/models/providers/openai.py +342 -0
- openadapt_ml/models/qwen_vl.py +46 -19
- openadapt_ml/perception/__init__.py +35 -0
- openadapt_ml/perception/integration.py +399 -0
- openadapt_ml/retrieval/demo_retriever.py +50 -24
- openadapt_ml/retrieval/embeddings.py +9 -8
- openadapt_ml/retrieval/retriever.py +3 -1
- openadapt_ml/runtime/__init__.py +50 -0
- openadapt_ml/runtime/policy.py +18 -5
- openadapt_ml/runtime/safety_gate.py +471 -0
- openadapt_ml/schema/__init__.py +9 -0
- openadapt_ml/schema/converters.py +74 -27
- openadapt_ml/schema/episode.py +31 -18
- openadapt_ml/scripts/capture_screenshots.py +530 -0
- openadapt_ml/scripts/compare.py +85 -54
- openadapt_ml/scripts/demo_policy.py +4 -1
- openadapt_ml/scripts/eval_policy.py +15 -9
- openadapt_ml/scripts/make_gif.py +1 -1
- openadapt_ml/scripts/prepare_synthetic.py +3 -1
- openadapt_ml/scripts/train.py +21 -9
- openadapt_ml/segmentation/README.md +920 -0
- openadapt_ml/segmentation/__init__.py +97 -0
- openadapt_ml/segmentation/adapters/__init__.py +5 -0
- openadapt_ml/segmentation/adapters/capture_adapter.py +420 -0
- openadapt_ml/segmentation/annotator.py +610 -0
- openadapt_ml/segmentation/cache.py +290 -0
- openadapt_ml/segmentation/cli.py +674 -0
- openadapt_ml/segmentation/deduplicator.py +656 -0
- openadapt_ml/segmentation/frame_describer.py +788 -0
- openadapt_ml/segmentation/pipeline.py +340 -0
- openadapt_ml/segmentation/schemas.py +622 -0
- openadapt_ml/segmentation/segment_extractor.py +634 -0
- openadapt_ml/training/azure_ops_viewer.py +1097 -0
- openadapt_ml/training/benchmark_viewer.py +52 -41
- openadapt_ml/training/shared_ui.py +7 -7
- openadapt_ml/training/stub_provider.py +57 -35
- openadapt_ml/training/trainer.py +143 -86
- openadapt_ml/training/trl_trainer.py +70 -21
- openadapt_ml/training/viewer.py +323 -108
- openadapt_ml/training/viewer_components.py +180 -0
- {openadapt_ml-0.2.0.dist-info → openadapt_ml-0.2.1.dist-info}/METADATA +215 -14
- openadapt_ml-0.2.1.dist-info/RECORD +116 -0
- openadapt_ml/benchmarks/base.py +0 -366
- openadapt_ml/benchmarks/data_collection.py +0 -432
- openadapt_ml/benchmarks/live_tracker.py +0 -180
- openadapt_ml/benchmarks/runner.py +0 -418
- openadapt_ml/benchmarks/waa.py +0 -761
- openadapt_ml/benchmarks/waa_live.py +0 -619
- openadapt_ml-0.2.0.dist-info/RECORD +0 -86
- {openadapt_ml-0.2.0.dist-info → openadapt_ml-0.2.1.dist-info}/WHEEL +0 -0
- {openadapt_ml-0.2.0.dist-info → openadapt_ml-0.2.1.dist-info}/licenses/LICENSE +0 -0

@@ -0,0 +1,920 @@

# Workflow Segmentation System

The workflow segmentation system automatically extracts, deduplicates, and annotates reusable workflow episodes from GUI recordings. This creates a library of canonical workflows that can be used for:

- **Training data curation**: Identify high-quality demonstration episodes for fine-tuning
- **Demo retrieval**: Build libraries of workflows for demo-conditioned prompting
- **Workflow documentation**: Automatically generate step-by-step workflow guides
- **Deduplication**: Find similar workflows across recordings to build canonical definitions

## Architecture

The system uses a **4-stage pipeline**:

### Stage 1: Frame Description (VLM)
Converts screenshots + actions into semantic descriptions using Vision-Language Models.

**Input**: Recording directory with screenshots and action events
**Output**: `ActionTranscript` with frame-by-frame descriptions

**Example**:
```python
from openadapt_ml.segmentation import FrameDescriber

describer = FrameDescriber(model="gemini-2.0-flash")
transcript = describer.describe_recording("/path/to/recording")

# View as plain text
print(transcript.to_transcript_text())
# [00:00.0] User opens System Preferences from Apple menu
# [00:02.5] User clicks Display settings icon
# [00:05.1] User navigates to Night Shift tab
# ...
```

**Supported VLMs**:
- Gemini 2.0 Flash / Pro (recommended for speed)
- Claude Sonnet 4 / Haiku
- GPT-4o / GPT-4o-mini

**Features**:
- Automatic caching to avoid reprocessing frames
- Batch processing for API efficiency
- Extracts: application name, visible elements, screen context, action target, user intent

---

### Stage 2: Episode Extraction (LLM)
Identifies coherent workflow boundaries and extracts episodes using Large Language Models.

**Input**: `ActionTranscript` from Stage 1
**Output**: `EpisodeExtractionResult` with identified episodes

**Example**:
```python
from openadapt_ml.segmentation import SegmentExtractor

extractor = SegmentExtractor(
    model="gpt-4o",
    use_few_shot=True,  # Include examples in prompts
    min_segment_duration=2.0,  # Minimum episode length
    max_segment_duration=300.0  # Maximum episode length
)

result = extractor.extract_segments(transcript)

for episode in result.episodes:
    print(f"{episode.name}: {episode.start_time_formatted} - {episode.end_time_formatted}")
    print(f"  Steps: {', '.join(episode.step_summaries)}")
    print(f"  Confidence: {episode.boundary_confidence:.2f}")
```

**Output**:
- Episode name and description
- Precise start/end timestamps
- Step-by-step breakdown
- Prerequisites and outcomes
- Boundary confidence scores

**Supported LLMs**:
- GPT-4o / GPT-4o-mini (recommended)
- Claude Sonnet 4 / Haiku
- Gemini 2.0 Pro / Flash

**Features**:
- Few-shot prompting for better segmentation quality
- Hierarchical extraction (nested subtasks)
- Confidence-based filtering
- Manual boundary adjustment helpers

---

### Stage 3: Deduplication (Embeddings)
Finds and merges similar workflows across recordings using embedding similarity.

**Input**: List of `EpisodeExtractionResult` from multiple recordings
**Output**: `EpisodeLibrary` with canonical workflows

**Example**:
```python
from openadapt_ml.segmentation import WorkflowDeduplicator

dedup = WorkflowDeduplicator(
    threshold=0.85,  # Cosine similarity threshold (0.80-0.90 recommended)
    embedding_model="text-embedding-3-large",
    merge_strategy="centroid"  # or "longest", "first"
)

library = dedup.deduplicate(extraction_results)

print(f"Total episodes: {library.total_episodes_extracted}")
print(f"Unique workflows: {library.unique_episode_count}")
print(f"Deduplication ratio: {library.deduplication_ratio:.1%}")
```

**Features**:
- Semantic similarity using text embeddings (OpenAI API or local HuggingFace models)
- Agglomerative clustering with cosine similarity
- Multiple merge strategies (centroid, longest, first)
- Incremental library updates (add new recordings to existing libraries)
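
To make the clustering step concrete, here is a minimal sketch of agglomerative clustering over episode-description embeddings with a cosine-distance cutoff, written with scikit-learn. It illustrates the technique named in the feature list above and is not the library's internal implementation; the `descriptions` list and `embeddings` array are assumed inputs.

```python
# Illustrative sketch only (not WorkflowDeduplicator's actual code).
# Assumes `descriptions` is a list of episode descriptions and `embeddings`
# is the corresponding (n, d) NumPy array of text embeddings.
import numpy as np
from sklearn.cluster import AgglomerativeClustering


def cluster_descriptions(
    descriptions: list[str], embeddings: np.ndarray, threshold: float = 0.85
) -> dict[int, list[str]]:
    """Group descriptions whose pairwise cosine similarity exceeds `threshold`."""
    clustering = AgglomerativeClustering(
        n_clusters=None,                     # let the distance cutoff decide the cluster count
        metric="cosine",
        linkage="average",
        distance_threshold=1.0 - threshold,  # cosine distance = 1 - cosine similarity
    )
    labels = clustering.fit_predict(embeddings)
    clusters: dict[int, list[str]] = {}
    for label, description in zip(labels, descriptions):
        clusters.setdefault(int(label), []).append(description)
    return clusters
```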

**Merge Strategies**:
- `centroid`: Use episode closest to cluster centroid (most representative)
- `longest`: Use episode with longest/most detailed description
- `first`: Use first encountered episode

---

### Stage 4: Annotation (VLM Quality Assessment)
Automatically annotates episodes for training data quality control.

**Input**: `EpisodeExtractionResult` + recording path
**Output**: `AnnotatedEpisodeLibrary` with gold/exclusion labels

**Example**:
```python
from openadapt_ml.segmentation import EpisodeAnnotator

annotator = EpisodeAnnotator(
    model="gemini-2.0-flash",
    lookahead_frames=10  # Analyze frames after episode to detect failures
)

library = annotator.annotate_extraction_result(
    extraction_result=result,
    recording_path="/path/to/recording"
)

print(f"Total episodes: {library.total_episodes}")
print(f"Recommended as gold: {library.gold_count}")
print(f"Pending human review: {library.total_episodes - library.verified_count}")

# Get gold episodes for export
gold_episodes = library.get_verified_gold_episodes()
```

**What it checks**:
- Boundary accuracy (are start/end frames correct?)
- Workflow completeness (did all steps execute successfully?)
- Failure detection:
  - Error dialogs or messages
  - Undo actions (Ctrl+Z, etc.)
  - Repeated attempts at same action
  - User navigating back or canceling
- Post-episode analysis (examines frames *after* episode ends for delayed failures)

**Output**:
- `is_gold`: Boolean recommendation for training data inclusion
- `confidence`: VLM confidence in assessment (0-1)
- `failure_signals`: List of detected issues
- `exclusion_reason`: Explanation if not gold
- `start_frame` / `end_frame`: Refined boundaries
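
As a quick illustration of how these fields can be consumed downstream, the sketch below sorts annotations into gold candidates and items needing human review. The field names come from the list above; how the `annotations` list is obtained (for example, from an `AnnotatedEpisodeLibrary`) is left as an assumption.

```python
# Sketch: triaging EpisodeAnnotation objects by the fields listed above.
# `annotations` is assumed to be an iterable of annotation objects.
def triage_annotations(annotations, min_confidence: float = 0.8):
    gold_candidates, needs_review = [], []
    for ann in annotations:
        if ann.is_gold and ann.confidence >= min_confidence and not ann.failure_signals:
            gold_candidates.append(ann)
        else:
            needs_review.append((ann, ann.exclusion_reason or "low confidence"))
    return gold_candidates, needs_review
```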

**Human-in-the-loop review**:
```python
from openadapt_ml.segmentation import verify_annotation

# After reviewing an annotation
verified = verify_annotation(
    annotation=ann,
    is_gold=True,  # Human decision
    notes="Verified - workflow completed successfully",
    verified_by="reviewer_name"
)
```

---

## Complete Pipeline

Run all 4 stages together:

```python
from openadapt_ml.segmentation import SegmentationPipeline, PipelineConfig

config = PipelineConfig(
    vlm_model="gemini-2.0-flash",  # Stage 1
    llm_model="gpt-4o",  # Stage 2
    similarity_threshold=0.85,  # Stage 3
    use_local_embeddings=False,  # Use OpenAI embeddings
    cache_enabled=True
)

pipeline = SegmentationPipeline(config)

result = pipeline.run(
    recordings=[
        "/path/to/recording1",
        "/path/to/recording2"
    ],
    output_dir="segmentation_output",
    progress_callback=lambda stage, cur, tot: print(f"[{stage}] {cur}/{tot}")
)

print(f"Recordings processed: {result.recordings_processed}")
print(f"Total episodes: {result.total_episodes_extracted}")
print(f"Unique workflows: {result.unique_episodes}")
print(f"Processing time: {result.processing_time_seconds:.1f}s")
```

The pipeline automatically saves intermediate results:
- `{recording_id}_transcript.json` - Stage 1 output
- `{recording_id}_episodes.json` - Stage 2 output
- `episode_library.json` - Stage 3 output (final deduplicated library)
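
To reload these artifacts later (for example, to inspect a library without re-running the pipeline), something like the following works. The import path for `EpisodeLibrary` is an assumption (the class is defined in `openadapt_ml/segmentation/schemas.py`), and `recording1` stands in for an actual recording ID.

```python
import json
from pathlib import Path

# Assumed import path; EpisodeLibrary is defined in openadapt_ml/segmentation/schemas.py.
from openadapt_ml.segmentation.schemas import EpisodeLibrary

output_dir = Path("segmentation_output")

# Stage 3 output: the final deduplicated library.
library = EpisodeLibrary.model_validate(
    json.loads((output_dir / "episode_library.json").read_text())
)
print(f"Unique workflows: {library.unique_episode_count}")

# Stage 2 output for one recording (raw JSON; "recording1" is a placeholder recording ID).
episodes_raw = json.loads((output_dir / "recording1_episodes.json").read_text())
```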

---

## CLI Usage

All stages have CLI commands:

### Describe (Stage 1)
```bash
# Generate frame descriptions
python -m openadapt_ml.segmentation.cli describe \
  --recording /path/to/recording \
  --model gemini-2.0-flash \
  --output transcript.json

# View as plain text
python -m openadapt_ml.segmentation.cli describe \
  --recording /path/to/recording \
  --format text
```

### Extract (Stage 2)
```bash
# Extract episodes from a recording
python -m openadapt_ml.segmentation.cli extract \
  --recording /path/to/recording \
  --model gpt-4o \
  --output episodes.json

# Or from existing transcript
python -m openadapt_ml.segmentation.cli extract \
  --transcript transcript.json \
  --model gpt-4o \
  --output episodes.json
```

### Deduplicate (Stage 3)
```bash
# Deduplicate across multiple recordings
python -m openadapt_ml.segmentation.cli deduplicate \
  recording1_episodes.json recording2_episodes.json \
  --threshold 0.85 \
  --output library.json

# Or from a directory
python -m openadapt_ml.segmentation.cli deduplicate \
  --input-dir segmentation_output/ \
  --threshold 0.85 \
  --output library.json
```

### Annotate (Stage 4)
```bash
# Auto-annotate episodes for quality control
python -m openadapt_ml.segmentation.cli annotate \
  --episodes recording1_episodes.json \
  --recording /path/to/recording1 \
  --model gemini-2.0-flash \
  --output annotated_library.json

# Review annotations interactively
python -m openadapt_ml.segmentation.cli review \
  --library annotated_library.json \
  --recording /path/to/recording1 \
  --reviewer your_name \
  --auto-approve-high-confidence  # Auto-approve confidence > 0.9

# Export gold episodes for fine-tuning
python -m openadapt_ml.segmentation.cli export-gold \
  annotated_library.json \
  --format jsonl \
  --output gold_episodes.jsonl \
  --include-screenshots
```

### Complete Pipeline (all stages)
```bash
python -m openadapt_ml.segmentation.cli pipeline \
  /path/to/recording1 /path/to/recording2 /path/to/recording3 \
  --vlm-model gemini-2.0-flash \
  --llm-model gpt-4o \
  --threshold 0.85 \
  --output segmentation_output/ \
  --save-intermediate \
  --verbose
```

### List Library Contents
```bash
python -m openadapt_ml.segmentation.cli list \
  --library library.json \
  --details
```

### Export Library
```bash
# Export as CSV, JSONL, or HTML
python -m openadapt_ml.segmentation.cli export \
  library.json \
  --format html \
  --output workflows.html
```

---

## Data Schemas

All schemas are defined using Pydantic in `openadapt_ml/segmentation/schemas.py`:

### `FrameDescription` (Stage 1 output)
```python
{
    "timestamp": 2.5,
    "formatted_time": "00:02.5",
    "visible_application": "System Preferences",
    "visible_elements": ["Night Shift toggle", "Schedule slider"],
    "screen_context": "Display settings panel with Night Shift tab active",
    "action_type": "click",
    "action_target": "Night Shift toggle",
    "action_value": None,
    "apparent_intent": "Enable Night Shift automatic scheduling",
    "confidence": 0.95,
    "frame_index": 5,
    "vlm_model": "gemini-2.0-flash"
}
```

### `Episode` (Stage 2 output)
```python
{
    "episode_id": "uuid-here",
    "name": "Configure Night Shift Schedule",
    "start_time": 0.0,
    "end_time": 12.5,
    "start_time_formatted": "00:00.0",
    "end_time_formatted": "00:12.5",
    "description": "Enable and configure Night Shift automatic scheduling...",
    "step_summaries": [
        "Open System Preferences",
        "Navigate to Display > Night Shift",
        "Enable Night Shift",
        "Set schedule 9 PM - 7 AM"
    ],
    "application": "System Preferences",
    "prerequisites": ["System Preferences must be accessible"],
    "outcomes": ["Night Shift enabled with custom schedule"],
    "boundary_confidence": 0.95,
    "coherence_score": 0.90,
    "recording_id": "recording1",
    "frame_indices": [0, 1, 2, 3, 4, 5]
}
```

### `CanonicalEpisode` (Stage 3 output)
```python
{
    "canonical_id": "uuid-here",
    "canonical_name": "Configure Night Shift Schedule",
    "canonical_description": "Enable and configure Night Shift...",
    "canonical_steps": ["Open System Preferences", "Navigate to Display > Night Shift", ...],
    "variant_names": ["Adjust Night Shift Settings", "Set up Night Shift"],
    "variant_descriptions": ["...", "..."],
    "source_recordings": ["recording1", "recording2"],
    "source_episode_ids": ["uuid1", "uuid2"],
    "occurrence_count": 3,
    "embedding": [0.123, -0.456, ...],
    "cluster_id": 0,
    "internal_similarity": 0.92
}
```

### `EpisodeAnnotation` (Stage 4 output)
```python
{
    "annotation_id": "uuid-here",
    "episode_id": "uuid-of-episode",
    "start_frame": 0,
    "end_frame": 5,
    "is_gold": True,
    "exclusion_reason": None,
    "confidence": 0.95,
    "human_verified": False,
    "notes": None,
    "failure_signals": [],
    "created_at": "2026-01-17T10:00:00",
    "verified_at": None,
    "verified_by": None
}
```
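
Because these are Pydantic models, dicts like the ones above can be validated and serialized directly. A minimal sketch, assuming the classes are importable from `openadapt_ml.segmentation.schemas`:

```python
from openadapt_ml.segmentation.schemas import FrameDescription  # assumed import path

raw = {
    "timestamp": 2.5,
    "formatted_time": "00:02.5",
    "visible_application": "System Preferences",
    "visible_elements": ["Night Shift toggle"],
    "screen_context": "Display settings panel",
    "action_type": "click",
    "action_target": "Night Shift toggle",
    "action_value": None,
    "apparent_intent": "Enable Night Shift automatic scheduling",
    "confidence": 0.95,
    "frame_index": 5,
    "vlm_model": "gemini-2.0-flash",
}

frame = FrameDescription.model_validate(raw)  # dict -> model; raises if fields don't match
print(frame.model_dump_json(indent=2))        # model -> JSON for storage or inspection
```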

---

## Configuration

### API Keys

Set environment variables for VLM/LLM providers:

```bash
export GOOGLE_API_KEY="your-gemini-key"
export ANTHROPIC_API_KEY="your-claude-key"
export OPENAI_API_KEY="your-openai-key"
```

### Caching

Frame descriptions are automatically cached to avoid reprocessing:

```python
# Cache location: ~/.openadapt/cache/descriptions/

# Clear cache for a specific recording
describer.clear_cache(recording_id="recording1")

# Disable caching
describer = FrameDescriber(cache_enabled=False)
```

### Local Embeddings (No API required)

Use local HuggingFace models instead of OpenAI embeddings:

```python
dedup = WorkflowDeduplicator(
    use_local_embeddings=True  # Uses intfloat/e5-large-v2
)

# Requires: pip install transformers torch
```

---

## Use Cases

### 1. Training Data Curation

Extract and filter high-quality episodes for fine-tuning:

```python
# Extract episodes from all recordings
results = []
for recording in recordings:
    transcript = describer.describe_recording(recording)
    result = extractor.extract_segments(transcript)
    results.append(result)

# Deduplicate to find unique workflows
library = dedup.deduplicate(results)

# Annotate for quality
annotator = EpisodeAnnotator()
for recording, result in zip(recordings, results):
    annotated = annotator.annotate_extraction_result(result, recording)

# Human review
for episode, annotation in annotated.get_pending_review():
    # Present to human for verification
    verified = verify_annotation(annotation, is_gold=True, verified_by="human")

# Export gold episodes
from openadapt_ml.segmentation import export_gold_episodes
export_gold_episodes(
    library=annotated,  # the annotated library produced above
    output_path="training_data.jsonl",
    format="jsonl"
)
```

### 2. Demo Retrieval Library

Build a searchable library of workflow demonstrations:

```python
# Build library from multiple recordings
library = pipeline.run(recordings, output_dir="demo_library").library

# Find similar workflows for retrieval
target_episode = Episode(...)  # Current task
similar = dedup.find_similar(target_episode, library, top_k=5)

for canonical, similarity in similar:
    print(f"{canonical.canonical_name}: {similarity:.2f}")
    print(f"  Found in: {canonical.source_recordings}")
    print(f"  Steps: {canonical.canonical_steps}")
```

### 3. Workflow Documentation

Generate documentation from recordings:

```python
result = pipeline.run(recordings, output_dir="docs")

# Export as HTML
from openadapt_ml.segmentation.cli import export
export(
    library=result.library,
    format="html",
    output="workflow_guide.html"
)
```

---

## Advanced Features

### Hierarchical Segmentation

Extract nested task/subtask structures:

```python
extractor = SegmentExtractor(hierarchical=True)
result = extractor.extract_segments(transcript)

for episode in result.episodes:
    if episode.child_episode_ids:
        print(f"{episode.name} contains {len(episode.child_episode_ids)} subtasks")
```

### Boundary Refinement

Manually adjust or automatically refine boundaries:

```python
# Automatic refinement
refined = extractor.refine_segment(segment, transcript)

# Manual adjustment
adjusted = extractor.adjust_boundary(
    segment,
    new_start=2.5,  # New start time
    new_end=15.0,  # New end time
    transcript=transcript
)
```

### Segment Merging

Merge adjacent segments that belong together:

```python
merged = extractor.merge_segments(
    segments=episodes,
    max_gap=2.0  # Max seconds between segments to merge
)
```

### Incremental Library Updates

Add new recordings to an existing library:

```python
# Load existing library
import json
from pathlib import Path

library_data = json.loads(Path("library.json").read_text())
existing_library = EpisodeLibrary.model_validate(library_data)

# Add new recording
new_result = pipeline.run(
    ["new_recording"],
    existing_library=existing_library
)

# Library now contains both old and new workflows
```

---

## Integration with openadapt-capture

**Status**: Integration layer needed

The segmentation system currently expects recordings in one of these formats:

1. **openadapt-capture format** (preferred):
   - Directory with `metadata.json` and `events.json`
   - `screenshots/` subdirectory with numbered PNGs

2. **JSON format**:
   - Single JSON file with base64-encoded screenshots

3. **Directory format**:
   - Directory with numbered PNG files
   - Creates synthetic event data

**Required**: Create an adapter to load from `capture.db` (the SQLite format used by openadapt-capture).

See the [Integration Requirements](#integration-requirements) section below for details.

---

## Next Steps & Recommendations

### P0 (High Priority)

1. **Create openadapt-capture adapter**
   - Read events from `capture.db` SQLite database
   - Convert to format expected by FrameDescriber
   - Location: `openadapt_ml/segmentation/adapters/capture_adapter.py`

2. **Add visualization generator**
   - Create annotated screenshots showing segment boundaries
   - Highlight key actions within segments
   - Generate comparison views (before/after deduplication)

3. **Integration tests**
   - Test full pipeline on real openadapt-capture recordings
   - Validate output quality
   - Benchmark performance (time, API costs)

### P1 (Medium Priority)

4. **Improve prompt engineering**
   - Refine few-shot examples based on real data
   - Add domain-specific examples (web, desktop, mobile)
   - Experiment with structured output formats (JSON schema)

5. **Cost optimization**
   - Implement frame sampling strategies (skip similar frames)
   - Add batch processing limits to control API costs
   - Support vision-only models (no text description needed)

6. **Quality metrics**
   - Add inter-annotator agreement metrics
   - Track segmentation quality over time
   - Benchmark against human annotations

### P2 (Nice to Have)

7. **Active learning**
   - Suggest most valuable recordings to annotate next
   - Identify edge cases that need human review
   - Adapt prompts based on human feedback

8. **Multi-modal features**
   - Incorporate audio transcripts (already captured)
   - Use OCR for better text extraction
   - Analyze cursor movement patterns

9. **Export formats**
   - HuggingFace datasets format
   - Parquet for large-scale storage
   - Demo-conditioning format for retrieval

---

## Integration Requirements

### openadapt-capture Adapter

The current recordings use `capture.db` (SQLite), but the segmentation system expects `events.json`. Create an adapter:

```python
# openadapt_ml/segmentation/adapters/capture_adapter.py

import sqlite3
import json
from pathlib import Path
from PIL import Image


class CaptureAdapter:
    """Adapter for openadapt-capture SQLite format."""

    def load_recording(self, capture_path: Path) -> tuple[list[Image.Image], list[dict]]:
        """Load recording from capture.db format.

        Args:
            capture_path: Path to recording directory with capture.db

        Returns:
            Tuple of (images, action_events)
        """
        db_path = capture_path / "capture.db"
        screenshots_dir = capture_path / "screenshots"

        # Connect to SQLite
        conn = sqlite3.connect(db_path)
        cursor = conn.cursor()

        # Query events
        cursor.execute("""
            SELECT timestamp, type, data
            FROM events
            WHERE type IN ('click', 'type', 'scroll', 'key', 'move')
            ORDER BY timestamp
        """)

        images = []
        events = []

        for i, (timestamp, event_type, data_json) in enumerate(cursor.fetchall()):
            data = json.loads(data_json)

            # Find corresponding screenshot
            screenshot_path = self._find_screenshot(screenshots_dir, i)
            if screenshot_path:
                images.append(Image.open(screenshot_path))

            # Convert to expected format
            event = {
                "timestamp": timestamp,
                "frame_index": i,
                "name": event_type,
                "mouse_x": data.get("x"),
                "mouse_y": data.get("y"),
                "text": data.get("text"),
            }
            events.append(event)

        conn.close()
        return images, events

    def _find_screenshot(self, screenshots_dir: Path, frame_index: int) -> Path | None:
        """Find screenshot file for frame index."""
        # openadapt-capture uses format: capture_{id}_step_{n}.png
        matches = list(screenshots_dir.glob(f"*_step_{frame_index}.png"))
        return matches[0] if matches else None
```

**Integration**:

Update `FrameDescriber._load_recording()` to use the adapter:

```python
# In frame_describer.py

def _load_recording(self, recording_path: Path):
    # Check for capture.db
    if (recording_path / "capture.db").exists():
        from openadapt_ml.segmentation.adapters import CaptureAdapter
        adapter = CaptureAdapter()
        return adapter.load_recording(recording_path)

    # ... existing code for other formats
```

---

## Cost Estimates

Approximate API costs for a 30-second recording (~20 frames):

### Stage 1 (Frame Description)
- **Gemini 2.0 Flash**: $0.01 - $0.05 per recording
- **Claude Haiku**: $0.10 - $0.30 per recording
- **GPT-4o-mini**: $0.05 - $0.15 per recording

### Stage 2 (Episode Extraction)
- **GPT-4o**: $0.01 - $0.02 per recording
- **Claude Sonnet 4**: $0.02 - $0.05 per recording

### Stage 3 (Deduplication)
- **OpenAI text-embedding-3-large**: $0.001 per recording
- **Local embeddings**: Free (requires GPU for speed)

### Stage 4 (Annotation)
- **Gemini 2.0 Flash**: $0.02 - $0.10 per episode
- **GPT-4o-mini**: $0.05 - $0.15 per episode

**Total per recording**: ~$0.05 - $0.50 depending on model choices

**Recommendation**: Use Gemini 2.0 Flash for Stages 1 & 4, GPT-4o for Stage 2, local embeddings for Stage 3.
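
As a rough sanity check on that total, here is the arithmetic for the recommended configuration. The per-stage figures are the ranges listed above; the assumption of roughly three episodes per recording is illustrative, not from the documentation.

```python
# Back-of-envelope cost per 30-second recording with the recommended setup
# (Gemini Flash for Stages 1 & 4, GPT-4o for Stage 2, local embeddings for Stage 3).
episodes_per_recording = 3  # assumption for illustration

low = 0.01 + 0.01 + 0.0 + episodes_per_recording * 0.02
high = 0.05 + 0.02 + 0.0 + episodes_per_recording * 0.10

print(f"Estimated cost per recording: ${low:.2f} - ${high:.2f}")
# Estimated cost per recording: $0.08 - $0.37 (within the ~$0.05 - $0.50 range above)
```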

---

## Performance

Approximate processing times for a 30-second recording (~20 frames):

- **Stage 1 (Description)**: 10-30 seconds (with batching)
- **Stage 2 (Extraction)**: 5-15 seconds
- **Stage 3 (Deduplication)**: 1-5 seconds (per 100 episodes)
- **Stage 4 (Annotation)**: 10-20 seconds per episode

**Bottleneck**: VLM API calls (Stages 1 & 4). Use caching and batching to optimize.

---

## Troubleshooting

### "GOOGLE_API_KEY not set"
Set the API key: `export GOOGLE_API_KEY="your-key"`

### "Failed to load recording"
Check that the recording directory has the expected format (`screenshots/` plus `events.json` or `capture.db`).

### "No episodes extracted"
- Lower `min_segment_duration` if recordings are short
- Check `confidence_threshold` (try 0.5 instead of 0.7)
- Review the Stage 1 transcript to ensure VLM descriptions are accurate
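
For example, a more permissive extractor configuration might look like the sketch below. `min_segment_duration` appears in the Stage 2 example earlier; `confidence_threshold` is assumed here to be a `SegmentExtractor` parameter based on the note above and should be checked against the actual signature.

```python
# Hypothetical relaxed settings for short recordings; confidence_threshold is
# assumed from the troubleshooting note above, not confirmed against the code.
extractor = SegmentExtractor(
    model="gpt-4o",
    min_segment_duration=1.0,  # allow shorter episodes
    confidence_threshold=0.5,  # accept lower-confidence boundaries
)
result = extractor.extract_segments(transcript)
```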

### "Deduplication not working"
- Lower `threshold` (try 0.75 instead of 0.85)
- Check that episode descriptions are sufficiently detailed
- Verify embeddings are being generated correctly

### "High API costs"
- Enable caching: `cache_enabled=True`
- Use faster/cheaper models (Gemini Flash, GPT-4o-mini)
- Reduce batch size to process fewer frames per call
- Use local embeddings for Stage 3

---

## References

- **Schemas**: `openadapt_ml/segmentation/schemas.py`
- **Frame Describer**: `openadapt_ml/segmentation/frame_describer.py`
- **Segment Extractor**: `openadapt_ml/segmentation/segment_extractor.py`
- **Deduplicator**: `openadapt_ml/segmentation/deduplicator.py`
- **Annotator**: `openadapt_ml/segmentation/annotator.py`
- **Pipeline**: `openadapt_ml/segmentation/pipeline.py`
- **CLI**: `openadapt_ml/segmentation/cli.py`

---

## Example: Complete Workflow

```python
from openadapt_ml.segmentation import (
    SegmentationPipeline,
    PipelineConfig,
    EpisodeAnnotator,
    export_gold_episodes
)

# Configure pipeline
config = PipelineConfig(
    vlm_model="gemini-2.0-flash",  # Fast and cheap for Stage 1
    llm_model="gpt-4o",  # Best quality for Stage 2
    similarity_threshold=0.85,
    use_local_embeddings=True,  # No API cost for Stage 3
    cache_enabled=True
)

# Run segmentation on multiple recordings
pipeline = SegmentationPipeline(config)
result = pipeline.run(
    recordings=[
        "/Users/abrichr/oa/src/openadapt-capture/turn-off-nightshift",
        "/Users/abrichr/oa/src/openadapt-capture/demo_new"
    ],
    output_dir="workflow_library",
    progress_callback=lambda stage, cur, tot: print(f"[{stage}] {cur}/{tot}")
)

print("\nExtraction complete!")
print(f"  Unique workflows: {result.unique_episodes}")
print(f"  Deduplication: {result.library.deduplication_ratio:.1%}")

# Annotate for quality (Stage 4)
annotator = EpisodeAnnotator(model="gemini-2.0-flash")

for recording, extraction in zip(
    ["/Users/abrichr/oa/src/openadapt-capture/turn-off-nightshift"],
    [result.extractions["turn-off-nightshift"]]
):
    annotated = annotator.annotate_extraction_result(extraction, recording)
    print(f"\nAnnotation: {annotated.gold_count}/{annotated.total_episodes} gold episodes")

# Export gold episodes for training
export_gold_episodes(
    library=annotated,
    output_path="gold_episodes.jsonl",
    format="jsonl"
)

print("\nWorkflow library saved to: workflow_library/episode_library.json")
```

---

## Contributing

To add support for new VLM/LLM providers:

1. Create a new backend class in `frame_describer.py` or `segment_extractor.py`
2. Implement the required methods (`describe_frame`, `describe_batch`, etc.)
3. Update `_create_backend()` to detect and instantiate your backend
4. Add it to the `SUPPORTED_MODELS` list

Example:

```python
class CustomVLMBackend(VLMBackend):
    def __init__(self, model: str, api_key: str):
        self.model = model
        self.api_key = api_key

    def describe_frame(self, image, action_context, system_prompt, user_prompt):
        # Your implementation here
        pass

    def describe_batch(self, images, action_contexts, system_prompt, user_prompt):
        # Your implementation here
        pass
```