openadapt-ml 0.2.0__py3-none-any.whl → 0.2.2__py3-none-any.whl

This diff shows the content of publicly released package versions as published to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
Files changed (95)
  1. openadapt_ml/baselines/__init__.py +121 -0
  2. openadapt_ml/baselines/adapter.py +185 -0
  3. openadapt_ml/baselines/cli.py +314 -0
  4. openadapt_ml/baselines/config.py +448 -0
  5. openadapt_ml/baselines/parser.py +922 -0
  6. openadapt_ml/baselines/prompts.py +787 -0
  7. openadapt_ml/benchmarks/__init__.py +13 -115
  8. openadapt_ml/benchmarks/agent.py +265 -421
  9. openadapt_ml/benchmarks/azure.py +28 -19
  10. openadapt_ml/benchmarks/azure_ops_tracker.py +521 -0
  11. openadapt_ml/benchmarks/cli.py +1722 -4847
  12. openadapt_ml/benchmarks/trace_export.py +631 -0
  13. openadapt_ml/benchmarks/viewer.py +22 -5
  14. openadapt_ml/benchmarks/vm_monitor.py +530 -29
  15. openadapt_ml/benchmarks/waa_deploy/Dockerfile +47 -53
  16. openadapt_ml/benchmarks/waa_deploy/api_agent.py +21 -20
  17. openadapt_ml/cloud/azure_inference.py +3 -5
  18. openadapt_ml/cloud/lambda_labs.py +722 -307
  19. openadapt_ml/cloud/local.py +2038 -487
  20. openadapt_ml/cloud/ssh_tunnel.py +68 -26
  21. openadapt_ml/datasets/next_action.py +40 -30
  22. openadapt_ml/evals/grounding.py +8 -3
  23. openadapt_ml/evals/plot_eval_metrics.py +15 -13
  24. openadapt_ml/evals/trajectory_matching.py +41 -26
  25. openadapt_ml/experiments/demo_prompt/format_demo.py +16 -6
  26. openadapt_ml/experiments/demo_prompt/run_experiment.py +26 -16
  27. openadapt_ml/experiments/representation_shootout/__init__.py +70 -0
  28. openadapt_ml/experiments/representation_shootout/conditions.py +708 -0
  29. openadapt_ml/experiments/representation_shootout/config.py +390 -0
  30. openadapt_ml/experiments/representation_shootout/evaluator.py +659 -0
  31. openadapt_ml/experiments/representation_shootout/runner.py +687 -0
  32. openadapt_ml/experiments/waa_demo/runner.py +29 -14
  33. openadapt_ml/export/parquet.py +36 -24
  34. openadapt_ml/grounding/detector.py +18 -14
  35. openadapt_ml/ingest/__init__.py +8 -6
  36. openadapt_ml/ingest/capture.py +25 -22
  37. openadapt_ml/ingest/loader.py +7 -4
  38. openadapt_ml/ingest/synthetic.py +189 -100
  39. openadapt_ml/models/api_adapter.py +14 -4
  40. openadapt_ml/models/base_adapter.py +10 -2
  41. openadapt_ml/models/providers/__init__.py +288 -0
  42. openadapt_ml/models/providers/anthropic.py +266 -0
  43. openadapt_ml/models/providers/base.py +299 -0
  44. openadapt_ml/models/providers/google.py +376 -0
  45. openadapt_ml/models/providers/openai.py +342 -0
  46. openadapt_ml/models/qwen_vl.py +46 -19
  47. openadapt_ml/perception/__init__.py +35 -0
  48. openadapt_ml/perception/integration.py +399 -0
  49. openadapt_ml/retrieval/demo_retriever.py +50 -24
  50. openadapt_ml/retrieval/embeddings.py +9 -8
  51. openadapt_ml/retrieval/retriever.py +3 -1
  52. openadapt_ml/runtime/__init__.py +50 -0
  53. openadapt_ml/runtime/policy.py +18 -5
  54. openadapt_ml/runtime/safety_gate.py +471 -0
  55. openadapt_ml/schema/__init__.py +9 -0
  56. openadapt_ml/schema/converters.py +74 -27
  57. openadapt_ml/schema/episode.py +31 -18
  58. openadapt_ml/scripts/capture_screenshots.py +530 -0
  59. openadapt_ml/scripts/compare.py +85 -54
  60. openadapt_ml/scripts/demo_policy.py +4 -1
  61. openadapt_ml/scripts/eval_policy.py +15 -9
  62. openadapt_ml/scripts/make_gif.py +1 -1
  63. openadapt_ml/scripts/prepare_synthetic.py +3 -1
  64. openadapt_ml/scripts/train.py +21 -9
  65. openadapt_ml/segmentation/README.md +920 -0
  66. openadapt_ml/segmentation/__init__.py +97 -0
  67. openadapt_ml/segmentation/adapters/__init__.py +5 -0
  68. openadapt_ml/segmentation/adapters/capture_adapter.py +420 -0
  69. openadapt_ml/segmentation/annotator.py +610 -0
  70. openadapt_ml/segmentation/cache.py +290 -0
  71. openadapt_ml/segmentation/cli.py +674 -0
  72. openadapt_ml/segmentation/deduplicator.py +656 -0
  73. openadapt_ml/segmentation/frame_describer.py +788 -0
  74. openadapt_ml/segmentation/pipeline.py +340 -0
  75. openadapt_ml/segmentation/schemas.py +622 -0
  76. openadapt_ml/segmentation/segment_extractor.py +634 -0
  77. openadapt_ml/training/azure_ops_viewer.py +1097 -0
  78. openadapt_ml/training/benchmark_viewer.py +52 -41
  79. openadapt_ml/training/shared_ui.py +7 -7
  80. openadapt_ml/training/stub_provider.py +57 -35
  81. openadapt_ml/training/trainer.py +143 -86
  82. openadapt_ml/training/trl_trainer.py +70 -21
  83. openadapt_ml/training/viewer.py +323 -108
  84. openadapt_ml/training/viewer_components.py +180 -0
  85. {openadapt_ml-0.2.0.dist-info → openadapt_ml-0.2.2.dist-info}/METADATA +215 -14
  86. openadapt_ml-0.2.2.dist-info/RECORD +116 -0
  87. openadapt_ml/benchmarks/base.py +0 -366
  88. openadapt_ml/benchmarks/data_collection.py +0 -432
  89. openadapt_ml/benchmarks/live_tracker.py +0 -180
  90. openadapt_ml/benchmarks/runner.py +0 -418
  91. openadapt_ml/benchmarks/waa.py +0 -761
  92. openadapt_ml/benchmarks/waa_live.py +0 -619
  93. openadapt_ml-0.2.0.dist-info/RECORD +0 -86
  94. {openadapt_ml-0.2.0.dist-info → openadapt_ml-0.2.2.dist-info}/WHEEL +0 -0
  95. {openadapt_ml-0.2.0.dist-info → openadapt_ml-0.2.2.dist-info}/licenses/LICENSE +0 -0
openadapt_ml/segmentation/segment_extractor.py
@@ -0,0 +1,634 @@
+"""Workflow segment extraction using Large Language Models.
+
+This module analyzes action transcripts to identify coherent
+workflow segments (episodes) with clear boundaries (Stage 2 of pipeline).
+"""
+
+import json
+import logging
+from datetime import datetime
+from typing import Optional
+from uuid import uuid4
+
+from openadapt_ml.segmentation.schemas import (
+    ActionTranscript,
+    Episode,
+    EpisodeBoundary,
+    EpisodeExtractionResult,
+)
+
+logger = logging.getLogger(__name__)
+
+
+class SegmentExtractor:
+    """Extracts workflow segments (episodes) from action transcripts using LLMs.
+
+    This class implements Stage 2 of the segmentation pipeline, identifying
+    coherent workflow boundaries within recorded sessions.
+
+    Example:
+        >>> extractor = SegmentExtractor(model="gpt-4o")
+        >>> result = extractor.extract_segments(transcript)
+        >>> for episode in result.episodes:
+        ...     print(f"{episode.name}: {episode.start_time_formatted} - {episode.end_time_formatted}")
+        Adjust Night Shift Settings: 00:00.0 - 00:12.5
+        Change Display Resolution: 00:15.3 - 00:28.1
+
+    Attributes:
+        model: LLM model identifier
+        use_few_shot: Whether to include few-shot examples
+        hierarchical: Whether to extract hierarchical segments
+    """
+
+    SUPPORTED_MODELS = [
+        "gpt-4o",
+        "gpt-4o-mini",
+        "claude-sonnet-4-20250514",
+        "claude-3-5-haiku-20241022",
+        "gemini-2.0-pro",
+        "gemini-2.0-flash",
+    ]
+
+    def __init__(
+        self,
+        model: str = "gpt-4o",
+        use_few_shot: bool = True,
+        hierarchical: bool = False,
+        min_segment_duration: float = 2.0,
+        max_segment_duration: float = 300.0,
+        confidence_threshold: float = 0.7,
+    ) -> None:
+        """Initialize the segment extractor.
+
+        Args:
+            model: LLM model to use.
+            use_few_shot: Include few-shot examples in prompts.
+            hierarchical: Extract nested task/subtask structure.
+            min_segment_duration: Minimum segment length in seconds.
+            max_segment_duration: Maximum segment length in seconds.
+            confidence_threshold: Minimum boundary confidence to accept.
+        """
+        self.model = model
+        self.use_few_shot = use_few_shot
+        self.hierarchical = hierarchical
+        self.min_segment_duration = min_segment_duration
+        self.max_segment_duration = max_segment_duration
+        self.confidence_threshold = confidence_threshold
+        self._client = None
+
+    def _get_client(self):
+        """Get or create LLM client."""
+        if self._client is not None:
+            return self._client
+
+        if "gpt" in self.model.lower():
+            import openai
+            from openadapt_ml.config import settings
+
+            self._client = openai.OpenAI(api_key=settings.openai_api_key)
+        elif "claude" in self.model.lower():
+            import anthropic
+            from openadapt_ml.config import settings
+
+            self._client = anthropic.Anthropic(api_key=settings.anthropic_api_key)
+        elif "gemini" in self.model.lower():
+            import google.generativeai as genai
+            from openadapt_ml.config import settings
+
+            genai.configure(api_key=settings.google_api_key)
+            self._client = genai.GenerativeModel(self.model)
+        else:
+            raise ValueError(f"Unknown model: {self.model}")
+
+        return self._client
+
+    def _get_system_prompt(self) -> str:
+        """Return system prompt for LLM."""
+        return """You are an expert at analyzing user workflows in GUI applications. Your task is to identify distinct workflow segments (episodes) within a transcript of user actions.
+
+A workflow segment is:
+- A coherent sequence of actions with a clear goal
+- Self-contained (could be taught/explained as a single procedure)
+- Has a clear beginning and end
+
+Guidelines for identifying segments:
+1. **Goal boundaries**: When the user's apparent goal changes, that's a new segment
+2. **Application switches**: Major application changes often indicate segment boundaries
+3. **Task completion**: Successful completion of a task (clicking Save, Submit, etc.) often ends a segment
+4. **Natural pauses**: Significant time gaps may indicate segment boundaries
+5. **Hierarchical tasks**: Large tasks may contain sub-segments (e.g., "Create document" contains "Add title", "Add body", "Save")
+
+Avoid:
+- Creating segments that are too granular (single actions)
+- Creating segments that are too broad (entire session as one segment)
+- Missing obvious task boundaries"""
+
+    def _get_few_shot_examples(self) -> str:
+        """Return few-shot examples for better extraction."""
+        return """Here are examples of correctly segmented transcripts:
+
+## Example 1: System Settings Workflow
+**Transcript**:
+```
+[00:00.0] User opens System Preferences from Apple menu
+[00:02.5] User clicks Display settings
+[00:05.1] User navigates to Night Shift tab
+[00:07.3] User enables Night Shift toggle
+[00:09.8] User adjusts schedule slider to 9 PM - 7 AM
+[00:12.5] User closes System Preferences
+[00:15.0] User opens Notes application
+[00:17.2] User creates a new note
+[00:20.5] User types "Meeting notes for tomorrow"
+```
+
+**Expected segments**:
+```json
+{
+  "segments": [
+    {
+      "name": "Configure Night Shift Schedule",
+      "start_time": 0.0,
+      "end_time": 12.5,
+      "description": "Enable and configure Night Shift automatic scheduling in Display settings",
+      "step_summaries": [
+        "Open System Preferences",
+        "Navigate to Display > Night Shift",
+        "Enable Night Shift",
+        "Set schedule 9 PM - 7 AM"
+      ],
+      "application": "System Preferences",
+      "boundary_confidence": 0.95
+    },
+    {
+      "name": "Create Meeting Notes",
+      "start_time": 15.0,
+      "end_time": 20.5,
+      "description": "Start a new note for meeting notes in the Notes application",
+      "step_summaries": [
+        "Open Notes application",
+        "Create new note",
+        "Add title"
+      ],
+      "application": "Notes",
+      "boundary_confidence": 0.85
+    }
+  ]
+}
+```
+
+## Example 2: Web Browser Workflow
+**Transcript**:
+```
+[00:00.0] User opens Chrome browser
+[00:02.1] User clicks URL bar
+[00:03.5] User types "github.com"
+[00:05.2] User presses Enter to navigate
+[00:08.4] User clicks "Sign in" button
+[00:10.1] User types email address
+[00:12.8] User types password
+[00:15.3] User clicks "Sign in" button
+[00:18.5] User clicks "New repository" button
+[00:21.2] User types "my-project" as repository name
+[00:24.8] User selects "Private" radio button
+[00:27.1] User clicks "Create repository" button
+```
+
+**Expected segments**:
+```json
+{
+  "segments": [
+    {
+      "name": "Sign In to GitHub",
+      "start_time": 0.0,
+      "end_time": 15.3,
+      "description": "Navigate to GitHub and authenticate with email and password",
+      "step_summaries": [
+        "Open browser and navigate to github.com",
+        "Click Sign in",
+        "Enter credentials",
+        "Submit login form"
+      ],
+      "application": "Chrome - GitHub",
+      "boundary_confidence": 0.95
+    },
+    {
+      "name": "Create Private Repository",
+      "start_time": 18.5,
+      "end_time": 27.1,
+      "description": "Create a new private repository named my-project",
+      "step_summaries": [
+        "Click New repository",
+        "Enter repository name",
+        "Select Private visibility",
+        "Create repository"
+      ],
+      "application": "Chrome - GitHub",
+      "boundary_confidence": 0.9
+    }
+  ]
+}
+```
+
+---
+
+"""
+
+    def _build_user_prompt(
+        self, transcript: ActionTranscript, context: Optional[str]
+    ) -> str:
+        """Build user prompt for segment extraction."""
+        lines = []
+
+        if self.use_few_shot:
+            lines.append(self._get_few_shot_examples())
+
+        lines.append("Now analyze this transcript:\n")
+        lines.append("## Recording Information")
+        lines.append(f"- Recording ID: {transcript.recording_id}")
+        lines.append(f"- Total Duration: {transcript.duration_formatted}")
+        if transcript.task_description:
+            lines.append(f"- Task Description: {transcript.task_description}")
+        if context:
+            lines.append(f"- Additional Context: {context}")
+
+        lines.append("\n## Action Transcript")
+        lines.append("```")
+        lines.append(transcript.to_transcript_text())
+        lines.append("```")
+
+        lines.append("""
+Identify all workflow segments in this transcript. For each segment, provide:
+1. A concise name (e.g., "Adjust Night Shift Settings")
+2. Start and end timestamps
+3. A description of what the workflow accomplishes
+4. A list of high-level steps
+5. Confidence in the segment boundaries (0-1)
+
+Respond with JSON in this format:
+```json
+{
+  "segments": [
+    {
+      "name": "Segment Name",
+      "start_time": 0.0,
+      "end_time": 12.5,
+      "start_time_formatted": "00:00.0",
+      "end_time_formatted": "00:12.5",
+      "description": "What this workflow accomplishes",
+      "step_summaries": ["Step 1", "Step 2", "Step 3"],
+      "application": "Primary application",
+      "boundary_confidence": 0.9,
+      "coherence_score": 0.85
+    }
+  ],
+  "boundaries": [
+    {
+      "timestamp": 12.5,
+      "confidence": 0.9,
+      "reason": "Task completed - settings saved"
+    }
+  ]
+}
+```""")
+        return "\n".join(lines)
+
+    def _call_llm(self, system_prompt: str, user_prompt: str) -> str:
+        """Call the LLM and return response text."""
+        client = self._get_client()
+
+        if "gpt" in self.model.lower():
+            response = client.chat.completions.create(
+                model=self.model,
+                max_tokens=4096,
+                messages=[
+                    {"role": "system", "content": system_prompt},
+                    {"role": "user", "content": user_prompt},
+                ],
+            )
+            return response.choices[0].message.content
+
+        elif "claude" in self.model.lower():
+            response = client.messages.create(
+                model=self.model,
+                max_tokens=4096,
+                system=system_prompt,
+                messages=[{"role": "user", "content": user_prompt}],
+            )
+            return response.content[0].text
+
+        elif "gemini" in self.model.lower():
+            response = client.generate_content(f"{system_prompt}\n\n{user_prompt}")
+            return response.text
+
+        raise ValueError(f"Unknown model: {self.model}")
+
+    def _parse_response(
+        self, text: str, transcript: ActionTranscript
+    ) -> tuple[list[Episode], list[EpisodeBoundary]]:
+        """Parse LLM response into Episode and EpisodeBoundary objects."""
+        episodes = []
+        boundaries = []
+
+        try:
+            # Find JSON in response
+            start = text.find("{")
+            end = text.rfind("}") + 1
+            if start >= 0 and end > start:
+                data = json.loads(text[start:end])
+
+                # Parse segments
+                for seg_data in data.get("segments", []):
+                    # Find frame indices for this segment
+                    start_time = seg_data.get("start_time", 0)
+                    end_time = seg_data.get("end_time", 0)
+                    frame_indices = [
+                        f.frame_index
+                        for f in transcript.frames
+                        if start_time <= f.timestamp <= end_time
+                    ]
+
+                    episode = Episode(
+                        episode_id=uuid4(),
+                        name=seg_data.get("name", "Unknown"),
+                        start_time=start_time,
+                        end_time=end_time,
+                        start_time_formatted=seg_data.get(
+                            "start_time_formatted",
+                            f"{int(start_time // 60):02d}:{start_time % 60:04.1f}",
+                        ),
+                        end_time_formatted=seg_data.get(
+                            "end_time_formatted",
+                            f"{int(end_time // 60):02d}:{end_time % 60:04.1f}",
+                        ),
+                        description=seg_data.get("description", ""),
+                        step_summaries=seg_data.get("step_summaries", []),
+                        application=seg_data.get("application", "Unknown"),
+                        boundary_confidence=seg_data.get("boundary_confidence", 0.5),
+                        coherence_score=seg_data.get("coherence_score", 0.5),
+                        recording_id=transcript.recording_id,
+                        frame_indices=frame_indices,
+                    )
+                    episodes.append(episode)
+
+                # Parse boundaries
+                for bnd_data in data.get("boundaries", []):
+                    boundary = EpisodeBoundary(
+                        timestamp=bnd_data.get("timestamp", 0),
+                        confidence=bnd_data.get("confidence", 0.5),
+                        reason=bnd_data.get("reason", ""),
+                    )
+                    boundaries.append(boundary)
+
+        except json.JSONDecodeError as e:
+            logger.warning(f"Failed to parse LLM response: {e}")
+            # Create a single episode covering the entire transcript
+            episode = Episode(
+                episode_id=uuid4(),
+                name="Full Recording",
+                start_time=0,
+                end_time=transcript.total_duration,
+                start_time_formatted="00:00.0",
+                end_time_formatted=transcript.duration_formatted,
+                description="Complete recording (automatic segmentation failed)",
+                step_summaries=[f.apparent_intent for f in transcript.frames[:5]],
+                application="Unknown",
+                boundary_confidence=0.1,
+                coherence_score=0.1,
+                recording_id=transcript.recording_id,
+                frame_indices=[f.frame_index for f in transcript.frames],
+            )
+            episodes.append(episode)
+
+        return episodes, boundaries
+
+    def extract_segments(
+        self,
+        transcript: ActionTranscript,
+        context: Optional[str] = None,
+    ) -> EpisodeExtractionResult:
+        """Extract workflow segments from a transcript.
+
+        Args:
+            transcript: ActionTranscript from Stage 1.
+            context: Additional context (e.g., user-provided task description).
+
+        Returns:
+            EpisodeExtractionResult with identified episodes.
+        """
+        system_prompt = self._get_system_prompt()
+        user_prompt = self._build_user_prompt(transcript, context)
+
+        response_text = self._call_llm(system_prompt, user_prompt)
+        episodes, boundaries = self._parse_response(response_text, transcript)
+
+        # Filter by duration
+        filtered_episodes = [
+            e
+            for e in episodes
+            if self.min_segment_duration <= e.duration <= self.max_segment_duration
+        ]
+
+        # Filter by confidence
+        filtered_episodes = [
+            e
+            for e in filtered_episodes
+            if e.boundary_confidence >= self.confidence_threshold
+        ]
+
+        # Calculate coverage
+        total_covered = sum(e.duration for e in filtered_episodes)
+        coverage = (
+            total_covered / transcript.total_duration
+            if transcript.total_duration > 0
+            else 0
+        )
+
+        # Calculate average confidence
+        avg_confidence = (
+            sum(e.boundary_confidence for e in filtered_episodes)
+            / len(filtered_episodes)
+            if filtered_episodes
+            else 0
+        )
+
+        return EpisodeExtractionResult(
+            recording_id=transcript.recording_id,
+            recording_name=transcript.recording_name,
+            episodes=filtered_episodes,
+            boundaries=boundaries,
+            llm_model=self.model,
+            processing_timestamp=datetime.now(),
+            coverage=min(coverage, 1.0),
+            avg_confidence=avg_confidence,
+        )
+
+    def identify_boundaries(
+        self,
+        transcript: ActionTranscript,
+    ) -> list[EpisodeBoundary]:
+        """Identify potential segment boundaries in a transcript.
+
+        This is a lighter-weight method that just finds boundaries
+        without full episode extraction.
+
+        Args:
+            transcript: ActionTranscript from Stage 1.
+
+        Returns:
+            List of potential boundaries with confidence scores.
+        """
+        result = self.extract_segments(transcript)
+        return result.boundaries
+
+    def refine_segment(
+        self,
+        segment: Episode,
+        transcript: ActionTranscript,
+    ) -> Episode:
+        """Refine a segment's boundaries and description.
+
+        Use this to improve segment quality after initial extraction.
+
+        Args:
+            segment: Segment to refine.
+            transcript: Full transcript for context.
+
+        Returns:
+            Refined Episode.
+        """
+        # Get frames around the segment boundaries
+        context_frames = [
+            f
+            for f in transcript.frames
+            if segment.start_time - 5 <= f.timestamp <= segment.end_time + 5
+        ]
+
+        context = f"Refining segment '{segment.name}' with original boundaries {segment.start_time_formatted} - {segment.end_time_formatted}"
+
+        # Create mini-transcript
+        mini_transcript = ActionTranscript(
+            recording_id=transcript.recording_id,
+            recording_name=transcript.recording_name,
+            frames=context_frames,
+            total_duration=context_frames[-1].timestamp - context_frames[0].timestamp
+            if context_frames
+            else 0,
+            frame_count=len(context_frames),
+            vlm_model=transcript.vlm_model,
+        )
+
+        result = self.extract_segments(mini_transcript, context)
+        if result.episodes:
+            return result.episodes[0]
+        return segment
+
+    def merge_segments(
+        self,
+        segments: list[Episode],
+        max_gap: float = 2.0,
+    ) -> list[Episode]:
+        """Merge adjacent segments that appear to be part of the same workflow.
+
+        Args:
+            segments: List of segments to potentially merge.
+            max_gap: Maximum gap (seconds) between segments to consider merging.
+
+        Returns:
+            List of merged segments.
+        """
+        if not segments:
+            return []
+
+        # Sort by start time
+        sorted_segments = sorted(segments, key=lambda s: s.start_time)
+
+        merged = [sorted_segments[0]]
+        for segment in sorted_segments[1:]:
+            last = merged[-1]
+            gap = segment.start_time - last.end_time
+
+            # Check if should merge
+            if gap <= max_gap and segment.application == last.application:
+                # Merge segments
+                merged_segment = Episode(
+                    episode_id=uuid4(),
+                    name=f"{last.name} + {segment.name}",
+                    start_time=last.start_time,
+                    end_time=segment.end_time,
+                    start_time_formatted=last.start_time_formatted,
+                    end_time_formatted=segment.end_time_formatted,
+                    description=f"{last.description}. Then, {segment.description}",
+                    step_summaries=last.step_summaries + segment.step_summaries,
+                    application=last.application,
+                    boundary_confidence=min(
+                        last.boundary_confidence, segment.boundary_confidence
+                    ),
+                    coherence_score=(last.coherence_score + segment.coherence_score)
+                    / 2,
+                    recording_id=last.recording_id,
+                    frame_indices=last.frame_indices + segment.frame_indices,
+                )
+                merged[-1] = merged_segment
+            else:
+                merged.append(segment)
+
+        return merged
+
+    def adjust_boundary(
+        self,
+        segment: Episode,
+        new_start: Optional[float] = None,
+        new_end: Optional[float] = None,
+        transcript: Optional[ActionTranscript] = None,
+    ) -> Episode:
+        """Manually adjust segment boundaries.
+
+        For human-in-the-loop refinement.
+
+        Args:
+            segment: Segment to adjust.
+            new_start: New start time (or None to keep existing).
+            new_end: New end time (or None to keep existing).
+            transcript: Transcript to re-extract step info from new boundaries.
+
+        Returns:
+            Adjusted Episode.
+        """
+        start_time = new_start if new_start is not None else segment.start_time
+        end_time = new_end if new_end is not None else segment.end_time
+
+        # Update frame indices if transcript provided
+        frame_indices = segment.frame_indices
+        if transcript:
+            frame_indices = [
+                f.frame_index
+                for f in transcript.frames
+                if start_time <= f.timestamp <= end_time
+            ]
+
+        return Episode(
+            episode_id=segment.episode_id,
+            name=segment.name,
+            start_time=start_time,
+            end_time=end_time,
+            start_time_formatted=f"{int(start_time // 60):02d}:{start_time % 60:04.1f}",
+            end_time_formatted=f"{int(end_time // 60):02d}:{end_time % 60:04.1f}",
+            description=segment.description,
+            steps=segment.steps,
+            step_summaries=segment.step_summaries,
+            application=segment.application,
+            prerequisites=segment.prerequisites,
+            outcomes=segment.outcomes,
+            parent_episode_id=segment.parent_episode_id,
+            child_episode_ids=segment.child_episode_ids,
+            boundary_confidence=segment.boundary_confidence
+            * 0.9,  # Reduce confidence for manual adjustment
+            coherence_score=segment.coherence_score,
+            recording_id=segment.recording_id,
+            frame_indices=frame_indices,
+        )
+
+    @property
+    def supported_models(self) -> list[str]:
+        """Return list of supported LLM models."""
+        return self.SUPPORTED_MODELS
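
For orientation, here is a minimal usage sketch of the new `SegmentExtractor` (not part of the package diff). It assumes an `ActionTranscript` has already been produced by Stage 1 (e.g. the `frame_describer` module added in this release) and that the relevant API key is configured via `openadapt_ml.config.settings`; the `load_transcript` helper is hypothetical and stands in for however the Stage 1 output is obtained.

```python
# Sketch only: wiring SegmentExtractor (Stage 2) to a Stage 1 transcript.
# `load_transcript` is a hypothetical helper; everything else uses the API
# defined in segment_extractor.py above.
from openadapt_ml.segmentation.segment_extractor import SegmentExtractor

transcript = load_transcript("recording_001.json")  # hypothetical Stage 1 output

extractor = SegmentExtractor(
    model="gpt-4o",            # must be one of SUPPORTED_MODELS
    use_few_shot=True,         # include the worked examples in the prompt
    confidence_threshold=0.7,  # drop low-confidence boundaries
)

result = extractor.extract_segments(
    transcript, context="User was updating display settings"
)
print(f"coverage={result.coverage:.0%} avg_confidence={result.avg_confidence:.2f}")

# Optionally stitch together episodes separated by <= 2 s gaps in the same app.
episodes = extractor.merge_segments(result.episodes, max_gap=2.0)
for ep in episodes:
    print(
        f"{ep.name}: {ep.start_time_formatted} - {ep.end_time_formatted} "
        f"(confidence {ep.boundary_confidence:.2f})"
    )
```

Note that `extract_segments` already filters episodes by duration and boundary confidence, so `merge_segments` is only needed as a post-hoc step to stitch adjacent same-application episodes back together.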