openadapt-ml 0.2.0__py3-none-any.whl → 0.2.2__py3-none-any.whl
This diff shows the content of publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only and reflects the changes between those versions.
- openadapt_ml/baselines/__init__.py +121 -0
- openadapt_ml/baselines/adapter.py +185 -0
- openadapt_ml/baselines/cli.py +314 -0
- openadapt_ml/baselines/config.py +448 -0
- openadapt_ml/baselines/parser.py +922 -0
- openadapt_ml/baselines/prompts.py +787 -0
- openadapt_ml/benchmarks/__init__.py +13 -115
- openadapt_ml/benchmarks/agent.py +265 -421
- openadapt_ml/benchmarks/azure.py +28 -19
- openadapt_ml/benchmarks/azure_ops_tracker.py +521 -0
- openadapt_ml/benchmarks/cli.py +1722 -4847
- openadapt_ml/benchmarks/trace_export.py +631 -0
- openadapt_ml/benchmarks/viewer.py +22 -5
- openadapt_ml/benchmarks/vm_monitor.py +530 -29
- openadapt_ml/benchmarks/waa_deploy/Dockerfile +47 -53
- openadapt_ml/benchmarks/waa_deploy/api_agent.py +21 -20
- openadapt_ml/cloud/azure_inference.py +3 -5
- openadapt_ml/cloud/lambda_labs.py +722 -307
- openadapt_ml/cloud/local.py +2038 -487
- openadapt_ml/cloud/ssh_tunnel.py +68 -26
- openadapt_ml/datasets/next_action.py +40 -30
- openadapt_ml/evals/grounding.py +8 -3
- openadapt_ml/evals/plot_eval_metrics.py +15 -13
- openadapt_ml/evals/trajectory_matching.py +41 -26
- openadapt_ml/experiments/demo_prompt/format_demo.py +16 -6
- openadapt_ml/experiments/demo_prompt/run_experiment.py +26 -16
- openadapt_ml/experiments/representation_shootout/__init__.py +70 -0
- openadapt_ml/experiments/representation_shootout/conditions.py +708 -0
- openadapt_ml/experiments/representation_shootout/config.py +390 -0
- openadapt_ml/experiments/representation_shootout/evaluator.py +659 -0
- openadapt_ml/experiments/representation_shootout/runner.py +687 -0
- openadapt_ml/experiments/waa_demo/runner.py +29 -14
- openadapt_ml/export/parquet.py +36 -24
- openadapt_ml/grounding/detector.py +18 -14
- openadapt_ml/ingest/__init__.py +8 -6
- openadapt_ml/ingest/capture.py +25 -22
- openadapt_ml/ingest/loader.py +7 -4
- openadapt_ml/ingest/synthetic.py +189 -100
- openadapt_ml/models/api_adapter.py +14 -4
- openadapt_ml/models/base_adapter.py +10 -2
- openadapt_ml/models/providers/__init__.py +288 -0
- openadapt_ml/models/providers/anthropic.py +266 -0
- openadapt_ml/models/providers/base.py +299 -0
- openadapt_ml/models/providers/google.py +376 -0
- openadapt_ml/models/providers/openai.py +342 -0
- openadapt_ml/models/qwen_vl.py +46 -19
- openadapt_ml/perception/__init__.py +35 -0
- openadapt_ml/perception/integration.py +399 -0
- openadapt_ml/retrieval/demo_retriever.py +50 -24
- openadapt_ml/retrieval/embeddings.py +9 -8
- openadapt_ml/retrieval/retriever.py +3 -1
- openadapt_ml/runtime/__init__.py +50 -0
- openadapt_ml/runtime/policy.py +18 -5
- openadapt_ml/runtime/safety_gate.py +471 -0
- openadapt_ml/schema/__init__.py +9 -0
- openadapt_ml/schema/converters.py +74 -27
- openadapt_ml/schema/episode.py +31 -18
- openadapt_ml/scripts/capture_screenshots.py +530 -0
- openadapt_ml/scripts/compare.py +85 -54
- openadapt_ml/scripts/demo_policy.py +4 -1
- openadapt_ml/scripts/eval_policy.py +15 -9
- openadapt_ml/scripts/make_gif.py +1 -1
- openadapt_ml/scripts/prepare_synthetic.py +3 -1
- openadapt_ml/scripts/train.py +21 -9
- openadapt_ml/segmentation/README.md +920 -0
- openadapt_ml/segmentation/__init__.py +97 -0
- openadapt_ml/segmentation/adapters/__init__.py +5 -0
- openadapt_ml/segmentation/adapters/capture_adapter.py +420 -0
- openadapt_ml/segmentation/annotator.py +610 -0
- openadapt_ml/segmentation/cache.py +290 -0
- openadapt_ml/segmentation/cli.py +674 -0
- openadapt_ml/segmentation/deduplicator.py +656 -0
- openadapt_ml/segmentation/frame_describer.py +788 -0
- openadapt_ml/segmentation/pipeline.py +340 -0
- openadapt_ml/segmentation/schemas.py +622 -0
- openadapt_ml/segmentation/segment_extractor.py +634 -0
- openadapt_ml/training/azure_ops_viewer.py +1097 -0
- openadapt_ml/training/benchmark_viewer.py +52 -41
- openadapt_ml/training/shared_ui.py +7 -7
- openadapt_ml/training/stub_provider.py +57 -35
- openadapt_ml/training/trainer.py +143 -86
- openadapt_ml/training/trl_trainer.py +70 -21
- openadapt_ml/training/viewer.py +323 -108
- openadapt_ml/training/viewer_components.py +180 -0
- {openadapt_ml-0.2.0.dist-info → openadapt_ml-0.2.2.dist-info}/METADATA +215 -14
- openadapt_ml-0.2.2.dist-info/RECORD +116 -0
- openadapt_ml/benchmarks/base.py +0 -366
- openadapt_ml/benchmarks/data_collection.py +0 -432
- openadapt_ml/benchmarks/live_tracker.py +0 -180
- openadapt_ml/benchmarks/runner.py +0 -418
- openadapt_ml/benchmarks/waa.py +0 -761
- openadapt_ml/benchmarks/waa_live.py +0 -619
- openadapt_ml-0.2.0.dist-info/RECORD +0 -86
- {openadapt_ml-0.2.0.dist-info → openadapt_ml-0.2.2.dist-info}/WHEEL +0 -0
- {openadapt_ml-0.2.0.dist-info → openadapt_ml-0.2.2.dist-info}/licenses/LICENSE +0 -0
openadapt_ml/segmentation/segment_extractor.py (new file)
@@ -0,0 +1,634 @@

````python
"""Workflow segment extraction using Large Language Models.

This module analyzes action transcripts to identify coherent
workflow segments (episodes) with clear boundaries (Stage 2 of pipeline).
"""

import json
import logging
from datetime import datetime
from typing import Optional
from uuid import uuid4

from openadapt_ml.segmentation.schemas import (
    ActionTranscript,
    Episode,
    EpisodeBoundary,
    EpisodeExtractionResult,
)

logger = logging.getLogger(__name__)


class SegmentExtractor:
    """Extracts workflow segments (episodes) from action transcripts using LLMs.

    This class implements Stage 2 of the segmentation pipeline, identifying
    coherent workflow boundaries within recorded sessions.

    Example:
        >>> extractor = SegmentExtractor(model="gpt-4o")
        >>> result = extractor.extract_segments(transcript)
        >>> for episode in result.episodes:
        ...     print(f"{episode.name}: {episode.start_time_formatted} - {episode.end_time_formatted}")
        Adjust Night Shift Settings: 00:00.0 - 00:12.5
        Change Display Resolution: 00:15.3 - 00:28.1

    Attributes:
        model: LLM model identifier
        use_few_shot: Whether to include few-shot examples
        hierarchical: Whether to extract hierarchical segments
    """

    SUPPORTED_MODELS = [
        "gpt-4o",
        "gpt-4o-mini",
        "claude-sonnet-4-20250514",
        "claude-3-5-haiku-20241022",
        "gemini-2.0-pro",
        "gemini-2.0-flash",
    ]

    def __init__(
        self,
        model: str = "gpt-4o",
        use_few_shot: bool = True,
        hierarchical: bool = False,
        min_segment_duration: float = 2.0,
        max_segment_duration: float = 300.0,
        confidence_threshold: float = 0.7,
    ) -> None:
        """Initialize the segment extractor.

        Args:
            model: LLM model to use.
            use_few_shot: Include few-shot examples in prompts.
            hierarchical: Extract nested task/subtask structure.
            min_segment_duration: Minimum segment length in seconds.
            max_segment_duration: Maximum segment length in seconds.
            confidence_threshold: Minimum boundary confidence to accept.
        """
        self.model = model
        self.use_few_shot = use_few_shot
        self.hierarchical = hierarchical
        self.min_segment_duration = min_segment_duration
        self.max_segment_duration = max_segment_duration
        self.confidence_threshold = confidence_threshold
        self._client = None

    def _get_client(self):
        """Get or create LLM client."""
        if self._client is not None:
            return self._client

        if "gpt" in self.model.lower():
            import openai
            from openadapt_ml.config import settings

            self._client = openai.OpenAI(api_key=settings.openai_api_key)
        elif "claude" in self.model.lower():
            import anthropic
            from openadapt_ml.config import settings

            self._client = anthropic.Anthropic(api_key=settings.anthropic_api_key)
        elif "gemini" in self.model.lower():
            import google.generativeai as genai
            from openadapt_ml.config import settings

            genai.configure(api_key=settings.google_api_key)
            self._client = genai.GenerativeModel(self.model)
        else:
            raise ValueError(f"Unknown model: {self.model}")

        return self._client

    def _get_system_prompt(self) -> str:
        """Return system prompt for LLM."""
        return """You are an expert at analyzing user workflows in GUI applications. Your task is to identify distinct workflow segments (episodes) within a transcript of user actions.

A workflow segment is:
- A coherent sequence of actions with a clear goal
- Self-contained (could be taught/explained as a single procedure)
- Has a clear beginning and end

Guidelines for identifying segments:
1. **Goal boundaries**: When the user's apparent goal changes, that's a new segment
2. **Application switches**: Major application changes often indicate segment boundaries
3. **Task completion**: Successful completion of a task (clicking Save, Submit, etc.) often ends a segment
4. **Natural pauses**: Significant time gaps may indicate segment boundaries
5. **Hierarchical tasks**: Large tasks may contain sub-segments (e.g., "Create document" contains "Add title", "Add body", "Save")

Avoid:
- Creating segments that are too granular (single actions)
- Creating segments that are too broad (entire session as one segment)
- Missing obvious task boundaries"""

    def _get_few_shot_examples(self) -> str:
        """Return few-shot examples for better extraction."""
        return """Here are examples of correctly segmented transcripts:

## Example 1: System Settings Workflow
**Transcript**:
```
[00:00.0] User opens System Preferences from Apple menu
[00:02.5] User clicks Display settings
[00:05.1] User navigates to Night Shift tab
[00:07.3] User enables Night Shift toggle
[00:09.8] User adjusts schedule slider to 9 PM - 7 AM
[00:12.5] User closes System Preferences
[00:15.0] User opens Notes application
[00:17.2] User creates a new note
[00:20.5] User types "Meeting notes for tomorrow"
```

**Expected segments**:
```json
{
  "segments": [
    {
      "name": "Configure Night Shift Schedule",
      "start_time": 0.0,
      "end_time": 12.5,
      "description": "Enable and configure Night Shift automatic scheduling in Display settings",
      "step_summaries": [
        "Open System Preferences",
        "Navigate to Display > Night Shift",
        "Enable Night Shift",
        "Set schedule 9 PM - 7 AM"
      ],
      "application": "System Preferences",
      "boundary_confidence": 0.95
    },
    {
      "name": "Create Meeting Notes",
      "start_time": 15.0,
      "end_time": 20.5,
      "description": "Start a new note for meeting notes in the Notes application",
      "step_summaries": [
        "Open Notes application",
        "Create new note",
        "Add title"
      ],
      "application": "Notes",
      "boundary_confidence": 0.85
    }
  ]
}
```

## Example 2: Web Browser Workflow
**Transcript**:
```
[00:00.0] User opens Chrome browser
[00:02.1] User clicks URL bar
[00:03.5] User types "github.com"
[00:05.2] User presses Enter to navigate
[00:08.4] User clicks "Sign in" button
[00:10.1] User types email address
[00:12.8] User types password
[00:15.3] User clicks "Sign in" button
[00:18.5] User clicks "New repository" button
[00:21.2] User types "my-project" as repository name
[00:24.8] User selects "Private" radio button
[00:27.1] User clicks "Create repository" button
```

**Expected segments**:
```json
{
  "segments": [
    {
      "name": "Sign In to GitHub",
      "start_time": 0.0,
      "end_time": 15.3,
      "description": "Navigate to GitHub and authenticate with email and password",
      "step_summaries": [
        "Open browser and navigate to github.com",
        "Click Sign in",
        "Enter credentials",
        "Submit login form"
      ],
      "application": "Chrome - GitHub",
      "boundary_confidence": 0.95
    },
    {
      "name": "Create Private Repository",
      "start_time": 18.5,
      "end_time": 27.1,
      "description": "Create a new private repository named my-project",
      "step_summaries": [
        "Click New repository",
        "Enter repository name",
        "Select Private visibility",
        "Create repository"
      ],
      "application": "Chrome - GitHub",
      "boundary_confidence": 0.9
    }
  ]
}
```

---

"""

    def _build_user_prompt(
        self, transcript: ActionTranscript, context: Optional[str]
    ) -> str:
        """Build user prompt for segment extraction."""
        lines = []

        if self.use_few_shot:
            lines.append(self._get_few_shot_examples())

        lines.append("Now analyze this transcript:\n")
        lines.append("## Recording Information")
        lines.append(f"- Recording ID: {transcript.recording_id}")
        lines.append(f"- Total Duration: {transcript.duration_formatted}")
        if transcript.task_description:
            lines.append(f"- Task Description: {transcript.task_description}")
        if context:
            lines.append(f"- Additional Context: {context}")

        lines.append("\n## Action Transcript")
        lines.append("```")
        lines.append(transcript.to_transcript_text())
        lines.append("```")

        lines.append("""
Identify all workflow segments in this transcript. For each segment, provide:
1. A concise name (e.g., "Adjust Night Shift Settings")
2. Start and end timestamps
3. A description of what the workflow accomplishes
4. A list of high-level steps
5. Confidence in the segment boundaries (0-1)

Respond with JSON in this format:
```json
{
  "segments": [
    {
      "name": "Segment Name",
      "start_time": 0.0,
      "end_time": 12.5,
      "start_time_formatted": "00:00.0",
      "end_time_formatted": "00:12.5",
      "description": "What this workflow accomplishes",
      "step_summaries": ["Step 1", "Step 2", "Step 3"],
      "application": "Primary application",
      "boundary_confidence": 0.9,
      "coherence_score": 0.85
    }
  ],
  "boundaries": [
    {
      "timestamp": 12.5,
      "confidence": 0.9,
      "reason": "Task completed - settings saved"
    }
  ]
}
```""")
        return "\n".join(lines)

    def _call_llm(self, system_prompt: str, user_prompt: str) -> str:
        """Call the LLM and return response text."""
        client = self._get_client()

        if "gpt" in self.model.lower():
            response = client.chat.completions.create(
                model=self.model,
                max_tokens=4096,
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt},
                ],
            )
            return response.choices[0].message.content

        elif "claude" in self.model.lower():
            response = client.messages.create(
                model=self.model,
                max_tokens=4096,
                system=system_prompt,
                messages=[{"role": "user", "content": user_prompt}],
            )
            return response.content[0].text

        elif "gemini" in self.model.lower():
            response = client.generate_content(f"{system_prompt}\n\n{user_prompt}")
            return response.text

        raise ValueError(f"Unknown model: {self.model}")

    def _parse_response(
        self, text: str, transcript: ActionTranscript
    ) -> tuple[list[Episode], list[EpisodeBoundary]]:
        """Parse LLM response into Episode and EpisodeBoundary objects."""
        episodes = []
        boundaries = []

        try:
            # Find JSON in response
            start = text.find("{")
            end = text.rfind("}") + 1
            if start >= 0 and end > start:
                data = json.loads(text[start:end])

                # Parse segments
                for seg_data in data.get("segments", []):
                    # Find frame indices for this segment
                    start_time = seg_data.get("start_time", 0)
                    end_time = seg_data.get("end_time", 0)
                    frame_indices = [
                        f.frame_index
                        for f in transcript.frames
                        if start_time <= f.timestamp <= end_time
                    ]

                    episode = Episode(
                        episode_id=uuid4(),
                        name=seg_data.get("name", "Unknown"),
                        start_time=start_time,
                        end_time=end_time,
                        start_time_formatted=seg_data.get(
                            "start_time_formatted",
                            f"{int(start_time // 60):02d}:{start_time % 60:04.1f}",
                        ),
                        end_time_formatted=seg_data.get(
                            "end_time_formatted",
                            f"{int(end_time // 60):02d}:{end_time % 60:04.1f}",
                        ),
                        description=seg_data.get("description", ""),
                        step_summaries=seg_data.get("step_summaries", []),
                        application=seg_data.get("application", "Unknown"),
                        boundary_confidence=seg_data.get("boundary_confidence", 0.5),
                        coherence_score=seg_data.get("coherence_score", 0.5),
                        recording_id=transcript.recording_id,
                        frame_indices=frame_indices,
                    )
                    episodes.append(episode)

                # Parse boundaries
                for bnd_data in data.get("boundaries", []):
                    boundary = EpisodeBoundary(
                        timestamp=bnd_data.get("timestamp", 0),
                        confidence=bnd_data.get("confidence", 0.5),
                        reason=bnd_data.get("reason", ""),
                    )
                    boundaries.append(boundary)

        except json.JSONDecodeError as e:
            logger.warning(f"Failed to parse LLM response: {e}")
            # Create a single episode covering the entire transcript
            episode = Episode(
                episode_id=uuid4(),
                name="Full Recording",
                start_time=0,
                end_time=transcript.total_duration,
                start_time_formatted="00:00.0",
                end_time_formatted=transcript.duration_formatted,
                description="Complete recording (automatic segmentation failed)",
                step_summaries=[f.apparent_intent for f in transcript.frames[:5]],
                application="Unknown",
                boundary_confidence=0.1,
                coherence_score=0.1,
                recording_id=transcript.recording_id,
                frame_indices=[f.frame_index for f in transcript.frames],
            )
            episodes.append(episode)

        return episodes, boundaries

    def extract_segments(
        self,
        transcript: ActionTranscript,
        context: Optional[str] = None,
    ) -> EpisodeExtractionResult:
        """Extract workflow segments from a transcript.

        Args:
            transcript: ActionTranscript from Stage 1.
            context: Additional context (e.g., user-provided task description).

        Returns:
            EpisodeExtractionResult with identified episodes.
        """
        system_prompt = self._get_system_prompt()
        user_prompt = self._build_user_prompt(transcript, context)

        response_text = self._call_llm(system_prompt, user_prompt)
        episodes, boundaries = self._parse_response(response_text, transcript)

        # Filter by duration
        filtered_episodes = [
            e
            for e in episodes
            if self.min_segment_duration <= e.duration <= self.max_segment_duration
        ]

        # Filter by confidence
        filtered_episodes = [
            e
            for e in filtered_episodes
            if e.boundary_confidence >= self.confidence_threshold
        ]

        # Calculate coverage
        total_covered = sum(e.duration for e in filtered_episodes)
        coverage = (
            total_covered / transcript.total_duration
            if transcript.total_duration > 0
            else 0
        )

        # Calculate average confidence
        avg_confidence = (
            sum(e.boundary_confidence for e in filtered_episodes)
            / len(filtered_episodes)
            if filtered_episodes
            else 0
        )

        return EpisodeExtractionResult(
            recording_id=transcript.recording_id,
            recording_name=transcript.recording_name,
            episodes=filtered_episodes,
            boundaries=boundaries,
            llm_model=self.model,
            processing_timestamp=datetime.now(),
            coverage=min(coverage, 1.0),
            avg_confidence=avg_confidence,
        )

    def identify_boundaries(
        self,
        transcript: ActionTranscript,
    ) -> list[EpisodeBoundary]:
        """Identify potential segment boundaries in a transcript.

        This is a lighter-weight method that just finds boundaries
        without full episode extraction.

        Args:
            transcript: ActionTranscript from Stage 1.

        Returns:
            List of potential boundaries with confidence scores.
        """
        result = self.extract_segments(transcript)
        return result.boundaries

    def refine_segment(
        self,
        segment: Episode,
        transcript: ActionTranscript,
    ) -> Episode:
        """Refine a segment's boundaries and description.

        Use this to improve segment quality after initial extraction.

        Args:
            segment: Segment to refine.
            transcript: Full transcript for context.

        Returns:
            Refined Episode.
        """
        # Get frames around the segment boundaries
        context_frames = [
            f
            for f in transcript.frames
            if segment.start_time - 5 <= f.timestamp <= segment.end_time + 5
        ]

        context = f"Refining segment '{segment.name}' with original boundaries {segment.start_time_formatted} - {segment.end_time_formatted}"

        # Create mini-transcript
        mini_transcript = ActionTranscript(
            recording_id=transcript.recording_id,
            recording_name=transcript.recording_name,
            frames=context_frames,
            total_duration=context_frames[-1].timestamp - context_frames[0].timestamp
            if context_frames
            else 0,
            frame_count=len(context_frames),
            vlm_model=transcript.vlm_model,
        )

        result = self.extract_segments(mini_transcript, context)
        if result.episodes:
            return result.episodes[0]
        return segment

    def merge_segments(
        self,
        segments: list[Episode],
        max_gap: float = 2.0,
    ) -> list[Episode]:
        """Merge adjacent segments that appear to be part of the same workflow.

        Args:
            segments: List of segments to potentially merge.
            max_gap: Maximum gap (seconds) between segments to consider merging.

        Returns:
            List of merged segments.
        """
        if not segments:
            return []

        # Sort by start time
        sorted_segments = sorted(segments, key=lambda s: s.start_time)

        merged = [sorted_segments[0]]
        for segment in sorted_segments[1:]:
            last = merged[-1]
            gap = segment.start_time - last.end_time

            # Check if should merge
            if gap <= max_gap and segment.application == last.application:
                # Merge segments
                merged_segment = Episode(
                    episode_id=uuid4(),
                    name=f"{last.name} + {segment.name}",
                    start_time=last.start_time,
                    end_time=segment.end_time,
                    start_time_formatted=last.start_time_formatted,
                    end_time_formatted=segment.end_time_formatted,
                    description=f"{last.description}. Then, {segment.description}",
                    step_summaries=last.step_summaries + segment.step_summaries,
                    application=last.application,
                    boundary_confidence=min(
                        last.boundary_confidence, segment.boundary_confidence
                    ),
                    coherence_score=(last.coherence_score + segment.coherence_score)
                    / 2,
                    recording_id=last.recording_id,
                    frame_indices=last.frame_indices + segment.frame_indices,
                )
                merged[-1] = merged_segment
            else:
                merged.append(segment)

        return merged

    def adjust_boundary(
        self,
        segment: Episode,
        new_start: Optional[float] = None,
        new_end: Optional[float] = None,
        transcript: Optional[ActionTranscript] = None,
    ) -> Episode:
        """Manually adjust segment boundaries.

        For human-in-the-loop refinement.

        Args:
            segment: Segment to adjust.
            new_start: New start time (or None to keep existing).
            new_end: New end time (or None to keep existing).
            transcript: Transcript to re-extract step info from new boundaries.

        Returns:
            Adjusted Episode.
        """
        start_time = new_start if new_start is not None else segment.start_time
        end_time = new_end if new_end is not None else segment.end_time

        # Update frame indices if transcript provided
        frame_indices = segment.frame_indices
        if transcript:
            frame_indices = [
                f.frame_index
                for f in transcript.frames
                if start_time <= f.timestamp <= end_time
            ]

        return Episode(
            episode_id=segment.episode_id,
            name=segment.name,
            start_time=start_time,
            end_time=end_time,
            start_time_formatted=f"{int(start_time // 60):02d}:{start_time % 60:04.1f}",
            end_time_formatted=f"{int(end_time // 60):02d}:{end_time % 60:04.1f}",
            description=segment.description,
            steps=segment.steps,
            step_summaries=segment.step_summaries,
            application=segment.application,
            prerequisites=segment.prerequisites,
            outcomes=segment.outcomes,
            parent_episode_id=segment.parent_episode_id,
            child_episode_ids=segment.child_episode_ids,
            boundary_confidence=segment.boundary_confidence
            * 0.9,  # Reduce confidence for manual adjustment
            coherence_score=segment.coherence_score,
            recording_id=segment.recording_id,
            frame_indices=frame_indices,
        )

    @property
    def supported_models(self) -> list[str]:
        """Return list of supported LLM models."""
        return self.SUPPORTED_MODELS
````
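For orientation, a minimal usage sketch of the new `SegmentExtractor` API follows. It assumes an `ActionTranscript` has already been produced by Stage 1 of the segmentation pipeline (presumably the new `frame_describer` module); the `segment_recording` wrapper and the example `context` string are illustrative, not part of the package.

```python
# Minimal usage sketch (assumes a Stage 1 ActionTranscript is already available;
# `segment_recording` and the context string are illustrative, not package APIs).
from openadapt_ml.segmentation.schemas import ActionTranscript, Episode
from openadapt_ml.segmentation.segment_extractor import SegmentExtractor


def segment_recording(transcript: ActionTranscript) -> list[Episode]:
    extractor = SegmentExtractor(
        model="gpt-4o",            # any entry in SegmentExtractor.SUPPORTED_MODELS
        use_few_shot=True,         # include the few-shot examples in the prompt
        confidence_threshold=0.7,  # drop low-confidence boundaries
    )
    result = extractor.extract_segments(
        transcript, context="User was configuring display settings"
    )

    # Optionally collapse near-adjacent segments from the same application.
    episodes = extractor.merge_segments(result.episodes, max_gap=2.0)

    for episode in episodes:
        print(f"{episode.name}: {episode.start_time_formatted} - {episode.end_time_formatted}")
        for step in episode.step_summaries:
            print(f"  - {step}")
    return episodes
```

Note that `merge_segments` only joins segments when the gap between them is at most `max_gap` seconds and both report the same `application`, so unrelated workflows remain separate.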