openadapt-ml 0.1.0__py3-none-any.whl → 0.2.1__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
- openadapt_ml/baselines/__init__.py +121 -0
- openadapt_ml/baselines/adapter.py +185 -0
- openadapt_ml/baselines/cli.py +314 -0
- openadapt_ml/baselines/config.py +448 -0
- openadapt_ml/baselines/parser.py +922 -0
- openadapt_ml/baselines/prompts.py +787 -0
- openadapt_ml/benchmarks/__init__.py +13 -107
- openadapt_ml/benchmarks/agent.py +297 -374
- openadapt_ml/benchmarks/azure.py +62 -24
- openadapt_ml/benchmarks/azure_ops_tracker.py +521 -0
- openadapt_ml/benchmarks/cli.py +1874 -751
- openadapt_ml/benchmarks/trace_export.py +631 -0
- openadapt_ml/benchmarks/viewer.py +1236 -0
- openadapt_ml/benchmarks/vm_monitor.py +1111 -0
- openadapt_ml/benchmarks/waa_deploy/Dockerfile +216 -0
- openadapt_ml/benchmarks/waa_deploy/__init__.py +10 -0
- openadapt_ml/benchmarks/waa_deploy/api_agent.py +540 -0
- openadapt_ml/benchmarks/waa_deploy/start_waa_server.bat +53 -0
- openadapt_ml/cloud/azure_inference.py +3 -5
- openadapt_ml/cloud/lambda_labs.py +722 -307
- openadapt_ml/cloud/local.py +3194 -89
- openadapt_ml/cloud/ssh_tunnel.py +595 -0
- openadapt_ml/datasets/next_action.py +125 -96
- openadapt_ml/evals/grounding.py +32 -9
- openadapt_ml/evals/plot_eval_metrics.py +15 -13
- openadapt_ml/evals/trajectory_matching.py +120 -57
- openadapt_ml/experiments/demo_prompt/__init__.py +19 -0
- openadapt_ml/experiments/demo_prompt/format_demo.py +236 -0
- openadapt_ml/experiments/demo_prompt/results/experiment_20251231_002125.json +83 -0
- openadapt_ml/experiments/demo_prompt/results/experiment_n30_20251231_165958.json +1100 -0
- openadapt_ml/experiments/demo_prompt/results/multistep_20251231_025051.json +182 -0
- openadapt_ml/experiments/demo_prompt/run_experiment.py +541 -0
- openadapt_ml/experiments/representation_shootout/__init__.py +70 -0
- openadapt_ml/experiments/representation_shootout/conditions.py +708 -0
- openadapt_ml/experiments/representation_shootout/config.py +390 -0
- openadapt_ml/experiments/representation_shootout/evaluator.py +659 -0
- openadapt_ml/experiments/representation_shootout/runner.py +687 -0
- openadapt_ml/experiments/waa_demo/__init__.py +10 -0
- openadapt_ml/experiments/waa_demo/demos.py +357 -0
- openadapt_ml/experiments/waa_demo/runner.py +732 -0
- openadapt_ml/experiments/waa_demo/tasks.py +151 -0
- openadapt_ml/export/__init__.py +9 -0
- openadapt_ml/export/__main__.py +6 -0
- openadapt_ml/export/cli.py +89 -0
- openadapt_ml/export/parquet.py +277 -0
- openadapt_ml/grounding/detector.py +18 -14
- openadapt_ml/ingest/__init__.py +11 -10
- openadapt_ml/ingest/capture.py +97 -86
- openadapt_ml/ingest/loader.py +120 -69
- openadapt_ml/ingest/synthetic.py +344 -193
- openadapt_ml/models/api_adapter.py +14 -4
- openadapt_ml/models/base_adapter.py +10 -2
- openadapt_ml/models/providers/__init__.py +288 -0
- openadapt_ml/models/providers/anthropic.py +266 -0
- openadapt_ml/models/providers/base.py +299 -0
- openadapt_ml/models/providers/google.py +376 -0
- openadapt_ml/models/providers/openai.py +342 -0
- openadapt_ml/models/qwen_vl.py +46 -19
- openadapt_ml/perception/__init__.py +35 -0
- openadapt_ml/perception/integration.py +399 -0
- openadapt_ml/retrieval/README.md +226 -0
- openadapt_ml/retrieval/USAGE.md +391 -0
- openadapt_ml/retrieval/__init__.py +91 -0
- openadapt_ml/retrieval/demo_retriever.py +843 -0
- openadapt_ml/retrieval/embeddings.py +630 -0
- openadapt_ml/retrieval/index.py +194 -0
- openadapt_ml/retrieval/retriever.py +162 -0
- openadapt_ml/runtime/__init__.py +50 -0
- openadapt_ml/runtime/policy.py +27 -14
- openadapt_ml/runtime/safety_gate.py +471 -0
- openadapt_ml/schema/__init__.py +113 -0
- openadapt_ml/schema/converters.py +588 -0
- openadapt_ml/schema/episode.py +470 -0
- openadapt_ml/scripts/capture_screenshots.py +530 -0
- openadapt_ml/scripts/compare.py +102 -61
- openadapt_ml/scripts/demo_policy.py +4 -1
- openadapt_ml/scripts/eval_policy.py +19 -14
- openadapt_ml/scripts/make_gif.py +1 -1
- openadapt_ml/scripts/prepare_synthetic.py +16 -17
- openadapt_ml/scripts/train.py +98 -75
- openadapt_ml/segmentation/README.md +920 -0
- openadapt_ml/segmentation/__init__.py +97 -0
- openadapt_ml/segmentation/adapters/__init__.py +5 -0
- openadapt_ml/segmentation/adapters/capture_adapter.py +420 -0
- openadapt_ml/segmentation/annotator.py +610 -0
- openadapt_ml/segmentation/cache.py +290 -0
- openadapt_ml/segmentation/cli.py +674 -0
- openadapt_ml/segmentation/deduplicator.py +656 -0
- openadapt_ml/segmentation/frame_describer.py +788 -0
- openadapt_ml/segmentation/pipeline.py +340 -0
- openadapt_ml/segmentation/schemas.py +622 -0
- openadapt_ml/segmentation/segment_extractor.py +634 -0
- openadapt_ml/training/azure_ops_viewer.py +1097 -0
- openadapt_ml/training/benchmark_viewer.py +3255 -19
- openadapt_ml/training/shared_ui.py +7 -7
- openadapt_ml/training/stub_provider.py +57 -35
- openadapt_ml/training/trainer.py +255 -441
- openadapt_ml/training/trl_trainer.py +403 -0
- openadapt_ml/training/viewer.py +323 -108
- openadapt_ml/training/viewer_components.py +180 -0
- {openadapt_ml-0.1.0.dist-info → openadapt_ml-0.2.1.dist-info}/METADATA +312 -69
- openadapt_ml-0.2.1.dist-info/RECORD +116 -0
- openadapt_ml/benchmarks/base.py +0 -366
- openadapt_ml/benchmarks/data_collection.py +0 -432
- openadapt_ml/benchmarks/runner.py +0 -381
- openadapt_ml/benchmarks/waa.py +0 -704
- openadapt_ml/schemas/__init__.py +0 -53
- openadapt_ml/schemas/sessions.py +0 -122
- openadapt_ml/schemas/validation.py +0 -252
- openadapt_ml-0.1.0.dist-info/RECORD +0 -55
- {openadapt_ml-0.1.0.dist-info → openadapt_ml-0.2.1.dist-info}/WHEEL +0 -0
- {openadapt_ml-0.1.0.dist-info → openadapt_ml-0.2.1.dist-info}/licenses/LICENSE +0 -0
openadapt_ml/segmentation/deduplicator.py (new file)
@@ -0,0 +1,656 @@
"""Workflow deduplication using embeddings and clustering.

This module identifies and merges similar workflows across
multiple recordings to create a canonical episode library (Stage 3).
"""

import json
import logging
from pathlib import Path
from typing import Optional, Union
from uuid import uuid4

import numpy as np
from numpy.typing import NDArray

from openadapt_ml.segmentation.schemas import (
    CanonicalEpisode,
    Episode,
    EpisodeExtractionResult,
    EpisodeLibrary,
)

logger = logging.getLogger(__name__)


class OpenAIEmbedder:
    """OpenAI text embeddings."""

    def __init__(
        self,
        model: str = "text-embedding-3-large",
        api_key: Optional[str] = None,
    ):
        self.model = model
        self._api_key = api_key
        self._client = None

    def _get_client(self):
        if self._client is None:
            import openai
            from openadapt_ml.config import settings

            api_key = self._api_key or settings.openai_api_key
            self._client = openai.OpenAI(api_key=api_key)
        return self._client

    def embed(self, texts: list[str]) -> NDArray[np.float32]:
        """Generate embeddings for texts."""
        client = self._get_client()
        response = client.embeddings.create(
            model=self.model,
            input=texts,
        )
        embeddings = [r.embedding for r in response.data]
        return np.array(embeddings, dtype=np.float32)


class LocalEmbedder:
    """Local HuggingFace embeddings (no API required)."""

    def __init__(
        self,
        model: str = "intfloat/e5-large-v2",
        device: str = "auto",
    ):
        self.model_name = model
        self.device = device
        self._model = None
        self._tokenizer = None

    def _load_model(self):
        if self._model is None:
            try:
                from transformers import AutoModel, AutoTokenizer
                import torch

                self._tokenizer = AutoTokenizer.from_pretrained(self.model_name)
                self._model = AutoModel.from_pretrained(self.model_name)

                if self.device == "auto":
                    if torch.cuda.is_available():
                        self._model = self._model.cuda()
                    elif (
                        hasattr(torch.backends, "mps")
                        and torch.backends.mps.is_available()
                    ):
                        self._model = self._model.to("mps")
                elif self.device != "cpu":
                    self._model = self._model.to(self.device)

                self._model.eval()
            except ImportError:
                raise ImportError(
                    "LocalEmbedder requires transformers and torch. "
                    "Install with: pip install transformers torch"
                )

    def embed(self, texts: list[str]) -> NDArray[np.float32]:
        """Generate embeddings for texts."""
        import torch

        self._load_model()

        # Add prefix for e5 models
        if "e5" in self.model_name.lower():
            texts = [f"query: {t}" for t in texts]

        inputs = self._tokenizer(
            texts,
            padding=True,
            truncation=True,
            max_length=512,
            return_tensors="pt",
        )

        if next(self._model.parameters()).is_cuda:
            inputs = {k: v.cuda() for k, v in inputs.items()}
        elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
            device = next(self._model.parameters()).device
            inputs = {k: v.to(device) for k, v in inputs.items()}

        with torch.no_grad():
            outputs = self._model(**inputs)
            # Mean pooling
            attention_mask = inputs["attention_mask"]
            token_embeddings = outputs.last_hidden_state
            input_mask_expanded = (
                attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
            )
            embeddings = torch.sum(
                token_embeddings * input_mask_expanded, 1
            ) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

        return embeddings.cpu().numpy().astype(np.float32)


def episode_to_text(episode: Episode) -> str:
    """Convert an episode to text for embedding.

    Combines multiple fields for rich semantic representation.
    """
    parts = [
        f"Workflow: {episode.name}",
        f"Description: {episode.description}",
        f"Application: {episode.application}",
        f"Steps: {', '.join(episode.step_summaries)}",
    ]

    if episode.prerequisites:
        parts.append(f"Prerequisites: {', '.join(episode.prerequisites)}")

    if episode.outcomes:
        parts.append(f"Outcomes: {', '.join(episode.outcomes)}")

    return "\n".join(parts)


class WorkflowDeduplicator:
    """Deduplicates workflow episodes using embedding similarity.

    This class implements Stage 3 of the segmentation pipeline, identifying
    similar workflows across recordings and merging them into canonical
    definitions.

    Example:
        >>> dedup = WorkflowDeduplicator(threshold=0.85)
        >>> library = dedup.deduplicate(extraction_results)
        >>> print(f"Found {library.unique_episode_count} unique workflows")
        >>> print(f"Deduplication ratio: {library.deduplication_ratio:.1%}")
        Found 15 unique workflows
        Deduplication ratio: 34.2%

    Attributes:
        threshold: Similarity threshold for clustering
        embedding_model: Model used for text embeddings
        merge_strategy: How to merge similar episodes
    """

    def __init__(
        self,
        threshold: float = 0.85,
        embedding_model: str = "text-embedding-3-large",
        embedding_dim: int = 3072,
        merge_strategy: str = "centroid",
        min_cluster_size: int = 1,
        use_local_embeddings: bool = False,
    ) -> None:
        """Initialize the deduplicator.

        Args:
            threshold: Cosine similarity threshold for clustering.
                Higher = stricter matching, fewer merges.
                Recommended: 0.80-0.90
            embedding_model: Text embedding model.
            embedding_dim: Embedding dimension (model-specific).
            merge_strategy: How to create canonical definition:
                - "centroid": Use episode closest to cluster centroid
                - "longest": Use longest description
                - "first": Use first encountered
            min_cluster_size: Minimum episodes to form a cluster.
            use_local_embeddings: Use local HuggingFace model instead of API.
        """
        self.threshold = threshold
        self.embedding_model = embedding_model
        self.embedding_dim = embedding_dim
        self.merge_strategy = merge_strategy
        self.min_cluster_size = min_cluster_size
        self.use_local_embeddings = use_local_embeddings

        if use_local_embeddings:
            self._embedder = LocalEmbedder(model="intfloat/e5-large-v2")
        else:
            self._embedder = OpenAIEmbedder(model=embedding_model)

    def deduplicate(
        self,
        extraction_results: list[EpisodeExtractionResult],
        existing_library: Optional[EpisodeLibrary] = None,
    ) -> EpisodeLibrary:
        """Deduplicate episodes across multiple extraction results.

        Args:
            extraction_results: List of extraction results from Stage 2.
            existing_library: Optional existing library to merge with.

        Returns:
            EpisodeLibrary with deduplicated canonical episodes.
        """
        # Collect all episodes
        all_episodes = []
        for result in extraction_results:
            all_episodes.extend(result.episodes)

        # Add episodes from existing library
        existing_episodes = []
        if existing_library:
            for canonical in existing_library.episodes:
                # Create synthetic Episode from CanonicalEpisode
                for i, (rec_id, seg_id) in enumerate(
                    zip(canonical.source_recordings, canonical.source_episode_ids)
                ):
                    synthetic = Episode(
                        episode_id=seg_id,
                        name=canonical.variant_names[i]
                        if i < len(canonical.variant_names)
                        else canonical.canonical_name,
                        start_time=0,
                        end_time=0,
                        start_time_formatted="00:00.0",
                        end_time_formatted="00:00.0",
                        description=canonical.variant_descriptions[i]
                        if i < len(canonical.variant_descriptions)
                        else canonical.canonical_description,
                        step_summaries=canonical.canonical_steps,
                        application="Unknown",
                        boundary_confidence=1.0,
                        coherence_score=1.0,
                        recording_id=rec_id,
                    )
                    existing_episodes.append(synthetic)
            all_episodes.extend(existing_episodes)

        if not all_episodes:
            return EpisodeLibrary(
                episodes=[],
                total_recordings_processed=len(extraction_results),
                total_episodes_extracted=0,
                unique_episode_count=0,
                deduplication_ratio=0.0,
                similarity_threshold=self.threshold,
                embedding_model=self.embedding_model,
            )

        # Generate embeddings
        embeddings = self.embed_episodes(all_episodes)

        # Cluster similar episodes
        clusters = self.cluster_episodes(embeddings, all_episodes)

        # Merge clusters into canonical episodes
        canonical_episodes = []
        for cluster_id, indices in enumerate(clusters):
            cluster_episodes = [all_episodes[i] for i in indices]
            cluster_embeddings = embeddings[indices]

            canonical = self.merge_cluster(
                cluster_episodes, cluster_embeddings, cluster_id
            )
            canonical_episodes.append(canonical)

        # Calculate statistics
        total_extracted = len(all_episodes)
        unique_count = len(canonical_episodes)
        dedup_ratio = 1 - (unique_count / total_extracted) if total_extracted > 0 else 0

        return EpisodeLibrary(
            episodes=canonical_episodes,
            total_recordings_processed=len(extraction_results),
            total_episodes_extracted=total_extracted,
            unique_episode_count=unique_count,
            deduplication_ratio=dedup_ratio,
            similarity_threshold=self.threshold,
            embedding_model=self.embedding_model,
        )

    def embed_episode(self, episode: Episode) -> NDArray[np.float32]:
        """Generate embedding for a single workflow episode."""
        text = episode_to_text(episode)
        embeddings = self._embedder.embed([text])
        return embeddings[0]

    def embed_episodes(
        self,
        episodes: list[Episode],
        show_progress: bool = True,
    ) -> NDArray[np.float32]:
        """Generate embeddings for multiple episodes.

        Args:
            episodes: List of episodes to embed.
            show_progress: Show progress bar.

        Returns:
            Embedding matrix of shape (n_episodes, embedding_dim).
        """
        texts = [episode_to_text(ep) for ep in episodes]

        # Process in batches to avoid API limits
        batch_size = 100
        all_embeddings = []

        for i in range(0, len(texts), batch_size):
            batch = texts[i : i + batch_size]
            batch_embeddings = self._embedder.embed(batch)
            all_embeddings.append(batch_embeddings)

            if show_progress:
                logger.info(
                    f"Embedded {min(i + batch_size, len(texts))}/{len(texts)} episodes"
                )

        return np.vstack(all_embeddings)

    def compute_similarity_matrix(
        self,
        embeddings: NDArray[np.float32],
    ) -> NDArray[np.float32]:
        """Compute pairwise cosine similarity matrix.

        Args:
            embeddings: Embedding matrix of shape (n, embedding_dim).

        Returns:
            Similarity matrix of shape (n, n) with values in [-1, 1].
        """
        # Normalize embeddings
        norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
        normalized = embeddings / np.maximum(norms, 1e-9)

        # Compute cosine similarity
        similarity = normalized @ normalized.T
        return similarity

    def cluster_episodes(
        self,
        embeddings: NDArray[np.float32],
        episodes: list[Episode],
    ) -> list[list[int]]:
        """Cluster similar episodes using agglomerative clustering.

        Args:
            embeddings: Embedding matrix.
            episodes: Original episodes (for metadata).

        Returns:
            List of clusters, each containing episode indices.
        """
        try:
            from sklearn.cluster import AgglomerativeClustering
        except ImportError:
            logger.warning("sklearn not available, using simple clustering")
            return self._simple_cluster(embeddings)

        # Normalize embeddings for cosine similarity
        norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
        normalized = embeddings / np.maximum(norms, 1e-9)

        # Compute cosine distances
        distances = 1 - (normalized @ normalized.T)

        # Cluster
        distance_threshold = 1 - self.threshold
        clustering = AgglomerativeClustering(
            n_clusters=None,
            distance_threshold=distance_threshold,
            metric="precomputed",
            linkage="average",
        )
        labels = clustering.fit_predict(distances)

        # Group indices by cluster
        clusters = {}
        for idx, label in enumerate(labels):
            if label not in clusters:
                clusters[label] = []
            clusters[label].append(idx)

        return list(clusters.values())

    def _simple_cluster(self, embeddings: NDArray[np.float32]) -> list[list[int]]:
        """Simple greedy clustering when sklearn not available."""
        # Normalize
        norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
        normalized = embeddings / np.maximum(norms, 1e-9)

        n = len(embeddings)
        assigned = [False] * n
        clusters = []

        for i in range(n):
            if assigned[i]:
                continue

            # Start new cluster
            cluster = [i]
            assigned[i] = True

            for j in range(i + 1, n):
                if assigned[j]:
                    continue

                # Check similarity
                sim = np.dot(normalized[i], normalized[j])
                if sim >= self.threshold:
                    cluster.append(j)
                    assigned[j] = True

            clusters.append(cluster)

        return clusters

    def merge_cluster(
        self,
        episodes: list[Episode],
        embeddings: NDArray[np.float32],
        cluster_id: int,
    ) -> CanonicalEpisode:
        """Merge a cluster of similar episodes into a canonical episode.

        Args:
            episodes: Episodes in this cluster.
            embeddings: Embeddings for these episodes.
            cluster_id: ID for this cluster.

        Returns:
            CanonicalEpisode representing the merged cluster.
        """
        if self.merge_strategy == "centroid":
            # Find episode closest to cluster centroid
            centroid = embeddings.mean(axis=0)
            distances = np.linalg.norm(embeddings - centroid, axis=1)
            canonical_idx = int(np.argmin(distances))

        elif self.merge_strategy == "longest":
            # Use episode with longest description
            lengths = [len(ep.description) for ep in episodes]
            canonical_idx = int(np.argmax(lengths))

        elif self.merge_strategy == "first":
            # Use first encountered
            canonical_idx = 0

        else:
            raise ValueError(f"Unknown merge strategy: {self.merge_strategy}")

        canonical_episode = episodes[canonical_idx]

        # Compute internal similarity
        if len(embeddings) > 1:
            norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
            normalized = embeddings / np.maximum(norms, 1e-9)
            sim_matrix = normalized @ normalized.T
            # Average of upper triangle (excluding diagonal)
            internal_sim = np.mean(sim_matrix[np.triu_indices(len(sim_matrix), k=1)])
        else:
            internal_sim = 1.0

        return CanonicalEpisode(
            canonical_id=uuid4(),
            canonical_name=canonical_episode.name,
            canonical_description=canonical_episode.description,
            canonical_steps=canonical_episode.step_summaries,
            variant_names=[ep.name for ep in episodes if ep != canonical_episode],
            variant_descriptions=[
                ep.description for ep in episodes if ep != canonical_episode
            ],
            source_recordings=list(set(ep.recording_id for ep in episodes)),
            source_episode_ids=[ep.episode_id for ep in episodes],
            occurrence_count=len(episodes),
            embedding=embeddings[canonical_idx].tolist(),
            cluster_id=cluster_id,
            cluster_centroid_distance=float(
                np.linalg.norm(embeddings[canonical_idx] - embeddings.mean(axis=0))
            ),
            internal_similarity=float(internal_sim),
        )

    def find_similar(
        self,
        episode: Episode,
        library: EpisodeLibrary,
        top_k: int = 5,
    ) -> list[tuple[CanonicalEpisode, float]]:
        """Find similar workflows in an existing library.

        Args:
            episode: Episode to find matches for.
            library: Existing workflow library.
            top_k: Number of results to return.

        Returns:
            List of (canonical_episode, similarity_score) tuples.
        """
        if not library.episodes:
            return []

        # Get embedding for query episode
        query_embedding = self.embed_episode(episode)
        query_norm = query_embedding / np.linalg.norm(query_embedding)

        # Get embeddings for library
        results = []
        for canonical in library.episodes:
            if canonical.embedding:
                lib_embedding = np.array(canonical.embedding, dtype=np.float32)
                lib_norm = lib_embedding / np.linalg.norm(lib_embedding)
                similarity = float(np.dot(query_norm, lib_norm))
                results.append((canonical, similarity))

        # Sort by similarity (descending)
        results.sort(key=lambda x: x[1], reverse=True)
        return results[:top_k]

    def add_to_library(
        self,
        episode: Episode,
        library: EpisodeLibrary,
    ) -> tuple[EpisodeLibrary, Optional[CanonicalEpisode]]:
        """Add an episode to an existing library.

        Either merges with existing workflow or creates new one.

        Args:
            episode: New episode to add.
            library: Existing library.

        Returns:
            Tuple of (updated_library, matched_canonical or None if new).
        """
        similar = self.find_similar(episode, library, top_k=1)

        if similar and similar[0][1] >= self.threshold:
            # Merge with existing
            matched_canonical = similar[0][0]

            # Update the canonical episode
            for can in library.episodes:
                if can.canonical_id == matched_canonical.canonical_id:
                    can.variant_names.append(episode.name)
                    can.variant_descriptions.append(episode.description)
                    can.source_recordings.append(episode.recording_id)
                    can.source_episode_ids.append(episode.episode_id)
                    can.occurrence_count += 1
                    break

            library.total_episodes_extracted += 1
            library.deduplication_ratio = 1 - (
                library.unique_episode_count / library.total_episodes_extracted
            )

            return library, matched_canonical

        else:
            # Create new canonical episode
            embedding = self.embed_episode(episode)
            new_canonical = CanonicalEpisode(
                canonical_id=uuid4(),
                canonical_name=episode.name,
                canonical_description=episode.description,
                canonical_steps=episode.step_summaries,
                variant_names=[],
                variant_descriptions=[],
                source_recordings=[episode.recording_id],
                source_episode_ids=[episode.episode_id],
                occurrence_count=1,
                embedding=embedding.tolist(),
                cluster_id=len(library.episodes),
                cluster_centroid_distance=0.0,
                internal_similarity=1.0,
            )

            library.episodes.append(new_canonical)
            library.total_episodes_extracted += 1
            library.unique_episode_count += 1
            library.deduplication_ratio = 1 - (
                library.unique_episode_count / library.total_episodes_extracted
            )

            return library, None

    def save_embeddings(
        self,
        path: Union[str, Path],
        embeddings: NDArray[np.float32],
        episodes: list[Episode],
    ) -> None:
        """Save embeddings and metadata for later reuse.

        Args:
            path: Output file path (will create .npy and .json).
            embeddings: Embedding matrix.
            episodes: Original episodes for metadata.
        """
        path = Path(path)
        path.parent.mkdir(parents=True, exist_ok=True)

        # Save embeddings
        np.save(str(path.with_suffix(".npy")), embeddings)

        # Save metadata
        metadata = [
            {
                "episode_id": str(ep.episode_id),
                "name": ep.name,
                "recording_id": ep.recording_id,
            }
            for ep in episodes
        ]
        path.with_suffix(".json").write_text(json.dumps(metadata, indent=2))

    def load_embeddings(
        self,
        path: Union[str, Path],
    ) -> tuple[NDArray[np.float32], list[dict]]:
        """Load previously saved embeddings.

        Args:
            path: Path to saved embeddings.

        Returns:
            Tuple of (embeddings, episode_metadata).
        """
        path = Path(path)
        embeddings = np.load(str(path.with_suffix(".npy")))
        metadata = json.loads(path.with_suffix(".json").read_text())
        return embeddings, metadata
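
A minimal usage sketch of the new deduplicator (not part of the diff): it assumes extraction_results is a list of EpisodeExtractionResult objects produced by Stage 2 of the segmentation pipeline and new_episode is an Episode from a later recording; both names are placeholders.

from openadapt_ml.segmentation.deduplicator import WorkflowDeduplicator

# Cosine-similarity threshold of 0.85; use_local_embeddings embeds with
# intfloat/e5-large-v2 via transformers instead of calling the OpenAI API.
dedup = WorkflowDeduplicator(threshold=0.85, use_local_embeddings=True)

# Batch deduplication across recordings (Stage 3): embed, cluster, merge.
library = dedup.deduplicate(extraction_results)
print(
    f"{library.unique_episode_count} unique workflows from "
    f"{library.total_episodes_extracted} extracted "
    f"({library.deduplication_ratio:.1%} deduplicated)"
)

# Incremental update: merge a single new episode into the existing library,
# or register it as a new canonical workflow if nothing is similar enough.
library, matched = dedup.add_to_library(new_episode, library)
if matched is not None:
    print(f"Merged into existing workflow: {matched.canonical_name}")
else:
    print("Added as a new canonical workflow")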