openadapt-ml 0.1.0__py3-none-any.whl → 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (112)
  1. openadapt_ml/baselines/__init__.py +121 -0
  2. openadapt_ml/baselines/adapter.py +185 -0
  3. openadapt_ml/baselines/cli.py +314 -0
  4. openadapt_ml/baselines/config.py +448 -0
  5. openadapt_ml/baselines/parser.py +922 -0
  6. openadapt_ml/baselines/prompts.py +787 -0
  7. openadapt_ml/benchmarks/__init__.py +13 -107
  8. openadapt_ml/benchmarks/agent.py +297 -374
  9. openadapt_ml/benchmarks/azure.py +62 -24
  10. openadapt_ml/benchmarks/azure_ops_tracker.py +521 -0
  11. openadapt_ml/benchmarks/cli.py +1874 -751
  12. openadapt_ml/benchmarks/trace_export.py +631 -0
  13. openadapt_ml/benchmarks/viewer.py +1236 -0
  14. openadapt_ml/benchmarks/vm_monitor.py +1111 -0
  15. openadapt_ml/benchmarks/waa_deploy/Dockerfile +216 -0
  16. openadapt_ml/benchmarks/waa_deploy/__init__.py +10 -0
  17. openadapt_ml/benchmarks/waa_deploy/api_agent.py +540 -0
  18. openadapt_ml/benchmarks/waa_deploy/start_waa_server.bat +53 -0
  19. openadapt_ml/cloud/azure_inference.py +3 -5
  20. openadapt_ml/cloud/lambda_labs.py +722 -307
  21. openadapt_ml/cloud/local.py +3194 -89
  22. openadapt_ml/cloud/ssh_tunnel.py +595 -0
  23. openadapt_ml/datasets/next_action.py +125 -96
  24. openadapt_ml/evals/grounding.py +32 -9
  25. openadapt_ml/evals/plot_eval_metrics.py +15 -13
  26. openadapt_ml/evals/trajectory_matching.py +120 -57
  27. openadapt_ml/experiments/demo_prompt/__init__.py +19 -0
  28. openadapt_ml/experiments/demo_prompt/format_demo.py +236 -0
  29. openadapt_ml/experiments/demo_prompt/results/experiment_20251231_002125.json +83 -0
  30. openadapt_ml/experiments/demo_prompt/results/experiment_n30_20251231_165958.json +1100 -0
  31. openadapt_ml/experiments/demo_prompt/results/multistep_20251231_025051.json +182 -0
  32. openadapt_ml/experiments/demo_prompt/run_experiment.py +541 -0
  33. openadapt_ml/experiments/representation_shootout/__init__.py +70 -0
  34. openadapt_ml/experiments/representation_shootout/conditions.py +708 -0
  35. openadapt_ml/experiments/representation_shootout/config.py +390 -0
  36. openadapt_ml/experiments/representation_shootout/evaluator.py +659 -0
  37. openadapt_ml/experiments/representation_shootout/runner.py +687 -0
  38. openadapt_ml/experiments/waa_demo/__init__.py +10 -0
  39. openadapt_ml/experiments/waa_demo/demos.py +357 -0
  40. openadapt_ml/experiments/waa_demo/runner.py +732 -0
  41. openadapt_ml/experiments/waa_demo/tasks.py +151 -0
  42. openadapt_ml/export/__init__.py +9 -0
  43. openadapt_ml/export/__main__.py +6 -0
  44. openadapt_ml/export/cli.py +89 -0
  45. openadapt_ml/export/parquet.py +277 -0
  46. openadapt_ml/grounding/detector.py +18 -14
  47. openadapt_ml/ingest/__init__.py +11 -10
  48. openadapt_ml/ingest/capture.py +97 -86
  49. openadapt_ml/ingest/loader.py +120 -69
  50. openadapt_ml/ingest/synthetic.py +344 -193
  51. openadapt_ml/models/api_adapter.py +14 -4
  52. openadapt_ml/models/base_adapter.py +10 -2
  53. openadapt_ml/models/providers/__init__.py +288 -0
  54. openadapt_ml/models/providers/anthropic.py +266 -0
  55. openadapt_ml/models/providers/base.py +299 -0
  56. openadapt_ml/models/providers/google.py +376 -0
  57. openadapt_ml/models/providers/openai.py +342 -0
  58. openadapt_ml/models/qwen_vl.py +46 -19
  59. openadapt_ml/perception/__init__.py +35 -0
  60. openadapt_ml/perception/integration.py +399 -0
  61. openadapt_ml/retrieval/README.md +226 -0
  62. openadapt_ml/retrieval/USAGE.md +391 -0
  63. openadapt_ml/retrieval/__init__.py +91 -0
  64. openadapt_ml/retrieval/demo_retriever.py +843 -0
  65. openadapt_ml/retrieval/embeddings.py +630 -0
  66. openadapt_ml/retrieval/index.py +194 -0
  67. openadapt_ml/retrieval/retriever.py +162 -0
  68. openadapt_ml/runtime/__init__.py +50 -0
  69. openadapt_ml/runtime/policy.py +27 -14
  70. openadapt_ml/runtime/safety_gate.py +471 -0
  71. openadapt_ml/schema/__init__.py +113 -0
  72. openadapt_ml/schema/converters.py +588 -0
  73. openadapt_ml/schema/episode.py +470 -0
  74. openadapt_ml/scripts/capture_screenshots.py +530 -0
  75. openadapt_ml/scripts/compare.py +102 -61
  76. openadapt_ml/scripts/demo_policy.py +4 -1
  77. openadapt_ml/scripts/eval_policy.py +19 -14
  78. openadapt_ml/scripts/make_gif.py +1 -1
  79. openadapt_ml/scripts/prepare_synthetic.py +16 -17
  80. openadapt_ml/scripts/train.py +98 -75
  81. openadapt_ml/segmentation/README.md +920 -0
  82. openadapt_ml/segmentation/__init__.py +97 -0
  83. openadapt_ml/segmentation/adapters/__init__.py +5 -0
  84. openadapt_ml/segmentation/adapters/capture_adapter.py +420 -0
  85. openadapt_ml/segmentation/annotator.py +610 -0
  86. openadapt_ml/segmentation/cache.py +290 -0
  87. openadapt_ml/segmentation/cli.py +674 -0
  88. openadapt_ml/segmentation/deduplicator.py +656 -0
  89. openadapt_ml/segmentation/frame_describer.py +788 -0
  90. openadapt_ml/segmentation/pipeline.py +340 -0
  91. openadapt_ml/segmentation/schemas.py +622 -0
  92. openadapt_ml/segmentation/segment_extractor.py +634 -0
  93. openadapt_ml/training/azure_ops_viewer.py +1097 -0
  94. openadapt_ml/training/benchmark_viewer.py +3255 -19
  95. openadapt_ml/training/shared_ui.py +7 -7
  96. openadapt_ml/training/stub_provider.py +57 -35
  97. openadapt_ml/training/trainer.py +255 -441
  98. openadapt_ml/training/trl_trainer.py +403 -0
  99. openadapt_ml/training/viewer.py +323 -108
  100. openadapt_ml/training/viewer_components.py +180 -0
  101. {openadapt_ml-0.1.0.dist-info → openadapt_ml-0.2.1.dist-info}/METADATA +312 -69
  102. openadapt_ml-0.2.1.dist-info/RECORD +116 -0
  103. openadapt_ml/benchmarks/base.py +0 -366
  104. openadapt_ml/benchmarks/data_collection.py +0 -432
  105. openadapt_ml/benchmarks/runner.py +0 -381
  106. openadapt_ml/benchmarks/waa.py +0 -704
  107. openadapt_ml/schemas/__init__.py +0 -53
  108. openadapt_ml/schemas/sessions.py +0 -122
  109. openadapt_ml/schemas/validation.py +0 -252
  110. openadapt_ml-0.1.0.dist-info/RECORD +0 -55
  111. {openadapt_ml-0.1.0.dist-info → openadapt_ml-0.2.1.dist-info}/WHEEL +0 -0
  112. {openadapt_ml-0.1.0.dist-info → openadapt_ml-0.2.1.dist-info}/licenses/LICENSE +0 -0
openadapt_ml/segmentation/deduplicator.py
@@ -0,0 +1,656 @@
+ """Workflow deduplication using embeddings and clustering.
+
+ This module identifies and merges similar workflows across
+ multiple recordings to create a canonical episode library (Stage 3).
+ """
+
+ import json
+ import logging
+ from pathlib import Path
+ from typing import Optional, Union
+ from uuid import uuid4
+
+ import numpy as np
+ from numpy.typing import NDArray
+
+ from openadapt_ml.segmentation.schemas import (
+     CanonicalEpisode,
+     Episode,
+     EpisodeExtractionResult,
+     EpisodeLibrary,
+ )
+
+ logger = logging.getLogger(__name__)
+
+
+ class OpenAIEmbedder:
+     """OpenAI text embeddings."""
+
+     def __init__(
+         self,
+         model: str = "text-embedding-3-large",
+         api_key: Optional[str] = None,
+     ):
+         self.model = model
+         self._api_key = api_key
+         self._client = None
+
+     def _get_client(self):
+         if self._client is None:
+             import openai
+             from openadapt_ml.config import settings
+
+             api_key = self._api_key or settings.openai_api_key
+             self._client = openai.OpenAI(api_key=api_key)
+         return self._client
+
+     def embed(self, texts: list[str]) -> NDArray[np.float32]:
+         """Generate embeddings for texts."""
+         client = self._get_client()
+         response = client.embeddings.create(
+             model=self.model,
+             input=texts,
+         )
+         embeddings = [r.embedding for r in response.data]
+         return np.array(embeddings, dtype=np.float32)
+
+
+ class LocalEmbedder:
+     """Local HuggingFace embeddings (no API required)."""
+
+     def __init__(
+         self,
+         model: str = "intfloat/e5-large-v2",
+         device: str = "auto",
+     ):
+         self.model_name = model
+         self.device = device
+         self._model = None
+         self._tokenizer = None
+
+     def _load_model(self):
+         if self._model is None:
+             try:
+                 from transformers import AutoModel, AutoTokenizer
+                 import torch
+
+                 self._tokenizer = AutoTokenizer.from_pretrained(self.model_name)
+                 self._model = AutoModel.from_pretrained(self.model_name)
+
+                 if self.device == "auto":
+                     if torch.cuda.is_available():
+                         self._model = self._model.cuda()
+                     elif (
+                         hasattr(torch.backends, "mps")
+                         and torch.backends.mps.is_available()
+                     ):
+                         self._model = self._model.to("mps")
+                 elif self.device != "cpu":
+                     self._model = self._model.to(self.device)
+
+                 self._model.eval()
+             except ImportError:
+                 raise ImportError(
+                     "LocalEmbedder requires transformers and torch. "
+                     "Install with: pip install transformers torch"
+                 )
+
+     def embed(self, texts: list[str]) -> NDArray[np.float32]:
+         """Generate embeddings for texts."""
+         import torch
+
+         self._load_model()
+
+         # Add prefix for e5 models
+         if "e5" in self.model_name.lower():
+             texts = [f"query: {t}" for t in texts]
+
+         inputs = self._tokenizer(
+             texts,
+             padding=True,
+             truncation=True,
+             max_length=512,
+             return_tensors="pt",
+         )
+
+         if next(self._model.parameters()).is_cuda:
+             inputs = {k: v.cuda() for k, v in inputs.items()}
+         elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
+             device = next(self._model.parameters()).device
+             inputs = {k: v.to(device) for k, v in inputs.items()}
+
+         with torch.no_grad():
+             outputs = self._model(**inputs)
+             # Mean pooling
+             attention_mask = inputs["attention_mask"]
+             token_embeddings = outputs.last_hidden_state
+             input_mask_expanded = (
+                 attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
+             )
+             embeddings = torch.sum(
+                 token_embeddings * input_mask_expanded, 1
+             ) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
+
+         return embeddings.cpu().numpy().astype(np.float32)
+
+
+ def episode_to_text(episode: Episode) -> str:
+     """Convert an episode to text for embedding.
+
+     Combines multiple fields for rich semantic representation.
+     """
+     parts = [
+         f"Workflow: {episode.name}",
+         f"Description: {episode.description}",
+         f"Application: {episode.application}",
+         f"Steps: {', '.join(episode.step_summaries)}",
+     ]
+
+     if episode.prerequisites:
+         parts.append(f"Prerequisites: {', '.join(episode.prerequisites)}")
+
+     if episode.outcomes:
+         parts.append(f"Outcomes: {', '.join(episode.outcomes)}")
+
+     return "\n".join(parts)
+
+
+ class WorkflowDeduplicator:
+     """Deduplicates workflow episodes using embedding similarity.
+
+     This class implements Stage 3 of the segmentation pipeline, identifying
+     similar workflows across recordings and merging them into canonical
+     definitions.
+
+     Example:
+         >>> dedup = WorkflowDeduplicator(threshold=0.85)
+         >>> library = dedup.deduplicate(extraction_results)
+         >>> print(f"Found {library.unique_episode_count} unique workflows")
+         >>> print(f"Deduplication ratio: {library.deduplication_ratio:.1%}")
+         Found 15 unique workflows
+         Deduplication ratio: 34.2%
+
+     Attributes:
+         threshold: Similarity threshold for clustering
+         embedding_model: Model used for text embeddings
+         merge_strategy: How to merge similar episodes
+     """
+
+     def __init__(
+         self,
+         threshold: float = 0.85,
+         embedding_model: str = "text-embedding-3-large",
+         embedding_dim: int = 3072,
+         merge_strategy: str = "centroid",
+         min_cluster_size: int = 1,
+         use_local_embeddings: bool = False,
+     ) -> None:
+         """Initialize the deduplicator.
+
+         Args:
+             threshold: Cosine similarity threshold for clustering.
+                 Higher = stricter matching, fewer merges.
+                 Recommended: 0.80-0.90
+             embedding_model: Text embedding model.
+             embedding_dim: Embedding dimension (model-specific).
+             merge_strategy: How to create canonical definition:
+                 - "centroid": Use episode closest to cluster centroid
+                 - "longest": Use longest description
+                 - "first": Use first encountered
+             min_cluster_size: Minimum episodes to form a cluster.
+             use_local_embeddings: Use local HuggingFace model instead of API.
+         """
+         self.threshold = threshold
+         self.embedding_model = embedding_model
+         self.embedding_dim = embedding_dim
+         self.merge_strategy = merge_strategy
+         self.min_cluster_size = min_cluster_size
+         self.use_local_embeddings = use_local_embeddings
+
+         if use_local_embeddings:
+             self._embedder = LocalEmbedder(model="intfloat/e5-large-v2")
+         else:
+             self._embedder = OpenAIEmbedder(model=embedding_model)
+
+     def deduplicate(
+         self,
+         extraction_results: list[EpisodeExtractionResult],
+         existing_library: Optional[EpisodeLibrary] = None,
+     ) -> EpisodeLibrary:
+         """Deduplicate episodes across multiple extraction results.
+
+         Args:
+             extraction_results: List of extraction results from Stage 2.
+             existing_library: Optional existing library to merge with.
+
+         Returns:
+             EpisodeLibrary with deduplicated canonical episodes.
+         """
+         # Collect all episodes
+         all_episodes = []
+         for result in extraction_results:
+             all_episodes.extend(result.episodes)
+
+         # Add episodes from existing library
+         existing_episodes = []
+         if existing_library:
+             for canonical in existing_library.episodes:
+                 # Create synthetic Episode from CanonicalEpisode
+                 for i, (rec_id, seg_id) in enumerate(
+                     zip(canonical.source_recordings, canonical.source_episode_ids)
+                 ):
+                     synthetic = Episode(
+                         episode_id=seg_id,
+                         name=canonical.variant_names[i]
+                         if i < len(canonical.variant_names)
+                         else canonical.canonical_name,
+                         start_time=0,
+                         end_time=0,
+                         start_time_formatted="00:00.0",
+                         end_time_formatted="00:00.0",
+                         description=canonical.variant_descriptions[i]
+                         if i < len(canonical.variant_descriptions)
+                         else canonical.canonical_description,
+                         step_summaries=canonical.canonical_steps,
+                         application="Unknown",
+                         boundary_confidence=1.0,
+                         coherence_score=1.0,
+                         recording_id=rec_id,
+                     )
+                     existing_episodes.append(synthetic)
+         all_episodes.extend(existing_episodes)
+
+         if not all_episodes:
+             return EpisodeLibrary(
+                 episodes=[],
+                 total_recordings_processed=len(extraction_results),
+                 total_episodes_extracted=0,
+                 unique_episode_count=0,
+                 deduplication_ratio=0.0,
+                 similarity_threshold=self.threshold,
+                 embedding_model=self.embedding_model,
+             )
+
+         # Generate embeddings
+         embeddings = self.embed_episodes(all_episodes)
+
+         # Cluster similar episodes
+         clusters = self.cluster_episodes(embeddings, all_episodes)
+
+         # Merge clusters into canonical episodes
+         canonical_episodes = []
+         for cluster_id, indices in enumerate(clusters):
+             cluster_episodes = [all_episodes[i] for i in indices]
+             cluster_embeddings = embeddings[indices]
+
+             canonical = self.merge_cluster(
+                 cluster_episodes, cluster_embeddings, cluster_id
+             )
+             canonical_episodes.append(canonical)
+
+         # Calculate statistics
+         total_extracted = len(all_episodes)
+         unique_count = len(canonical_episodes)
+         dedup_ratio = 1 - (unique_count / total_extracted) if total_extracted > 0 else 0
+
+         return EpisodeLibrary(
+             episodes=canonical_episodes,
+             total_recordings_processed=len(extraction_results),
+             total_episodes_extracted=total_extracted,
+             unique_episode_count=unique_count,
+             deduplication_ratio=dedup_ratio,
+             similarity_threshold=self.threshold,
+             embedding_model=self.embedding_model,
+         )
+
+     def embed_episode(self, episode: Episode) -> NDArray[np.float32]:
+         """Generate embedding for a single workflow episode."""
+         text = episode_to_text(episode)
+         embeddings = self._embedder.embed([text])
+         return embeddings[0]
+
+     def embed_episodes(
+         self,
+         episodes: list[Episode],
+         show_progress: bool = True,
+     ) -> NDArray[np.float32]:
+         """Generate embeddings for multiple episodes.
+
+         Args:
+             episodes: List of episodes to embed.
+             show_progress: Show progress bar.
+
+         Returns:
+             Embedding matrix of shape (n_episodes, embedding_dim).
+         """
+         texts = [episode_to_text(ep) for ep in episodes]
+
+         # Process in batches to avoid API limits
+         batch_size = 100
+         all_embeddings = []
+
+         for i in range(0, len(texts), batch_size):
+             batch = texts[i : i + batch_size]
+             batch_embeddings = self._embedder.embed(batch)
+             all_embeddings.append(batch_embeddings)
+
+             if show_progress:
+                 logger.info(
+                     f"Embedded {min(i + batch_size, len(texts))}/{len(texts)} episodes"
+                 )
+
+         return np.vstack(all_embeddings)
+
+     def compute_similarity_matrix(
+         self,
+         embeddings: NDArray[np.float32],
+     ) -> NDArray[np.float32]:
+         """Compute pairwise cosine similarity matrix.
+
+         Args:
+             embeddings: Embedding matrix of shape (n, embedding_dim).
+
+         Returns:
+             Similarity matrix of shape (n, n) with values in [-1, 1].
+         """
+         # Normalize embeddings
+         norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
+         normalized = embeddings / np.maximum(norms, 1e-9)
+
+         # Compute cosine similarity
+         similarity = normalized @ normalized.T
+         return similarity
+
+     def cluster_episodes(
+         self,
+         embeddings: NDArray[np.float32],
+         episodes: list[Episode],
+     ) -> list[list[int]]:
+         """Cluster similar episodes using agglomerative clustering.
+
+         Args:
+             embeddings: Embedding matrix.
+             episodes: Original episodes (for metadata).
+
+         Returns:
+             List of clusters, each containing episode indices.
+         """
+         try:
+             from sklearn.cluster import AgglomerativeClustering
+         except ImportError:
+             logger.warning("sklearn not available, using simple clustering")
+             return self._simple_cluster(embeddings)
+
+         # Normalize embeddings for cosine similarity
+         norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
+         normalized = embeddings / np.maximum(norms, 1e-9)
+
+         # Compute cosine distances
+         distances = 1 - (normalized @ normalized.T)
+
+         # Cluster
+         distance_threshold = 1 - self.threshold
+         clustering = AgglomerativeClustering(
+             n_clusters=None,
+             distance_threshold=distance_threshold,
+             metric="precomputed",
+             linkage="average",
+         )
+         labels = clustering.fit_predict(distances)
+
+         # Group indices by cluster
+         clusters = {}
+         for idx, label in enumerate(labels):
+             if label not in clusters:
+                 clusters[label] = []
+             clusters[label].append(idx)
+
+         return list(clusters.values())
+
+     def _simple_cluster(self, embeddings: NDArray[np.float32]) -> list[list[int]]:
+         """Simple greedy clustering when sklearn not available."""
+         # Normalize
+         norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
+         normalized = embeddings / np.maximum(norms, 1e-9)
+
+         n = len(embeddings)
+         assigned = [False] * n
+         clusters = []
+
+         for i in range(n):
+             if assigned[i]:
+                 continue
+
+             # Start new cluster
+             cluster = [i]
+             assigned[i] = True
+
+             for j in range(i + 1, n):
+                 if assigned[j]:
+                     continue
+
+                 # Check similarity
+                 sim = np.dot(normalized[i], normalized[j])
+                 if sim >= self.threshold:
+                     cluster.append(j)
+                     assigned[j] = True
+
+             clusters.append(cluster)
+
+         return clusters
+
+     def merge_cluster(
+         self,
+         episodes: list[Episode],
+         embeddings: NDArray[np.float32],
+         cluster_id: int,
+     ) -> CanonicalEpisode:
+         """Merge a cluster of similar episodes into a canonical episode.
+
+         Args:
+             episodes: Episodes in this cluster.
+             embeddings: Embeddings for these episodes.
+             cluster_id: ID for this cluster.
+
+         Returns:
+             CanonicalEpisode representing the merged cluster.
+         """
+         if self.merge_strategy == "centroid":
+             # Find episode closest to cluster centroid
+             centroid = embeddings.mean(axis=0)
+             distances = np.linalg.norm(embeddings - centroid, axis=1)
+             canonical_idx = int(np.argmin(distances))
+
+         elif self.merge_strategy == "longest":
+             # Use episode with longest description
+             lengths = [len(ep.description) for ep in episodes]
+             canonical_idx = int(np.argmax(lengths))
+
+         elif self.merge_strategy == "first":
+             # Use first encountered
+             canonical_idx = 0
+
+         else:
+             raise ValueError(f"Unknown merge strategy: {self.merge_strategy}")
+
+         canonical_episode = episodes[canonical_idx]
+
+         # Compute internal similarity
+         if len(embeddings) > 1:
+             norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
+             normalized = embeddings / np.maximum(norms, 1e-9)
+             sim_matrix = normalized @ normalized.T
+             # Average of upper triangle (excluding diagonal)
+             internal_sim = np.mean(sim_matrix[np.triu_indices(len(sim_matrix), k=1)])
+         else:
+             internal_sim = 1.0
+
+         return CanonicalEpisode(
+             canonical_id=uuid4(),
+             canonical_name=canonical_episode.name,
+             canonical_description=canonical_episode.description,
+             canonical_steps=canonical_episode.step_summaries,
+             variant_names=[ep.name for ep in episodes if ep != canonical_episode],
+             variant_descriptions=[
+                 ep.description for ep in episodes if ep != canonical_episode
+             ],
+             source_recordings=list(set(ep.recording_id for ep in episodes)),
+             source_episode_ids=[ep.episode_id for ep in episodes],
+             occurrence_count=len(episodes),
+             embedding=embeddings[canonical_idx].tolist(),
+             cluster_id=cluster_id,
+             cluster_centroid_distance=float(
+                 np.linalg.norm(embeddings[canonical_idx] - embeddings.mean(axis=0))
+             ),
+             internal_similarity=float(internal_sim),
+         )
+
+     def find_similar(
+         self,
+         episode: Episode,
+         library: EpisodeLibrary,
+         top_k: int = 5,
+     ) -> list[tuple[CanonicalEpisode, float]]:
+         """Find similar workflows in an existing library.
+
+         Args:
+             episode: Episode to find matches for.
+             library: Existing workflow library.
+             top_k: Number of results to return.
+
+         Returns:
+             List of (canonical_episode, similarity_score) tuples.
+         """
+         if not library.episodes:
+             return []
+
+         # Get embedding for query episode
+         query_embedding = self.embed_episode(episode)
+         query_norm = query_embedding / np.linalg.norm(query_embedding)
+
+         # Get embeddings for library
+         results = []
+         for canonical in library.episodes:
+             if canonical.embedding:
+                 lib_embedding = np.array(canonical.embedding, dtype=np.float32)
+                 lib_norm = lib_embedding / np.linalg.norm(lib_embedding)
+                 similarity = float(np.dot(query_norm, lib_norm))
+                 results.append((canonical, similarity))
+
+         # Sort by similarity (descending)
+         results.sort(key=lambda x: x[1], reverse=True)
+         return results[:top_k]
+
+     def add_to_library(
+         self,
+         episode: Episode,
+         library: EpisodeLibrary,
+     ) -> tuple[EpisodeLibrary, Optional[CanonicalEpisode]]:
+         """Add an episode to an existing library.
+
+         Either merges with existing workflow or creates new one.
+
+         Args:
+             episode: New episode to add.
+             library: Existing library.
+
+         Returns:
+             Tuple of (updated_library, matched_canonical or None if new).
+         """
+         similar = self.find_similar(episode, library, top_k=1)
+
+         if similar and similar[0][1] >= self.threshold:
+             # Merge with existing
+             matched_canonical = similar[0][0]
+
+             # Update the canonical episode
+             for can in library.episodes:
+                 if can.canonical_id == matched_canonical.canonical_id:
+                     can.variant_names.append(episode.name)
+                     can.variant_descriptions.append(episode.description)
+                     can.source_recordings.append(episode.recording_id)
+                     can.source_episode_ids.append(episode.episode_id)
+                     can.occurrence_count += 1
+                     break
+
+             library.total_episodes_extracted += 1
+             library.deduplication_ratio = 1 - (
+                 library.unique_episode_count / library.total_episodes_extracted
+             )
+
+             return library, matched_canonical
+
+         else:
+             # Create new canonical episode
+             embedding = self.embed_episode(episode)
+             new_canonical = CanonicalEpisode(
+                 canonical_id=uuid4(),
+                 canonical_name=episode.name,
+                 canonical_description=episode.description,
+                 canonical_steps=episode.step_summaries,
+                 variant_names=[],
+                 variant_descriptions=[],
+                 source_recordings=[episode.recording_id],
+                 source_episode_ids=[episode.episode_id],
+                 occurrence_count=1,
+                 embedding=embedding.tolist(),
+                 cluster_id=len(library.episodes),
+                 cluster_centroid_distance=0.0,
+                 internal_similarity=1.0,
+             )
+
+             library.episodes.append(new_canonical)
+             library.total_episodes_extracted += 1
+             library.unique_episode_count += 1
+             library.deduplication_ratio = 1 - (
+                 library.unique_episode_count / library.total_episodes_extracted
+             )
+
+             return library, None
+
+     def save_embeddings(
+         self,
+         path: Union[str, Path],
+         embeddings: NDArray[np.float32],
+         episodes: list[Episode],
+     ) -> None:
+         """Save embeddings and metadata for later reuse.
+
+         Args:
+             path: Output file path (will create .npy and .json).
+             embeddings: Embedding matrix.
+             episodes: Original episodes for metadata.
+         """
+         path = Path(path)
+         path.parent.mkdir(parents=True, exist_ok=True)
+
+         # Save embeddings
+         np.save(str(path.with_suffix(".npy")), embeddings)
+
+         # Save metadata
+         metadata = [
+             {
+                 "episode_id": str(ep.episode_id),
+                 "name": ep.name,
+                 "recording_id": ep.recording_id,
+             }
+             for ep in episodes
+         ]
+         path.with_suffix(".json").write_text(json.dumps(metadata, indent=2))
+
+     def load_embeddings(
+         self,
+         path: Union[str, Path],
+     ) -> tuple[NDArray[np.float32], list[dict]]:
+         """Load previously saved embeddings.
+
+         Args:
+             path: Path to saved embeddings.
+
+         Returns:
+             Tuple of (embeddings, episode_metadata).
+         """
+         path = Path(path)
+         embeddings = np.load(str(path.with_suffix(".npy")))
+         metadata = json.loads(path.with_suffix(".json").read_text())
+         return embeddings, metadata