robotframework-testselection 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- TestSelection/__init__.py +3 -0
- TestSelection/cli.py +256 -0
- TestSelection/embedding/__init__.py +1 -0
- TestSelection/embedding/embedder.py +43 -0
- TestSelection/embedding/models.py +198 -0
- TestSelection/embedding/ports.py +24 -0
- TestSelection/execution/__init__.py +1 -0
- TestSelection/execution/listener.py +44 -0
- TestSelection/execution/prerun_modifier.py +43 -0
- TestSelection/execution/runner.py +75 -0
- TestSelection/parsing/__init__.py +1 -0
- TestSelection/parsing/datadriver_reader.py +54 -0
- TestSelection/parsing/keyword_resolver.py +51 -0
- TestSelection/parsing/suite_collector.py +85 -0
- TestSelection/parsing/text_builder.py +79 -0
- TestSelection/pipeline/__init__.py +1 -0
- TestSelection/pipeline/artifacts.py +110 -0
- TestSelection/pipeline/cache.py +74 -0
- TestSelection/pipeline/errors.py +18 -0
- TestSelection/pipeline/execute.py +52 -0
- TestSelection/pipeline/select.py +183 -0
- TestSelection/pipeline/vectorize.py +190 -0
- TestSelection/py.typed +0 -0
- TestSelection/selection/__init__.py +25 -0
- TestSelection/selection/dpp.py +31 -0
- TestSelection/selection/facility.py +25 -0
- TestSelection/selection/filtering.py +21 -0
- TestSelection/selection/fps.py +67 -0
- TestSelection/selection/kmedoids.py +32 -0
- TestSelection/selection/registry.py +70 -0
- TestSelection/selection/strategy.py +142 -0
- TestSelection/shared/__init__.py +1 -0
- TestSelection/shared/config.py +31 -0
- TestSelection/shared/types.py +117 -0
- robotframework_testselection-0.1.0.dist-info/METADATA +408 -0
- robotframework_testselection-0.1.0.dist-info/RECORD +39 -0
- robotframework_testselection-0.1.0.dist-info/WHEEL +4 -0
- robotframework_testselection-0.1.0.dist-info/entry_points.txt +2 -0
- robotframework_testselection-0.1.0.dist-info/licenses/LICENSE +191 -0
|
@@ -0,0 +1,183 @@
|
|
|
1
|
+
"""Stage 2 orchestrator: load artifacts, filter, select, and output."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import logging
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
import numpy as np
|
|
8
|
+
|
|
9
|
+
from TestSelection.pipeline.artifacts import ArtifactManager
|
|
10
|
+
from TestSelection.pipeline.errors import SelectionError
|
|
11
|
+
from TestSelection.selection.filtering import filter_by_tags
|
|
12
|
+
from TestSelection.selection.registry import default_registry
|
|
13
|
+
from TestSelection.selection.strategy import (
|
|
14
|
+
DiversityMetrics,
|
|
15
|
+
SelectedTest,
|
|
16
|
+
SelectionResult,
|
|
17
|
+
TagFilter,
|
|
18
|
+
)
|
|
19
|
+
|
|
20
|
+
logger = logging.getLogger(__name__)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def run_select(
    artifact_dir: Path,
    k: int,
    strategy: str = "fps",
    output_file: Path | None = None,
    include_tags: list[str] | None = None,
    exclude_tags: list[str] | None = None,
    seed: int = 42,
    include_datadriver: bool = True,
) -> SelectionResult:
    """Run the selection stage.

    Validates and loads the artifacts in ``artifact_dir``, filters the
    manifest entries by tag, runs the named selection strategy on the
    remaining vectors, computes diversity metrics for the chosen subset
    and writes the result as JSON.

    Args:
        artifact_dir: Directory handed to ``ArtifactManager``.
        k: Requested number of tests; must be at least 1. It is capped at
            the number of tests that survive tag filtering.
        strategy: Name looked up in ``default_registry``.
        output_file: Destination for the JSON result; falls back to
            ``manager.selection_path`` when not given.
        include_tags: Tags passed to the tag filter (lower-cased first).
        exclude_tags: Tags passed to the tag filter (lower-cased first).
        seed: Seed forwarded to the selection strategy.
        include_datadriver: Forwarded to the tag filter.

    Returns:
        The ``SelectionResult`` that was written to disk.

    Raises:
        SelectionError: On invalid ``k``, failed artifact validation, an
            empty filter result, an unknown strategy, or any other error
            raised while selecting.
    """
    try:
        # A non-positive k would otherwise reach the strategy, which may
        # still return a test, leaving the reported k and the selection
        # out of sync.
        if k < 1:
            raise SelectionError(f"k must be a positive integer, got {k}")

        manager = ArtifactManager(artifact_dir)

        # Validate artifacts
        valid, message = manager.validate_artifacts()
        if not valid:
            raise SelectionError(f"Artifact validation failed: {message}")

        manifest = manager.load_manifest()
        vectors = manager.load_vectors()

        logger.info(
            "[DIVERSE-SELECT] stage=select event=artifacts_loaded "
            "tests=%d dim=%d",
            manifest.test_count,
            manifest.embedding_dim,
        )

        # Build tag filter
        tag_filter = TagFilter(
            include_tags=frozenset(
                t.lower() for t in (include_tags or [])
            ),
            exclude_tags=frozenset(
                t.lower() for t in (exclude_tags or [])
            ),
            include_datadriver=include_datadriver,
        )

        # Filter entries
        filtered_indices = filter_by_tags(manifest.tests, tag_filter)
        filtered_count = len(filtered_indices)

        if filtered_count == 0:
            raise SelectionError(
                "No tests remain after tag filtering"
            )

        filtered_vectors = vectors[filtered_indices]
        actual_k = min(k, filtered_count)

        logger.info(
            "[DIVERSE-SELECT] stage=select event=filter_complete "
            "total=%d filtered=%d k=%d",
            manifest.test_count,
            filtered_count,
            actual_k,
        )

        # Get strategy and run selection
        algo = default_registry.get(strategy)
        selected_indices = algo.select(filtered_vectors, actual_k, seed=seed)

        # Map back to manifest entries: strategy indices refer to the
        # filtered vectors, so translate them through filtered_indices.
        selected_entries = []
        for idx in selected_indices:
            original_idx = filtered_indices[idx]
            entry = manifest.tests[original_idx]
            selected_entries.append(
                SelectedTest(
                    name=entry.name,
                    id=entry.id,
                    suite=entry.suite,
                    is_datadriver=entry.is_datadriver,
                )
            )

        # Compute diversity metrics
        selected_vectors = filtered_vectors[selected_indices]
        metrics = _compute_diversity_metrics(
            selected_vectors, selected_entries, manifest
        )

        result = SelectionResult(
            strategy=strategy,
            k=actual_k,
            seed=seed,
            total_tests=manifest.test_count,
            filtered_tests=filtered_count,
            selected=tuple(selected_entries),
            diversity_metrics=metrics,
        )

        # Write output
        out_path = output_file or manager.selection_path
        result.to_json(out_path)

        logger.info(
            "[DIVERSE-SELECT] stage=select event=complete "
            "strategy=%s k=%d avg_dist=%.4f min_dist=%.4f "
            "suite_coverage=%d/%d",
            strategy,
            actual_k,
            metrics.avg_pairwise_distance,
            metrics.min_pairwise_distance,
            metrics.suite_coverage,
            metrics.suite_total,
        )

        return result

    except SelectionError:
        raise
    except Exception as exc:
        # str(KeyError("msg")) yields "'msg'" (the repr of the argument),
        # which would wrap the registry's "Unknown strategy" message in
        # quotes; use the raw argument instead.
        if isinstance(exc, KeyError) and len(exc.args) == 1:
            detail = str(exc.args[0])
        else:
            detail = str(exc)
        logger.warning(
            "[DIVERSE-SELECT] stage=select event=error error=%s",
            detail,
        )
        raise SelectionError(detail) from exc
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
def _compute_diversity_metrics(
    selected_vectors: np.ndarray,
    selected_entries: list[SelectedTest],
    manifest,
) -> DiversityMetrics:
    """Summarise how spread out the selected tests are.

    Reports the mean and minimum cosine distance over all unordered pairs
    of selected vectors (both 0.0 when fewer than two vectors are given),
    together with the number of distinct suites among the selected entries
    and among all manifest tests.
    """
    from sklearn.metrics.pairwise import cosine_distances

    if selected_vectors.shape[0] < 2:
        # No pair exists, so both distance statistics default to zero.
        mean_distance = 0.0
        smallest_distance = 0.0
    else:
        distance_matrix = cosine_distances(selected_vectors, selected_vectors)
        # Strict upper triangle: each unordered pair exactly once.
        rows, cols = np.triu_indices(distance_matrix.shape[0], k=1)
        pair_distances = distance_matrix[rows, cols]
        mean_distance = float(np.mean(pair_distances))
        smallest_distance = float(np.min(pair_distances))

    suites_in_manifest = {item.suite for item in manifest.tests}
    suites_in_selection = {item.suite for item in selected_entries}

    return DiversityMetrics(
        avg_pairwise_distance=mean_distance,
        min_pairwise_distance=smallest_distance,
        suite_coverage=len(suites_in_selection),
        suite_total=len(suites_in_manifest),
    )
|
|
@@ -0,0 +1,190 @@
|
|
|
1
|
+
"""Stage 1 orchestrator: parse, embed, and store test vectors."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import logging
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
from TestSelection.pipeline.artifacts import ArtifactManager
|
|
8
|
+
from TestSelection.pipeline.cache import CacheInvalidator
|
|
9
|
+
from TestSelection.pipeline.errors import VectorizationError
|
|
10
|
+
|
|
11
|
+
logger = logging.getLogger(__name__)
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def run_vectorize(
    suite_path: Path,
    artifact_dir: Path,
    model_name: str = "all-MiniLM-L6-v2",
    resolve_depth: int = 0,
    force: bool = False,
    datadriver_csvs: list[Path] | None = None,
) -> bool:
    """Run the vectorization stage.

    Parses the suite at ``suite_path``, builds a text representation per
    test, optionally appends tests read from DataDriver CSV files, encodes
    all texts with the embedding model and stores the resulting artifacts
    in ``artifact_dir``.

    Args:
        suite_path: Suite location passed to the parser and the cache.
        artifact_dir: Directory that receives the artifacts.
        model_name: Name handed to ``SentenceTransformerAdapter``.
        resolve_depth: Passed to ``TextBuilderConfig`` and stored with the
            artifacts.
        force: When true, skip the cache check and always re-index.
        datadriver_csvs: Optional CSV files read via ``read_datadriver_csv``.

    Returns:
        True if indexing was performed, False if skipped (cache hit).

    Raises:
        VectorizationError: If no tests are found, the embedding adapter
            cannot be imported, or any other error occurs.
    """
    try:
        manager = ArtifactManager(artifact_dir)
        cache = CacheInvalidator(manager.hash_store_path)

        # Cache hit: nothing changed and artifacts already exist.
        if (
            not force
            and not cache.has_changes(suite_path)
            and manager.has_embedding_artifacts()
        ):
            logger.info(
                "[DIVERSE-SELECT] stage=vectorize event=skipped "
                "reason=no_changes"
            )
            return False

        # Parse suite
        from TestSelection.parsing.suite_collector import RobotApiAdapter

        adapter = RobotApiAdapter()
        raw_tests, kw_map = adapter.parse_suite(suite_path)
        logger.info(
            "[DIVERSE-SELECT] stage=vectorize event=parse_complete "
            "tests_found=%d",
            len(raw_tests),
        )

        # Build text representations
        from TestSelection.parsing.keyword_resolver import (
            KeywordTreeResolver,
        )
        from TestSelection.parsing.text_builder import (
            TextRepresentationBuilder,
        )
        from TestSelection.shared.config import TextBuilderConfig
        from TestSelection.shared.types import (
            SuitePath,
            Tag,
            TestCaseId,
            TestCaseRecord,
        )

        resolver = KeywordTreeResolver(kw_map)
        config = TextBuilderConfig(resolve_depth=resolve_depth)
        builder = TextRepresentationBuilder(resolver, config)

        records: list[TestCaseRecord] = []
        for test_dict in raw_tests:
            tags = frozenset(Tag(value=t) for t in test_dict.get("tags", []))
            text_rep = builder.build(
                test_name=test_dict["name"],
                tags=tags,
                body_items=test_dict.get("body", []),
            )
            test_id = TestCaseId.from_source_and_name(
                test_dict["source"], test_dict["name"]
            )
            records.append(
                TestCaseRecord(
                    test_id=test_id,
                    name=test_dict["name"],
                    tags=tags,
                    suite_source=SuitePath(Path(test_dict["source"])),
                    suite_name=test_dict.get("suite_name", ""),
                    text_representation=text_rep,
                )
            )

        # Handle DataDriver CSVs
        if datadriver_csvs:
            from TestSelection.parsing.datadriver_reader import (
                read_datadriver_csv,
            )
            from TestSelection.shared.types import TextRepresentation

            for csv_path in datadriver_csvs:
                dd_tests = read_datadriver_csv(csv_path, template_name="Template")
                for dd in dd_tests:
                    test_id = TestCaseId.from_source_and_name(
                        dd["source"], dd["name"]
                    )
                    records.append(
                        TestCaseRecord(
                            test_id=test_id,
                            name=dd["name"],
                            tags=frozenset(),
                            suite_source=SuitePath(Path(dd["source"])),
                            suite_name="DataDriver",
                            text_representation=TextRepresentation(
                                text=dd["description"]
                            ),
                            is_datadriver=True,
                        )
                    )

        if not records:
            raise VectorizationError("No test cases found to vectorize")

        # Encode via embedding model
        try:
            from TestSelection.embedding.embedder import (
                SentenceTransformerAdapter,
            )
        except ImportError as exc:
            # The hint names the distribution this package ships as.
            raise VectorizationError(
                "sentence-transformers is required for vectorization. "
                "Install with: pip install "
                "robotframework-testselection[vectorize]"
            ) from exc

        model = SentenceTransformerAdapter(model_name)
        texts = [r.text_representation.text for r in records]
        vectors = model.encode(texts)

        logger.info(
            "[DIVERSE-SELECT] stage=vectorize event=embed_complete "
            "model=%s dim=%d tests=%d",
            model.model_name,
            model.embedding_dim,
            len(texts),
        )

        # Build EmbeddingMatrix and save artifacts
        from TestSelection.embedding.models import (
            EmbeddingMatrix,
            ManifestEntry,
        )

        test_ids = tuple(r.test_id.value for r in records)
        matrix = EmbeddingMatrix(
            model_name=model.model_name,
            embedding_dim=model.embedding_dim,
            vectors=vectors,
            test_ids=test_ids,
        )

        manifest_entries = tuple(
            ManifestEntry(
                id=r.test_id.value,
                name=r.name,
                tags=tuple(t.value for t in r.tags),
                suite=str(r.suite_source.value),
                suite_name=r.suite_name,
                is_datadriver=r.is_datadriver,
            )
            for r in records
        )

        matrix.to_artifact(artifact_dir, manifest_entries, resolve_depth)
        # Hashes are saved only after the artifacts were written.
        cache.save_hashes(suite_path)

        logger.info(
            "[DIVERSE-SELECT] stage=vectorize event=complete "
            "tests_indexed=%d",
            len(records),
        )
        return True

    except VectorizationError:
        raise
    except Exception as exc:
        logger.warning(
            "[DIVERSE-SELECT] stage=vectorize event=error error=%s",
            str(exc),
        )
        raise VectorizationError(str(exc)) from exc
|
TestSelection/py.typed
ADDED
|
File without changes
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
"""Selection bounded context: diversity-based test subset selection algorithms."""
|
|
2
|
+
|
|
3
|
+
from TestSelection.selection.filtering import filter_by_tags
|
|
4
|
+
from TestSelection.selection.fps import FarthestPointSampling, FPSMultiStart
|
|
5
|
+
from TestSelection.selection.registry import StrategyRegistry, default_registry
|
|
6
|
+
from TestSelection.selection.strategy import (
|
|
7
|
+
DiversityMetrics,
|
|
8
|
+
SelectedTest,
|
|
9
|
+
SelectionResult,
|
|
10
|
+
SelectionStrategy,
|
|
11
|
+
TagFilter,
|
|
12
|
+
)
|
|
13
|
+
|
|
14
|
+
__all__ = [
|
|
15
|
+
"DiversityMetrics",
|
|
16
|
+
"FPSMultiStart",
|
|
17
|
+
"FarthestPointSampling",
|
|
18
|
+
"SelectedTest",
|
|
19
|
+
"SelectionResult",
|
|
20
|
+
"SelectionStrategy",
|
|
21
|
+
"StrategyRegistry",
|
|
22
|
+
"TagFilter",
|
|
23
|
+
"default_registry",
|
|
24
|
+
"filter_by_tags",
|
|
25
|
+
]
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
"""Determinantal Point Process selection strategy (requires dppy)."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import numpy as np
|
|
5
|
+
from numpy.typing import NDArray
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class DPPSelection:
    """Determinantal Point Process for probabilistic diverse sampling.

    Produces genuinely random diverse subsets.
    Useful for nightly CI runs that collectively cover more ground.
    Requires: pip install dppy
    """

    name = "dpp"

    def select(
        self, vectors: NDArray[np.float32], k: int, seed: int = 42
    ) -> list[int]:
        """Sample ``k`` row indices of ``vectors`` from a k-DPP.

        The likelihood kernel is the symmetrised Gram matrix of the
        L2-normalised vectors.

        Args:
            vectors: One embedding per row.
            k: Requested subset size; capped at the number of rows.
            seed: Seed from which the sampling seed is derived.

        Returns:
            The sampled row indices as plain ints; empty when ``vectors``
            has no rows or ``k`` is not positive.
        """
        k = min(k, vectors.shape[0])
        if k <= 0:
            return []

        from dppy.finite_dpps import FiniteDPP
        from sklearn.preprocessing import normalize

        rng = np.random.RandomState(seed)
        # The sampler is seeded through NumPy's global RNG; save the
        # caller's state so it can be restored afterwards.
        saved_state = np.random.get_state()
        np.random.seed(rng.randint(2**31))
        try:
            x_norm = normalize(vectors, norm="l2")
            kernel = x_norm @ x_norm.T
            # Average with the transpose to remove floating-point asymmetry.
            kernel = (kernel + kernel.T) / 2
            dpp = FiniteDPP("likelihood", **{"L": kernel})
            dpp.sample_exact_k_dpp(size=k)
        finally:
            np.random.set_state(saved_state)
        return [int(i) for i in dpp.list_of_samples[-1]]
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
"""Facility location selection strategy (requires apricot-select)."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import numpy as np
|
|
5
|
+
from numpy.typing import NDArray
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class FacilityLocationSelection:
    """Submodular facility location for representative selection.

    (1-1/e) ~ 0.632 approximation guarantee.
    Ensures no cluster goes unrepresented.
    Requires: pip install apricot-select
    """

    name = "facility"

    def select(
        self, vectors: NDArray[np.float32], k: int, seed: int = 42
    ) -> list[int]:
        """Pick ``k`` row indices of ``vectors`` via apricot.

        Args:
            vectors: One embedding per row.
            k: Requested subset size; capped at the number of rows.
            seed: Accepted for interface parity with the other strategies;
                not used by this implementation.

        Returns:
            The selector's ranking as plain ints; empty when ``vectors``
            has no rows or ``k`` is not positive.
        """
        k = min(k, vectors.shape[0])
        if k <= 0:
            return []

        from apricot import FacilityLocationSelection as ApricotFL

        selector = ApricotFL(k, metric="cosine", verbose=False)
        selector.fit(vectors)
        return [int(i) for i in selector.ranking]
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
"""Tag-based filtering for test manifest entries."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
from typing import TYPE_CHECKING, Any
|
|
5
|
+
|
|
6
|
+
from TestSelection.selection.strategy import TagFilter
|
|
7
|
+
|
|
8
|
+
if TYPE_CHECKING:
|
|
9
|
+
from collections.abc import Sequence
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def filter_by_tags(
    manifest_entries: Sequence[Any],
    tag_filter: TagFilter,
) -> list[int]:
    """Collect the positions of the manifest entries accepted by the filter.

    Each entry's tags are handed to ``tag_filter.matches`` as a frozenset,
    together with the entry's ``is_datadriver`` flag; positions are
    returned in their original order.
    """
    accepted: list[int] = []
    for position, candidate in enumerate(manifest_entries):
        tag_set = frozenset(candidate.tags)
        if tag_filter.matches(tag_set, candidate.is_datadriver):
            accepted.append(position)
    return accepted
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
"""Farthest Point Sampling selection strategies."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import numpy as np
|
|
5
|
+
from numpy.typing import NDArray
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class FarthestPointSampling:
    """Greedy farthest-first traversal for maximum dispersion.

    2-approximation guarantee for max-min dispersion (Gonzalez, 1985).
    Time: O(N * k * d). Deterministic given seed.
    """

    name = "fps"

    def select(
        self, vectors: NDArray[np.float32], k: int, seed: int = 42
    ) -> list[int]:
        """Pick ``k`` row indices of ``vectors`` by farthest-first traversal.

        Starts from a seeded random row and repeatedly adds the row whose
        cosine distance to its nearest already-selected row is largest.

        Args:
            vectors: One embedding per row.
            k: Requested subset size; capped at the number of rows.
            seed: Seed for choosing the starting row.

        Returns:
            Selected row indices in pick order; empty when ``vectors`` has
            no rows or ``k`` is not positive.
        """
        n = vectors.shape[0]
        k = min(k, n)
        # Without this guard k == 0 would still return the starting row,
        # and an empty matrix would make rng.randint(0) raise.
        if k <= 0:
            return []

        from sklearn.metrics.pairwise import cosine_distances

        rng = np.random.RandomState(seed)
        initial = int(rng.randint(n))
        selected = [initial]
        min_distances = cosine_distances(vectors[initial : initial + 1], vectors)[0]
        # -inf marks rows already selected so argmax never picks them again.
        min_distances[initial] = -np.inf
        for _ in range(k - 1):
            next_idx = int(np.argmax(min_distances))
            selected.append(next_idx)
            new_dists = cosine_distances(vectors[next_idx : next_idx + 1], vectors)[0]
            min_distances = np.minimum(min_distances, new_dists)
            min_distances[next_idx] = -np.inf
        return selected
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class FPSMultiStart:
    """FPS from multiple starting points, keeps best result.

    'Best' = maximizes minimum pairwise distance in selected set.
    Mitigates initial-point sensitivity.
    """

    name = "fps_multi"

    def __init__(self, n_starts: int = 5) -> None:
        """Store the number of FPS runs to compare.

        Raises:
            ValueError: If ``n_starts`` is smaller than 1; with no run
                there would be no selection to return.
        """
        if n_starts < 1:
            raise ValueError(f"n_starts must be at least 1, got {n_starts}")
        self._n_starts = n_starts

    def select(
        self, vectors: NDArray[np.float32], k: int, seed: int = 42
    ) -> list[int]:
        """Run FPS with seeds ``seed .. seed + n_starts - 1`` and keep the best.

        Args:
            vectors: One embedding per row.
            k: Requested subset size; capped at the number of rows.
            seed: Seed of the first run; each later run adds one.

        Returns:
            The selection with the largest minimum pairwise cosine
            distance (the earliest run wins ties); empty when ``vectors``
            has no rows or ``k`` is not positive.
        """
        k = min(k, vectors.shape[0])
        if k <= 0:
            return []

        from sklearn.metrics.pairwise import cosine_distances

        fps = FarthestPointSampling()
        best_selected: list[int] = []
        best_min_dist: float | None = None
        for i in range(self._n_starts):
            selected = fps.select(vectors, k, seed=seed + i)
            sel_vectors = vectors[selected]
            pairwise = cosine_distances(sel_vectors, sel_vectors)
            # Ignore self-distances when looking for the closest pair.
            np.fill_diagonal(pairwise, np.inf)
            min_dist = float(pairwise.min())
            # The first run is always accepted, so a result exists even if
            # every distance comparison fails (e.g. NaN distances).
            if best_min_dist is None or min_dist > best_min_dist:
                best_min_dist = min_dist
                best_selected = selected
        return best_selected
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
"""k-Medoids selection strategy (requires sklearn-extra)."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import numpy as np
|
|
5
|
+
from numpy.typing import NDArray
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class KMedoidsSelection:
    """Cluster representatives via k-medoids (PAM algorithm).

    Optimizes representativeness, not dispersion.
    Each medoid is a real data point.
    Requires: pip install scikit-learn-extra
    """

    name = "kmedoids"

    def select(
        self, vectors: NDArray[np.float32], k: int, seed: int = 42
    ) -> list[int]:
        """Cluster ``vectors`` into ``k`` groups and return the medoid rows.

        Args:
            vectors: One embedding per row.
            k: Requested number of medoids; capped at the number of rows.
            seed: Passed to ``KMedoids`` as ``random_state``.

        Returns:
            The medoid row indices as plain ints; empty when ``vectors``
            has no rows or ``k`` is not positive.
        """
        k = min(k, vectors.shape[0])
        if k <= 0:
            return []

        from sklearn_extra.cluster import KMedoids

        kmed = KMedoids(
            n_clusters=k,
            metric="cosine",
            method="pam",
            init="k-medoids++",
            random_state=seed,
            max_iter=300,
        )
        kmed.fit(vectors)
        return [int(i) for i in kmed.medoid_indices_]
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
"""Strategy registry for selection algorithms."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
from typing import Any
|
|
5
|
+
|
|
6
|
+
from TestSelection.selection.fps import FarthestPointSampling, FPSMultiStart
|
|
7
|
+
from TestSelection.selection.strategy import SelectionStrategy
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class StrategyRegistry:
    """Lookup table from strategy name to the class implementing it."""

    def __init__(self) -> None:
        self._strategies: dict[str, Any] = {}

    def register(self, strategy_class: Any) -> None:
        """Store ``strategy_class`` under its ``name`` attribute."""
        key = strategy_class.name
        self._strategies[key] = strategy_class

    def get(self, name: str) -> SelectionStrategy:
        """Create a fresh instance of the strategy registered as ``name``.

        Raises:
            KeyError: If no strategy with that name is registered; the
                message lists the registered names.
        """
        factory = self._strategies.get(name)
        if factory is not None:
            return factory()

        known = ", ".join(sorted(self._strategies))
        raise KeyError(f"Unknown strategy {name!r}. " f"Available: {known}")

    def available(self) -> list[str]:
        """List the registered strategy names in sorted order."""
        return sorted(self._strategies.keys())

    def is_available(self, name: str) -> bool:
        """Tell whether ``name`` has been registered."""
        return name in self._strategies
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def _build_default_registry() -> StrategyRegistry:
    """Build the default registry with core + optional strategies.

    The two FPS strategies are always registered. Each optional strategy
    is registered only when the third-party package it imports inside
    ``select`` can be located, so the registry lists only strategies whose
    backing package is installed.
    """
    from importlib import import_module
    from importlib.util import find_spec

    registry = StrategyRegistry()
    registry.register(FarthestPointSampling)
    registry.register(FPSMultiStart)

    # (required package, strategy module, strategy class). The strategy
    # modules import their package lazily, so importing the module alone
    # does not reveal whether the package is present.
    optional_strategies = (
        ("sklearn_extra", "TestSelection.selection.kmedoids", "KMedoidsSelection"),
        ("dppy", "TestSelection.selection.dpp", "DPPSelection"),
        ("apricot", "TestSelection.selection.facility", "FacilityLocationSelection"),
    )
    for dependency, module_name, class_name in optional_strategies:
        try:
            if find_spec(dependency) is None:
                continue
            module = import_module(module_name)
        except (ImportError, ValueError):
            # find_spec raises ValueError for a module without a spec;
            # treat that like a missing package.
            continue
        registry.register(getattr(module, class_name))

    return registry
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
default_registry = _build_default_registry()
|