tribalmemory-0.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tribalmemory/__init__.py +3 -0
- tribalmemory/a21/__init__.py +38 -0
- tribalmemory/a21/config/__init__.py +20 -0
- tribalmemory/a21/config/providers.py +104 -0
- tribalmemory/a21/config/system.py +184 -0
- tribalmemory/a21/container/__init__.py +8 -0
- tribalmemory/a21/container/container.py +212 -0
- tribalmemory/a21/providers/__init__.py +32 -0
- tribalmemory/a21/providers/base.py +241 -0
- tribalmemory/a21/providers/deduplication.py +99 -0
- tribalmemory/a21/providers/lancedb.py +232 -0
- tribalmemory/a21/providers/memory.py +128 -0
- tribalmemory/a21/providers/mock.py +54 -0
- tribalmemory/a21/providers/openai.py +151 -0
- tribalmemory/a21/providers/timestamp.py +88 -0
- tribalmemory/a21/system.py +293 -0
- tribalmemory/cli.py +298 -0
- tribalmemory/interfaces.py +306 -0
- tribalmemory/mcp/__init__.py +9 -0
- tribalmemory/mcp/__main__.py +6 -0
- tribalmemory/mcp/server.py +484 -0
- tribalmemory/performance/__init__.py +1 -0
- tribalmemory/performance/benchmarks.py +285 -0
- tribalmemory/performance/corpus_generator.py +171 -0
- tribalmemory/portability/__init__.py +1 -0
- tribalmemory/portability/embedding_metadata.py +320 -0
- tribalmemory/server/__init__.py +9 -0
- tribalmemory/server/__main__.py +6 -0
- tribalmemory/server/app.py +187 -0
- tribalmemory/server/config.py +115 -0
- tribalmemory/server/models.py +206 -0
- tribalmemory/server/routes.py +378 -0
- tribalmemory/services/__init__.py +15 -0
- tribalmemory/services/deduplication.py +115 -0
- tribalmemory/services/embeddings.py +273 -0
- tribalmemory/services/import_export.py +506 -0
- tribalmemory/services/memory.py +275 -0
- tribalmemory/services/vector_store.py +360 -0
- tribalmemory/testing/__init__.py +22 -0
- tribalmemory/testing/embedding_utils.py +110 -0
- tribalmemory/testing/fixtures.py +123 -0
- tribalmemory/testing/metrics.py +256 -0
- tribalmemory/testing/mocks.py +560 -0
- tribalmemory/testing/semantic_expansions.py +91 -0
- tribalmemory/utils.py +23 -0
- tribalmemory-0.1.0.dist-info/METADATA +275 -0
- tribalmemory-0.1.0.dist-info/RECORD +51 -0
- tribalmemory-0.1.0.dist-info/WHEEL +5 -0
- tribalmemory-0.1.0.dist-info/entry_points.txt +3 -0
- tribalmemory-0.1.0.dist-info/licenses/LICENSE +190 -0
- tribalmemory-0.1.0.dist-info/top_level.txt +1 -0
tribalmemory/testing/embedding_utils.py
@@ -0,0 +1,110 @@
"""Shared embedding utilities for mock implementations.

Provides consistent, deterministic embedding generation for testing.
"""

import hashlib
import math
import random
import re


def hash_to_embedding(text: str, dimensions: int = 1536) -> list[float]:
    """Convert text to deterministic embedding that preserves semantic similarity.

    Uses word-level hashing so texts with shared words have similar embeddings.
    Suitable for basic mock testing scenarios.

    Args:
        text: Text to embed.
        dimensions: Output embedding dimensions.

    Returns:
        Normalized embedding vector.
    """
    embedding = [0.0] * dimensions

    def add_term(term: str, weight: float = 1.0):
        """Add a term's contribution to the embedding."""
        term_hash = hashlib.sha256(term.encode()).digest()
        random.seed(int.from_bytes(term_hash[:8], 'big'))
        for i in range(dimensions):
            embedding[i] += random.gauss(0, 1) * weight

    # Normalize text
    text_lower = text.lower()

    # Add contribution for each unique word (skip very short words)
    words = set(re.findall(r'\b\w+\b', text_lower))
    for word in words:
        if len(word) > 2:
            add_term(word, 1.0)

    # Add short text as a whole (helps exact match queries)
    if len(text) < 200:
        add_term(text_lower.strip(), 2.0)

    # Normalize to unit vector
    norm = math.sqrt(sum(x * x for x in embedding))
    if norm == 0:
        random.seed(42)
        embedding = [random.gauss(0, 1) for _ in range(dimensions)]
        norm = math.sqrt(sum(x * x for x in embedding))

    return [x / norm for x in embedding]


def hash_to_embedding_extended(text: str, dimensions: int = 1536) -> list[float]:
    """Convert text to deterministic embedding with sliding window support.

    Extended version that uses sliding windows to catch substring matches.
    Better for tests that need substring similarity detection.

    Args:
        text: Text to embed.
        dimensions: Output embedding dimensions.

    Returns:
        Normalized embedding vector.
    """
    embedding = [0.0] * dimensions

    def add_term(term: str, weight: float = 1.0):
        """Add a term's contribution to the embedding with given weight."""
        term_hash = hashlib.sha256(term.encode()).digest()
        random.seed(int.from_bytes(term_hash[:8], 'big'))
        for i in range(dimensions):
            embedding[i] += random.gauss(0, 1) * weight

    # Normalize text
    text_lower = text.lower()

    # Add contribution for each unique word
    words = set(re.findall(r'\b\w+\b', text_lower))
    for word in words:
        if len(word) > 2:  # Skip very short words
            add_term(word, 1.0)

    # Add sliding windows of characters (catches substrings)
    # Use windows of size 20, 40, 80 characters
    for window_size in [20, 40, 80]:
        seen = set()
        for i in range(0, len(text_lower) - window_size + 1, window_size // 2):
            chunk = text_lower[i:i + window_size].strip()
            if chunk and chunk not in seen:
                seen.add(chunk)
                add_term(chunk, 2.0)

    # For short texts (likely queries), add the whole text as a term
    # This ensures short exact matches get high similarity
    if len(text) < 200:
        add_term(text_lower.strip(), 5.0)

    # Normalize to unit vector
    norm = math.sqrt(sum(x * x for x in embedding))
    if norm == 0:
        random.seed(42)
        embedding = [random.gauss(0, 1) for _ in range(dimensions)]
        norm = math.sqrt(sum(x * x for x in embedding))

    return [x / norm for x in embedding]
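As a quick, illustrative sketch of the two helpers above (not part of the wheel; the import path is taken from the file layout, and the sample sentences are invented), the same text always produces the same unit vector, and texts that share words land measurably closer together than unrelated ones. hash_to_embedding_extended behaves the same way, with sliding-window terms added on top.

# Illustrative only; not shipped in the wheel.
from tribalmemory.testing.embedding_utils import hash_to_embedding

def cosine(a: list[float], b: list[float]) -> float:
    # hash_to_embedding returns unit vectors, so the dot product is the cosine.
    return sum(x * y for x, y in zip(a, b))

v1 = hash_to_embedding("the user prefers dark mode")
v2 = hash_to_embedding("the user prefers dark mode")
v3 = hash_to_embedding("dark mode is preferred by the user")
v4 = hash_to_embedding("completely unrelated sentence about databases")

assert v1 == v2                          # same text -> identical vector
assert cosine(v1, v3) > cosine(v1, v4)   # shared words -> higher similarity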
tribalmemory/testing/fixtures.py
@@ -0,0 +1,123 @@
"""Test data fixtures and loaders."""

import json
from dataclasses import dataclass, field
from pathlib import Path
from typing import Optional


@dataclass
class PreferenceTestCase:
    """A preference-based test case."""
    id: str
    stored_preference: str
    query: str
    expected_keywords: list[str]  # Response should contain these
    context: Optional[str] = None
    tags: list[str] = field(default_factory=list)
    negative: bool = False  # If True, this is a negative test case


@dataclass
class ContextTaskTestCase:
    """A context-dependent task test case."""
    id: str
    memories: list[str]  # Memories to store first
    query: str
    expected_keywords: list[str]
    min_memories_referenced: int = 1


@dataclass
class ConsistencyTestCase:
    """A cross-session consistency test case."""
    id: str
    seed_memories: list[str]
    query_variations: list[str]  # Different ways to ask same thing
    expected_consistent: bool = True


@dataclass
class TestDataSet:
    """Complete test data set."""
    preferences: list[PreferenceTestCase]
    context_tasks: list[ContextTaskTestCase]
    consistency: list[ConsistencyTestCase]


def load_test_data(data_dir: Path) -> TestDataSet:
    """Load test data from JSON files."""
    preferences = []
    context_tasks = []
    consistency = []

    # Load preferences
    pref_file = data_dir / "preferences.json"
    if pref_file.exists():
        with open(pref_file) as f:
            data = json.load(f)
            preferences = [PreferenceTestCase(**p) for p in data]

    # Load context tasks
    ctx_file = data_dir / "context_tasks.json"
    if ctx_file.exists():
        with open(ctx_file) as f:
            data = json.load(f)
            context_tasks = [ContextTaskTestCase(**c) for c in data]

    # Load consistency
    cons_file = data_dir / "consistency.json"
    if cons_file.exists():
        with open(cons_file) as f:
            data = json.load(f)
            consistency = [ConsistencyTestCase(**c) for c in data]

    return TestDataSet(
        preferences=preferences,
        context_tasks=context_tasks,
        consistency=consistency
    )


def save_test_data(dataset: TestDataSet, data_dir: Path):
    """Save test data to JSON files."""
    data_dir.mkdir(parents=True, exist_ok=True)

    # Save preferences
    with open(data_dir / "preferences.json", "w") as f:
        json.dump([
            {
                "id": p.id,
                "stored_preference": p.stored_preference,
                "query": p.query,
                "expected_keywords": p.expected_keywords,
                "context": p.context,
                "tags": p.tags,
            }
            for p in dataset.preferences
        ], f, indent=2)

    # Save context tasks
    with open(data_dir / "context_tasks.json", "w") as f:
        json.dump([
            {
                "id": c.id,
                "memories": c.memories,
                "query": c.query,
                "expected_keywords": c.expected_keywords,
                "min_memories_referenced": c.min_memories_referenced,
            }
            for c in dataset.context_tasks
        ], f, indent=2)

    # Save consistency
    with open(data_dir / "consistency.json", "w") as f:
        json.dump([
            {
                "id": c.id,
                "seed_memories": c.seed_memories,
                "query_variations": c.query_variations,
                "expected_consistent": c.expected_consistent,
            }
            for c in dataset.consistency
        ], f, indent=2)
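To illustrate the round trip through the loaders above (a minimal sketch; the directory name and test-case values are invented, and note that the negative flag is not written back by save_test_data):

# Illustrative only; not shipped in the wheel.
from pathlib import Path
from tribalmemory.testing.fixtures import (
    PreferenceTestCase, TestDataSet, load_test_data, save_test_data,
)

dataset = TestDataSet(
    preferences=[
        PreferenceTestCase(
            id="pref-001",
            stored_preference="I prefer tabs over spaces",
            query="How should code be indented?",
            expected_keywords=["tabs"],
        )
    ],
    context_tasks=[],
    consistency=[],
)

data_dir = Path("test-data")
save_test_data(dataset, data_dir)      # writes preferences.json, context_tasks.json, consistency.json
reloaded = load_test_data(data_dir)    # missing files simply yield empty lists
assert reloaded.preferences[0].id == "pref-001"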
tribalmemory/testing/metrics.py
@@ -0,0 +1,256 @@
"""Metrics collection and analysis for testing."""

import json
import math
import statistics
import time
from dataclasses import dataclass, field
from datetime import datetime
from pathlib import Path
from typing import Optional


@dataclass
class LatencyMeasurement:
    """Single latency measurement."""
    operation: str
    duration_ms: float
    timestamp: datetime = field(default_factory=datetime.utcnow)
    success: bool = True
    metadata: dict = field(default_factory=dict)


class LatencyTracker:
    """Track and analyze operation latencies."""

    def __init__(self):
        self._measurements: list[LatencyMeasurement] = []
        self._start_times: dict[str, float] = {}

    def start(self, operation: str) -> str:
        """Start timing an operation. Returns operation key."""
        key = f"{operation}_{time.perf_counter_ns()}"
        self._start_times[key] = time.perf_counter()
        return key

    def stop(self, key: str, success: bool = True, **metadata) -> float:
        """Stop timing and record measurement. Returns duration in ms."""
        if key not in self._start_times:
            raise ValueError(f"Unknown operation key: {key}")

        duration_ms = (time.perf_counter() - self._start_times[key]) * 1000
        operation = key.rsplit('_', 1)[0]

        self._measurements.append(LatencyMeasurement(
            operation=operation,
            duration_ms=duration_ms,
            success=success,
            metadata=metadata
        ))

        del self._start_times[key]
        return duration_ms

    def record(self, operation: str, duration_ms: float, success: bool = True, **metadata):
        """Record a measurement directly."""
        self._measurements.append(LatencyMeasurement(
            operation=operation,
            duration_ms=duration_ms,
            success=success,
            metadata=metadata
        ))

    def get_stats(self, operation: Optional[str] = None) -> dict:
        """Get statistics for an operation or all operations."""
        measurements = self._measurements
        if operation:
            measurements = [m for m in measurements if m.operation == operation]

        if not measurements:
            return {"count": 0}

        durations = [m.duration_ms for m in measurements]
        successful = [m for m in measurements if m.success]

        sorted_durations = sorted(durations)

        return {
            "count": len(measurements),
            "success_rate": len(successful) / len(measurements),
            "mean_ms": statistics.mean(durations),
            "median_ms": statistics.median(durations),
            "stdev_ms": statistics.stdev(durations) if len(durations) > 1 else 0,
            "min_ms": min(durations),
            "max_ms": max(durations),
            "p95_ms": sorted_durations[int(len(sorted_durations) * 0.95)] if len(sorted_durations) > 1 else sorted_durations[0],
            "p99_ms": sorted_durations[int(len(sorted_durations) * 0.99)] if len(sorted_durations) > 1 else sorted_durations[0],
        }

    def clear(self):
        """Clear all measurements."""
        self._measurements.clear()
        self._start_times.clear()


class SimilarityCalculator:
    """Calculate and compare embedding similarities."""

    @staticmethod
    def cosine_similarity(a: list[float], b: list[float]) -> float:
        """Calculate cosine similarity between two vectors."""
        if len(a) != len(b):
            raise ValueError("Vectors must have same dimension")

        dot = sum(x * y for x, y in zip(a, b))
        norm_a = math.sqrt(sum(x * x for x in a))
        norm_b = math.sqrt(sum(x * x for x in b))

        if norm_a == 0 or norm_b == 0:
            return 0.0

        return dot / (norm_a * norm_b)

    @staticmethod
    def euclidean_distance(a: list[float], b: list[float]) -> float:
        """Calculate Euclidean distance between two vectors."""
        if len(a) != len(b):
            raise ValueError("Vectors must have same dimension")

        return math.sqrt(sum((x - y) ** 2 for x, y in zip(a, b)))

    @staticmethod
    def batch_similarities(
        query: list[float],
        candidates: list[list[float]]
    ) -> list[float]:
        """Calculate similarities between query and multiple candidates."""
        return [
            SimilarityCalculator.cosine_similarity(query, c)
            for c in candidates
        ]


@dataclass
class TestResult:
    """Result of a single test case."""
    test_id: str
    test_name: str
    tier: str
    passed: bool
    score: Optional[float] = None
    threshold: Optional[float] = None
    duration_ms: float = 0
    error: Optional[str] = None
    details: dict = field(default_factory=dict)
    timestamp: datetime = field(default_factory=datetime.utcnow)


class TestResultLogger:
    """Log and persist test results for analysis."""

    def __init__(self, output_dir: Optional[Path] = None):
        self.output_dir = output_dir or Path("test-results")
        self.output_dir.mkdir(parents=True, exist_ok=True)
        self._results: list[TestResult] = []
        self._run_id = datetime.utcnow().strftime("%Y%m%d_%H%M%S")

    def log(self, result: TestResult):
        """Log a test result."""
        self._results.append(result)

    def get_summary(self) -> dict:
        """Get summary of all test results."""
        if not self._results:
            return {"total": 0}

        by_tier = {}
        for r in self._results:
            if r.tier not in by_tier:
                by_tier[r.tier] = {"passed": 0, "failed": 0}
            if r.passed:
                by_tier[r.tier]["passed"] += 1
            else:
                by_tier[r.tier]["failed"] += 1

        passed = sum(1 for r in self._results if r.passed)
        failed = len(self._results) - passed

        return {
            "run_id": self._run_id,
            "total": len(self._results),
            "passed": passed,
            "failed": failed,
            "pass_rate": passed / len(self._results),
            "by_tier": by_tier,
            "failed_tests": [
                {"id": r.test_id, "name": r.test_name, "error": r.error}
                for r in self._results if not r.passed
            ]
        }

    def save(self, filename: Optional[str] = None):
        """Save results to JSON file."""
        filename = filename or f"results_{self._run_id}.json"
        filepath = self.output_dir / filename

        data = {
            "run_id": self._run_id,
            "timestamp": datetime.utcnow().isoformat(),
            "summary": self.get_summary(),
            "results": [
                {
                    "test_id": r.test_id,
                    "test_name": r.test_name,
                    "tier": r.tier,
                    "passed": r.passed,
                    "score": r.score,
                    "threshold": r.threshold,
                    "duration_ms": r.duration_ms,
                    "error": r.error,
                    "details": r.details,
                    "timestamp": r.timestamp.isoformat(),
                }
                for r in self._results
            ]
        }

        with open(filepath, "w") as f:
            json.dump(data, f, indent=2)

        return filepath

    def compare_to_baseline(self, baseline_path: Path) -> dict:
        """Compare current results to a baseline."""
        with open(baseline_path) as f:
            baseline = json.load(f)

        current_by_id = {r.test_id: r for r in self._results}
        baseline_by_id = {r["test_id"]: r for r in baseline["results"]}

        regressions = []
        improvements = []
        new_tests = []

        for test_id, result in current_by_id.items():
            if test_id not in baseline_by_id:
                new_tests.append(test_id)
                continue

            baseline_result = baseline_by_id[test_id]

            if result.passed and not baseline_result["passed"]:
                improvements.append(test_id)
            elif not result.passed and baseline_result["passed"]:
                regressions.append(test_id)

        return {
            "regressions": regressions,
            "improvements": improvements,
            "new_tests": new_tests,
            "baseline_pass_rate": baseline["summary"]["pass_rate"],
            "current_pass_rate": self.get_summary()["pass_rate"],
        }

    def clear(self):
        """Clear all results."""
        self._results.clear()