prela 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- prela/__init__.py +394 -0
- prela/_version.py +3 -0
- prela/contrib/CLI.md +431 -0
- prela/contrib/README.md +118 -0
- prela/contrib/__init__.py +5 -0
- prela/contrib/cli.py +1063 -0
- prela/contrib/explorer.py +571 -0
- prela/core/__init__.py +64 -0
- prela/core/clock.py +98 -0
- prela/core/context.py +228 -0
- prela/core/replay.py +403 -0
- prela/core/sampler.py +178 -0
- prela/core/span.py +295 -0
- prela/core/tracer.py +498 -0
- prela/evals/__init__.py +94 -0
- prela/evals/assertions/README.md +484 -0
- prela/evals/assertions/__init__.py +78 -0
- prela/evals/assertions/base.py +90 -0
- prela/evals/assertions/multi_agent.py +625 -0
- prela/evals/assertions/semantic.py +223 -0
- prela/evals/assertions/structural.py +443 -0
- prela/evals/assertions/tool.py +380 -0
- prela/evals/case.py +370 -0
- prela/evals/n8n/__init__.py +69 -0
- prela/evals/n8n/assertions.py +450 -0
- prela/evals/n8n/runner.py +497 -0
- prela/evals/reporters/README.md +184 -0
- prela/evals/reporters/__init__.py +32 -0
- prela/evals/reporters/console.py +251 -0
- prela/evals/reporters/json.py +176 -0
- prela/evals/reporters/junit.py +278 -0
- prela/evals/runner.py +525 -0
- prela/evals/suite.py +316 -0
- prela/exporters/__init__.py +27 -0
- prela/exporters/base.py +189 -0
- prela/exporters/console.py +443 -0
- prela/exporters/file.py +322 -0
- prela/exporters/http.py +394 -0
- prela/exporters/multi.py +154 -0
- prela/exporters/otlp.py +388 -0
- prela/instrumentation/ANTHROPIC.md +297 -0
- prela/instrumentation/LANGCHAIN.md +480 -0
- prela/instrumentation/OPENAI.md +59 -0
- prela/instrumentation/__init__.py +49 -0
- prela/instrumentation/anthropic.py +1436 -0
- prela/instrumentation/auto.py +129 -0
- prela/instrumentation/base.py +436 -0
- prela/instrumentation/langchain.py +959 -0
- prela/instrumentation/llamaindex.py +719 -0
- prela/instrumentation/multi_agent/__init__.py +48 -0
- prela/instrumentation/multi_agent/autogen.py +357 -0
- prela/instrumentation/multi_agent/crewai.py +404 -0
- prela/instrumentation/multi_agent/langgraph.py +299 -0
- prela/instrumentation/multi_agent/models.py +203 -0
- prela/instrumentation/multi_agent/swarm.py +231 -0
- prela/instrumentation/n8n/__init__.py +68 -0
- prela/instrumentation/n8n/code_node.py +534 -0
- prela/instrumentation/n8n/models.py +336 -0
- prela/instrumentation/n8n/webhook.py +489 -0
- prela/instrumentation/openai.py +1198 -0
- prela/license.py +245 -0
- prela/replay/__init__.py +31 -0
- prela/replay/comparison.py +390 -0
- prela/replay/engine.py +1227 -0
- prela/replay/loader.py +231 -0
- prela/replay/result.py +196 -0
- prela-0.1.0.dist-info/METADATA +399 -0
- prela-0.1.0.dist-info/RECORD +71 -0
- prela-0.1.0.dist-info/WHEEL +4 -0
- prela-0.1.0.dist-info/entry_points.txt +2 -0
- prela-0.1.0.dist-info/licenses/LICENSE +190 -0
prela/license.py
ADDED
|
@@ -0,0 +1,245 @@
|
|
|
1
|
+
"""Tier-based feature gating for Prela SDK.
|
|
2
|
+
|
|
3
|
+
This module provides decorators and utilities for restricting features
|
|
4
|
+
to specific subscription tiers.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import functools
|
|
8
|
+
import logging
|
|
9
|
+
import os
|
|
10
|
+
from typing import Callable, Optional
|
|
11
|
+
|
|
12
|
+
logger = logging.getLogger(__name__)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class SubscriptionError(Exception):
    """Signal that the active subscription tier is too low for a feature.

    Raised by the tier-gating decorators in this module when the tier
    returned by ``get_tier()`` does not satisfy the feature's required tier.
    """
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
# Tiers ordered from least to most privileged. Access checks compare the
# positions of two tiers in this list, so the order is significant.
TIER_HIERARCHY = ["free", "lunch-money", "pro", "enterprise"]

# Active tier for this process. ``None`` means "not resolved yet"; it is
# filled in lazily from the PRELA_TIER environment variable on first lookup.
_current_tier: Optional[str] = None
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def set_tier(tier: str):
    """Set the current subscription tier.

    This is called automatically when initializing the HTTP exporter
    with an API key, and by ``get_tier()`` when it resolves the tier from
    the environment.

    The value is matched case-insensitively and with surrounding whitespace
    ignored, so ``"Pro"`` or ``" pro "`` (easy to end up with in an
    environment variable) select the ``pro`` tier instead of being treated
    as unknown.

    Args:
        tier: Subscription tier (free, lunch-money, pro, enterprise).
            Anything that does not name a known tier falls back to "free"
            and a warning is logged.
    """
    global _current_tier

    # Only strings can be normalized; any other value is left as-is and
    # falls through to the "unknown tier" branch below.
    normalized = tier.strip().lower() if isinstance(tier, str) else tier

    if normalized not in TIER_HIERARCHY:
        # Report the value exactly as supplied so the user can spot the typo.
        logger.warning("Unknown tier: %s, defaulting to free", tier)
        normalized = "free"

    _current_tier = normalized
    logger.debug("Subscription tier set to: %s", normalized)
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def get_tier() -> str:
    """Return the subscription tier currently in effect.

    On the first call (while no tier has been set) the tier is resolved
    from the ``PRELA_TIER`` environment variable via ``set_tier()``, using
    "free" when the variable is absent.

    Returns:
        The current tier string; "free" if nothing usable is set.
    """
    # Lazy resolution: set_tier() validates the value and stores it in the
    # module-level cache that is read back below.
    if _current_tier is None:
        set_tier(os.environ.get("PRELA_TIER", "free"))

    return _current_tier if _current_tier else "free"
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def has_access(current_tier: str, required_tier: str) -> bool:
    """Tell whether one tier is at least as high as another.

    Args:
        current_tier: User's current subscription tier.
        required_tier: Tier required for the feature.

    Returns:
        True when ``current_tier`` sits at or above ``required_tier`` in
        ``TIER_HIERARCHY``. False when it sits below, or when either name
        is not a known tier.
    """
    # Unknown tier on either side: deny access.
    if current_tier not in TIER_HIERARCHY or required_tier not in TIER_HIERARCHY:
        return False

    have = TIER_HIERARCHY.index(current_tier)
    need = TIER_HIERARCHY.index(required_tier)
    return have >= need
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def require_tier(feature_name: str, required_tier: str):
    """Build a decorator that gates a callable behind a subscription tier.

    The tier is checked on every call of the decorated function, not at
    decoration time, so a tier change made later is honoured.

    Usage:
        @require_tier("CrewAI instrumentation", "lunch-money")
        def instrument_crewai():
            pass

    Args:
        feature_name: Human-readable name of the feature (used in the
            error message).
        required_tier: Minimum tier required (free, lunch-money, pro, enterprise).

    Returns:
        Decorator function.

    Raises:
        SubscriptionError: From the decorated function, when the current
            tier does not have access.
    """

    def decorator(func: Callable) -> Callable:
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            active_tier = get_tier()

            # Happy path first: enough tier, just delegate.
            if has_access(active_tier, required_tier):
                return func(*args, **kwargs)

            message_lines = (
                "\n\n",
                f"🔒 {feature_name} requires '{required_tier}' subscription or higher.\n",
                f" Current tier: '{active_tier}'\n\n",
                " Upgrade at: https://prela.dev/pricing\n\n",
                " Features by tier:\n",
                " • free: OpenAI, Anthropic, LangChain, LlamaIndex (basic)\n",
                " • lunch-money: + CrewAI, AutoGen, LangGraph, Swarm, n8n, replay, semantic assertions\n",
                " • pro: + hallucination detection, drift detection, natural language search\n",
                " • enterprise: + compliance features, dedicated support\n",
            )
            raise SubscriptionError("".join(message_lines))

        return wrapper

    return decorator
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
def require_tier_async(feature_name: str, required_tier: str):
    """Build a decorator that gates a coroutine function behind a tier.

    Counterpart of ``require_tier`` for ``async def`` callables: the
    wrapper is itself a coroutine function and awaits the wrapped one. The
    tier check runs when the wrapper is awaited.

    Usage:
        @require_tier_async("CrewAI instrumentation", "lunch-money")
        async def instrument_crewai():
            pass

    Args:
        feature_name: Human-readable name of the feature (used in the
            error message).
        required_tier: Minimum tier required.

    Returns:
        Async decorator function.

    Raises:
        SubscriptionError: From the decorated coroutine, when the current
            tier does not have access.
    """

    def decorator(func: Callable) -> Callable:
        @functools.wraps(func)
        async def wrapper(*args, **kwargs):
            active_tier = get_tier()

            # Happy path first: enough tier, just delegate.
            if has_access(active_tier, required_tier):
                return await func(*args, **kwargs)

            message_lines = (
                "\n\n",
                f"🔒 {feature_name} requires '{required_tier}' subscription or higher.\n",
                f" Current tier: '{active_tier}'\n\n",
                " Upgrade at: https://prela.dev/pricing\n",
            )
            raise SubscriptionError("".join(message_lines))

        return wrapper

    return decorator
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
def check_tier(feature_name: str, required_tier: str, silent: bool = False) -> bool:
    """Report whether the current tier may use a feature, without raising.

    Non-decorator counterpart of ``require_tier`` for conditional logic.

    Usage:
        if check_tier("Replay engine", "lunch-money"):
            # Enable replay features
            pass
        else:
            print("Upgrade to use replay")

    Args:
        feature_name: Human-readable name of the feature (used in the
            warning text).
        required_tier: Minimum tier required.
        silent: If True, don't log a warning when access is denied.

    Returns:
        True if the current tier has access, False otherwise.
    """
    active_tier = get_tier()

    if has_access(active_tier, required_tier):
        return True

    if not silent:
        logger.warning(
            f"{feature_name} requires '{required_tier}' tier. "
            f"Current tier: '{active_tier}'. "
            f"Upgrade at https://prela.dev/pricing"
        )

    return False
|
|
196
|
+
|
|
197
|
+
|
|
198
|
+
def get_tier_features() -> dict[str, list[str]]:
    """Describe what each subscription tier includes.

    Returns:
        A freshly built dictionary keyed by tier name ("free",
        "lunch-money", "pro", "enterprise", in that order). Each value is a
        list of human-readable feature labels for that tier.
    """
    free_features = [
        "Basic tracing (traces, spans, context)",
        "OpenAI & Anthropic instrumentation",
        "LangChain & LlamaIndex (basic)",
        "Console & File exporters",
        "Basic CLI commands",
        "Local storage",
    ]
    lunch_money_features = [
        "All free features",
        "CrewAI, AutoGen, LangGraph, Swarm, n8n",
        "All 17+ assertion types",
        "Semantic similarity assertions",
        "Multi-agent assertions",
        "Replay engine (100/month)",
        "HTTP exporter (cloud sync)",
        "100k traces/month",
        "30-day retention",
    ]
    pro_features = [
        "All lunch-money features",
        "Hallucination detection",
        "Drift detection with alerts",
        "Natural language search",
        "One-click debug flow",
        "Cost optimization",
        "Batch replay (50 traces)",
        "1M traces/month",
        "90-day retention",
    ]
    enterprise_features = [
        "All pro features",
        "EU AI Act compliance",
        "Data lineage tracking",
        "Custom model cards",
        "SSO/SAML",
        "Dedicated infrastructure",
        "Unlimited traces",
        "Custom retention",
    ]

    return {
        "free": free_features,
        "lunch-money": lunch_money_features,
        "pro": pro_features,
        "enterprise": enterprise_features,
    }
|
prela/replay/__init__.py
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
"""Replay execution engine for deterministic re-execution of AI agent traces."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
# Gate the whole subpackage at import time: when the current tier is below
# "lunch-money", importing prela.replay fails before any replay code loads.
# check_tier() also logs a warning because silent is False.
from prela.license import check_tier

if not check_tier("Replay engine", "lunch-money", silent=False):
    raise ImportError(
        "Replay engine requires 'lunch-money' subscription or higher. Upgrade at https://prela.dev/pricing"
    )
|
|
13
|
+
|
|
14
|
+
from prela.replay.comparison import ReplayComparator, compare_replays
|
|
15
|
+
from prela.replay.engine import ReplayEngine
|
|
16
|
+
from prela.replay.result import (
|
|
17
|
+
ReplayComparison,
|
|
18
|
+
ReplayResult,
|
|
19
|
+
ReplayedSpan,
|
|
20
|
+
SpanDifference,
|
|
21
|
+
)
|
|
22
|
+
|
|
23
|
+
__all__ = [
|
|
24
|
+
"ReplayEngine",
|
|
25
|
+
"ReplayResult",
|
|
26
|
+
"ReplayedSpan",
|
|
27
|
+
"ReplayComparison",
|
|
28
|
+
"SpanDifference",
|
|
29
|
+
"ReplayComparator",
|
|
30
|
+
"compare_replays",
|
|
31
|
+
]
|
|
@@ -0,0 +1,390 @@
|
|
|
1
|
+
"""Comparison utilities for replay results with semantic similarity."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
from prela.replay.result import ReplayComparison, ReplayResult, SpanDifference
|
|
9
|
+
|
|
10
|
+
logger = logging.getLogger(__name__)
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class ReplayComparator:
    """Compare two replay results span by span.

    Spans are paired on ``original_span_id``. String fields that differ are
    given a similarity score: cosine similarity of sentence-transformers
    embeddings when that package and its model load, otherwise a
    ``difflib`` sequence ratio.
    """

    # sentence-transformers model used for embedding-based similarity.
    _MODEL_NAME = "all-MiniLM-L6-v2"

    def __init__(self, use_semantic_similarity: bool = True, show_download_progress: bool = True) -> None:
        """Initialize comparator.

        Loading the embedding model is best-effort: if the package is
        missing or the model fails to load, a warning is logged and the
        comparator uses the fallback similarity metric instead of raising.

        Args:
            use_semantic_similarity: Whether to compute semantic similarity
                for text fields. Requires sentence-transformers package.
            show_download_progress: Whether to log info messages around the
                (possibly first-time) model download.
        """
        self.use_semantic_similarity = use_semantic_similarity
        self.semantic_similarity_available = False
        self.semantic_similarity_model = None
        self._encoder = None

        if not use_semantic_similarity:
            return

        try:
            # Imported lazily so the package stays an optional dependency.
            from sentence_transformers import SentenceTransformer

            if show_download_progress:
                logger.info("Loading semantic similarity model (one-time download ~90MB)...")

            self._encoder = SentenceTransformer(self._MODEL_NAME)
            self.semantic_similarity_available = True
            self.semantic_similarity_model = self._MODEL_NAME

            if show_download_progress:
                logger.info("✓ Semantic similarity model loaded successfully")

            logger.debug("Loaded sentence-transformers model for semantic similarity")
        except ImportError:
            logger.warning(
                "sentence-transformers not available. Using fallback similarity metrics. "
                "For better comparison, install with: pip install sentence-transformers"
            )
            self.use_semantic_similarity = False
        except Exception as e:
            # Deliberately broad: any model-loading failure degrades to the
            # fallback metric rather than breaking comparison.
            logger.warning("Failed to load sentence-transformers model: %s. Using fallback metrics.", e)
            self.use_semantic_similarity = False

    def compare(
        self,
        original: ReplayResult,
        modified: ReplayResult,
    ) -> ReplayComparison:
        """Compare two replay results.

        Differences are reported in a stable order: spans in the order they
        appear in ``original``, followed by spans that exist only in
        ``modified`` in their own order.

        Args:
            original: Original replay result
            modified: Modified replay result

        Returns:
            ReplayComparison with differences and summary
        """
        if original.trace_id != modified.trace_id:
            logger.warning(
                "Comparing results from different traces: %s vs %s",
                original.trace_id,
                modified.trace_id,
            )

        differences = []

        # Pair spans on original_span_id.
        original_map = {s.original_span_id: s for s in original.spans}
        modified_map = {s.original_span_id: s for s in modified.spans}

        # Union of both id sets. dict.fromkeys de-duplicates while keeping
        # insertion order, so the output order is the same on every run
        # (iterating a set would not guarantee that).
        all_span_ids = list(dict.fromkeys([*original_map, *modified_map]))

        for span_id in all_span_ids:
            orig_span = original_map.get(span_id)
            mod_span = modified_map.get(span_id)

            # Span only exists in the modified result.
            if orig_span is None:
                differences.append(
                    SpanDifference(
                        span_name=mod_span.name if mod_span else "unknown",
                        span_type=mod_span.span_type if mod_span else "unknown",
                        field="existence",
                        original_value=None,
                        modified_value=mod_span.to_dict() if mod_span else None,
                        exact_match=False,
                    )
                )
                continue

            # Span only exists in the original result.
            if mod_span is None:
                differences.append(
                    SpanDifference(
                        span_name=orig_span.name,
                        span_type=orig_span.span_type,
                        field="existence",
                        original_value=orig_span.to_dict(),
                        modified_value=None,
                        exact_match=False,
                    )
                )
                continue

            # Present on both sides: compare field by field.
            differences.extend(self._compare_spans(orig_span, mod_span))

        comparison = ReplayComparison(
            original=original,
            modified=modified,
            differences=differences,
            semantic_similarity_available=self.semantic_similarity_available,
            semantic_similarity_model=self.semantic_similarity_model,
        )

        comparison.generate_summary()

        return comparison

    def _compare_spans(self, original, modified) -> list[SpanDifference]:
        """Compare two replayed spans.

        Fields are checked in a fixed order: output, input, duration_ms,
        tokens_used, cost_usd, error.

        Args:
            original: Original replayed span
            modified: Modified replayed span

        Returns:
            List of differences found
        """
        differences = []

        def record(field, before, after):
            # Plain (non-text-scored) difference for this span.
            differences.append(
                SpanDifference(
                    span_name=original.name,
                    span_type=original.span_type,
                    field=field,
                    original_value=before,
                    modified_value=after,
                    exact_match=False,
                )
            )

        # Payload fields (output first, it is the most important one). These
        # go through _compare_values so strings get a similarity score.
        for field in ("output", "input"):
            before = getattr(original, field)
            after = getattr(modified, field)
            if before != after:
                diff = self._compare_values(
                    span_name=original.name,
                    span_type=original.span_type,
                    field=field,
                    original_value=before,
                    modified_value=after,
                )
                if diff:
                    differences.append(diff)

        # Duration: only reported when it moved by more than 10% of the
        # original. A non-positive original duration gives no meaningful
        # percentage, so it is treated as "no change".
        duration_change = abs(modified.duration_ms - original.duration_ms)
        duration_pct = (
            duration_change / original.duration_ms if original.duration_ms > 0 else 0
        )
        if duration_pct > 0.1:
            record("duration_ms", original.duration_ms, modified.duration_ms)

        # Token usage: any change is reported.
        if original.tokens_used != modified.tokens_used:
            record("tokens_used", original.tokens_used, modified.tokens_used)

        # Cost: ignore changes of $0.0001 or less.
        if abs(original.cost_usd - modified.cost_usd) > 0.0001:
            record("cost_usd", original.cost_usd, modified.cost_usd)

        # Error status.
        if original.error != modified.error:
            record("error", original.error, modified.error)

        return differences

    def _compare_values(
        self,
        span_name: str,
        span_type: str,
        field: str,
        original_value: Any,
        modified_value: Any,
    ) -> SpanDifference | None:
        """Compare two values with semantic similarity if applicable.

        Args:
            span_name: Name of span being compared
            span_type: Type of span
            field: Field being compared
            original_value: Original value
            modified_value: Modified value

        Returns:
            SpanDifference if values differ, None if identical
        """
        if original_value == modified_value:
            return None

        semantic_similarity = None

        # A similarity score is computed whenever both sides are strings,
        # even if the encoder is unavailable (the fallback metric is used).
        if isinstance(original_value, str) and isinstance(modified_value, str):
            semantic_similarity = self._compute_semantic_similarity(
                original_value, modified_value
            )

        return SpanDifference(
            span_name=span_name,
            span_type=span_type,
            field=field,
            original_value=original_value,
            modified_value=modified_value,
            semantic_similarity=semantic_similarity,
            exact_match=False,
        )

    def _compute_semantic_similarity(self, text1: str, text2: str) -> float:
        """Compute semantic similarity between two texts.

        Uses cosine similarity of encoder embeddings when an encoder is
        loaded. Falls back to ``_compute_fallback_similarity`` when there is
        no encoder, when encoding fails, or when the cosine is undefined
        (zero-length embedding or non-finite result).

        Args:
            text1: First text
            text2: Second text

        Returns:
            Similarity score between 0 and 1
        """
        if self._encoder is not None:
            try:
                import math

                from numpy import dot
                from numpy.linalg import norm

                embeddings = self._encoder.encode([text1, text2])

                denominator = norm(embeddings[0]) * norm(embeddings[1])

                # With a zero denominator the cosine is 0/0 = NaN, and
                # max(0.0, min(1.0, NaN)) evaluates to 1.0, which would
                # report a perfect match. Only trust a finite result.
                if denominator > 0:
                    similarity = float(dot(embeddings[0], embeddings[1]) / denominator)
                    if math.isfinite(similarity):
                        # Cosine can be negative; clamp into [0, 1].
                        return max(0.0, min(1.0, similarity))
            except Exception as e:
                # Best-effort: any encoder/numpy failure degrades to the
                # fallback metric below.
                logger.warning("Failed to compute semantic similarity: %s, using fallback", e)

        return self._compute_fallback_similarity(text1, text2)

    def _compute_fallback_similarity(self, text1: str, text2: str) -> float:
        """Compute text similarity without sentence-transformers.

        Uses ``difflib.SequenceMatcher`` (character-level matching ratio).

        Args:
            text1: First text
            text2: Second text

        Returns:
            Similarity score between 0 and 1
        """
        # Exact match
        if text1 == text2:
            return 1.0

        # Both empty
        if not text1 and not text2:
            return 1.0

        # One empty, one not
        if not text1 or not text2:
            return 0.0

        import difflib

        return difflib.SequenceMatcher(None, text1, text2).ratio()
|
|
372
|
+
|
|
373
|
+
|
|
374
|
+
def compare_replays(
    original: ReplayResult,
    modified: ReplayResult,
    use_semantic_similarity: bool = True,
) -> ReplayComparison:
    """Compare two replay results with a throwaway ``ReplayComparator``.

    A new comparator is built on every call, so with
    ``use_semantic_similarity=True`` the embedding model is loaded each time.

    Args:
        original: Original replay result
        modified: Modified replay result
        use_semantic_similarity: Whether to compute semantic similarity

    Returns:
        ReplayComparison with differences and summary
    """
    return ReplayComparator(
        use_semantic_similarity=use_semantic_similarity
    ).compare(original, modified)
|