fasteval-langfuse 1.0.0a1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fasteval_langfuse-1.0.0a1/PKG-INFO +181 -0
- fasteval_langfuse-1.0.0a1/README.md +169 -0
- fasteval_langfuse-1.0.0a1/fasteval_langfuse/__init__.py +58 -0
- fasteval_langfuse-1.0.0a1/fasteval_langfuse/client.py +180 -0
- fasteval_langfuse-1.0.0a1/fasteval_langfuse/config.py +80 -0
- fasteval_langfuse-1.0.0a1/fasteval_langfuse/decorators.py +326 -0
- fasteval_langfuse-1.0.0a1/fasteval_langfuse/py.typed +1 -0
- fasteval_langfuse-1.0.0a1/fasteval_langfuse/sampling/__init__.py +19 -0
- fasteval_langfuse-1.0.0a1/fasteval_langfuse/sampling/base.py +45 -0
- fasteval_langfuse-1.0.0a1/fasteval_langfuse/sampling/strategies.py +321 -0
- fasteval_langfuse-1.0.0a1/fasteval_langfuse/score_reporter.py +65 -0
- fasteval_langfuse-1.0.0a1/fasteval_langfuse/trace_fetcher.py +121 -0
- fasteval_langfuse-1.0.0a1/fasteval_langfuse/utils.py +117 -0
- fasteval_langfuse-1.0.0a1/pyproject.toml +44 -0
|
@@ -0,0 +1,181 @@
|
|
|
1
|
+
Metadata-Version: 2.3
|
|
2
|
+
Name: fasteval-langfuse
|
|
3
|
+
Version: 1.0.0a1
|
|
4
|
+
Summary: Langfuse integration for fasteval - evaluate production traces with fasteval metrics
|
|
5
|
+
Author: Intuit
|
|
6
|
+
License: Apache-2.0
|
|
7
|
+
Requires-Dist: fasteval-core>=1.0.0a1
|
|
8
|
+
Requires-Dist: langfuse>=2.0.0
|
|
9
|
+
Requires-Dist: pydantic>=2.0.0
|
|
10
|
+
Requires-Python: >=3.10, <4.0
|
|
11
|
+
Description-Content-Type: text/markdown
|
|
12
|
+
|
|
13
|
+
# fasteval-langfuse
|
|
14
|
+
|
|
15
|
+
Langfuse integration for [fasteval](https://github.com/intuit/fasteval) - evaluate production traces with fasteval's research-backed metrics.
|
|
16
|
+
|
|
17
|
+
## Installation
|
|
18
|
+
|
|
19
|
+
```bash
|
|
20
|
+
pip install fasteval-core fasteval-langfuse
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
## Quick Start
|
|
24
|
+
|
|
25
|
+
### Evaluate Production Traces
|
|
26
|
+
|
|
27
|
+
Fetch traces from Langfuse and evaluate them with fasteval metrics:
|
|
28
|
+
|
|
29
|
+
```python
|
|
30
|
+
from fasteval_langfuse import langfuse_traces
|
|
31
|
+
from fasteval_langfuse.sampling import RandomSamplingStrategy
|
|
32
|
+
import fasteval as fe
|
|
33
|
+
|
|
34
|
+
@fe.correctness(threshold=0.8)
|
|
35
|
+
@fe.hallucination(threshold=0.9)
|
|
36
|
+
@langfuse_traces(
|
|
37
|
+
project="production",
|
|
38
|
+
filter_tags=["customer-support"],
|
|
39
|
+
time_range="last_24h",
|
|
40
|
+
sampling=RandomSamplingStrategy(sample_size=200)
|
|
41
|
+
)
|
|
42
|
+
def test_production_traces(trace_id, input, output, context, metadata):
|
|
43
|
+
# Evaluate the trace
|
|
44
|
+
fe.score(output, input=input)
|
|
45
|
+
|
|
46
|
+
# Run with pytest - scores automatically pushed to Langfuse
|
|
47
|
+
# pytest test_production.py -v
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
### Sampling Strategies
|
|
51
|
+
|
|
52
|
+
Reduce evaluation costs with intelligent sampling:
|
|
53
|
+
|
|
54
|
+
```python
|
|
55
|
+
from fasteval_langfuse.sampling import (
|
|
56
|
+
RandomSamplingStrategy,
|
|
57
|
+
StratifiedSamplingStrategy,
|
|
58
|
+
ScoreBasedSamplingStrategy,
|
|
59
|
+
)
|
|
60
|
+
|
|
61
|
+
# Random sampling - 200 random traces
|
|
62
|
+
@langfuse_traces(
|
|
63
|
+
project="prod",
|
|
64
|
+
sampling=RandomSamplingStrategy(sample_size=200, seed=42)
|
|
65
|
+
)
|
|
66
|
+
def test_random_sample(trace_id, input, output, context, metadata):
|
|
67
|
+
fe.score(output, input=input)
|
|
68
|
+
|
|
69
|
+
# Stratified sampling - even distribution across user types
|
|
70
|
+
@langfuse_traces(
|
|
71
|
+
project="prod",
|
|
72
|
+
sampling=StratifiedSamplingStrategy(
|
|
73
|
+
strata_key="metadata.user_type",
|
|
74
|
+
samples_per_stratum=30
|
|
75
|
+
)
|
|
76
|
+
)
|
|
77
|
+
def test_across_segments(trace_id, input, output, context, metadata):
|
|
78
|
+
fe.score(output, input=input)
|
|
79
|
+
|
|
80
|
+
# Score-based sampling - focus on failures
|
|
81
|
+
@langfuse_traces(
|
|
82
|
+
project="prod",
|
|
83
|
+
sampling=ScoreBasedSamplingStrategy(
|
|
84
|
+
score_name="user_rating",
|
|
85
|
+
low_score_threshold=3.0,
|
|
86
|
+
low_score_rate=1.0, # 100% of low ratings
|
|
87
|
+
high_score_rate=0.05 # 5% of high ratings
|
|
88
|
+
)
|
|
89
|
+
)
|
|
90
|
+
def test_failures(trace_id, input, output, context, metadata):
|
|
91
|
+
fe.score(output, input=input)
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
## Built-in Sampling Strategies
|
|
95
|
+
|
|
96
|
+
- **NoSamplingStrategy**: Evaluate all matching traces (default)
|
|
97
|
+
- **RandomSamplingStrategy**: Unbiased random sampling
|
|
98
|
+
- **StratifiedSamplingStrategy**: Even distribution across groups
|
|
99
|
+
- **ScoreBasedSamplingStrategy**: Oversample low-scoring traces
|
|
100
|
+
- **RecentFirstSamplingStrategy**: Prioritize recent traces
|
|
101
|
+
|
|
102
|
+
## Dataset Integration
|
|
103
|
+
|
|
104
|
+
Evaluate against Langfuse datasets. All dataset columns are passed as parameters - declare what you need:
|
|
105
|
+
|
|
106
|
+
```python
|
|
107
|
+
from fasteval_langfuse import langfuse_dataset
|
|
108
|
+
|
|
109
|
+
# Basic usage
|
|
110
|
+
@fe.correctness(threshold=0.8)
|
|
111
|
+
@langfuse_dataset(name="qa-golden-set", version="v2")
|
|
112
|
+
def test_qa_dataset(input, expected_output):
|
|
113
|
+
response = my_agent(input)
|
|
114
|
+
fe.score(response, expected_output, input=input)
|
|
115
|
+
|
|
116
|
+
# Using custom metadata fields
|
|
117
|
+
@fe.correctness(threshold=0.8)
|
|
118
|
+
@langfuse_dataset(name="qa-golden-set", version="v2")
|
|
119
|
+
def test_with_metadata(input, expected_output, difficulty, category):
|
|
120
|
+
# difficulty and category come from item.metadata
|
|
121
|
+
response = my_agent(input)
|
|
122
|
+
fe.score(response, expected_output, input=input)
|
|
123
|
+
|
|
124
|
+
# Only what you need
|
|
125
|
+
@fe.correctness(threshold=0.8)
|
|
126
|
+
@langfuse_dataset(name="inputs-only")
|
|
127
|
+
def test_minimal(input):
|
|
128
|
+
# Only declare input, ignore other fields
|
|
129
|
+
response = my_agent(input)
|
|
130
|
+
fe.score(response, input=input)
|
|
131
|
+
```
|
|
132
|
+
|
|
133
|
+
## Configuration
|
|
134
|
+
|
|
135
|
+
```python
|
|
136
|
+
from fasteval_langfuse import configure_langfuse, LangfuseConfig
|
|
137
|
+
|
|
138
|
+
configure_langfuse(LangfuseConfig(
|
|
139
|
+
public_key="pk-...", # Or from LANGFUSE_PUBLIC_KEY env
|
|
140
|
+
secret_key="sk-...", # Or from LANGFUSE_SECRET_KEY env
|
|
141
|
+
host="https://cloud.langfuse.com", # Or self-hosted
|
|
142
|
+
default_project="production",
|
|
143
|
+
auto_push_scores=True, # Push scores back automatically
|
|
144
|
+
score_name_prefix="fasteval_", # Prefix for score names
|
|
145
|
+
))
|
|
146
|
+
```
|
|
147
|
+
|
|
148
|
+
## RAG Evaluation with Context
|
|
149
|
+
|
|
150
|
+
The decorator automatically extracts context from trace metadata:
|
|
151
|
+
|
|
152
|
+
```python
|
|
153
|
+
@fe.faithfulness(threshold=0.8)
|
|
154
|
+
@fe.contextual_precision(threshold=0.7)
|
|
155
|
+
@langfuse_traces(
|
|
156
|
+
project="prod",
|
|
157
|
+
filter_tags=["rag"]
|
|
158
|
+
)
|
|
159
|
+
def test_rag_quality(trace_id, input, output, context, metadata):
|
|
160
|
+
# context is auto-extracted from metadata keys:
|
|
161
|
+
# - "context", "retrieved_docs", "documents", "retrieval_context"
|
|
162
|
+
|
|
163
|
+
# Or manually extract if needed:
|
|
164
|
+
if not context:
|
|
165
|
+
context = metadata.get("custom_docs_key")
|
|
166
|
+
|
|
167
|
+
fe.score(output, context=context, input=input)
|
|
168
|
+
```
|
|
169
|
+
|
|
170
|
+
## Benefits
|
|
171
|
+
|
|
172
|
+
- 💰 **Cost Reduction**: Reduce LLM evaluation costs by 90%+ with sampling
|
|
173
|
+
- ⚡ **Faster Feedback**: Evaluate in minutes vs hours
|
|
174
|
+
- 📊 **Research-Backed Metrics**: Use fasteval's validated evaluation metrics
|
|
175
|
+
- 🎯 **Focus on Issues**: Oversample failures with ScoreBasedSamplingStrategy
|
|
176
|
+
- ✅ **Zero Instrumentation**: Evaluate existing traces without code changes
|
|
177
|
+
- 🔄 **Automatic Scoring**: Evaluation results automatically sync to Langfuse
|
|
178
|
+
|
|
179
|
+
## License
|
|
180
|
+
|
|
181
|
+
Apache-2.0
|
|
@@ -0,0 +1,169 @@
|
|
|
1
|
+
# fasteval-langfuse
|
|
2
|
+
|
|
3
|
+
Langfuse integration for [fasteval](https://github.com/intuit/fasteval) - evaluate production traces with fasteval's research-backed metrics.
|
|
4
|
+
|
|
5
|
+
## Installation
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
pip install fasteval-core fasteval-langfuse
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
## Quick Start
|
|
12
|
+
|
|
13
|
+
### Evaluate Production Traces
|
|
14
|
+
|
|
15
|
+
Fetch traces from Langfuse and evaluate them with fasteval metrics:
|
|
16
|
+
|
|
17
|
+
```python
|
|
18
|
+
from fasteval_langfuse import langfuse_traces
|
|
19
|
+
from fasteval_langfuse.sampling import RandomSamplingStrategy
|
|
20
|
+
import fasteval as fe
|
|
21
|
+
|
|
22
|
+
@fe.correctness(threshold=0.8)
|
|
23
|
+
@fe.hallucination(threshold=0.9)
|
|
24
|
+
@langfuse_traces(
|
|
25
|
+
project="production",
|
|
26
|
+
filter_tags=["customer-support"],
|
|
27
|
+
time_range="last_24h",
|
|
28
|
+
sampling=RandomSamplingStrategy(sample_size=200)
|
|
29
|
+
)
|
|
30
|
+
def test_production_traces(trace_id, input, output, context, metadata):
|
|
31
|
+
# Evaluate the trace
|
|
32
|
+
fe.score(output, input=input)
|
|
33
|
+
|
|
34
|
+
# Run with pytest - scores automatically pushed to Langfuse
|
|
35
|
+
# pytest test_production.py -v
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
### Sampling Strategies
|
|
39
|
+
|
|
40
|
+
Reduce evaluation costs with intelligent sampling:
|
|
41
|
+
|
|
42
|
+
```python
|
|
43
|
+
from fasteval_langfuse.sampling import (
|
|
44
|
+
RandomSamplingStrategy,
|
|
45
|
+
StratifiedSamplingStrategy,
|
|
46
|
+
ScoreBasedSamplingStrategy,
|
|
47
|
+
)
|
|
48
|
+
|
|
49
|
+
# Random sampling - 200 random traces
|
|
50
|
+
@langfuse_traces(
|
|
51
|
+
project="prod",
|
|
52
|
+
sampling=RandomSamplingStrategy(sample_size=200, seed=42)
|
|
53
|
+
)
|
|
54
|
+
def test_random_sample(trace_id, input, output, context, metadata):
|
|
55
|
+
fe.score(output, input=input)
|
|
56
|
+
|
|
57
|
+
# Stratified sampling - even distribution across user types
|
|
58
|
+
@langfuse_traces(
|
|
59
|
+
project="prod",
|
|
60
|
+
sampling=StratifiedSamplingStrategy(
|
|
61
|
+
strata_key="metadata.user_type",
|
|
62
|
+
samples_per_stratum=30
|
|
63
|
+
)
|
|
64
|
+
)
|
|
65
|
+
def test_across_segments(trace_id, input, output, context, metadata):
|
|
66
|
+
fe.score(output, input=input)
|
|
67
|
+
|
|
68
|
+
# Score-based sampling - focus on failures
|
|
69
|
+
@langfuse_traces(
|
|
70
|
+
project="prod",
|
|
71
|
+
sampling=ScoreBasedSamplingStrategy(
|
|
72
|
+
score_name="user_rating",
|
|
73
|
+
low_score_threshold=3.0,
|
|
74
|
+
low_score_rate=1.0, # 100% of low ratings
|
|
75
|
+
high_score_rate=0.05 # 5% of high ratings
|
|
76
|
+
)
|
|
77
|
+
)
|
|
78
|
+
def test_failures(trace_id, input, output, context, metadata):
|
|
79
|
+
fe.score(output, input=input)
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
## Built-in Sampling Strategies
|
|
83
|
+
|
|
84
|
+
- **NoSamplingStrategy**: Evaluate all matching traces (default)
|
|
85
|
+
- **RandomSamplingStrategy**: Unbiased random sampling
|
|
86
|
+
- **StratifiedSamplingStrategy**: Even distribution across groups
|
|
87
|
+
- **ScoreBasedSamplingStrategy**: Oversample low-scoring traces
|
|
88
|
+
- **RecentFirstSamplingStrategy**: Prioritize recent traces
|
|
89
|
+
|
|
90
|
+
## Dataset Integration
|
|
91
|
+
|
|
92
|
+
Evaluate against Langfuse datasets. All dataset columns are passed as parameters - declare what you need:
|
|
93
|
+
|
|
94
|
+
```python
|
|
95
|
+
from fasteval_langfuse import langfuse_dataset
|
|
96
|
+
|
|
97
|
+
# Basic usage
|
|
98
|
+
@fe.correctness(threshold=0.8)
|
|
99
|
+
@langfuse_dataset(name="qa-golden-set", version="v2")
|
|
100
|
+
def test_qa_dataset(input, expected_output):
|
|
101
|
+
response = my_agent(input)
|
|
102
|
+
fe.score(response, expected_output, input=input)
|
|
103
|
+
|
|
104
|
+
# Using custom metadata fields
|
|
105
|
+
@fe.correctness(threshold=0.8)
|
|
106
|
+
@langfuse_dataset(name="qa-golden-set", version="v2")
|
|
107
|
+
def test_with_metadata(input, expected_output, difficulty, category):
|
|
108
|
+
# difficulty and category come from item.metadata
|
|
109
|
+
response = my_agent(input)
|
|
110
|
+
fe.score(response, expected_output, input=input)
|
|
111
|
+
|
|
112
|
+
# Only what you need
|
|
113
|
+
@fe.correctness(threshold=0.8)
|
|
114
|
+
@langfuse_dataset(name="inputs-only")
|
|
115
|
+
def test_minimal(input):
|
|
116
|
+
# Only declare input, ignore other fields
|
|
117
|
+
response = my_agent(input)
|
|
118
|
+
fe.score(response, input=input)
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
## Configuration
|
|
122
|
+
|
|
123
|
+
```python
|
|
124
|
+
from fasteval_langfuse import configure_langfuse, LangfuseConfig
|
|
125
|
+
|
|
126
|
+
configure_langfuse(LangfuseConfig(
|
|
127
|
+
public_key="pk-...", # Or from LANGFUSE_PUBLIC_KEY env
|
|
128
|
+
secret_key="sk-...", # Or from LANGFUSE_SECRET_KEY env
|
|
129
|
+
host="https://cloud.langfuse.com", # Or self-hosted
|
|
130
|
+
default_project="production",
|
|
131
|
+
auto_push_scores=True, # Push scores back automatically
|
|
132
|
+
score_name_prefix="fasteval_", # Prefix for score names
|
|
133
|
+
))
|
|
134
|
+
```
|
|
135
|
+
|
|
136
|
+
## RAG Evaluation with Context
|
|
137
|
+
|
|
138
|
+
The decorator automatically extracts context from trace metadata:
|
|
139
|
+
|
|
140
|
+
```python
|
|
141
|
+
@fe.faithfulness(threshold=0.8)
|
|
142
|
+
@fe.contextual_precision(threshold=0.7)
|
|
143
|
+
@langfuse_traces(
|
|
144
|
+
project="prod",
|
|
145
|
+
filter_tags=["rag"]
|
|
146
|
+
)
|
|
147
|
+
def test_rag_quality(trace_id, input, output, context, metadata):
|
|
148
|
+
# context is auto-extracted from metadata keys:
|
|
149
|
+
# - "context", "retrieved_docs", "documents", "retrieval_context"
|
|
150
|
+
|
|
151
|
+
# Or manually extract if needed:
|
|
152
|
+
if not context:
|
|
153
|
+
context = metadata.get("custom_docs_key")
|
|
154
|
+
|
|
155
|
+
fe.score(output, context=context, input=input)
|
|
156
|
+
```
|
|
157
|
+
|
|
158
|
+
## Benefits
|
|
159
|
+
|
|
160
|
+
- 💰 **Cost Reduction**: Reduce LLM evaluation costs by 90%+ with sampling
|
|
161
|
+
- ⚡ **Faster Feedback**: Evaluate in minutes vs hours
|
|
162
|
+
- 📊 **Research-Backed Metrics**: Use fasteval's validated evaluation metrics
|
|
163
|
+
- 🎯 **Focus on Issues**: Oversample failures with ScoreBasedSamplingStrategy
|
|
164
|
+
- ✅ **Zero Instrumentation**: Evaluate existing traces without code changes
|
|
165
|
+
- 🔄 **Automatic Scoring**: Evaluation results automatically sync to Langfuse
|
|
166
|
+
|
|
167
|
+
## License
|
|
168
|
+
|
|
169
|
+
Apache-2.0
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
"""
|
|
2
|
+
fasteval-langfuse: Langfuse integration for fasteval
|
|
3
|
+
|
|
4
|
+
Evaluate production traces from Langfuse with fasteval metrics.
|
|
5
|
+
|
|
6
|
+
Install: pip install fasteval-langfuse
|
|
7
|
+
|
|
8
|
+
Example:
|
|
9
|
+
from fasteval_langfuse import langfuse_traces
|
|
10
|
+
from fasteval_langfuse.sampling import RandomSamplingStrategy
|
|
11
|
+
import fasteval as fe
|
|
12
|
+
|
|
13
|
+
@fe.correctness(threshold=0.8)
|
|
14
|
+
@langfuse_traces(
|
|
15
|
+
project="production",
|
|
16
|
+
filter_tags=["customer-support"],
|
|
17
|
+
time_range="last_24h",
|
|
18
|
+
sampling=RandomSamplingStrategy(sample_size=200)
|
|
19
|
+
)
|
|
20
|
+
def test_production_traces(trace_id, input, output, context, metadata):
|
|
21
|
+
fe.score(output, input=input)
|
|
22
|
+
|
|
23
|
+
# Run with pytest:
|
|
24
|
+
# pytest test_production.py -v
|
|
25
|
+
"""
|
|
26
|
+
|
|
27
|
+
from fasteval_langfuse.client import LangfuseClient
|
|
28
|
+
from fasteval_langfuse.config import LangfuseConfig, configure_langfuse, get_config
|
|
29
|
+
from fasteval_langfuse.decorators import langfuse_dataset, langfuse_traces
|
|
30
|
+
from fasteval_langfuse.sampling import (
|
|
31
|
+
BaseSamplingStrategy,
|
|
32
|
+
NoSamplingStrategy,
|
|
33
|
+
RandomSamplingStrategy,
|
|
34
|
+
RecentFirstSamplingStrategy,
|
|
35
|
+
ScoreBasedSamplingStrategy,
|
|
36
|
+
StratifiedSamplingStrategy,
|
|
37
|
+
)
|
|
38
|
+
|
|
39
|
+
# Must match the distribution version published in the package metadata
# (PKG-INFO ``Version``); the previous value "0.1.0" was out of sync with it.
__version__ = "1.0.0a1"
|
|
40
|
+
|
|
41
|
+
# Public API of the package: every name listed here is imported above and
# is what ``from fasteval_langfuse import *`` exposes.
__all__ = [
    # Decorators
    "langfuse_traces",
    "langfuse_dataset",
    # Configuration
    "LangfuseConfig",
    "configure_langfuse",
    "get_config",
    # Client
    "LangfuseClient",
    # Sampling strategies
    "BaseSamplingStrategy",
    "NoSamplingStrategy",
    "RandomSamplingStrategy",
    "StratifiedSamplingStrategy",
    "ScoreBasedSamplingStrategy",
    "RecentFirstSamplingStrategy",
]
|
|
@@ -0,0 +1,180 @@
|
|
|
1
|
+
"""Langfuse SDK wrapper for fasteval integration."""
|
|
2
|
+
|
|
3
|
+
from typing import TYPE_CHECKING, Any, Dict, List, Optional
|
|
4
|
+
|
|
5
|
+
from fasteval_langfuse.config import get_config
|
|
6
|
+
|
|
7
|
+
if TYPE_CHECKING:
|
|
8
|
+
from langfuse import Langfuse
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class LangfuseClient:
    """
    Wrapper around Langfuse SDK for trace fetching and score reporting.

    Resolves credentials from explicit arguments or the package
    configuration, builds the underlying SDK client, and converts SDK
    objects into plain dictionaries for the rest of the package.
    """

    def __init__(
        self,
        public_key: Optional[str] = None,
        secret_key: Optional[str] = None,
        host: Optional[str] = None,
    ):
        """
        Initialize Langfuse client.

        Args:
            public_key: Override config public key
            secret_key: Override config secret key
            host: Override config host

        Raises:
            ValueError: If credentials are missing
            ImportError: If the Langfuse SDK is not installed
        """
        config = get_config()

        # Explicit arguments win; falsy values (None or "") fall back to config.
        self.public_key = public_key or config.public_key
        self.secret_key = secret_key or config.secret_key
        self.host = host or config.host

        if not self.public_key or not self.secret_key:
            raise ValueError(
                "Langfuse credentials required. Set via config or environment variables "
                "(LANGFUSE_PUBLIC_KEY, LANGFUSE_SECRET_KEY)"
            )

        # Lazy import to avoid dependency issues in tests
        try:
            from langfuse import Langfuse
        except ImportError as e:
            raise ImportError(
                "Langfuse SDK not installed. Install with: pip install langfuse>=2.0.0"
            ) from e

        self._client = Langfuse(
            public_key=self.public_key,
            secret_key=self.secret_key,
            host=self.host,
        )

    def fetch_traces(
        self,
        project: Optional[str] = None,
        tags: Optional[List[str]] = None,
        from_timestamp: Optional[str] = None,
        to_timestamp: Optional[str] = None,
        user_id: Optional[str] = None,
        session_id: Optional[str] = None,
        limit: Optional[int] = None,
    ) -> List[Dict[str, Any]]:
        """
        Fetch traces from Langfuse.

        Args:
            project: Project name to filter by
            tags: List of tags to filter by
            from_timestamp: Start timestamp (ISO 8601)
            to_timestamp: End timestamp (ISO 8601)
            user_id: Filter by user ID
            session_id: Filter by session ID
            limit: Maximum number of traces to fetch

        Returns:
            List of trace dictionaries (see ``_trace_to_dict`` for the keys)
        """
        # Only forward the optional filters that were actually supplied, so
        # empty values are never sent to the SDK.
        filters = {}
        if tags:
            filters["tags"] = tags
        if user_id:
            filters["user_id"] = user_id
        if session_id:
            filters["session_id"] = session_id

        # NOTE(review): ``project`` is forwarded as the SDK's ``name``
        # argument - confirm that ``name`` selects a project and not a trace
        # name. Also confirm the SDK accepts string timestamps.
        traces = self._client.fetch_traces(
            name=project,
            from_timestamp=from_timestamp,
            to_timestamp=to_timestamp,
            limit=limit,
            **filters,
        )

        # Convert to dict format
        return [self._trace_to_dict(trace) for trace in traces.data]

    def push_score(
        self,
        trace_id: str,
        name: str,
        value: float,
        comment: Optional[str] = None,
    ) -> None:
        """
        Push an evaluation score to a Langfuse trace.

        Args:
            trace_id: Langfuse trace ID
            name: Score name
            value: Score value (0.0-1.0)
            comment: Optional score comment/reasoning
        """
        self._client.score(
            trace_id=trace_id,
            name=name,
            value=value,
            comment=comment,
        )

    def fetch_dataset(
        self,
        name: str,
        version: Optional[str] = None,
    ) -> List[Dict[str, Any]]:
        """
        Fetch dataset items from Langfuse.

        Args:
            name: Dataset name
            version: Optional dataset version; when given, only items whose
                ``version`` attribute equals it are returned

        Returns:
            List of dataset item dictionaries
        """
        dataset = self._client.get_dataset(name)

        # Filter by version if specified
        # NOTE(review): assumes every dataset item exposes ``version`` -
        # confirm against the SDK's dataset item model.
        items = dataset.items
        if version:
            items = [item for item in items if item.version == version]

        return [self._dataset_item_to_dict(item) for item in items]

    @staticmethod
    def _attr_or(obj: Any, attr: str, fallback: Any) -> Any:
        """Return ``obj.attr``, or ``fallback`` when it is missing or None."""
        value = getattr(obj, attr, None)
        return fallback if value is None else value

    def _trace_to_dict(self, trace: Any) -> Dict[str, Any]:
        """
        Convert Langfuse trace object to dictionary.

        ``tags``, ``metadata`` and ``scores`` are always containers: a plain
        ``getattr`` default only covers a missing attribute, so attributes
        that exist but are None are normalised to an empty list/dict here.
        """
        return {
            "id": trace.id,
            "timestamp": trace.timestamp,
            "name": trace.name,
            "user_id": getattr(trace, "user_id", None),
            "session_id": getattr(trace, "session_id", None),
            "tags": self._attr_or(trace, "tags", []),
            "metadata": self._attr_or(trace, "metadata", {}),
            "input": getattr(trace, "input", None),
            "output": getattr(trace, "output", None),
            "scores": self._attr_or(trace, "scores", []),
        }

    def _dataset_item_to_dict(self, item: Any) -> Dict[str, Any]:
        """
        Convert Langfuse dataset item to dictionary.

        ``metadata`` is always a dict, even when the item carries None.
        """
        return {
            "id": item.id,
            "input": item.input,
            "expected_output": item.expected_output,
            "metadata": self._attr_or(item, "metadata", {}),
        }

    def flush(self) -> None:
        """Flush pending scores to Langfuse."""
        self._client.flush()
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
"""Configuration for fasteval-langfuse."""
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
from typing import Optional
|
|
5
|
+
|
|
6
|
+
from pydantic import BaseModel, Field
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class LangfuseConfig(BaseModel):
    """
    Settings for the fasteval Langfuse integration.

    The credentials and host fall back to environment variables when they
    are not passed explicitly.

    Attributes:
        public_key: Langfuse public API key; defaults to LANGFUSE_PUBLIC_KEY
        secret_key: Langfuse secret API key; defaults to LANGFUSE_SECRET_KEY
        host: Langfuse host URL; defaults to LANGFUSE_HOST, then to
            https://cloud.langfuse.com
        default_project: Project name used for traces when none is given
        auto_push_scores: Whether evaluation scores are pushed back to Langfuse
        batch_size: Number of traces fetched per batch
        max_parallel_evals: Upper bound on evaluations run in parallel
        retry_on_failure: Whether failed score pushes are retried
        score_name_prefix: String prepended to score names in Langfuse
    """

    public_key: Optional[str] = Field(
        default_factory=lambda: os.environ.get("LANGFUSE_PUBLIC_KEY")
    )
    secret_key: Optional[str] = Field(
        default_factory=lambda: os.environ.get("LANGFUSE_SECRET_KEY")
    )
    host: str = Field(
        default_factory=lambda: os.environ.get(
            "LANGFUSE_HOST", "https://cloud.langfuse.com"
        )
    )
    default_project: Optional[str] = None
    auto_push_scores: bool = True
    batch_size: int = 50
    max_parallel_evals: int = 5
    retry_on_failure: bool = True
    score_name_prefix: str = "fasteval_"
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
# Global configuration instance. Stays None until configure_langfuse() stores
# a config or get_config() lazily creates a default one.
_config: Optional[LangfuseConfig] = None
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def configure_langfuse(config: LangfuseConfig) -> None:
    """
    Install ``config`` as the module-wide Langfuse configuration.

    The given instance replaces any configuration stored earlier and is
    what ``get_config()`` returns afterwards.

    Args:
        config: LangfuseConfig instance to activate

    Example:
        from fasteval_langfuse import configure_langfuse, LangfuseConfig

        configure_langfuse(LangfuseConfig(
            public_key="pk-...",
            secret_key="sk-...",
            default_project="production"
        ))
    """
    # Rebind the module-level singleton rather than a local name.
    global _config
    _config = config
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def get_config() -> LangfuseConfig:
    """
    Return the active Langfuse configuration.

    When nothing has been configured yet, a default ``LangfuseConfig()`` is
    created, stored as the active configuration, and returned.

    Returns:
        The stored LangfuseConfig, or a newly created default one

    Example:
        config = get_config()
        print(config.default_project)
    """
    global _config
    if _config is not None:
        return _config
    # First access without explicit configuration: build and cache a default.
    _config = LangfuseConfig()
    return _config
|