langwatch 0.8.1__py3-none-any.whl → 0.10.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- langwatch/__init__.py +6 -3
- langwatch/__version__.py +1 -1
- langwatch/dspy/__init__.py +4 -32
- langwatch/evaluation/__init__.py +535 -7
- langwatch/evaluations.py +183 -353
- langwatch/experiment/__init__.py +108 -0
- langwatch/experiment/experiment.py +912 -0
- langwatch/experiment/platform_run.py +435 -0
- {langwatch-0.8.1.dist-info → langwatch-0.10.0.dist-info}/METADATA +1 -1
- {langwatch-0.8.1.dist-info → langwatch-0.10.0.dist-info}/RECORD +11 -9
- langwatch/evaluation/evaluation.py +0 -484
- {langwatch-0.8.1.dist-info → langwatch-0.10.0.dist-info}/WHEEL +0 -0
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
"""
|
|
2
|
+
langwatch.experiment - Run experiments on LangWatch platform or via SDK.
|
|
3
|
+
|
|
4
|
+
This module provides two ways to run experiments:
|
|
5
|
+
|
|
6
|
+
1. Platform experiments (CI/CD):
|
|
7
|
+
Run experiments configured in the LangWatch platform UI.
|
|
8
|
+
|
|
9
|
+
```python
|
|
10
|
+
import langwatch
|
|
11
|
+
|
|
12
|
+
result = langwatch.experiment.run("my-experiment-slug")
|
|
13
|
+
result.print_summary()
|
|
14
|
+
```
|
|
15
|
+
|
|
16
|
+
2. SDK-defined experiments:
|
|
17
|
+
Define and run experiments programmatically.
|
|
18
|
+
|
|
19
|
+
```python
|
|
20
|
+
import langwatch
|
|
21
|
+
|
|
22
|
+
experiment = langwatch.experiment.init("my-experiment")
|
|
23
|
+
|
|
24
|
+
for index, row in experiment.loop(df.iterrows(), threads=4):
|
|
25
|
+
async def task(index, row):
|
|
26
|
+
result = await my_agent(row["input"])
|
|
27
|
+
experiment.evaluate(
|
|
28
|
+
"langevals/exact_match",
|
|
29
|
+
index=index,
|
|
30
|
+
data={"output": result, "expected_output": row["expected"]},
|
|
31
|
+
settings={},
|
|
32
|
+
)
|
|
33
|
+
experiment.submit(task, index, row)
|
|
34
|
+
```
|
|
35
|
+
"""
|
|
36
|
+
from typing import Optional
|
|
37
|
+
|
|
38
|
+
# Re-export the Experiment class for SDK-defined experiments
|
|
39
|
+
from langwatch.experiment.experiment import Experiment
|
|
40
|
+
|
|
41
|
+
# Re-export the platform run function and related types
|
|
42
|
+
from langwatch.experiment.platform_run import (
|
|
43
|
+
run,
|
|
44
|
+
ExperimentRunResult,
|
|
45
|
+
ExperimentRunSummary,
|
|
46
|
+
ExperimentNotFoundError,
|
|
47
|
+
ExperimentTimeoutError,
|
|
48
|
+
ExperimentRunFailedError,
|
|
49
|
+
ExperimentsApiError,
|
|
50
|
+
TargetStats,
|
|
51
|
+
EvaluatorStats,
|
|
52
|
+
)
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def init(name: str, *, run_id: Optional[str] = None) -> Experiment:
    """Create and initialize an SDK-defined experiment run.

    Builds an :class:`Experiment` for programmatic evaluation — iterating a
    dataset, running evaluators, and logging metrics from your own code —
    and initializes it before handing it back.

    Args:
        name: Name for this experiment run.
        run_id: Optional custom run ID; auto-generated when omitted.

    Returns:
        An initialized :class:`Experiment` exposing:
            - ``loop()``: iterate dataset rows with parallel execution
            - ``evaluate()``: run an evaluator on the current row
            - ``log()``: log custom metrics
            - ``submit()``: submit async tasks

    Example:
        ```python
        import langwatch

        experiment = langwatch.experiment.init("my-experiment")

        for index, row in experiment.loop(df.iterrows(), threads=4):
            async def task(index, row):
                result = await my_agent(row["input"])
                experiment.evaluate(
                    "langevals/exact_match",
                    index=index,
                    data={"output": result, "expected_output": row["expected"]},
                    settings={},
                )
            experiment.submit(task, index, row)
        ```
    """
    sdk_experiment = Experiment(name, run_id=run_id)
    # Registers the run with the platform before the caller starts looping.
    sdk_experiment.init()
    return sdk_experiment
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
# Public API of langwatch.experiment: entry points, result types, and errors.
__all__ = [
    # Entry points
    "init",
    "run",
    "Experiment",
    # Platform-run results
    "ExperimentRunResult",
    "ExperimentRunSummary",
    # Error types raised by platform runs
    "ExperimentNotFoundError",
    "ExperimentTimeoutError",
    "ExperimentRunFailedError",
    "ExperimentsApiError",
    # Per-component statistics
    "TargetStats",
    "EvaluatorStats",
]
|