lmnr 0.4.53.dev0__py3-none-any.whl → 0.7.26__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to one of the supported registries. It is provided for informational purposes only and reflects the packages exactly as they appear in their public registries.
- lmnr/__init__.py +32 -11
- lmnr/cli/__init__.py +270 -0
- lmnr/cli/datasets.py +371 -0
- lmnr/cli/evals.py +111 -0
- lmnr/cli/rules.py +42 -0
- lmnr/opentelemetry_lib/__init__.py +70 -0
- lmnr/opentelemetry_lib/decorators/__init__.py +337 -0
- lmnr/opentelemetry_lib/litellm/__init__.py +685 -0
- lmnr/opentelemetry_lib/litellm/utils.py +100 -0
- lmnr/opentelemetry_lib/opentelemetry/instrumentation/anthropic/__init__.py +849 -0
- lmnr/opentelemetry_lib/opentelemetry/instrumentation/anthropic/config.py +13 -0
- lmnr/opentelemetry_lib/opentelemetry/instrumentation/anthropic/event_emitter.py +211 -0
- lmnr/opentelemetry_lib/opentelemetry/instrumentation/anthropic/event_models.py +41 -0
- lmnr/opentelemetry_lib/opentelemetry/instrumentation/anthropic/span_utils.py +401 -0
- lmnr/opentelemetry_lib/opentelemetry/instrumentation/anthropic/streaming.py +425 -0
- lmnr/opentelemetry_lib/opentelemetry/instrumentation/anthropic/utils.py +332 -0
- lmnr/opentelemetry_lib/opentelemetry/instrumentation/anthropic/version.py +1 -0
- lmnr/opentelemetry_lib/opentelemetry/instrumentation/claude_agent/__init__.py +451 -0
- lmnr/opentelemetry_lib/opentelemetry/instrumentation/claude_agent/proxy.py +144 -0
- lmnr/opentelemetry_lib/opentelemetry/instrumentation/cua_agent/__init__.py +100 -0
- lmnr/opentelemetry_lib/opentelemetry/instrumentation/cua_computer/__init__.py +476 -0
- lmnr/opentelemetry_lib/opentelemetry/instrumentation/cua_computer/utils.py +12 -0
- lmnr/opentelemetry_lib/opentelemetry/instrumentation/google_genai/__init__.py +599 -0
- lmnr/opentelemetry_lib/opentelemetry/instrumentation/google_genai/config.py +9 -0
- lmnr/opentelemetry_lib/opentelemetry/instrumentation/google_genai/schema_utils.py +26 -0
- lmnr/opentelemetry_lib/opentelemetry/instrumentation/google_genai/utils.py +330 -0
- lmnr/opentelemetry_lib/opentelemetry/instrumentation/groq/__init__.py +488 -0
- lmnr/opentelemetry_lib/opentelemetry/instrumentation/groq/config.py +8 -0
- lmnr/opentelemetry_lib/opentelemetry/instrumentation/groq/event_emitter.py +143 -0
- lmnr/opentelemetry_lib/opentelemetry/instrumentation/groq/event_models.py +41 -0
- lmnr/opentelemetry_lib/opentelemetry/instrumentation/groq/span_utils.py +229 -0
- lmnr/opentelemetry_lib/opentelemetry/instrumentation/groq/utils.py +92 -0
- lmnr/opentelemetry_lib/opentelemetry/instrumentation/groq/version.py +1 -0
- lmnr/opentelemetry_lib/opentelemetry/instrumentation/kernel/__init__.py +381 -0
- lmnr/opentelemetry_lib/opentelemetry/instrumentation/kernel/utils.py +36 -0
- lmnr/opentelemetry_lib/opentelemetry/instrumentation/langgraph/__init__.py +121 -0
- lmnr/opentelemetry_lib/opentelemetry/instrumentation/langgraph/utils.py +60 -0
- lmnr/opentelemetry_lib/opentelemetry/instrumentation/openai/__init__.py +61 -0
- lmnr/opentelemetry_lib/opentelemetry/instrumentation/openai/shared/__init__.py +472 -0
- lmnr/opentelemetry_lib/opentelemetry/instrumentation/openai/shared/chat_wrappers.py +1185 -0
- lmnr/opentelemetry_lib/opentelemetry/instrumentation/openai/shared/completion_wrappers.py +305 -0
- lmnr/opentelemetry_lib/opentelemetry/instrumentation/openai/shared/config.py +16 -0
- lmnr/opentelemetry_lib/opentelemetry/instrumentation/openai/shared/embeddings_wrappers.py +312 -0
- lmnr/opentelemetry_lib/opentelemetry/instrumentation/openai/shared/event_emitter.py +100 -0
- lmnr/opentelemetry_lib/opentelemetry/instrumentation/openai/shared/event_models.py +41 -0
- lmnr/opentelemetry_lib/opentelemetry/instrumentation/openai/shared/image_gen_wrappers.py +68 -0
- lmnr/opentelemetry_lib/opentelemetry/instrumentation/openai/utils.py +197 -0
- lmnr/opentelemetry_lib/opentelemetry/instrumentation/openai/v0/__init__.py +176 -0
- lmnr/opentelemetry_lib/opentelemetry/instrumentation/openai/v1/__init__.py +368 -0
- lmnr/opentelemetry_lib/opentelemetry/instrumentation/openai/v1/assistant_wrappers.py +325 -0
- lmnr/opentelemetry_lib/opentelemetry/instrumentation/openai/v1/event_handler_wrapper.py +135 -0
- lmnr/opentelemetry_lib/opentelemetry/instrumentation/openai/v1/responses_wrappers.py +786 -0
- lmnr/opentelemetry_lib/opentelemetry/instrumentation/openai/version.py +1 -0
- lmnr/opentelemetry_lib/opentelemetry/instrumentation/openhands_ai/__init__.py +388 -0
- lmnr/opentelemetry_lib/opentelemetry/instrumentation/opentelemetry/__init__.py +69 -0
- lmnr/opentelemetry_lib/opentelemetry/instrumentation/skyvern/__init__.py +191 -0
- lmnr/opentelemetry_lib/opentelemetry/instrumentation/threading/__init__.py +197 -0
- lmnr/opentelemetry_lib/tracing/__init__.py +263 -0
- lmnr/opentelemetry_lib/tracing/_instrument_initializers.py +516 -0
- lmnr/{openllmetry_sdk → opentelemetry_lib}/tracing/attributes.py +21 -8
- lmnr/opentelemetry_lib/tracing/context.py +200 -0
- lmnr/opentelemetry_lib/tracing/exporter.py +153 -0
- lmnr/opentelemetry_lib/tracing/instruments.py +140 -0
- lmnr/opentelemetry_lib/tracing/processor.py +193 -0
- lmnr/opentelemetry_lib/tracing/span.py +398 -0
- lmnr/opentelemetry_lib/tracing/tracer.py +57 -0
- lmnr/opentelemetry_lib/tracing/utils.py +62 -0
- lmnr/opentelemetry_lib/utils/package_check.py +18 -0
- lmnr/opentelemetry_lib/utils/wrappers.py +11 -0
- lmnr/sdk/browser/__init__.py +0 -0
- lmnr/sdk/browser/background_send_events.py +158 -0
- lmnr/sdk/browser/browser_use_cdp_otel.py +100 -0
- lmnr/sdk/browser/browser_use_otel.py +142 -0
- lmnr/sdk/browser/bubus_otel.py +71 -0
- lmnr/sdk/browser/cdp_utils.py +518 -0
- lmnr/sdk/browser/inject_script.js +514 -0
- lmnr/sdk/browser/patchright_otel.py +151 -0
- lmnr/sdk/browser/playwright_otel.py +322 -0
- lmnr/sdk/browser/pw_utils.py +363 -0
- lmnr/sdk/browser/recorder/record.umd.min.cjs +84 -0
- lmnr/sdk/browser/utils.py +70 -0
- lmnr/sdk/client/asynchronous/async_client.py +180 -0
- lmnr/sdk/client/asynchronous/resources/__init__.py +6 -0
- lmnr/sdk/client/asynchronous/resources/base.py +32 -0
- lmnr/sdk/client/asynchronous/resources/browser_events.py +41 -0
- lmnr/sdk/client/asynchronous/resources/datasets.py +131 -0
- lmnr/sdk/client/asynchronous/resources/evals.py +266 -0
- lmnr/sdk/client/asynchronous/resources/evaluators.py +85 -0
- lmnr/sdk/client/asynchronous/resources/tags.py +83 -0
- lmnr/sdk/client/synchronous/resources/__init__.py +6 -0
- lmnr/sdk/client/synchronous/resources/base.py +32 -0
- lmnr/sdk/client/synchronous/resources/browser_events.py +40 -0
- lmnr/sdk/client/synchronous/resources/datasets.py +131 -0
- lmnr/sdk/client/synchronous/resources/evals.py +263 -0
- lmnr/sdk/client/synchronous/resources/evaluators.py +85 -0
- lmnr/sdk/client/synchronous/resources/tags.py +83 -0
- lmnr/sdk/client/synchronous/sync_client.py +191 -0
- lmnr/sdk/datasets/__init__.py +94 -0
- lmnr/sdk/datasets/file_utils.py +91 -0
- lmnr/sdk/decorators.py +163 -26
- lmnr/sdk/eval_control.py +3 -2
- lmnr/sdk/evaluations.py +403 -191
- lmnr/sdk/laminar.py +1080 -549
- lmnr/sdk/log.py +7 -2
- lmnr/sdk/types.py +246 -134
- lmnr/sdk/utils.py +151 -7
- lmnr/version.py +46 -0
- {lmnr-0.4.53.dev0.dist-info → lmnr-0.7.26.dist-info}/METADATA +152 -106
- lmnr-0.7.26.dist-info/RECORD +116 -0
- lmnr-0.7.26.dist-info/WHEEL +4 -0
- lmnr-0.7.26.dist-info/entry_points.txt +3 -0
- lmnr/cli.py +0 -101
- lmnr/openllmetry_sdk/.python-version +0 -1
- lmnr/openllmetry_sdk/__init__.py +0 -72
- lmnr/openllmetry_sdk/config/__init__.py +0 -9
- lmnr/openllmetry_sdk/decorators/base.py +0 -185
- lmnr/openllmetry_sdk/instruments.py +0 -38
- lmnr/openllmetry_sdk/tracing/__init__.py +0 -1
- lmnr/openllmetry_sdk/tracing/content_allow_list.py +0 -24
- lmnr/openllmetry_sdk/tracing/context_manager.py +0 -13
- lmnr/openllmetry_sdk/tracing/tracing.py +0 -884
- lmnr/openllmetry_sdk/utils/in_memory_span_exporter.py +0 -61
- lmnr/openllmetry_sdk/utils/package_check.py +0 -7
- lmnr/openllmetry_sdk/version.py +0 -1
- lmnr/sdk/datasets.py +0 -55
- lmnr-0.4.53.dev0.dist-info/LICENSE +0 -75
- lmnr-0.4.53.dev0.dist-info/RECORD +0 -33
- lmnr-0.4.53.dev0.dist-info/WHEEL +0 -4
- lmnr-0.4.53.dev0.dist-info/entry_points.txt +0 -3
- /lmnr/{openllmetry_sdk → opentelemetry_lib}/.flake8 +0 -0
- /lmnr/{openllmetry_sdk → opentelemetry_lib}/utils/__init__.py +0 -0
- /lmnr/{openllmetry_sdk → opentelemetry_lib}/utils/json_encoder.py +0 -0
- /lmnr/{openllmetry_sdk/decorators/__init__.py → py.typed} +0 -0
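Most of the file-level churn above comes from renaming the vendored tracing layer from `lmnr/openllmetry_sdk` to `lmnr/opentelemetry_lib`. A minimal sketch of what that rename means for code that imported from the old internal path; `SPAN_TYPE` is one symbol known (from the diff below) to live in the moved `attributes.py`:

```python
# Before, on 0.4.x (old internal module path; shown commented out because
# it no longer exists in 0.7.x):
# from lmnr.openllmetry_sdk.tracing.attributes import SPAN_TYPE

# After, on 0.7.x: the same module under the renamed package.
from lmnr.opentelemetry_lib.tracing.attributes import SPAN_TYPE
```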
lmnr/sdk/evaluations.py CHANGED

```diff
@@ -1,44 +1,59 @@
 import asyncio
 import re
-import sys
 import uuid
 
+from typing import Any
+from typing_extensions import TypedDict
+
 from tqdm import tqdm
-from typing import Any, Awaitable, Optional, Set, Union
 
-from
-from
+from lmnr.opentelemetry_lib.tracing.instruments import Instruments
+from lmnr.opentelemetry_lib.tracing.attributes import HUMAN_EVALUATOR_OPTIONS, SPAN_TYPE
 
-from .
-from .
-from .
-from .
-from .
+from lmnr.sdk.client.asynchronous.async_client import AsyncLaminarClient
+from lmnr.sdk.client.synchronous.sync_client import LaminarClient
+from lmnr.sdk.datasets import EvaluationDataset, LaminarDataset
+from lmnr.sdk.eval_control import EVALUATION_INSTANCES, PREPARE_ONLY
+from lmnr.sdk.laminar import Laminar as L
+from lmnr.sdk.log import get_default_logger
+from lmnr.sdk.types import (
     Datapoint,
+    EvaluationDatapointDatasetLink,
     EvaluationResultDatapoint,
     EvaluatorFunction,
     ExecutorFunction,
     HumanEvaluator,
     Numeric,
     NumericTypes,
+    PartialEvaluationDatapoint,
     SpanType,
     TraceType,
 )
-from .utils import is_async
+from lmnr.sdk.utils import from_env, is_async, json_dumps
 
 DEFAULT_BATCH_SIZE = 5
+MAX_EXPORT_BATCH_SIZE = 64
+
+
+class EvaluationRunResult(TypedDict):
+    average_scores: dict[str, Numeric]
+    evaluation_id: uuid.UUID
+    project_id: uuid.UUID
+    url: str
+    error_message: str | None
 
 
 def get_evaluation_url(
-    project_id: str, evaluation_id: str, base_url: str =
+    project_id: str, evaluation_id: str, base_url: str | None = None
 ):
+    if not base_url or base_url == "https://api.lmnr.ai":
+        base_url = "https://www.lmnr.ai"
+
     url = base_url
-
-        url = url[:-1]
+    url = re.sub(r"\/$", "", url)
     if url.endswith("localhost") or url.endswith("127.0.0.1"):
-        # We best effort assume that the frontend is running on port
-
-        url = url + ":3000"
+        # We best effort assume that the frontend is running on port 5667
+        url = url + ":5667"
     return f"{url}/project/{project_id}/evaluations/{evaluation_id}"
 
 
```
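The URL helper now maps the default API host to the web-app host and assumes port 5667 (instead of 3000) for local frontends. A small sketch of the resulting behavior, assuming lmnr 0.7.26 is installed and using invented project and evaluation IDs:

```python
from lmnr.sdk.evaluations import get_evaluation_url

# The default or public API host is rewritten to the web-app host:
print(get_evaluation_url("proj-id", "eval-id"))
# -> https://www.lmnr.ai/project/proj-id/evaluations/eval-id

# Local hosts get the assumed frontend port 5667 appended:
print(get_evaluation_url("proj-id", "eval-id", "http://localhost"))
# -> http://localhost:5667/project/proj-id/evaluations/eval-id
```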
```diff
@@ -52,13 +67,17 @@ def get_average_scores(results: list[EvaluationResultDatapoint]) -> dict[str, Nu
 
     average_scores = {}
     for key, values in per_score_values.items():
-
+        scores = [v for v in values if v is not None]
+
+        # If there are no scores, we don't want to include the key in the average scores
+        if len(scores) > 0:
+            average_scores[key] = sum(scores) / len(scores)
 
     return average_scores
 
 
 class EvaluationReporter:
-    def __init__(self, base_url
+    def __init__(self, base_url):
         self.base_url = base_url
 
     def start(self, length: int):
```
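Score averaging now filters out `None` values, which human evaluators record. A toy restatement of the new aggregation rule with invented score values:

```python
# Keys whose values are all None (e.g. human-only evaluators) are dropped
# from the averages instead of crashing or skewing the mean.
per_score_values = {"accuracy": [1, 0, None], "style": [None, None]}
average_scores = {}
for key, values in per_score_values.items():
    scores = [v for v in values if v is not None]
    if len(scores) > 0:
        average_scores[key] = sum(scores) / len(scores)
print(average_scores)  # {'accuracy': 0.5}
```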
```diff
@@ -71,89 +90,107 @@ class EvaluationReporter:
     def update(self, batch_length: int):
         self.cli_progress.update(batch_length)
 
-    def
-        self
-
+    def stop_with_error(self, error: Exception):
+        if hasattr(self, "cli_progress"):
+            self.cli_progress.close()
+        raise error
 
     def stop(
         self, average_scores: dict[str, Numeric], project_id: str, evaluation_id: str
     ):
         self.cli_progress.close()
-        print(
-            f"\nCheck the results at {get_evaluation_url(project_id, evaluation_id, self.base_url)}\n"
-        )
         print("Average scores:")
         for name, score in average_scores.items():
             print(f"{name}: {score}")
-        print(
+        print(
+            f"Check the results at {get_evaluation_url(project_id, evaluation_id, self.base_url)}\n"
+        )
 
 
 class Evaluation:
     def __init__(
         self,
-        data:
+        data: EvaluationDataset | list[Datapoint | dict],
         executor: Any,
-        evaluators: dict[str, EvaluatorFunction],
-
-
-
-
-        project_api_key:
-        base_url:
-
-
-
+        evaluators: dict[str, EvaluatorFunction | HumanEvaluator],
+        name: str | None = None,
+        group_name: str | None = None,
+        metadata: dict[str, Any] | None = None,
+        concurrency_limit: int = DEFAULT_BATCH_SIZE,
+        project_api_key: str | None = None,
+        base_url: str | None = None,
+        base_http_url: str | None = None,
+        http_port: int | None = None,
+        grpc_port: int | None = None,
+        instruments: (
+            set[Instruments] | list[Instruments] | tuple[Instruments] | None
+        ) = None,
+        disabled_instruments: (
+            set[Instruments] | list[Instruments] | tuple[Instruments] | None
+        ) = None,
+        max_export_batch_size: int | None = MAX_EXPORT_BATCH_SIZE,
+        trace_export_timeout_seconds: int | None = None,
     ):
         """
-        Initializes an instance of the
+        Initializes an instance of the Evaluation class.
 
         Parameters:
-            data (
+            data (list[Datapoint|dict] | EvaluationDataset):\
                 List of data points to evaluate or an evaluation dataset.
-
-
+                `data` is the input to the executor function.
+                `target` is the input to the evaluator function.
+                `metadata` is optional metadata to associate with the\
+                datapoint.
             executor (Callable[..., Any]): The executor function.\
-
-
-            evaluators (dict[str, Callable[..., Any]]): Evaluator
-
-
-
-                If the score is a single number
-
-
-
-
-
-                Defaults to an empty list.
-            name (Optional[str], optional): Optional name of the evaluation.\
+                Takes the data point + any additional arguments and returns\
+                the output to evaluate.
+            evaluators (dict[str, Callable[..., Any] | HumanEvaluator]): Evaluator\
+                functions and HumanEvaluator instances with names. Each evaluator\
+                function takes the output of the executor _and_ the target data,\
+                and returns a score. The score can be a single number or a dict\
+                of string keys and number values. If the score is a single number,\
+                it will be named after the evaluator function.\
+                HumanEvaluator instances create empty spans for manual evaluation.\
+                Evaluator names must contain only letters, digits, hyphens,\
+                underscores, or spaces.
+            name (str | None, optional): Optional name of the evaluation.\
                 Used to identify the evaluation in the group.\
                 If not provided, a random name will be generated.
                 Defaults to None.
-
-                evaluations. Only evaluations within the same
+            group_name (str | None, optional): an identifier to group\
+                evaluations. Only evaluations within the same group_name can be\
                 visually compared. If not provided, "default" is assigned.
                 Defaults to None
-
-
+            metadata (dict[str, Any] | None): optional metadata to associate with\
+            concurrency_limit (int, optional): The concurrency limit for\
+                evaluation. This many data points will be evaluated in parallel\
+                with a pool of workers.
                 Defaults to DEFAULT_BATCH_SIZE.
-            project_api_key (
+            project_api_key (str | None, optional): The project API key.\
                 If not provided, LMNR_PROJECT_API_KEY environment variable is\
                 used.
                 Defaults to an empty string.
-            base_url (
+            base_url (str | None, optional): The base URL for Laminar API.\
                 Useful if self-hosted. Do NOT include the port, use `http_port`\
                 and `grpc_port` instead.
                 Defaults to "https://api.lmnr.ai".
-
+            base_http_url (str | None, optional): The base HTTP URL for Laminar API.\
+                Only set this if your Laminar backend HTTP is proxied\
+                through a different host. If not specified, defaults\
+                to https://api.lmnr.ai.
+            http_port (int | None, optional): The port for Laminar API\
                 HTTP service. Defaults to 443 if not specified.
-            grpc_port (
+            grpc_port (int | None, optional): The port for Laminar API\
                 gRPC service. Defaults to 8443 if not specified.
-            instruments (
+            instruments (set[Instruments] | None, optional): Set of modules\
                 to auto-instrument. If None, all available instruments will be\
                 used.
                 See https://docs.lmnr.ai/tracing/automatic-instrumentation
                 Defaults to None.
+            disabled_instruments (set[Instruments] | None, optional): Set of modules\
+                to disable auto-instrumentations. If None, only modules passed\
+                as `instruments` will be disabled.
+                Defaults to None.
         """
 
         if not evaluators:
```
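The docstring above documents the widened evaluator typing. A minimal sketch of an `evaluators` dict that mixes an automatic function with a human-review slot; the import path is taken from this file's own imports, and the no-argument `HumanEvaluator()` construction is an assumption (this diff only shows that its `options` may be empty):

```python
from lmnr.sdk.types import HumanEvaluator

def exact_match(output, target) -> int:
    # Automatic evaluator: a single number, stored under the key "exact_match".
    return int(output == target)

evaluators = {
    "exact_match": exact_match,
    # Creates an empty HUMAN_EVALUATOR span to be scored manually in the UI;
    # its score is recorded as None during the run (constructor args omitted here).
    "human_review": HumanEvaluator(),
}
```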
```diff
@@ -168,7 +205,8 @@ class Evaluation:
                 "underscores, or spaces."
             )
 
-
+        base_url = base_url or from_env("LMNR_BASE_URL") or "https://api.lmnr.ai"
+
         self.reporter = EvaluationReporter(base_url)
         if isinstance(data, list):
             self.data = [
```
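This hunk makes the API host configurable via the environment. A small sketch of the resolution order, restated with `os.environ` for illustration (the code itself uses the SDK's `from_env` helper, and the host value is invented):

```python
import os

# Resolution order introduced above:
# explicit argument > LMNR_BASE_URL env var > the public default.
os.environ["LMNR_BASE_URL"] = "https://laminar.example.internal"  # hypothetical self-hosted URL

base_url = None  # what a caller passes, or omits
resolved = base_url or os.environ.get("LMNR_BASE_URL") or "https://api.lmnr.ai"
print(resolved)  # https://laminar.example.internal
```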
```diff
@@ -177,212 +215,386 @@ class Evaluation:
             ]
         else:
             self.data = data
+        if not isinstance(self.data, LaminarDataset) and len(self.data) == 0:
+            raise ValueError("No data provided. Skipping evaluation")
         self.executor = executor
         self.evaluators = evaluators
-        self.
+        self.group_name = group_name
         self.name = name
-        self.
+        self.metadata = metadata
+        self.concurrency_limit = concurrency_limit
+        self.batch_size = concurrency_limit
         self._logger = get_default_logger(self.__class__.__name__)
-        self.
+        self.upload_tasks = []
+        self.base_http_url = f"{base_http_url or base_url}:{http_port or 443}"
+
+        api_key = project_api_key or from_env("LMNR_PROJECT_API_KEY")
+        if not api_key and not L.is_initialized():
+            raise ValueError(
+                "Please pass the project API key to `evaluate`"
+                " or set the LMNR_PROJECT_API_KEY environment variable"
+                " in your environment or .env file"
+            )
+        self.project_api_key = api_key
+
+        if L.is_initialized():
+            self.client = AsyncLaminarClient(
+                base_url=L.get_base_http_url(),
+                project_api_key=L.get_project_api_key(),
+            )
+        else:
+            self.client = AsyncLaminarClient(
+                base_url=self.base_http_url,
+                project_api_key=self.project_api_key,
+            )
+
         L.initialize(
             project_api_key=project_api_key,
             base_url=base_url,
+            base_http_url=self.base_http_url,
             http_port=http_port,
             grpc_port=grpc_port,
             instruments=instruments,
+            disabled_instruments=disabled_instruments,
+            max_export_batch_size=max_export_batch_size,
+            export_timeout_seconds=trace_export_timeout_seconds,
         )
 
-    async def run(self) ->
-        if self.is_finished:
-            raise Exception("Evaluation is already finished")
+    async def run(self) -> EvaluationRunResult:
         return await self._run()
 
-    async def _run(self) ->
-
+    async def _run(self) -> EvaluationRunResult:
+        if isinstance(self.data, LaminarDataset):
+            self.data.set_client(
+                LaminarClient(
+                    base_url=self.base_http_url,
+                    project_api_key=self.project_api_key,
+                )
+            )
+            if not self.data.id:
+                try:
+                    datasets = await self.client.datasets.get_dataset_by_name(
+                        self.data.name
+                    )
+                    if len(datasets) == 0:
+                        self._logger.warning(f"Dataset {self.data.name} not found")
+                    else:
+                        self.data.id = datasets[0].id
+                except Exception as e:
+                    # Backward compatibility with old Laminar API (self hosted)
+                    self._logger.warning(f"Error getting dataset {self.data.name}: {e}")
 
         try:
-
+            evaluation = await self.client.evals.init(
+                name=self.name, group_name=self.group_name, metadata=self.metadata
+            )
+            evaluation_id = evaluation.id
+            project_id = evaluation.projectId
+            url = get_evaluation_url(project_id, evaluation_id, self.reporter.base_url)
+
+            print(f"Check the results at {url}")
+
+            self.reporter.start(len(self.data))
+            result_datapoints = await self._evaluate_in_batches(evaluation.id)
+            # Wait for all background upload tasks to complete
+            if self.upload_tasks:
+                self._logger.debug(
+                    f"Waiting for {len(self.upload_tasks)} upload tasks to complete"
+                )
+                await asyncio.gather(*self.upload_tasks)
+                self._logger.debug("All upload tasks completed")
         except Exception as e:
-            self.
-            self.
-
-
-        # For now add all human evaluators to all result datapoints
-        # In the future, we will add ways to specify which human evaluators
-        # to add to which result datapoints, e.g. sample some randomly
-        for result_datapoint in result_datapoints:
-            result_datapoint.human_evaluators = self.human_evaluators or {}
-
-        evaluation = await L.create_evaluation(
-            data=result_datapoints, group_id=self.group_id, name=self.name
-        )
+            await self._shutdown()
+            self.reporter.stop_with_error(e)
+
         average_scores = get_average_scores(result_datapoints)
         self.reporter.stop(average_scores, evaluation.projectId, evaluation.id)
-        self.
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        self,
+        await self._shutdown()
+        return {
+            "average_scores": average_scores,
+            "evaluation_id": evaluation_id,
+            "project_id": project_id,
+            "url": url,
+            "error_message": None,
+        }
+
+    async def _shutdown(self):
+        # We use flush() instead of shutdown() because multiple evaluations
+        # can be run sequentially in the same process. `shutdown()` would
+        # close the OTLP exporter and we wouldn't be able to export traces in
+        # the next evaluation.
+        L.flush()
+        await self.client.close()
+        if isinstance(self.data, LaminarDataset) and self.data.client:
+            self.data.client.close()
+
+    async def _evaluate_in_batches(
+        self, eval_id: uuid.UUID
     ) -> list[EvaluationResultDatapoint]:
-
-
-
+
+        semaphore = asyncio.Semaphore(self.concurrency_limit)
+        tasks = []
+        data_iter = self.data if isinstance(self.data, list) else range(len(self.data))
+
+        async def evaluate_task(datapoint, index):
+            try:
+                result = await self._evaluate_datapoint(eval_id, datapoint, index)
+                self.reporter.update(1)
+                return index, result
+            finally:
+                semaphore.release()
+
+        # Create tasks only after acquiring semaphore
+        for idx, item in enumerate(data_iter):
+            await semaphore.acquire()
+            datapoint = item if isinstance(self.data, list) else self.data[item]
+            task = asyncio.create_task(evaluate_task(datapoint, idx))
+            tasks.append(task)
+
+        # Wait for all tasks to complete and preserve order
+        results = await asyncio.gather(*tasks)
+        ordered_results = [result for _, result in sorted(results, key=lambda x: x[0])]
+
+        return ordered_results
 
     async def _evaluate_datapoint(
-        self, datapoint: Datapoint
+        self, eval_id: uuid.UUID, datapoint: Datapoint, index: int
     ) -> EvaluationResultDatapoint:
+        evaluation_id = uuid.uuid4()
         with L.start_as_current_span("evaluation") as evaluation_span:
             L._set_trace_type(trace_type=TraceType.EVALUATION)
             evaluation_span.set_attribute(SPAN_TYPE, SpanType.EVALUATION.value)
             with L.start_as_current_span(
                 "executor", input={"data": datapoint.data}
             ) as executor_span:
-                executor_span.set_attribute(SPAN_TYPE, SpanType.EXECUTOR.value)
-                output = (
-                    await self.executor(datapoint.data)
-                    if is_async(self.executor)
-                    else self.executor(datapoint.data)
-                )
-                L.set_span_output(output)
                 executor_span_id = uuid.UUID(
                     int=executor_span.get_span_context().span_id
                 )
+                trace_id = uuid.UUID(int=executor_span.get_span_context().trace_id)
+
+                partial_datapoint = PartialEvaluationDatapoint(
+                    id=evaluation_id,
+                    data=datapoint.data,
+                    target=datapoint.target,
+                    index=index,
+                    trace_id=trace_id,
+                    executor_span_id=executor_span_id,
+                    metadata=datapoint.metadata,
+                )
+                if isinstance(self.data, LaminarDataset):
+                    partial_datapoint.dataset_link = EvaluationDatapointDatasetLink(
+                        dataset_id=self.data.id,
+                        datapoint_id=datapoint.id,
+                        created_at=datapoint.created_at,
+                    )
+                # First, create datapoint with trace_id so that we can show the dp in the UI
+                await self.client.evals.save_datapoints(
+                    eval_id, [partial_datapoint], self.group_name
+                )
+                executor_span.set_attribute(SPAN_TYPE, SpanType.EXECUTOR.value)
+                # Run synchronous executors in a thread pool to avoid blocking
+                if not is_async(self.executor):
+                    loop = asyncio.get_event_loop()
+                    output = await loop.run_in_executor(
+                        None, self.executor, datapoint.data
+                    )
+                else:
+                    output = await self.executor(datapoint.data)
+
+                L.set_span_output(output)
             target = datapoint.target
 
             # Iterate over evaluators
             scores: dict[str, Numeric] = {}
             for evaluator_name, evaluator in self.evaluators.items():
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+                # Check if evaluator is a HumanEvaluator instance
+                if isinstance(evaluator, HumanEvaluator):
+                    # Create an empty span for human evaluators
+                    with L.start_as_current_span(
+                        evaluator_name, input={"output": output, "target": target}
+                    ) as human_evaluator_span:
+                        human_evaluator_span.set_attribute(
+                            SPAN_TYPE, SpanType.HUMAN_EVALUATOR.value
+                        )
+                        if evaluator.options:
+                            human_evaluator_span.set_attribute(
+                                HUMAN_EVALUATOR_OPTIONS, json_dumps(evaluator.options)
+                            )
+                        # Human evaluators don't execute automatically, just create the span
+                        L.set_span_output(None)
+
+                    # We don't want to save the score for human evaluators
+                    scores[evaluator_name] = None
                 else:
-
+                    # Regular evaluator function
+                    with L.start_as_current_span(
+                        evaluator_name, input={"output": output, "target": target}
+                    ) as evaluator_span:
+                        evaluator_span.set_attribute(
+                            SPAN_TYPE, SpanType.EVALUATOR.value
+                        )
+                        if is_async(evaluator):
+                            value = await evaluator(output, target)
+                        else:
+                            loop = asyncio.get_event_loop()
+                            value = await loop.run_in_executor(
+                                None, evaluator, output, target
+                            )
+                        L.set_span_output(value)
+
+                    # If evaluator returns a single number, use evaluator name as key
+                    if isinstance(value, NumericTypes):
+                        scores[evaluator_name] = value
+                    else:
+                        scores.update(value)
 
             trace_id = uuid.UUID(int=evaluation_span.get_span_context().trace_id)
-
-
-
-
-
-
-
+
+        eval_datapoint = EvaluationResultDatapoint(
+            id=evaluation_id,
+            data=datapoint.data,
+            target=target,
+            executor_output=output,
+            scores=scores,
+            trace_id=trace_id,
+            executor_span_id=executor_span_id,
+            index=index,
+            metadata=datapoint.metadata,
+        )
+        if isinstance(self.data, LaminarDataset):
+            eval_datapoint.dataset_link = EvaluationDatapointDatasetLink(
+                dataset_id=self.data.id,
+                datapoint_id=datapoint.id,
+                created_at=datapoint.created_at,
+            )
+
+        # Create background upload task without awaiting it
+        upload_task = asyncio.create_task(
+            self.client.evals.save_datapoints(
+                eval_id, [eval_datapoint], self.group_name
             )
+        )
+        self.upload_tasks.append(upload_task)
+
+        return eval_datapoint
 
 
 def evaluate(
-    data:
+    data: EvaluationDataset | list[Datapoint | dict],
     executor: ExecutorFunction,
-    evaluators: dict[str, EvaluatorFunction],
-
-
-
-
-    project_api_key:
-    base_url:
-
-
-
-
+    evaluators: dict[str, EvaluatorFunction | HumanEvaluator],
+    name: str | None = None,
+    group_name: str | None = None,
+    metadata: dict[str, Any] | None = None,
+    concurrency_limit: int = DEFAULT_BATCH_SIZE,
+    project_api_key: str | None = None,
+    base_url: str | None = None,
+    base_http_url: str | None = None,
+    http_port: int | None = None,
+    grpc_port: int | None = None,
+    instruments: (
+        set[Instruments] | list[Instruments] | tuple[Instruments] | None
+    ) = None,
+    disabled_instruments: (
+        set[Instruments] | list[Instruments] | tuple[Instruments] | None
+    ) = None,
+    max_export_batch_size: int | None = MAX_EXPORT_BATCH_SIZE,
+    trace_export_timeout_seconds: int | None = None,
+) -> EvaluationRunResult | None:
     """
     If added to the file which is called through `lmnr eval` command, then
     registers the evaluation; otherwise, runs the evaluation.
 
     If there is no event loop, creates it and runs the evaluation until
     completion.
-    If there is an event loop,
-
+    If there is an event loop, returns an awaitable handle immediately. IMPORTANT:
+    You must await the call to `evaluate`.
 
     Parameters:
-        data (
-
-
-
+        data (list[EvaluationDatapoint|dict] | EvaluationDataset):\
+            List of data points to evaluate or an evaluation dataset.
+            `data` is the input to the executor function,
+            `target` is the input to the evaluator function.
         executor (Callable[..., Any]): The executor function.\
-
-
-        evaluators (
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            Defaults to None
-        batch_size (int, optional): The batch size for evaluation.
+            Takes the data point + any additional arguments\
+            and returns the output to evaluate.
+        evaluators (dict[str, Callable[..., Any] | HumanEvaluator]): Evaluator\
+            functions and HumanEvaluator instances with names. Each evaluator\
+            function takes the output of the executor _and_ the target data,\
+            and returns a score. The score can be a single number or a dict\
+            of string keys and number values. If the score is a single number,\
+            it will be named after the evaluator function.\
+            HumanEvaluator instances create empty spans for manual evaluation.\
+            Evaluator function names must contain only letters, digits, hyphens,\
+            underscores, or spaces.
+        name (str | None, optional): Optional name of the evaluation.\
+            Used to identify the evaluation in the group. If not provided, a\
+            random name will be generated.
+            Defaults to None.
+        group_name (str | None, optional): An identifier to group evaluations.\
+            Only evaluations within the same group_name can be visually compared.\
+            If not provided, set to "default".
+            Defaults to None
+        metadata (dict[str, Any] | None, optional): Optional metadata to associate with\
+        concurrency_limit (int, optional): The concurrency limit for evaluation.
         Defaults to DEFAULT_BATCH_SIZE.
-        project_api_key (
+        project_api_key (str | None, optional): The project API key.
         Defaults to None.
-        base_url (
+        base_url (str | None, optional): The base URL for Laminar API.\
             Useful if self-hosted elsewhere. Do NOT include the\
             port, use `http_port` and `grpc_port` instead.
             Defaults to "https://api.lmnr.ai".
-
+        base_http_url (str | None, optional): The base HTTP URL for Laminar API.\
+            Only set this if your Laminar backend HTTP is proxied\
+            through a different host. If not specified, defaults\
+            to https://api.lmnr.ai.
+        http_port (int | None, optional): The port for Laminar API's HTTP\
             service. 443 is used if not specified.
             Defaults to None.
-        grpc_port (
+        grpc_port (int | None, optional): The port for Laminar API's gRPC\
             service. 8443 is used if not specified.
             Defaults to None.
-        instruments (
+        instruments (set[Instruments] | None, optional): Set of modules to\
            auto-instrument. If None, all available instruments\
            will be used.
            Defaults to None.
+        disabled_instruments (set[Instruments] | None, optional): Set of modules\
+            to disable auto-instrumentations. If None, no\
+            If None, only modules passed as `instruments` will be disabled.
+            Defaults to None.
+        trace_export_timeout_seconds (int | None, optional): The timeout for\
+            trace export on OpenTelemetry exporter. Defaults to None.
     """
-
     evaluation = Evaluation(
         data=data,
         executor=executor,
         evaluators=evaluators,
-
-
+        group_name=group_name,
+        metadata=metadata,
         name=name,
-
+        concurrency_limit=concurrency_limit,
         project_api_key=project_api_key,
         base_url=base_url,
+        base_http_url=base_http_url,
         http_port=http_port,
         grpc_port=grpc_port,
         instruments=instruments,
+        disabled_instruments=disabled_instruments,
+        max_export_batch_size=max_export_batch_size,
+        trace_export_timeout_seconds=trace_export_timeout_seconds,
     )
 
     if PREPARE_ONLY.get():
-
+        existing_evaluations = EVALUATION_INSTANCES.get([])
+        new_evaluations = (existing_evaluations or []) + [evaluation]
+        EVALUATION_INSTANCES.set(new_evaluations)
+        return None
     else:
-
-
-
-
+        try:
+            loop = asyncio.get_event_loop()
+            if loop.is_running():
+                return evaluation.run()
+            else:
+                return asyncio.run(evaluation.run())
+        except RuntimeError:
             return asyncio.run(evaluation.run())
```