remdb 0.3.242__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of remdb might be problematic. Click here for more details.
- rem/__init__.py +129 -0
- rem/agentic/README.md +760 -0
- rem/agentic/__init__.py +54 -0
- rem/agentic/agents/README.md +155 -0
- rem/agentic/agents/__init__.py +38 -0
- rem/agentic/agents/agent_manager.py +311 -0
- rem/agentic/agents/sse_simulator.py +502 -0
- rem/agentic/context.py +425 -0
- rem/agentic/context_builder.py +360 -0
- rem/agentic/llm_provider_models.py +301 -0
- rem/agentic/mcp/__init__.py +0 -0
- rem/agentic/mcp/tool_wrapper.py +273 -0
- rem/agentic/otel/__init__.py +5 -0
- rem/agentic/otel/setup.py +240 -0
- rem/agentic/providers/phoenix.py +926 -0
- rem/agentic/providers/pydantic_ai.py +854 -0
- rem/agentic/query.py +117 -0
- rem/agentic/query_helper.py +89 -0
- rem/agentic/schema.py +737 -0
- rem/agentic/serialization.py +245 -0
- rem/agentic/tools/__init__.py +5 -0
- rem/agentic/tools/rem_tools.py +242 -0
- rem/api/README.md +657 -0
- rem/api/deps.py +253 -0
- rem/api/main.py +460 -0
- rem/api/mcp_router/prompts.py +182 -0
- rem/api/mcp_router/resources.py +820 -0
- rem/api/mcp_router/server.py +243 -0
- rem/api/mcp_router/tools.py +1605 -0
- rem/api/middleware/tracking.py +172 -0
- rem/api/routers/admin.py +520 -0
- rem/api/routers/auth.py +898 -0
- rem/api/routers/chat/__init__.py +5 -0
- rem/api/routers/chat/child_streaming.py +394 -0
- rem/api/routers/chat/completions.py +702 -0
- rem/api/routers/chat/json_utils.py +76 -0
- rem/api/routers/chat/models.py +202 -0
- rem/api/routers/chat/otel_utils.py +33 -0
- rem/api/routers/chat/sse_events.py +546 -0
- rem/api/routers/chat/streaming.py +950 -0
- rem/api/routers/chat/streaming_utils.py +327 -0
- rem/api/routers/common.py +18 -0
- rem/api/routers/dev.py +87 -0
- rem/api/routers/feedback.py +276 -0
- rem/api/routers/messages.py +620 -0
- rem/api/routers/models.py +86 -0
- rem/api/routers/query.py +362 -0
- rem/api/routers/shared_sessions.py +422 -0
- rem/auth/README.md +258 -0
- rem/auth/__init__.py +36 -0
- rem/auth/jwt.py +367 -0
- rem/auth/middleware.py +318 -0
- rem/auth/providers/__init__.py +16 -0
- rem/auth/providers/base.py +376 -0
- rem/auth/providers/email.py +215 -0
- rem/auth/providers/google.py +163 -0
- rem/auth/providers/microsoft.py +237 -0
- rem/cli/README.md +517 -0
- rem/cli/__init__.py +8 -0
- rem/cli/commands/README.md +299 -0
- rem/cli/commands/__init__.py +3 -0
- rem/cli/commands/ask.py +549 -0
- rem/cli/commands/cluster.py +1808 -0
- rem/cli/commands/configure.py +495 -0
- rem/cli/commands/db.py +828 -0
- rem/cli/commands/dreaming.py +324 -0
- rem/cli/commands/experiments.py +1698 -0
- rem/cli/commands/mcp.py +66 -0
- rem/cli/commands/process.py +388 -0
- rem/cli/commands/query.py +109 -0
- rem/cli/commands/scaffold.py +47 -0
- rem/cli/commands/schema.py +230 -0
- rem/cli/commands/serve.py +106 -0
- rem/cli/commands/session.py +453 -0
- rem/cli/dreaming.py +363 -0
- rem/cli/main.py +123 -0
- rem/config.py +244 -0
- rem/mcp_server.py +41 -0
- rem/models/core/__init__.py +49 -0
- rem/models/core/core_model.py +70 -0
- rem/models/core/engram.py +333 -0
- rem/models/core/experiment.py +672 -0
- rem/models/core/inline_edge.py +132 -0
- rem/models/core/rem_query.py +246 -0
- rem/models/entities/__init__.py +68 -0
- rem/models/entities/domain_resource.py +38 -0
- rem/models/entities/feedback.py +123 -0
- rem/models/entities/file.py +57 -0
- rem/models/entities/image_resource.py +88 -0
- rem/models/entities/message.py +64 -0
- rem/models/entities/moment.py +123 -0
- rem/models/entities/ontology.py +181 -0
- rem/models/entities/ontology_config.py +131 -0
- rem/models/entities/resource.py +95 -0
- rem/models/entities/schema.py +87 -0
- rem/models/entities/session.py +84 -0
- rem/models/entities/shared_session.py +180 -0
- rem/models/entities/subscriber.py +175 -0
- rem/models/entities/user.py +93 -0
- rem/py.typed +0 -0
- rem/registry.py +373 -0
- rem/schemas/README.md +507 -0
- rem/schemas/__init__.py +6 -0
- rem/schemas/agents/README.md +92 -0
- rem/schemas/agents/core/agent-builder.yaml +235 -0
- rem/schemas/agents/core/moment-builder.yaml +178 -0
- rem/schemas/agents/core/rem-query-agent.yaml +226 -0
- rem/schemas/agents/core/resource-affinity-assessor.yaml +99 -0
- rem/schemas/agents/core/simple-assistant.yaml +19 -0
- rem/schemas/agents/core/user-profile-builder.yaml +163 -0
- rem/schemas/agents/examples/contract-analyzer.yaml +317 -0
- rem/schemas/agents/examples/contract-extractor.yaml +134 -0
- rem/schemas/agents/examples/cv-parser.yaml +263 -0
- rem/schemas/agents/examples/hello-world.yaml +37 -0
- rem/schemas/agents/examples/query.yaml +54 -0
- rem/schemas/agents/examples/simple.yaml +21 -0
- rem/schemas/agents/examples/test.yaml +29 -0
- rem/schemas/agents/rem.yaml +132 -0
- rem/schemas/evaluators/hello-world/default.yaml +77 -0
- rem/schemas/evaluators/rem/faithfulness.yaml +219 -0
- rem/schemas/evaluators/rem/lookup-correctness.yaml +182 -0
- rem/schemas/evaluators/rem/retrieval-precision.yaml +199 -0
- rem/schemas/evaluators/rem/retrieval-recall.yaml +211 -0
- rem/schemas/evaluators/rem/search-correctness.yaml +192 -0
- rem/services/__init__.py +18 -0
- rem/services/audio/INTEGRATION.md +308 -0
- rem/services/audio/README.md +376 -0
- rem/services/audio/__init__.py +15 -0
- rem/services/audio/chunker.py +354 -0
- rem/services/audio/transcriber.py +259 -0
- rem/services/content/README.md +1269 -0
- rem/services/content/__init__.py +5 -0
- rem/services/content/providers.py +760 -0
- rem/services/content/service.py +762 -0
- rem/services/dreaming/README.md +230 -0
- rem/services/dreaming/__init__.py +53 -0
- rem/services/dreaming/affinity_service.py +322 -0
- rem/services/dreaming/moment_service.py +251 -0
- rem/services/dreaming/ontology_service.py +54 -0
- rem/services/dreaming/user_model_service.py +297 -0
- rem/services/dreaming/utils.py +39 -0
- rem/services/email/__init__.py +10 -0
- rem/services/email/service.py +522 -0
- rem/services/email/templates.py +360 -0
- rem/services/embeddings/__init__.py +11 -0
- rem/services/embeddings/api.py +127 -0
- rem/services/embeddings/worker.py +435 -0
- rem/services/fs/README.md +662 -0
- rem/services/fs/__init__.py +62 -0
- rem/services/fs/examples.py +206 -0
- rem/services/fs/examples_paths.py +204 -0
- rem/services/fs/git_provider.py +935 -0
- rem/services/fs/local_provider.py +760 -0
- rem/services/fs/parsing-hooks-examples.md +172 -0
- rem/services/fs/paths.py +276 -0
- rem/services/fs/provider.py +460 -0
- rem/services/fs/s3_provider.py +1042 -0
- rem/services/fs/service.py +186 -0
- rem/services/git/README.md +1075 -0
- rem/services/git/__init__.py +17 -0
- rem/services/git/service.py +469 -0
- rem/services/phoenix/EXPERIMENT_DESIGN.md +1146 -0
- rem/services/phoenix/README.md +453 -0
- rem/services/phoenix/__init__.py +46 -0
- rem/services/phoenix/client.py +960 -0
- rem/services/phoenix/config.py +88 -0
- rem/services/phoenix/prompt_labels.py +477 -0
- rem/services/postgres/README.md +757 -0
- rem/services/postgres/__init__.py +49 -0
- rem/services/postgres/diff_service.py +599 -0
- rem/services/postgres/migration_service.py +427 -0
- rem/services/postgres/programmable_diff_service.py +635 -0
- rem/services/postgres/pydantic_to_sqlalchemy.py +562 -0
- rem/services/postgres/register_type.py +353 -0
- rem/services/postgres/repository.py +481 -0
- rem/services/postgres/schema_generator.py +661 -0
- rem/services/postgres/service.py +802 -0
- rem/services/postgres/sql_builder.py +355 -0
- rem/services/rate_limit.py +113 -0
- rem/services/rem/README.md +318 -0
- rem/services/rem/__init__.py +23 -0
- rem/services/rem/exceptions.py +71 -0
- rem/services/rem/executor.py +293 -0
- rem/services/rem/parser.py +180 -0
- rem/services/rem/queries.py +196 -0
- rem/services/rem/query.py +371 -0
- rem/services/rem/service.py +608 -0
- rem/services/session/README.md +374 -0
- rem/services/session/__init__.py +13 -0
- rem/services/session/compression.py +488 -0
- rem/services/session/pydantic_messages.py +310 -0
- rem/services/session/reload.py +85 -0
- rem/services/user_service.py +130 -0
- rem/settings.py +1877 -0
- rem/sql/background_indexes.sql +52 -0
- rem/sql/migrations/001_install.sql +983 -0
- rem/sql/migrations/002_install_models.sql +3157 -0
- rem/sql/migrations/003_optional_extensions.sql +326 -0
- rem/sql/migrations/004_cache_system.sql +282 -0
- rem/sql/migrations/005_schema_update.sql +145 -0
- rem/sql/migrations/migrate_session_id_to_uuid.sql +45 -0
- rem/utils/AGENTIC_CHUNKING.md +597 -0
- rem/utils/README.md +628 -0
- rem/utils/__init__.py +61 -0
- rem/utils/agentic_chunking.py +622 -0
- rem/utils/batch_ops.py +343 -0
- rem/utils/chunking.py +108 -0
- rem/utils/clip_embeddings.py +276 -0
- rem/utils/constants.py +97 -0
- rem/utils/date_utils.py +228 -0
- rem/utils/dict_utils.py +98 -0
- rem/utils/embeddings.py +436 -0
- rem/utils/examples/embeddings_example.py +305 -0
- rem/utils/examples/sql_types_example.py +202 -0
- rem/utils/files.py +323 -0
- rem/utils/markdown.py +16 -0
- rem/utils/mime_types.py +158 -0
- rem/utils/model_helpers.py +492 -0
- rem/utils/schema_loader.py +649 -0
- rem/utils/sql_paths.py +146 -0
- rem/utils/sql_types.py +350 -0
- rem/utils/user_id.py +81 -0
- rem/utils/vision.py +325 -0
- rem/workers/README.md +506 -0
- rem/workers/__init__.py +7 -0
- rem/workers/db_listener.py +579 -0
- rem/workers/db_maintainer.py +74 -0
- rem/workers/dreaming.py +502 -0
- rem/workers/engram_processor.py +312 -0
- rem/workers/sqs_file_processor.py +193 -0
- rem/workers/unlogged_maintainer.py +463 -0
- remdb-0.3.242.dist-info/METADATA +1632 -0
- remdb-0.3.242.dist-info/RECORD +235 -0
- remdb-0.3.242.dist-info/WHEEL +4 -0
- remdb-0.3.242.dist-info/entry_points.txt +2 -0
|
@@ -0,0 +1,960 @@
|
|
|
1
|
+
"""Phoenix client for REM evaluation workflows.
|
|
2
|
+
|
|
3
|
+
This client provides a lean interface to Arize Phoenix for:
|
|
4
|
+
- Dataset management (create golden sets, add examples)
|
|
5
|
+
- Experiment execution (run agents, run evaluators)
|
|
6
|
+
- Trace retrieval (query agent execution history)
|
|
7
|
+
- Label management (organize evaluations by type/difficulty)
|
|
8
|
+
|
|
9
|
+
Two-Phase Evaluation Pattern:
|
|
10
|
+
==============================
|
|
11
|
+
|
|
12
|
+
Phase 1 - Golden Set Creation (SME-driven):
|
|
13
|
+
1. SMEs create datasets with (input, reference) pairs
|
|
14
|
+
2. Store in Phoenix with metadata labels
|
|
15
|
+
3. No agent execution required
|
|
16
|
+
|
|
17
|
+
Phase 2 - Automated Evaluation (Agent-driven):
|
|
18
|
+
1. Run agents on golden set → agent outputs
|
|
19
|
+
2. Run evaluators on (input, agent_output, reference) → scores
|
|
20
|
+
3. Track in Phoenix for analysis
|
|
21
|
+
|
|
22
|
+
Example Workflow:
|
|
23
|
+
-----------------
|
|
24
|
+
|
|
25
|
+
# Phase 1: SME creates golden set
|
|
26
|
+
client = PhoenixClient()
|
|
27
|
+
dataset = client.create_dataset_from_data(
|
|
28
|
+
name="rem-lookup-golden",
|
|
29
|
+
inputs=[{"query": "LOOKUP person:sarah-chen"}],
|
|
30
|
+
outputs=[{"label": "sarah-chen", "type": "person", ...}],
|
|
31
|
+
metadata=[{"difficulty": "easy", "query_type": "LOOKUP"}]
|
|
32
|
+
)
|
|
33
|
+
|
|
34
|
+
# Phase 2a: Run agents to produce outputs
|
|
35
|
+
experiment = client.run_experiment(
|
|
36
|
+
dataset=dataset,
|
|
37
|
+
task=run_agent_task, # Calls ask_rem agent
|
|
38
|
+
experiment_name="rem-v1-baseline"
|
|
39
|
+
)
|
|
40
|
+
|
|
41
|
+
# Phase 2b: Run evaluators on results
|
|
42
|
+
evaluator_exp = client.run_experiment(
|
|
43
|
+
dataset=experiment_results, # Uses agent outputs
|
|
44
|
+
task=None, # No task, just evaluate existing outputs
|
|
45
|
+
evaluators=[correctness_evaluator, completeness_evaluator],
|
|
46
|
+
experiment_name="rem-v1-evaluation"
|
|
47
|
+
)
|
|
48
|
+
"""
|
|
49
|
+
|
|
50
|
+
from __future__ import annotations
|
|
51
|
+
|
|
52
|
+
from datetime import datetime
|
|
53
|
+
from pathlib import Path
|
|
54
|
+
from typing import Any, Callable, TYPE_CHECKING, cast
|
|
55
|
+
|
|
56
|
+
import polars as pl
|
|
57
|
+
from loguru import logger
|
|
58
|
+
|
|
59
|
+
from .config import PhoenixConfig
|
|
60
|
+
|
|
61
|
+
if TYPE_CHECKING:
|
|
62
|
+
from phoenix.client import Client
|
|
63
|
+
from phoenix.client.resources.datasets import Dataset
|
|
64
|
+
from phoenix.client.resources.experiments.types import RanExperiment
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def dataframe_to_phoenix_dataset(
    client: "PhoenixClient",
    df: pl.DataFrame,
    dataset_name: str,
    input_keys: list[str] | None = None,
    output_keys: list[str] | None = None,
    metadata_keys: list[str] | None = None,
    description: str | None = None,
) -> "Dataset":
    """Convert a Polars DataFrame to a Phoenix Dataset.

    Every row of the DataFrame becomes one dataset example, split into:
    1. inputs (what agents receive)
    2. outputs (ground truth/expected output)
    3. metadata (optional labels, difficulty, etc.)

    Any key list left as ``None`` is inferred from the column names:
    - input_keys: columns whose lower-cased name contains 'input', 'query',
      'question', or 'prompt'; if none match, the first column
    - output_keys: columns whose lower-cased name contains 'output',
      'expected', 'answer', 'response', or 'reference'; if none match, the
      second column, or the first column that is not an input when the
      second column is already used as an input
    - metadata_keys: every column not used as an input or output

    Key names that do not exist in the DataFrame are not an error: their
    values are ``None`` in every example, and a warning is logged.

    Args:
        client: PhoenixClient instance used to create the dataset
        df: Polars DataFrame with experiment data
        dataset_name: Name for the created Phoenix dataset
        input_keys: Optional list of column names for inputs
        output_keys: Optional list of column names for outputs (ground truth)
        metadata_keys: Optional list of column names for metadata
        description: Optional dataset description

    Returns:
        Whatever ``client.create_dataset_from_data`` returns (a Phoenix Dataset)

    Example:
        >>> df = pl.read_csv("golden_set.csv")
        >>> dataset = dataframe_to_phoenix_dataset(
        ...     client=phoenix_client,
        ...     df=df,
        ...     dataset_name="my-golden-set",
        ...     input_keys=["query"],
        ...     output_keys=["expected_output"],
        ...     metadata_keys=["difficulty"]
        ... )
    """
    columns = df.columns

    def _columns_containing(keywords: tuple[str, ...]) -> list[str]:
        # Case-insensitive substring match against each column name.
        return [c for c in columns if any(k in c.lower() for k in keywords)]

    # Smart defaults for column detection
    if input_keys is None:
        input_keys = _columns_containing(("input", "query", "question", "prompt"))
        if not input_keys:
            # Fallback: first column
            input_keys = [columns[0]] if columns else []

    if output_keys is None:
        output_keys = _columns_containing(
            ("output", "expected", "answer", "response", "reference")
        )
        if not output_keys and len(columns) > 1:
            # Fallback: second column, unless it is already an input, in
            # which case take the first column that is not an input so the
            # same column is never both the prompt and the ground truth.
            taken = set(input_keys)
            if columns[1] not in taken:
                output_keys = [columns[1]]
            else:
                output_keys = next(([c] for c in columns if c not in taken), [])

    if metadata_keys is None:
        used_keys = set(input_keys) | set(output_keys)
        metadata_keys = [c for c in columns if c not in used_keys]

    logger.debug(
        f"DataFrame to Phoenix Dataset: inputs={input_keys}, "
        f"outputs={output_keys}, metadata={metadata_keys}"
    )

    # row.get() below yields None for unknown names; surface that instead of
    # silently uploading examples full of None values.
    known = set(columns)
    unknown = [k for k in (*input_keys, *output_keys, *metadata_keys) if k not in known]
    if unknown:
        logger.warning(f"Columns not found in DataFrame (values will be None): {unknown}")

    # Convert to list of dicts
    records = df.to_dicts()

    inputs = [{k: row.get(k) for k in input_keys} for row in records]
    outputs = [{k: row.get(k) for k in output_keys} for row in records]
    metadata = (
        [{k: row.get(k) for k in metadata_keys} for row in records]
        if metadata_keys
        else None
    )

    # Create Phoenix dataset
    return client.create_dataset_from_data(
        name=dataset_name,
        inputs=inputs,
        outputs=outputs,
        metadata=metadata,
        description=description,
    )
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
class PhoenixClient:
|
|
157
|
+
"""High-level Phoenix client for REM evaluation workflows.
|
|
158
|
+
|
|
159
|
+
Wraps the official Phoenix client with REM-specific methods for:
|
|
160
|
+
- Creating and managing evaluation datasets
|
|
161
|
+
- Running agent and evaluator experiments
|
|
162
|
+
- Querying trace data for analysis
|
|
163
|
+
- Managing dataset labels
|
|
164
|
+
|
|
165
|
+
Attributes:
|
|
166
|
+
config: Phoenix connection configuration
|
|
167
|
+
_client: Underlying Phoenix Client instance
|
|
168
|
+
"""
|
|
169
|
+
|
|
170
|
+
def __init__(self, config: PhoenixConfig | None = None):
    """Set up the wrapper around the underlying Phoenix client.

    Args:
        config: Phoenix configuration to use. When omitted, one is obtained
            from ``PhoenixConfig.from_settings()``.
    """
    # Resolve the configuration first: _create_client() reads self.config.
    self.config = PhoenixConfig.from_settings() if config is None else config
    self._client = self._create_client()

    logger.info(f"Phoenix client initialized (endpoint: {self.config.base_url})")
|
|
183
|
+
|
|
184
|
+
def _create_client(self) -> "Client":
    """Build the underlying Phoenix client from ``self.config``.

    Returns:
        A ``phoenix.client.Client`` constructed with the configured
        ``base_url`` and ``api_key``.
    """
    # Imported inside the method rather than at module import time.
    from phoenix.client import Client

    settings = self.config
    return Client(base_url=settings.base_url, api_key=settings.api_key)
|
|
196
|
+
|
|
197
|
+
# =========================================================================
|
|
198
|
+
# DATASET MANAGEMENT
|
|
199
|
+
# =========================================================================
|
|
200
|
+
|
|
201
|
+
def list_datasets(self) -> list[dict[str, Any]]:
    """Return a summary of every dataset known to Phoenix.

    Returns:
        One dict per dataset with the keys:
        - id: Dataset ID, converted to ``str`` ("" when absent)
        - name: Dataset name ("" when absent)
        - example_count: Number of examples (0 when absent)
        - created_at: Creation timestamp ("" when absent)

    Raises:
        Exception: Any error raised while listing is logged and re-raised
            unchanged.
    """
    try:
        found = list(self._client.datasets.list())
        logger.debug(f"Found {len(found)} datasets")

        summaries: list[dict[str, Any]] = []
        for entry in found:
            summaries.append(
                {
                    "id": str(entry.get("id", "")),
                    "name": entry.get("name", ""),
                    "example_count": entry.get("example_count", 0),
                    "created_at": entry.get("created_at", ""),
                }
            )
        return summaries
    except Exception as e:
        logger.error(f"Failed to list datasets: {e}")
        raise
|
|
226
|
+
|
|
227
|
+
def get_dataset(self, name: str) -> "Dataset":
    """Load a dataset from Phoenix by its name.

    Args:
        name: Dataset name

    Returns:
        The dataset object returned by the Phoenix client

    Raises:
        ValueError: If fetching the dataset (or reading its length for the
            debug log) raises any exception; the original error is chained
            as the cause.
    """
    try:
        loaded = self._client.datasets.get_dataset(dataset=name)
        logger.debug(f"Loaded dataset: {name} ({len(loaded)} examples)")
    except Exception as e:
        logger.error(f"Failed to get dataset '{name}': {e}")
        raise ValueError(f"Dataset not found: {name}") from e
    return loaded
|
|
246
|
+
|
|
247
|
+
def create_dataset_from_data(
    self,
    name: str,
    inputs: list[dict[str, Any]],
    outputs: list[dict[str, Any]],
    metadata: list[dict[str, Any]] | None = None,
    description: str | None = None,
) -> "Dataset":
    """Create a Phoenix dataset from parallel lists of examples.

    The i-th entries of ``inputs``, ``outputs`` and ``metadata`` together
    describe one example:
    - input: What the agent receives (e.g., {"query": "LOOKUP person:sarah-chen"})
    - output: Expected correct result (ground truth/reference)
    - metadata: Optional labels (difficulty, query_type, etc.)

    Args:
        name: Dataset name
        inputs: List of input dicts (what agents receive)
        outputs: List of expected output dicts (ground truth)
        metadata: Optional list of metadata dicts; when omitted, an empty
            dict is supplied for every example
        description: Optional dataset description

    Returns:
        The dataset object returned by the Phoenix client

    Raises:
        ValueError: If ``outputs`` or ``metadata`` differ in length from
            ``inputs``.
        Exception: Any other failure is logged and re-raised unchanged.

    Example:
        >>> client = PhoenixClient()
        >>> dataset = client.create_dataset_from_data(
        ...     name="rem-lookup-golden",
        ...     inputs=[
        ...         {"query": "LOOKUP person:sarah-chen"},
        ...         {"query": "LOOKUP project:tidb-migration"}
        ...     ],
        ...     outputs=[
        ...         {"label": "sarah-chen", "type": "person", "properties": {...}},
        ...         {"label": "tidb-migration", "type": "project", "properties": {...}}
        ...     ],
        ...     metadata=[
        ...         {"difficulty": "easy", "query_type": "LOOKUP"},
        ...         {"difficulty": "medium", "query_type": "LOOKUP"}
        ...     ]
        ... )
    """
    try:
        # Validation lives inside the try so that rejected input is logged
        # through the same error path as a failed upload.
        example_count = len(inputs)
        if example_count != len(outputs):
            raise ValueError(
                f"Input count ({len(inputs)}) must match output count ({len(outputs)})"
            )

        if metadata is not None:
            if len(metadata) != example_count:
                raise ValueError(
                    f"Metadata count ({len(metadata)}) must match input count ({len(inputs)})"
                )
        else:
            # One independent empty dict per example.
            metadata = [{} for _ in inputs]

        created = self._client.datasets.create_dataset(
            name=name,
            dataset_description=description,
            inputs=inputs,
            outputs=outputs,
            metadata=metadata,
        )

        logger.info(f"Created dataset '{name}' with {len(inputs)} examples")
        return created

    except Exception as e:
        logger.error(f"Failed to create dataset '{name}': {e}")
        raise
|
|
321
|
+
|
|
322
|
+
def create_dataset_from_csv(
    self,
    name: str,
    csv_file_path: Path | str,
    input_keys: list[str],
    output_keys: list[str],
    metadata_keys: list[str] | None = None,
    description: str | None = None,
) -> "Dataset":
    """Create a dataset from a CSV file.

    Reads the CSV with Polars, splits each row into input/output/metadata
    dicts by column name, and delegates to ``create_dataset_from_data``.

    Column names that are not present in the CSV are not an error: the
    corresponding values are ``None`` in every example, and a warning is
    logged so that misspelled keys do not go unnoticed.

    Args:
        name: Dataset name
        csv_file_path: Path to CSV file
        input_keys: Column names to use as inputs
        output_keys: Column names to use as outputs (reference/ground truth)
        metadata_keys: Optional column names for metadata; when empty or
            omitted, no metadata is extracted
        description: Optional dataset description

    Returns:
        The dataset returned by ``create_dataset_from_data``

    Raises:
        Exception: Any failure (reading the file, creating the dataset) is
            logged and re-raised unchanged.

    Example CSV structure:
        query,expected_label,expected_type,difficulty,query_type
        "LOOKUP person:sarah-chen",sarah-chen,person,easy,LOOKUP
        "SEARCH semantic AI engineer",sarah-chen,person,medium,SEARCH
    """
    try:
        # Load CSV with Polars
        df = pl.read_csv(csv_file_path)

        # row.get() below yields None for unknown names; surface that
        # instead of silently building examples full of None values.
        known = set(df.columns)
        requested = [*input_keys, *output_keys, *(metadata_keys or [])]
        unknown = [k for k in requested if k not in known]
        if unknown:
            logger.warning(
                f"Columns not found in CSV '{csv_file_path}' (values will be None): {unknown}"
            )

        # Convert to list of dicts
        records = df.to_dicts()

        inputs = [{k: row.get(k) for k in input_keys} for row in records]
        outputs = [{k: row.get(k) for k in output_keys} for row in records]

        # Extract metadata if specified
        metadata = None
        if metadata_keys:
            metadata = [{k: row.get(k) for k in metadata_keys} for row in records]

        return self.create_dataset_from_data(
            name=name,
            inputs=inputs,
            outputs=outputs,
            metadata=metadata,
            description=description,
        )

    except Exception as e:
        logger.error(f"Failed to create dataset from CSV '{csv_file_path}': {e}")
        raise
|
|
380
|
+
|
|
381
|
+
def add_examples_to_dataset(
    self,
    dataset: str,
    inputs: list[dict[str, Any]],
    outputs: list[dict[str, Any]],
    metadata: list[dict[str, Any]] | None = None,
) -> "Dataset":
    """Add examples to an existing dataset.

    Args:
        dataset: Dataset name
        inputs: List of input dicts
        outputs: List of output dicts (same length as ``inputs``)
        metadata: Optional list of metadata dicts (same length as
            ``inputs``); when omitted, an empty dict is supplied for every
            example

    Returns:
        The updated dataset object returned by the Phoenix client

    Raises:
        ValueError: If ``outputs`` or ``metadata`` differ in length from
            ``inputs``.
        Exception: Any other failure is logged and re-raised unchanged.
    """
    try:
        if len(inputs) != len(outputs):
            raise ValueError("Input/output counts must match")

        if metadata is None:
            metadata = [{} for _ in inputs]
        elif len(metadata) != len(inputs):
            # Same check as create_dataset_from_data: reject misaligned
            # metadata here rather than sending it to Phoenix.
            raise ValueError(
                f"Metadata count ({len(metadata)}) must match input count ({len(inputs)})"
            )

        updated_dataset = self._client.datasets.add_examples_to_dataset(
            dataset,  # Positional argument instead of keyword
            inputs=inputs,
            outputs=outputs,
            metadata=metadata,
        )

        logger.info(f"Added {len(inputs)} examples to dataset '{dataset}'")
        return updated_dataset

    except Exception as e:
        logger.error(f"Failed to add examples to dataset '{dataset}': {e}")
        raise
|
|
419
|
+
|
|
420
|
+
# =========================================================================
|
|
421
|
+
# EXPERIMENT EXECUTION
|
|
422
|
+
# =========================================================================
|
|
423
|
+
|
|
424
|
+
def run_experiment(
    self,
    dataset: "Dataset" | str | pl.DataFrame,
    task: Callable[[Any], Any] | None = None,
    evaluators: list[Callable[[Any], Any]] | None = None,
    experiment_name: str | None = None,
    experiment_description: str | None = None,
    experiment_metadata: dict[str, Any] | None = None,
    experiment_config: Any | None = None,
    input_keys: list[str] | None = None,
    output_keys: list[str] | None = None,
    metadata_keys: list[str] | None = None,
) -> "RanExperiment":
    """Run an evaluation experiment.

    Three modes:
    1. ExperimentConfig mode: Provide experiment_config with all settings
    2. Agent run: Provide task function to execute agents on dataset
    3. Evaluator run: Provide evaluators to score existing outputs

    Dataset can be:
    - Phoenix Dataset instance
    - Dataset name (string) - will be loaded from Phoenix
    - Polars DataFrame - will be converted to Phoenix Dataset

    Args:
        dataset: Dataset instance, name, or Polars DataFrame. When falsy
            (e.g. ``None`` or ``""``) and ``experiment_config`` defines a
            non-git "ground_truth" dataset, that dataset's path is used.
        task: Optional task function to run on each example (agent execution)
        evaluators: Optional list of evaluator functions
        experiment_name: Optional experiment name
        experiment_description: Optional description
        experiment_metadata: Optional metadata dict
        experiment_config: Optional ExperimentConfig instance. It only
            supplies values for ``experiment_name``,
            ``experiment_description`` and ``experiment_metadata`` that
            were not passed explicitly; explicit arguments take precedence.
            Its ``last_run_at`` and ``status`` are updated after the run.
        input_keys: Column names for inputs when dataset is a DataFrame
            (auto-detected when omitted)
        output_keys: Column names for outputs when dataset is a DataFrame
            (auto-detected when omitted)
        metadata_keys: Optional column names for metadata

    Returns:
        RanExperiment with results

    Raises:
        Exception: Any failure is logged and re-raised unchanged.

    Example - Agent Run (Phase 2a):
        >>> async def run_agent(example):
        ...     from rem.mcp.tools.rem import ask_rem
        ...     result = await ask_rem(example["input"]["query"])
        ...     return result
        >>> experiment = client.run_experiment(
        ...     dataset="rem-lookup-golden",
        ...     task=run_agent,
        ...     experiment_name="rem-v1-baseline"
        ... )

    Example - With Polars DataFrame:
        >>> df = pl.read_csv("golden_set.csv")
        >>> experiment = client.run_experiment(
        ...     dataset=df,
        ...     task=run_agent,
        ...     experiment_name="rem-v1-baseline",
        ...     input_keys=["query"],
        ...     output_keys=["expected_output"]
        ... )

    Example - Evaluator Run (Phase 2b):
        >>> experiment = client.run_experiment(
        ...     dataset=agent_results,
        ...     evaluators=[correctness_eval, completeness_eval],
        ...     experiment_name="rem-v1-evaluation"
        ... )
    """
    try:
        # Handle ExperimentConfig mode
        if experiment_config:
            experiment_name = experiment_name or experiment_config.name
            experiment_description = experiment_description or experiment_config.description

            # status is presumably an Enum (it is read through .value), but
            # tolerate a plain string so a config whose status was set to
            # one does not fail here.
            current_status = experiment_config.status

            # Metadata derived from the config (only used when the caller
            # passed no experiment_metadata)
            config_metadata = {
                "agent_schema": experiment_config.agent_schema_ref.name,
                "agent_version": experiment_config.agent_schema_ref.version,
                "evaluator_schema": experiment_config.evaluator_schema_ref.name,
                "evaluator_version": experiment_config.evaluator_schema_ref.version,
                "config_status": getattr(current_status, "value", current_status),
                "config_tags": experiment_config.tags,
            }
            config_metadata.update(experiment_config.metadata or {})
            experiment_metadata = experiment_metadata or config_metadata

            # Use ground_truth dataset if dataset not provided.
            # A Polars DataFrame must never be truth-tested (it raises
            # TypeError), so a DataFrame always counts as "provided".
            dataset_missing = not isinstance(dataset, pl.DataFrame) and not dataset
            if dataset_missing and "ground_truth" in experiment_config.datasets:
                dataset_ref = experiment_config.datasets["ground_truth"]
                # Load from Git or use provided path
                if dataset_ref.location.value == "git":
                    # Assume dataset is already loaded
                    logger.warning(
                        f"Dataset location is 'git' but path-based loading not implemented. "
                        f"Pass dataset explicitly or use Phoenix dataset name."
                    )
                else:
                    dataset = dataset_ref.path

        # Convert Polars DataFrame to Phoenix Dataset
        if isinstance(dataset, pl.DataFrame):
            dataset_name_for_phoenix = f"{experiment_name or 'experiment'}-dataset-{datetime.now().strftime('%Y%m%d-%H%M%S')}"
            logger.info(f"Converting Polars DataFrame to Phoenix Dataset: {dataset_name_for_phoenix}")
            dataset = dataframe_to_phoenix_dataset(
                client=self,
                df=dataset,
                dataset_name=dataset_name_for_phoenix,
                input_keys=input_keys,
                output_keys=output_keys,
                metadata_keys=metadata_keys,
                description=f"Auto-created from DataFrame for experiment: {experiment_name}",
            )
            logger.info(f"✓ Created Phoenix Dataset: {dataset_name_for_phoenix}")

        # Load dataset if name provided
        if isinstance(dataset, str):
            dataset = self.get_dataset(dataset)

        logger.info(
            f"Running experiment '{experiment_name or 'unnamed'}' "
            f"on dataset with {len(dataset)} examples"
        )

        # Run experiment
        experiment = self._client.experiments.run_experiment(
            dataset=dataset,
            task=task,  # type: ignore[arg-type]
            evaluators=evaluators or [],
            experiment_name=experiment_name,
            experiment_description=experiment_description,
            experiment_metadata=experiment_metadata,
        )

        logger.success(f"Experiment complete: {experiment_name or 'unnamed'}")
        if hasattr(experiment, "url"):
            logger.info(f"View results: {experiment.url}")  # type: ignore[attr-defined]

        # Update ExperimentConfig if provided
        if experiment_config:
            experiment_config.last_run_at = datetime.now()
            new_status = "running" if hasattr(experiment, "runs") else "completed"
            # Keep status the same type it had (e.g. an Enum member) so the
            # `.status.value` read still works on the next run; fall back
            # to the raw string if that type cannot represent the value.
            try:
                experiment_config.status = type(experiment_config.status)(new_status)
            except (TypeError, ValueError):
                experiment_config.status = new_status

        return experiment

    except Exception as e:
        logger.error(f"Failed to run experiment: {e}")
        raise
|
|
571
|
+
|
|
572
|
+
# =========================================================================
|
|
573
|
+
# TRACE RETRIEVAL
|
|
574
|
+
# =========================================================================
|
|
575
|
+
|
|
576
|
+
def get_traces(
    self,
    project_name: str | None = None,
    start_time: datetime | None = None,
    end_time: datetime | None = None,
    limit: int = 100,
    root_spans_only: bool = True,
    trace_id: str | None = None,
    span_id: str | None = None,
) -> pl.DataFrame:
    """Fetch spans from Phoenix and hand them back as a Polars DataFrame.

    Only filters that carry a truthy value are forwarded to the Phoenix
    client; ``limit`` is always forwarded.

    Args:
        project_name: Restrict the query to this Phoenix project.
        start_time: Lower bound of the time window (sent as an ISO-8601 string).
        end_time: Upper bound of the time window (sent as an ISO-8601 string).
        limit: Maximum number of rows requested from Phoenix.
        root_spans_only: When true, ``root_spans_only=True`` is sent; when
            false the flag is omitted from the request entirely.
        trace_id: Restrict the query to one trace ID.
        span_id: Restrict the query to one span ID.

    Returns:
        Polars DataFrame converted from the frame returned by the client.

    Raises:
        Exception: Any failure while building or running the query is
            logged and re-raised unchanged.

    Example:
        >>> traces = client.get_traces(
        ...     project_name="rem-agents",
        ...     start_time=datetime.now() - timedelta(days=7),
        ...     limit=50
        ... )
    """
    try:
        # Every optional filter in the order it is handed to the client;
        # entries left as None/empty are dropped just below.
        candidate_filters: dict[str, Any] = {
            "project_name": project_name,
            "start_time": start_time.isoformat() if start_time else None,
            "end_time": end_time.isoformat() if end_time else None,
            "root_spans_only": True if root_spans_only else None,
            "trace_id": trace_id,
            "span_id": span_id,
        }
        query_params: dict[str, Any] = {
            key: value for key, value in candidate_filters.items() if value
        }

        # The client result is treated as a pandas DataFrame and converted
        # to Polars before being returned.
        traces_df = pl.from_pandas(
            self._client.query_spans(limit=limit, **query_params)  # type: ignore[attr-defined]
        )

        logger.debug(f"Retrieved {len(traces_df)} traces")
        return traces_df

    except Exception as e:
        logger.error(f"Failed to query traces: {e}")
        raise
|
|
635
|
+
|
|
636
|
+
def create_dataset_from_traces(
    self,
    name: str,
    project_name: str,
    start_time: datetime | None = None,
    end_time: datetime | None = None,
    limit: int = 100,
    description: str | None = None,
) -> "Dataset":
    """Build a dataset out of root spans queried from a Phoenix project.

    Useful for regression testing and coverage analysis.

    Args:
        name: Name of the dataset to create.
        project_name: Phoenix project whose traces are queried.
        start_time: Optional start of the trace window.
        end_time: Optional end of the trace window.
        limit: Maximum number of traces to include.
        description: Optional dataset description.

    Returns:
        The dataset returned by ``create_dataset_from_data``.

    Raises:
        ValueError: If the trace query returns no rows.
        Exception: Any other failure is logged and re-raised unchanged.

    Example:
        >>> dataset = client.create_dataset_from_traces(
        ...     name="rem-production-regression",
        ...     project_name="rem-production",
        ...     start_time=datetime.now() - timedelta(days=30),
        ...     limit=500
        ... )
    """

    def _as_example(value: Any, key: str) -> Any:
        # Missing/empty values become {}, bare strings are wrapped under
        # `key`, and anything else is passed through untouched.
        if not value:
            return {}
        return {key: value} if isinstance(value, str) else value

    try:
        # Root spans only: one row per trace.
        traces_df = self.get_traces(
            project_name=project_name,
            start_time=start_time,
            end_time=end_time,
            limit=limit,
            root_spans_only=True,
        )
        if len(traces_df) == 0:
            raise ValueError("No traces found matching criteria")

        rows = traces_df.to_dicts()

        # Three parallel lists, one entry per trace row.
        inputs = [_as_example(row.get("attributes.input"), "input") for row in rows]
        outputs = [_as_example(row.get("attributes.output"), "output") for row in rows]
        metadata = [
            {
                "span_id": str(row.get("context.span_id", "")),
                "trace_id": str(row.get("context.trace_id", "")),
                "start_time": str(row.get("start_time", "")),
                "latency_ms": row.get("latency_ms", 0),
            }
            for row in rows
        ]

        dataset = self.create_dataset_from_data(
            name=name,
            inputs=inputs,
            outputs=outputs,
            metadata=metadata,
            description=description,
        )

        logger.info(f"Created dataset '{name}' from {len(inputs)} traces")
        return dataset

    except Exception as e:
        logger.error(f"Failed to create dataset from traces: {e}")
        raise
|
|
733
|
+
|
|
734
|
+
def get_experiment(self, experiment_id: str) -> dict[str, Any]:
    """Fetch an experiment and flatten it, with its task runs, into a dict.

    Args:
        experiment_id: Experiment ID (from Phoenix UI URL).

    Returns:
        Dictionary with the keys:
        - id: Experiment ID
        - name: Experiment name
        - dataset_id: Associated dataset ID
        - experiment_metadata: Metadata dict (``{}`` when the experiment has none)
        - task_runs: List of dicts with ``input``, ``output``, ``expected``
          and ``dataset_example_id`` (``None`` when the run lacks that attribute)

    Raises:
        Exception: Any failure is logged and re-raised unchanged.

    Example:
        >>> exp_data = client.get_experiment("RXhwZXJpbWVudDoxMjM=")
        >>> print(f"Experiment: {exp_data['name']}")
        >>> print(f"Task runs: {len(exp_data['task_runs'])}")
    """
    try:
        experiment = self._client.experiments.get_experiment(experiment_id)  # type: ignore[misc]

        # One plain dict per run.
        task_runs = [
            {
                "input": run.input,
                "output": run.output,
                "expected": run.expected,
                "dataset_example_id": getattr(run, "dataset_example_id", None),
            }
            for run in experiment.runs  # type: ignore[attr-defined]
        ]

        exp_data = dict(
            id=experiment.id,  # type: ignore[attr-defined]
            name=experiment.name,  # type: ignore[attr-defined]
            dataset_id=experiment.dataset_id,  # type: ignore[attr-defined]
            experiment_metadata=experiment.metadata or {},  # type: ignore[attr-defined]
            task_runs=task_runs,
        )

        logger.info(f"Retrieved experiment '{experiment.name}' with {len(task_runs)} task runs")  # type: ignore[attr-defined]
        return exp_data

    except Exception as e:
        logger.error(f"Failed to get experiment '{experiment_id}': {e}")
        raise
|
|
782
|
+
|
|
783
|
+
# =========================================================================
|
|
784
|
+
# FEEDBACK/ANNOTATION
|
|
785
|
+
# =========================================================================
|
|
786
|
+
|
|
787
|
+
def add_span_feedback(
    self,
    span_id: str,
    annotation_name: str,
    annotator_kind: str = "HUMAN",
    label: str | None = None,
    score: float | None = None,
    explanation: str | None = None,
    metadata: dict[str, Any] | None = None,
    trace_id: str | None = None,
) -> str | None:
    """Attach a feedback annotation to a span through the Phoenix REST API.

    Sends a direct HTTP POST to ``/v1/span_annotations`` rather than going
    through the Phoenix Python client (whose API changes frequently).
    This method is best-effort: failures are logged, never raised.

    Args:
        span_id: Span ID to annotate (hex string).
        annotation_name: Name of the annotation (e.g., "correctness", "user_feedback").
        annotator_kind: Type of annotator ("HUMAN", "LLM", "CODE").
        label: Optional label (e.g., "correct", "incorrect", "helpful").
        score: Optional numeric score (0.0-1.0).
        explanation: Optional explanation text.
        metadata: Optional additional metadata dict; ``{}`` is sent when omitted.
        trace_id: Optional trace ID; included in the payload only when truthy.

    Returns:
        The ``span_id`` that was annotated (used as the annotation
        reference) on success, ``None`` if the request failed.
    """
    import httpx

    try:
        # label/score/explanation are always sent, even when they are None.
        result = {"label": label, "score": score, "explanation": explanation}
        annotation_data = {
            "span_id": span_id,
            "name": annotation_name,
            "annotator_kind": annotator_kind,
            "result": result,
            "metadata": metadata or {},
        }
        if trace_id:
            annotation_data["trace_id"] = trace_id

        annotations_endpoint = f"{self.config.base_url}/v1/span_annotations"
        # Bearer auth header only when an API key is configured.
        headers = (
            {"Authorization": f"Bearer {self.config.api_key}"}
            if self.config.api_key
            else {}
        )

        with httpx.Client(timeout=5.0) as client:
            client.post(
                annotations_endpoint,
                json={"data": [annotation_data]},
                headers=headers,
            ).raise_for_status()

        logger.info(f"Added {annotator_kind} feedback to span {span_id}")
        return span_id

    except httpx.HTTPStatusError as e:
        # Non-2xx response: report the status code and response body.
        logger.error(
            f"Failed to add span feedback (HTTP {e.response.status_code}): "
            f"{e.response.text if hasattr(e, 'response') else 'N/A'}"
        )
        return None
    except Exception as e:
        logger.error(f"Failed to add span feedback: {e}")
        return None
|
|
862
|
+
|
|
863
|
+
def sync_user_feedback(
|
|
864
|
+
self,
|
|
865
|
+
span_id: str,
|
|
866
|
+
rating: int | None = None,
|
|
867
|
+
categories: list[str] | None = None,
|
|
868
|
+
comment: str | None = None,
|
|
869
|
+
feedback_id: str | None = None,
|
|
870
|
+
trace_id: str | None = None,
|
|
871
|
+
) -> str | None:
|
|
872
|
+
"""Sync user feedback to Phoenix as a span annotation.
|
|
873
|
+
|
|
874
|
+
Convenience method for syncing Feedback entities to Phoenix.
|
|
875
|
+
Converts REM feedback format to Phoenix annotation format.
|
|
876
|
+
|
|
877
|
+
Args:
|
|
878
|
+
span_id: OTEL span ID to annotate
|
|
879
|
+
rating: User rating (-1, 1-5 scale)
|
|
880
|
+
categories: List of feedback categories
|
|
881
|
+
comment: Free-text comment
|
|
882
|
+
feedback_id: Optional REM feedback ID for reference
|
|
883
|
+
trace_id: Optional trace ID for the span
|
|
884
|
+
|
|
885
|
+
Returns:
|
|
886
|
+
Phoenix annotation ID if successful
|
|
887
|
+
|
|
888
|
+
Example:
|
|
889
|
+
>>> client.sync_user_feedback(
|
|
890
|
+
... span_id="abc123",
|
|
891
|
+
... rating=4,
|
|
892
|
+
... categories=["helpful", "accurate"],
|
|
893
|
+
... comment="Great response!"
|
|
894
|
+
... )
|
|
895
|
+
"""
|
|
896
|
+
# Convert rating to 0-1 score
|
|
897
|
+
# Rating scheme:
|
|
898
|
+
# -1 = thumbs down → score 0.0
|
|
899
|
+
# 1 = thumbs up → score 1.0
|
|
900
|
+
# 2-5 = star rating → normalized to 0-1 range
|
|
901
|
+
score = None
|
|
902
|
+
if rating is not None:
|
|
903
|
+
if rating == -1:
|
|
904
|
+
score = 0.0
|
|
905
|
+
elif rating == 1:
|
|
906
|
+
score = 1.0 # Thumbs up
|
|
907
|
+
elif 2 <= rating <= 5:
|
|
908
|
+
score = (rating - 1) / 4.0 # 2→0.25, 3→0.5, 4→0.75, 5→1.0
|
|
909
|
+
|
|
910
|
+
# Use primary category as label
|
|
911
|
+
label = categories[0] if categories else None
|
|
912
|
+
|
|
913
|
+
# Build explanation from comment and additional categories
|
|
914
|
+
explanation = comment
|
|
915
|
+
if categories and len(categories) > 1:
|
|
916
|
+
cats_str = ", ".join(categories[1:])
|
|
917
|
+
if explanation:
|
|
918
|
+
explanation = f"{explanation} [Categories: {cats_str}]"
|
|
919
|
+
else:
|
|
920
|
+
explanation = f"Categories: {cats_str}"
|
|
921
|
+
|
|
922
|
+
# Build metadata
|
|
923
|
+
metadata: dict[str, Any] = {
|
|
924
|
+
"rating": rating,
|
|
925
|
+
"categories": categories or [],
|
|
926
|
+
}
|
|
927
|
+
if feedback_id:
|
|
928
|
+
metadata["rem_feedback_id"] = feedback_id
|
|
929
|
+
|
|
930
|
+
return self.add_span_feedback(
|
|
931
|
+
span_id=span_id,
|
|
932
|
+
annotation_name="user_feedback",
|
|
933
|
+
annotator_kind="HUMAN",
|
|
934
|
+
label=label,
|
|
935
|
+
score=score,
|
|
936
|
+
explanation=explanation,
|
|
937
|
+
metadata=metadata,
|
|
938
|
+
trace_id=trace_id,
|
|
939
|
+
)
|
|
940
|
+
|
|
941
|
+
def get_span_annotations(
    self,
    span_id: str,
    annotation_name: str | None = None,
) -> list[dict[str, Any]]:
    """Placeholder for looking up the annotations attached to a span.

    Not implemented yet: both arguments are currently ignored, a warning
    is logged, and an empty list is returned.

    Args:
        span_id: Span ID to query.
        annotation_name: Optional filter by annotation name.

    Returns:
        List of annotation dicts (always empty for now).

    TODO: Implement once Phoenix client exposes this method
    """
    # TODO: Phoenix client doesn't expose annotation query yet;
    # this stub only reserves the interface for a future implementation.
    no_annotations: list[dict[str, Any]] = []
    logger.warning("get_span_annotations not yet implemented in Phoenix client")
    return no_annotations
|