remdb 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of remdb might be problematic. Click here for more details.
- rem/__init__.py +2 -0
- rem/agentic/README.md +650 -0
- rem/agentic/__init__.py +39 -0
- rem/agentic/agents/README.md +155 -0
- rem/agentic/agents/__init__.py +8 -0
- rem/agentic/context.py +148 -0
- rem/agentic/context_builder.py +329 -0
- rem/agentic/mcp/__init__.py +0 -0
- rem/agentic/mcp/tool_wrapper.py +107 -0
- rem/agentic/otel/__init__.py +5 -0
- rem/agentic/otel/setup.py +151 -0
- rem/agentic/providers/phoenix.py +674 -0
- rem/agentic/providers/pydantic_ai.py +572 -0
- rem/agentic/query.py +117 -0
- rem/agentic/query_helper.py +89 -0
- rem/agentic/schema.py +396 -0
- rem/agentic/serialization.py +245 -0
- rem/agentic/tools/__init__.py +5 -0
- rem/agentic/tools/rem_tools.py +231 -0
- rem/api/README.md +420 -0
- rem/api/main.py +324 -0
- rem/api/mcp_router/prompts.py +182 -0
- rem/api/mcp_router/resources.py +536 -0
- rem/api/mcp_router/server.py +213 -0
- rem/api/mcp_router/tools.py +584 -0
- rem/api/routers/auth.py +229 -0
- rem/api/routers/chat/__init__.py +5 -0
- rem/api/routers/chat/completions.py +281 -0
- rem/api/routers/chat/json_utils.py +76 -0
- rem/api/routers/chat/models.py +124 -0
- rem/api/routers/chat/streaming.py +185 -0
- rem/auth/README.md +258 -0
- rem/auth/__init__.py +26 -0
- rem/auth/middleware.py +100 -0
- rem/auth/providers/__init__.py +13 -0
- rem/auth/providers/base.py +376 -0
- rem/auth/providers/google.py +163 -0
- rem/auth/providers/microsoft.py +237 -0
- rem/cli/README.md +455 -0
- rem/cli/__init__.py +8 -0
- rem/cli/commands/README.md +126 -0
- rem/cli/commands/__init__.py +3 -0
- rem/cli/commands/ask.py +566 -0
- rem/cli/commands/configure.py +497 -0
- rem/cli/commands/db.py +493 -0
- rem/cli/commands/dreaming.py +324 -0
- rem/cli/commands/experiments.py +1302 -0
- rem/cli/commands/mcp.py +66 -0
- rem/cli/commands/process.py +245 -0
- rem/cli/commands/schema.py +183 -0
- rem/cli/commands/serve.py +106 -0
- rem/cli/dreaming.py +363 -0
- rem/cli/main.py +96 -0
- rem/config.py +237 -0
- rem/mcp_server.py +41 -0
- rem/models/core/__init__.py +49 -0
- rem/models/core/core_model.py +64 -0
- rem/models/core/engram.py +333 -0
- rem/models/core/experiment.py +628 -0
- rem/models/core/inline_edge.py +132 -0
- rem/models/core/rem_query.py +243 -0
- rem/models/entities/__init__.py +43 -0
- rem/models/entities/file.py +57 -0
- rem/models/entities/image_resource.py +88 -0
- rem/models/entities/message.py +35 -0
- rem/models/entities/moment.py +123 -0
- rem/models/entities/ontology.py +191 -0
- rem/models/entities/ontology_config.py +131 -0
- rem/models/entities/resource.py +95 -0
- rem/models/entities/schema.py +87 -0
- rem/models/entities/user.py +85 -0
- rem/py.typed +0 -0
- rem/schemas/README.md +507 -0
- rem/schemas/__init__.py +6 -0
- rem/schemas/agents/README.md +92 -0
- rem/schemas/agents/core/moment-builder.yaml +178 -0
- rem/schemas/agents/core/rem-query-agent.yaml +226 -0
- rem/schemas/agents/core/resource-affinity-assessor.yaml +99 -0
- rem/schemas/agents/core/simple-assistant.yaml +19 -0
- rem/schemas/agents/core/user-profile-builder.yaml +163 -0
- rem/schemas/agents/examples/contract-analyzer.yaml +317 -0
- rem/schemas/agents/examples/contract-extractor.yaml +134 -0
- rem/schemas/agents/examples/cv-parser.yaml +263 -0
- rem/schemas/agents/examples/hello-world.yaml +37 -0
- rem/schemas/agents/examples/query.yaml +54 -0
- rem/schemas/agents/examples/simple.yaml +21 -0
- rem/schemas/agents/examples/test.yaml +29 -0
- rem/schemas/agents/rem.yaml +128 -0
- rem/schemas/evaluators/hello-world/default.yaml +77 -0
- rem/schemas/evaluators/rem/faithfulness.yaml +219 -0
- rem/schemas/evaluators/rem/lookup-correctness.yaml +182 -0
- rem/schemas/evaluators/rem/retrieval-precision.yaml +199 -0
- rem/schemas/evaluators/rem/retrieval-recall.yaml +211 -0
- rem/schemas/evaluators/rem/search-correctness.yaml +192 -0
- rem/services/__init__.py +16 -0
- rem/services/audio/INTEGRATION.md +308 -0
- rem/services/audio/README.md +376 -0
- rem/services/audio/__init__.py +15 -0
- rem/services/audio/chunker.py +354 -0
- rem/services/audio/transcriber.py +259 -0
- rem/services/content/README.md +1269 -0
- rem/services/content/__init__.py +5 -0
- rem/services/content/providers.py +806 -0
- rem/services/content/service.py +676 -0
- rem/services/dreaming/README.md +230 -0
- rem/services/dreaming/__init__.py +53 -0
- rem/services/dreaming/affinity_service.py +336 -0
- rem/services/dreaming/moment_service.py +264 -0
- rem/services/dreaming/ontology_service.py +54 -0
- rem/services/dreaming/user_model_service.py +297 -0
- rem/services/dreaming/utils.py +39 -0
- rem/services/embeddings/__init__.py +11 -0
- rem/services/embeddings/api.py +120 -0
- rem/services/embeddings/worker.py +421 -0
- rem/services/fs/README.md +662 -0
- rem/services/fs/__init__.py +62 -0
- rem/services/fs/examples.py +206 -0
- rem/services/fs/examples_paths.py +204 -0
- rem/services/fs/git_provider.py +935 -0
- rem/services/fs/local_provider.py +760 -0
- rem/services/fs/parsing-hooks-examples.md +172 -0
- rem/services/fs/paths.py +276 -0
- rem/services/fs/provider.py +460 -0
- rem/services/fs/s3_provider.py +1042 -0
- rem/services/fs/service.py +186 -0
- rem/services/git/README.md +1075 -0
- rem/services/git/__init__.py +17 -0
- rem/services/git/service.py +469 -0
- rem/services/phoenix/EXPERIMENT_DESIGN.md +1146 -0
- rem/services/phoenix/README.md +453 -0
- rem/services/phoenix/__init__.py +46 -0
- rem/services/phoenix/client.py +686 -0
- rem/services/phoenix/config.py +88 -0
- rem/services/phoenix/prompt_labels.py +477 -0
- rem/services/postgres/README.md +575 -0
- rem/services/postgres/__init__.py +23 -0
- rem/services/postgres/migration_service.py +427 -0
- rem/services/postgres/pydantic_to_sqlalchemy.py +232 -0
- rem/services/postgres/register_type.py +352 -0
- rem/services/postgres/repository.py +337 -0
- rem/services/postgres/schema_generator.py +379 -0
- rem/services/postgres/service.py +802 -0
- rem/services/postgres/sql_builder.py +354 -0
- rem/services/rem/README.md +304 -0
- rem/services/rem/__init__.py +23 -0
- rem/services/rem/exceptions.py +71 -0
- rem/services/rem/executor.py +293 -0
- rem/services/rem/parser.py +145 -0
- rem/services/rem/queries.py +196 -0
- rem/services/rem/query.py +371 -0
- rem/services/rem/service.py +527 -0
- rem/services/session/README.md +374 -0
- rem/services/session/__init__.py +6 -0
- rem/services/session/compression.py +360 -0
- rem/services/session/reload.py +77 -0
- rem/settings.py +1235 -0
- rem/sql/002_install_models.sql +1068 -0
- rem/sql/background_indexes.sql +42 -0
- rem/sql/install_models.sql +1038 -0
- rem/sql/migrations/001_install.sql +503 -0
- rem/sql/migrations/002_install_models.sql +1202 -0
- rem/utils/AGENTIC_CHUNKING.md +597 -0
- rem/utils/README.md +583 -0
- rem/utils/__init__.py +43 -0
- rem/utils/agentic_chunking.py +622 -0
- rem/utils/batch_ops.py +343 -0
- rem/utils/chunking.py +108 -0
- rem/utils/clip_embeddings.py +276 -0
- rem/utils/dict_utils.py +98 -0
- rem/utils/embeddings.py +423 -0
- rem/utils/examples/embeddings_example.py +305 -0
- rem/utils/examples/sql_types_example.py +202 -0
- rem/utils/markdown.py +16 -0
- rem/utils/model_helpers.py +236 -0
- rem/utils/schema_loader.py +336 -0
- rem/utils/sql_types.py +348 -0
- rem/utils/user_id.py +81 -0
- rem/utils/vision.py +330 -0
- rem/workers/README.md +506 -0
- rem/workers/__init__.py +5 -0
- rem/workers/dreaming.py +502 -0
- rem/workers/engram_processor.py +312 -0
- rem/workers/sqs_file_processor.py +193 -0
- remdb-0.3.0.dist-info/METADATA +1455 -0
- remdb-0.3.0.dist-info/RECORD +187 -0
- remdb-0.3.0.dist-info/WHEEL +4 -0
- remdb-0.3.0.dist-info/entry_points.txt +2 -0
|
@@ -0,0 +1,686 @@
|
|
|
1
|
+
"""Phoenix client for REM evaluation workflows.
|
|
2
|
+
|
|
3
|
+
This client provides a lean interface to Arize Phoenix for:
|
|
4
|
+
- Dataset management (create golden sets, add examples)
|
|
5
|
+
- Experiment execution (run agents, run evaluators)
|
|
6
|
+
- Trace retrieval (query agent execution history)
|
|
7
|
+
- Label management (organize evaluations by type/difficulty)
|
|
8
|
+
|
|
9
|
+
Two-Phase Evaluation Pattern:
|
|
10
|
+
==============================
|
|
11
|
+
|
|
12
|
+
Phase 1 - Golden Set Creation (SME-driven):
|
|
13
|
+
1. SMEs create datasets with (input, reference) pairs
|
|
14
|
+
2. Store in Phoenix with metadata labels
|
|
15
|
+
3. No agent execution required
|
|
16
|
+
|
|
17
|
+
Phase 2 - Automated Evaluation (Agent-driven):
|
|
18
|
+
1. Run agents on golden set → agent outputs
|
|
19
|
+
2. Run evaluators on (input, agent_output, reference) → scores
|
|
20
|
+
3. Track in Phoenix for analysis
|
|
21
|
+
|
|
22
|
+
Example Workflow:
|
|
23
|
+
-----------------
|
|
24
|
+
|
|
25
|
+
# Phase 1: SME creates golden set
|
|
26
|
+
client = PhoenixClient()
|
|
27
|
+
dataset = client.create_dataset_from_data(
|
|
28
|
+
name="rem-lookup-golden",
|
|
29
|
+
inputs=[{"query": "LOOKUP person:sarah-chen"}],
|
|
30
|
+
outputs=[{"label": "sarah-chen", "type": "person", ...}],
|
|
31
|
+
metadata=[{"difficulty": "easy", "query_type": "LOOKUP"}]
|
|
32
|
+
)
|
|
33
|
+
|
|
34
|
+
# Phase 2a: Run agents to produce outputs
|
|
35
|
+
experiment = client.run_experiment(
|
|
36
|
+
dataset=dataset,
|
|
37
|
+
task=run_agent_task, # Calls ask_rem agent
|
|
38
|
+
experiment_name="rem-v1-baseline"
|
|
39
|
+
)
|
|
40
|
+
|
|
41
|
+
# Phase 2b: Run evaluators on results
|
|
42
|
+
evaluator_exp = client.run_experiment(
|
|
43
|
+
dataset=experiment_results, # Uses agent outputs
|
|
44
|
+
task=None, # No task, just evaluate existing outputs
|
|
45
|
+
evaluators=[correctness_evaluator, completeness_evaluator],
|
|
46
|
+
experiment_name="rem-v1-evaluation"
|
|
47
|
+
)
|
|
48
|
+
"""
|
|
49
|
+
|
|
50
|
+
from __future__ import annotations
|
|
51
|
+
|
|
52
|
+
from datetime import datetime
|
|
53
|
+
from pathlib import Path
|
|
54
|
+
from typing import Any, Callable, TYPE_CHECKING, cast
|
|
55
|
+
|
|
56
|
+
import pandas as pd
|
|
57
|
+
from loguru import logger
|
|
58
|
+
|
|
59
|
+
from .config import PhoenixConfig
|
|
60
|
+
|
|
61
|
+
if TYPE_CHECKING:
|
|
62
|
+
from phoenix.client import Client
|
|
63
|
+
from phoenix.client.resources.datasets import Dataset
|
|
64
|
+
from phoenix.client.resources.experiments.types import RanExperiment
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
class PhoenixClient:
    """High-level Phoenix client for REM evaluation workflows.

    Thin REM-oriented wrapper around the official Phoenix client exposing:
    - Dataset creation and management (golden sets, CSV import, traces)
    - Experiment execution (agent tasks and evaluator runs)
    - Trace querying for analysis
    - Dataset label management

    Attributes:
        config: Phoenix connection configuration
        _client: Underlying Phoenix Client instance
    """

    def __init__(self, config: PhoenixConfig | None = None):
        """Initialize Phoenix client.

        Args:
            config: Optional Phoenix configuration; when omitted, the
                configuration is loaded from application settings.
        """
        # Fall back to settings-derived configuration when none was supplied.
        self.config = PhoenixConfig.from_settings() if config is None else config
        self._client = self._create_client()
        logger.info(f"Phoenix client initialized (endpoint: {self.config.base_url})")
def _create_client(self) -> "Client":
|
|
96
|
+
"""Create underlying Phoenix client.
|
|
97
|
+
|
|
98
|
+
Returns:
|
|
99
|
+
Configured Phoenix Client instance
|
|
100
|
+
"""
|
|
101
|
+
from phoenix.client import Client
|
|
102
|
+
|
|
103
|
+
return Client(
|
|
104
|
+
base_url=self.config.base_url,
|
|
105
|
+
api_key=self.config.api_key,
|
|
106
|
+
)
|
|
107
|
+
|
|
108
|
+
# =========================================================================
|
|
109
|
+
# DATASET MANAGEMENT
|
|
110
|
+
# =========================================================================
|
|
111
|
+
|
|
112
|
+
def list_datasets(self) -> list[dict[str, Any]]:
|
|
113
|
+
"""List all datasets in Phoenix.
|
|
114
|
+
|
|
115
|
+
Returns:
|
|
116
|
+
List of dataset metadata dicts with keys:
|
|
117
|
+
- id: Dataset ID
|
|
118
|
+
- name: Dataset name
|
|
119
|
+
- example_count: Number of examples
|
|
120
|
+
- created_at: Creation timestamp
|
|
121
|
+
"""
|
|
122
|
+
try:
|
|
123
|
+
datasets = list(self._client.datasets.list())
|
|
124
|
+
logger.debug(f"Found {len(datasets)} datasets")
|
|
125
|
+
return [
|
|
126
|
+
{
|
|
127
|
+
"id": str(ds.get("id", "")),
|
|
128
|
+
"name": ds.get("name", ""),
|
|
129
|
+
"example_count": ds.get("example_count", 0),
|
|
130
|
+
"created_at": ds.get("created_at", ""),
|
|
131
|
+
}
|
|
132
|
+
for ds in datasets
|
|
133
|
+
]
|
|
134
|
+
except Exception as e:
|
|
135
|
+
logger.error(f"Failed to list datasets: {e}")
|
|
136
|
+
raise
|
|
137
|
+
|
|
138
|
+
def get_dataset(self, name: str) -> "Dataset":
|
|
139
|
+
"""Get a dataset by name.
|
|
140
|
+
|
|
141
|
+
Args:
|
|
142
|
+
name: Dataset name
|
|
143
|
+
|
|
144
|
+
Returns:
|
|
145
|
+
Dataset instance
|
|
146
|
+
|
|
147
|
+
Raises:
|
|
148
|
+
ValueError: If dataset not found
|
|
149
|
+
"""
|
|
150
|
+
try:
|
|
151
|
+
dataset = self._client.datasets.get_dataset(dataset=name)
|
|
152
|
+
logger.debug(f"Loaded dataset: {name} ({len(dataset)} examples)")
|
|
153
|
+
return dataset
|
|
154
|
+
except Exception as e:
|
|
155
|
+
logger.error(f"Failed to get dataset '{name}': {e}")
|
|
156
|
+
raise ValueError(f"Dataset not found: {name}") from e
|
|
157
|
+
|
|
158
|
+
def create_dataset_from_data(
|
|
159
|
+
self,
|
|
160
|
+
name: str,
|
|
161
|
+
inputs: list[dict[str, Any]],
|
|
162
|
+
outputs: list[dict[str, Any]],
|
|
163
|
+
metadata: list[dict[str, Any]] | None = None,
|
|
164
|
+
description: str | None = None,
|
|
165
|
+
) -> "Dataset":
|
|
166
|
+
"""Create a dataset from input/output pairs (SME golden set creation).
|
|
167
|
+
|
|
168
|
+
This is the primary method for SMEs to create evaluation datasets.
|
|
169
|
+
Each example consists of:
|
|
170
|
+
- input: What the agent receives (e.g., {"query": "LOOKUP person:sarah-chen"})
|
|
171
|
+
- output: Expected correct result (ground truth/reference)
|
|
172
|
+
- metadata: Optional labels (difficulty, query_type, etc.)
|
|
173
|
+
|
|
174
|
+
Args:
|
|
175
|
+
name: Dataset name (will be created or updated)
|
|
176
|
+
inputs: List of input dicts (what agents receive)
|
|
177
|
+
outputs: List of expected output dicts (ground truth)
|
|
178
|
+
metadata: Optional list of metadata dicts (labels, difficulty, etc.)
|
|
179
|
+
description: Optional dataset description
|
|
180
|
+
|
|
181
|
+
Returns:
|
|
182
|
+
Created Dataset instance
|
|
183
|
+
|
|
184
|
+
Example:
|
|
185
|
+
>>> client = PhoenixClient()
|
|
186
|
+
>>> dataset = client.create_dataset_from_data(
|
|
187
|
+
... name="rem-lookup-golden",
|
|
188
|
+
... inputs=[
|
|
189
|
+
... {"query": "LOOKUP person:sarah-chen"},
|
|
190
|
+
... {"query": "LOOKUP project:tidb-migration"}
|
|
191
|
+
... ],
|
|
192
|
+
... outputs=[
|
|
193
|
+
... {"label": "sarah-chen", "type": "person", "properties": {...}},
|
|
194
|
+
... {"label": "tidb-migration", "type": "project", "properties": {...}}
|
|
195
|
+
... ],
|
|
196
|
+
... metadata=[
|
|
197
|
+
... {"difficulty": "easy", "query_type": "LOOKUP"},
|
|
198
|
+
... {"difficulty": "medium", "query_type": "LOOKUP"}
|
|
199
|
+
... ]
|
|
200
|
+
... )
|
|
201
|
+
"""
|
|
202
|
+
try:
|
|
203
|
+
# Validate inputs/outputs match
|
|
204
|
+
if len(inputs) != len(outputs):
|
|
205
|
+
raise ValueError(
|
|
206
|
+
f"Input count ({len(inputs)}) must match output count ({len(outputs)})"
|
|
207
|
+
)
|
|
208
|
+
|
|
209
|
+
# Create metadata list if not provided
|
|
210
|
+
if metadata is None:
|
|
211
|
+
metadata = [{} for _ in inputs]
|
|
212
|
+
elif len(metadata) != len(inputs):
|
|
213
|
+
raise ValueError(
|
|
214
|
+
f"Metadata count ({len(metadata)}) must match input count ({len(inputs)})"
|
|
215
|
+
)
|
|
216
|
+
|
|
217
|
+
# Create dataset
|
|
218
|
+
dataset = self._client.datasets.create_dataset(
|
|
219
|
+
name=name,
|
|
220
|
+
dataset_description=description,
|
|
221
|
+
inputs=inputs,
|
|
222
|
+
outputs=outputs,
|
|
223
|
+
metadata=metadata,
|
|
224
|
+
)
|
|
225
|
+
|
|
226
|
+
logger.info(f"Created dataset '{name}' with {len(inputs)} examples")
|
|
227
|
+
return dataset
|
|
228
|
+
|
|
229
|
+
except Exception as e:
|
|
230
|
+
logger.error(f"Failed to create dataset '{name}': {e}")
|
|
231
|
+
raise
|
|
232
|
+
|
|
233
|
+
def create_dataset_from_csv(
|
|
234
|
+
self,
|
|
235
|
+
name: str,
|
|
236
|
+
csv_file_path: Path | str,
|
|
237
|
+
input_keys: list[str],
|
|
238
|
+
output_keys: list[str],
|
|
239
|
+
metadata_keys: list[str] | None = None,
|
|
240
|
+
description: str | None = None,
|
|
241
|
+
) -> "Dataset":
|
|
242
|
+
"""Create a dataset from a CSV file.
|
|
243
|
+
|
|
244
|
+
Convenience method for loading golden sets from CSV files.
|
|
245
|
+
|
|
246
|
+
Args:
|
|
247
|
+
name: Dataset name
|
|
248
|
+
csv_file_path: Path to CSV file
|
|
249
|
+
input_keys: Column names to use as inputs
|
|
250
|
+
output_keys: Column names to use as outputs (reference/ground truth)
|
|
251
|
+
metadata_keys: Optional column names for metadata
|
|
252
|
+
description: Optional dataset description
|
|
253
|
+
|
|
254
|
+
Returns:
|
|
255
|
+
Created Dataset instance
|
|
256
|
+
|
|
257
|
+
Example CSV structure:
|
|
258
|
+
query,expected_label,expected_type,difficulty,query_type
|
|
259
|
+
"LOOKUP person:sarah-chen",sarah-chen,person,easy,LOOKUP
|
|
260
|
+
"SEARCH semantic AI engineer",sarah-chen,person,medium,SEARCH
|
|
261
|
+
"""
|
|
262
|
+
try:
|
|
263
|
+
# Load CSV
|
|
264
|
+
df = pd.read_csv(csv_file_path)
|
|
265
|
+
|
|
266
|
+
# Extract inputs
|
|
267
|
+
inputs = cast(list[dict[str, Any]], df[input_keys].to_dict("records"))
|
|
268
|
+
|
|
269
|
+
# Extract outputs
|
|
270
|
+
outputs = cast(list[dict[str, Any]], df[output_keys].to_dict("records"))
|
|
271
|
+
|
|
272
|
+
# Extract metadata if specified
|
|
273
|
+
metadata = None
|
|
274
|
+
if metadata_keys:
|
|
275
|
+
metadata = cast(list[dict[str, Any]], df[metadata_keys].to_dict("records"))
|
|
276
|
+
|
|
277
|
+
return self.create_dataset_from_data(
|
|
278
|
+
name=name,
|
|
279
|
+
inputs=inputs,
|
|
280
|
+
outputs=outputs,
|
|
281
|
+
metadata=metadata,
|
|
282
|
+
description=description,
|
|
283
|
+
)
|
|
284
|
+
|
|
285
|
+
except Exception as e:
|
|
286
|
+
logger.error(f"Failed to create dataset from CSV '{csv_file_path}': {e}")
|
|
287
|
+
raise
|
|
288
|
+
|
|
289
|
+
def add_examples_to_dataset(
|
|
290
|
+
self,
|
|
291
|
+
dataset: str,
|
|
292
|
+
inputs: list[dict[str, Any]],
|
|
293
|
+
outputs: list[dict[str, Any]],
|
|
294
|
+
metadata: list[dict[str, Any]] | None = None,
|
|
295
|
+
) -> "Dataset":
|
|
296
|
+
"""Add examples to an existing dataset.
|
|
297
|
+
|
|
298
|
+
Args:
|
|
299
|
+
dataset: Dataset name
|
|
300
|
+
inputs: List of input dicts
|
|
301
|
+
outputs: List of output dicts
|
|
302
|
+
metadata: Optional list of metadata dicts
|
|
303
|
+
|
|
304
|
+
Returns:
|
|
305
|
+
Updated Dataset instance
|
|
306
|
+
"""
|
|
307
|
+
try:
|
|
308
|
+
if len(inputs) != len(outputs):
|
|
309
|
+
raise ValueError("Input/output counts must match")
|
|
310
|
+
|
|
311
|
+
if metadata is None:
|
|
312
|
+
metadata = [{} for _ in inputs]
|
|
313
|
+
|
|
314
|
+
updated_dataset = self._client.datasets.add_examples_to_dataset(
|
|
315
|
+
dataset, # Positional argument instead of keyword
|
|
316
|
+
inputs=inputs,
|
|
317
|
+
outputs=outputs,
|
|
318
|
+
metadata=metadata,
|
|
319
|
+
)
|
|
320
|
+
|
|
321
|
+
logger.info(f"Added {len(inputs)} examples to dataset '{dataset}'")
|
|
322
|
+
return updated_dataset
|
|
323
|
+
|
|
324
|
+
except Exception as e:
|
|
325
|
+
logger.error(f"Failed to add examples to dataset '{dataset}': {e}")
|
|
326
|
+
raise
|
|
327
|
+
|
|
328
|
+
# =========================================================================
|
|
329
|
+
# EXPERIMENT EXECUTION
|
|
330
|
+
# =========================================================================
|
|
331
|
+
|
|
332
|
+
    def run_experiment(
        self,
        dataset: "Dataset" | str,
        task: Callable[[Any], Any] | None = None,
        evaluators: list[Callable[[Any], Any]] | None = None,
        experiment_name: str | None = None,
        experiment_description: str | None = None,
        experiment_metadata: dict[str, Any] | None = None,
        experiment_config: Any | None = None,
    ) -> "RanExperiment":
        """Run an evaluation experiment.

        Three modes:
        1. ExperimentConfig mode: Provide experiment_config with all settings
        2. Agent run: Provide task function to execute agents on dataset
        3. Evaluator run: Provide evaluators to score existing outputs

        Args:
            dataset: Dataset instance or name (required unless experiment_config provided)
            task: Optional task function to run on each example (agent execution)
            evaluators: Optional list of evaluator functions
            experiment_name: Optional experiment name
            experiment_description: Optional description
            experiment_metadata: Optional metadata dict
            experiment_config: Optional ExperimentConfig instance (overrides other params)

        Returns:
            RanExperiment with results

        Example - Agent Run (Phase 2a):
            >>> async def run_agent(example):
            ...     from rem.mcp.tools.rem import ask_rem
            ...     result = await ask_rem(example["input"]["query"])
            ...     return result
            >>> experiment = client.run_experiment(
            ...     dataset="rem-lookup-golden",
            ...     task=run_agent,
            ...     experiment_name="rem-v1-baseline"
            ... )

        Example - Evaluator Run (Phase 2b):
            >>> experiment = client.run_experiment(
            ...     dataset=agent_results,
            ...     evaluators=[correctness_eval, completeness_eval],
            ...     experiment_name="rem-v1-evaluation"
            ... )
        """
        try:
            # Handle ExperimentConfig mode: explicit parameters win over the
            # values carried by the config object.
            if experiment_config:
                experiment_name = experiment_name or experiment_config.name
                experiment_description = experiment_description or experiment_config.description

                # Merge metadata derived from the config's schema references.
                config_metadata = {
                    "agent_schema": experiment_config.agent_schema_ref.name,
                    "agent_version": experiment_config.agent_schema_ref.version,
                    "evaluator_schema": experiment_config.evaluator_schema_ref.name,
                    "evaluator_version": experiment_config.evaluator_schema_ref.version,
                    "config_status": experiment_config.status.value,
                    "config_tags": experiment_config.tags,
                }
                config_metadata.update(experiment_config.metadata or {})
                # NOTE(review): explicitly-passed experiment_metadata REPLACES the
                # config-derived metadata entirely (no merge) — confirm intended.
                experiment_metadata = experiment_metadata or config_metadata

                # Use ground_truth dataset if dataset not provided
                if not dataset and "ground_truth" in experiment_config.datasets:
                    dataset_ref = experiment_config.datasets["ground_truth"]
                    # Load from Git or use provided path
                    if dataset_ref.location.value == "git":
                        # Assume dataset is already loaded
                        logger.warning(
                            f"Dataset location is 'git' but path-based loading not implemented. "
                            f"Pass dataset explicitly or use Phoenix dataset name."
                        )
                    else:
                        # NOTE(review): the ref's path is later treated as a Phoenix
                        # dataset *name* by the isinstance(str) branch below — confirm.
                        dataset = dataset_ref.path

            # Load dataset if name provided
            if isinstance(dataset, str):
                dataset = self.get_dataset(dataset)

            logger.info(
                f"Running experiment '{experiment_name or 'unnamed'}' "
                f"on dataset with {len(dataset)} examples"
            )

            # Run experiment
            experiment = self._client.experiments.run_experiment(
                dataset=dataset,
                task=task,  # type: ignore[arg-type]
                evaluators=evaluators or [],
                experiment_name=experiment_name,
                experiment_description=experiment_description,
                experiment_metadata=experiment_metadata,
            )

            logger.success(f"Experiment complete: {experiment_name or 'unnamed'}")
            if hasattr(experiment, "url"):
                logger.info(f"View results: {experiment.url}")  # type: ignore[attr-defined]

            # Update ExperimentConfig if provided
            if experiment_config:
                experiment_config.last_run_at = datetime.now()
                # NOTE(review): status.value was read above, implying an enum-like
                # type, but a raw string is assigned here — confirm the expected
                # type of `status`.
                experiment_config.status = "running" if hasattr(experiment, "runs") else "completed"

            return experiment

        except Exception as e:
            logger.error(f"Failed to run experiment: {e}")
            raise

    # =========================================================================
    # TRACE RETRIEVAL
    # =========================================================================
def get_traces(
|
|
449
|
+
self,
|
|
450
|
+
project_name: str | None = None,
|
|
451
|
+
start_time: datetime | None = None,
|
|
452
|
+
end_time: datetime | None = None,
|
|
453
|
+
limit: int = 100,
|
|
454
|
+
root_spans_only: bool = True,
|
|
455
|
+
trace_id: str | None = None,
|
|
456
|
+
span_id: str | None = None,
|
|
457
|
+
) -> pd.DataFrame:
|
|
458
|
+
"""Query traces from Phoenix.
|
|
459
|
+
|
|
460
|
+
Args:
|
|
461
|
+
project_name: Filter by project name
|
|
462
|
+
start_time: Filter traces after this time
|
|
463
|
+
end_time: Filter traces before this time
|
|
464
|
+
limit: Maximum number of traces to return
|
|
465
|
+
root_spans_only: Only return root spans (default: True)
|
|
466
|
+
trace_id: Filter by specific trace ID
|
|
467
|
+
span_id: Filter by specific span ID
|
|
468
|
+
|
|
469
|
+
Returns:
|
|
470
|
+
DataFrame with trace data
|
|
471
|
+
|
|
472
|
+
Example:
|
|
473
|
+
>>> traces = client.get_traces(
|
|
474
|
+
... project_name="rem-agents",
|
|
475
|
+
... start_time=datetime.now() - timedelta(days=7),
|
|
476
|
+
... limit=50
|
|
477
|
+
... )
|
|
478
|
+
"""
|
|
479
|
+
try:
|
|
480
|
+
# Build query
|
|
481
|
+
query_params: dict[str, Any] = {}
|
|
482
|
+
if project_name:
|
|
483
|
+
query_params["project_name"] = project_name
|
|
484
|
+
if start_time:
|
|
485
|
+
query_params["start_time"] = start_time.isoformat()
|
|
486
|
+
if end_time:
|
|
487
|
+
query_params["end_time"] = end_time.isoformat()
|
|
488
|
+
if root_spans_only:
|
|
489
|
+
query_params["root_spans_only"] = True
|
|
490
|
+
if trace_id:
|
|
491
|
+
query_params["trace_id"] = trace_id
|
|
492
|
+
if span_id:
|
|
493
|
+
query_params["span_id"] = span_id
|
|
494
|
+
|
|
495
|
+
# Query traces
|
|
496
|
+
traces_df = self._client.query_spans(limit=limit, **query_params) # type: ignore[attr-defined]
|
|
497
|
+
|
|
498
|
+
logger.debug(f"Retrieved {len(traces_df)} traces")
|
|
499
|
+
return traces_df
|
|
500
|
+
|
|
501
|
+
except Exception as e:
|
|
502
|
+
logger.error(f"Failed to query traces: {e}")
|
|
503
|
+
raise
|
|
504
|
+
|
|
505
|
+
def create_dataset_from_traces(
|
|
506
|
+
self,
|
|
507
|
+
name: str,
|
|
508
|
+
project_name: str,
|
|
509
|
+
start_time: datetime | None = None,
|
|
510
|
+
end_time: datetime | None = None,
|
|
511
|
+
limit: int = 100,
|
|
512
|
+
description: str | None = None,
|
|
513
|
+
) -> "Dataset":
|
|
514
|
+
"""Create a dataset from production traces.
|
|
515
|
+
|
|
516
|
+
Useful for regression testing and coverage analysis.
|
|
517
|
+
|
|
518
|
+
Args:
|
|
519
|
+
name: Dataset name
|
|
520
|
+
project_name: Phoenix project name to query traces from
|
|
521
|
+
start_time: Optional start time for trace window
|
|
522
|
+
end_time: Optional end time for trace window
|
|
523
|
+
limit: Maximum number of traces to include
|
|
524
|
+
description: Optional dataset description
|
|
525
|
+
|
|
526
|
+
Returns:
|
|
527
|
+
Created Dataset instance
|
|
528
|
+
|
|
529
|
+
Example:
|
|
530
|
+
>>> dataset = client.create_dataset_from_traces(
|
|
531
|
+
... name="rem-production-regression",
|
|
532
|
+
... project_name="rem-production",
|
|
533
|
+
... start_time=datetime.now() - timedelta(days=30),
|
|
534
|
+
... limit=500
|
|
535
|
+
... )
|
|
536
|
+
"""
|
|
537
|
+
try:
|
|
538
|
+
# Query traces
|
|
539
|
+
traces_df = self.get_traces(
|
|
540
|
+
project_name=project_name,
|
|
541
|
+
start_time=start_time,
|
|
542
|
+
end_time=end_time,
|
|
543
|
+
limit=limit,
|
|
544
|
+
root_spans_only=True,
|
|
545
|
+
)
|
|
546
|
+
|
|
547
|
+
if len(traces_df) == 0:
|
|
548
|
+
raise ValueError("No traces found matching criteria")
|
|
549
|
+
|
|
550
|
+
# Extract inputs and outputs from traces
|
|
551
|
+
inputs = []
|
|
552
|
+
outputs = []
|
|
553
|
+
metadata = []
|
|
554
|
+
|
|
555
|
+
for _, row in traces_df.iterrows():
|
|
556
|
+
# Extract input
|
|
557
|
+
span_input = row.get("attributes.input")
|
|
558
|
+
if span_input:
|
|
559
|
+
if isinstance(span_input, str):
|
|
560
|
+
inputs.append({"input": span_input})
|
|
561
|
+
else:
|
|
562
|
+
inputs.append(span_input)
|
|
563
|
+
else:
|
|
564
|
+
inputs.append({})
|
|
565
|
+
|
|
566
|
+
# Extract output
|
|
567
|
+
span_output = row.get("attributes.output")
|
|
568
|
+
if span_output:
|
|
569
|
+
if isinstance(span_output, str):
|
|
570
|
+
outputs.append({"output": span_output})
|
|
571
|
+
else:
|
|
572
|
+
outputs.append(span_output)
|
|
573
|
+
else:
|
|
574
|
+
outputs.append({})
|
|
575
|
+
|
|
576
|
+
# Extract metadata
|
|
577
|
+
metadata.append({
|
|
578
|
+
"span_id": str(row.get("context.span_id", "")),
|
|
579
|
+
"trace_id": str(row.get("context.trace_id", "")),
|
|
580
|
+
"start_time": str(row.get("start_time", "")),
|
|
581
|
+
"latency_ms": row.get("latency_ms", 0),
|
|
582
|
+
})
|
|
583
|
+
|
|
584
|
+
# Create dataset
|
|
585
|
+
dataset = self.create_dataset_from_data(
|
|
586
|
+
name=name,
|
|
587
|
+
inputs=inputs,
|
|
588
|
+
outputs=outputs,
|
|
589
|
+
metadata=metadata,
|
|
590
|
+
description=description,
|
|
591
|
+
)
|
|
592
|
+
|
|
593
|
+
logger.info(f"Created dataset '{name}' from {len(inputs)} traces")
|
|
594
|
+
return dataset
|
|
595
|
+
|
|
596
|
+
except Exception as e:
|
|
597
|
+
logger.error(f"Failed to create dataset from traces: {e}")
|
|
598
|
+
raise
|
|
599
|
+
|
|
600
|
+
def get_experiment(self, experiment_id: str) -> dict[str, Any]:
|
|
601
|
+
"""Get experiment data including task runs.
|
|
602
|
+
|
|
603
|
+
Args:
|
|
604
|
+
experiment_id: Experiment ID (from Phoenix UI URL)
|
|
605
|
+
|
|
606
|
+
Returns:
|
|
607
|
+
Dictionary with experiment data including:
|
|
608
|
+
- id: Experiment ID
|
|
609
|
+
- name: Experiment name
|
|
610
|
+
- dataset_id: Associated dataset ID
|
|
611
|
+
- experiment_metadata: Metadata dict
|
|
612
|
+
- task_runs: List of task run results
|
|
613
|
+
|
|
614
|
+
Example:
|
|
615
|
+
>>> exp_data = client.get_experiment("RXhwZXJpbWVudDoxMjM=")
|
|
616
|
+
>>> print(f"Experiment: {exp_data['name']}")
|
|
617
|
+
>>> print(f"Task runs: {len(exp_data['task_runs'])}")
|
|
618
|
+
"""
|
|
619
|
+
try:
|
|
620
|
+
# Get experiment object
|
|
621
|
+
experiment = self._client.experiments.get_experiment(experiment_id) # type: ignore[misc]
|
|
622
|
+
|
|
623
|
+
# Extract task runs
|
|
624
|
+
task_runs = []
|
|
625
|
+
for run in experiment.runs: # type: ignore[attr-defined]
|
|
626
|
+
task_runs.append({
|
|
627
|
+
"input": run.input,
|
|
628
|
+
"output": run.output,
|
|
629
|
+
"expected": run.expected,
|
|
630
|
+
"dataset_example_id": getattr(run, "dataset_example_id", None),
|
|
631
|
+
})
|
|
632
|
+
|
|
633
|
+
# Build response
|
|
634
|
+
exp_data = {
|
|
635
|
+
"id": experiment.id, # type: ignore[attr-defined]
|
|
636
|
+
"name": experiment.name, # type: ignore[attr-defined]
|
|
637
|
+
"dataset_id": experiment.dataset_id, # type: ignore[attr-defined]
|
|
638
|
+
"experiment_metadata": experiment.metadata or {}, # type: ignore[attr-defined]
|
|
639
|
+
"task_runs": task_runs,
|
|
640
|
+
}
|
|
641
|
+
|
|
642
|
+
logger.info(f"Retrieved experiment '{experiment.name}' with {len(task_runs)} task runs") # type: ignore[attr-defined]
|
|
643
|
+
return exp_data
|
|
644
|
+
|
|
645
|
+
except Exception as e:
|
|
646
|
+
logger.error(f"Failed to get experiment '{experiment_id}': {e}")
|
|
647
|
+
raise
|
|
648
|
+
|
|
649
|
+
# =========================================================================
|
|
650
|
+
# FEEDBACK/ANNOTATION
|
|
651
|
+
# =========================================================================
|
|
652
|
+
|
|
653
|
+
def add_span_feedback(
|
|
654
|
+
self,
|
|
655
|
+
span_id: str,
|
|
656
|
+
annotation_name: str,
|
|
657
|
+
annotator_kind: str = "HUMAN",
|
|
658
|
+
label: str | None = None,
|
|
659
|
+
score: float | None = None,
|
|
660
|
+
explanation: str | None = None,
|
|
661
|
+
) -> None:
|
|
662
|
+
"""Add feedback annotation to a span.
|
|
663
|
+
|
|
664
|
+
Args:
|
|
665
|
+
span_id: Span ID to annotate
|
|
666
|
+
annotation_name: Name of the annotation (e.g., "correctness")
|
|
667
|
+
annotator_kind: Type of annotator ("HUMAN", "LLM", "CODE")
|
|
668
|
+
label: Optional label (e.g., "correct", "incorrect")
|
|
669
|
+
score: Optional numeric score (0.0-1.0)
|
|
670
|
+
explanation: Optional explanation text
|
|
671
|
+
"""
|
|
672
|
+
try:
|
|
673
|
+
self._client.add_span_annotation( # type: ignore[attr-defined]
|
|
674
|
+
span_id=span_id,
|
|
675
|
+
name=annotation_name,
|
|
676
|
+
annotator_kind=annotator_kind,
|
|
677
|
+
label=label,
|
|
678
|
+
score=score,
|
|
679
|
+
explanation=explanation,
|
|
680
|
+
)
|
|
681
|
+
|
|
682
|
+
logger.info(f"Added {annotator_kind} feedback to span {span_id}")
|
|
683
|
+
|
|
684
|
+
except Exception as e:
|
|
685
|
+
logger.error(f"Failed to add span feedback: {e}")
|
|
686
|
+
raise
|