remdb 0.2.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of remdb might be problematic. Click here for more details.

Files changed (187)
  1. rem/__init__.py +2 -0
  2. rem/agentic/README.md +650 -0
  3. rem/agentic/__init__.py +39 -0
  4. rem/agentic/agents/README.md +155 -0
  5. rem/agentic/agents/__init__.py +8 -0
  6. rem/agentic/context.py +148 -0
  7. rem/agentic/context_builder.py +329 -0
  8. rem/agentic/mcp/__init__.py +0 -0
  9. rem/agentic/mcp/tool_wrapper.py +107 -0
  10. rem/agentic/otel/__init__.py +5 -0
  11. rem/agentic/otel/setup.py +151 -0
  12. rem/agentic/providers/phoenix.py +674 -0
  13. rem/agentic/providers/pydantic_ai.py +572 -0
  14. rem/agentic/query.py +117 -0
  15. rem/agentic/query_helper.py +89 -0
  16. rem/agentic/schema.py +396 -0
  17. rem/agentic/serialization.py +245 -0
  18. rem/agentic/tools/__init__.py +5 -0
  19. rem/agentic/tools/rem_tools.py +231 -0
  20. rem/api/README.md +420 -0
  21. rem/api/main.py +324 -0
  22. rem/api/mcp_router/prompts.py +182 -0
  23. rem/api/mcp_router/resources.py +536 -0
  24. rem/api/mcp_router/server.py +213 -0
  25. rem/api/mcp_router/tools.py +584 -0
  26. rem/api/routers/auth.py +229 -0
  27. rem/api/routers/chat/__init__.py +5 -0
  28. rem/api/routers/chat/completions.py +281 -0
  29. rem/api/routers/chat/json_utils.py +76 -0
  30. rem/api/routers/chat/models.py +124 -0
  31. rem/api/routers/chat/streaming.py +185 -0
  32. rem/auth/README.md +258 -0
  33. rem/auth/__init__.py +26 -0
  34. rem/auth/middleware.py +100 -0
  35. rem/auth/providers/__init__.py +13 -0
  36. rem/auth/providers/base.py +376 -0
  37. rem/auth/providers/google.py +163 -0
  38. rem/auth/providers/microsoft.py +237 -0
  39. rem/cli/README.md +455 -0
  40. rem/cli/__init__.py +8 -0
  41. rem/cli/commands/README.md +126 -0
  42. rem/cli/commands/__init__.py +3 -0
  43. rem/cli/commands/ask.py +565 -0
  44. rem/cli/commands/configure.py +423 -0
  45. rem/cli/commands/db.py +493 -0
  46. rem/cli/commands/dreaming.py +324 -0
  47. rem/cli/commands/experiments.py +1124 -0
  48. rem/cli/commands/mcp.py +66 -0
  49. rem/cli/commands/process.py +245 -0
  50. rem/cli/commands/schema.py +183 -0
  51. rem/cli/commands/serve.py +106 -0
  52. rem/cli/dreaming.py +363 -0
  53. rem/cli/main.py +88 -0
  54. rem/config.py +237 -0
  55. rem/mcp_server.py +41 -0
  56. rem/models/core/__init__.py +49 -0
  57. rem/models/core/core_model.py +64 -0
  58. rem/models/core/engram.py +333 -0
  59. rem/models/core/experiment.py +628 -0
  60. rem/models/core/inline_edge.py +132 -0
  61. rem/models/core/rem_query.py +243 -0
  62. rem/models/entities/__init__.py +43 -0
  63. rem/models/entities/file.py +57 -0
  64. rem/models/entities/image_resource.py +88 -0
  65. rem/models/entities/message.py +35 -0
  66. rem/models/entities/moment.py +123 -0
  67. rem/models/entities/ontology.py +191 -0
  68. rem/models/entities/ontology_config.py +131 -0
  69. rem/models/entities/resource.py +95 -0
  70. rem/models/entities/schema.py +87 -0
  71. rem/models/entities/user.py +85 -0
  72. rem/py.typed +0 -0
  73. rem/schemas/README.md +507 -0
  74. rem/schemas/__init__.py +6 -0
  75. rem/schemas/agents/README.md +92 -0
  76. rem/schemas/agents/core/moment-builder.yaml +178 -0
  77. rem/schemas/agents/core/rem-query-agent.yaml +226 -0
  78. rem/schemas/agents/core/resource-affinity-assessor.yaml +99 -0
  79. rem/schemas/agents/core/simple-assistant.yaml +19 -0
  80. rem/schemas/agents/core/user-profile-builder.yaml +163 -0
  81. rem/schemas/agents/examples/contract-analyzer.yaml +317 -0
  82. rem/schemas/agents/examples/contract-extractor.yaml +134 -0
  83. rem/schemas/agents/examples/cv-parser.yaml +263 -0
  84. rem/schemas/agents/examples/hello-world.yaml +37 -0
  85. rem/schemas/agents/examples/query.yaml +54 -0
  86. rem/schemas/agents/examples/simple.yaml +21 -0
  87. rem/schemas/agents/examples/test.yaml +29 -0
  88. rem/schemas/agents/rem.yaml +128 -0
  89. rem/schemas/evaluators/hello-world/default.yaml +77 -0
  90. rem/schemas/evaluators/rem/faithfulness.yaml +219 -0
  91. rem/schemas/evaluators/rem/lookup-correctness.yaml +182 -0
  92. rem/schemas/evaluators/rem/retrieval-precision.yaml +199 -0
  93. rem/schemas/evaluators/rem/retrieval-recall.yaml +211 -0
  94. rem/schemas/evaluators/rem/search-correctness.yaml +192 -0
  95. rem/services/__init__.py +16 -0
  96. rem/services/audio/INTEGRATION.md +308 -0
  97. rem/services/audio/README.md +376 -0
  98. rem/services/audio/__init__.py +15 -0
  99. rem/services/audio/chunker.py +354 -0
  100. rem/services/audio/transcriber.py +259 -0
  101. rem/services/content/README.md +1269 -0
  102. rem/services/content/__init__.py +5 -0
  103. rem/services/content/providers.py +806 -0
  104. rem/services/content/service.py +657 -0
  105. rem/services/dreaming/README.md +230 -0
  106. rem/services/dreaming/__init__.py +53 -0
  107. rem/services/dreaming/affinity_service.py +336 -0
  108. rem/services/dreaming/moment_service.py +264 -0
  109. rem/services/dreaming/ontology_service.py +54 -0
  110. rem/services/dreaming/user_model_service.py +297 -0
  111. rem/services/dreaming/utils.py +39 -0
  112. rem/services/embeddings/__init__.py +11 -0
  113. rem/services/embeddings/api.py +120 -0
  114. rem/services/embeddings/worker.py +421 -0
  115. rem/services/fs/README.md +662 -0
  116. rem/services/fs/__init__.py +62 -0
  117. rem/services/fs/examples.py +206 -0
  118. rem/services/fs/examples_paths.py +204 -0
  119. rem/services/fs/git_provider.py +935 -0
  120. rem/services/fs/local_provider.py +760 -0
  121. rem/services/fs/parsing-hooks-examples.md +172 -0
  122. rem/services/fs/paths.py +276 -0
  123. rem/services/fs/provider.py +460 -0
  124. rem/services/fs/s3_provider.py +1042 -0
  125. rem/services/fs/service.py +186 -0
  126. rem/services/git/README.md +1075 -0
  127. rem/services/git/__init__.py +17 -0
  128. rem/services/git/service.py +469 -0
  129. rem/services/phoenix/EXPERIMENT_DESIGN.md +1146 -0
  130. rem/services/phoenix/README.md +453 -0
  131. rem/services/phoenix/__init__.py +46 -0
  132. rem/services/phoenix/client.py +686 -0
  133. rem/services/phoenix/config.py +88 -0
  134. rem/services/phoenix/prompt_labels.py +477 -0
  135. rem/services/postgres/README.md +575 -0
  136. rem/services/postgres/__init__.py +23 -0
  137. rem/services/postgres/migration_service.py +427 -0
  138. rem/services/postgres/pydantic_to_sqlalchemy.py +232 -0
  139. rem/services/postgres/register_type.py +352 -0
  140. rem/services/postgres/repository.py +337 -0
  141. rem/services/postgres/schema_generator.py +379 -0
  142. rem/services/postgres/service.py +802 -0
  143. rem/services/postgres/sql_builder.py +354 -0
  144. rem/services/rem/README.md +304 -0
  145. rem/services/rem/__init__.py +23 -0
  146. rem/services/rem/exceptions.py +71 -0
  147. rem/services/rem/executor.py +293 -0
  148. rem/services/rem/parser.py +145 -0
  149. rem/services/rem/queries.py +196 -0
  150. rem/services/rem/query.py +371 -0
  151. rem/services/rem/service.py +527 -0
  152. rem/services/session/README.md +374 -0
  153. rem/services/session/__init__.py +6 -0
  154. rem/services/session/compression.py +360 -0
  155. rem/services/session/reload.py +77 -0
  156. rem/settings.py +1235 -0
  157. rem/sql/002_install_models.sql +1068 -0
  158. rem/sql/background_indexes.sql +42 -0
  159. rem/sql/install_models.sql +1038 -0
  160. rem/sql/migrations/001_install.sql +503 -0
  161. rem/sql/migrations/002_install_models.sql +1202 -0
  162. rem/utils/AGENTIC_CHUNKING.md +597 -0
  163. rem/utils/README.md +583 -0
  164. rem/utils/__init__.py +43 -0
  165. rem/utils/agentic_chunking.py +622 -0
  166. rem/utils/batch_ops.py +343 -0
  167. rem/utils/chunking.py +108 -0
  168. rem/utils/clip_embeddings.py +276 -0
  169. rem/utils/dict_utils.py +98 -0
  170. rem/utils/embeddings.py +423 -0
  171. rem/utils/examples/embeddings_example.py +305 -0
  172. rem/utils/examples/sql_types_example.py +202 -0
  173. rem/utils/markdown.py +16 -0
  174. rem/utils/model_helpers.py +236 -0
  175. rem/utils/schema_loader.py +229 -0
  176. rem/utils/sql_types.py +348 -0
  177. rem/utils/user_id.py +81 -0
  178. rem/utils/vision.py +330 -0
  179. rem/workers/README.md +506 -0
  180. rem/workers/__init__.py +5 -0
  181. rem/workers/dreaming.py +502 -0
  182. rem/workers/engram_processor.py +312 -0
  183. rem/workers/sqs_file_processor.py +193 -0
  184. remdb-0.2.6.dist-info/METADATA +1191 -0
  185. remdb-0.2.6.dist-info/RECORD +187 -0
  186. remdb-0.2.6.dist-info/WHEEL +4 -0
  187. remdb-0.2.6.dist-info/entry_points.txt +2 -0
@@ -0,0 +1,686 @@
1
+ """Phoenix client for REM evaluation workflows.
2
+
3
+ This client provides a lean interface to Arize Phoenix for:
4
+ - Dataset management (create golden sets, add examples)
5
+ - Experiment execution (run agents, run evaluators)
6
+ - Trace retrieval (query agent execution history)
7
+ - Label management (organize evaluations by type/difficulty)
8
+
9
+ Two-Phase Evaluation Pattern:
10
+ ==============================
11
+
12
+ Phase 1 - Golden Set Creation (SME-driven):
13
+ 1. SMEs create datasets with (input, reference) pairs
14
+ 2. Store in Phoenix with metadata labels
15
+ 3. No agent execution required
16
+
17
+ Phase 2 - Automated Evaluation (Agent-driven):
18
+ 1. Run agents on golden set → agent outputs
19
+ 2. Run evaluators on (input, agent_output, reference) → scores
20
+ 3. Track in Phoenix for analysis
21
+
22
+ Example Workflow:
23
+ -----------------
24
+
25
+ # Phase 1: SME creates golden set
26
+ client = PhoenixClient()
27
+ dataset = client.create_dataset_from_data(
28
+ name="rem-lookup-golden",
29
+ inputs=[{"query": "LOOKUP person:sarah-chen"}],
30
+ outputs=[{"label": "sarah-chen", "type": "person", ...}],
31
+ metadata=[{"difficulty": "easy", "query_type": "LOOKUP"}]
32
+ )
33
+
34
+ # Phase 2a: Run agents to produce outputs
35
+ experiment = client.run_experiment(
36
+ dataset=dataset,
37
+ task=run_agent_task, # Calls ask_rem agent
38
+ experiment_name="rem-v1-baseline"
39
+ )
40
+
41
+ # Phase 2b: Run evaluators on results
42
+ evaluator_exp = client.run_experiment(
43
+ dataset=experiment_results, # Uses agent outputs
44
+ task=None, # No task, just evaluate existing outputs
45
+ evaluators=[correctness_evaluator, completeness_evaluator],
46
+ experiment_name="rem-v1-evaluation"
47
+ )
48
+ """
49
+
50
+ from __future__ import annotations
51
+
52
+ from datetime import datetime
53
+ from pathlib import Path
54
+ from typing import Any, Callable, TYPE_CHECKING, cast
55
+
56
+ import pandas as pd
57
+ from loguru import logger
58
+
59
+ from .config import PhoenixConfig
60
+
61
+ if TYPE_CHECKING:
62
+ from phoenix.client import Client
63
+ from phoenix.client.resources.datasets import Dataset
64
+ from phoenix.client.resources.experiments.types import RanExperiment
65
+
66
+
67
class PhoenixClient:
    """High-level Phoenix client for REM evaluation workflows.

    Wraps the official Phoenix client with REM-specific helpers for:
    - Creating and managing evaluation datasets
    - Running agent and evaluator experiments
    - Querying trace data for analysis
    - Managing dataset labels

    Attributes:
        config: Phoenix connection configuration.
        _client: Underlying Phoenix ``Client`` instance.
    """

    def __init__(self, config: PhoenixConfig | None = None):
        """Set up the Phoenix client wrapper.

        Args:
            config: Phoenix connection settings; loaded from application
                settings when omitted.
        """
        # Fall back to settings-derived configuration when none is supplied.
        self.config = config if config is not None else PhoenixConfig.from_settings()
        self._client = self._create_client()
        logger.info(f"Phoenix client initialized (endpoint: {self.config.base_url})")
94
+
95
+ def _create_client(self) -> "Client":
96
+ """Create underlying Phoenix client.
97
+
98
+ Returns:
99
+ Configured Phoenix Client instance
100
+ """
101
+ from phoenix.client import Client
102
+
103
+ return Client(
104
+ base_url=self.config.base_url,
105
+ api_key=self.config.api_key,
106
+ )
107
+
108
+ # =========================================================================
109
+ # DATASET MANAGEMENT
110
+ # =========================================================================
111
+
112
+ def list_datasets(self) -> list[dict[str, Any]]:
113
+ """List all datasets in Phoenix.
114
+
115
+ Returns:
116
+ List of dataset metadata dicts with keys:
117
+ - id: Dataset ID
118
+ - name: Dataset name
119
+ - example_count: Number of examples
120
+ - created_at: Creation timestamp
121
+ """
122
+ try:
123
+ datasets = list(self._client.datasets.list())
124
+ logger.debug(f"Found {len(datasets)} datasets")
125
+ return [
126
+ {
127
+ "id": str(ds.get("id", "")),
128
+ "name": ds.get("name", ""),
129
+ "example_count": ds.get("example_count", 0),
130
+ "created_at": ds.get("created_at", ""),
131
+ }
132
+ for ds in datasets
133
+ ]
134
+ except Exception as e:
135
+ logger.error(f"Failed to list datasets: {e}")
136
+ raise
137
+
138
+ def get_dataset(self, name: str) -> "Dataset":
139
+ """Get a dataset by name.
140
+
141
+ Args:
142
+ name: Dataset name
143
+
144
+ Returns:
145
+ Dataset instance
146
+
147
+ Raises:
148
+ ValueError: If dataset not found
149
+ """
150
+ try:
151
+ dataset = self._client.datasets.get_dataset(dataset=name)
152
+ logger.debug(f"Loaded dataset: {name} ({len(dataset)} examples)")
153
+ return dataset
154
+ except Exception as e:
155
+ logger.error(f"Failed to get dataset '{name}': {e}")
156
+ raise ValueError(f"Dataset not found: {name}") from e
157
+
158
+ def create_dataset_from_data(
159
+ self,
160
+ name: str,
161
+ inputs: list[dict[str, Any]],
162
+ outputs: list[dict[str, Any]],
163
+ metadata: list[dict[str, Any]] | None = None,
164
+ description: str | None = None,
165
+ ) -> "Dataset":
166
+ """Create a dataset from input/output pairs (SME golden set creation).
167
+
168
+ This is the primary method for SMEs to create evaluation datasets.
169
+ Each example consists of:
170
+ - input: What the agent receives (e.g., {"query": "LOOKUP person:sarah-chen"})
171
+ - output: Expected correct result (ground truth/reference)
172
+ - metadata: Optional labels (difficulty, query_type, etc.)
173
+
174
+ Args:
175
+ name: Dataset name (will be created or updated)
176
+ inputs: List of input dicts (what agents receive)
177
+ outputs: List of expected output dicts (ground truth)
178
+ metadata: Optional list of metadata dicts (labels, difficulty, etc.)
179
+ description: Optional dataset description
180
+
181
+ Returns:
182
+ Created Dataset instance
183
+
184
+ Example:
185
+ >>> client = PhoenixClient()
186
+ >>> dataset = client.create_dataset_from_data(
187
+ ... name="rem-lookup-golden",
188
+ ... inputs=[
189
+ ... {"query": "LOOKUP person:sarah-chen"},
190
+ ... {"query": "LOOKUP project:tidb-migration"}
191
+ ... ],
192
+ ... outputs=[
193
+ ... {"label": "sarah-chen", "type": "person", "properties": {...}},
194
+ ... {"label": "tidb-migration", "type": "project", "properties": {...}}
195
+ ... ],
196
+ ... metadata=[
197
+ ... {"difficulty": "easy", "query_type": "LOOKUP"},
198
+ ... {"difficulty": "medium", "query_type": "LOOKUP"}
199
+ ... ]
200
+ ... )
201
+ """
202
+ try:
203
+ # Validate inputs/outputs match
204
+ if len(inputs) != len(outputs):
205
+ raise ValueError(
206
+ f"Input count ({len(inputs)}) must match output count ({len(outputs)})"
207
+ )
208
+
209
+ # Create metadata list if not provided
210
+ if metadata is None:
211
+ metadata = [{} for _ in inputs]
212
+ elif len(metadata) != len(inputs):
213
+ raise ValueError(
214
+ f"Metadata count ({len(metadata)}) must match input count ({len(inputs)})"
215
+ )
216
+
217
+ # Create dataset
218
+ dataset = self._client.datasets.create_dataset(
219
+ name=name,
220
+ dataset_description=description,
221
+ inputs=inputs,
222
+ outputs=outputs,
223
+ metadata=metadata,
224
+ )
225
+
226
+ logger.info(f"Created dataset '{name}' with {len(inputs)} examples")
227
+ return dataset
228
+
229
+ except Exception as e:
230
+ logger.error(f"Failed to create dataset '{name}': {e}")
231
+ raise
232
+
233
+ def create_dataset_from_csv(
234
+ self,
235
+ name: str,
236
+ csv_file_path: Path | str,
237
+ input_keys: list[str],
238
+ output_keys: list[str],
239
+ metadata_keys: list[str] | None = None,
240
+ description: str | None = None,
241
+ ) -> "Dataset":
242
+ """Create a dataset from a CSV file.
243
+
244
+ Convenience method for loading golden sets from CSV files.
245
+
246
+ Args:
247
+ name: Dataset name
248
+ csv_file_path: Path to CSV file
249
+ input_keys: Column names to use as inputs
250
+ output_keys: Column names to use as outputs (reference/ground truth)
251
+ metadata_keys: Optional column names for metadata
252
+ description: Optional dataset description
253
+
254
+ Returns:
255
+ Created Dataset instance
256
+
257
+ Example CSV structure:
258
+ query,expected_label,expected_type,difficulty,query_type
259
+ "LOOKUP person:sarah-chen",sarah-chen,person,easy,LOOKUP
260
+ "SEARCH semantic AI engineer",sarah-chen,person,medium,SEARCH
261
+ """
262
+ try:
263
+ # Load CSV
264
+ df = pd.read_csv(csv_file_path)
265
+
266
+ # Extract inputs
267
+ inputs = cast(list[dict[str, Any]], df[input_keys].to_dict("records"))
268
+
269
+ # Extract outputs
270
+ outputs = cast(list[dict[str, Any]], df[output_keys].to_dict("records"))
271
+
272
+ # Extract metadata if specified
273
+ metadata = None
274
+ if metadata_keys:
275
+ metadata = cast(list[dict[str, Any]], df[metadata_keys].to_dict("records"))
276
+
277
+ return self.create_dataset_from_data(
278
+ name=name,
279
+ inputs=inputs,
280
+ outputs=outputs,
281
+ metadata=metadata,
282
+ description=description,
283
+ )
284
+
285
+ except Exception as e:
286
+ logger.error(f"Failed to create dataset from CSV '{csv_file_path}': {e}")
287
+ raise
288
+
289
+ def add_examples_to_dataset(
290
+ self,
291
+ dataset: str,
292
+ inputs: list[dict[str, Any]],
293
+ outputs: list[dict[str, Any]],
294
+ metadata: list[dict[str, Any]] | None = None,
295
+ ) -> "Dataset":
296
+ """Add examples to an existing dataset.
297
+
298
+ Args:
299
+ dataset: Dataset name
300
+ inputs: List of input dicts
301
+ outputs: List of output dicts
302
+ metadata: Optional list of metadata dicts
303
+
304
+ Returns:
305
+ Updated Dataset instance
306
+ """
307
+ try:
308
+ if len(inputs) != len(outputs):
309
+ raise ValueError("Input/output counts must match")
310
+
311
+ if metadata is None:
312
+ metadata = [{} for _ in inputs]
313
+
314
+ updated_dataset = self._client.datasets.add_examples_to_dataset(
315
+ dataset, # Positional argument instead of keyword
316
+ inputs=inputs,
317
+ outputs=outputs,
318
+ metadata=metadata,
319
+ )
320
+
321
+ logger.info(f"Added {len(inputs)} examples to dataset '{dataset}'")
322
+ return updated_dataset
323
+
324
+ except Exception as e:
325
+ logger.error(f"Failed to add examples to dataset '{dataset}': {e}")
326
+ raise
327
+
328
+ # =========================================================================
329
+ # EXPERIMENT EXECUTION
330
+ # =========================================================================
331
+
332
+ def run_experiment(
333
+ self,
334
+ dataset: "Dataset" | str,
335
+ task: Callable[[Any], Any] | None = None,
336
+ evaluators: list[Callable[[Any], Any]] | None = None,
337
+ experiment_name: str | None = None,
338
+ experiment_description: str | None = None,
339
+ experiment_metadata: dict[str, Any] | None = None,
340
+ experiment_config: Any | None = None,
341
+ ) -> "RanExperiment":
342
+ """Run an evaluation experiment.
343
+
344
+ Three modes:
345
+ 1. ExperimentConfig mode: Provide experiment_config with all settings
346
+ 2. Agent run: Provide task function to execute agents on dataset
347
+ 3. Evaluator run: Provide evaluators to score existing outputs
348
+
349
+ Args:
350
+ dataset: Dataset instance or name (required unless experiment_config provided)
351
+ task: Optional task function to run on each example (agent execution)
352
+ evaluators: Optional list of evaluator functions
353
+ experiment_name: Optional experiment name
354
+ experiment_description: Optional description
355
+ experiment_metadata: Optional metadata dict
356
+ experiment_config: Optional ExperimentConfig instance (overrides other params)
357
+
358
+ Returns:
359
+ RanExperiment with results
360
+
361
+ Example - Agent Run (Phase 2a):
362
+ >>> async def run_agent(example):
363
+ ... from rem.mcp.tools.rem import ask_rem
364
+ ... result = await ask_rem(example["input"]["query"])
365
+ ... return result
366
+ >>> experiment = client.run_experiment(
367
+ ... dataset="rem-lookup-golden",
368
+ ... task=run_agent,
369
+ ... experiment_name="rem-v1-baseline"
370
+ ... )
371
+
372
+ Example - Evaluator Run (Phase 2b):
373
+ >>> experiment = client.run_experiment(
374
+ ... dataset=agent_results,
375
+ ... evaluators=[correctness_eval, completeness_eval],
376
+ ... experiment_name="rem-v1-evaluation"
377
+ ... )
378
+ """
379
+ try:
380
+ # Handle ExperimentConfig mode
381
+ if experiment_config:
382
+ experiment_name = experiment_name or experiment_config.name
383
+ experiment_description = experiment_description or experiment_config.description
384
+
385
+ # Merge metadata
386
+ config_metadata = {
387
+ "agent_schema": experiment_config.agent_schema_ref.name,
388
+ "agent_version": experiment_config.agent_schema_ref.version,
389
+ "evaluator_schema": experiment_config.evaluator_schema_ref.name,
390
+ "evaluator_version": experiment_config.evaluator_schema_ref.version,
391
+ "config_status": experiment_config.status.value,
392
+ "config_tags": experiment_config.tags,
393
+ }
394
+ config_metadata.update(experiment_config.metadata or {})
395
+ experiment_metadata = experiment_metadata or config_metadata
396
+
397
+ # Use ground_truth dataset if dataset not provided
398
+ if not dataset and "ground_truth" in experiment_config.datasets:
399
+ dataset_ref = experiment_config.datasets["ground_truth"]
400
+ # Load from Git or use provided path
401
+ if dataset_ref.location.value == "git":
402
+ # Assume dataset is already loaded
403
+ logger.warning(
404
+ f"Dataset location is 'git' but path-based loading not implemented. "
405
+ f"Pass dataset explicitly or use Phoenix dataset name."
406
+ )
407
+ else:
408
+ dataset = dataset_ref.path
409
+
410
+ # Load dataset if name provided
411
+ if isinstance(dataset, str):
412
+ dataset = self.get_dataset(dataset)
413
+
414
+ logger.info(
415
+ f"Running experiment '{experiment_name or 'unnamed'}' "
416
+ f"on dataset with {len(dataset)} examples"
417
+ )
418
+
419
+ # Run experiment
420
+ experiment = self._client.experiments.run_experiment(
421
+ dataset=dataset,
422
+ task=task, # type: ignore[arg-type]
423
+ evaluators=evaluators or [],
424
+ experiment_name=experiment_name,
425
+ experiment_description=experiment_description,
426
+ experiment_metadata=experiment_metadata,
427
+ )
428
+
429
+ logger.success(f"Experiment complete: {experiment_name or 'unnamed'}")
430
+ if hasattr(experiment, "url"):
431
+ logger.info(f"View results: {experiment.url}") # type: ignore[attr-defined]
432
+
433
+ # Update ExperimentConfig if provided
434
+ if experiment_config:
435
+ experiment_config.last_run_at = datetime.now()
436
+ experiment_config.status = "running" if hasattr(experiment, "runs") else "completed"
437
+
438
+ return experiment
439
+
440
+ except Exception as e:
441
+ logger.error(f"Failed to run experiment: {e}")
442
+ raise
443
+
444
+ # =========================================================================
445
+ # TRACE RETRIEVAL
446
+ # =========================================================================
447
+
448
+ def get_traces(
449
+ self,
450
+ project_name: str | None = None,
451
+ start_time: datetime | None = None,
452
+ end_time: datetime | None = None,
453
+ limit: int = 100,
454
+ root_spans_only: bool = True,
455
+ trace_id: str | None = None,
456
+ span_id: str | None = None,
457
+ ) -> pd.DataFrame:
458
+ """Query traces from Phoenix.
459
+
460
+ Args:
461
+ project_name: Filter by project name
462
+ start_time: Filter traces after this time
463
+ end_time: Filter traces before this time
464
+ limit: Maximum number of traces to return
465
+ root_spans_only: Only return root spans (default: True)
466
+ trace_id: Filter by specific trace ID
467
+ span_id: Filter by specific span ID
468
+
469
+ Returns:
470
+ DataFrame with trace data
471
+
472
+ Example:
473
+ >>> traces = client.get_traces(
474
+ ... project_name="rem-agents",
475
+ ... start_time=datetime.now() - timedelta(days=7),
476
+ ... limit=50
477
+ ... )
478
+ """
479
+ try:
480
+ # Build query
481
+ query_params: dict[str, Any] = {}
482
+ if project_name:
483
+ query_params["project_name"] = project_name
484
+ if start_time:
485
+ query_params["start_time"] = start_time.isoformat()
486
+ if end_time:
487
+ query_params["end_time"] = end_time.isoformat()
488
+ if root_spans_only:
489
+ query_params["root_spans_only"] = True
490
+ if trace_id:
491
+ query_params["trace_id"] = trace_id
492
+ if span_id:
493
+ query_params["span_id"] = span_id
494
+
495
+ # Query traces
496
+ traces_df = self._client.query_spans(limit=limit, **query_params) # type: ignore[attr-defined]
497
+
498
+ logger.debug(f"Retrieved {len(traces_df)} traces")
499
+ return traces_df
500
+
501
+ except Exception as e:
502
+ logger.error(f"Failed to query traces: {e}")
503
+ raise
504
+
505
+ def create_dataset_from_traces(
506
+ self,
507
+ name: str,
508
+ project_name: str,
509
+ start_time: datetime | None = None,
510
+ end_time: datetime | None = None,
511
+ limit: int = 100,
512
+ description: str | None = None,
513
+ ) -> "Dataset":
514
+ """Create a dataset from production traces.
515
+
516
+ Useful for regression testing and coverage analysis.
517
+
518
+ Args:
519
+ name: Dataset name
520
+ project_name: Phoenix project name to query traces from
521
+ start_time: Optional start time for trace window
522
+ end_time: Optional end time for trace window
523
+ limit: Maximum number of traces to include
524
+ description: Optional dataset description
525
+
526
+ Returns:
527
+ Created Dataset instance
528
+
529
+ Example:
530
+ >>> dataset = client.create_dataset_from_traces(
531
+ ... name="rem-production-regression",
532
+ ... project_name="rem-production",
533
+ ... start_time=datetime.now() - timedelta(days=30),
534
+ ... limit=500
535
+ ... )
536
+ """
537
+ try:
538
+ # Query traces
539
+ traces_df = self.get_traces(
540
+ project_name=project_name,
541
+ start_time=start_time,
542
+ end_time=end_time,
543
+ limit=limit,
544
+ root_spans_only=True,
545
+ )
546
+
547
+ if len(traces_df) == 0:
548
+ raise ValueError("No traces found matching criteria")
549
+
550
+ # Extract inputs and outputs from traces
551
+ inputs = []
552
+ outputs = []
553
+ metadata = []
554
+
555
+ for _, row in traces_df.iterrows():
556
+ # Extract input
557
+ span_input = row.get("attributes.input")
558
+ if span_input:
559
+ if isinstance(span_input, str):
560
+ inputs.append({"input": span_input})
561
+ else:
562
+ inputs.append(span_input)
563
+ else:
564
+ inputs.append({})
565
+
566
+ # Extract output
567
+ span_output = row.get("attributes.output")
568
+ if span_output:
569
+ if isinstance(span_output, str):
570
+ outputs.append({"output": span_output})
571
+ else:
572
+ outputs.append(span_output)
573
+ else:
574
+ outputs.append({})
575
+
576
+ # Extract metadata
577
+ metadata.append({
578
+ "span_id": str(row.get("context.span_id", "")),
579
+ "trace_id": str(row.get("context.trace_id", "")),
580
+ "start_time": str(row.get("start_time", "")),
581
+ "latency_ms": row.get("latency_ms", 0),
582
+ })
583
+
584
+ # Create dataset
585
+ dataset = self.create_dataset_from_data(
586
+ name=name,
587
+ inputs=inputs,
588
+ outputs=outputs,
589
+ metadata=metadata,
590
+ description=description,
591
+ )
592
+
593
+ logger.info(f"Created dataset '{name}' from {len(inputs)} traces")
594
+ return dataset
595
+
596
+ except Exception as e:
597
+ logger.error(f"Failed to create dataset from traces: {e}")
598
+ raise
599
+
600
+ def get_experiment(self, experiment_id: str) -> dict[str, Any]:
601
+ """Get experiment data including task runs.
602
+
603
+ Args:
604
+ experiment_id: Experiment ID (from Phoenix UI URL)
605
+
606
+ Returns:
607
+ Dictionary with experiment data including:
608
+ - id: Experiment ID
609
+ - name: Experiment name
610
+ - dataset_id: Associated dataset ID
611
+ - experiment_metadata: Metadata dict
612
+ - task_runs: List of task run results
613
+
614
+ Example:
615
+ >>> exp_data = client.get_experiment("RXhwZXJpbWVudDoxMjM=")
616
+ >>> print(f"Experiment: {exp_data['name']}")
617
+ >>> print(f"Task runs: {len(exp_data['task_runs'])}")
618
+ """
619
+ try:
620
+ # Get experiment object
621
+ experiment = self._client.experiments.get_experiment(experiment_id) # type: ignore[misc]
622
+
623
+ # Extract task runs
624
+ task_runs = []
625
+ for run in experiment.runs: # type: ignore[attr-defined]
626
+ task_runs.append({
627
+ "input": run.input,
628
+ "output": run.output,
629
+ "expected": run.expected,
630
+ "dataset_example_id": getattr(run, "dataset_example_id", None),
631
+ })
632
+
633
+ # Build response
634
+ exp_data = {
635
+ "id": experiment.id, # type: ignore[attr-defined]
636
+ "name": experiment.name, # type: ignore[attr-defined]
637
+ "dataset_id": experiment.dataset_id, # type: ignore[attr-defined]
638
+ "experiment_metadata": experiment.metadata or {}, # type: ignore[attr-defined]
639
+ "task_runs": task_runs,
640
+ }
641
+
642
+ logger.info(f"Retrieved experiment '{experiment.name}' with {len(task_runs)} task runs") # type: ignore[attr-defined]
643
+ return exp_data
644
+
645
+ except Exception as e:
646
+ logger.error(f"Failed to get experiment '{experiment_id}': {e}")
647
+ raise
648
+
649
+ # =========================================================================
650
+ # FEEDBACK/ANNOTATION
651
+ # =========================================================================
652
+
653
+ def add_span_feedback(
654
+ self,
655
+ span_id: str,
656
+ annotation_name: str,
657
+ annotator_kind: str = "HUMAN",
658
+ label: str | None = None,
659
+ score: float | None = None,
660
+ explanation: str | None = None,
661
+ ) -> None:
662
+ """Add feedback annotation to a span.
663
+
664
+ Args:
665
+ span_id: Span ID to annotate
666
+ annotation_name: Name of the annotation (e.g., "correctness")
667
+ annotator_kind: Type of annotator ("HUMAN", "LLM", "CODE")
668
+ label: Optional label (e.g., "correct", "incorrect")
669
+ score: Optional numeric score (0.0-1.0)
670
+ explanation: Optional explanation text
671
+ """
672
+ try:
673
+ self._client.add_span_annotation( # type: ignore[attr-defined]
674
+ span_id=span_id,
675
+ name=annotation_name,
676
+ annotator_kind=annotator_kind,
677
+ label=label,
678
+ score=score,
679
+ explanation=explanation,
680
+ )
681
+
682
+ logger.info(f"Added {annotator_kind} feedback to span {span_id}")
683
+
684
+ except Exception as e:
685
+ logger.error(f"Failed to add span feedback: {e}")
686
+ raise