remdb 0.3.242__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of remdb might be problematic. Click here for more details.

Files changed (235) hide show
  1. rem/__init__.py +129 -0
  2. rem/agentic/README.md +760 -0
  3. rem/agentic/__init__.py +54 -0
  4. rem/agentic/agents/README.md +155 -0
  5. rem/agentic/agents/__init__.py +38 -0
  6. rem/agentic/agents/agent_manager.py +311 -0
  7. rem/agentic/agents/sse_simulator.py +502 -0
  8. rem/agentic/context.py +425 -0
  9. rem/agentic/context_builder.py +360 -0
  10. rem/agentic/llm_provider_models.py +301 -0
  11. rem/agentic/mcp/__init__.py +0 -0
  12. rem/agentic/mcp/tool_wrapper.py +273 -0
  13. rem/agentic/otel/__init__.py +5 -0
  14. rem/agentic/otel/setup.py +240 -0
  15. rem/agentic/providers/phoenix.py +926 -0
  16. rem/agentic/providers/pydantic_ai.py +854 -0
  17. rem/agentic/query.py +117 -0
  18. rem/agentic/query_helper.py +89 -0
  19. rem/agentic/schema.py +737 -0
  20. rem/agentic/serialization.py +245 -0
  21. rem/agentic/tools/__init__.py +5 -0
  22. rem/agentic/tools/rem_tools.py +242 -0
  23. rem/api/README.md +657 -0
  24. rem/api/deps.py +253 -0
  25. rem/api/main.py +460 -0
  26. rem/api/mcp_router/prompts.py +182 -0
  27. rem/api/mcp_router/resources.py +820 -0
  28. rem/api/mcp_router/server.py +243 -0
  29. rem/api/mcp_router/tools.py +1605 -0
  30. rem/api/middleware/tracking.py +172 -0
  31. rem/api/routers/admin.py +520 -0
  32. rem/api/routers/auth.py +898 -0
  33. rem/api/routers/chat/__init__.py +5 -0
  34. rem/api/routers/chat/child_streaming.py +394 -0
  35. rem/api/routers/chat/completions.py +702 -0
  36. rem/api/routers/chat/json_utils.py +76 -0
  37. rem/api/routers/chat/models.py +202 -0
  38. rem/api/routers/chat/otel_utils.py +33 -0
  39. rem/api/routers/chat/sse_events.py +546 -0
  40. rem/api/routers/chat/streaming.py +950 -0
  41. rem/api/routers/chat/streaming_utils.py +327 -0
  42. rem/api/routers/common.py +18 -0
  43. rem/api/routers/dev.py +87 -0
  44. rem/api/routers/feedback.py +276 -0
  45. rem/api/routers/messages.py +620 -0
  46. rem/api/routers/models.py +86 -0
  47. rem/api/routers/query.py +362 -0
  48. rem/api/routers/shared_sessions.py +422 -0
  49. rem/auth/README.md +258 -0
  50. rem/auth/__init__.py +36 -0
  51. rem/auth/jwt.py +367 -0
  52. rem/auth/middleware.py +318 -0
  53. rem/auth/providers/__init__.py +16 -0
  54. rem/auth/providers/base.py +376 -0
  55. rem/auth/providers/email.py +215 -0
  56. rem/auth/providers/google.py +163 -0
  57. rem/auth/providers/microsoft.py +237 -0
  58. rem/cli/README.md +517 -0
  59. rem/cli/__init__.py +8 -0
  60. rem/cli/commands/README.md +299 -0
  61. rem/cli/commands/__init__.py +3 -0
  62. rem/cli/commands/ask.py +549 -0
  63. rem/cli/commands/cluster.py +1808 -0
  64. rem/cli/commands/configure.py +495 -0
  65. rem/cli/commands/db.py +828 -0
  66. rem/cli/commands/dreaming.py +324 -0
  67. rem/cli/commands/experiments.py +1698 -0
  68. rem/cli/commands/mcp.py +66 -0
  69. rem/cli/commands/process.py +388 -0
  70. rem/cli/commands/query.py +109 -0
  71. rem/cli/commands/scaffold.py +47 -0
  72. rem/cli/commands/schema.py +230 -0
  73. rem/cli/commands/serve.py +106 -0
  74. rem/cli/commands/session.py +453 -0
  75. rem/cli/dreaming.py +363 -0
  76. rem/cli/main.py +123 -0
  77. rem/config.py +244 -0
  78. rem/mcp_server.py +41 -0
  79. rem/models/core/__init__.py +49 -0
  80. rem/models/core/core_model.py +70 -0
  81. rem/models/core/engram.py +333 -0
  82. rem/models/core/experiment.py +672 -0
  83. rem/models/core/inline_edge.py +132 -0
  84. rem/models/core/rem_query.py +246 -0
  85. rem/models/entities/__init__.py +68 -0
  86. rem/models/entities/domain_resource.py +38 -0
  87. rem/models/entities/feedback.py +123 -0
  88. rem/models/entities/file.py +57 -0
  89. rem/models/entities/image_resource.py +88 -0
  90. rem/models/entities/message.py +64 -0
  91. rem/models/entities/moment.py +123 -0
  92. rem/models/entities/ontology.py +181 -0
  93. rem/models/entities/ontology_config.py +131 -0
  94. rem/models/entities/resource.py +95 -0
  95. rem/models/entities/schema.py +87 -0
  96. rem/models/entities/session.py +84 -0
  97. rem/models/entities/shared_session.py +180 -0
  98. rem/models/entities/subscriber.py +175 -0
  99. rem/models/entities/user.py +93 -0
  100. rem/py.typed +0 -0
  101. rem/registry.py +373 -0
  102. rem/schemas/README.md +507 -0
  103. rem/schemas/__init__.py +6 -0
  104. rem/schemas/agents/README.md +92 -0
  105. rem/schemas/agents/core/agent-builder.yaml +235 -0
  106. rem/schemas/agents/core/moment-builder.yaml +178 -0
  107. rem/schemas/agents/core/rem-query-agent.yaml +226 -0
  108. rem/schemas/agents/core/resource-affinity-assessor.yaml +99 -0
  109. rem/schemas/agents/core/simple-assistant.yaml +19 -0
  110. rem/schemas/agents/core/user-profile-builder.yaml +163 -0
  111. rem/schemas/agents/examples/contract-analyzer.yaml +317 -0
  112. rem/schemas/agents/examples/contract-extractor.yaml +134 -0
  113. rem/schemas/agents/examples/cv-parser.yaml +263 -0
  114. rem/schemas/agents/examples/hello-world.yaml +37 -0
  115. rem/schemas/agents/examples/query.yaml +54 -0
  116. rem/schemas/agents/examples/simple.yaml +21 -0
  117. rem/schemas/agents/examples/test.yaml +29 -0
  118. rem/schemas/agents/rem.yaml +132 -0
  119. rem/schemas/evaluators/hello-world/default.yaml +77 -0
  120. rem/schemas/evaluators/rem/faithfulness.yaml +219 -0
  121. rem/schemas/evaluators/rem/lookup-correctness.yaml +182 -0
  122. rem/schemas/evaluators/rem/retrieval-precision.yaml +199 -0
  123. rem/schemas/evaluators/rem/retrieval-recall.yaml +211 -0
  124. rem/schemas/evaluators/rem/search-correctness.yaml +192 -0
  125. rem/services/__init__.py +18 -0
  126. rem/services/audio/INTEGRATION.md +308 -0
  127. rem/services/audio/README.md +376 -0
  128. rem/services/audio/__init__.py +15 -0
  129. rem/services/audio/chunker.py +354 -0
  130. rem/services/audio/transcriber.py +259 -0
  131. rem/services/content/README.md +1269 -0
  132. rem/services/content/__init__.py +5 -0
  133. rem/services/content/providers.py +760 -0
  134. rem/services/content/service.py +762 -0
  135. rem/services/dreaming/README.md +230 -0
  136. rem/services/dreaming/__init__.py +53 -0
  137. rem/services/dreaming/affinity_service.py +322 -0
  138. rem/services/dreaming/moment_service.py +251 -0
  139. rem/services/dreaming/ontology_service.py +54 -0
  140. rem/services/dreaming/user_model_service.py +297 -0
  141. rem/services/dreaming/utils.py +39 -0
  142. rem/services/email/__init__.py +10 -0
  143. rem/services/email/service.py +522 -0
  144. rem/services/email/templates.py +360 -0
  145. rem/services/embeddings/__init__.py +11 -0
  146. rem/services/embeddings/api.py +127 -0
  147. rem/services/embeddings/worker.py +435 -0
  148. rem/services/fs/README.md +662 -0
  149. rem/services/fs/__init__.py +62 -0
  150. rem/services/fs/examples.py +206 -0
  151. rem/services/fs/examples_paths.py +204 -0
  152. rem/services/fs/git_provider.py +935 -0
  153. rem/services/fs/local_provider.py +760 -0
  154. rem/services/fs/parsing-hooks-examples.md +172 -0
  155. rem/services/fs/paths.py +276 -0
  156. rem/services/fs/provider.py +460 -0
  157. rem/services/fs/s3_provider.py +1042 -0
  158. rem/services/fs/service.py +186 -0
  159. rem/services/git/README.md +1075 -0
  160. rem/services/git/__init__.py +17 -0
  161. rem/services/git/service.py +469 -0
  162. rem/services/phoenix/EXPERIMENT_DESIGN.md +1146 -0
  163. rem/services/phoenix/README.md +453 -0
  164. rem/services/phoenix/__init__.py +46 -0
  165. rem/services/phoenix/client.py +960 -0
  166. rem/services/phoenix/config.py +88 -0
  167. rem/services/phoenix/prompt_labels.py +477 -0
  168. rem/services/postgres/README.md +757 -0
  169. rem/services/postgres/__init__.py +49 -0
  170. rem/services/postgres/diff_service.py +599 -0
  171. rem/services/postgres/migration_service.py +427 -0
  172. rem/services/postgres/programmable_diff_service.py +635 -0
  173. rem/services/postgres/pydantic_to_sqlalchemy.py +562 -0
  174. rem/services/postgres/register_type.py +353 -0
  175. rem/services/postgres/repository.py +481 -0
  176. rem/services/postgres/schema_generator.py +661 -0
  177. rem/services/postgres/service.py +802 -0
  178. rem/services/postgres/sql_builder.py +355 -0
  179. rem/services/rate_limit.py +113 -0
  180. rem/services/rem/README.md +318 -0
  181. rem/services/rem/__init__.py +23 -0
  182. rem/services/rem/exceptions.py +71 -0
  183. rem/services/rem/executor.py +293 -0
  184. rem/services/rem/parser.py +180 -0
  185. rem/services/rem/queries.py +196 -0
  186. rem/services/rem/query.py +371 -0
  187. rem/services/rem/service.py +608 -0
  188. rem/services/session/README.md +374 -0
  189. rem/services/session/__init__.py +13 -0
  190. rem/services/session/compression.py +488 -0
  191. rem/services/session/pydantic_messages.py +310 -0
  192. rem/services/session/reload.py +85 -0
  193. rem/services/user_service.py +130 -0
  194. rem/settings.py +1877 -0
  195. rem/sql/background_indexes.sql +52 -0
  196. rem/sql/migrations/001_install.sql +983 -0
  197. rem/sql/migrations/002_install_models.sql +3157 -0
  198. rem/sql/migrations/003_optional_extensions.sql +326 -0
  199. rem/sql/migrations/004_cache_system.sql +282 -0
  200. rem/sql/migrations/005_schema_update.sql +145 -0
  201. rem/sql/migrations/migrate_session_id_to_uuid.sql +45 -0
  202. rem/utils/AGENTIC_CHUNKING.md +597 -0
  203. rem/utils/README.md +628 -0
  204. rem/utils/__init__.py +61 -0
  205. rem/utils/agentic_chunking.py +622 -0
  206. rem/utils/batch_ops.py +343 -0
  207. rem/utils/chunking.py +108 -0
  208. rem/utils/clip_embeddings.py +276 -0
  209. rem/utils/constants.py +97 -0
  210. rem/utils/date_utils.py +228 -0
  211. rem/utils/dict_utils.py +98 -0
  212. rem/utils/embeddings.py +436 -0
  213. rem/utils/examples/embeddings_example.py +305 -0
  214. rem/utils/examples/sql_types_example.py +202 -0
  215. rem/utils/files.py +323 -0
  216. rem/utils/markdown.py +16 -0
  217. rem/utils/mime_types.py +158 -0
  218. rem/utils/model_helpers.py +492 -0
  219. rem/utils/schema_loader.py +649 -0
  220. rem/utils/sql_paths.py +146 -0
  221. rem/utils/sql_types.py +350 -0
  222. rem/utils/user_id.py +81 -0
  223. rem/utils/vision.py +325 -0
  224. rem/workers/README.md +506 -0
  225. rem/workers/__init__.py +7 -0
  226. rem/workers/db_listener.py +579 -0
  227. rem/workers/db_maintainer.py +74 -0
  228. rem/workers/dreaming.py +502 -0
  229. rem/workers/engram_processor.py +312 -0
  230. rem/workers/sqs_file_processor.py +193 -0
  231. rem/workers/unlogged_maintainer.py +463 -0
  232. remdb-0.3.242.dist-info/METADATA +1632 -0
  233. remdb-0.3.242.dist-info/RECORD +235 -0
  234. remdb-0.3.242.dist-info/WHEEL +4 -0
  235. remdb-0.3.242.dist-info/entry_points.txt +2 -0
@@ -0,0 +1,960 @@
1
+ """Phoenix client for REM evaluation workflows.
2
+
3
+ This client provides a lean interface to Arize Phoenix for:
4
+ - Dataset management (create golden sets, add examples)
5
+ - Experiment execution (run agents, run evaluators)
6
+ - Trace retrieval (query agent execution history)
7
+ - Label management (organize evaluations by type/difficulty)
8
+
9
+ Two-Phase Evaluation Pattern:
10
+ ==============================
11
+
12
+ Phase 1 - Golden Set Creation (SME-driven):
13
+ 1. SMEs create datasets with (input, reference) pairs
14
+ 2. Store in Phoenix with metadata labels
15
+ 3. No agent execution required
16
+
17
+ Phase 2 - Automated Evaluation (Agent-driven):
18
+ 1. Run agents on golden set → agent outputs
19
+ 2. Run evaluators on (input, agent_output, reference) → scores
20
+ 3. Track in Phoenix for analysis
21
+
22
+ Example Workflow:
23
+ -----------------
24
+
25
+ # Phase 1: SME creates golden set
26
+ client = PhoenixClient()
27
+ dataset = client.create_dataset_from_data(
28
+ name="rem-lookup-golden",
29
+ inputs=[{"query": "LOOKUP person:sarah-chen"}],
30
+ outputs=[{"label": "sarah-chen", "type": "person", ...}],
31
+ metadata=[{"difficulty": "easy", "query_type": "LOOKUP"}]
32
+ )
33
+
34
+ # Phase 2a: Run agents to produce outputs
35
+ experiment = client.run_experiment(
36
+ dataset=dataset,
37
+ task=run_agent_task, # Calls ask_rem agent
38
+ experiment_name="rem-v1-baseline"
39
+ )
40
+
41
+ # Phase 2b: Run evaluators on results
42
+ evaluator_exp = client.run_experiment(
43
+ dataset=experiment_results, # Uses agent outputs
44
+ task=None, # No task, just evaluate existing outputs
45
+ evaluators=[correctness_evaluator, completeness_evaluator],
46
+ experiment_name="rem-v1-evaluation"
47
+ )
48
+ """
49
+
50
+ from __future__ import annotations
51
+
52
+ from datetime import datetime
53
+ from pathlib import Path
54
+ from typing import Any, Callable, TYPE_CHECKING, cast
55
+
56
+ import polars as pl
57
+ from loguru import logger
58
+
59
+ from .config import PhoenixConfig
60
+
61
+ if TYPE_CHECKING:
62
+ from phoenix.client import Client
63
+ from phoenix.client.resources.datasets import Dataset
64
+ from phoenix.client.resources.experiments.types import RanExperiment
65
+
66
+
67
def dataframe_to_phoenix_dataset(
    client: "PhoenixClient",
    df: pl.DataFrame,
    dataset_name: str,
    input_keys: list[str] | None = None,
    output_keys: list[str] | None = None,
    metadata_keys: list[str] | None = None,
    description: str | None = None,
) -> "Dataset":
    """Convert a Polars DataFrame to a Phoenix Dataset.

    Each DataFrame row becomes one dataset example, split into three dicts:
      1. input    - the columns named in ``input_keys``
      2. output   - the columns named in ``output_keys`` (ground truth)
      3. metadata - the columns named in ``metadata_keys``

    When a key list is ``None`` it is derived from the column names:
      - input_keys: columns whose lower-cased name contains 'input', 'query',
        'question', or 'prompt'; if none match, the first column.
      - output_keys: columns whose lower-cased name contains 'output',
        'expected', 'answer', 'response', or 'reference'; if none match, the
        second column, unless that column is already an input, in which case
        the first column not used as an input is taken instead.
      - metadata_keys: every column not used as an input or output.

    Args:
        client: PhoenixClient instance used to create the dataset
        df: Polars DataFrame with experiment data
        dataset_name: Name for the created Phoenix dataset
        input_keys: Optional list of column names for inputs
        output_keys: Optional list of column names for outputs (ground truth)
        metadata_keys: Optional list of column names for metadata
        description: Optional dataset description

    Returns:
        Whatever ``client.create_dataset_from_data`` returns (a Phoenix Dataset)

    Example:
        >>> df = pl.read_csv("golden_set.csv")
        >>> dataset = dataframe_to_phoenix_dataset(
        ...     client=phoenix_client,
        ...     df=df,
        ...     dataset_name="my-golden-set",
        ...     input_keys=["query"],
        ...     output_keys=["expected_output"],
        ...     metadata_keys=["difficulty"]
        ... )
    """
    columns = df.columns

    # Smart defaults for column detection
    if input_keys is None:
        input_markers = ("input", "query", "question", "prompt")
        input_keys = [
            c for c in columns if any(marker in c.lower() for marker in input_markers)
        ]
        if not input_keys:
            # Fallback: first column
            input_keys = [columns[0]] if columns else []

    if output_keys is None:
        output_markers = ("output", "expected", "answer", "response", "reference")
        output_keys = [
            c for c in columns if any(marker in c.lower() for marker in output_markers)
        ]
        if not output_keys:
            # Fallback: prefer the second column (historical behaviour), but
            # never reuse a column that is already an input -- otherwise the
            # "ground truth" would simply echo the input.
            taken = set(input_keys)
            candidates = [c for c in columns[1:] if c not in taken] or [
                c for c in columns[:1] if c not in taken
            ]
            output_keys = candidates[:1]

    if metadata_keys is None:
        used_keys = set(input_keys) | set(output_keys)
        metadata_keys = [c for c in columns if c not in used_keys]

    logger.debug(
        f"DataFrame to Phoenix Dataset: inputs={input_keys}, "
        f"outputs={output_keys}, metadata={metadata_keys}"
    )

    # Convert to list of dicts (one per row)
    records = df.to_dicts()

    inputs = [{k: row.get(k) for k in input_keys} for row in records]
    outputs = [{k: row.get(k) for k in output_keys} for row in records]
    # An empty metadata key list is passed on as None rather than as a list of
    # empty dicts; create_dataset_from_data fills in the blanks itself.
    metadata = (
        [{k: row.get(k) for k in metadata_keys} for row in records]
        if metadata_keys
        else None
    )

    # Create Phoenix dataset
    return client.create_dataset_from_data(
        name=dataset_name,
        inputs=inputs,
        outputs=outputs,
        metadata=metadata,
        description=description,
    )
154
+
155
+
156
+ class PhoenixClient:
157
+ """High-level Phoenix client for REM evaluation workflows.
158
+
159
+ Wraps the official Phoenix client with REM-specific methods for:
160
+ - Creating and managing evaluation datasets
161
+ - Running agent and evaluator experiments
162
+ - Querying trace data for analysis
163
+ - Managing dataset labels
164
+
165
+ Attributes:
166
+ config: Phoenix connection configuration
167
+ _client: Underlying Phoenix Client instance
168
+ """
169
+
170
def __init__(self, config: PhoenixConfig | None = None):
    """Set up the wrapper and build the underlying Phoenix client.

    Args:
        config: Phoenix configuration to use. When omitted, one is obtained
            from ``PhoenixConfig.from_settings()``.
    """
    # Only an explicit None triggers the settings lookup.
    self.config = PhoenixConfig.from_settings() if config is None else config
    self._client = self._create_client()

    logger.info(f"Phoenix client initialized (endpoint: {self.config.base_url})")
183
+
184
def _create_client(self) -> "Client":
    """Build the official Phoenix ``Client`` from ``self.config``.

    Returns:
        A ``phoenix.client.Client`` constructed with the configured
        ``base_url`` and ``api_key``.
    """
    # Imported here at call time; at module level the name is only imported
    # under TYPE_CHECKING.
    from phoenix.client import Client

    settings = self.config
    return Client(base_url=settings.base_url, api_key=settings.api_key)
196
+
197
+ # =========================================================================
198
+ # DATASET MANAGEMENT
199
+ # =========================================================================
200
+
201
def list_datasets(self) -> list[dict[str, Any]]:
    """Return a summary of every dataset known to Phoenix.

    Returns:
        One dict per dataset with the keys:
          - id: dataset ID, stringified ("" when absent)
          - name: dataset name ("" when absent)
          - example_count: number of examples (0 when absent)
          - created_at: creation timestamp ("" when absent)

    Raises:
        Exception: Any error from the underlying client is logged and
            re-raised unchanged.
    """
    try:
        found = list(self._client.datasets.list())
        logger.debug(f"Found {len(found)} datasets")

        summaries: list[dict[str, Any]] = []
        for entry in found:
            summaries.append(
                {
                    "id": str(entry.get("id", "")),
                    "name": entry.get("name", ""),
                    "example_count": entry.get("example_count", 0),
                    "created_at": entry.get("created_at", ""),
                }
            )
        return summaries
    except Exception as exc:
        logger.error(f"Failed to list datasets: {exc}")
        raise
226
+
227
def get_dataset(self, name: str) -> "Dataset":
    """Fetch a dataset from Phoenix by name.

    Args:
        name: Dataset name

    Returns:
        The dataset returned by the underlying client

    Raises:
        ValueError: On any failure while fetching the dataset or logging its
            size. The original exception is attached as the cause.
    """
    try:
        found = self._client.datasets.get_dataset(dataset=name)
        logger.debug(f"Loaded dataset: {name} ({len(found)} examples)")
    except Exception as exc:
        # Every failure is reported to the caller as "not found".
        logger.error(f"Failed to get dataset '{name}': {exc}")
        raise ValueError(f"Dataset not found: {name}") from exc
    return found
246
+
247
def create_dataset_from_data(
    self,
    name: str,
    inputs: list[dict[str, Any]],
    outputs: list[dict[str, Any]],
    metadata: list[dict[str, Any]] | None = None,
    description: str | None = None,
) -> "Dataset":
    """Create a Phoenix dataset from parallel lists of example parts.

    The i-th example is assembled from ``inputs[i]``, ``outputs[i]`` and
    ``metadata[i]``:
      - input: what the agent receives (e.g. {"query": "LOOKUP person:sarah-chen"})
      - output: the expected result (ground truth / reference)
      - metadata: optional labels (difficulty, query_type, etc.)

    Args:
        name: Dataset name
        inputs: List of input dicts
        outputs: List of expected output dicts; must be as long as ``inputs``
        metadata: Optional list of metadata dicts; must be as long as
            ``inputs`` when given, otherwise an empty dict is used per example
        description: Optional dataset description

    Returns:
        The dataset returned by the underlying client

    Raises:
        ValueError: If the list lengths disagree.
        Exception: Any other failure is logged and re-raised unchanged.

    Example:
        >>> client = PhoenixClient()
        >>> dataset = client.create_dataset_from_data(
        ...     name="rem-lookup-golden",
        ...     inputs=[
        ...         {"query": "LOOKUP person:sarah-chen"},
        ...         {"query": "LOOKUP project:tidb-migration"}
        ...     ],
        ...     outputs=[
        ...         {"label": "sarah-chen", "type": "person", "properties": {...}},
        ...         {"label": "tidb-migration", "type": "project", "properties": {...}}
        ...     ],
        ...     metadata=[
        ...         {"difficulty": "easy", "query_type": "LOOKUP"},
        ...         {"difficulty": "medium", "query_type": "LOOKUP"}
        ...     ]
        ... )
    """
    try:
        n_inputs = len(inputs)
        n_outputs = len(outputs)

        # Inputs and outputs are paired by position, so they must line up.
        if n_inputs != n_outputs:
            raise ValueError(
                f"Input count ({n_inputs}) must match output count ({n_outputs})"
            )

        if metadata is None:
            # One independent empty dict per example.
            metadata = [dict() for _ in range(n_inputs)]
        else:
            n_metadata = len(metadata)
            if n_metadata != n_inputs:
                raise ValueError(
                    f"Metadata count ({n_metadata}) must match input count ({n_inputs})"
                )

        created = self._client.datasets.create_dataset(
            name=name,
            dataset_description=description,
            inputs=inputs,
            outputs=outputs,
            metadata=metadata,
        )

        logger.info(f"Created dataset '{name}' with {n_inputs} examples")
        return created

    except Exception as exc:
        logger.error(f"Failed to create dataset '{name}': {exc}")
        raise
321
+
322
def create_dataset_from_csv(
    self,
    name: str,
    csv_file_path: Path | str,
    input_keys: list[str],
    output_keys: list[str],
    metadata_keys: list[str] | None = None,
    description: str | None = None,
) -> "Dataset":
    """Create a dataset from a CSV file.

    Reads the CSV with Polars, splits each row into input / output /
    metadata dicts by column name, and delegates to
    ``create_dataset_from_data``.

    Args:
        name: Dataset name
        csv_file_path: Path to CSV file
        input_keys: Column names to use as inputs
        output_keys: Column names to use as outputs (reference/ground truth)
        metadata_keys: Optional column names for metadata; when omitted or
            empty, no metadata is extracted from the CSV
        description: Optional dataset description

    Returns:
        Created Dataset instance

    Raises:
        ValueError: If any requested column is absent from the CSV header.
        Exception: Any other failure (unreadable file, Phoenix error) is
            logged and re-raised unchanged.

    Example CSV structure:
        query,expected_label,expected_type,difficulty,query_type
        "LOOKUP person:sarah-chen",sarah-chen,person,easy,LOOKUP
        "SEARCH semantic AI engineer",sarah-chen,person,medium,SEARCH
    """
    try:
        # Load CSV with Polars
        df = pl.read_csv(csv_file_path)

        # Fail fast on unknown column names. Without this check a typo in a
        # key list would silently yield examples whose values are all None.
        available = set(df.columns)
        requested = [*input_keys, *output_keys, *(metadata_keys or [])]
        missing = [k for k in dict.fromkeys(requested) if k not in available]
        if missing:
            raise ValueError(
                f"CSV '{csv_file_path}' is missing requested column(s) {missing}; "
                f"available columns: {df.columns}"
            )

        # Convert to list of dicts (one per row)
        records = df.to_dicts()

        inputs = [{k: row[k] for k in input_keys} for row in records]
        outputs = [{k: row[k] for k in output_keys} for row in records]

        # Extract metadata only if specified
        metadata = None
        if metadata_keys:
            metadata = [{k: row[k] for k in metadata_keys} for row in records]

        return self.create_dataset_from_data(
            name=name,
            inputs=inputs,
            outputs=outputs,
            metadata=metadata,
            description=description,
        )

    except Exception as e:
        logger.error(f"Failed to create dataset from CSV '{csv_file_path}': {e}")
        raise
380
+
381
def add_examples_to_dataset(
    self,
    dataset: str,
    inputs: list[dict[str, Any]],
    outputs: list[dict[str, Any]],
    metadata: list[dict[str, Any]] | None = None,
) -> "Dataset":
    """Add examples to an existing dataset.

    Args:
        dataset: Dataset name
        inputs: List of input dicts
        outputs: List of output dicts; must be as long as ``inputs``
        metadata: Optional list of metadata dicts; must be as long as
            ``inputs`` when given, otherwise an empty dict is used per example

    Returns:
        The dataset returned by the underlying client

    Raises:
        ValueError: If the list lengths disagree.
        Exception: Any other failure is logged and re-raised unchanged.
    """
    try:
        if len(inputs) != len(outputs):
            raise ValueError("Input/output counts must match")

        if metadata is None:
            metadata = [{} for _ in inputs]
        elif len(metadata) != len(inputs):
            # Same validation as create_dataset_from_data: examples are paired
            # by position, so a short or long metadata list would mislabel them.
            raise ValueError(
                f"Metadata count ({len(metadata)}) must match input count ({len(inputs)})"
            )

        updated_dataset = self._client.datasets.add_examples_to_dataset(
            dataset,  # Positional argument instead of keyword
            inputs=inputs,
            outputs=outputs,
            metadata=metadata,
        )

        logger.info(f"Added {len(inputs)} examples to dataset '{dataset}'")
        return updated_dataset

    except Exception as e:
        logger.error(f"Failed to add examples to dataset '{dataset}': {e}")
        raise
419
+
420
+ # =========================================================================
421
+ # EXPERIMENT EXECUTION
422
+ # =========================================================================
423
+
424
def run_experiment(
    self,
    dataset: "Dataset" | str | pl.DataFrame,
    task: Callable[[Any], Any] | None = None,
    evaluators: list[Callable[[Any], Any]] | None = None,
    experiment_name: str | None = None,
    experiment_description: str | None = None,
    experiment_metadata: dict[str, Any] | None = None,
    experiment_config: Any | None = None,
    input_keys: list[str] | None = None,
    output_keys: list[str] | None = None,
    metadata_keys: list[str] | None = None,
) -> "RanExperiment":
    """Run an evaluation experiment.

    Three modes:
    1. ExperimentConfig mode: Provide experiment_config with all settings
    2. Agent run: Provide task function to execute agents on dataset
    3. Evaluator run: Provide evaluators to score existing outputs

    Dataset can be:
    - Phoenix Dataset instance
    - Dataset name (string) - will be loaded from Phoenix
    - Polars DataFrame - will be converted to Phoenix Dataset

    Args:
        dataset: Dataset instance, name, or Polars DataFrame. A falsy value
            (e.g. None or "") together with an experiment_config that has a
            non-git "ground_truth" dataset entry makes that entry's path the
            dataset.
        task: Optional task function to run on each example (agent execution)
        evaluators: Optional list of evaluator functions
        experiment_name: Optional experiment name
        experiment_description: Optional description
        experiment_metadata: Optional metadata dict. When given (and
            non-empty) it is used as-is; it is NOT merged with the metadata
            derived from experiment_config.
        experiment_config: Optional ExperimentConfig instance. It supplies
            defaults for name, description and metadata; explicitly passed
            arguments take precedence over it.
        input_keys: Column names for inputs (used only if dataset is a
            DataFrame; auto-detected when omitted)
        output_keys: Column names for outputs (used only if dataset is a
            DataFrame; auto-detected when omitted)
        metadata_keys: Optional column names for metadata

    Returns:
        RanExperiment with results

    Raises:
        ValueError: If no dataset could be resolved, or a named dataset
            cannot be loaded.
        Exception: Any other failure is logged and re-raised unchanged.

    Example - Agent Run (Phase 2a):
        >>> async def run_agent(example):
        ...     from rem.mcp.tools.rem import ask_rem
        ...     result = await ask_rem(example["input"]["query"])
        ...     return result
        >>> experiment = client.run_experiment(
        ...     dataset="rem-lookup-golden",
        ...     task=run_agent,
        ...     experiment_name="rem-v1-baseline"
        ... )

    Example - With Polars DataFrame:
        >>> df = pl.read_csv("golden_set.csv")
        >>> experiment = client.run_experiment(
        ...     dataset=df,
        ...     task=run_agent,
        ...     experiment_name="rem-v1-baseline",
        ...     input_keys=["query"],
        ...     output_keys=["expected_output"]
        ... )

    Example - Evaluator Run (Phase 2b):
        >>> experiment = client.run_experiment(
        ...     dataset=agent_results,
        ...     evaluators=[correctness_eval, completeness_eval],
        ...     experiment_name="rem-v1-evaluation"
        ... )
    """
    try:
        # Handle ExperimentConfig mode
        if experiment_config:
            experiment_name = experiment_name or experiment_config.name
            experiment_description = experiment_description or experiment_config.description

            # Metadata derived from the config; used only when the caller did
            # not pass experiment_metadata.
            config_metadata = {
                "agent_schema": experiment_config.agent_schema_ref.name,
                "agent_version": experiment_config.agent_schema_ref.version,
                "evaluator_schema": experiment_config.evaluator_schema_ref.name,
                "evaluator_version": experiment_config.evaluator_schema_ref.version,
                "config_status": experiment_config.status.value,
                "config_tags": experiment_config.tags,
            }
            config_metadata.update(experiment_config.metadata or {})
            experiment_metadata = experiment_metadata or config_metadata

            # A Polars DataFrame raises TypeError when its truth value is
            # taken, so rule that type out before the falsiness test.
            dataset_missing = not isinstance(dataset, pl.DataFrame) and not dataset

            # Use ground_truth dataset if dataset not provided
            if dataset_missing and "ground_truth" in experiment_config.datasets:
                dataset_ref = experiment_config.datasets["ground_truth"]
                # Load from Git or use provided path
                if dataset_ref.location.value == "git":
                    logger.warning(
                        f"Dataset location is 'git' but path-based loading not implemented. "
                        f"Pass dataset explicitly or use Phoenix dataset name."
                    )
                else:
                    dataset = dataset_ref.path

        # Convert Polars DataFrame to Phoenix Dataset
        if isinstance(dataset, pl.DataFrame):
            dataset_name_for_phoenix = f"{experiment_name or 'experiment'}-dataset-{datetime.now().strftime('%Y%m%d-%H%M%S')}"
            logger.info(f"Converting Polars DataFrame to Phoenix Dataset: {dataset_name_for_phoenix}")
            dataset = dataframe_to_phoenix_dataset(
                client=self,
                df=dataset,
                dataset_name=dataset_name_for_phoenix,
                input_keys=input_keys,
                output_keys=output_keys,
                metadata_keys=metadata_keys,
                description=f"Auto-created from DataFrame for experiment: {experiment_name}",
            )
            logger.info(f"✓ Created Phoenix Dataset: {dataset_name_for_phoenix}")

        # Load dataset if name provided
        if isinstance(dataset, str):
            dataset = self.get_dataset(dataset)

        # Nothing was passed and nothing could be resolved from the config:
        # fail with a clear message instead of a TypeError from len(None).
        if dataset is None:
            raise ValueError(
                "No dataset available: pass a Dataset, a dataset name, or a Polars DataFrame"
            )

        logger.info(
            f"Running experiment '{experiment_name or 'unnamed'}' "
            f"on dataset with {len(dataset)} examples"
        )

        # Run experiment
        experiment = self._client.experiments.run_experiment(
            dataset=dataset,
            task=task,  # type: ignore[arg-type]
            evaluators=evaluators or [],
            experiment_name=experiment_name,
            experiment_description=experiment_description,
            experiment_metadata=experiment_metadata,
        )

        logger.success(f"Experiment complete: {experiment_name or 'unnamed'}")
        if hasattr(experiment, "url"):
            logger.info(f"View results: {experiment.url}")  # type: ignore[attr-defined]

        # Update ExperimentConfig if provided
        if experiment_config:
            experiment_config.last_run_at = datetime.now()
            # NOTE(review): status is read above via `.status.value` (enum-like)
            # but is assigned a plain string here, and "running" is chosen
            # after the run has finished -- confirm the intended values/type.
            experiment_config.status = "running" if hasattr(experiment, "runs") else "completed"

        return experiment

    except Exception as e:
        logger.error(f"Failed to run experiment: {e}")
        raise
571
+
572
+ # =========================================================================
573
+ # TRACE RETRIEVAL
574
+ # =========================================================================
575
+
576
def get_traces(
    self,
    project_name: str | None = None,
    start_time: datetime | None = None,
    end_time: datetime | None = None,
    limit: int = 100,
    root_spans_only: bool = True,
    trace_id: str | None = None,
    span_id: str | None = None,
) -> pl.DataFrame:
    """Fetch spans from Phoenix and return them as a Polars DataFrame.

    Only filters with a truthy value are forwarded to the underlying
    ``query_spans`` call; ``limit`` is always forwarded.

    Args:
        project_name: Filter by project name
        start_time: Lower time bound, forwarded as an ISO-8601 string
        end_time: Upper time bound, forwarded as an ISO-8601 string
        limit: Maximum number of traces to return
        root_spans_only: Only return root spans (default: True)
        trace_id: Filter by specific trace ID
        span_id: Filter by specific span ID

    Returns:
        Polars DataFrame converted from the result of ``query_spans``

    Raises:
        Exception: Any failure is logged and re-raised unchanged.

    Example:
        >>> traces = client.get_traces(
        ...     project_name="rem-agents",
        ...     start_time=datetime.now() - timedelta(days=7),
        ...     limit=50
        ... )
    """
    try:
        # Candidate filters in call order; falsy entries are dropped below.
        candidates: dict[str, Any] = {
            "project_name": project_name,
            "start_time": start_time.isoformat() if start_time else None,
            "end_time": end_time.isoformat() if end_time else None,
            "root_spans_only": True if root_spans_only else None,
            "trace_id": trace_id,
            "span_id": span_id,
        }
        query_params: dict[str, Any] = {
            key: value for key, value in candidates.items() if value
        }

        # NOTE(review): the attr-defined ignore suggests the typed Client does
        # not declare query_spans -- confirm it exists on the installed version.
        spans_pandas = self._client.query_spans(limit=limit, **query_params)  # type: ignore[attr-defined]

        # The result is treated as a pandas DataFrame and converted to Polars.
        traces_df = pl.from_pandas(spans_pandas)

        logger.debug(f"Retrieved {len(traces_df)} traces")
        return traces_df

    except Exception as exc:
        logger.error(f"Failed to query traces: {exc}")
        raise
635
+
636
def create_dataset_from_traces(
    self,
    name: str,
    project_name: str,
    start_time: datetime | None = None,
    end_time: datetime | None = None,
    limit: int = 100,
    description: str | None = None,
) -> "Dataset":
    """Build a dataset out of root spans queried from a project.

    Queries root spans through ``get_traces``, turns each row into an
    input / output / metadata triple and hands the three parallel lists
    to ``create_dataset_from_data``.

    Args:
        name: Name of the dataset to create.
        project_name: Phoenix project whose traces are queried.
        start_time: Optional lower bound of the trace window.
        end_time: Optional upper bound of the trace window.
        limit: Maximum number of traces to pull.
        description: Optional description stored with the dataset.

    Returns:
        The dataset returned by ``create_dataset_from_data``.

    Raises:
        ValueError: If the query returns no rows.
        Exception: Any other failure is logged and re-raised unchanged.

    Example:
        >>> dataset = client.create_dataset_from_traces(
        ...     name="rem-production-regression",
        ...     project_name="rem-production",
        ...     start_time=datetime.now() - timedelta(days=30),
        ...     limit=500
        ... )
    """

    def _as_example(value: Any, key: str) -> Any:
        # Missing/empty values become an empty dict, bare strings are
        # wrapped under `key`, anything else is passed through untouched.
        if not value:
            return {}
        if isinstance(value, str):
            return {key: value}
        return value

    try:
        spans = self.get_traces(
            project_name=project_name,
            start_time=start_time,
            end_time=end_time,
            limit=limit,
            root_spans_only=True,
        )

        if len(spans) == 0:
            raise ValueError("No traces found matching criteria")

        rows = spans.to_dicts()

        # Three parallel lists, one entry per span row.
        inputs = [_as_example(row.get("attributes.input"), "input") for row in rows]
        outputs = [_as_example(row.get("attributes.output"), "output") for row in rows]
        metadata = [
            {
                "span_id": str(row.get("context.span_id", "")),
                "trace_id": str(row.get("context.trace_id", "")),
                "start_time": str(row.get("start_time", "")),
                "latency_ms": row.get("latency_ms", 0),
            }
            for row in rows
        ]

        created = self.create_dataset_from_data(
            name=name,
            inputs=inputs,
            outputs=outputs,
            metadata=metadata,
            description=description,
        )

        logger.info(f"Created dataset '{name}' from {len(inputs)} traces")
        return created

    except Exception as exc:
        logger.error(f"Failed to create dataset from traces: {exc}")
        raise
733
+
734
def get_experiment(self, experiment_id: str) -> dict[str, Any]:
    """Fetch an experiment and flatten it into a plain dictionary.

    Args:
        experiment_id: Experiment ID (from Phoenix UI URL).

    Returns:
        Dictionary with the keys:
            - id: Experiment ID
            - name: Experiment name
            - dataset_id: Associated dataset ID
            - experiment_metadata: Metadata dict (``{}`` when falsy)
            - task_runs: One dict per run with ``input``, ``output``,
              ``expected`` and ``dataset_example_id`` (``None`` when the
              run has no such attribute)

    Raises:
        Exception: Any failure is logged and re-raised unchanged.

    Example:
        >>> exp_data = client.get_experiment("RXhwZXJpbWVudDoxMjM=")
        >>> print(f"Experiment: {exp_data['name']}")
        >>> print(f"Task runs: {len(exp_data['task_runs'])}")
    """
    try:
        fetched = self._client.experiments.get_experiment(experiment_id)  # type: ignore[misc]

        # One summary dict per run, in the order the experiment yields them.
        run_summaries = [
            {
                "input": run.input,
                "output": run.output,
                "expected": run.expected,
                "dataset_example_id": getattr(run, "dataset_example_id", None),
            }
            for run in fetched.runs  # type: ignore[attr-defined]
        ]

        summary = {
            "id": fetched.id,  # type: ignore[attr-defined]
            "name": fetched.name,  # type: ignore[attr-defined]
            "dataset_id": fetched.dataset_id,  # type: ignore[attr-defined]
            "experiment_metadata": fetched.metadata or {},  # type: ignore[attr-defined]
            "task_runs": run_summaries,
        }

        logger.info(f"Retrieved experiment '{fetched.name}' with {len(run_summaries)} task runs")  # type: ignore[attr-defined]
        return summary

    except Exception as exc:
        logger.error(f"Failed to get experiment '{experiment_id}': {exc}")
        raise
782
+
783
+ # =========================================================================
784
+ # FEEDBACK/ANNOTATION
785
+ # =========================================================================
786
+
787
def add_span_feedback(
    self,
    span_id: str,
    annotation_name: str,
    annotator_kind: str = "HUMAN",
    label: str | None = None,
    score: float | None = None,
    explanation: str | None = None,
    metadata: dict[str, Any] | None = None,
    trace_id: str | None = None,
) -> str | None:
    """Add a feedback annotation to a span via the Phoenix REST API.

    Posts directly to ``/v1/span_annotations`` instead of going through
    the Phoenix Python client (whose API changes frequently). This is a
    best-effort call: failures are logged and reported as ``None`` rather
    than raised.

    Args:
        span_id: Span ID to annotate (hex string).
        annotation_name: Name of the annotation (e.g., "correctness",
            "user_feedback").
        annotator_kind: Type of annotator ("HUMAN", "LLM", "CODE").
        label: Optional label (e.g., "correct", "incorrect", "helpful").
        score: Optional numeric score (0.0-1.0).
        explanation: Optional explanation text.
        metadata: Optional additional metadata dict; sent as ``{}`` when
            omitted.
        trace_id: Optional trace ID; added to the payload only when set.

    Returns:
        ``span_id`` (used as the annotation reference) when the request
        succeeds, ``None`` when the request fails for any reason.
    """
    import httpx

    try:
        annotation_data: dict[str, Any] = {
            "span_id": span_id,
            "name": annotation_name,
            "annotator_kind": annotator_kind,
            "result": {
                "label": label,
                "score": score,
                "explanation": explanation,
            },
            "metadata": metadata or {},
        }
        if trace_id:
            annotation_data["trace_id"] = trace_id

        # Strip any trailing slash so a base URL such as
        # "http://host:6006/" does not yield "//v1/span_annotations".
        base_url = str(self.config.base_url).rstrip("/")
        annotations_endpoint = f"{base_url}/v1/span_annotations"

        headers: dict[str, str] = {}
        if self.config.api_key:
            headers["Authorization"] = f"Bearer {self.config.api_key}"

        # Short timeout: feedback sync must not stall the caller.
        with httpx.Client(timeout=5.0) as client:
            response = client.post(
                annotations_endpoint,
                json={"data": [annotation_data]},
                headers=headers,
            )
            response.raise_for_status()

        logger.info(f"Added {annotator_kind} feedback to span {span_id}")
        return span_id  # Return span_id as annotation reference

    except httpx.HTTPStatusError as e:
        # HTTPStatusError always carries the response that triggered it,
        # so status code and body can be logged without a guard.
        logger.error(
            f"Failed to add span feedback (HTTP {e.response.status_code}): "
            f"{e.response.text}"
        )
        return None
    except Exception as e:
        logger.error(f"Failed to add span feedback: {e}")
        return None
862
+
863
+ def sync_user_feedback(
864
+ self,
865
+ span_id: str,
866
+ rating: int | None = None,
867
+ categories: list[str] | None = None,
868
+ comment: str | None = None,
869
+ feedback_id: str | None = None,
870
+ trace_id: str | None = None,
871
+ ) -> str | None:
872
+ """Sync user feedback to Phoenix as a span annotation.
873
+
874
+ Convenience method for syncing Feedback entities to Phoenix.
875
+ Converts REM feedback format to Phoenix annotation format.
876
+
877
+ Args:
878
+ span_id: OTEL span ID to annotate
879
+ rating: User rating (-1, 1-5 scale)
880
+ categories: List of feedback categories
881
+ comment: Free-text comment
882
+ feedback_id: Optional REM feedback ID for reference
883
+ trace_id: Optional trace ID for the span
884
+
885
+ Returns:
886
+ Phoenix annotation ID if successful
887
+
888
+ Example:
889
+ >>> client.sync_user_feedback(
890
+ ... span_id="abc123",
891
+ ... rating=4,
892
+ ... categories=["helpful", "accurate"],
893
+ ... comment="Great response!"
894
+ ... )
895
+ """
896
+ # Convert rating to 0-1 score
897
+ # Rating scheme:
898
+ # -1 = thumbs down → score 0.0
899
+ # 1 = thumbs up → score 1.0
900
+ # 2-5 = star rating → normalized to 0-1 range
901
+ score = None
902
+ if rating is not None:
903
+ if rating == -1:
904
+ score = 0.0
905
+ elif rating == 1:
906
+ score = 1.0 # Thumbs up
907
+ elif 2 <= rating <= 5:
908
+ score = (rating - 1) / 4.0 # 2→0.25, 3→0.5, 4→0.75, 5→1.0
909
+
910
+ # Use primary category as label
911
+ label = categories[0] if categories else None
912
+
913
+ # Build explanation from comment and additional categories
914
+ explanation = comment
915
+ if categories and len(categories) > 1:
916
+ cats_str = ", ".join(categories[1:])
917
+ if explanation:
918
+ explanation = f"{explanation} [Categories: {cats_str}]"
919
+ else:
920
+ explanation = f"Categories: {cats_str}"
921
+
922
+ # Build metadata
923
+ metadata: dict[str, Any] = {
924
+ "rating": rating,
925
+ "categories": categories or [],
926
+ }
927
+ if feedback_id:
928
+ metadata["rem_feedback_id"] = feedback_id
929
+
930
+ return self.add_span_feedback(
931
+ span_id=span_id,
932
+ annotation_name="user_feedback",
933
+ annotator_kind="HUMAN",
934
+ label=label,
935
+ score=score,
936
+ explanation=explanation,
937
+ metadata=metadata,
938
+ trace_id=trace_id,
939
+ )
940
+
941
def get_span_annotations(
    self,
    span_id: str,
    annotation_name: str | None = None,
) -> list[dict[str, Any]]:
    """Return the annotations attached to a span (currently a stub).

    Both arguments are accepted but ignored for now: the method only
    logs a warning and returns an empty list.

    Args:
        span_id: Span ID to query.
        annotation_name: Optional filter by annotation name.

    Returns:
        Always an empty list until this is implemented.

    TODO: Implement once Phoenix client exposes this method
    """
    # TODO: Phoenix client doesn't expose annotation query yet, so this
    # placeholder keeps the interface stable for callers.
    logger.warning("get_span_annotations not yet implemented in Phoenix client")
    no_annotations: list[dict[str, Any]] = []
    return no_annotations