remdb 0.3.242__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of remdb might be problematic. Click here for more details.

Files changed (235) hide show
  1. rem/__init__.py +129 -0
  2. rem/agentic/README.md +760 -0
  3. rem/agentic/__init__.py +54 -0
  4. rem/agentic/agents/README.md +155 -0
  5. rem/agentic/agents/__init__.py +38 -0
  6. rem/agentic/agents/agent_manager.py +311 -0
  7. rem/agentic/agents/sse_simulator.py +502 -0
  8. rem/agentic/context.py +425 -0
  9. rem/agentic/context_builder.py +360 -0
  10. rem/agentic/llm_provider_models.py +301 -0
  11. rem/agentic/mcp/__init__.py +0 -0
  12. rem/agentic/mcp/tool_wrapper.py +273 -0
  13. rem/agentic/otel/__init__.py +5 -0
  14. rem/agentic/otel/setup.py +240 -0
  15. rem/agentic/providers/phoenix.py +926 -0
  16. rem/agentic/providers/pydantic_ai.py +854 -0
  17. rem/agentic/query.py +117 -0
  18. rem/agentic/query_helper.py +89 -0
  19. rem/agentic/schema.py +737 -0
  20. rem/agentic/serialization.py +245 -0
  21. rem/agentic/tools/__init__.py +5 -0
  22. rem/agentic/tools/rem_tools.py +242 -0
  23. rem/api/README.md +657 -0
  24. rem/api/deps.py +253 -0
  25. rem/api/main.py +460 -0
  26. rem/api/mcp_router/prompts.py +182 -0
  27. rem/api/mcp_router/resources.py +820 -0
  28. rem/api/mcp_router/server.py +243 -0
  29. rem/api/mcp_router/tools.py +1605 -0
  30. rem/api/middleware/tracking.py +172 -0
  31. rem/api/routers/admin.py +520 -0
  32. rem/api/routers/auth.py +898 -0
  33. rem/api/routers/chat/__init__.py +5 -0
  34. rem/api/routers/chat/child_streaming.py +394 -0
  35. rem/api/routers/chat/completions.py +702 -0
  36. rem/api/routers/chat/json_utils.py +76 -0
  37. rem/api/routers/chat/models.py +202 -0
  38. rem/api/routers/chat/otel_utils.py +33 -0
  39. rem/api/routers/chat/sse_events.py +546 -0
  40. rem/api/routers/chat/streaming.py +950 -0
  41. rem/api/routers/chat/streaming_utils.py +327 -0
  42. rem/api/routers/common.py +18 -0
  43. rem/api/routers/dev.py +87 -0
  44. rem/api/routers/feedback.py +276 -0
  45. rem/api/routers/messages.py +620 -0
  46. rem/api/routers/models.py +86 -0
  47. rem/api/routers/query.py +362 -0
  48. rem/api/routers/shared_sessions.py +422 -0
  49. rem/auth/README.md +258 -0
  50. rem/auth/__init__.py +36 -0
  51. rem/auth/jwt.py +367 -0
  52. rem/auth/middleware.py +318 -0
  53. rem/auth/providers/__init__.py +16 -0
  54. rem/auth/providers/base.py +376 -0
  55. rem/auth/providers/email.py +215 -0
  56. rem/auth/providers/google.py +163 -0
  57. rem/auth/providers/microsoft.py +237 -0
  58. rem/cli/README.md +517 -0
  59. rem/cli/__init__.py +8 -0
  60. rem/cli/commands/README.md +299 -0
  61. rem/cli/commands/__init__.py +3 -0
  62. rem/cli/commands/ask.py +549 -0
  63. rem/cli/commands/cluster.py +1808 -0
  64. rem/cli/commands/configure.py +495 -0
  65. rem/cli/commands/db.py +828 -0
  66. rem/cli/commands/dreaming.py +324 -0
  67. rem/cli/commands/experiments.py +1698 -0
  68. rem/cli/commands/mcp.py +66 -0
  69. rem/cli/commands/process.py +388 -0
  70. rem/cli/commands/query.py +109 -0
  71. rem/cli/commands/scaffold.py +47 -0
  72. rem/cli/commands/schema.py +230 -0
  73. rem/cli/commands/serve.py +106 -0
  74. rem/cli/commands/session.py +453 -0
  75. rem/cli/dreaming.py +363 -0
  76. rem/cli/main.py +123 -0
  77. rem/config.py +244 -0
  78. rem/mcp_server.py +41 -0
  79. rem/models/core/__init__.py +49 -0
  80. rem/models/core/core_model.py +70 -0
  81. rem/models/core/engram.py +333 -0
  82. rem/models/core/experiment.py +672 -0
  83. rem/models/core/inline_edge.py +132 -0
  84. rem/models/core/rem_query.py +246 -0
  85. rem/models/entities/__init__.py +68 -0
  86. rem/models/entities/domain_resource.py +38 -0
  87. rem/models/entities/feedback.py +123 -0
  88. rem/models/entities/file.py +57 -0
  89. rem/models/entities/image_resource.py +88 -0
  90. rem/models/entities/message.py +64 -0
  91. rem/models/entities/moment.py +123 -0
  92. rem/models/entities/ontology.py +181 -0
  93. rem/models/entities/ontology_config.py +131 -0
  94. rem/models/entities/resource.py +95 -0
  95. rem/models/entities/schema.py +87 -0
  96. rem/models/entities/session.py +84 -0
  97. rem/models/entities/shared_session.py +180 -0
  98. rem/models/entities/subscriber.py +175 -0
  99. rem/models/entities/user.py +93 -0
  100. rem/py.typed +0 -0
  101. rem/registry.py +373 -0
  102. rem/schemas/README.md +507 -0
  103. rem/schemas/__init__.py +6 -0
  104. rem/schemas/agents/README.md +92 -0
  105. rem/schemas/agents/core/agent-builder.yaml +235 -0
  106. rem/schemas/agents/core/moment-builder.yaml +178 -0
  107. rem/schemas/agents/core/rem-query-agent.yaml +226 -0
  108. rem/schemas/agents/core/resource-affinity-assessor.yaml +99 -0
  109. rem/schemas/agents/core/simple-assistant.yaml +19 -0
  110. rem/schemas/agents/core/user-profile-builder.yaml +163 -0
  111. rem/schemas/agents/examples/contract-analyzer.yaml +317 -0
  112. rem/schemas/agents/examples/contract-extractor.yaml +134 -0
  113. rem/schemas/agents/examples/cv-parser.yaml +263 -0
  114. rem/schemas/agents/examples/hello-world.yaml +37 -0
  115. rem/schemas/agents/examples/query.yaml +54 -0
  116. rem/schemas/agents/examples/simple.yaml +21 -0
  117. rem/schemas/agents/examples/test.yaml +29 -0
  118. rem/schemas/agents/rem.yaml +132 -0
  119. rem/schemas/evaluators/hello-world/default.yaml +77 -0
  120. rem/schemas/evaluators/rem/faithfulness.yaml +219 -0
  121. rem/schemas/evaluators/rem/lookup-correctness.yaml +182 -0
  122. rem/schemas/evaluators/rem/retrieval-precision.yaml +199 -0
  123. rem/schemas/evaluators/rem/retrieval-recall.yaml +211 -0
  124. rem/schemas/evaluators/rem/search-correctness.yaml +192 -0
  125. rem/services/__init__.py +18 -0
  126. rem/services/audio/INTEGRATION.md +308 -0
  127. rem/services/audio/README.md +376 -0
  128. rem/services/audio/__init__.py +15 -0
  129. rem/services/audio/chunker.py +354 -0
  130. rem/services/audio/transcriber.py +259 -0
  131. rem/services/content/README.md +1269 -0
  132. rem/services/content/__init__.py +5 -0
  133. rem/services/content/providers.py +760 -0
  134. rem/services/content/service.py +762 -0
  135. rem/services/dreaming/README.md +230 -0
  136. rem/services/dreaming/__init__.py +53 -0
  137. rem/services/dreaming/affinity_service.py +322 -0
  138. rem/services/dreaming/moment_service.py +251 -0
  139. rem/services/dreaming/ontology_service.py +54 -0
  140. rem/services/dreaming/user_model_service.py +297 -0
  141. rem/services/dreaming/utils.py +39 -0
  142. rem/services/email/__init__.py +10 -0
  143. rem/services/email/service.py +522 -0
  144. rem/services/email/templates.py +360 -0
  145. rem/services/embeddings/__init__.py +11 -0
  146. rem/services/embeddings/api.py +127 -0
  147. rem/services/embeddings/worker.py +435 -0
  148. rem/services/fs/README.md +662 -0
  149. rem/services/fs/__init__.py +62 -0
  150. rem/services/fs/examples.py +206 -0
  151. rem/services/fs/examples_paths.py +204 -0
  152. rem/services/fs/git_provider.py +935 -0
  153. rem/services/fs/local_provider.py +760 -0
  154. rem/services/fs/parsing-hooks-examples.md +172 -0
  155. rem/services/fs/paths.py +276 -0
  156. rem/services/fs/provider.py +460 -0
  157. rem/services/fs/s3_provider.py +1042 -0
  158. rem/services/fs/service.py +186 -0
  159. rem/services/git/README.md +1075 -0
  160. rem/services/git/__init__.py +17 -0
  161. rem/services/git/service.py +469 -0
  162. rem/services/phoenix/EXPERIMENT_DESIGN.md +1146 -0
  163. rem/services/phoenix/README.md +453 -0
  164. rem/services/phoenix/__init__.py +46 -0
  165. rem/services/phoenix/client.py +960 -0
  166. rem/services/phoenix/config.py +88 -0
  167. rem/services/phoenix/prompt_labels.py +477 -0
  168. rem/services/postgres/README.md +757 -0
  169. rem/services/postgres/__init__.py +49 -0
  170. rem/services/postgres/diff_service.py +599 -0
  171. rem/services/postgres/migration_service.py +427 -0
  172. rem/services/postgres/programmable_diff_service.py +635 -0
  173. rem/services/postgres/pydantic_to_sqlalchemy.py +562 -0
  174. rem/services/postgres/register_type.py +353 -0
  175. rem/services/postgres/repository.py +481 -0
  176. rem/services/postgres/schema_generator.py +661 -0
  177. rem/services/postgres/service.py +802 -0
  178. rem/services/postgres/sql_builder.py +355 -0
  179. rem/services/rate_limit.py +113 -0
  180. rem/services/rem/README.md +318 -0
  181. rem/services/rem/__init__.py +23 -0
  182. rem/services/rem/exceptions.py +71 -0
  183. rem/services/rem/executor.py +293 -0
  184. rem/services/rem/parser.py +180 -0
  185. rem/services/rem/queries.py +196 -0
  186. rem/services/rem/query.py +371 -0
  187. rem/services/rem/service.py +608 -0
  188. rem/services/session/README.md +374 -0
  189. rem/services/session/__init__.py +13 -0
  190. rem/services/session/compression.py +488 -0
  191. rem/services/session/pydantic_messages.py +310 -0
  192. rem/services/session/reload.py +85 -0
  193. rem/services/user_service.py +130 -0
  194. rem/settings.py +1877 -0
  195. rem/sql/background_indexes.sql +52 -0
  196. rem/sql/migrations/001_install.sql +983 -0
  197. rem/sql/migrations/002_install_models.sql +3157 -0
  198. rem/sql/migrations/003_optional_extensions.sql +326 -0
  199. rem/sql/migrations/004_cache_system.sql +282 -0
  200. rem/sql/migrations/005_schema_update.sql +145 -0
  201. rem/sql/migrations/migrate_session_id_to_uuid.sql +45 -0
  202. rem/utils/AGENTIC_CHUNKING.md +597 -0
  203. rem/utils/README.md +628 -0
  204. rem/utils/__init__.py +61 -0
  205. rem/utils/agentic_chunking.py +622 -0
  206. rem/utils/batch_ops.py +343 -0
  207. rem/utils/chunking.py +108 -0
  208. rem/utils/clip_embeddings.py +276 -0
  209. rem/utils/constants.py +97 -0
  210. rem/utils/date_utils.py +228 -0
  211. rem/utils/dict_utils.py +98 -0
  212. rem/utils/embeddings.py +436 -0
  213. rem/utils/examples/embeddings_example.py +305 -0
  214. rem/utils/examples/sql_types_example.py +202 -0
  215. rem/utils/files.py +323 -0
  216. rem/utils/markdown.py +16 -0
  217. rem/utils/mime_types.py +158 -0
  218. rem/utils/model_helpers.py +492 -0
  219. rem/utils/schema_loader.py +649 -0
  220. rem/utils/sql_paths.py +146 -0
  221. rem/utils/sql_types.py +350 -0
  222. rem/utils/user_id.py +81 -0
  223. rem/utils/vision.py +325 -0
  224. rem/workers/README.md +506 -0
  225. rem/workers/__init__.py +7 -0
  226. rem/workers/db_listener.py +579 -0
  227. rem/workers/db_maintainer.py +74 -0
  228. rem/workers/dreaming.py +502 -0
  229. rem/workers/engram_processor.py +312 -0
  230. rem/workers/sqs_file_processor.py +193 -0
  231. rem/workers/unlogged_maintainer.py +463 -0
  232. remdb-0.3.242.dist-info/METADATA +1632 -0
  233. remdb-0.3.242.dist-info/RECORD +235 -0
  234. remdb-0.3.242.dist-info/WHEEL +4 -0
  235. remdb-0.3.242.dist-info/entry_points.txt +2 -0
@@ -0,0 +1,672 @@
1
+ """
2
+ Experiment configuration model for Phoenix evaluations.
3
+
4
+ This model defines the structure and conventions for REM experiments,
5
+ supporting hybrid storage between Git (configurations) and S3 (datasets/results).
6
+
7
+ **Storage Convention**:
8
+ - **Git**: `.experiments/{experiment-name}/` for configuration and metadata
9
+ - **S3**: `s3://bucket/experiments/{experiment-name}/` for datasets and results
10
+ - **Hybrid**: Git acts as an "overlay" - configurations reference S3 paths
11
+
12
+ **Directory Structure**:
13
+ ```
14
+ .experiments/
15
+ └── {experiment-name}/
16
+ ├── experiment.yaml # This model (configuration)
17
+ ├── README.md # Experiment documentation
18
+ └── results/ # Optional: Git-tracked results (small datasets)
19
+ ├── metrics.json
20
+ └── traces/
21
+
22
+ s3://bucket/experiments/
23
+ └── {experiment-name}/
24
+ ├── datasets/ # Source data (too large for Git)
25
+ │ ├── ground_truth.csv
26
+ │ └── test_cases.jsonl
27
+ └── results/ # Experiment outputs
28
+ ├── run-2025-01-15/
29
+ └── run-2025-01-16/
30
+ ```
31
+
32
+ **Use Cases**:
33
+
34
+ 1. **Small Experiments (Git-only)**:
35
+ - Q&A validation with <100 examples
36
+ - Manual test cases
37
+ - Configuration-driven experiments
38
+ - Store everything in `.experiments/{name}/`
39
+
40
+ 2. **Large Experiments (Hybrid)**:
41
+ - Thousands of test cases
42
+ - Source data on S3, configs in Git
43
+ - Results on S3, metrics in Git
44
+ - `.experiments/{name}/experiment.yaml` references `s3://` paths
45
+
46
+ 3. **Production Experiments (S3-primary)**:
47
+ - Continuous evaluation pipelines
48
+ - Large-scale A/B tests
49
+ - Real-time dataset generation
50
+ - Git stores only configuration, all data on S3
51
+
52
+ **Workflow**:
53
+
54
+ ```bash
55
+ # 1. Create experiment scaffold
56
+ rem experiments create my-experiment \\
57
+ --agent cv-parser \\
58
+ --evaluator default \\
59
+ --description "Test CV parsing accuracy"
60
+
61
+ # 2. Generated structure:
62
+ .experiments/my-experiment/
63
+ ├── experiment.yaml # Configuration (this model)
64
+ ├── README.md # Auto-generated documentation
65
+ └── datasets/ # Optional: small datasets
66
+ └── ground_truth.csv
67
+
68
+ # 3. Run experiment
69
+ rem experiments run my-experiment
70
+
71
+ # 4. Commit configuration to Git
72
+ git add .experiments/my-experiment/
73
+ git commit -m "feat: Add CV parser experiment"
74
+ git tag -a experiments/my-experiment/v1.0.0 \\
75
+ -m "my-experiment v1.0.0: Initial experiment"
76
+ ```
77
+
78
+ **Version Tags**:
79
+ - Format: `experiments/{experiment-name}/vX.Y.Z`
80
+ - Example: `experiments/cv-parser-accuracy/v1.0.0`
81
+ - Allows tracking experiment configuration evolution
82
+ - GitProvider can load specific experiment versions
83
+
84
+ **Integration with Phoenix**:
85
+ ```python
86
+ from rem.models.core.experiment import ExperimentConfig
87
+ from rem.services.phoenix import PhoenixClient
88
+
89
+ # Load experiment configuration
90
+ config = ExperimentConfig.from_yaml(".experiments/my-experiment/experiment.yaml")
91
+
92
+ # Run experiment
93
+ client = PhoenixClient()
94
+ results = client.run_experiment(
95
+ name=config.name,
96
+ agent_schema=config.agent_schema_ref,
97
+ evaluator_schema=config.evaluator_schema_ref,
98
+ dataset=config.load_dataset(),
99
+ metadata=config.metadata
100
+ )
101
+
102
+ # Save results
103
+ config.save_results(results)
104
+ ```
105
+ """
106
+
107
+ from datetime import datetime
108
+ from enum import Enum
109
+ from pathlib import Path
110
+ from typing import Any, Literal
111
+
112
+ from pydantic import BaseModel, Field, field_validator
113
+
114
+
115
class DatasetLocation(str, Enum):
    """Storage backend for experiment datasets.

    GIT    -- small datasets kept under ``.experiments/{name}/datasets/``
    S3     -- large datasets living on S3
    HYBRID -- configuration tracked in Git, data stored on S3
    """

    GIT = "git"
    S3 = "s3"
    HYBRID = "hybrid"
120
+
121
+
122
class ExperimentStatus(str, Enum):
    """Lifecycle stage of an experiment configuration."""

    # Configuration still being defined
    DRAFT = "draft"
    # Configuration complete, ready to run
    READY = "ready"
    # Currently executing
    RUNNING = "running"
    # Finished successfully
    COMPLETED = "completed"
    # Execution failed
    FAILED = "failed"
    # Kept for historical reference only
    ARCHIVED = "archived"
130
+
131
+
132
class DatasetReference(BaseModel):
    """Reference to a dataset (Git or S3).

    Points at a single named dataset used by an experiment. The storage
    backend is given by ``location`` and the concrete address by ``path``;
    how ``path`` is interpreted depends on the backend (see field
    descriptions below).
    """

    # Storage backend; drives how ``path`` is interpreted.
    location: DatasetLocation = Field(
        description="Where the dataset is stored (git, s3, hybrid)"
    )

    # Dataset address; file format is inferred from the extension.
    path: str = Field(
        description=(
            "Path to dataset. Format is inferred from file extension.\n"
            "Supported: .csv, .tsv, .parquet, .json, .jsonl, .xlsx, .ods, .avro, .ipc\n"
            "- Git: Relative path from experiment root (e.g., 'datasets/ground_truth.csv')\n"
            "- S3: Full S3 URI (e.g., 's3://bucket/experiments/my-exp/datasets/data.parquet')\n"
            "- Hybrid: S3 URI for data, Git path for schema"
        )
    )

    # Optional Git-tracked schema document (used in hybrid mode).
    schema_path: str | None = Field(
        default=None,
        description=(
            "Optional: Path to dataset schema definition (for hybrid mode).\n"
            "Useful for documenting expected columns/fields in Git."
        )
    )

    # Free-text note about the dataset's contents/provenance.
    description: str | None = Field(
        default=None,
        description="Human-readable description of this dataset"
    )
161
+
162
+
163
class SchemaReference(BaseModel):
    """Reference to an agent or evaluator schema.

    Identifies a schema by bare name, with an optional Git tag pinning a
    specific version for reproducible runs; ``type`` distinguishes agent
    schemas from evaluator schemas.
    """

    # Bare schema name; resolved against the schemas/ directory layout.
    name: str = Field(
        description=(
            "Schema name (e.g., 'cv-parser', 'hello-world').\n"
            "Corresponds to schemas/agents/{name}.yaml or schemas/evaluators/{agent}/{name}.yaml"
        )
    )

    # Optional Git tag; None means "latest from main branch".
    version: str | None = Field(
        default=None,
        description=(
            "Semantic version tag (e.g., 'schemas/cv-parser/v2.1.0').\n"
            "If None, uses latest version from main branch."
        )
    )

    # Discriminates agent schemas from evaluator schemas.
    # (Field name shadows the builtin `type`, but renaming would break the
    # serialized YAML/JSON contract, so it is kept as-is.)
    type: Literal["agent", "evaluator"] = Field(
        description="Schema type (agent or evaluator)"
    )
184
+
185
+
186
class ResultsConfig(BaseModel):
    """Configuration for where experiment results are stored.

    Controls the storage backend (Git / S3 / hybrid), the base path under
    which result files are written, and which artifacts to keep (full
    Phoenix traces vs. a small metrics summary).
    """

    # Storage backend for results; drives interpretation of ``base_path``.
    location: DatasetLocation = Field(
        description="Where to store results (git, s3, hybrid)"
    )

    # Root directory or S3 URI under which result files are written.
    base_path: str = Field(
        description=(
            "Base path for results storage:\n"
            "- Git: '.experiments/{experiment-name}/results/'\n"
            "- S3: 's3://bucket/experiments/{experiment-name}/results/'\n"
            "- Hybrid: Both (small metrics in Git, full traces on S3)"
        )
    )

    # Full Phoenix traces can be large; disable for Git-only experiments.
    save_traces: bool = Field(
        default=True,
        description="Save full Phoenix traces (can be large)"
    )

    # Compact metrics summary; small enough to commit to Git.
    save_metrics_summary: bool = Field(
        default=True,
        description="Save metrics summary (small, suitable for Git)"
    )

    # Filename for the metrics summary, relative to ``base_path``.
    metrics_file: str = Field(
        default="metrics.json",
        description="Filename for metrics summary (stored in base_path)"
    )
216
+
217
+
218
class ExperimentConfig(BaseModel):
    """
    Complete experiment configuration for Phoenix evaluations.

    This model defines everything needed to run a reproducible experiment:
    - Agent and evaluator schemas (versioned via Git)
    - Dataset references (Git or S3)
    - Results storage configuration
    - Experiment metadata and documentation

    **Naming Convention**:
    - Experiment names: lowercase-with-hyphens (e.g., 'cv-parser-accuracy')
    - Directory: `.experiments/{experiment-name}/`
    - Config file: `.experiments/{experiment-name}/experiment.yaml`
    - Version tags: `experiments/{experiment-name}/vX.Y.Z`

    **Example (small, Git-only experiment)**:

    ```yaml
    name: hello-world-validation
    description: Validate hello-world agent responses
    agent_schema_ref:
      name: hello-world
      version: schemas/hello-world/v1.0.0
      type: agent
    evaluator_schema_ref:
      name: default
      type: evaluator
    datasets:
      ground_truth:
        location: git
        path: datasets/ground_truth.csv   # format inferred from extension
    results:
      location: git
      base_path: results/
      save_traces: false
      save_metrics_summary: true
    status: ready
    tags: [validation, smoke-test]
    ```

    For large/hybrid experiments, point dataset ``path`` entries at
    ``s3://`` URIs and set ``results.location: hybrid`` (small metrics
    summary in Git, full traces on S3). See ``EXAMPLE_LARGE_EXPERIMENT``
    in this module for a complete hybrid configuration.
    """

    # --- Core identification -------------------------------------------

    name: str = Field(
        description=(
            "Unique experiment identifier (lowercase-with-hyphens).\n"
            "Used for directory name, tags, and references."
        )
    )

    task: str = Field(
        default="general",
        description=(
            "Task name for organizing experiments by purpose.\n"
            "Used with agent name to form directory: {agent}/{task}/\n"
            "Examples: 'risk-assessment', 'classification', 'general'"
        )
    )

    description: str = Field(
        description="Human-readable description of experiment purpose and goals"
    )

    # --- Schema references ----------------------------------------------

    agent_schema_ref: SchemaReference = Field(
        description=(
            "Reference to agent schema being evaluated.\n"
            "Supports versioning via Git tags (e.g., schemas/cv-parser/v2.1.0)"
        )
    )

    evaluator_schema_ref: SchemaReference = Field(
        description=(
            "Reference to evaluator schema for judging agent outputs.\n"
            "Can reference evaluators/{agent-name}/{evaluator-name}.yaml"
        )
    )

    # --- Dataset configuration ------------------------------------------

    datasets: dict[str, DatasetReference] = Field(
        description=(
            "Named datasets for this experiment.\n"
            "Common keys: 'ground_truth', 'test_cases', 'validation_set'\n"
            "Supports Git (small datasets), S3 (large datasets), or hybrid"
        )
    )

    # --- Results configuration ------------------------------------------

    results: ResultsConfig = Field(
        description=(
            "Configuration for experiment results storage.\n"
            "Supports Git (small results), S3 (large results), or hybrid"
        )
    )

    # --- Metadata and organization ---------------------------------------

    status: ExperimentStatus = Field(
        default=ExperimentStatus.DRAFT,
        description="Current experiment lifecycle status"
    )

    tags: list[str] = Field(
        default_factory=list,
        description=(
            "Tags for organizing experiments.\n"
            "Examples: ['production', 'cv-parser', 'weekly', 'regression']"
        )
    )

    metadata: dict[str, Any] = Field(
        default_factory=dict,
        description=(
            "Custom metadata key-value pairs.\n"
            "Examples: cost_per_run, expected_runtime, team, priority"
        )
    )

    # --- Timestamps (auto-managed) ----------------------------------------
    # NOTE(review): these are naive local-time datetimes; if configs are
    # shared across timezones, consider datetime.now(timezone.utc) — TODO
    # confirm against existing persisted configs before changing.

    created_at: datetime = Field(
        default_factory=datetime.now,
        description="When this experiment configuration was created"
    )

    updated_at: datetime = Field(
        default_factory=datetime.now,
        description="When this experiment configuration was last modified"
    )

    last_run_at: datetime | None = Field(
        default=None,
        description="When this experiment was last executed"
    )

    # --- Validators --------------------------------------------------------

    @staticmethod
    def _validate_slug(value: str, label: str) -> str:
        """Validate a lowercase-with-hyphens identifier ('slug').

        Shared by the ``name`` and ``task`` validators so both enforce
        identical rules (with ``label`` naming the field in errors).

        Note: uses ``value != value.lower()`` rather than ``islower()``
        because ``islower()`` returns False for digit-only strings and
        would wrongly reject a valid slug such as "123".
        """
        if value != value.lower():
            raise ValueError(f"{label} must be lowercase")

        if " " in value:
            raise ValueError(f"{label} cannot contain spaces (use hyphens)")

        if not all(c.isalnum() or c == "-" for c in value):
            raise ValueError(
                f"{label} can only contain lowercase letters, numbers, and hyphens"
            )

        return value

    @field_validator("name")
    @classmethod
    def validate_name(cls, v: str) -> str:
        """Validate experiment name follows conventions."""
        if not v:
            raise ValueError("Experiment name cannot be empty")
        return cls._validate_slug(v, "Experiment name")

    @field_validator("task")
    @classmethod
    def validate_task(cls, v: str) -> str:
        """Validate task name follows conventions."""
        if not v:
            return "general"  # Fall back to the field default
        return cls._validate_slug(v, "Task name")

    @field_validator("tags")
    @classmethod
    def validate_tags(cls, v: list[str]) -> list[str]:
        """Ensure tags are lowercase and normalized."""
        return [tag.lower().strip() for tag in v]

    # --- Path helpers --------------------------------------------------------

    def get_experiment_dir(self, base_path: str = ".experiments") -> Path:
        """Get the experiment directory path ({base_path}/{name})."""
        return Path(base_path) / self.name

    def get_agent_task_dir(self, base_path: str = ".experiments") -> Path:
        """
        Get the experiment directory path organized by agent/task.

        Returns: Path like .experiments/{agent}/{task}/
        This is the recommended structure for S3 export compatibility.
        """
        return Path(base_path) / self.agent_schema_ref.name / self.task

    def get_config_path(self, base_path: str = ".experiments") -> Path:
        """Get the path to experiment.yaml file."""
        return self.get_experiment_dir(base_path) / "experiment.yaml"

    def get_readme_path(self, base_path: str = ".experiments") -> Path:
        """Get the path to README.md file."""
        return self.get_experiment_dir(base_path) / "README.md"

    def get_evaluator_filename(self) -> str:
        """
        Get the evaluator filename with task prefix.

        Returns: {agent_name}-{task}.yaml (e.g., siggy-risk-assessment.yaml)
        """
        return f"{self.agent_schema_ref.name}-{self.task}.yaml"

    def get_s3_export_path(self, bucket: str, version: str = "v0") -> str:
        """
        Get the S3 path for exporting this experiment.

        Returns: s3://{bucket}/{version}/datasets/calibration/experiments/{agent}/{task}/
        """
        return f"s3://{bucket}/{version}/datasets/calibration/experiments/{self.agent_schema_ref.name}/{self.task}"

    # --- Serialization ---------------------------------------------------------

    def to_yaml(self) -> str:
        """Export configuration as YAML string (None fields omitted)."""
        import yaml  # local import keeps PyYAML optional at module import
        return yaml.dump(
            self.model_dump(mode="json", exclude_none=True),
            default_flow_style=False,
            sort_keys=False
        )

    @classmethod
    def from_yaml(cls, path: str | Path) -> "ExperimentConfig":
        """Load configuration from YAML file.

        Raises pydantic.ValidationError if the file is empty or the
        content does not satisfy the model (an empty YAML document parses
        to None, which is normalized to {} here for a clearer error).
        """
        import yaml  # local import keeps PyYAML optional at module import
        with open(path) as f:
            # safe_load returns None for an empty document; without the
            # `or {}` guard, cls(**None) would raise a confusing TypeError.
            data = yaml.safe_load(f) or {}
        return cls(**data)

    def save(self, base_path: str = ".experiments") -> Path:
        """
        Save experiment configuration to YAML file.

        Creates directory structure if it doesn't exist.
        Updates `updated_at` timestamp.

        Returns:
            Path to saved experiment.yaml file
        """
        self.updated_at = datetime.now()

        config_path = self.get_config_path(base_path)
        config_path.parent.mkdir(parents=True, exist_ok=True)

        with open(config_path, "w") as f:
            f.write(self.to_yaml())

        return config_path

    # --- Documentation generation ------------------------------------------------

    def generate_readme(self) -> str:
        """
        Generate README.md content for experiment.

        Includes:
        - Experiment description
        - Schema references
        - Dataset information
        - How to run
        - Results location
        """
        readme = f"""# {self.name}

{self.description}

## Configuration

**Status**: `{self.status.value}`
**Task**: `{self.task}`
**Tags**: {', '.join(f'`{tag}`' for tag in self.tags) if self.tags else 'None'}

## Agent Schema

- **Name**: `{self.agent_schema_ref.name}`
- **Version**: `{self.agent_schema_ref.version or 'latest'}`
- **Type**: `{self.agent_schema_ref.type}`

## Evaluator Schema

- **Name**: `{self.evaluator_schema_ref.name}`
- **File**: `{self.get_evaluator_filename()}`
- **Type**: `{self.evaluator_schema_ref.type}`

## Datasets

"""
        for name, dataset in self.datasets.items():
            readme += f"""### {name}

- **Location**: `{dataset.location.value}`
- **Path**: `{dataset.path}`
"""
            if dataset.description:
                readme += f"- **Description**: {dataset.description}\n"
            readme += "\n"

        readme += f"""## Results

- **Location**: `{self.results.location.value}`
- **Base Path**: `{self.results.base_path}`
- **Save Traces**: `{self.results.save_traces}`
- **Metrics File**: `{self.results.metrics_file}`

## How to Run

```bash
# Run this experiment
rem experiments run {self.name}

# Run with specific version
rem experiments run {self.name} --version experiments/{self.name}/v1.0.0
```

## Metadata

"""
        if self.metadata:
            for key, value in self.metadata.items():
                readme += f"- **{key}**: `{value}`\n"
        else:
            readme += "None\n"

        readme += f"""
## Timestamps

- **Created**: {self.created_at.isoformat()}
- **Updated**: {self.updated_at.isoformat()}
"""
        if self.last_run_at:
            readme += f"- **Last Run**: {self.last_run_at.isoformat()}\n"

        return readme

    def save_readme(self, base_path: str = ".experiments") -> Path:
        """Save auto-generated README.md file and return its path."""
        readme_path = self.get_readme_path(base_path)
        readme_path.parent.mkdir(parents=True, exist_ok=True)

        with open(readme_path, "w") as f:
            f.write(self.generate_readme())

        return readme_path
605
+
606
+
607
# Example configurations for reference

# Minimal Git-only experiment: the small curated dataset and the metrics
# summary both live under the experiment's own .experiments/ directory.
EXAMPLE_SMALL_EXPERIMENT = ExperimentConfig(
    name="hello-world-validation",
    description="Smoke test for hello-world agent responses",
    agent_schema_ref=SchemaReference(
        name="hello-world",
        version="schemas/hello-world/v1.0.0",  # pinned via Git tag
        type="agent"
    ),
    evaluator_schema_ref=SchemaReference(
        name="default",
        type="evaluator"  # no version: latest from main branch
    ),
    datasets={
        "ground_truth": DatasetReference(
            location=DatasetLocation.GIT,
            path="datasets/ground_truth.csv",
            description="10 manually curated test cases"
        )
    },
    results=ResultsConfig(
        location=DatasetLocation.GIT,
        base_path="results/",
        save_traces=False,  # traces are too large to commit to Git
        save_metrics_summary=True
    ),
    status=ExperimentStatus.READY,
    tags=["validation", "smoke-test"]
)
636
+
637
# Hybrid experiment: bulky datasets and traces stay on S3 while the
# configuration (and a copied metrics.json) is tracked in Git.
EXAMPLE_LARGE_EXPERIMENT = ExperimentConfig(
    name="cv-parser-production",
    description="Production CV parser evaluation with 10K resumes",
    agent_schema_ref=SchemaReference(
        name="cv-parser",
        version="schemas/cv-parser/v2.1.0",  # pinned via Git tag
        type="agent"
    ),
    evaluator_schema_ref=SchemaReference(
        name="default",
        type="evaluator"
    ),
    datasets={
        "ground_truth": DatasetReference(
            location=DatasetLocation.S3,
            path="s3://rem-prod/experiments/cv-parser-production/datasets/ground_truth.parquet",
            schema_path="datasets/schema.yaml",  # schema documented in Git
            description="10,000 CV/resume pairs with ground truth extractions"
        )
    },
    results=ResultsConfig(
        location=DatasetLocation.HYBRID,
        base_path="s3://rem-prod/experiments/cv-parser-production/results/",
        save_traces=True,
        save_metrics_summary=True,
        metrics_file="metrics.json"
    ),
    metadata={
        "cost_per_run_usd": 5.25,
        "expected_runtime_minutes": 45,
        "team": "recruitment-ai",
        "priority": "high"
    },
    status=ExperimentStatus.READY,
    tags=["production", "cv-parser", "weekly"]
)