remdb 0.3.0 (remdb-0.3.0-py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of remdb has been flagged as potentially problematic.

Files changed (187)
  1. rem/__init__.py +2 -0
  2. rem/agentic/README.md +650 -0
  3. rem/agentic/__init__.py +39 -0
  4. rem/agentic/agents/README.md +155 -0
  5. rem/agentic/agents/__init__.py +8 -0
  6. rem/agentic/context.py +148 -0
  7. rem/agentic/context_builder.py +329 -0
  8. rem/agentic/mcp/__init__.py +0 -0
  9. rem/agentic/mcp/tool_wrapper.py +107 -0
  10. rem/agentic/otel/__init__.py +5 -0
  11. rem/agentic/otel/setup.py +151 -0
  12. rem/agentic/providers/phoenix.py +674 -0
  13. rem/agentic/providers/pydantic_ai.py +572 -0
  14. rem/agentic/query.py +117 -0
  15. rem/agentic/query_helper.py +89 -0
  16. rem/agentic/schema.py +396 -0
  17. rem/agentic/serialization.py +245 -0
  18. rem/agentic/tools/__init__.py +5 -0
  19. rem/agentic/tools/rem_tools.py +231 -0
  20. rem/api/README.md +420 -0
  21. rem/api/main.py +324 -0
  22. rem/api/mcp_router/prompts.py +182 -0
  23. rem/api/mcp_router/resources.py +536 -0
  24. rem/api/mcp_router/server.py +213 -0
  25. rem/api/mcp_router/tools.py +584 -0
  26. rem/api/routers/auth.py +229 -0
  27. rem/api/routers/chat/__init__.py +5 -0
  28. rem/api/routers/chat/completions.py +281 -0
  29. rem/api/routers/chat/json_utils.py +76 -0
  30. rem/api/routers/chat/models.py +124 -0
  31. rem/api/routers/chat/streaming.py +185 -0
  32. rem/auth/README.md +258 -0
  33. rem/auth/__init__.py +26 -0
  34. rem/auth/middleware.py +100 -0
  35. rem/auth/providers/__init__.py +13 -0
  36. rem/auth/providers/base.py +376 -0
  37. rem/auth/providers/google.py +163 -0
  38. rem/auth/providers/microsoft.py +237 -0
  39. rem/cli/README.md +455 -0
  40. rem/cli/__init__.py +8 -0
  41. rem/cli/commands/README.md +126 -0
  42. rem/cli/commands/__init__.py +3 -0
  43. rem/cli/commands/ask.py +566 -0
  44. rem/cli/commands/configure.py +497 -0
  45. rem/cli/commands/db.py +493 -0
  46. rem/cli/commands/dreaming.py +324 -0
  47. rem/cli/commands/experiments.py +1302 -0
  48. rem/cli/commands/mcp.py +66 -0
  49. rem/cli/commands/process.py +245 -0
  50. rem/cli/commands/schema.py +183 -0
  51. rem/cli/commands/serve.py +106 -0
  52. rem/cli/dreaming.py +363 -0
  53. rem/cli/main.py +96 -0
  54. rem/config.py +237 -0
  55. rem/mcp_server.py +41 -0
  56. rem/models/core/__init__.py +49 -0
  57. rem/models/core/core_model.py +64 -0
  58. rem/models/core/engram.py +333 -0
  59. rem/models/core/experiment.py +628 -0
  60. rem/models/core/inline_edge.py +132 -0
  61. rem/models/core/rem_query.py +243 -0
  62. rem/models/entities/__init__.py +43 -0
  63. rem/models/entities/file.py +57 -0
  64. rem/models/entities/image_resource.py +88 -0
  65. rem/models/entities/message.py +35 -0
  66. rem/models/entities/moment.py +123 -0
  67. rem/models/entities/ontology.py +191 -0
  68. rem/models/entities/ontology_config.py +131 -0
  69. rem/models/entities/resource.py +95 -0
  70. rem/models/entities/schema.py +87 -0
  71. rem/models/entities/user.py +85 -0
  72. rem/py.typed +0 -0
  73. rem/schemas/README.md +507 -0
  74. rem/schemas/__init__.py +6 -0
  75. rem/schemas/agents/README.md +92 -0
  76. rem/schemas/agents/core/moment-builder.yaml +178 -0
  77. rem/schemas/agents/core/rem-query-agent.yaml +226 -0
  78. rem/schemas/agents/core/resource-affinity-assessor.yaml +99 -0
  79. rem/schemas/agents/core/simple-assistant.yaml +19 -0
  80. rem/schemas/agents/core/user-profile-builder.yaml +163 -0
  81. rem/schemas/agents/examples/contract-analyzer.yaml +317 -0
  82. rem/schemas/agents/examples/contract-extractor.yaml +134 -0
  83. rem/schemas/agents/examples/cv-parser.yaml +263 -0
  84. rem/schemas/agents/examples/hello-world.yaml +37 -0
  85. rem/schemas/agents/examples/query.yaml +54 -0
  86. rem/schemas/agents/examples/simple.yaml +21 -0
  87. rem/schemas/agents/examples/test.yaml +29 -0
  88. rem/schemas/agents/rem.yaml +128 -0
  89. rem/schemas/evaluators/hello-world/default.yaml +77 -0
  90. rem/schemas/evaluators/rem/faithfulness.yaml +219 -0
  91. rem/schemas/evaluators/rem/lookup-correctness.yaml +182 -0
  92. rem/schemas/evaluators/rem/retrieval-precision.yaml +199 -0
  93. rem/schemas/evaluators/rem/retrieval-recall.yaml +211 -0
  94. rem/schemas/evaluators/rem/search-correctness.yaml +192 -0
  95. rem/services/__init__.py +16 -0
  96. rem/services/audio/INTEGRATION.md +308 -0
  97. rem/services/audio/README.md +376 -0
  98. rem/services/audio/__init__.py +15 -0
  99. rem/services/audio/chunker.py +354 -0
  100. rem/services/audio/transcriber.py +259 -0
  101. rem/services/content/README.md +1269 -0
  102. rem/services/content/__init__.py +5 -0
  103. rem/services/content/providers.py +806 -0
  104. rem/services/content/service.py +676 -0
  105. rem/services/dreaming/README.md +230 -0
  106. rem/services/dreaming/__init__.py +53 -0
  107. rem/services/dreaming/affinity_service.py +336 -0
  108. rem/services/dreaming/moment_service.py +264 -0
  109. rem/services/dreaming/ontology_service.py +54 -0
  110. rem/services/dreaming/user_model_service.py +297 -0
  111. rem/services/dreaming/utils.py +39 -0
  112. rem/services/embeddings/__init__.py +11 -0
  113. rem/services/embeddings/api.py +120 -0
  114. rem/services/embeddings/worker.py +421 -0
  115. rem/services/fs/README.md +662 -0
  116. rem/services/fs/__init__.py +62 -0
  117. rem/services/fs/examples.py +206 -0
  118. rem/services/fs/examples_paths.py +204 -0
  119. rem/services/fs/git_provider.py +935 -0
  120. rem/services/fs/local_provider.py +760 -0
  121. rem/services/fs/parsing-hooks-examples.md +172 -0
  122. rem/services/fs/paths.py +276 -0
  123. rem/services/fs/provider.py +460 -0
  124. rem/services/fs/s3_provider.py +1042 -0
  125. rem/services/fs/service.py +186 -0
  126. rem/services/git/README.md +1075 -0
  127. rem/services/git/__init__.py +17 -0
  128. rem/services/git/service.py +469 -0
  129. rem/services/phoenix/EXPERIMENT_DESIGN.md +1146 -0
  130. rem/services/phoenix/README.md +453 -0
  131. rem/services/phoenix/__init__.py +46 -0
  132. rem/services/phoenix/client.py +686 -0
  133. rem/services/phoenix/config.py +88 -0
  134. rem/services/phoenix/prompt_labels.py +477 -0
  135. rem/services/postgres/README.md +575 -0
  136. rem/services/postgres/__init__.py +23 -0
  137. rem/services/postgres/migration_service.py +427 -0
  138. rem/services/postgres/pydantic_to_sqlalchemy.py +232 -0
  139. rem/services/postgres/register_type.py +352 -0
  140. rem/services/postgres/repository.py +337 -0
  141. rem/services/postgres/schema_generator.py +379 -0
  142. rem/services/postgres/service.py +802 -0
  143. rem/services/postgres/sql_builder.py +354 -0
  144. rem/services/rem/README.md +304 -0
  145. rem/services/rem/__init__.py +23 -0
  146. rem/services/rem/exceptions.py +71 -0
  147. rem/services/rem/executor.py +293 -0
  148. rem/services/rem/parser.py +145 -0
  149. rem/services/rem/queries.py +196 -0
  150. rem/services/rem/query.py +371 -0
  151. rem/services/rem/service.py +527 -0
  152. rem/services/session/README.md +374 -0
  153. rem/services/session/__init__.py +6 -0
  154. rem/services/session/compression.py +360 -0
  155. rem/services/session/reload.py +77 -0
  156. rem/settings.py +1235 -0
  157. rem/sql/002_install_models.sql +1068 -0
  158. rem/sql/background_indexes.sql +42 -0
  159. rem/sql/install_models.sql +1038 -0
  160. rem/sql/migrations/001_install.sql +503 -0
  161. rem/sql/migrations/002_install_models.sql +1202 -0
  162. rem/utils/AGENTIC_CHUNKING.md +597 -0
  163. rem/utils/README.md +583 -0
  164. rem/utils/__init__.py +43 -0
  165. rem/utils/agentic_chunking.py +622 -0
  166. rem/utils/batch_ops.py +343 -0
  167. rem/utils/chunking.py +108 -0
  168. rem/utils/clip_embeddings.py +276 -0
  169. rem/utils/dict_utils.py +98 -0
  170. rem/utils/embeddings.py +423 -0
  171. rem/utils/examples/embeddings_example.py +305 -0
  172. rem/utils/examples/sql_types_example.py +202 -0
  173. rem/utils/markdown.py +16 -0
  174. rem/utils/model_helpers.py +236 -0
  175. rem/utils/schema_loader.py +336 -0
  176. rem/utils/sql_types.py +348 -0
  177. rem/utils/user_id.py +81 -0
  178. rem/utils/vision.py +330 -0
  179. rem/workers/README.md +506 -0
  180. rem/workers/__init__.py +5 -0
  181. rem/workers/dreaming.py +502 -0
  182. rem/workers/engram_processor.py +312 -0
  183. rem/workers/sqs_file_processor.py +193 -0
  184. remdb-0.3.0.dist-info/METADATA +1455 -0
  185. remdb-0.3.0.dist-info/RECORD +187 -0
  186. remdb-0.3.0.dist-info/WHEEL +4 -0
  187. remdb-0.3.0.dist-info/entry_points.txt +2 -0
rem/models/core/experiment.py
@@ -0,0 +1,628 @@
+"""
+Experiment configuration model for Phoenix evaluations.
+
+This model defines the structure and conventions for REM experiments,
+supporting hybrid storage between Git (configurations) and S3 (datasets/results).
+
+**Storage Convention**:
+- **Git**: `.experiments/{experiment-name}/` for configuration and metadata
+- **S3**: `s3://bucket/experiments/{experiment-name}/` for datasets and results
+- **Hybrid**: Git acts as an "overlay" - configurations reference S3 paths
+
+**Directory Structure**:
+```
+.experiments/
+└── {experiment-name}/
+    ├── experiment.yaml      # This model (configuration)
+    ├── README.md            # Experiment documentation
+    └── results/             # Optional: Git-tracked results (small datasets)
+        ├── metrics.json
+        └── traces/
+
+s3://bucket/experiments/
+└── {experiment-name}/
+    ├── datasets/            # Source data (too large for Git)
+    │   ├── ground_truth.csv
+    │   └── test_cases.jsonl
+    └── results/             # Experiment outputs
+        ├── run-2025-01-15/
+        └── run-2025-01-16/
+```
+
+**Use Cases**:
+
+1. **Small Experiments (Git-only)**:
+   - Q&A validation with <100 examples
+   - Manual test cases
+   - Configuration-driven experiments
+   - Store everything in `.experiments/{name}/`
+
+2. **Large Experiments (Hybrid)**:
+   - Thousands of test cases
+   - Source data on S3, configs in Git
+   - Results on S3, metrics in Git
+   - `.experiments/{name}/experiment.yaml` references `s3://` paths
+
+3. **Production Experiments (S3-primary)**:
+   - Continuous evaluation pipelines
+   - Large-scale A/B tests
+   - Real-time dataset generation
+   - Git stores only configuration, all data on S3
+
+**Workflow**:
+
+```bash
+# 1. Create experiment scaffold
+rem experiments create my-experiment \\
+    --agent cv-parser \\
+    --evaluator default \\
+    --description "Test CV parsing accuracy"
+
+# 2. Generated structure:
+.experiments/my-experiment/
+├── experiment.yaml          # Configuration (this model)
+├── README.md                # Auto-generated documentation
+└── datasets/                # Optional: small datasets
+    └── ground_truth.csv
+
+# 3. Run experiment
+rem experiments run my-experiment
+
+# 4. Commit configuration to Git
+git add .experiments/my-experiment/
+git commit -m "feat: Add CV parser experiment"
+git tag -a experiments/my-experiment/v1.0.0 \\
+    -m "my-experiment v1.0.0: Initial experiment"
+```
+
+**Version Tags**:
+- Format: `experiments/{experiment-name}/vX.Y.Z`
+- Example: `experiments/cv-parser-accuracy/v1.0.0`
+- Allows tracking experiment configuration evolution
+- GitProvider can load specific experiment versions
+
+**Integration with Phoenix**:
+```python
+from rem.models.core.experiment import ExperimentConfig
+from rem.services.phoenix import PhoenixClient
+
+# Load experiment configuration
+config = ExperimentConfig.from_yaml(".experiments/my-experiment/experiment.yaml")
+
+# Run experiment
+client = PhoenixClient()
+results = client.run_experiment(
+    name=config.name,
+    agent_schema=config.agent_schema_ref,
+    evaluator_schema=config.evaluator_schema_ref,
+    dataset=config.load_dataset(),
+    metadata=config.metadata
+)
+
+# Save results
+config.save_results(results)
+```
+"""
+
+from datetime import datetime
+from enum import Enum
+from pathlib import Path
+from typing import Any, Literal
+
+from pydantic import BaseModel, Field, field_validator
+
+
+class DatasetLocation(str, Enum):
+    """Where experiment datasets are stored."""
+    GIT = "git"        # Small datasets in .experiments/{name}/datasets/
+    S3 = "s3"          # Large datasets on S3
+    HYBRID = "hybrid"  # Configuration in Git, data on S3
+
+
+class ExperimentStatus(str, Enum):
+    """Experiment lifecycle status."""
+    DRAFT = "draft"          # Configuration being defined
+    READY = "ready"          # Ready to run
+    RUNNING = "running"      # Currently executing
+    COMPLETED = "completed"  # Finished successfully
+    FAILED = "failed"        # Execution failed
+    ARCHIVED = "archived"    # Historical experiment
+
+
+class DatasetReference(BaseModel):
+    """Reference to a dataset (Git or S3)."""
+
+    location: DatasetLocation = Field(
+        description="Where the dataset is stored (git, s3, hybrid)"
+    )
+
+    path: str = Field(
+        description=(
+            "Path to dataset:\n"
+            "- Git: Relative path from experiment root (e.g., 'datasets/ground_truth.csv')\n"
+            "- S3: Full S3 URI (e.g., 's3://bucket/experiments/my-exp/datasets/ground_truth.csv')\n"
+            "- Hybrid: S3 URI for data, Git path for schema"
+        )
+    )
+
+    format: Literal["csv", "jsonl", "parquet", "json"] = Field(
+        default="csv",
+        description="Dataset file format"
+    )
+
+    schema_path: str | None = Field(
+        default=None,
+        description=(
+            "Optional: Path to dataset schema definition (for hybrid mode).\n"
+            "Useful for documenting expected columns/fields in Git."
+        )
+    )
+
+    description: str | None = Field(
+        default=None,
+        description="Human-readable description of this dataset"
+    )
+
+
+class SchemaReference(BaseModel):
+    """Reference to an agent or evaluator schema."""
+
+    name: str = Field(
+        description=(
+            "Schema name (e.g., 'cv-parser', 'hello-world').\n"
+            "Corresponds to schemas/agents/{name}.yaml or schemas/evaluators/{agent}/{name}.yaml"
+        )
+    )
+
+    version: str | None = Field(
+        default=None,
+        description=(
+            "Semantic version tag (e.g., 'schemas/cv-parser/v2.1.0').\n"
+            "If None, uses latest version from main branch."
+        )
+    )
+
+    type: Literal["agent", "evaluator"] = Field(
+        description="Schema type (agent or evaluator)"
+    )
+
+
+class ResultsConfig(BaseModel):
+    """Configuration for where experiment results are stored."""
+
+    location: DatasetLocation = Field(
+        description="Where to store results (git, s3, hybrid)"
+    )
+
+    base_path: str = Field(
+        description=(
+            "Base path for results storage:\n"
+            "- Git: '.experiments/{experiment-name}/results/'\n"
+            "- S3: 's3://bucket/experiments/{experiment-name}/results/'\n"
+            "- Hybrid: Both (small metrics in Git, full traces on S3)"
+        )
+    )
+
+    save_traces: bool = Field(
+        default=True,
+        description="Save full Phoenix traces (can be large)"
+    )
+
+    save_metrics_summary: bool = Field(
+        default=True,
+        description="Save metrics summary (small, suitable for Git)"
+    )
+
+    metrics_file: str = Field(
+        default="metrics.json",
+        description="Filename for metrics summary (stored in base_path)"
+    )
+
+
+class ExperimentConfig(BaseModel):
+    """
+    Complete experiment configuration for Phoenix evaluations.
+
+    This model defines everything needed to run a reproducible experiment:
+    - Agent and evaluator schemas (versioned via Git)
+    - Dataset references (Git or S3)
+    - Results storage configuration
+    - Experiment metadata and documentation
+
+    **Naming Convention**:
+    - Experiment names: lowercase-with-hyphens (e.g., 'cv-parser-accuracy')
+    - Directory: `.experiments/{experiment-name}/`
+    - Config file: `.experiments/{experiment-name}/experiment.yaml`
+    - Version tags: `experiments/{experiment-name}/vX.Y.Z`
+
+    **Fields**:
+    - `name`: Unique experiment identifier
+    - `description`: Human-readable purpose
+    - `agent_schema_ref`: Which agent to evaluate
+    - `evaluator_schema_ref`: Which evaluator to use
+    - `datasets`: Input datasets (ground truth, test cases)
+    - `results`: Where to store outputs
+    - `metadata`: Custom key-value pairs
+    - `status`: Current lifecycle stage
+    - `tags`: Organizational labels
+
+    **Examples**:
+
+    ```yaml
+    # Small experiment (Git-only)
+    name: hello-world-validation
+    description: Validate hello-world agent responses
+    agent_schema_ref:
+      name: hello-world
+      version: schemas/hello-world/v1.0.0
+      type: agent
+    evaluator_schema_ref:
+      name: default
+      type: evaluator
+    datasets:
+      ground_truth:
+        location: git
+        path: datasets/ground_truth.csv
+        format: csv
+    results:
+      location: git
+      base_path: results/
+      save_traces: false
+      save_metrics_summary: true
+    status: ready
+    tags: [validation, smoke-test]
+    ```
+
+    ```yaml
+    # Large experiment (Hybrid)
+    name: cv-parser-production
+    description: Production CV parser evaluation with 10K resumes
+    agent_schema_ref:
+      name: cv-parser
+      version: schemas/cv-parser/v2.1.0
+      type: agent
+    evaluator_schema_ref:
+      name: default
+      type: evaluator
+    datasets:
+      ground_truth:
+        location: s3
+        path: s3://rem-prod/experiments/cv-parser-production/datasets/ground_truth.parquet
+        format: parquet
+        schema_path: datasets/schema.yaml  # Schema in Git for documentation
+      test_cases:
+        location: s3
+        path: s3://rem-prod/experiments/cv-parser-production/datasets/test_cases.jsonl
+        format: jsonl
+    results:
+      location: hybrid
+      base_path: s3://rem-prod/experiments/cv-parser-production/results/
+      save_traces: true
+      save_metrics_summary: true
+      metrics_file: metrics.json  # Copied to Git after run
+    metadata:
+      cost_per_run_usd: 5.25
+      expected_runtime_minutes: 45
+      team: recruitment-ai
+      priority: high
+    status: ready
+    tags: [production, cv-parser, weekly]
+    ```
+    """
+
+    # Core identification
+    name: str = Field(
+        description=(
+            "Unique experiment identifier (lowercase-with-hyphens).\n"
+            "Used for directory name, tags, and references."
+        )
+    )
+
+    description: str = Field(
+        description="Human-readable description of experiment purpose and goals"
+    )
+
+    # Schema references
+    agent_schema_ref: SchemaReference = Field(
+        description=(
+            "Reference to agent schema being evaluated.\n"
+            "Supports versioning via Git tags (e.g., schemas/cv-parser/v2.1.0)"
+        )
+    )
+
+    evaluator_schema_ref: SchemaReference = Field(
+        description=(
+            "Reference to evaluator schema for judging agent outputs.\n"
+            "Can reference evaluators/{agent-name}/{evaluator-name}.yaml"
+        )
+    )
+
+    # Dataset configuration
+    datasets: dict[str, DatasetReference] = Field(
+        description=(
+            "Named datasets for this experiment.\n"
+            "Common keys: 'ground_truth', 'test_cases', 'validation_set'\n"
+            "Supports Git (small datasets), S3 (large datasets), or hybrid"
+        )
+    )
+
+    # Results configuration
+    results: ResultsConfig = Field(
+        description=(
+            "Configuration for experiment results storage.\n"
+            "Supports Git (small results), S3 (large results), or hybrid"
+        )
+    )
+
+    # Metadata and organization
+    status: ExperimentStatus = Field(
+        default=ExperimentStatus.DRAFT,
+        description="Current experiment lifecycle status"
+    )
+
+    tags: list[str] = Field(
+        default_factory=list,
+        description=(
+            "Tags for organizing experiments.\n"
+            "Examples: ['production', 'cv-parser', 'weekly', 'regression']"
+        )
+    )
+
+    metadata: dict[str, Any] = Field(
+        default_factory=dict,
+        description=(
+            "Custom metadata key-value pairs.\n"
+            "Examples: cost_per_run, expected_runtime, team, priority"
+        )
+    )
+
+    # Timestamps (auto-managed)
+    created_at: datetime = Field(
+        default_factory=datetime.now,
+        description="When this experiment configuration was created"
+    )
+
+    updated_at: datetime = Field(
+        default_factory=datetime.now,
+        description="When this experiment configuration was last modified"
+    )
+
+    last_run_at: datetime | None = Field(
+        default=None,
+        description="When this experiment was last executed"
+    )
+
+    @field_validator("name")
+    @classmethod
+    def validate_name(cls, v: str) -> str:
+        """Validate experiment name follows conventions."""
+        if not v:
+            raise ValueError("Experiment name cannot be empty")
+
+        if not v.islower():
+            raise ValueError("Experiment name must be lowercase")
+
+        if " " in v:
+            raise ValueError("Experiment name cannot contain spaces (use hyphens)")
+
+        if not all(c.isalnum() or c == "-" for c in v):
+            raise ValueError("Experiment name can only contain lowercase letters, numbers, and hyphens")
+
+        return v
+
+    @field_validator("tags")
+    @classmethod
+    def validate_tags(cls, v: list[str]) -> list[str]:
+        """Ensure tags are lowercase and normalized."""
+        return [tag.lower().strip() for tag in v]
+
+    def get_experiment_dir(self, base_path: str = ".experiments") -> Path:
+        """Get the experiment directory path."""
+        return Path(base_path) / self.name
+
+    def get_config_path(self, base_path: str = ".experiments") -> Path:
+        """Get the path to experiment.yaml file."""
+        return self.get_experiment_dir(base_path) / "experiment.yaml"
+
+    def get_readme_path(self, base_path: str = ".experiments") -> Path:
+        """Get the path to README.md file."""
+        return self.get_experiment_dir(base_path) / "README.md"
+
+    def to_yaml(self) -> str:
+        """Export configuration as YAML string."""
+        import yaml
+        return yaml.dump(
+            self.model_dump(mode="json", exclude_none=True),
+            default_flow_style=False,
+            sort_keys=False
+        )
+
+    @classmethod
+    def from_yaml(cls, path: str | Path) -> "ExperimentConfig":
+        """Load configuration from YAML file."""
+        import yaml
+        with open(path) as f:
+            data = yaml.safe_load(f)
+        return cls(**data)
+
+    def save(self, base_path: str = ".experiments") -> Path:
+        """
+        Save experiment configuration to YAML file.
+
+        Creates directory structure if it doesn't exist.
+        Updates `updated_at` timestamp.
+
+        Returns:
+            Path to saved experiment.yaml file
+        """
+        self.updated_at = datetime.now()
+
+        config_path = self.get_config_path(base_path)
+        config_path.parent.mkdir(parents=True, exist_ok=True)
+
+        with open(config_path, "w") as f:
+            f.write(self.to_yaml())
+
+        return config_path
+
+    def generate_readme(self) -> str:
+        """
+        Generate README.md content for experiment.
+
+        Includes:
+        - Experiment description
+        - Schema references
+        - Dataset information
+        - How to run
+        - Results location
+        """
+        readme = f"""# {self.name}
+
+{self.description}
+
+## Configuration
+
+**Status**: `{self.status.value}`
+**Tags**: {', '.join(f'`{tag}`' for tag in self.tags) if self.tags else 'None'}
+
+## Agent Schema
+
+- **Name**: `{self.agent_schema_ref.name}`
+- **Version**: `{self.agent_schema_ref.version or 'latest'}`
+- **Type**: `{self.agent_schema_ref.type}`
+
+## Evaluator Schema
+
+- **Name**: `{self.evaluator_schema_ref.name}`
+- **Type**: `{self.evaluator_schema_ref.type}`
+
+## Datasets
+
+"""
+        for name, dataset in self.datasets.items():
+            readme += f"""### {name}
+
+- **Location**: `{dataset.location.value}`
+- **Path**: `{dataset.path}`
+- **Format**: `{dataset.format}`
+"""
+            if dataset.description:
+                readme += f"- **Description**: {dataset.description}\n"
+            readme += "\n"
+
+        readme += f"""## Results
+
+- **Location**: `{self.results.location.value}`
+- **Base Path**: `{self.results.base_path}`
+- **Save Traces**: `{self.results.save_traces}`
+- **Metrics File**: `{self.results.metrics_file}`
+
+## How to Run
+
+```bash
+# Run this experiment
+rem experiments run {self.name}
+
+# Run with specific version
+rem experiments run {self.name} --version experiments/{self.name}/v1.0.0
+```
+
+## Metadata
+
+"""
+        if self.metadata:
+            for key, value in self.metadata.items():
+                readme += f"- **{key}**: `{value}`\n"
+        else:
+            readme += "None\n"
+
+        readme += f"""
+## Timestamps
+
+- **Created**: {self.created_at.isoformat()}
+- **Updated**: {self.updated_at.isoformat()}
+"""
+        if self.last_run_at:
+            readme += f"- **Last Run**: {self.last_run_at.isoformat()}\n"
+
+        return readme
+
+    def save_readme(self, base_path: str = ".experiments") -> Path:
+        """Save auto-generated README.md file."""
+        readme_path = self.get_readme_path(base_path)
+        readme_path.parent.mkdir(parents=True, exist_ok=True)
+
+        with open(readme_path, "w") as f:
+            f.write(self.generate_readme())
+
+        return readme_path
+
+
+# Example configurations for reference
+EXAMPLE_SMALL_EXPERIMENT = ExperimentConfig(
+    name="hello-world-validation",
+    description="Smoke test for hello-world agent responses",
+    agent_schema_ref=SchemaReference(
+        name="hello-world",
+        version="schemas/hello-world/v1.0.0",
+        type="agent"
+    ),
+    evaluator_schema_ref=SchemaReference(
+        name="default",
+        type="evaluator"
+    ),
+    datasets={
+        "ground_truth": DatasetReference(
+            location=DatasetLocation.GIT,
+            path="datasets/ground_truth.csv",
+            format="csv",
+            description="10 manually curated test cases"
+        )
+    },
+    results=ResultsConfig(
+        location=DatasetLocation.GIT,
+        base_path="results/",
+        save_traces=False,
+        save_metrics_summary=True
+    ),
+    status=ExperimentStatus.READY,
+    tags=["validation", "smoke-test"]
+)

+EXAMPLE_LARGE_EXPERIMENT = ExperimentConfig(
+    name="cv-parser-production",
+    description="Production CV parser evaluation with 10K resumes",
+    agent_schema_ref=SchemaReference(
+        name="cv-parser",
+        version="schemas/cv-parser/v2.1.0",
+        type="agent"
+    ),
+    evaluator_schema_ref=SchemaReference(
+        name="default",
+        type="evaluator"
+    ),
+    datasets={
+        "ground_truth": DatasetReference(
+            location=DatasetLocation.S3,
+            path="s3://rem-prod/experiments/cv-parser-production/datasets/ground_truth.parquet",
+            format="parquet",
+            schema_path="datasets/schema.yaml",
+            description="10,000 CV/resume pairs with ground truth extractions"
+        )
+    },
+    results=ResultsConfig(
+        location=DatasetLocation.HYBRID,
+        base_path="s3://rem-prod/experiments/cv-parser-production/results/",
+        save_traces=True,
+        save_metrics_summary=True,
+        metrics_file="metrics.json"
+    ),
+    metadata={
+        "cost_per_run_usd": 5.25,
+        "expected_runtime_minutes": 45,
+        "team": "recruitment-ai",
+        "priority": "high"
+    },
+    status=ExperimentStatus.READY,
+    tags=["production", "cv-parser", "weekly"]
+)