remdb-0.2.6-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of remdb might be problematic.
- rem/__init__.py +2 -0
- rem/agentic/README.md +650 -0
- rem/agentic/__init__.py +39 -0
- rem/agentic/agents/README.md +155 -0
- rem/agentic/agents/__init__.py +8 -0
- rem/agentic/context.py +148 -0
- rem/agentic/context_builder.py +329 -0
- rem/agentic/mcp/__init__.py +0 -0
- rem/agentic/mcp/tool_wrapper.py +107 -0
- rem/agentic/otel/__init__.py +5 -0
- rem/agentic/otel/setup.py +151 -0
- rem/agentic/providers/phoenix.py +674 -0
- rem/agentic/providers/pydantic_ai.py +572 -0
- rem/agentic/query.py +117 -0
- rem/agentic/query_helper.py +89 -0
- rem/agentic/schema.py +396 -0
- rem/agentic/serialization.py +245 -0
- rem/agentic/tools/__init__.py +5 -0
- rem/agentic/tools/rem_tools.py +231 -0
- rem/api/README.md +420 -0
- rem/api/main.py +324 -0
- rem/api/mcp_router/prompts.py +182 -0
- rem/api/mcp_router/resources.py +536 -0
- rem/api/mcp_router/server.py +213 -0
- rem/api/mcp_router/tools.py +584 -0
- rem/api/routers/auth.py +229 -0
- rem/api/routers/chat/__init__.py +5 -0
- rem/api/routers/chat/completions.py +281 -0
- rem/api/routers/chat/json_utils.py +76 -0
- rem/api/routers/chat/models.py +124 -0
- rem/api/routers/chat/streaming.py +185 -0
- rem/auth/README.md +258 -0
- rem/auth/__init__.py +26 -0
- rem/auth/middleware.py +100 -0
- rem/auth/providers/__init__.py +13 -0
- rem/auth/providers/base.py +376 -0
- rem/auth/providers/google.py +163 -0
- rem/auth/providers/microsoft.py +237 -0
- rem/cli/README.md +455 -0
- rem/cli/__init__.py +8 -0
- rem/cli/commands/README.md +126 -0
- rem/cli/commands/__init__.py +3 -0
- rem/cli/commands/ask.py +565 -0
- rem/cli/commands/configure.py +423 -0
- rem/cli/commands/db.py +493 -0
- rem/cli/commands/dreaming.py +324 -0
- rem/cli/commands/experiments.py +1124 -0
- rem/cli/commands/mcp.py +66 -0
- rem/cli/commands/process.py +245 -0
- rem/cli/commands/schema.py +183 -0
- rem/cli/commands/serve.py +106 -0
- rem/cli/dreaming.py +363 -0
- rem/cli/main.py +88 -0
- rem/config.py +237 -0
- rem/mcp_server.py +41 -0
- rem/models/core/__init__.py +49 -0
- rem/models/core/core_model.py +64 -0
- rem/models/core/engram.py +333 -0
- rem/models/core/experiment.py +628 -0
- rem/models/core/inline_edge.py +132 -0
- rem/models/core/rem_query.py +243 -0
- rem/models/entities/__init__.py +43 -0
- rem/models/entities/file.py +57 -0
- rem/models/entities/image_resource.py +88 -0
- rem/models/entities/message.py +35 -0
- rem/models/entities/moment.py +123 -0
- rem/models/entities/ontology.py +191 -0
- rem/models/entities/ontology_config.py +131 -0
- rem/models/entities/resource.py +95 -0
- rem/models/entities/schema.py +87 -0
- rem/models/entities/user.py +85 -0
- rem/py.typed +0 -0
- rem/schemas/README.md +507 -0
- rem/schemas/__init__.py +6 -0
- rem/schemas/agents/README.md +92 -0
- rem/schemas/agents/core/moment-builder.yaml +178 -0
- rem/schemas/agents/core/rem-query-agent.yaml +226 -0
- rem/schemas/agents/core/resource-affinity-assessor.yaml +99 -0
- rem/schemas/agents/core/simple-assistant.yaml +19 -0
- rem/schemas/agents/core/user-profile-builder.yaml +163 -0
- rem/schemas/agents/examples/contract-analyzer.yaml +317 -0
- rem/schemas/agents/examples/contract-extractor.yaml +134 -0
- rem/schemas/agents/examples/cv-parser.yaml +263 -0
- rem/schemas/agents/examples/hello-world.yaml +37 -0
- rem/schemas/agents/examples/query.yaml +54 -0
- rem/schemas/agents/examples/simple.yaml +21 -0
- rem/schemas/agents/examples/test.yaml +29 -0
- rem/schemas/agents/rem.yaml +128 -0
- rem/schemas/evaluators/hello-world/default.yaml +77 -0
- rem/schemas/evaluators/rem/faithfulness.yaml +219 -0
- rem/schemas/evaluators/rem/lookup-correctness.yaml +182 -0
- rem/schemas/evaluators/rem/retrieval-precision.yaml +199 -0
- rem/schemas/evaluators/rem/retrieval-recall.yaml +211 -0
- rem/schemas/evaluators/rem/search-correctness.yaml +192 -0
- rem/services/__init__.py +16 -0
- rem/services/audio/INTEGRATION.md +308 -0
- rem/services/audio/README.md +376 -0
- rem/services/audio/__init__.py +15 -0
- rem/services/audio/chunker.py +354 -0
- rem/services/audio/transcriber.py +259 -0
- rem/services/content/README.md +1269 -0
- rem/services/content/__init__.py +5 -0
- rem/services/content/providers.py +806 -0
- rem/services/content/service.py +657 -0
- rem/services/dreaming/README.md +230 -0
- rem/services/dreaming/__init__.py +53 -0
- rem/services/dreaming/affinity_service.py +336 -0
- rem/services/dreaming/moment_service.py +264 -0
- rem/services/dreaming/ontology_service.py +54 -0
- rem/services/dreaming/user_model_service.py +297 -0
- rem/services/dreaming/utils.py +39 -0
- rem/services/embeddings/__init__.py +11 -0
- rem/services/embeddings/api.py +120 -0
- rem/services/embeddings/worker.py +421 -0
- rem/services/fs/README.md +662 -0
- rem/services/fs/__init__.py +62 -0
- rem/services/fs/examples.py +206 -0
- rem/services/fs/examples_paths.py +204 -0
- rem/services/fs/git_provider.py +935 -0
- rem/services/fs/local_provider.py +760 -0
- rem/services/fs/parsing-hooks-examples.md +172 -0
- rem/services/fs/paths.py +276 -0
- rem/services/fs/provider.py +460 -0
- rem/services/fs/s3_provider.py +1042 -0
- rem/services/fs/service.py +186 -0
- rem/services/git/README.md +1075 -0
- rem/services/git/__init__.py +17 -0
- rem/services/git/service.py +469 -0
- rem/services/phoenix/EXPERIMENT_DESIGN.md +1146 -0
- rem/services/phoenix/README.md +453 -0
- rem/services/phoenix/__init__.py +46 -0
- rem/services/phoenix/client.py +686 -0
- rem/services/phoenix/config.py +88 -0
- rem/services/phoenix/prompt_labels.py +477 -0
- rem/services/postgres/README.md +575 -0
- rem/services/postgres/__init__.py +23 -0
- rem/services/postgres/migration_service.py +427 -0
- rem/services/postgres/pydantic_to_sqlalchemy.py +232 -0
- rem/services/postgres/register_type.py +352 -0
- rem/services/postgres/repository.py +337 -0
- rem/services/postgres/schema_generator.py +379 -0
- rem/services/postgres/service.py +802 -0
- rem/services/postgres/sql_builder.py +354 -0
- rem/services/rem/README.md +304 -0
- rem/services/rem/__init__.py +23 -0
- rem/services/rem/exceptions.py +71 -0
- rem/services/rem/executor.py +293 -0
- rem/services/rem/parser.py +145 -0
- rem/services/rem/queries.py +196 -0
- rem/services/rem/query.py +371 -0
- rem/services/rem/service.py +527 -0
- rem/services/session/README.md +374 -0
- rem/services/session/__init__.py +6 -0
- rem/services/session/compression.py +360 -0
- rem/services/session/reload.py +77 -0
- rem/settings.py +1235 -0
- rem/sql/002_install_models.sql +1068 -0
- rem/sql/background_indexes.sql +42 -0
- rem/sql/install_models.sql +1038 -0
- rem/sql/migrations/001_install.sql +503 -0
- rem/sql/migrations/002_install_models.sql +1202 -0
- rem/utils/AGENTIC_CHUNKING.md +597 -0
- rem/utils/README.md +583 -0
- rem/utils/__init__.py +43 -0
- rem/utils/agentic_chunking.py +622 -0
- rem/utils/batch_ops.py +343 -0
- rem/utils/chunking.py +108 -0
- rem/utils/clip_embeddings.py +276 -0
- rem/utils/dict_utils.py +98 -0
- rem/utils/embeddings.py +423 -0
- rem/utils/examples/embeddings_example.py +305 -0
- rem/utils/examples/sql_types_example.py +202 -0
- rem/utils/markdown.py +16 -0
- rem/utils/model_helpers.py +236 -0
- rem/utils/schema_loader.py +229 -0
- rem/utils/sql_types.py +348 -0
- rem/utils/user_id.py +81 -0
- rem/utils/vision.py +330 -0
- rem/workers/README.md +506 -0
- rem/workers/__init__.py +5 -0
- rem/workers/dreaming.py +502 -0
- rem/workers/engram_processor.py +312 -0
- rem/workers/sqs_file_processor.py +193 -0
- remdb-0.2.6.dist-info/METADATA +1191 -0
- remdb-0.2.6.dist-info/RECORD +187 -0
- remdb-0.2.6.dist-info/WHEEL +4 -0
- remdb-0.2.6.dist-info/entry_points.txt +2 -0
rem/models/core/experiment.py
@@ -0,0 +1,628 @@
"""
Experiment configuration model for Phoenix evaluations.

This model defines the structure and conventions for REM experiments,
supporting hybrid storage between Git (configurations) and S3 (datasets/results).

**Storage Convention**:
- **Git**: `.experiments/{experiment-name}/` for configuration and metadata
- **S3**: `s3://bucket/experiments/{experiment-name}/` for datasets and results
- **Hybrid**: Git acts as an "overlay" - configurations reference S3 paths

**Directory Structure**:
```
.experiments/
└── {experiment-name}/
    ├── experiment.yaml          # This model (configuration)
    ├── README.md                # Experiment documentation
    └── results/                 # Optional: Git-tracked results (small datasets)
        ├── metrics.json
        └── traces/

s3://bucket/experiments/
└── {experiment-name}/
    ├── datasets/                # Source data (too large for Git)
    │   ├── ground_truth.csv
    │   └── test_cases.jsonl
    └── results/                 # Experiment outputs
        ├── run-2025-01-15/
        └── run-2025-01-16/
```

**Use Cases**:

1. **Small Experiments (Git-only)**:
   - Q&A validation with <100 examples
   - Manual test cases
   - Configuration-driven experiments
   - Store everything in `.experiments/{name}/`

2. **Large Experiments (Hybrid)**:
   - Thousands of test cases
   - Source data on S3, configs in Git
   - Results on S3, metrics in Git
   - `.experiments/{name}/experiment.yaml` references `s3://` paths

3. **Production Experiments (S3-primary)**:
   - Continuous evaluation pipelines
   - Large-scale A/B tests
   - Real-time dataset generation
   - Git stores only configuration, all data on S3

**Workflow**:

```bash
# 1. Create experiment scaffold
rem experiments create my-experiment \\
    --agent cv-parser \\
    --evaluator default \\
    --description "Test CV parsing accuracy"

# 2. Generated structure:
.experiments/my-experiment/
├── experiment.yaml      # Configuration (this model)
├── README.md            # Auto-generated documentation
└── datasets/            # Optional: small datasets
    └── ground_truth.csv

# 3. Run experiment
rem experiments run my-experiment

# 4. Commit configuration to Git
git add .experiments/my-experiment/
git commit -m "feat: Add CV parser experiment"
git tag -a experiments/my-experiment/v1.0.0 \\
    -m "my-experiment v1.0.0: Initial experiment"
```

**Version Tags**:
- Format: `experiments/{experiment-name}/vX.Y.Z`
- Example: `experiments/cv-parser-accuracy/v1.0.0`
- Allows tracking experiment configuration evolution
- GitProvider can load specific experiment versions

**Integration with Phoenix**:
```python
from rem.models.core.experiment import ExperimentConfig
from rem.services.phoenix import PhoenixClient

# Load experiment configuration
config = ExperimentConfig.from_yaml(".experiments/my-experiment/experiment.yaml")

# Run experiment
client = PhoenixClient()
results = client.run_experiment(
    name=config.name,
    agent_schema=config.agent_schema_ref,
    evaluator_schema=config.evaluator_schema_ref,
    dataset=config.load_dataset(),
    metadata=config.metadata
)

# Save results
config.save_results(results)
```
"""

from datetime import datetime
from enum import Enum
from pathlib import Path
from typing import Any, Literal

from pydantic import BaseModel, Field, field_validator


class DatasetLocation(str, Enum):
    """Where experiment datasets are stored."""
    GIT = "git"        # Small datasets in .experiments/{name}/datasets/
    S3 = "s3"          # Large datasets on S3
    HYBRID = "hybrid"  # Configuration in Git, data on S3


class ExperimentStatus(str, Enum):
    """Experiment lifecycle status."""
    DRAFT = "draft"          # Configuration being defined
    READY = "ready"          # Ready to run
    RUNNING = "running"      # Currently executing
    COMPLETED = "completed"  # Finished successfully
    FAILED = "failed"        # Execution failed
    ARCHIVED = "archived"    # Historical experiment


class DatasetReference(BaseModel):
    """Reference to a dataset (Git or S3)."""

    location: DatasetLocation = Field(
        description="Where the dataset is stored (git, s3, hybrid)"
    )

    path: str = Field(
        description=(
            "Path to dataset:\n"
            "- Git: Relative path from experiment root (e.g., 'datasets/ground_truth.csv')\n"
            "- S3: Full S3 URI (e.g., 's3://bucket/experiments/my-exp/datasets/ground_truth.csv')\n"
            "- Hybrid: S3 URI for data, Git path for schema"
        )
    )

    format: Literal["csv", "jsonl", "parquet", "json"] = Field(
        default="csv",
        description="Dataset file format"
    )

    schema_path: str | None = Field(
        default=None,
        description=(
            "Optional: Path to dataset schema definition (for hybrid mode).\n"
            "Useful for documenting expected columns/fields in Git."
        )
    )

    description: str | None = Field(
        default=None,
        description="Human-readable description of this dataset"
    )


class SchemaReference(BaseModel):
    """Reference to an agent or evaluator schema."""

    name: str = Field(
        description=(
            "Schema name (e.g., 'cv-parser', 'hello-world').\n"
            "Corresponds to schemas/agents/{name}.yaml or schemas/evaluators/{agent}/{name}.yaml"
        )
    )

    version: str | None = Field(
        default=None,
        description=(
            "Semantic version tag (e.g., 'schemas/cv-parser/v2.1.0').\n"
            "If None, uses latest version from main branch."
        )
    )

    type: Literal["agent", "evaluator"] = Field(
        description="Schema type (agent or evaluator)"
    )


class ResultsConfig(BaseModel):
    """Configuration for where experiment results are stored."""

    location: DatasetLocation = Field(
        description="Where to store results (git, s3, hybrid)"
    )

    base_path: str = Field(
        description=(
            "Base path for results storage:\n"
            "- Git: '.experiments/{experiment-name}/results/'\n"
            "- S3: 's3://bucket/experiments/{experiment-name}/results/'\n"
            "- Hybrid: Both (small metrics in Git, full traces on S3)"
        )
    )

    save_traces: bool = Field(
        default=True,
        description="Save full Phoenix traces (can be large)"
    )

    save_metrics_summary: bool = Field(
        default=True,
        description="Save metrics summary (small, suitable for Git)"
    )

    metrics_file: str = Field(
        default="metrics.json",
        description="Filename for metrics summary (stored in base_path)"
    )


class ExperimentConfig(BaseModel):
    """
    Complete experiment configuration for Phoenix evaluations.

    This model defines everything needed to run a reproducible experiment:
    - Agent and evaluator schemas (versioned via Git)
    - Dataset references (Git or S3)
    - Results storage configuration
    - Experiment metadata and documentation

    **Naming Convention**:
    - Experiment names: lowercase-with-hyphens (e.g., 'cv-parser-accuracy')
    - Directory: `.experiments/{experiment-name}/`
    - Config file: `.experiments/{experiment-name}/experiment.yaml`
    - Version tags: `experiments/{experiment-name}/vX.Y.Z`

    **Fields**:
    - `name`: Unique experiment identifier
    - `description`: Human-readable purpose
    - `agent_schema_ref`: Which agent to evaluate
    - `evaluator_schema_ref`: Which evaluator to use
    - `datasets`: Input datasets (ground truth, test cases)
    - `results`: Where to store outputs
    - `metadata`: Custom key-value pairs
    - `status`: Current lifecycle stage
    - `tags`: Organizational labels

    **Examples**:

    ```yaml
    # Small experiment (Git-only)
    name: hello-world-validation
    description: Validate hello-world agent responses
    agent_schema_ref:
      name: hello-world
      version: schemas/hello-world/v1.0.0
      type: agent
    evaluator_schema_ref:
      name: default
      type: evaluator
    datasets:
      ground_truth:
        location: git
        path: datasets/ground_truth.csv
        format: csv
    results:
      location: git
      base_path: results/
      save_traces: false
      save_metrics_summary: true
    status: ready
    tags: [validation, smoke-test]
    ```

    ```yaml
    # Large experiment (Hybrid)
    name: cv-parser-production
    description: Production CV parser evaluation with 10K resumes
    agent_schema_ref:
      name: cv-parser
      version: schemas/cv-parser/v2.1.0
      type: agent
    evaluator_schema_ref:
      name: default
      type: evaluator
    datasets:
      ground_truth:
        location: s3
        path: s3://rem-prod/experiments/cv-parser-production/datasets/ground_truth.parquet
        format: parquet
        schema_path: datasets/schema.yaml  # Schema in Git for documentation
      test_cases:
        location: s3
        path: s3://rem-prod/experiments/cv-parser-production/datasets/test_cases.jsonl
        format: jsonl
    results:
      location: hybrid
      base_path: s3://rem-prod/experiments/cv-parser-production/results/
      save_traces: true
      save_metrics_summary: true
      metrics_file: metrics.json  # Copied to Git after run
    metadata:
      cost_per_run_usd: 5.25
      expected_runtime_minutes: 45
      team: recruitment-ai
      priority: high
    status: ready
    tags: [production, cv-parser, weekly]
    ```
    """

    # Core identification
    name: str = Field(
        description=(
            "Unique experiment identifier (lowercase-with-hyphens).\n"
            "Used for directory name, tags, and references."
        )
    )

    description: str = Field(
        description="Human-readable description of experiment purpose and goals"
    )

    # Schema references
    agent_schema_ref: SchemaReference = Field(
        description=(
            "Reference to agent schema being evaluated.\n"
            "Supports versioning via Git tags (e.g., schemas/cv-parser/v2.1.0)"
        )
    )

    evaluator_schema_ref: SchemaReference = Field(
        description=(
            "Reference to evaluator schema for judging agent outputs.\n"
            "Can reference evaluators/{agent-name}/{evaluator-name}.yaml"
        )
    )

    # Dataset configuration
    datasets: dict[str, DatasetReference] = Field(
        description=(
            "Named datasets for this experiment.\n"
            "Common keys: 'ground_truth', 'test_cases', 'validation_set'\n"
            "Supports Git (small datasets), S3 (large datasets), or hybrid"
        )
    )

    # Results configuration
    results: ResultsConfig = Field(
        description=(
            "Configuration for experiment results storage.\n"
            "Supports Git (small results), S3 (large results), or hybrid"
        )
    )

    # Metadata and organization
    status: ExperimentStatus = Field(
        default=ExperimentStatus.DRAFT,
        description="Current experiment lifecycle status"
    )

    tags: list[str] = Field(
        default_factory=list,
        description=(
            "Tags for organizing experiments.\n"
            "Examples: ['production', 'cv-parser', 'weekly', 'regression']"
        )
    )

    metadata: dict[str, Any] = Field(
        default_factory=dict,
        description=(
            "Custom metadata key-value pairs.\n"
            "Examples: cost_per_run, expected_runtime, team, priority"
        )
    )

    # Timestamps (auto-managed)
    created_at: datetime = Field(
        default_factory=datetime.now,
        description="When this experiment configuration was created"
    )

    updated_at: datetime = Field(
        default_factory=datetime.now,
        description="When this experiment configuration was last modified"
    )

    last_run_at: datetime | None = Field(
        default=None,
        description="When this experiment was last executed"
    )

    @field_validator("name")
    @classmethod
    def validate_name(cls, v: str) -> str:
        """Validate experiment name follows conventions."""
        if not v:
            raise ValueError("Experiment name cannot be empty")

        if not v.islower():
            raise ValueError("Experiment name must be lowercase")

        if " " in v:
            raise ValueError("Experiment name cannot contain spaces (use hyphens)")

        if not all(c.isalnum() or c == "-" for c in v):
            raise ValueError("Experiment name can only contain lowercase letters, numbers, and hyphens")

        return v

    @field_validator("tags")
    @classmethod
    def validate_tags(cls, v: list[str]) -> list[str]:
        """Ensure tags are lowercase and normalized."""
        return [tag.lower().strip() for tag in v]

    def get_experiment_dir(self, base_path: str = ".experiments") -> Path:
        """Get the experiment directory path."""
        return Path(base_path) / self.name

    def get_config_path(self, base_path: str = ".experiments") -> Path:
        """Get the path to experiment.yaml file."""
        return self.get_experiment_dir(base_path) / "experiment.yaml"

    def get_readme_path(self, base_path: str = ".experiments") -> Path:
        """Get the path to README.md file."""
        return self.get_experiment_dir(base_path) / "README.md"

    def to_yaml(self) -> str:
        """Export configuration as YAML string."""
        import yaml
        return yaml.dump(
            self.model_dump(mode="json", exclude_none=True),
            default_flow_style=False,
            sort_keys=False
        )

    @classmethod
    def from_yaml(cls, path: str | Path) -> "ExperimentConfig":
        """Load configuration from YAML file."""
        import yaml
        with open(path) as f:
            data = yaml.safe_load(f)
        return cls(**data)

    def save(self, base_path: str = ".experiments") -> Path:
        """
        Save experiment configuration to YAML file.

        Creates directory structure if it doesn't exist.
        Updates `updated_at` timestamp.

        Returns:
            Path to saved experiment.yaml file
        """
        self.updated_at = datetime.now()

        config_path = self.get_config_path(base_path)
        config_path.parent.mkdir(parents=True, exist_ok=True)

        with open(config_path, "w") as f:
            f.write(self.to_yaml())

        return config_path

    def generate_readme(self) -> str:
        """
        Generate README.md content for experiment.

        Includes:
        - Experiment description
        - Schema references
        - Dataset information
        - How to run
        - Results location
        """
        readme = f"""# {self.name}

{self.description}

## Configuration

**Status**: `{self.status.value}`
**Tags**: {', '.join(f'`{tag}`' for tag in self.tags) if self.tags else 'None'}

## Agent Schema

- **Name**: `{self.agent_schema_ref.name}`
- **Version**: `{self.agent_schema_ref.version or 'latest'}`
- **Type**: `{self.agent_schema_ref.type}`

## Evaluator Schema

- **Name**: `{self.evaluator_schema_ref.name}`
- **Type**: `{self.evaluator_schema_ref.type}`

## Datasets

"""
        for name, dataset in self.datasets.items():
            readme += f"""### {name}

- **Location**: `{dataset.location.value}`
- **Path**: `{dataset.path}`
- **Format**: `{dataset.format}`
"""
            if dataset.description:
                readme += f"- **Description**: {dataset.description}\n"
            readme += "\n"

        readme += f"""## Results

- **Location**: `{self.results.location.value}`
- **Base Path**: `{self.results.base_path}`
- **Save Traces**: `{self.results.save_traces}`
- **Metrics File**: `{self.results.metrics_file}`

## How to Run

```bash
# Run this experiment
rem experiments run {self.name}

# Run with specific version
rem experiments run {self.name} --version experiments/{self.name}/v1.0.0
```

## Metadata

"""
        if self.metadata:
            for key, value in self.metadata.items():
                readme += f"- **{key}**: `{value}`\n"
        else:
            readme += "None\n"

        readme += f"""
## Timestamps

- **Created**: {self.created_at.isoformat()}
- **Updated**: {self.updated_at.isoformat()}
"""
        if self.last_run_at:
            readme += f"- **Last Run**: {self.last_run_at.isoformat()}\n"

        return readme

    def save_readme(self, base_path: str = ".experiments") -> Path:
        """Save auto-generated README.md file."""
        readme_path = self.get_readme_path(base_path)
        readme_path.parent.mkdir(parents=True, exist_ok=True)

        with open(readme_path, "w") as f:
            f.write(self.generate_readme())

        return readme_path


# Example configurations for reference
EXAMPLE_SMALL_EXPERIMENT = ExperimentConfig(
    name="hello-world-validation",
    description="Smoke test for hello-world agent responses",
    agent_schema_ref=SchemaReference(
        name="hello-world",
        version="schemas/hello-world/v1.0.0",
        type="agent"
    ),
    evaluator_schema_ref=SchemaReference(
        name="default",
        type="evaluator"
    ),
    datasets={
        "ground_truth": DatasetReference(
            location=DatasetLocation.GIT,
            path="datasets/ground_truth.csv",
            format="csv",
            description="10 manually curated test cases"
        )
    },
    results=ResultsConfig(
        location=DatasetLocation.GIT,
        base_path="results/",
        save_traces=False,
        save_metrics_summary=True
    ),
    status=ExperimentStatus.READY,
    tags=["validation", "smoke-test"]
)

EXAMPLE_LARGE_EXPERIMENT = ExperimentConfig(
    name="cv-parser-production",
    description="Production CV parser evaluation with 10K resumes",
    agent_schema_ref=SchemaReference(
        name="cv-parser",
        version="schemas/cv-parser/v2.1.0",
        type="agent"
    ),
    evaluator_schema_ref=SchemaReference(
        name="default",
        type="evaluator"
    ),
    datasets={
        "ground_truth": DatasetReference(
            location=DatasetLocation.S3,
            path="s3://rem-prod/experiments/cv-parser-production/datasets/ground_truth.parquet",
            format="parquet",
            schema_path="datasets/schema.yaml",
            description="10,000 CV/resume pairs with ground truth extractions"
        )
    },
    results=ResultsConfig(
        location=DatasetLocation.HYBRID,
        base_path="s3://rem-prod/experiments/cv-parser-production/results/",
        save_traces=True,
        save_metrics_summary=True,
        metrics_file="metrics.json"
    ),
    metadata={
        "cost_per_run_usd": 5.25,
        "expected_runtime_minutes": 45,
        "team": "recruitment-ai",
        "priority": "high"
    },
    status=ExperimentStatus.READY,
    tags=["production", "cv-parser", "weekly"]
)
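Taken together, `ExperimentConfig.save()`, `save_readme()`, and `from_yaml()` give a YAML round trip for experiment definitions. The following is a minimal usage sketch based only on the model shown in this diff; it assumes the package is installed along with its pydantic and PyYAML dependencies, mirrors the values of `EXAMPLE_SMALL_EXPERIMENT`, and writes files under `.experiments/` in the current working directory.

```python
from rem.models.core.experiment import (
    DatasetLocation,
    DatasetReference,
    ExperimentConfig,
    ExperimentStatus,
    ResultsConfig,
    SchemaReference,
)

# Build a small Git-only experiment (values mirror EXAMPLE_SMALL_EXPERIMENT above).
config = ExperimentConfig(
    name="hello-world-validation",
    description="Smoke test for hello-world agent responses",
    agent_schema_ref=SchemaReference(name="hello-world", type="agent"),
    evaluator_schema_ref=SchemaReference(name="default", type="evaluator"),
    datasets={
        "ground_truth": DatasetReference(
            location=DatasetLocation.GIT,
            path="datasets/ground_truth.csv",
            format="csv",
        )
    },
    results=ResultsConfig(location=DatasetLocation.GIT, base_path="results/"),
    status=ExperimentStatus.READY,
)

# Writes .experiments/hello-world-validation/experiment.yaml (updating updated_at)
# and an auto-generated README.md next to it.
config_path = config.save()
config.save_readme()

# Reload from disk; pydantic re-coerces enum and datetime fields from the YAML values.
reloaded = ExperimentConfig.from_yaml(config_path)
assert reloaded.name == config.name
```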