remdb 0.2.6__py3-none-any.whl → 0.3.118__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of remdb might be problematic. Click here for more details.
- rem/__init__.py +129 -2
- rem/agentic/README.md +76 -0
- rem/agentic/__init__.py +15 -0
- rem/agentic/agents/__init__.py +16 -2
- rem/agentic/agents/sse_simulator.py +500 -0
- rem/agentic/context.py +28 -22
- rem/agentic/llm_provider_models.py +301 -0
- rem/agentic/mcp/tool_wrapper.py +29 -3
- rem/agentic/otel/setup.py +92 -4
- rem/agentic/providers/phoenix.py +32 -43
- rem/agentic/providers/pydantic_ai.py +168 -24
- rem/agentic/schema.py +358 -21
- rem/agentic/tools/rem_tools.py +3 -3
- rem/api/README.md +238 -1
- rem/api/deps.py +255 -0
- rem/api/main.py +154 -37
- rem/api/mcp_router/resources.py +1 -1
- rem/api/mcp_router/server.py +26 -5
- rem/api/mcp_router/tools.py +454 -7
- rem/api/middleware/tracking.py +172 -0
- rem/api/routers/admin.py +494 -0
- rem/api/routers/auth.py +124 -0
- rem/api/routers/chat/completions.py +152 -16
- rem/api/routers/chat/models.py +7 -3
- rem/api/routers/chat/sse_events.py +526 -0
- rem/api/routers/chat/streaming.py +608 -45
- rem/api/routers/dev.py +81 -0
- rem/api/routers/feedback.py +148 -0
- rem/api/routers/messages.py +473 -0
- rem/api/routers/models.py +78 -0
- rem/api/routers/query.py +360 -0
- rem/api/routers/shared_sessions.py +406 -0
- rem/auth/middleware.py +126 -27
- rem/cli/commands/README.md +237 -64
- rem/cli/commands/ask.py +15 -11
- rem/cli/commands/cluster.py +1300 -0
- rem/cli/commands/configure.py +170 -97
- rem/cli/commands/db.py +396 -139
- rem/cli/commands/experiments.py +278 -96
- rem/cli/commands/process.py +22 -15
- rem/cli/commands/scaffold.py +47 -0
- rem/cli/commands/schema.py +97 -50
- rem/cli/main.py +37 -6
- rem/config.py +2 -2
- rem/models/core/core_model.py +7 -1
- rem/models/core/rem_query.py +5 -2
- rem/models/entities/__init__.py +21 -0
- rem/models/entities/domain_resource.py +38 -0
- rem/models/entities/feedback.py +123 -0
- rem/models/entities/message.py +30 -1
- rem/models/entities/session.py +83 -0
- rem/models/entities/shared_session.py +180 -0
- rem/models/entities/user.py +10 -3
- rem/registry.py +373 -0
- rem/schemas/agents/rem.yaml +7 -3
- rem/services/content/providers.py +94 -140
- rem/services/content/service.py +115 -24
- rem/services/dreaming/affinity_service.py +2 -16
- rem/services/dreaming/moment_service.py +2 -15
- rem/services/embeddings/api.py +24 -17
- rem/services/embeddings/worker.py +16 -16
- rem/services/phoenix/EXPERIMENT_DESIGN.md +3 -3
- rem/services/phoenix/client.py +252 -19
- rem/services/postgres/README.md +159 -15
- rem/services/postgres/__init__.py +2 -1
- rem/services/postgres/diff_service.py +531 -0
- rem/services/postgres/pydantic_to_sqlalchemy.py +427 -129
- rem/services/postgres/repository.py +132 -0
- rem/services/postgres/schema_generator.py +291 -9
- rem/services/postgres/service.py +6 -6
- rem/services/rate_limit.py +113 -0
- rem/services/rem/README.md +14 -0
- rem/services/rem/parser.py +44 -9
- rem/services/rem/service.py +36 -2
- rem/services/session/compression.py +17 -1
- rem/services/session/reload.py +1 -1
- rem/services/user_service.py +98 -0
- rem/settings.py +169 -22
- rem/sql/background_indexes.sql +21 -16
- rem/sql/migrations/001_install.sql +387 -54
- rem/sql/migrations/002_install_models.sql +2320 -393
- rem/sql/migrations/003_optional_extensions.sql +326 -0
- rem/sql/migrations/004_cache_system.sql +548 -0
- rem/utils/__init__.py +18 -0
- rem/utils/constants.py +97 -0
- rem/utils/date_utils.py +228 -0
- rem/utils/embeddings.py +17 -4
- rem/utils/files.py +167 -0
- rem/utils/mime_types.py +158 -0
- rem/utils/model_helpers.py +156 -1
- rem/utils/schema_loader.py +284 -21
- rem/utils/sql_paths.py +146 -0
- rem/utils/sql_types.py +3 -1
- rem/utils/vision.py +9 -14
- rem/workers/README.md +14 -14
- rem/workers/__init__.py +2 -1
- rem/workers/db_maintainer.py +74 -0
- rem/workers/unlogged_maintainer.py +463 -0
- {remdb-0.2.6.dist-info → remdb-0.3.118.dist-info}/METADATA +598 -171
- {remdb-0.2.6.dist-info → remdb-0.3.118.dist-info}/RECORD +102 -73
- {remdb-0.2.6.dist-info → remdb-0.3.118.dist-info}/WHEEL +1 -1
- rem/sql/002_install_models.sql +0 -1068
- rem/sql/install_models.sql +0 -1038
- {remdb-0.2.6.dist-info → remdb-0.3.118.dist-info}/entry_points.txt +0 -0
rem/cli/commands/experiments.py
CHANGED
|
@@ -5,11 +5,21 @@ Experiments use ExperimentConfig (rem/models/core/experiment.py) for configurati
|
|
|
5
5
|
and support Git+S3 hybrid storage. Includes dataset, prompt, and trace management.
|
|
6
6
|
|
|
7
7
|
Directory Structure:
|
|
8
|
-
|
|
9
|
-
├── experiment.yaml # ExperimentConfig
|
|
10
|
-
├── README.md # Auto-generated
|
|
11
|
-
├──
|
|
12
|
-
|
|
8
|
+
experiments/{experiment-name}/
|
|
9
|
+
├── experiment.yaml # ExperimentConfig (metadata, agent ref, evaluator ref)
|
|
10
|
+
├── README.md # Auto-generated documentation
|
|
11
|
+
├── ground-truth/ # Evaluation datasets (Q&A pairs)
|
|
12
|
+
│ ├── dataset.csv # Input/output pairs for evaluation
|
|
13
|
+
│ └── dataset.yaml # Alternative YAML format
|
|
14
|
+
├── seed-data/ # Data to seed REM before running experiments
|
|
15
|
+
│ └── data.yaml # Users, resources, moments in REM format
|
|
16
|
+
└── results/ # Experiment results and metrics
|
|
17
|
+
└── {run-timestamp}/ # Each run gets its own timestamped folder
|
|
18
|
+
├── metrics.json # Summary metrics
|
|
19
|
+
└── run_info.json # Run metadata (eval framework URLs, etc)
|
|
20
|
+
|
|
21
|
+
Environment Variables:
|
|
22
|
+
EXPERIMENTS_HOME: Override default experiment directory (default: "experiments")
|
|
13
23
|
|
|
14
24
|
Commands:
|
|
15
25
|
# Experiment lifecycle
|
|
@@ -60,7 +70,7 @@ def experiments():
|
|
|
60
70
|
@click.option("--results-location", type=click.Choice(["git", "s3", "hybrid"]), default="git",
|
|
61
71
|
help="Where to store results")
|
|
62
72
|
@click.option("--tags", help="Comma-separated tags (e.g., 'production,cv-parser')")
|
|
63
|
-
@click.option("--base-path",
|
|
73
|
+
@click.option("--base-path", help="Base directory for experiments (default: EXPERIMENTS_HOME or 'experiments')")
|
|
64
74
|
def create(
|
|
65
75
|
name: str,
|
|
66
76
|
agent: str,
|
|
@@ -69,12 +79,17 @@ def create(
|
|
|
69
79
|
dataset_location: str,
|
|
70
80
|
results_location: str,
|
|
71
81
|
tags: Optional[str],
|
|
72
|
-
base_path: str,
|
|
82
|
+
base_path: Optional[str],
|
|
73
83
|
):
|
|
74
84
|
"""Create a new experiment configuration.
|
|
75
85
|
|
|
76
86
|
Creates directory structure and generates experiment.yaml and README.md.
|
|
77
87
|
|
|
88
|
+
The experiment directory will contain:
|
|
89
|
+
- ground-truth/: Q&A pairs for evaluation
|
|
90
|
+
- seed-data/: REM data (users, resources, moments) to load before running
|
|
91
|
+
- results/: Timestamped run results
|
|
92
|
+
|
|
78
93
|
Examples:
|
|
79
94
|
# Small experiment (Git-only)
|
|
80
95
|
rem experiments create hello-world-validation \\
|
|
@@ -90,6 +105,9 @@ def create(
|
|
|
90
105
|
--dataset-location s3 \\
|
|
91
106
|
--results-location hybrid \\
|
|
92
107
|
--tags "production,cv-parser,weekly"
|
|
108
|
+
|
|
109
|
+
# Custom location
|
|
110
|
+
EXPERIMENTS_HOME=/path/to/experiments rem experiments create my-test --agent my-agent
|
|
93
111
|
"""
|
|
94
112
|
from rem.models.core.experiment import (
|
|
95
113
|
ExperimentConfig,
|
|
@@ -99,15 +117,19 @@ def create(
|
|
|
99
117
|
ResultsConfig,
|
|
100
118
|
ExperimentStatus,
|
|
101
119
|
)
|
|
120
|
+
import os
|
|
102
121
|
|
|
103
122
|
try:
|
|
123
|
+
# Resolve base path: CLI arg > EXPERIMENTS_HOME env var > default "experiments"
|
|
124
|
+
if base_path is None:
|
|
125
|
+
base_path = os.getenv("EXPERIMENTS_HOME", "experiments")
|
|
104
126
|
# Build dataset reference
|
|
105
127
|
if dataset_location == "git":
|
|
106
128
|
dataset_ref = DatasetReference(
|
|
107
129
|
location=DatasetLocation.GIT,
|
|
108
|
-
path="
|
|
130
|
+
path="ground-truth/dataset.csv",
|
|
109
131
|
format="csv",
|
|
110
|
-
description="Ground truth dataset for evaluation"
|
|
132
|
+
description="Ground truth Q&A dataset for evaluation"
|
|
111
133
|
)
|
|
112
134
|
else: # s3 or hybrid
|
|
113
135
|
dataset_ref = DatasetReference(
|
|
@@ -168,26 +190,167 @@ def create(
|
|
|
168
190
|
config_path = config.save(base_path)
|
|
169
191
|
readme_path = config.save_readme(base_path)
|
|
170
192
|
|
|
171
|
-
# Create
|
|
172
|
-
|
|
173
|
-
|
|
193
|
+
# Create new directory structure
|
|
194
|
+
exp_dir = config.get_experiment_dir(base_path)
|
|
195
|
+
|
|
196
|
+
# Create ground-truth directory
|
|
197
|
+
ground_truth_dir = exp_dir / "ground-truth"
|
|
198
|
+
ground_truth_dir.mkdir(parents=True, exist_ok=True)
|
|
199
|
+
|
|
200
|
+
# Create seed-data directory
|
|
201
|
+
seed_data_dir = exp_dir / "seed-data"
|
|
202
|
+
seed_data_dir.mkdir(parents=True, exist_ok=True)
|
|
174
203
|
|
|
175
204
|
# Create results directory if Git-based
|
|
176
205
|
if results_location == "git":
|
|
177
|
-
results_dir =
|
|
206
|
+
results_dir = exp_dir / "results"
|
|
178
207
|
results_dir.mkdir(parents=True, exist_ok=True)
|
|
179
208
|
|
|
209
|
+
# Create placeholder files with documentation
|
|
210
|
+
ground_truth_readme = ground_truth_dir / "README.md"
|
|
211
|
+
ground_truth_readme.write_text("""# Ground Truth Dataset
|
|
212
|
+
|
|
213
|
+
This directory contains Q&A pairs for evaluating the agent.
|
|
214
|
+
|
|
215
|
+
## Format
|
|
216
|
+
|
|
217
|
+
**CSV format** (`dataset.csv`):
|
|
218
|
+
```csv
|
|
219
|
+
input,expected_output,metadata
|
|
220
|
+
"What is the capital of France?","Paris","{\"difficulty\": \"easy\"}"
|
|
221
|
+
```
|
|
222
|
+
|
|
223
|
+
**YAML format** (`dataset.yaml`):
|
|
224
|
+
```yaml
|
|
225
|
+
- input: "What is the capital of France?"
|
|
226
|
+
expected_output: "Paris"
|
|
227
|
+
metadata:
|
|
228
|
+
difficulty: easy
|
|
229
|
+
```
|
|
230
|
+
|
|
231
|
+
## Generating Ground Truth
|
|
232
|
+
|
|
233
|
+
### Using AI Assistants
|
|
234
|
+
|
|
235
|
+
AI coding assistants (like Claude, GPT-4, etc.) can help generate comprehensive ground-truth datasets:
|
|
236
|
+
|
|
237
|
+
1. **Generate from existing examples**: Show the assistant examples from your domain and ask it to create similar Q&A pairs
|
|
238
|
+
2. **Create challenging questions**: Ask the assistant to act as a judge and generate HARD questions that test edge cases
|
|
239
|
+
3. **Vary difficulty levels**: Request a mix of easy, medium, and hard questions with appropriate metadata tags
|
|
240
|
+
|
|
241
|
+
Example prompt:
|
|
242
|
+
```
|
|
243
|
+
Based on these example documents about [your domain], generate 20 Q&A pairs
|
|
244
|
+
for evaluating an agent. Include:
|
|
245
|
+
- 5 easy factual questions
|
|
246
|
+
- 10 medium questions requiring reasoning
|
|
247
|
+
- 5 hard questions with edge cases
|
|
248
|
+
Format as CSV with difficulty and category metadata.
|
|
249
|
+
```
|
|
250
|
+
|
|
251
|
+
### Ground Truth as Judge
|
|
252
|
+
|
|
253
|
+
**Important**: Keep ground-truth data **separate** from the agent being tested:
|
|
254
|
+
- Ground truth should be hidden from the agent during evaluation
|
|
255
|
+
- The agent should only see the `input` field
|
|
256
|
+
- The evaluator compares agent output against `expected_output`
|
|
257
|
+
- This ensures unbiased evaluation
|
|
258
|
+
|
|
259
|
+
### Quality Guidelines
|
|
260
|
+
|
|
261
|
+
1. **Diverse Coverage**: Include various question types and difficulty levels
|
|
262
|
+
2. **Domain-Specific**: Use terminology and scenarios from your actual use case
|
|
263
|
+
3. **Metadata Tags**: Add difficulty, category, priority for analysis
|
|
264
|
+
4. **SME Review**: Have domain experts validate expected outputs
|
|
265
|
+
|
|
266
|
+
## Usage
|
|
267
|
+
|
|
268
|
+
These datasets can be:
|
|
269
|
+
- Loaded into evaluation frameworks (Arize Phoenix, etc.)
|
|
270
|
+
- Used for regression testing
|
|
271
|
+
- Converted to different formats as needed
|
|
272
|
+
|
|
273
|
+
The experiment runner will automatically use this data for evaluation.
|
|
274
|
+
""")
|
|
275
|
+
|
|
276
|
+
seed_data_readme = seed_data_dir / "README.md"
|
|
277
|
+
seed_data_readme.write_text("""# Seed Data
|
|
278
|
+
|
|
279
|
+
This directory contains REM data to load before running the experiment.
|
|
280
|
+
|
|
281
|
+
## Format
|
|
282
|
+
|
|
283
|
+
Use standard REM YAML format:
|
|
284
|
+
|
|
285
|
+
```yaml
|
|
286
|
+
users:
|
|
287
|
+
- id: test-user-001
|
|
288
|
+
user_id: experiment-test
|
|
289
|
+
email: test@example.com
|
|
290
|
+
|
|
291
|
+
resources:
|
|
292
|
+
- id: resource-001
|
|
293
|
+
user_id: experiment-test
|
|
294
|
+
label: example-document
|
|
295
|
+
content: "Document content here..."
|
|
296
|
+
|
|
297
|
+
moments:
|
|
298
|
+
- id: moment-001
|
|
299
|
+
user_id: experiment-test
|
|
300
|
+
label: example-meeting
|
|
301
|
+
starts_timestamp: "2024-01-15T14:00:00"
|
|
302
|
+
```
|
|
303
|
+
|
|
304
|
+
## Generating Seed Data
|
|
305
|
+
|
|
306
|
+
### Using AI Assistants
|
|
307
|
+
|
|
308
|
+
AI coding assistants can help generate realistic seed data for your experiments:
|
|
309
|
+
|
|
310
|
+
1. **From existing datasets**: Reference examples from the `datasets/` directory
|
|
311
|
+
2. **Domain-specific scenarios**: Describe your use case and ask for appropriate test data
|
|
312
|
+
3. **Anonymized versions**: Ask to create fictional data based on real patterns
|
|
313
|
+
|
|
314
|
+
Example prompt:
|
|
315
|
+
```
|
|
316
|
+
Based on the recruitment dataset examples in datasets/domains/recruitment/,
|
|
317
|
+
generate seed data for testing a CV parser agent. Include:
|
|
318
|
+
- 3 test users
|
|
319
|
+
- 5 CV documents (resources) with varied experience levels
|
|
320
|
+
- 2 interview moment entries
|
|
321
|
+
Use fictional names and anonymize all content.
|
|
322
|
+
```
|
|
323
|
+
|
|
324
|
+
### Best Practices
|
|
325
|
+
|
|
326
|
+
1. **Minimal**: Only include data necessary for the ground-truth questions to be answerable
|
|
327
|
+
2. **Anonymized**: Always use fictional names, companies, and content
|
|
328
|
+
3. **Relevant**: Seed data should provide context for evaluation questions
|
|
329
|
+
4. **Versioned**: Track changes to seed data in Git for reproducibility
|
|
330
|
+
|
|
331
|
+
## Usage
|
|
332
|
+
|
|
333
|
+
Load this data before running experiments:
|
|
334
|
+
```bash
|
|
335
|
+
rem db load --file seed-data/data.yaml --user-id experiment-test
|
|
336
|
+
```
|
|
337
|
+
|
|
338
|
+
This ensures your agent has the necessary context for evaluation.
|
|
339
|
+
""")
|
|
340
|
+
|
|
180
341
|
click.echo(f"\n✓ Created experiment: {name}")
|
|
181
342
|
click.echo(f" Configuration: {config_path}")
|
|
182
343
|
click.echo(f" Documentation: {readme_path}")
|
|
183
|
-
click.echo(f"
|
|
344
|
+
click.echo(f" Ground Truth: {ground_truth_dir}")
|
|
345
|
+
click.echo(f" Seed Data: {seed_data_dir}")
|
|
184
346
|
if results_location == "git":
|
|
185
347
|
click.echo(f" Results: {results_dir}")
|
|
186
348
|
click.echo(f"\nNext steps:")
|
|
187
|
-
click.echo(f" 1. Add
|
|
188
|
-
click.echo(f" 2.
|
|
189
|
-
click.echo(f" 3.
|
|
190
|
-
click.echo(f" 4.
|
|
349
|
+
click.echo(f" 1. Add ground truth Q&A to {ground_truth_dir}/dataset.csv")
|
|
350
|
+
click.echo(f" 2. Add seed data to {seed_data_dir}/data.yaml (optional)")
|
|
351
|
+
click.echo(f" 3. Review configuration: {config_path}")
|
|
352
|
+
click.echo(f" 4. Run experiment: rem experiments run {name}")
|
|
353
|
+
click.echo(f" 5. Commit to Git: git add {base_path}/{name}/ && git commit")
|
|
191
354
|
|
|
192
355
|
except Exception as e:
|
|
193
356
|
logger.error(f"Failed to create experiment: {e}")
|
|
@@ -201,11 +364,11 @@ def create(
|
|
|
201
364
|
|
|
202
365
|
|
|
203
366
|
@experiments.command("list")
|
|
204
|
-
@click.option("--base-path",
|
|
367
|
+
@click.option("--base-path", help="Base directory for experiments (default: EXPERIMENTS_HOME or 'experiments')")
|
|
205
368
|
@click.option("--status", help="Filter by status (draft, ready, completed, etc.)")
|
|
206
369
|
@click.option("--tags", help="Filter by tags (comma-separated)")
|
|
207
370
|
def list_experiments(
|
|
208
|
-
base_path: str,
|
|
371
|
+
base_path: Optional[str],
|
|
209
372
|
status: Optional[str],
|
|
210
373
|
tags: Optional[str],
|
|
211
374
|
):
|
|
@@ -217,8 +380,13 @@ def list_experiments(
|
|
|
217
380
|
rem experiments list --tags production,cv-parser
|
|
218
381
|
"""
|
|
219
382
|
from rem.models.core.experiment import ExperimentConfig, ExperimentStatus
|
|
383
|
+
import os
|
|
220
384
|
|
|
221
385
|
try:
|
|
386
|
+
# Resolve base path
|
|
387
|
+
if base_path is None:
|
|
388
|
+
base_path = os.getenv("EXPERIMENTS_HOME", "experiments")
|
|
389
|
+
|
|
222
390
|
experiments_dir = Path(base_path)
|
|
223
391
|
if not experiments_dir.exists():
|
|
224
392
|
click.echo(f"No experiments directory found at {base_path}")
|
|
@@ -279,16 +447,21 @@ def list_experiments(
|
|
|
279
447
|
|
|
280
448
|
@experiments.command("show")
|
|
281
449
|
@click.argument("name")
|
|
282
|
-
@click.option("--base-path",
|
|
283
|
-
def show(name: str, base_path: str):
|
|
450
|
+
@click.option("--base-path", help="Base directory for experiments (default: EXPERIMENTS_HOME or 'experiments')")
|
|
451
|
+
def show(name: str, base_path: Optional[str]):
|
|
284
452
|
"""Show experiment details.
|
|
285
453
|
|
|
286
454
|
Examples:
|
|
287
455
|
rem experiments show hello-world-validation
|
|
288
456
|
"""
|
|
289
457
|
from rem.models.core.experiment import ExperimentConfig
|
|
458
|
+
import os
|
|
290
459
|
|
|
291
460
|
try:
|
|
461
|
+
# Resolve base path
|
|
462
|
+
if base_path is None:
|
|
463
|
+
base_path = os.getenv("EXPERIMENTS_HOME", "experiments")
|
|
464
|
+
|
|
292
465
|
config_path = Path(base_path) / name / "experiment.yaml"
|
|
293
466
|
if not config_path.exists():
|
|
294
467
|
click.echo(f"Experiment not found: {name}")
|
|
@@ -348,7 +521,7 @@ def show(name: str, base_path: str):
|
|
|
348
521
|
|
|
349
522
|
@experiments.command("run")
|
|
350
523
|
@click.argument("name")
|
|
351
|
-
@click.option("--base-path",
|
|
524
|
+
@click.option("--base-path", help="Base directory for experiments (default: EXPERIMENTS_HOME or 'experiments')")
|
|
352
525
|
@click.option("--version", help="Git tag version to load (e.g., 'experiments/my-exp/v1.0.0')")
|
|
353
526
|
@click.option("--dry-run", is_flag=True, help="Test on small subset without saving")
|
|
354
527
|
@click.option("--update-prompts", is_flag=True, help="Update prompts in Phoenix before running")
|
|
@@ -356,7 +529,7 @@ def show(name: str, base_path: str):
|
|
|
356
529
|
@click.option("--phoenix-api-key", help="Phoenix API key (overrides PHOENIX_API_KEY env var)")
|
|
357
530
|
def run(
|
|
358
531
|
name: str,
|
|
359
|
-
base_path: str,
|
|
532
|
+
base_path: Optional[str],
|
|
360
533
|
version: Optional[str],
|
|
361
534
|
dry_run: bool,
|
|
362
535
|
update_prompts: bool,
|
|
@@ -405,10 +578,14 @@ def run(
|
|
|
405
578
|
from rem.services.git import GitService
|
|
406
579
|
from rem.services.phoenix import PhoenixClient
|
|
407
580
|
from rem.agentic.providers.phoenix import create_evaluator_from_schema
|
|
408
|
-
from
|
|
409
|
-
import
|
|
581
|
+
from rem.utils.date_utils import utc_now, to_iso, format_timestamp_for_experiment
|
|
582
|
+
import os
|
|
410
583
|
|
|
411
584
|
try:
|
|
585
|
+
# Resolve base path
|
|
586
|
+
if base_path is None:
|
|
587
|
+
base_path = os.getenv("EXPERIMENTS_HOME", "experiments")
|
|
588
|
+
|
|
412
589
|
# Load experiment configuration
|
|
413
590
|
if version:
|
|
414
591
|
# Load from Git at specific version
|
|
@@ -437,36 +614,22 @@ def run(
|
|
|
437
614
|
click.echo(f" Mode: DRY RUN (no data will be saved)")
|
|
438
615
|
click.echo()
|
|
439
616
|
|
|
440
|
-
# Load agent schema
|
|
617
|
+
# Load agent schema using centralized schema loader
|
|
441
618
|
agent_name = config.agent_schema_ref.name
|
|
442
619
|
agent_version = config.agent_schema_ref.version
|
|
443
620
|
|
|
444
621
|
click.echo(f"Loading agent schema: {agent_name} (version: {agent_version or 'latest'})")
|
|
445
622
|
|
|
446
|
-
|
|
447
|
-
agent_schema = None
|
|
448
|
-
try:
|
|
449
|
-
git_svc = GitService()
|
|
450
|
-
agent_schema = git_svc.load_schema(agent_name, version=agent_version)
|
|
451
|
-
click.echo(f"✓ Loaded agent schema from Git")
|
|
452
|
-
except Exception as e:
|
|
453
|
-
logger.debug(f"Git not available, trying filesystem: {e}")
|
|
454
|
-
|
|
455
|
-
# Fallback to local filesystem
|
|
456
|
-
from rem.services.fs import FS
|
|
457
|
-
fs = FS()
|
|
623
|
+
from rem.utils.schema_loader import load_agent_schema
|
|
458
624
|
|
|
459
|
-
|
|
460
|
-
|
|
461
|
-
|
|
462
|
-
|
|
463
|
-
|
|
464
|
-
|
|
465
|
-
|
|
466
|
-
|
|
467
|
-
click.echo(f" Tried filesystem: {schema_path}")
|
|
468
|
-
click.echo(f" Make sure the schema exists")
|
|
469
|
-
raise click.Abort()
|
|
625
|
+
try:
|
|
626
|
+
agent_schema = load_agent_schema(agent_name)
|
|
627
|
+
click.echo(f"✓ Loaded agent schema: {agent_name}")
|
|
628
|
+
except FileNotFoundError as e:
|
|
629
|
+
logger.error(f"Failed to load agent schema: {e}")
|
|
630
|
+
click.echo(f"Error: Could not load agent schema '{agent_name}'")
|
|
631
|
+
click.echo(f" {e}")
|
|
632
|
+
raise click.Abort()
|
|
470
633
|
|
|
471
634
|
# Create agent function from schema
|
|
472
635
|
from rem.agentic.providers.pydantic_ai import create_agent
|
|
@@ -505,73 +668,85 @@ def run(
|
|
|
505
668
|
return {"output": serialized}
|
|
506
669
|
return serialized if isinstance(serialized, dict) else {"output": str(serialized)}
|
|
507
670
|
|
|
508
|
-
# Load evaluator schema
|
|
671
|
+
# Load evaluator schema using centralized schema loader
|
|
509
672
|
evaluator_name = config.evaluator_schema_ref.name
|
|
510
673
|
evaluator_version = config.evaluator_schema_ref.version
|
|
511
674
|
|
|
512
|
-
# Resolve evaluator path (evaluators are organized by agent name)
|
|
513
|
-
evaluator_schema_path = f"rem/schemas/evaluators/{agent_name}/{evaluator_name}.yaml"
|
|
514
|
-
|
|
515
675
|
click.echo(f"Loading evaluator: {evaluator_name} for agent {agent_name}")
|
|
516
676
|
|
|
517
|
-
|
|
518
|
-
|
|
519
|
-
|
|
520
|
-
|
|
521
|
-
)
|
|
522
|
-
|
|
523
|
-
|
|
524
|
-
|
|
525
|
-
|
|
526
|
-
|
|
527
|
-
|
|
677
|
+
# Try multiple evaluator path patterns (agent-specific, then generic)
|
|
678
|
+
evaluator_paths_to_try = [
|
|
679
|
+
f"{agent_name}/{evaluator_name}", # e.g., hello-world/default
|
|
680
|
+
f"{agent_name}-{evaluator_name}", # e.g., hello-world-default
|
|
681
|
+
evaluator_name, # e.g., default (generic)
|
|
682
|
+
]
|
|
683
|
+
|
|
684
|
+
evaluator_fn = None
|
|
685
|
+
evaluator_load_error = None
|
|
686
|
+
|
|
687
|
+
for evaluator_path in evaluator_paths_to_try:
|
|
688
|
+
try:
|
|
689
|
+
evaluator_fn = create_evaluator_from_schema(
|
|
690
|
+
evaluator_schema_path=evaluator_path,
|
|
691
|
+
model_name=None, # Use default from schema
|
|
692
|
+
)
|
|
693
|
+
click.echo(f"✓ Loaded evaluator schema: {evaluator_path}")
|
|
694
|
+
break
|
|
695
|
+
except FileNotFoundError as e:
|
|
696
|
+
evaluator_load_error = e
|
|
697
|
+
logger.debug(f"Evaluator not found at {evaluator_path}: {e}")
|
|
698
|
+
continue
|
|
699
|
+
except Exception as e:
|
|
700
|
+
evaluator_load_error = e
|
|
701
|
+
logger.warning(f"Failed to load evaluator from {evaluator_path}: {e}")
|
|
702
|
+
continue
|
|
703
|
+
|
|
704
|
+
if evaluator_fn is None:
|
|
705
|
+
click.echo(f"Error: Could not load evaluator schema '{evaluator_name}'")
|
|
706
|
+
click.echo(f" Tried paths: {evaluator_paths_to_try}")
|
|
707
|
+
if evaluator_load_error:
|
|
708
|
+
click.echo(f" Last error: {evaluator_load_error}")
|
|
528
709
|
raise click.Abort()
|
|
529
710
|
|
|
530
|
-
# Load dataset
|
|
711
|
+
# Load dataset using Polars
|
|
712
|
+
import polars as pl
|
|
713
|
+
|
|
531
714
|
click.echo(f"Loading dataset: {list(config.datasets.keys())[0]}")
|
|
532
715
|
dataset_ref = list(config.datasets.values())[0]
|
|
533
716
|
|
|
534
717
|
if dataset_ref.location.value == "git":
|
|
535
|
-
# Load from Git
|
|
718
|
+
# Load from Git (local filesystem)
|
|
536
719
|
dataset_path = Path(base_path) / name / dataset_ref.path
|
|
537
720
|
if not dataset_path.exists():
|
|
538
721
|
click.echo(f"Error: Dataset not found: {dataset_path}")
|
|
539
722
|
raise click.Abort()
|
|
540
723
|
|
|
541
724
|
if dataset_ref.format == "csv":
|
|
542
|
-
dataset_df =
|
|
725
|
+
dataset_df = pl.read_csv(dataset_path)
|
|
543
726
|
elif dataset_ref.format == "parquet":
|
|
544
|
-
dataset_df =
|
|
727
|
+
dataset_df = pl.read_parquet(dataset_path)
|
|
545
728
|
elif dataset_ref.format == "jsonl":
|
|
546
|
-
dataset_df =
|
|
729
|
+
dataset_df = pl.read_ndjson(dataset_path)
|
|
547
730
|
else:
|
|
548
731
|
click.echo(f"Error: Format '{dataset_ref.format}' not yet supported")
|
|
549
732
|
raise click.Abort()
|
|
550
733
|
elif dataset_ref.location.value in ["s3", "hybrid"]:
|
|
551
734
|
# Load from S3 using FS provider
|
|
552
735
|
from rem.services.fs import FS
|
|
736
|
+
from io import BytesIO
|
|
553
737
|
|
|
554
738
|
fs = FS()
|
|
555
739
|
|
|
556
740
|
try:
|
|
557
741
|
if dataset_ref.format == "csv":
|
|
558
742
|
content = fs.read(dataset_ref.path)
|
|
559
|
-
|
|
560
|
-
dataset_df = pd.read_csv(StringIO(content))
|
|
743
|
+
dataset_df = pl.read_csv(BytesIO(content.encode() if isinstance(content, str) else content))
|
|
561
744
|
elif dataset_ref.format == "parquet":
|
|
562
|
-
|
|
563
|
-
|
|
564
|
-
with tempfile.NamedTemporaryFile(suffix=".parquet", delete=False) as tmp:
|
|
565
|
-
tmp_path = tmp.name
|
|
566
|
-
# Download via FS
|
|
567
|
-
content_bytes = fs.read(dataset_ref.path)
|
|
568
|
-
tmp.write(content_bytes)
|
|
569
|
-
dataset_df = pd.read_parquet(tmp_path)
|
|
570
|
-
Path(tmp_path).unlink() # Clean up temp file
|
|
745
|
+
content_bytes = fs.read(dataset_ref.path)
|
|
746
|
+
dataset_df = pl.read_parquet(BytesIO(content_bytes if isinstance(content_bytes, bytes) else content_bytes.encode()))
|
|
571
747
|
elif dataset_ref.format == "jsonl":
|
|
572
748
|
content = fs.read(dataset_ref.path)
|
|
573
|
-
|
|
574
|
-
dataset_df = pd.read_json(StringIO(content), lines=True)
|
|
749
|
+
dataset_df = pl.read_ndjson(BytesIO(content.encode() if isinstance(content, str) else content))
|
|
575
750
|
else:
|
|
576
751
|
click.echo(f"Error: Format '{dataset_ref.format}' not yet supported")
|
|
577
752
|
raise click.Abort()
|
|
@@ -615,13 +790,13 @@ def run(
|
|
|
615
790
|
|
|
616
791
|
client = PhoenixClient(config=phoenix_config)
|
|
617
792
|
|
|
618
|
-
experiment_name = f"{config.name}-{
|
|
793
|
+
experiment_name = f"{config.name}-{format_timestamp_for_experiment()}"
|
|
619
794
|
|
|
620
795
|
click.echo(f"\n⏳ Running experiment: {experiment_name}")
|
|
621
796
|
click.echo(f" This may take several minutes...")
|
|
622
797
|
|
|
623
798
|
experiment = client.run_experiment(
|
|
624
|
-
dataset=dataset_df,
|
|
799
|
+
dataset=dataset_df,
|
|
625
800
|
task=task_fn,
|
|
626
801
|
evaluators=[evaluator_fn],
|
|
627
802
|
experiment_name=experiment_name,
|
|
@@ -631,12 +806,15 @@ def run(
|
|
|
631
806
|
"evaluator": config.evaluator_schema_ref.name,
|
|
632
807
|
"experiment_config": config.name,
|
|
633
808
|
**config.metadata
|
|
634
|
-
}
|
|
809
|
+
},
|
|
810
|
+
# Smart column detection for DataFrame -> Phoenix Dataset conversion
|
|
811
|
+
input_keys=["input"] if "input" in dataset_df.columns else None,
|
|
812
|
+
output_keys=["expected_output"] if "expected_output" in dataset_df.columns else None,
|
|
635
813
|
)
|
|
636
814
|
|
|
637
815
|
# Update experiment status
|
|
638
816
|
config.status = ExperimentStatus.COMPLETED
|
|
639
|
-
config.last_run_at =
|
|
817
|
+
config.last_run_at = utc_now()
|
|
640
818
|
if not version: # Only save if not loading from Git
|
|
641
819
|
config.save(base_path)
|
|
642
820
|
|
|
@@ -657,7 +835,7 @@ def run(
|
|
|
657
835
|
"agent": config.agent_schema_ref.name,
|
|
658
836
|
"evaluator": config.evaluator_schema_ref.name,
|
|
659
837
|
"dataset_size": len(dataset_df),
|
|
660
|
-
"completed_at":
|
|
838
|
+
"completed_at": to_iso(utc_now()),
|
|
661
839
|
"phoenix_url": getattr(experiment, "url", None),
|
|
662
840
|
"task_runs": len(exp_data.get("task_runs", [])),
|
|
663
841
|
}
|
|
@@ -837,20 +1015,24 @@ def dataset_add(
|
|
|
837
1015
|
--output-keys expected_label,expected_type
|
|
838
1016
|
"""
|
|
839
1017
|
from rem.services.phoenix import PhoenixClient
|
|
840
|
-
import
|
|
1018
|
+
import polars as pl
|
|
841
1019
|
|
|
842
1020
|
try:
|
|
843
1021
|
client = PhoenixClient()
|
|
844
1022
|
|
|
845
|
-
# Load CSV
|
|
846
|
-
df =
|
|
1023
|
+
# Load CSV with Polars
|
|
1024
|
+
df = pl.read_csv(from_csv)
|
|
1025
|
+
records = df.to_dicts()
|
|
847
1026
|
|
|
848
1027
|
# Extract data
|
|
849
|
-
|
|
850
|
-
|
|
1028
|
+
input_cols = input_keys.split(",")
|
|
1029
|
+
output_cols = output_keys.split(",")
|
|
1030
|
+
inputs = [{k: row.get(k) for k in input_cols} for row in records]
|
|
1031
|
+
outputs = [{k: row.get(k) for k in output_cols} for row in records]
|
|
851
1032
|
metadata = None
|
|
852
1033
|
if metadata_keys:
|
|
853
|
-
|
|
1034
|
+
meta_cols = metadata_keys.split(",")
|
|
1035
|
+
metadata = [{k: row.get(k) for k in meta_cols} for row in records]
|
|
854
1036
|
|
|
855
1037
|
# Add to dataset
|
|
856
1038
|
dataset = client.add_examples_to_dataset(
|
|
@@ -1091,12 +1273,12 @@ def trace_list(
|
|
|
1091
1273
|
rem experiments trace list --project rem-agents --days 7 --limit 50
|
|
1092
1274
|
"""
|
|
1093
1275
|
from rem.services.phoenix import PhoenixClient
|
|
1094
|
-
from
|
|
1276
|
+
from rem.utils.date_utils import days_ago
|
|
1095
1277
|
|
|
1096
1278
|
try:
|
|
1097
1279
|
client = PhoenixClient()
|
|
1098
1280
|
|
|
1099
|
-
start_time =
|
|
1281
|
+
start_time = days_ago(days)
|
|
1100
1282
|
|
|
1101
1283
|
traces_df = client.get_traces(
|
|
1102
1284
|
project_name=project,
|