harness-evolver 4.2.0 → 4.2.2
This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/plugin.json +1 -1
- package/package.json +1 -1
- package/tools/__pycache__/setup.cpython-313.pyc +0 -0
- package/tools/adversarial_inject.py +1 -1
- package/tools/dataset_health.py +2 -2
- package/tools/read_results.py +1 -1
- package/tools/regression_tracker.py +1 -1
- package/tools/setup.py +71 -22
- package/tools/trace_insights.py +1 -1
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "harness-evolver",
|
|
3
3
|
"description": "LangSmith-native autonomous agent optimization — evolves LLM agent code using multi-agent proposers, LangSmith experiments, and git worktrees",
|
|
4
|
-
"version": "4.2.0",
|
|
4
|
+
"version": "4.2.2",
|
|
5
5
|
"author": {
|
|
6
6
|
"name": "Raphael Valdetaro"
|
|
7
7
|
},
|
package/package.json
CHANGED
|
Binary file
|
|
@@ -59,7 +59,7 @@ def detect_memorization(client, experiment_name, dataset_name):
|
|
|
59
59
|
"""Check if agent outputs are suspiciously similar to reference outputs."""
|
|
60
60
|
suspicious = []
|
|
61
61
|
try:
|
|
62
|
-
runs = list(client.list_runs(project_name=experiment_name, is_root=True, limit=
|
|
62
|
+
runs = list(client.list_runs(project_name=experiment_name, is_root=True, limit=100))
|
|
63
63
|
examples = {str(e.id): e for e in client.list_examples(dataset_name=dataset_name, limit=500)}
|
|
64
64
|
|
|
65
65
|
for run in runs:
|
package/tools/dataset_health.py
CHANGED
|
@@ -68,7 +68,7 @@ def check_difficulty(client, config):
|
|
|
68
68
|
return None
|
|
69
69
|
|
|
70
70
|
try:
|
|
71
|
-
runs = list(client.list_runs(project_name=best_exp, is_root=True, limit=
|
|
71
|
+
runs = list(client.list_runs(project_name=best_exp, is_root=True, limit=100))
|
|
72
72
|
if not runs:
|
|
73
73
|
return None
|
|
74
74
|
|
|
@@ -129,7 +129,7 @@ def check_dead_examples(client, config):
|
|
|
129
129
|
|
|
130
130
|
for exp_name in recent_exps:
|
|
131
131
|
try:
|
|
132
|
-
runs = list(client.list_runs(project_name=exp_name, is_root=True, limit=
|
|
132
|
+
runs = list(client.list_runs(project_name=exp_name, is_root=True, limit=100))
|
|
133
133
|
all_run_ids = [run.id for run in runs]
|
|
134
134
|
if not all_run_ids:
|
|
135
135
|
continue
|
package/tools/read_results.py
CHANGED
|
@@ -60,7 +60,7 @@ def get_per_example_scores(client, experiment_name):
|
|
|
60
60
|
"""Get per-example scores from an experiment."""
|
|
61
61
|
scores = {}
|
|
62
62
|
try:
|
|
63
|
-
runs = list(client.list_runs(project_name=experiment_name, is_root=True, limit=
|
|
63
|
+
runs = list(client.list_runs(project_name=experiment_name, is_root=True, limit=100))
|
|
64
64
|
all_run_ids = [run.id for run in runs]
|
|
65
65
|
all_feedbacks = list(client.list_feedback(run_ids=all_run_ids))
|
|
66
66
|
fb_map = {}
|
package/tools/setup.py
CHANGED
|
@@ -32,13 +32,19 @@ import tempfile
|
|
|
32
32
|
from datetime import datetime, timezone
|
|
33
33
|
|
|
34
34
|
|
|
35
|
+
# Track where the API key was loaded from
|
|
36
|
+
key_source = None
|
|
37
|
+
|
|
38
|
+
|
|
35
39
|
def ensure_langsmith_api_key():
|
|
36
40
|
"""Load LANGSMITH_API_KEY from credentials file if not in env.
|
|
37
41
|
|
|
38
42
|
The installer saves the key to the langsmith-cli credentials file,
|
|
39
43
|
but the SDK only reads the env var. This bridges the gap.
|
|
40
44
|
"""
|
|
45
|
+
global key_source
|
|
41
46
|
if os.environ.get("LANGSMITH_API_KEY"):
|
|
47
|
+
key_source = "environment"
|
|
42
48
|
return True
|
|
43
49
|
|
|
44
50
|
# Platform-specific credentials path (matches langsmith-cli)
|
|
@@ -56,6 +62,7 @@ def ensure_langsmith_api_key():
|
|
|
56
62
|
key = line.split("=", 1)[1].strip()
|
|
57
63
|
if key:
|
|
58
64
|
os.environ["LANGSMITH_API_KEY"] = key
|
|
65
|
+
key_source = "credentials file"
|
|
59
66
|
return True
|
|
60
67
|
except OSError:
|
|
61
68
|
pass
|
|
@@ -70,6 +77,7 @@ def ensure_langsmith_api_key():
|
|
|
70
77
|
key = line.split("=", 1)[1].strip().strip("'\"")
|
|
71
78
|
if key:
|
|
72
79
|
os.environ["LANGSMITH_API_KEY"] = key
|
|
80
|
+
key_source = ".env file"
|
|
73
81
|
return True
|
|
74
82
|
except OSError:
|
|
75
83
|
pass
|
|
@@ -123,6 +131,21 @@ def resolve_dataset_name(client, base_name):
|
|
|
123
131
|
return f"{base_name}-eval-{ts}", 0
|
|
124
132
|
|
|
125
133
|
|
|
134
|
+
def create_dataset_with_retry(client, dataset_name, description, max_retries=3):
|
|
135
|
+
"""Create dataset with retry for transient errors."""
|
|
136
|
+
import time
|
|
137
|
+
for attempt in range(max_retries):
|
|
138
|
+
try:
|
|
139
|
+
return client.create_dataset(dataset_name=dataset_name, description=description)
|
|
140
|
+
except Exception as e:
|
|
141
|
+
if attempt + 1 < max_retries and ("403" in str(e) or "500" in str(e)):
|
|
142
|
+
wait = 2 ** attempt + 0.5
|
|
143
|
+
print(f" Transient error creating dataset (attempt {attempt + 1}/{max_retries}), retrying in {wait:.0f}s...", file=sys.stderr)
|
|
144
|
+
time.sleep(wait)
|
|
145
|
+
else:
|
|
146
|
+
raise
|
|
147
|
+
|
|
148
|
+
|
|
126
149
|
def create_dataset_from_file(client, dataset_name, file_path):
|
|
127
150
|
"""Create a LangSmith dataset from a JSON file of inputs."""
|
|
128
151
|
with open(file_path) as f:
|
|
@@ -131,8 +154,8 @@ def create_dataset_from_file(client, dataset_name, file_path):
|
|
|
131
154
|
if isinstance(data, dict):
|
|
132
155
|
data = data.get("examples", data.get("tasks", [data]))
|
|
133
156
|
|
|
134
|
-
dataset =
|
|
135
|
-
dataset_name
|
|
157
|
+
dataset = create_dataset_with_retry(
|
|
158
|
+
client, dataset_name,
|
|
136
159
|
description=f"Evaluation dataset created from {os.path.basename(file_path)}",
|
|
137
160
|
)
|
|
138
161
|
|
|
@@ -187,8 +210,8 @@ def create_dataset_from_langsmith(client, dataset_name, source_project, limit=10
|
|
|
187
210
|
if not runs:
|
|
188
211
|
return None, 0
|
|
189
212
|
|
|
190
|
-
dataset =
|
|
191
|
-
dataset_name
|
|
213
|
+
dataset = create_dataset_with_retry(
|
|
214
|
+
client, dataset_name,
|
|
192
215
|
description=f"Evaluation dataset from production traces ({source_project})",
|
|
193
216
|
)
|
|
194
217
|
|
|
@@ -211,8 +234,8 @@ def create_dataset_from_langsmith(client, dataset_name, source_project, limit=10
|
|
|
211
234
|
|
|
212
235
|
def create_empty_dataset(client, dataset_name):
|
|
213
236
|
"""Create an empty dataset (to be populated by testgen agent)."""
|
|
214
|
-
dataset =
|
|
215
|
-
dataset_name
|
|
237
|
+
dataset = create_dataset_with_retry(
|
|
238
|
+
client, dataset_name,
|
|
216
239
|
description="Evaluation dataset (pending test generation)",
|
|
217
240
|
)
|
|
218
241
|
return dataset
|
|
@@ -339,22 +362,30 @@ def run_baseline(client, dataset_name, entry_point, evaluators):
|
|
|
339
362
|
)
|
|
340
363
|
|
|
341
364
|
experiment_name = results.experiment_name
|
|
342
|
-
|
|
365
|
+
|
|
366
|
+
# Try to extract scores — this can fail with different SDK versions
|
|
367
|
+
mean_score = 0.0
|
|
343
368
|
try:
|
|
344
|
-
|
|
345
|
-
|
|
346
|
-
|
|
347
|
-
|
|
369
|
+
scores = []
|
|
370
|
+
for result in results:
|
|
371
|
+
# Handle both object and dict result formats
|
|
372
|
+
if hasattr(result, 'evaluation_results'):
|
|
373
|
+
eval_results = result.evaluation_results
|
|
374
|
+
elif isinstance(result, dict):
|
|
375
|
+
eval_results = result.get("evaluation_results", {})
|
|
376
|
+
else:
|
|
377
|
+
continue
|
|
348
378
|
|
|
349
|
-
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
if er.get("score") is not None:
|
|
355
|
-
scores.append(er["score"])
|
|
379
|
+
results_list = eval_results.get("results", []) if isinstance(eval_results, dict) else []
|
|
380
|
+
for er in results_list:
|
|
381
|
+
score = er.get("score") if isinstance(er, dict) else getattr(er, "score", None)
|
|
382
|
+
if score is not None:
|
|
383
|
+
scores.append(score)
|
|
356
384
|
|
|
357
|
-
|
|
385
|
+
mean_score = sum(scores) / len(scores) if scores else 0.0
|
|
386
|
+
except Exception as e:
|
|
387
|
+
print(f" Warning: Could not extract baseline scores: {e}", file=sys.stderr)
|
|
388
|
+
print(f" Baseline experiment '{experiment_name}' was created — scores will be computed during /evolve", file=sys.stderr)
|
|
358
389
|
|
|
359
390
|
return experiment_name, mean_score
|
|
360
391
|
|
|
@@ -393,10 +424,28 @@ def main():
|
|
|
393
424
|
# Verify connection
|
|
394
425
|
try:
|
|
395
426
|
client.list_datasets(limit=1)
|
|
396
|
-
print("LangSmith connection verified.")
|
|
427
|
+
print(f"LangSmith connection verified (key from {key_source}).")
|
|
428
|
+
except Exception as e:
|
|
429
|
+
if key_source in ("credentials file", ".env file"):
|
|
430
|
+
print(f"ERROR: API key loaded from {key_source} is invalid or lacks permissions.", file=sys.stderr)
|
|
431
|
+
print(f"The key was loaded from the {key_source} but LangSmith rejected it.", file=sys.stderr)
|
|
432
|
+
print(f"Fix: export LANGSMITH_API_KEY=lsv2_pt_... (with a valid key)", file=sys.stderr)
|
|
433
|
+
else:
|
|
434
|
+
print(f"Failed to connect to LangSmith: {e}", file=sys.stderr)
|
|
435
|
+
sys.exit(1)
|
|
436
|
+
|
|
437
|
+
# Verify write permissions
|
|
438
|
+
try:
|
|
439
|
+
test_ds = client.create_dataset(
|
|
440
|
+
dataset_name="_evolver-permission-check",
|
|
441
|
+
description="Temporary — verifying write permissions",
|
|
442
|
+
)
|
|
443
|
+
client.delete_dataset(dataset_id=test_ds.id)
|
|
444
|
+
print("Write permissions verified.")
|
|
397
445
|
except Exception as e:
|
|
398
|
-
print(f"
|
|
399
|
-
print("
|
|
446
|
+
print(f"ERROR: API key can read but cannot write to LangSmith.", file=sys.stderr)
|
|
447
|
+
print(f"The key needs 'Editor' role or higher to create datasets.", file=sys.stderr)
|
|
448
|
+
print(f"Details: {e}", file=sys.stderr)
|
|
400
449
|
sys.exit(1)
|
|
401
450
|
|
|
402
451
|
project_name = f"evolver-{args.project_name}"
|
package/tools/trace_insights.py
CHANGED