harness-evolver 4.2.0 → 4.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "harness-evolver",
3
3
  "description": "LangSmith-native autonomous agent optimization — evolves LLM agent code using multi-agent proposers, LangSmith experiments, and git worktrees",
4
- "version": "4.2.0",
4
+ "version": "4.2.1",
5
5
  "author": {
6
6
  "name": "Raphael Valdetaro"
7
7
  },
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "harness-evolver",
3
- "version": "4.2.0",
3
+ "version": "4.2.1",
4
4
  "description": "LangSmith-native autonomous agent optimization for Claude Code",
5
5
  "author": "Raphael Valdetaro",
6
6
  "license": "MIT",
package/tools/setup.py CHANGED
@@ -32,13 +32,19 @@ import tempfile
32
32
  from datetime import datetime, timezone
33
33
 
34
34
 
35
+ # Track where the API key was loaded from
36
+ key_source = None
37
+
38
+
35
39
  def ensure_langsmith_api_key():
36
40
  """Load LANGSMITH_API_KEY from credentials file if not in env.
37
41
 
38
42
  The installer saves the key to the langsmith-cli credentials file,
39
43
  but the SDK only reads the env var. This bridges the gap.
40
44
  """
45
+ global key_source
41
46
  if os.environ.get("LANGSMITH_API_KEY"):
47
+ key_source = "environment"
42
48
  return True
43
49
 
44
50
  # Platform-specific credentials path (matches langsmith-cli)
@@ -56,6 +62,7 @@ def ensure_langsmith_api_key():
56
62
  key = line.split("=", 1)[1].strip()
57
63
  if key:
58
64
  os.environ["LANGSMITH_API_KEY"] = key
65
+ key_source = "credentials file"
59
66
  return True
60
67
  except OSError:
61
68
  pass
@@ -70,6 +77,7 @@ def ensure_langsmith_api_key():
70
77
  key = line.split("=", 1)[1].strip().strip("'\"")
71
78
  if key:
72
79
  os.environ["LANGSMITH_API_KEY"] = key
80
+ key_source = ".env file"
73
81
  return True
74
82
  except OSError:
75
83
  pass
@@ -123,6 +131,21 @@ def resolve_dataset_name(client, base_name):
123
131
  return f"{base_name}-eval-{ts}", 0
124
132
 
125
133
 
134
def _is_transient_dataset_error(exc, transient_codes=(403, 500)):
    """Return True when *exc* looks like an HTTP failure worth retrying.

    A structured status code is preferred when the exception exposes one
    (requests-style ``exc.response.status_code``). Otherwise the exception
    text is searched for one of *transient_codes* as a standalone number, so
    that digits embedded in longer numbers (for example a timestamped dataset
    name such as ``foo-eval-20250403``) do not trigger a retry.
    """
    import re

    status = getattr(getattr(exc, "response", None), "status_code", None)
    if isinstance(status, int):
        return status in transient_codes

    codes = "|".join(str(code) for code in transient_codes)
    return re.search(rf"(?<!\d)(?:{codes})(?!\d)", str(exc)) is not None


def create_dataset_with_retry(client, dataset_name, description, max_retries=3, sleep=None):
    """Create a dataset, retrying with exponential backoff on transient errors.

    Args:
        client: Object exposing ``create_dataset(dataset_name=..., description=...)``.
        dataset_name: Name passed through to ``client.create_dataset``.
        description: Description passed through to ``client.create_dataset``.
        max_retries: Total number of attempts. Values below 1 are treated as 1
            so the function never returns ``None`` without trying.
        sleep: Optional callable taking the wait in seconds; defaults to
            ``time.sleep``. Lets callers control or skip the backoff delay.

    Returns:
        Whatever ``client.create_dataset`` returns on the first successful attempt.

    Raises:
        Exception: The error from ``client.create_dataset`` is re-raised
            unchanged when it is not transient (see
            ``_is_transient_dataset_error``) or when attempts are exhausted.
    """
    import time

    if sleep is None:
        sleep = time.sleep

    attempts = max(1, max_retries)
    for attempt in range(attempts):
        try:
            return client.create_dataset(dataset_name=dataset_name, description=description)
        except Exception as e:
            out_of_attempts = attempt + 1 >= attempts
            if out_of_attempts or not _is_transient_dataset_error(e):
                raise
            # Backoff schedule: 1.5s, 2.5s, 4.5s, ...
            wait = 2 ** attempt + 0.5
            # One decimal place: ".0f" would render both 1.5 and 2.5 as "2".
            print(
                f" Transient error creating dataset (attempt {attempt + 1}/{attempts}), "
                f"retrying in {wait:.1f}s...",
                file=sys.stderr,
            )
            sleep(wait)
147
+
148
+
126
149
  def create_dataset_from_file(client, dataset_name, file_path):
127
150
  """Create a LangSmith dataset from a JSON file of inputs."""
128
151
  with open(file_path) as f:
@@ -131,8 +154,8 @@ def create_dataset_from_file(client, dataset_name, file_path):
131
154
  if isinstance(data, dict):
132
155
  data = data.get("examples", data.get("tasks", [data]))
133
156
 
134
- dataset = client.create_dataset(
135
- dataset_name=dataset_name,
157
+ dataset = create_dataset_with_retry(
158
+ client, dataset_name,
136
159
  description=f"Evaluation dataset created from {os.path.basename(file_path)}",
137
160
  )
138
161
 
@@ -187,8 +210,8 @@ def create_dataset_from_langsmith(client, dataset_name, source_project, limit=10
187
210
  if not runs:
188
211
  return None, 0
189
212
 
190
- dataset = client.create_dataset(
191
- dataset_name=dataset_name,
213
+ dataset = create_dataset_with_retry(
214
+ client, dataset_name,
192
215
  description=f"Evaluation dataset from production traces ({source_project})",
193
216
  )
194
217
 
@@ -211,8 +234,8 @@ def create_dataset_from_langsmith(client, dataset_name, source_project, limit=10
211
234
 
212
235
  def create_empty_dataset(client, dataset_name):
213
236
  """Create an empty dataset (to be populated by testgen agent)."""
214
- dataset = client.create_dataset(
215
- dataset_name=dataset_name,
237
+ dataset = create_dataset_with_retry(
238
+ client, dataset_name,
216
239
  description="Evaluation dataset (pending test generation)",
217
240
  )
218
241
  return dataset
@@ -339,22 +362,30 @@ def run_baseline(client, dataset_name, entry_point, evaluators):
339
362
  )
340
363
 
341
364
  experiment_name = results.experiment_name
342
- # Read aggregate metrics
365
+
366
+ # Try to extract scores — this can fail with different SDK versions
367
+ mean_score = 0.0
343
368
  try:
344
- project = client.read_project(project_name=experiment_name, include_stats=True)
345
- stats = project.model_dump() if hasattr(project, "model_dump") else {}
346
- except Exception:
347
- stats = {}
369
+ scores = []
370
+ for result in results:
371
+ # Handle both object and dict result formats
372
+ if hasattr(result, 'evaluation_results'):
373
+ eval_results = result.evaluation_results
374
+ elif isinstance(result, dict):
375
+ eval_results = result.get("evaluation_results", {})
376
+ else:
377
+ continue
348
378
 
349
- # Calculate mean score from results
350
- scores = []
351
- for result in results:
352
- if result.evaluation_results and result.evaluation_results.get("results"):
353
- for er in result.evaluation_results["results"]:
354
- if er.get("score") is not None:
355
- scores.append(er["score"])
379
+ results_list = eval_results.get("results", []) if isinstance(eval_results, dict) else []
380
+ for er in results_list:
381
+ score = er.get("score") if isinstance(er, dict) else getattr(er, "score", None)
382
+ if score is not None:
383
+ scores.append(score)
356
384
 
357
- mean_score = sum(scores) / len(scores) if scores else 0.0
385
+ mean_score = sum(scores) / len(scores) if scores else 0.0
386
+ except Exception as e:
387
+ print(f" Warning: Could not extract baseline scores: {e}", file=sys.stderr)
388
+ print(f" Baseline experiment '{experiment_name}' was created — scores will be computed during /evolve", file=sys.stderr)
358
389
 
359
390
  return experiment_name, mean_score
360
391
 
@@ -393,10 +424,28 @@ def main():
393
424
  # Verify connection
394
425
  try:
395
426
  client.list_datasets(limit=1)
396
- print("LangSmith connection verified.")
427
+ print(f"LangSmith connection verified (key from {key_source}).")
428
+ except Exception as e:
429
+ if key_source in ("credentials file", ".env file"):
430
+ print(f"ERROR: API key loaded from {key_source} is invalid or lacks permissions.", file=sys.stderr)
431
+ print(f"The key was loaded from the {key_source} but LangSmith rejected it.", file=sys.stderr)
432
+ print(f"Fix: export LANGSMITH_API_KEY=lsv2_pt_... (with a valid key)", file=sys.stderr)
433
+ else:
434
+ print(f"Failed to connect to LangSmith: {e}", file=sys.stderr)
435
+ sys.exit(1)
436
+
437
+ # Verify write permissions
438
+ try:
439
+ test_ds = client.create_dataset(
440
+ dataset_name="_evolver-permission-check",
441
+ description="Temporary — verifying write permissions",
442
+ )
443
+ client.delete_dataset(dataset_id=test_ds.id)
444
+ print("Write permissions verified.")
397
445
  except Exception as e:
398
- print(f"Failed to connect to LangSmith: {e}", file=sys.stderr)
399
- print("Check LANGSMITH_API_KEY is set correctly.", file=sys.stderr)
446
+ print(f"ERROR: API key can read but cannot write to LangSmith.", file=sys.stderr)
447
+ print(f"The key needs 'Editor' role or higher to create datasets.", file=sys.stderr)
448
+ print(f"Details: {e}", file=sys.stderr)
400
449
  sys.exit(1)
401
450
 
402
451
  project_name = f"evolver-{args.project_name}"