harness-evolver 4.2.1 → 4.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "harness-evolver",
|
|
3
3
|
"description": "LangSmith-native autonomous agent optimization — evolves LLM agent code using multi-agent proposers, LangSmith experiments, and git worktrees",
|
|
4
|
-
"version": "4.2.1",
|
|
4
|
+
"version": "4.2.2",
|
|
5
5
|
"author": {
|
|
6
6
|
"name": "Raphael Valdetaro"
|
|
7
7
|
},
|
package/package.json
CHANGED
|
@@ -59,7 +59,7 @@ def detect_memorization(client, experiment_name, dataset_name):
|
|
|
59
59
|
"""Check if agent outputs are suspiciously similar to reference outputs."""
|
|
60
60
|
suspicious = []
|
|
61
61
|
try:
|
|
62
|
-
runs = list(client.list_runs(project_name=experiment_name, is_root=True, limit=
|
|
62
|
+
runs = list(client.list_runs(project_name=experiment_name, is_root=True, limit=100))
|
|
63
63
|
examples = {str(e.id): e for e in client.list_examples(dataset_name=dataset_name, limit=500)}
|
|
64
64
|
|
|
65
65
|
for run in runs:
|
package/tools/dataset_health.py
CHANGED
|
@@ -68,7 +68,7 @@ def check_difficulty(client, config):
|
|
|
68
68
|
return None
|
|
69
69
|
|
|
70
70
|
try:
|
|
71
|
-
runs = list(client.list_runs(project_name=best_exp, is_root=True, limit=
|
|
71
|
+
runs = list(client.list_runs(project_name=best_exp, is_root=True, limit=100))
|
|
72
72
|
if not runs:
|
|
73
73
|
return None
|
|
74
74
|
|
|
@@ -129,7 +129,7 @@ def check_dead_examples(client, config):
|
|
|
129
129
|
|
|
130
130
|
for exp_name in recent_exps:
|
|
131
131
|
try:
|
|
132
|
-
runs = list(client.list_runs(project_name=exp_name, is_root=True, limit=
|
|
132
|
+
runs = list(client.list_runs(project_name=exp_name, is_root=True, limit=100))
|
|
133
133
|
all_run_ids = [run.id for run in runs]
|
|
134
134
|
if not all_run_ids:
|
|
135
135
|
continue
|
package/tools/read_results.py
CHANGED
|
@@ -60,7 +60,7 @@ def get_per_example_scores(client, experiment_name):
|
|
|
60
60
|
"""Get per-example scores from an experiment."""
|
|
61
61
|
scores = {}
|
|
62
62
|
try:
|
|
63
|
-
runs = list(client.list_runs(project_name=experiment_name, is_root=True, limit=
|
|
63
|
+
runs = list(client.list_runs(project_name=experiment_name, is_root=True, limit=100))
|
|
64
64
|
all_run_ids = [run.id for run in runs]
|
|
65
65
|
all_feedbacks = list(client.list_feedback(run_ids=all_run_ids))
|
|
66
66
|
fb_map = {}
|
package/tools/trace_insights.py
CHANGED