pearmut 0.3.2__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pearmut/app.py +52 -29
- pearmut/assignment.py +256 -46
- pearmut/cli.py +104 -25
- pearmut/results_export.py +210 -0
- pearmut/static/basic.bundle.js +1 -1
- pearmut/static/basic.html +25 -2
- pearmut/static/dashboard.bundle.js +1 -1
- pearmut/static/dashboard.html +28 -13
- pearmut/static/index.bundle.js +1 -0
- pearmut/static/index.html +1 -1
- pearmut/static/style.css +1 -1
- pearmut/utils.py +16 -2
- {pearmut-0.3.2.dist-info → pearmut-1.0.0.dist-info}/METADATA +56 -27
- pearmut-1.0.0.dist-info/RECORD +19 -0
- pearmut-0.3.2.dist-info/RECORD +0 -17
- {pearmut-0.3.2.dist-info → pearmut-1.0.0.dist-info}/WHEEL +0 -0
- {pearmut-0.3.2.dist-info → pearmut-1.0.0.dist-info}/entry_points.txt +0 -0
- {pearmut-0.3.2.dist-info → pearmut-1.0.0.dist-info}/licenses/LICENSE +0 -0
- {pearmut-0.3.2.dist-info → pearmut-1.0.0.dist-info}/top_level.txt +0 -0
pearmut/cli.py
CHANGED
@@ -34,21 +34,25 @@ def _run(args_unknown):
 
     # print access dashboard URL for all campaigns
     if tasks_data:
-
-
-
-
-
-
-
-
-
+        dashboard_url = args.server + "/dashboard.html?" + "&".join([
+            f"campaign_id={urllib.parse.quote_plus(campaign_id)}&token={campaign_data["token"]}"
+            for campaign_id, campaign_data in tasks_data.items()
+        ])
+        print("\033[92mNow serving Pearmut, use the following URL to access the everything-dashboard:\033[0m")
+        print("🍐", dashboard_url+"\n", flush=True)
+
+    # disable startup message
+    uvicorn.config.LOGGING_CONFIG["loggers"]["uvicorn.error"]["level"] = "WARNING"
+    # set time logging
+    uvicorn.config.LOGGING_CONFIG["formatters"]["access"]["datefmt"] = "%Y-%m-%d %H:%M"
+    uvicorn.config.LOGGING_CONFIG["formatters"]["access"]["fmt"] = (
+        '%(asctime)s %(levelprefix)s %(client_addr)s - %(request_line)s %(status_code)s'
+    )
     uvicorn.run(
         app,
         host="0.0.0.0",
         port=args.port,
         reload=False,
-        # log_level="info",
     )
 
 
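For reference, here is a sketch of the URL shape the new block produces, using hypothetical campaign data (not taken from the package):

    import urllib.parse

    tasks_data = {"demo campaign": {"token": "abc123"}}
    server = "http://localhost:8001"
    dashboard_url = server + "/dashboard.html?" + "&".join(
        f"campaign_id={urllib.parse.quote_plus(cid)}&token={cdata['token']}"
        for cid, cdata in tasks_data.items()
    )
    # http://localhost:8001/dashboard.html?campaign_id=demo+campaign&token=abc123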
@@ -108,6 +112,38 @@ def _validate_item_structure(items):
         raise ValueError(f"Validation rule for model '{model_name}' must be a dictionary")
 
 
+def _validate_document_models(doc):
+    """
+    Validate that all items in a document have the same model outputs.
+
+    Args:
+        doc: List of items in a document
+
+    Returns:
+        None if valid
+
+    Raises:
+        ValueError: If items have different model outputs
+    """
+    # Get model names from the first item
+    first_item = doc[0]
+    first_models = set(first_item['tgt'].keys())
+
+    # Check all other items have the same model names
+    for i, item in enumerate(doc[1:], start=1):
+        if 'tgt' not in item or not isinstance(item['tgt'], dict):
+            continue
+
+        item_models = set(item['tgt'].keys())
+        if item_models != first_models:
+            raise ValueError(
+                f"Document contains items with different model outputs. "
+                f"Item 0 has models {sorted(first_models)}, but item {i} has models {sorted(item_models)}. "
+                f"This is fine, but we can't shuffle (on by default). "
+                f"To fix this, set 'shuffle': false in the campaign 'info' section. "
+            )
+
+
 def _shuffle_campaign_data(campaign_data, rng):
     """
     Shuffle campaign data at the document level in-place
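To illustrate the new validator: in the hypothetical two-item document below, the second item is missing model "b", so shuffling model order would be ambiguous and a ValueError is raised.

    doc = [
        {"src": "x", "tgt": {"a": "output a1", "b": "output b1"}},
        {"src": "y", "tgt": {"a": "output a2"}},
    ]
    _validate_document_models(doc)  # ValueError: ['a', 'b'] vs ['a']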
@@ -120,14 +156,11 @@ def _shuffle_campaign_data(campaign_data, rng):
     """
     def shuffle_document(doc):
         """Shuffle a single document (list of items) by reordering models in tgt dict."""
-
-
+        # Validate that all items have the same models
+        _validate_document_models(doc)
 
         # Get all model names from the first item's tgt dict
         first_item = doc[0]
-        if 'tgt' not in first_item or not isinstance(first_item['tgt'], dict):
-            return
-
         model_names = list(first_item['tgt'].keys())
         rng.shuffle(model_names)
 
@@ -146,7 +179,7 @@ def _shuffle_campaign_data(campaign_data, rng):
         for user_id, task in campaign_data["data"].items():
             for doc in task:
                 shuffle_document(doc)
-    elif assignment == "single-stream":
+    elif assignment in ["single-stream", "dynamic"]:
         # Shuffle each document in the shared pool
         for doc in campaign_data["data"]:
             shuffle_document(doc)
@@ -226,8 +259,46 @@ def _add_single_campaign(data_file, overwrite, server):
         else:
             raise ValueError("'users' must be an integer or a list.")
     elif assignment == "dynamic":
-
-
+        tasks = campaign_data["data"]
+        if users_spec is None:
+            raise ValueError(
+                "Dynamic campaigns must specify 'users' in info.")
+        if not isinstance(campaign_data["data"], list):
+            raise ValueError(
+                "Dynamic campaign 'data' must be a list of items.")
+        # Validate item structure for dynamic
+        for doc_i, doc in enumerate(tasks):
+            try:
+                _validate_item_structure(doc)
+            except ValueError as e:
+                raise ValueError(f"Document {doc_i}: {e}")
+        if isinstance(users_spec, int):
+            num_users = users_spec
+        elif isinstance(users_spec, list):
+            num_users = len(users_spec)
+        else:
+            raise ValueError("'users' must be an integer or a list.")
+        # Validate dynamic-specific parameters
+        if "dynamic_top" not in campaign_data["info"]:
+            campaign_data["info"]["dynamic_top"] = 2
+        if "dynamic_first" not in campaign_data["info"]:
+            campaign_data["info"]["dynamic_first"] = 5
+        if "dynamic_contrastive_models" not in campaign_data["info"]:
+            campaign_data["info"]["dynamic_contrastive_models"] = 1
+        # Validate that dynamic_first is at least 1
+        assert campaign_data["info"]["dynamic_first"] >= 1, "dynamic_first must be at least 1"
+        # Validate that dynamic_contrastive_models is at most dynamic_top
+        assert campaign_data["info"]["dynamic_contrastive_models"] <= campaign_data["info"]["dynamic_top"], \
+            "dynamic_contrastive_models must be at most dynamic_top"
+        # Validate that all items have the same models
+        all_models = set()
+        for item in campaign_data["data"]:
+            if item and len(item) > 0:
+                all_models.update(item[0]["tgt"].keys())
+        for item in campaign_data["data"]:
+            if item and len(item) > 0:
+                item_models = set(item[0]["tgt"].keys())
+                assert item_models == all_models, "All items must have the same model outputs"
     else:
         raise ValueError(f"Unknown campaign assignment type: {assignment}")
 
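Taken together, a campaign definition along these lines should pass the new "dynamic" branch (a sketch; the field names follow the diff, the values are hypothetical):

    campaign_data = {
        "campaign_id": "demo-dynamic",
        "info": {
            "assignment": "dynamic",
            "users": 3,
            "dynamic_top": 2,                  # default if omitted
            "dynamic_first": 5,                # default if omitted; must be >= 1
            "dynamic_contrastive_models": 1,   # default if omitted; must be <= dynamic_top
        },
        "data": [
            [{"src": "x", "tgt": {"modelA": "...", "modelB": "..."}}],
            [{"src": "y", "tgt": {"modelA": "...", "modelB": "..."}}],
        ],
    }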
@@ -270,14 +341,20 @@ def _add_single_campaign(data_file, overwrite, server):
         campaign_data["info"]["protocol"] = "ESA"
         print("Warning: 'protocol' not specified in campaign info. Defaulting to 'ESA'.")
 
+    # Remove output file when overwriting (after all validations pass)
+    if overwrite and campaign_data['campaign_id'] in progress_data:
+        output_file = f"{ROOT}/data/outputs/{campaign_data['campaign_id']}.jsonl"
+        if os.path.exists(output_file):
+            os.remove(output_file)
+
     # For task-based, data is a dict mapping user_id -> tasks
-    # For single-stream, data is a flat list (shared among all users)
+    # For single-stream and dynamic, data is a flat list (shared among all users)
     if assignment == "task-based":
         campaign_data["data"] = {
             user_id: task
             for user_id, task in zip(user_ids, tasks)
         }
-    elif assignment == "single-stream":
+    elif assignment in ["single-stream", "dynamic"]:
         campaign_data["data"] = tasks
 
     # generate a token for dashboard access if not present
@@ -299,6 +376,7 @@ def _add_single_campaign(data_file, overwrite, server):
         "progress": (
             [False]*len(campaign_data["data"][user_id]) if assignment == "task-based"
             else [False]*len(campaign_data["data"]) if assignment == "single-stream"
+            else [list() for _ in range(len(campaign_data["data"]))] if assignment == "dynamic"
             else []
         ),
         "time_start": None,
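The initial progress entry thus takes one of three shapes depending on the assignment type; for a three-document campaign, roughly:

    [False, False, False]   # task-based / single-stream: one completion flag per document
    [[], [], []]            # dynamic: one (initially empty) list per document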
@@ -382,9 +460,7 @@ def _add_single_campaign(data_file, overwrite, server):
         json.dump(campaign_data, f, indent=2, ensure_ascii=False)
 
     progress_data[campaign_data['campaign_id']] = user_progress
-
-    with open(f"{ROOT}/data/progress.json", "w") as f:
-        json.dump(progress_data, f, indent=2, ensure_ascii=False)
+    save_progress_data(progress_data)
 
 
     print(
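The inline write is replaced by a save_progress_data helper. Judging by the removed lines and the utils.py changes in this release, it presumably wraps the same logic, roughly:

    def save_progress_data(progress_data):
        with open(f"{ROOT}/data/progress.json", "w") as f:
            json.dump(progress_data, f, indent=2, ensure_ascii=False)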
@@ -395,7 +471,7 @@ def _add_single_campaign(data_file, overwrite, server):
     )
     for user_id, user_val in user_progress.items():
         # point to the protocol URL
-        print(f'{server}/{user_val["url"]}')
+        print(f'🧑 {server}/{user_val["url"]}')
     print()
 
 
@@ -469,10 +545,14 @@ def main():
         help='Optional campaign name to purge (purges all if not specified)'
     )
     purge_args = purge_args.parse_args(args_unknown)
+    progress_data = load_progress_data()
 
     if purge_args.campaign is not None:
         # Purge specific campaign
         campaign_id = purge_args.campaign
+        if campaign_id not in progress_data:
+            print(f"Campaign '{campaign_id}' does not exist.")
+            return
         confirm = input(
             f"Are you sure you want to purge campaign '{campaign_id}'? This action cannot be undone. [y/n] "
         )
@@ -502,7 +582,6 @@ def main():
         )
         if confirm.lower() == 'y':
             # Unlink all assets first
-            progress_data = load_progress_data()
             for campaign_id in progress_data.keys():
                 _unlink_assets(campaign_id)
             shutil.rmtree(f"{ROOT}/data/tasks", ignore_errors=True)
pearmut/results_export.py
ADDED
@@ -0,0 +1,210 @@
+import collections
+import json
+import os
+import statistics
+
+from .utils import get_db_log
+
+
+def comparison_significant(
+    scores1: dict[str, float], scores2: dict[str, float]
+) -> bool:
+    """Check if the difference between two sets of scores is statistically significant.
+    Assume scores1 > scores2.
+    """
+
+    import scipy.stats
+
+    # compute intersection
+    common_items = set(scores1.keys()).intersection(set(scores2.keys()))
+    if len(common_items) < 2:
+        return False
+
+    scores1 = [scores1[k] for k in common_items]
+    scores2 = [scores2[k] for k in common_items]
+
+    return bool(
+        scipy.stats.ttest_rel(scores1, scores2, alternative="two-sided").pvalue < 0.05
+    )
+
+
+def compute_model_scores(campaign_id):
+    """
+    Compute model scores from annotations for a campaign.
+
+    Returns:
+        List of dicts with keys: model, score, count
+        Sorted by score in descending order
+    """
+    # Compute model scores from annotations
+    model_scores = collections.defaultdict(dict)
+
+    # Iterate through all tasks to find items with 'models' field (basic template)
+    log = get_db_log(campaign_id)
+    for entry in log:
+        if "item" not in entry or "annotation" not in entry:
+            continue
+        for item, annotation in zip(entry["item"], entry["annotation"]):
+            for model, annotation in annotation.items():
+                if "score" in annotation and annotation["score"] is not None:
+                    item_id = item.get("item_id") or json.dumps(item | {"tgt": None})
+                    model_scores[model][item_id] = annotation["score"]
+
+    model_scores = list(model_scores.items())
+    model_scores.sort(key=lambda x: statistics.mean(x[1].values()), reverse=True)
+
+    results = []
+    for i, (model, scores) in enumerate(model_scores):
+        avg_score = statistics.mean(scores.values())
+        sig_better = False
+        if i < len(model_scores) - 1:
+            # Compare with next model
+            scores_next = model_scores[i + 1][1]
+            sig_better = comparison_significant(scores, scores_next)
+        else:
+            sig_better = False
+        results.append(
+            {
+                "model": model,
+                "score": avg_score,
+                "count": len(scores),
+                "sig_better_than_next": sig_better,
+            }
+        )
+    return results
+
+
+def escape_typst(s: str):
+    return (
+        s.replace("\\", "\\\\")
+        .replace("#", "\\#")
+        .replace("*", "\\*")
+        .replace("_", "\\_")
+        .replace("`", "\\`")
+        .replace("[", "\\[")
+        .replace("]", "\\]")
+    )
+
+
+def generate_typst_table(results):
+    """
+    Generate Typst code for a two-column table with results.
+
+    Args:
+        results: List of dicts with keys: model, score, count
+
+    Returns:
+        String containing Typst table markup
+    """
+    if not results:
+        return "// No results available"
+
+    typst_code = """#table(
+  columns: (auto, auto),
+  align: (left, right),
+  stroke: none,
+  table.hline(),
+  [*Model*], [*Score*],
+  table.hline(),
+"""
+
+    for result in results:
+        # Escape Typst special characters
+        model = escape_typst(result["model"])
+        score = f"{result['score']:.1f}"
+        typst_code += f"  [{model}], [{score}],\n"
+        if result["sig_better_than_next"]:
+            typst_code += "  table.hline(end: 1),\n"
+
+    typst_code += "  table.hline(),\n"
+    typst_code += ")\n"
+    return typst_code
+
+
+def generate_latex_table(results):
+    """
+    Generate LaTeX code for a booktabs two-column table with results.
+
+    Args:
+        results: List of dicts with keys: model, score, count
+
+    Returns:
+        String containing LaTeX table markup
+    """
+    if not results:
+        return "% No results available"
+
+    latex_code = """\\begin{table}[h]
+\\centering
+\\begin{tabular}{lr}
+\\toprule
+\\textbf{Model} & \\textbf{Score} \\\\
+\\midrule
+"""
+
+    for result in results:
+        # Escape LaTeX special characters
+        model = result["model"]
+        model = model.replace("\\", "\\textbackslash ")
+        model = model.replace("_", "\\_")
+        model = model.replace("&", "\\&")
+        model = model.replace("%", "\\%")
+        model = model.replace("$", "\\$")
+        model = model.replace("#", "\\#")
+        model = model.replace("{", "\\{")
+        model = model.replace("}", "\\}")
+        model = model.replace("~", "\\textasciitilde ")
+        model = model.replace("^", "\\textasciicircum ")
+
+        score = f"{result['score']:.1f}"
+        latex_code += f"{model} & {score} \\\\\n"
+        if result["sig_better_than_next"]:
+            latex_code += "\\cmidrule{1-1}\n"
+
+    latex_code += """\\bottomrule
+\\end{tabular}
+\\caption{Model ranking results}
+\\label{tab:results}
+\\end{table}
+"""
+    return latex_code
+
+
+def generate_pdf(results, campaign_id):
+    """
+    Generate PDF from Typst code using typst-py.
+
+    Args:
+        results: List of dicts with keys: model, score, count
+
+    Returns:
+        bytes containing the PDF
+    """
+
+    import tempfile
+
+    import typst
+
+    if not results:
+        # Return empty PDF with message
+        typst_code = "[No results available]"
+    else:
+        typst_code = f"""
+#set page(width: auto, height: auto, margin: 1.5pt)
+== {escape_typst(campaign_id)}
+""" + generate_typst_table(
+            results
+        )
+
+    # Create a temporary file for the typst source
+    with tempfile.NamedTemporaryFile(mode="w", suffix=".typ", delete=False) as f:
+        f.write(typst_code)
+        typst_file = f.name
+
+    try:
+        # Compile to PDF
+        pdf_bytes = typst.compile(typst_file)
+        return pdf_bytes
+    finally:
+        # Clean up
+        os.unlink(typst_file)