pearmut 0.3.2__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pearmut/cli.py CHANGED
@@ -34,21 +34,25 @@ def _run(args_unknown):
 
     # print access dashboard URL for all campaigns
     if tasks_data:
-        print(
-            args.server + "/dashboard.html?" + "&".join([
-                f"campaign_id={urllib.parse.quote_plus(campaign_id)}&token={campaign_data['token']}"
-                for campaign_id, campaign_data in tasks_data.items()
-            ]),
-            # this is important to flush
-            flush=True,
-        )
-
+        dashboard_url = args.server + "/dashboard.html?" + "&".join([
+            f"campaign_id={urllib.parse.quote_plus(campaign_id)}&token={campaign_data['token']}"
+            for campaign_id, campaign_data in tasks_data.items()
+        ])
+        print("\033[92mNow serving Pearmut, use the following URL to access the everything-dashboard:\033[0m")
+        print("🍐", dashboard_url + "\n", flush=True)
+
+    # disable startup message
+    uvicorn.config.LOGGING_CONFIG["loggers"]["uvicorn.error"]["level"] = "WARNING"
+    # set time logging
+    uvicorn.config.LOGGING_CONFIG["formatters"]["access"]["datefmt"] = "%Y-%m-%d %H:%M"
+    uvicorn.config.LOGGING_CONFIG["formatters"]["access"]["fmt"] = (
+        '%(asctime)s %(levelprefix)s %(client_addr)s - %(request_line)s %(status_code)s'
+    )
 
     uvicorn.run(
         app,
         host="0.0.0.0",
         port=args.port,
         reload=False,
-        # log_level="info",
     )
 
 
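Note on the logging change above: uvicorn.config.LOGGING_CONFIG is a plain dict that uvicorn passes to logging.config.dictConfig() at startup, so mutating it before uvicorn.run() takes effect globally. A minimal standalone sketch of the same pattern (the trivial "app" below is an illustrative stand-in, not pearmut code):

import uvicorn

async def app(scope, receive, send):
    # placeholder ASGI app so the sketch is runnable
    await send({"type": "http.response.start", "status": 200, "headers": []})
    await send({"type": "http.response.body", "body": b"ok"})

# same pattern as the diff: quiet startup logs, timestamped access logs
uvicorn.config.LOGGING_CONFIG["loggers"]["uvicorn.error"]["level"] = "WARNING"
uvicorn.config.LOGGING_CONFIG["formatters"]["access"]["datefmt"] = "%Y-%m-%d %H:%M"
uvicorn.config.LOGGING_CONFIG["formatters"]["access"]["fmt"] = (
    "%(asctime)s %(levelprefix)s %(client_addr)s - %(request_line)s %(status_code)s"
)

if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8000)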
@@ -108,6 +112,38 @@ def _validate_item_structure(items):
             raise ValueError(f"Validation rule for model '{model_name}' must be a dictionary")
 
 
+def _validate_document_models(doc):
+    """
+    Validate that all items in a document have the same model outputs.
+
+    Args:
+        doc: List of items in a document
+
+    Returns:
+        None if valid
+
+    Raises:
+        ValueError: If items have different model outputs
+    """
+    # Get model names from the first item
+    first_item = doc[0]
+    first_models = set(first_item['tgt'].keys())
+
+    # Check all other items have the same model names
+    for i, item in enumerate(doc[1:], start=1):
+        if 'tgt' not in item or not isinstance(item['tgt'], dict):
+            continue
+
+        item_models = set(item['tgt'].keys())
+        if item_models != first_models:
+            raise ValueError(
+                f"Document contains items with different model outputs. "
+                f"Item 0 has models {sorted(first_models)}, but item {i} has models {sorted(item_models)}. "
+                f"This is fine, but we can't shuffle (on by default). "
+                f"To fix this, set 'shuffle': false in the campaign 'info' section."
+            )
+
+
 def _shuffle_campaign_data(campaign_data, rng):
     """
     Shuffle campaign data at the document level in-place
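For reference, a quick sketch (not part of the diff) of what _validate_document_models accepts and rejects; the item dicts are made up for illustration:

# every item exposes the same model keys in 'tgt' -> passes
ok_doc = [
    {"src": "a", "tgt": {"model1": "t1", "model2": "t2"}},
    {"src": "b", "tgt": {"model1": "t3", "model2": "t4"}},
]
_validate_document_models(ok_doc)  # returns None

# mismatched model keys -> raises ValueError suggesting 'shuffle': false
bad_doc = [
    {"src": "a", "tgt": {"model1": "t1"}},
    {"src": "b", "tgt": {"model1": "t1", "model2": "t2"}},
]
_validate_document_models(bad_doc)  # ValueError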
@@ -120,14 +156,11 @@ def _shuffle_campaign_data(campaign_data, rng):
     """
     def shuffle_document(doc):
         """Shuffle a single document (list of items) by reordering models in tgt dict."""
-        if not doc or not isinstance(doc, list):
-            return
+        # Validate that all items have the same models
+        _validate_document_models(doc)
 
         # Get all model names from the first item's tgt dict
         first_item = doc[0]
-        if 'tgt' not in first_item or not isinstance(first_item['tgt'], dict):
-            return
-
         model_names = list(first_item['tgt'].keys())
         rng.shuffle(model_names)
 
@@ -146,7 +179,7 @@ def _shuffle_campaign_data(campaign_data, rng):
         for user_id, task in campaign_data["data"].items():
             for doc in task:
                 shuffle_document(doc)
-    elif assignment == "single-stream":
+    elif assignment in ["single-stream", "dynamic"]:
         # Shuffle each document in the shared pool
         for doc in campaign_data["data"]:
             shuffle_document(doc)
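The reordering idea behind shuffle_document, as a sketch under the assumption that the unshown remainder of the function rebuilds each item's 'tgt' dict (which is what its docstring says): Python dicts preserve insertion order, so one shuffled key order can be applied consistently to every item in a document:

import random

rng = random.Random(42)
doc = [
    {"tgt": {"model1": "t1", "model2": "t2"}},
    {"tgt": {"model1": "t3", "model2": "t4"}},
]
model_names = list(doc[0]["tgt"].keys())
rng.shuffle(model_names)
for item in doc:
    # same model order for all items in the document
    item["tgt"] = {m: item["tgt"][m] for m in model_names}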
@@ -226,8 +259,46 @@ def _add_single_campaign(data_file, overwrite, server):
         else:
             raise ValueError("'users' must be an integer or a list.")
     elif assignment == "dynamic":
-        raise NotImplementedError(
-            "Dynamic campaign assignment is not yet implemented.")
+        tasks = campaign_data["data"]
+        if users_spec is None:
+            raise ValueError(
+                "Dynamic campaigns must specify 'users' in info.")
+        if not isinstance(campaign_data["data"], list):
+            raise ValueError(
+                "Dynamic campaign 'data' must be a list of items.")
+        # Validate item structure for dynamic
+        for doc_i, doc in enumerate(tasks):
+            try:
+                _validate_item_structure(doc)
+            except ValueError as e:
+                raise ValueError(f"Document {doc_i}: {e}")
+        if isinstance(users_spec, int):
+            num_users = users_spec
+        elif isinstance(users_spec, list):
+            num_users = len(users_spec)
+        else:
+            raise ValueError("'users' must be an integer or a list.")
+        # Validate dynamic-specific parameters
+        if "dynamic_top" not in campaign_data["info"]:
+            campaign_data["info"]["dynamic_top"] = 2
+        if "dynamic_first" not in campaign_data["info"]:
+            campaign_data["info"]["dynamic_first"] = 5
+        if "dynamic_contrastive_models" not in campaign_data["info"]:
+            campaign_data["info"]["dynamic_contrastive_models"] = 1
+        # Validate that dynamic_first is at least 1
+        assert campaign_data["info"]["dynamic_first"] >= 1, "dynamic_first must be at least 1"
+        # Validate that dynamic_contrastive_models is at most dynamic_top
+        assert campaign_data["info"]["dynamic_contrastive_models"] <= campaign_data["info"]["dynamic_top"], \
+            "dynamic_contrastive_models must be at most dynamic_top"
+        # Validate that all items have the same models
+        all_models = set()
+        for item in campaign_data["data"]:
+            if item and len(item) > 0:
+                all_models.update(item[0]["tgt"].keys())
+        for item in campaign_data["data"]:
+            if item and len(item) > 0:
+                item_models = set(item[0]["tgt"].keys())
+                assert item_models == all_models, "All items must have the same model outputs"
     else:
         raise ValueError(f"Unknown campaign assignment type: {assignment}")
 
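An illustrative campaign file for the new "dynamic" branch (all values are examples; only keys validated above are shown, and dynamic_top / dynamic_first / dynamic_contrastive_models fall back to 2 / 5 / 1 when omitted):

{
  "campaign_id": "demo-dynamic",
  "info": {
    "assignment": "dynamic",
    "users": 3,
    "dynamic_top": 2,
    "dynamic_first": 5,
    "dynamic_contrastive_models": 1
  },
  "data": [
    [{"src": "example source", "tgt": {"model1": "output 1", "model2": "output 2"}}]
  ]
}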
@@ -270,14 +341,20 @@ def _add_single_campaign(data_file, overwrite, server):
         campaign_data["info"]["protocol"] = "ESA"
         print("Warning: 'protocol' not specified in campaign info. Defaulting to 'ESA'.")
 
+    # Remove output file when overwriting (after all validations pass)
+    if overwrite and campaign_data['campaign_id'] in progress_data:
+        output_file = f"{ROOT}/data/outputs/{campaign_data['campaign_id']}.jsonl"
+        if os.path.exists(output_file):
+            os.remove(output_file)
+
     # For task-based, data is a dict mapping user_id -> tasks
-    # For single-stream, data is a flat list (shared among all users)
+    # For single-stream and dynamic, data is a flat list (shared among all users)
     if assignment == "task-based":
         campaign_data["data"] = {
             user_id: task
             for user_id, task in zip(user_ids, tasks)
         }
-    elif assignment == "single-stream":
+    elif assignment in ["single-stream", "dynamic"]:
         campaign_data["data"] = tasks
 
     # generate a token for dashboard access if not present
@@ -299,6 +376,7 @@ def _add_single_campaign(data_file, overwrite, server):
         "progress": (
             [False]*len(campaign_data["data"][user_id]) if assignment == "task-based"
             else [False]*len(campaign_data["data"]) if assignment == "single-stream"
+            else [list() for _ in range(len(campaign_data["data"]))] if assignment == "dynamic"
             else []
         ),
         "time_start": None,
@@ -382,9 +460,7 @@ def _add_single_campaign(data_file, overwrite, server):
         json.dump(campaign_data, f, indent=2, ensure_ascii=False)
 
     progress_data[campaign_data['campaign_id']] = user_progress
-
-    with open(f"{ROOT}/data/progress.json", "w") as f:
-        json.dump(progress_data, f, indent=2, ensure_ascii=False)
+    save_progress_data(progress_data)
 
 
     print(
@@ -395,7 +471,7 @@ def _add_single_campaign(data_file, overwrite, server):
     )
     for user_id, user_val in user_progress.items():
         # point to the protocol URL
-        print(f'{server}/{user_val["url"]}')
+        print(f'🧑 {server}/{user_val["url"]}')
     print()
 
 
@@ -469,10 +545,14 @@ def main():
             help='Optional campaign name to purge (purges all if not specified)'
         )
         purge_args = purge_args.parse_args(args_unknown)
+        progress_data = load_progress_data()
 
         if purge_args.campaign is not None:
             # Purge specific campaign
             campaign_id = purge_args.campaign
+            if campaign_id not in progress_data:
+                print(f"Campaign '{campaign_id}' does not exist.")
+                return
             confirm = input(
                 f"Are you sure you want to purge campaign '{campaign_id}'? This action cannot be undone. [y/n] "
             )
@@ -502,7 +582,6 @@ def main():
             )
             if confirm.lower() == 'y':
                 # Unlink all assets first
-                progress_data = load_progress_data()
                 for campaign_id in progress_data.keys():
                     _unlink_assets(campaign_id)
                 shutil.rmtree(f"{ROOT}/data/tasks", ignore_errors=True)
@@ -0,0 +1,210 @@
+import collections
+import json
+import os
+import statistics
+
+from .utils import get_db_log
+
+
+def comparison_significant(
+    scores1: dict[str, float], scores2: dict[str, float]
+) -> bool:
+    """Check if the difference between two sets of scores is statistically significant.
+    Assume scores1 > scores2.
+    """
+
+    import scipy.stats
+
+    # compute intersection
+    common_items = set(scores1.keys()).intersection(set(scores2.keys()))
+    if len(common_items) < 2:
+        return False
+
+    scores1 = [scores1[k] for k in common_items]
+    scores2 = [scores2[k] for k in common_items]
+
+    return bool(
+        scipy.stats.ttest_rel(scores1, scores2, alternative="two-sided").pvalue < 0.05
+    )
+
+
+def compute_model_scores(campaign_id):
+    """
+    Compute model scores from annotations for a campaign.
+
+    Returns:
+        List of dicts with keys: model, score, count, sig_better_than_next
+        Sorted by score in descending order
+    """
+    # Compute model scores from annotations
+    model_scores = collections.defaultdict(dict)
+
+    # Iterate through the annotation log and collect per-item scores for each model
+    log = get_db_log(campaign_id)
+    for entry in log:
+        if "item" not in entry or "annotation" not in entry:
+            continue
+        for item, annotations in zip(entry["item"], entry["annotation"]):
+            for model, annotation in annotations.items():
+                if "score" in annotation and annotation["score"] is not None:
+                    item_id = item.get("item_id") or json.dumps(item | {"tgt": None})
+                    model_scores[model][item_id] = annotation["score"]
+
+    model_scores = list(model_scores.items())
+    model_scores.sort(key=lambda x: statistics.mean(x[1].values()), reverse=True)
+
+    results = []
+    for i, (model, scores) in enumerate(model_scores):
+        avg_score = statistics.mean(scores.values())
+        sig_better = False
+        if i < len(model_scores) - 1:
+            # Compare with next model
+            scores_next = model_scores[i + 1][1]
+            sig_better = comparison_significant(scores, scores_next)
+        results.append(
+            {
+                "model": model,
+                "score": avg_score,
+                "count": len(scores),
+                "sig_better_than_next": sig_better,
+            }
+        )
+    return results
+
+
+def escape_typst(s: str) -> str:
+    return (
+        s.replace("\\", "\\\\")
+        .replace("#", "\\#")
+        .replace("*", "\\*")
+        .replace("_", "\\_")
+        .replace("`", "\\`")
+        .replace("[", "\\[")
+        .replace("]", "\\]")
+    )
+
+
+def generate_typst_table(results):
+    """
+    Generate Typst code for a two-column table with results.
+
+    Args:
+        results: List of dicts with keys: model, score, count, sig_better_than_next
+
+    Returns:
+        String containing Typst table markup
+    """
+    if not results:
+        return "// No results available"
+
+    typst_code = """#table(
+  columns: (auto, auto),
+  align: (left, right),
+  stroke: none,
+  table.hline(),
+  [*Model*], [*Score*],
+  table.hline(),
+"""
+
+    for result in results:
+        # Escape Typst special characters
+        model = escape_typst(result["model"])
+        score = f"{result['score']:.1f}"
+        typst_code += f"  [{model}], [{score}],\n"
+        if result["sig_better_than_next"]:
+            typst_code += "  table.hline(end: 1),\n"
+
+    typst_code += "  table.hline(),\n"
+    typst_code += ")\n"
+    return typst_code
+
+
+def generate_latex_table(results):
+    """
+    Generate LaTeX code for a booktabs two-column table with results.
+
+    Args:
+        results: List of dicts with keys: model, score, count, sig_better_than_next
+
+    Returns:
+        String containing LaTeX table markup
+    """
+    if not results:
+        return "% No results available"
+
+    latex_code = """\\begin{table}[h]
+\\centering
+\\begin{tabular}{lr}
+\\toprule
+\\textbf{Model} & \\textbf{Score} \\\\
+\\midrule
+"""
+
+    for result in results:
+        # Escape LaTeX special characters
+        model = result["model"]
+        model = model.replace("\\", "\\textbackslash ")
+        model = model.replace("_", "\\_")
+        model = model.replace("&", "\\&")
+        model = model.replace("%", "\\%")
+        model = model.replace("$", "\\$")
+        model = model.replace("#", "\\#")
+        model = model.replace("{", "\\{")
+        model = model.replace("}", "\\}")
+        model = model.replace("~", "\\textasciitilde ")
+        model = model.replace("^", "\\textasciicircum ")
+
+        score = f"{result['score']:.1f}"
+        latex_code += f"{model} & {score} \\\\\n"
+        if result["sig_better_than_next"]:
+            latex_code += "\\cmidrule{1-1}\n"
+
+    latex_code += """\\bottomrule
+\\end{tabular}
+\\caption{Model ranking results}
+\\label{tab:results}
+\\end{table}
+"""
+    return latex_code
+
+
+def generate_pdf(results, campaign_id):
+    """
+    Generate PDF from Typst code using typst-py.
+
+    Args:
+        results: List of dicts with keys: model, score, count, sig_better_than_next
+        campaign_id: Campaign identifier, used as the document heading
+
+    Returns:
+        bytes containing the PDF
+    """
+
+    import tempfile
+
+    import typst
+
+    if not results:
+        # Return empty PDF with message
+        typst_code = "[No results available]"
+    else:
+        typst_code = f"""
+#set page(width: auto, height: auto, margin: 1.5pt)
+== {escape_typst(campaign_id)}
+""" + generate_typst_table(results)
+
+    # Create a temporary file for the typst source
+    with tempfile.NamedTemporaryFile(mode="w", suffix=".typ", delete=False) as f:
+        f.write(typst_code)
+        typst_file = f.name
+
+    try:
+        # Compile to PDF
+        pdf_bytes = typst.compile(typst_file)
+        return pdf_bytes
+    finally:
+        # Clean up
+        os.unlink(typst_file)
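
A usage sketch for the new reporting module (its file name is not shown in this diff; compute_model_scores needs a live campaign log via get_db_log, so the table generators are driven here with a hand-built results list in the shape compute_model_scores returns):

results = [
    {"model": "model_a", "score": 87.3, "count": 120, "sig_better_than_next": True},
    {"model": "model_b", "score": 81.0, "count": 118, "sig_better_than_next": False},
]
print(generate_typst_table(results))   # Typst markup with a rule under model_a
print(generate_latex_table(results))   # booktabs LaTeX with \cmidrule under model_a
# pdf_bytes = generate_pdf(results, "demo-campaign")  # needs the typst package installed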